In [15]:
import os
import sys

# Location of the local `languagemodels` checkout. Set the LANGUAGEMODELS_PATH
# environment variable to point somewhere else instead of editing this cell;
# the default is the original hard-coded location.
LANGUAGEMODELS_PATH = os.environ.get("LANGUAGEMODELS_PATH", "/home/marius/lsv/languagemodels")

# Guard against duplicate sys.path entries when this cell is re-run.
if LANGUAGEMODELS_PATH not in sys.path:
    sys.path.append(LANGUAGEMODELS_PATH)

In [16]:
import torch 
import numpy as np

from languagemodels import LMFactory

In [17]:
# Ask the factory for the "dummy-lm" model with an empty config.
# NOTE(review): pre_trained=False presumably means no pre-trained weights are loaded — confirm against LMFactory.
lm = LMFactory.get_lm(
    name_or_path="dummy-lm",
    config={},
    pre_trained=False,
)

In [18]:
# Toy corpus: two short sentences of different length, with tokens separated by
# single spaces so that a plain split(" ") can be used for tokenization later on.
data = [
    "This is the first example sentence .",
    "Another one, but shorter .",
]
print(data)

['This is the first example sentence .', 'Another one, but shorter .']


In [19]:
# let's encode the data
SEED = 0  # fixed seed so the random placeholder ids below are reproducible across runs

## tokenize input sequences
# Every sequence is padded / truncated to exactly max_seq_len tokens, *including*
# the <s> and </s> markers. We use a constant here; alternatively, derive it from
# the longest input sequence:
#   max_seq_len = max(len(s.split(" ")) for s in data) + 2  # +2 for <s> and </s>
max_seq_len = 20

tokenized_data = []
for s in data:
    words = s.split(" ")  # word-level tokenization
    words = ["<s>"] + words + ["</s>"]  # add special tokens to mark beginning and end of a sequence

    # make sure all sequences have the same length (torch.tensor rejects ragged lists)
    if len(words) > max_seq_len:
        # too long: truncate, but keep the end-of-sequence marker as the last token
        words = words[:max_seq_len - 1] + ["</s>"]
    else:
        words = words + ["<pad>"] * (max_seq_len - len(words))
    tokenized_data.append(words)

print(tokenized_data)

## encode tokenized sequences
# for now we just replace each token with a random integer in [0, 100);
# note that this is NOT a vocabulary lookup: the same token (e.g. <pad>) gets different ids
rng = np.random.default_rng(SEED)
encoded_ids = [rng.integers(0, 100, size=len(s)).tolist() for s in tokenized_data]

## convert to tensors
encoded_data = torch.tensor(encoded_ids)
print(encoded_data)


[['<s>', 'This', 'is', 'the', 'first', 'example', 'sentence', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'], ['<s>', 'Another', 'one,', 'but', 'shorter', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']]
tensor([[51, 60, 28,  7, 17, 96, 33,  0, 85, 78, 39, 55, 71, 80, 99,  6, 14, 18,
         55, 78],
        [99, 34, 52, 88, 71, 52, 15, 11,  8, 63, 54, 94, 66, 74, 29, 82, 83, 26,
         11, 27]])


In [27]:
# Forward the encoded batch to the model and compute the surprisal of each token in every sequence.
# NOTE(review): encoded_data also contains the <pad> positions and no padding mask is passed here;
# judging by the recorded output, values come back for padded positions as well — confirm that
# compute_surprisal is meant to be called this way.
surprisal = lm.compute_surprisal(encoded_data)

In [28]:
# Display the result: the recorded output has one row per input sequence and one value per (padded) position.
surprisal

tensor([[0.0221, 1.0767, 0.5677, 2.6798, 0.8209, 2.1161, 4.7400, 0.0816, 0.3390,
         1.1069, 0.1376, 1.9939, 3.2189, 1.3529, 1.2732, 5.0946, 0.2827, 0.4708,
         1.9352, 0.0753],
        [2.7609, 0.8220, 0.2466, 0.3541, 1.9608, 1.2629, 0.1956, 2.2882, 1.6082,
         5.4726, 2.8841, 0.3558, 1.1199, 0.1254, 2.4413, 0.2506, 2.8125, 0.3025,
         0.0356, 0.7196]])