In [376]:
from random import randrange

import nltk
import torch
import torch.nn as nn
import torch.nn.functional as F
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vratnam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [377]:
%%html
<style>
  table {margin-left: 0 !important;}
</style>

## Tokenizing input data

Create a mapping from each tokenized word to an integer id, and vice versa.

- load_and_encode_data_nltk

This function uses the NLTK tokenizers (`sent_tokenize`, `word_tokenize`) to split the text into sentences and then into words. \
https://www.nltk.org/api/nltk.tokenize.html

- tiktoken
[todo]
- sentence piece
[todo] 

In [378]:
def read_data(filename):
    """Return the full contents of ``filename`` decoded as UTF-8 text."""
    with open(filename, "r", encoding='utf-8') as handle:
        return handle.read()

In [379]:
def load_and_encode_data_nltk(filename):
    """Tokenize a text file with NLTK and encode every token as an integer id.

    The file is split into sentences, each sentence into word tokens, and the
    sorted set of distinct tokens becomes the vocabulary (id = position in the
    sorted order).

    Returns:
        word_to_lookup: dict mapping token -> id.
        lookup_to_word: dict mapping id -> token.
        encoded_data: flat list of ids for every token, in document order.
    """
    raw_text = read_data(filename)
    print ('length of dataset = ', len(raw_text))

    # One list of word tokens per sentence.
    sentences = [word_tokenize(sentence) for sentence in sent_tokenize(raw_text)]

    # Sorting makes the id assignment deterministic across runs.
    vocabulary = sorted({token for sentence in sentences for token in sentence})

    word_to_lookup = {token: index for index, token in enumerate(vocabulary)}
    lookup_to_word = dict(enumerate(vocabulary))

    # Flatten the sentences back into a single id stream.
    encoded_data = [word_to_lookup[token] for sentence in sentences for token in sentence]

    return word_to_lookup, lookup_to_word, encoded_data

# Build the word<->id lookups and the encoded token stream from the text file.
# Note: despite its name, `tokenized_data` holds the integer ids, not the word strings.
w2l, l2w, tokenized_data = load_and_encode_data_nltk("data/pg1400.txt")

# Keep the ids as a 1-D int64 tensor so later cells can slice it into blocks.
tensor_tokenized_data = torch.tensor(tokenized_data, dtype=torch.long)

length of dataset =  1013924


In [380]:
# Sanity check: shape and dtype of the encoded stream, then its first 100 token ids.
print(tensor_tokenized_data.shape, tensor_tokenized_data.dtype)
print(tensor_tokenized_data[:100])

torch.Size([229192]) torch.int64
tensor([13697,  1241,   661,  5082,  8781,   647,   537,  1537,  5120,  7431,
         5967, 12111, 12807,  8781,  2302,  2307,  7092, 12111,  1607,  1453,
         2246,  8407,  8902,  9083,  8781, 12111, 13380,  2509,  8622,  4109,
         2246, 13322,  2188,  8622, 10315, 13172,    10,  1779,  8135,  4077,
         7440,     8,  6277,  7440,  2584,  8867,  9937,  7440, 12617, 12111,
        12069,  8781, 12111,  1241,   661,   904,  7119, 13322, 12169,  5120,
         8867,  8837,  2509, 13444,    10,   770, 13477,  2388,  8660,  7883,
         7092, 12111,  1607,  1453,     8, 13477, 13265,  6661, 12279,  3453,
        12111,  7693,  8781, 12111,  4137, 13181, 13477,  2388,  7883,  2752,
        12817, 12169,  5082,    10,  1554,    56,   647,   537,   162,    56])


Split the dataset into training and test sets

In [381]:
# Sequential 80/20 split: the first 80% of the token stream is used for training,
# the remaining 20% is held out for testing (no shuffling, so word order is preserved).
n = int(0.8*len(tensor_tokenized_data))
train_data = tensor_tokenized_data[:n]
test_data = tensor_tokenized_data[n:]

In [382]:
train_data, test_data

(tensor([13697,  1241,   661,  ...,     8,  2246, 13181]),
 tensor([12111, 11712, 12997,  ...,  8589,  5083,    10]))

--------- EOS

In [383]:
# block_size: number of consecutive tokens in one sampled sequence.
# batch_size: number of sequences sampled per batch.
block_size = 8
batch_size = 4

# block_size + 1 tokens: enough for block_size inputs plus the target that follows the last one.
train_data[:block_size+1]

tensor([13697,  1241,   661,  5082,  8781,   647,   537,  1537,  5120])

In [384]:
# Decode training tokens 1..6 back to words to confirm the id -> word lookup works.
" ".join([l2w[i] for i in train_data[1:7].numpy()])

'Project Gutenberg eBook of Great Expectations'

## Embedding layers

PyTorch embedding layers act as an array of trainable parameters for a given vocabulary size. For example, if we have n numbers and use them as context and target as in the table below, the vocabulary size is n, and the number of embedding parameters per entry (the embedding dimension) can be chosen to suit the size of the data we are trying to feed.


### context size = 1, target size = 1
The following example embeds the sequence of numbers up to 10 as the context and target.

|context|target|
|-|-|
|1|2|
|2|3|
|3|4|
|..|..|
|n|n+1|

In [385]:
import torch.nn.functional as F

# Toy "next number" task: token ids 0..9, one context token per sample.
vocab_size = 10
context_size = 1

# Each sample is [[context], [target]] = [[i], [i + 1]] for i in 1 .. vocab_size - context_size - 1.
input_tensor = [[[i], [i + 1]] for i in range(1, vocab_size - context_size)]

class BiGram(nn.Module):
    """Small feed-forward n-gram model over token ids.

    The context ids are embedded, flattened into a single row, passed through a
    128-unit ReLU layer and projected to ``vocab_size`` log-probabilities.

    Args:
        vocab_size: number of distinct token ids (embedding rows and output classes).
        context_size: number of context tokens given to ``forward`` per call.
        embedding_dim: width of each token embedding.
    """

    def __init__(self, vocab_size, context_size, embedding_dim):
        super().__init__()
        hidden_units = 128
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.linear1 = nn.Linear(in_features=context_size * embedding_dim, out_features=hidden_units)
        self.linear2 = nn.Linear(in_features=hidden_units, out_features=vocab_size)

    def forward(self, inputs):
        """Return log-probabilities of shape (1, vocab_size) for one context.

        ``inputs`` is a tensor of ``context_size`` token ids; the embeddings are
        flattened to a single row, so only one sample is processed per call.
        """
        flat_embeddings = self.embeddings(inputs).view(1, -1)
        hidden = F.relu(self.linear1(flat_embeddings))
        scores = self.linear2(hidden)
        return F.log_softmax(scores, dim=1)
    
    
# Embedding width is set equal to vocab_size for this toy example.
model = BiGram(vocab_size, context_size, vocab_size)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
# NLLLoss pairs with the log_softmax output produced by BiGram.forward.
loss_function = nn.NLLLoss()
# Mean per-sample loss of every epoch, kept so the curve can be inspected later.
losses = []

for epoch in range(100):
    running_loss = 0
    for context, target in input_tensor:
        # Plain tensors are enough; autograd tracks the model parameters itself.
        context_ids = torch.tensor(context, dtype=torch.long)
        target_ids = torch.tensor(target, dtype=torch.long)
        # forward
        log_probs = model(context_ids)
        loss = loss_function(log_probs, target_ids)
        running_loss += loss.item()
        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Normalise by the number of training samples seen in the epoch.
    losses.append(running_loss / len(input_tensor))
# Report only the final epoch's mean loss.
print('Loss: {:.6f}'.format(losses[-1]))

# Show the learned embedding row for token id 1.
model.embeddings.weight[[1]]

Loss: 0.892305


tensor([[-0.9885, -0.2275, -0.1987,  0.5352, -2.0190, -0.4559, -1.2107, -1.1841,
          0.3540, -0.6033]], grad_fn=<IndexBackward0>)

In [386]:
# Draw a context from the range the model was trained on (1 .. vocab_size - context_size - 1).
# A wider range could pick 9, whose target 10 is not a valid class for a 10-token vocabulary.
pred = randrange(1, vocab_size - context_size)
target = pred + 1
out = model(torch.tensor([pred], dtype=torch.long))
# Index of the largest log-probability is the predicted token id.
_, predict_label = torch.max(out, 1)
predict_word = predict_label[0].item()
print('real word is {}, predict word is {}'.format(target, predict_word))

real word is 2, predict word is 2


### context_size = 2; target_size = 1

|context|target|
|-|-|
|1,2|3|
|2,3|4|
|3,4|5|
|..|..|
|n,n+1|n+2|


In [387]:
import torch.nn.functional as F

context_size = 2
# With two context tokens the largest target is 9 + 2 = 11, so the vocabulary is
# widened by context_size to keep every target id inside the model's output range.
vocab_size = 10 + context_size

# Each sample is [[i, i + 1], [i + 2]] for i in 1 .. vocab_size - context_size - 1.
input_tensor = [[[i, i + 1], [i + 2]] for i in range(1, vocab_size - context_size)]

class BiGram(nn.Module):
    """Small feed-forward n-gram model over token ids.

    The context ids are embedded, flattened into a single row, passed through a
    128-unit ReLU layer and projected to ``vocab_size`` log-probabilities.

    Args:
        vocab_size: number of distinct token ids (embedding rows and output classes).
        context_size: number of context tokens given to ``forward`` per call.
        embedding_dim: width of each token embedding.
    """

    def __init__(self, vocab_size, context_size, embedding_dim):
        super().__init__()
        hidden_units = 128
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.linear1 = nn.Linear(in_features=context_size * embedding_dim, out_features=hidden_units)
        self.linear2 = nn.Linear(in_features=hidden_units, out_features=vocab_size)

    def forward(self, inputs):
        """Return log-probabilities of shape (1, vocab_size) for one context.

        ``inputs`` is a tensor of ``context_size`` token ids; the embeddings are
        flattened to a single row, so only one sample is processed per call.
        """
        flat_embeddings = self.embeddings(inputs).view(1, -1)
        hidden = F.relu(self.linear1(flat_embeddings))
        scores = self.linear2(hidden)
        return F.log_softmax(scores, dim=1)
    
# Embedding width is set equal to vocab_size for this toy example.
model = BiGram(vocab_size, context_size, vocab_size)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
# NLLLoss pairs with the log_softmax output produced by BiGram.forward.
loss_function = nn.NLLLoss()
# Mean per-sample loss of every epoch, kept so the curve can be inspected later.
losses = []

for epoch in range(100):
    running_loss = 0
    for context, target in input_tensor:
        # Plain tensors are enough; autograd tracks the model parameters itself.
        context_ids = torch.tensor(context, dtype=torch.long)
        target_ids = torch.tensor(target, dtype=torch.long)
        # forward
        log_probs = model(context_ids)
        loss = loss_function(log_probs, target_ids)
        running_loss += loss.item()
        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Normalise by the number of training samples seen in the epoch.
    losses.append(running_loss / len(input_tensor))
# Report only the final epoch's mean loss.
print('Loss: {:.6f}'.format(losses[-1]))

# Show the learned embedding row for token id 1.
model.embeddings.weight[[1]]

Loss: 1.143240


tensor([[ 0.7767,  1.7059,  0.1578,  1.2860, -0.0387,  0.8140, -0.6479, -0.5357,
         -0.6938,  0.0054, -1.0362,  0.5366]], grad_fn=<IndexBackward0>)

In [388]:
# Pick a random two-token context (pred, pred + 1); the expected answer is pred + 2.
pred = randrange(1, 10)
target = pred + 2
context_pair = (pred, pred + 1)
print (context_pair, target)
out = model(torch.tensor(context_pair, dtype=torch.long))
# The position of the largest log-probability is the predicted token id.
predict_label = torch.max(out, 1).indices
predict_word = predict_label[0].item()
print('real word is {}, predict word is {}'.format(target, predict_word))

(9, 10) 11
real word is 11, predict word is 11


## String embeddings

From the data split above, let's construct the input and target sequences that will be sampled for training and testing.

In [389]:
def get_data_for_processing(training_data_set):
    """Sample a random batch of (input, target) sequences.

    Reads the notebook-level ``train_data`` / ``test_data`` tensors and the
    ``block_size`` / ``batch_size`` settings.

    Args:
        training_data_set: truthy to sample from ``train_data``, falsy for ``test_data``.

    Returns:
        x: tensor of ``batch_size`` rows, each ``block_size`` consecutive token ids.
        y: same shape as ``x``, shifted one position to the right (the next-token targets).
    """
    if training_data_set:
        data = train_data
    else:
        data = test_data

    # Upper bound leaves room for the one-step-shifted target window.
    starts = torch.randint(len(data) - block_size, (batch_size, ))

    inputs, targets = [], []
    for start in starts:
        inputs.append(data[start:start + block_size])
        targets.append(data[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

In [390]:
# Draw one training batch and print every (growing context -> next word) pair it contains.
samples_x, samples_y = get_data_for_processing(True)
for row in range(batch_size):
    for step in range(block_size):
        context = samples_x[row, :step + 1]
        target = samples_y[row, step]
        context_words = " ".join(l2w[token_id] for token_id in context.tolist())
        print (context_words, l2w[target.item()])

# From here on the vocabulary is the real corpus vocabulary, not the toy one.
vocab_size = len(l2w)
print (vocab_size)

a pair
a pair of
a pair of pigeons
a pair of pigeons are
a pair of pigeons are portable
a pair of pigeons are portable property
a pair of pigeons are portable property all
a pair of pigeons are portable property all the
was comparatively
was comparatively early
was comparatively early days
was comparatively early days with
was comparatively early days with him
was comparatively early days with him then
was comparatively early days with him then ,
was comparatively early days with him then , and
at the
at the end
at the end of
at the end of it
at the end of it she
at the end of it she stopped
at the end of it she stopped ,
at the end of it she stopped , and
he had
he had done
he had done for
he had done for the
he had done for the night
he had done for the night .
he had done for the night . Then
he had done for the night . Then I
13698


In [391]:
class LLM(nn.Module):
    """Lookup-table language model: each token id indexes a row of next-token logits.

    ``context`` and ``embedded_dim`` are accepted but not used; the embedding
    table is always ``vocab_size x vocab_size``.
    """

    def __init__(self, vocab_size, context, embedded_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Return ``(logits, loss)`` for a batch of token ids.

        Without ``targets`` the logits keep their (B, T, C) shape and loss is None.
        With ``targets`` the logits are flattened to (B*T, C) and the
        cross-entropy loss against the flattened targets is returned with them.
        """
        logits = self.embedding(idx)
        if targets is None:
            return logits, None

        batch, steps, classes = logits.shape
        flat_logits = logits.view(batch * steps, classes)
        flat_targets = targets.view(batch * steps)
        return flat_logits, F.cross_entropy(flat_logits, flat_targets)

    def generate(self, idx, max_new_tokens):
        """Append ``max_new_tokens`` sampled tokens to each row of ``idx`` and return the result."""
        generated = idx
        for _ in range(max_new_tokens):
            logits, _unused_loss = self(generated)
            # Only the last position's distribution is used to pick the next token.
            last_step_probs = F.softmax(logits[:, -1, :], dim=-1)
            sampled = torch.multinomial(last_step_probs, num_samples=1)
            generated = torch.cat((generated, sampled), dim=1)
        return generated
    
# Instantiate the model on the corpus vocabulary and run one forward pass on the
# sampled batch to check the flattened logits shape and the initial loss.
model = LLM(vocab_size, block_size, vocab_size)
logits, loss = model(samples_x, samples_y)
print (logits.shape, loss)

torch.Size([32, 13698]) tensor(10.3414, grad_fn=<NllLossBackward0>)


In [392]:
# Sample 40 new tokens starting from token id 4002 and decode them back to words.
# No training step is visible before this cell, so the output is expected to be random words.
g = model.generate(idx=torch.tensor([[4002]], dtype=torch.long), max_new_tokens=40)
" ".join([l2w[i] for i in g.numpy()[0]])

'contemptuously Mr. drawing vessel joints stands ring suffered _She half-brother appeared. between torn-up splendid rind reflect condescend robe Buttons snorting sever necromantic unreservedly growl boatmen breakfast limited resentment here.—It Rag paid. re sustained —whether enthralling staring sooty decide convenience tarnished piano-forte'