In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm
import torch
import nltk
from random import randrange
nltk.download('punkt')
device = 'cuda' if torch.cuda.is_available() else 'cpu'

from google.colab import drive
drive.mount('/content/drive')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Mounted at /content/drive


In [2]:
%%html
<style>
  table {margin-left: 0 !important;}
</style>

## Tokenizing input data

Create a mapping of the tokenized words into text and viceversa.

- load_and_encode_data_nltk

nltk uses the nltk tokenizer to split the text into sentenses and words. \
https://www.nltk.org/api/nltk.tokenize.html

- tiktoken
[todo]
- sentence piece
[todo]

In [3]:
def read_data(filename):
    text = None
    with open(filename, "r", encoding='utf-8') as f:
        text = f.read()

    return text

In [4]:
def load_and_encode_data_nltk(filename):
    word_to_lookup = {}
    lookup_to_word = {}
    encoded_data = []

    data = read_data(filename)
    print ('length of dataset = ', len(data))
    tokenized_data = [word_tokenize(w) for w in [s for s in sent_tokenize(data)]]
    vocabulary = set()

    for s in tokenized_data:
        for w in s:
            vocabulary.add(w)

    vocabulary = sorted(vocabulary)

    for c, i in list(zip(vocabulary, range(len(vocabulary)))):
        word_to_lookup[c] = i
        lookup_to_word[i] = c

    for s in tokenized_data:
        for w in s:
            encoded_data.append(word_to_lookup[w])

    return word_to_lookup, lookup_to_word, encoded_data

w2l, l2w, tokenized_data = load_and_encode_data_nltk("/content/drive/MyDrive/colab/pg1400.txt")

tensor_tokenized_data = torch.tensor(tokenized_data, dtype=torch.long)

length of dataset =  1013924


In [5]:
print(tensor_tokenized_data.shape, tensor_tokenized_data.dtype)
print(tensor_tokenized_data[:100])

torch.Size([229192]) torch.int64
tensor([13697,  1241,   661,  5082,  8781,   647,   537,  1537,  5120,  7431,
         5967, 12111, 12807,  8781,  2302,  2307,  7092, 12111,  1607,  1453,
         2246,  8407,  8902,  9083,  8781, 12111, 13380,  2509,  8622,  4109,
         2246, 13322,  2188,  8622, 10315, 13172,    10,  1779,  8135,  4077,
         7440,     8,  6277,  7440,  2584,  8867,  9937,  7440, 12617, 12111,
        12069,  8781, 12111,  1241,   661,   904,  7119, 13322, 12169,  5120,
         8867,  8837,  2509, 13444,    10,   770, 13477,  2388,  8660,  7883,
         7092, 12111,  1607,  1453,     8, 13477, 13265,  6661, 12279,  3453,
        12111,  7693,  8781, 12111,  4137, 13181, 13477,  2388,  7883,  2752,
        12817, 12169,  5082,    10,  1554,    56,   647,   537,   162,    56])


Test training dataset split

In [6]:
n = int(0.8*len(tensor_tokenized_data))
train_data = tensor_tokenized_data[:n]
test_data = tensor_tokenized_data[n:]

In [7]:
train_data, test_data

(tensor([13697,  1241,   661,  ...,     8,  2246, 13181]),
 tensor([12111, 11712, 12997,  ...,  8589,  5083,    10]))

In [8]:
block_size = 8
batch_size = 4

train_data[:block_size+1]

tensor([13697,  1241,   661,  5082,  8781,   647,   537,  1537,  5120])

In [9]:
" ".join([l2w[i] for i in train_data[1:7].numpy()])

'Project Gutenberg eBook of Great Expectations'

## Embedding layers

pytorch embedding layers act as a array of trainable parameters for a given vocabulary size. For example, if we have n numbers where we are using the the data in context and target as in the table below, the vocabulary size is n and we can have some embedding paramters depending on the size of the data we are trying to feed.


### context size = 1, target size = 1
The following example embeds the sequence of numbers up to 10 as the context and target.

|context|target|
|-|-|
|1|2|
|2|3|
|3|4|
|..|..|
|n|n+1|

In [10]:
import torch
import torch.nn as nn

import torch.nn.functional as F

vocab_size = 10
context_size = 1

input_tensor = []
for i in range(1,vocab_size-context_size):
    input_tensor.append([[i], [i+1]])

class BiGram(nn.Module):
    def __init__(self, vocab_size, context_size, embedding_dim):
        super(BiGram, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size*embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view(1,-1)
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs


model = BiGram(vocab_size,context_size, vocab_size)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
loss_function = nn.NLLLoss()
losses = []

for epoch in range(100):
    # print('epoch: {}'.format(epoch + 1))
    running_loss = 0
    for data in input_tensor:
        word, label = data
        word = torch.LongTensor(word)
        label = torch.LongTensor(label)
        # forward
        out = model(word)
        loss = F.cross_entropy(out, label)
        running_loss += loss.item()
        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
print('Loss: {:.6f}'.format(running_loss / vocab_size))

model.embeddings.weight[[1]]

Loss: 0.881331


tensor([[-0.2687, -0.7669, -1.5136,  1.0216, -1.2418, -0.4616, -0.3318,  0.0050,
          1.2334,  0.6881]], grad_fn=<IndexBackward0>)

In [11]:
pred = randrange(1, 10)
target = pred + 1
out = model(torch.tensor([pred], dtype=torch.long))
_, predict_label = torch.max(out, 1)
predict_word = predict_label.data[0].item()
print('real word is {}, predict word is {}'.format(target, predict_word))

real word is 10, predict word is 5


### context_size = 2; target_size = 1

|context|target|
|-|-|
|1,2|3|
|2,3|4|
|4,5|6|
|..|..|
|n,n+1|n+2|


In [12]:
import torch.nn.functional as F

context_size = 2
# The end of vocab size might overflow at the boundary like 9+10 = 11. Hence add context size for safely rail.
vocab_size = 10+context_size

input_tensor = []
for i in range(1,vocab_size-context_size):
    input_tensor.append([[i, i+1], [i+2]])

class BiGram(nn.Module):
    def __init__(self, vocab_size, context_size, embedding_dim):
        super(BiGram, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size*embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view(1,-1)
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

model = BiGram(vocab_size,context_size, vocab_size)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
loss_function = nn.NLLLoss()
losses = []

for epoch in range(100):
    # print('epoch: {}'.format(epoch + 1))
    running_loss = 0
    for data in input_tensor:
        word, label = data
        word = torch.LongTensor(word)
        label = torch.LongTensor(label)
        # forward
        out = model(word)
        loss = F.cross_entropy(out, label)
        running_loss += loss.item()
        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
print('Loss: {:.6f}'.format(running_loss / vocab_size))

model.embeddings.weight[[1]]

Loss: 0.823904


tensor([[-0.8044, -2.0680, -0.3244,  0.8827, -0.5343,  1.4524, -0.1523, -1.1447,
          1.0708,  0.2457, -1.1162,  1.9035]], grad_fn=<IndexBackward0>)

In [13]:
pred = randrange(1, 10)
target = pred + 2
print ((pred, pred+1), pred+2)
out = model(torch.tensor([pred, pred+1], dtype=torch.long))
_, predict_label = torch.max(out, 1)
predict_word = predict_label.data[0].item()
print('real word is {}, predict word is {}'.format(target, predict_word))

(9, 10) 11
real word is 11, predict word is 11


## String embeddings

From our data split above, lets construct the sequences to be read from for testing data

In [14]:
def get_data_for_processing(training_data_set):
    data = train_data if training_data_set else test_data
    sample_indices = torch.randint(len(data)-block_size, (batch_size, ))
    x = torch.stack([data[i:i+block_size] for i in sample_indices])
    y = torch.stack([data[i+1:i+block_size+1] for i in sample_indices])
    return x.to(device), y.to(device)

In [15]:
samples_x, samples_y = get_data_for_processing(True)
for batch in range(batch_size):
    for block in range(block_size):
        context = samples_x[batch, :block+1]
        target = samples_y[batch, block]
        print (" ".join([l2w[i] for i in context.cpu().numpy()]), l2w[target.item()])

vocab_size = len(l2w)
print (vocab_size)

do I
do I ,
do I , ”
do I , ” said
do I , ” said Mr.
do I , ” said Mr. Pumblechook
do I , ” said Mr. Pumblechook ,
do I , ” said Mr. Pumblechook , getting
When she
When she left
When she left off—and
When she left off—and she
When she left off—and she had
When she left off—and she had not
When she left off—and she had not laughed
When she left off—and she had not laughed languidly
buying such
buying such household
buying such household stuffs
buying such household stuffs and
buying such household stuffs and goods
buying such household stuffs and goods as
buying such household stuffs and goods as required
buying such household stuffs and goods as required a
who had
who had never
who had never been
who had never been heard
who had never been heard of
who had never been heard of before
who had never been heard of before )
who had never been heard of before ) coming
13698


In [16]:
@torch.no_grad()
def estimation_loss():
  out = {}
  model.eval()
  for split in ['train', 'test']:
    losses = torch.zeros(evaluation_iters)
    for k in range(evaluation_iters):
      X,Y = get_data_for_processing(True if split == 'train' else False)
      logits, loss = model(X,Y)
      losses[k] = loss.item()
    out[split] = losses.mean()
  model.train()
  return out



In [17]:
class LLM(nn.Module):
    def __init__(self, vocab_size, context, embedded_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        logits = self.embedding(idx)


        if targets is None:
            loss = None
        else:
            B,T,C = logits.shape
            logits = logits.view(B*T,C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            logits = logits[:, -1,  :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

model = LLM(vocab_size, block_size, vocab_size).to(device)
logits, loss = model(samples_x, samples_y)
print (logits.shape, loss)

torch.Size([32, 13698]) tensor(9.9515, device='cuda:0', grad_fn=<NllLossBackward0>)


In [18]:
print(device)
g = model.generate(idx=torch.tensor([[4002]], dtype=torch.long).to(device), max_new_tokens=40)
" ".join([l2w[i] for i in g.cpu().numpy()[0]])

cuda


'contemptuously choicer donkey heaven enormous shedding incurred unassertive hauled giants enough. safety tokens Sessions tools confront cleared. empty-handed snake eyes—a —Then hurt. hurried glass credence Skiffins dexterity conviction blackest-looking shorter contradicted mo— consorted surveyor hesitated pencil apologetically scandalised injudicious preceded one-eyed'

In [19]:
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

In [20]:
batch_size = 32
evaluation_iters = 200
for step in range(1000):
    if step % evaluation_iters == 0:
      losses = estimation_loss()
      print (losses)
      print("Training data loss = ", losses['train'], "Test data loss = ", losses['test'])
    samples_x, samples_y = get_data_for_processing(True)
    logits, loss = model(samples_x, samples_y)
    optimizer.zero_grad()

    loss.backward()
    optimizer.step()
    if step % 100 == 0:
        print (loss.item())


{'train': tensor(9.9856), 'test': tensor(9.9900)}
Training data loss =  tensor(9.9856) Test data loss =  tensor(9.9900)
10.066347122192383
9.968426704406738
{'train': tensor(9.8687), 'test': tensor(9.8892)}
Training data loss =  tensor(9.8687) Test data loss =  tensor(9.8892)
9.848640441894531
9.766807556152344
{'train': tensor(9.7412), 'test': tensor(9.7805)}
Training data loss =  tensor(9.7412) Test data loss =  tensor(9.7805)
9.731710433959961
9.754940032958984
{'train': tensor(9.5936), 'test': tensor(9.6585)}
Training data loss =  tensor(9.5936) Test data loss =  tensor(9.6585)
9.703038215637207
9.542753219604492
{'train': tensor(9.4666), 'test': tensor(9.5497)}
Training data loss =  tensor(9.4666) Test data loss =  tensor(9.5497)
9.387492179870605
9.384162902832031


In [25]:
g = model.generate(idx=torch.tensor([[3000]], dtype=torch.long).to(device), max_new_tokens=40)
" ".join([l2w[i] for i in g.cpu().numpy()[0]])

'books spring weight boarder common performing scores eighteen-pence expense— joviality depreciation Tramping material shriek sorrowful gird retiring merits THIS Naturally kissing 3 effectually home-voice hailing blubbered sluice charmed threshold tall On-common seventh unfortunately available lounging honoured : saddler respect. awaits carriages'