# Decoder only model
The following implementation uses word embeddings using nltk tokenizer for words and tries to use the multi headed self attention model to train on text.


In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm
import torch
import nltk
import torch.nn as nn
from random import randrange
import torch.nn.functional as F
nltk.download('punkt')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print (device)
from google.colab import drive
drive.mount('/content/drive')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


cuda
Mounted at /content/drive


In [2]:
%%html
<style>
  table {margin-left: 0 !important;}
</style>

## Tokenizing input data

Create a mapping of the tokenized words into text and viceversa.

- load_and_encode_data_nltk

nltk uses the nltk tokenizer to split the text into sentenses and words. \
https://www.nltk.org/api/nltk.tokenize.html

- tiktoken
[todo]
- sentence piece
[todo]

In [3]:
def read_data(filename):
    text = None
    with open(filename, "r", encoding='utf-8') as f:
        text = f.read()

    return text

In [4]:
def load_and_encode_data_nltk(filename):
    word_to_lookup = {}
    lookup_to_word = {}
    encoded_data = []

    data = read_data(filename)
    print ('length of dataset = ', len(data))
    tokenized_data = [word_tokenize(w) for w in [s for s in sent_tokenize(data)]]
    vocabulary = set()

    for s in tokenized_data:
        for w in s:
            vocabulary.add(w)

    vocabulary = sorted(vocabulary)

    for c, i in list(zip(vocabulary, range(len(vocabulary)))):
        word_to_lookup[c] = i
        lookup_to_word[i] = c

    for s in tokenized_data:
        for w in s:
            encoded_data.append(word_to_lookup[w])

    return word_to_lookup, lookup_to_word, encoded_data

w2l, l2w, tokenized_data = load_and_encode_data_nltk("/content/drive/MyDrive/colab/pg1400.txt")

tensor_tokenized_data = torch.tensor(tokenized_data, dtype=torch.long)

length of dataset =  1013924


In [5]:
print(tensor_tokenized_data.shape, tensor_tokenized_data.dtype)
print(tensor_tokenized_data[:100])

torch.Size([229192]) torch.int64
tensor([13697,  1241,   661,  5082,  8781,   647,   537,  1537,  5120,  7431,
         5967, 12111, 12807,  8781,  2302,  2307,  7092, 12111,  1607,  1453,
         2246,  8407,  8902,  9083,  8781, 12111, 13380,  2509,  8622,  4109,
         2246, 13322,  2188,  8622, 10315, 13172,    10,  1779,  8135,  4077,
         7440,     8,  6277,  7440,  2584,  8867,  9937,  7440, 12617, 12111,
        12069,  8781, 12111,  1241,   661,   904,  7119, 13322, 12169,  5120,
         8867,  8837,  2509, 13444,    10,   770, 13477,  2388,  8660,  7883,
         7092, 12111,  1607,  1453,     8, 13477, 13265,  6661, 12279,  3453,
        12111,  7693,  8781, 12111,  4137, 13181, 13477,  2388,  7883,  2752,
        12817, 12169,  5082,    10,  1554,    56,   647,   537,   162,    56])


## Test training dataset split

In [6]:
n = int(0.8*len(tensor_tokenized_data))
train_data = tensor_tokenized_data[:n]
test_data = tensor_tokenized_data[n:]

In [7]:
train_data, test_data

(tensor([13697,  1241,   661,  ...,     8,  2246, 13181]),
 tensor([12111, 11712, 12997,  ...,  8589,  5083,    10]))

In [8]:
no_of_embeddings = 384 ## Number of trainable parameters per token.
attention_head_size = 16
batch_size = 64
block_size = 256
no_of_heads = 6
evaluation_iters = 200
total_iters = 5000
dropout = 0.2 # to prevent overfitting

train_data[:block_size+1]

tensor([13697,  1241,   661,  5082,  8781,   647,   537,  1537,  5120,  7431,
         5967, 12111, 12807,  8781,  2302,  2307,  7092, 12111,  1607,  1453,
         2246,  8407,  8902,  9083,  8781, 12111, 13380,  2509,  8622,  4109,
         2246, 13322,  2188,  8622, 10315, 13172,    10,  1779,  8135,  4077,
         7440,     8,  6277,  7440,  2584,  8867,  9937,  7440, 12617, 12111,
        12069,  8781, 12111,  1241,   661,   904,  7119, 13322, 12169,  5120,
         8867,  8837,  2509, 13444,    10,   770, 13477,  2388,  8660,  7883,
         7092, 12111,  1607,  1453,     8, 13477, 13265,  6661, 12279,  3453,
        12111,  7693,  8781, 12111,  4137, 13181, 13477,  2388,  7883,  2752,
        12817, 12169,  5082,    10,  1554,    56,   647,   537,   162,    56,
          321,   441,  1301,  4336,    56,   847,    11,     8,    36,  1785,
         5082,     1,    33,  1786,  1026,  9989, 12780,    56,  1109,    51,
            8,    40,   891,    56,   505,   395,    56,   122, 

In [9]:
" ".join([l2w[i] for i in train_data[1:7].numpy()])

'Project Gutenberg eBook of Great Expectations'

In [10]:
def get_data_for_processing(training_data_set):
    data = train_data if training_data_set else test_data
    sample_indices = torch.randint(len(data)-block_size, (batch_size, ))
    x = torch.stack([data[i:i+block_size] for i in sample_indices])
    y = torch.stack([data[i+1:i+block_size+1] for i in sample_indices])
    return x.to(device), y.to(device)

In [11]:
samples_x, samples_y = get_data_for_processing(True)
for batch in range(batch_size):
    for block in range(1):
        context = samples_x[batch, :block+1]
        target = samples_y[batch, block]
        print (" ".join([l2w[i] for i in context.cpu().numpy()]), l2w[target.item()])

vocab_size = len(l2w)
print (vocab_size)

and said
tapping the
left in
make me
my eyes
‘ And
merely stopping
that I
there !
in great
, and
the forehead
by your
how poor
of the
“ Yes
As I
Joe ?
and said
there were
soldiers .
dead sister
believe he
was many
the man
t got
were some
girl is
on my
beast .
eyes ,
’ She
or vagrants
Biddy to
_that_ would
on no
one occasion
, and
glad to
secret from
! I
forge ,
And how
the boy
his attention
I took
believe ,
of hearing
until at
objected to
was not
downstairs .
speak yet
tell me
was going
my indignant
here ,
extent ,
sir !
cleaning up
hope ,
the right
inaccessibility which
! ”
13698


In [12]:
@torch.no_grad()
def estimation_loss():
  out = {}
  model.eval()
  for split in ['train', 'test']:
    losses = torch.zeros(evaluation_iters)
    for k in range(evaluation_iters):
      X,Y = get_data_for_processing(True if split == 'train' else False)
      logits, loss = model(X,Y)
      losses[k] = loss.item()
    out[split] = losses.mean()
  model.train()
  return out

## Attention blocks implentation.
The various classes below signify the implmenetation from various parts of the attention model for decoder.

### Self-attention head

In [13]:
class SelfAttentionHead(nn.Module):
  def __init__(self, attention_head_size):
    super(SelfAttentionHead, self).__init__()
    self.key = nn.Linear(no_of_embeddings, attention_head_size, bias=False)
    self.query = nn.Linear(no_of_embeddings, attention_head_size, bias=False)
    self.value = nn.Linear(no_of_embeddings, attention_head_size, bias=False)
    self.dropout = nn.Dropout(dropout)
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
  def forward(self, idx):
    bts, bs, no_of_embeddings = idx.shape
    # print ('idx = ', idx.shape)
    k = self.key(idx)
    q = self.query(idx)

    weights = q @ k.transpose(-2,-1) * no_of_embeddings**-0.5
    # print ('w after transpose mul= ', weights.shape)
    weights = weights.masked_fill(self.tril[:bs][:bs] == 0, float('-inf'))

    weights = F.softmax(weights, dim=-1)
    weights = self.dropout(weights)

    values = self.value(idx)
    # print (weights.shape, values.shape)

    out = weights @ values

    return out

### Multi attention head

In [14]:
class MultiHeadAttn(nn.Module):
  def __init__(self, num_heads, attention_head_size):
    super(MultiHeadAttn, self).__init__()
    self.heads = nn.ModuleList([SelfAttentionHead(attention_head_size) for _ in range(num_heads)])
    self.self_projection = nn.Linear(no_of_embeddings, no_of_embeddings)
    self.dropout = nn.Dropout(dropout)
  def forward(self, idx):
    self_attn_out = torch.cat([head(idx) for head in self.heads], dim=-1)
    return self.dropout(self.self_projection(self_attn_out))

### Feed forward


In [15]:
class FeedForward(nn.Module):
  def __init__(self, no_of_embeddings):
    super(FeedForward, self).__init__()
    self.network = nn.Sequential(
        nn.Linear(no_of_embeddings, 4*no_of_embeddings),
        nn.ReLU(),
        nn.Linear(4*no_of_embeddings, no_of_embeddings), # This is projection.
        nn.Dropout(dropout)
    )

  def forward(self, idx):
    return self.network(idx)

### Block
General block structure to hold other pieces

In [16]:
class Block(nn.Module):
  def __init__(self, no_of_embeddings, no_of_heads):
    super(Block, self).__init__()
    self.self_attn_heads = MultiHeadAttn(no_of_heads, no_of_embeddings//no_of_heads)
    self.feed_forward = FeedForward(no_of_embeddings)
    self.layer_norm1 = nn.LayerNorm(no_of_embeddings)
    self.layer_norm2 = nn.LayerNorm(no_of_embeddings)

  # idx is added as a concept from residual image nets to preserve the original x input.
  def forward(self, idx):
    self_attn_heads_out = idx + self.self_attn_heads(self.layer_norm1(idx))
    ffwd_out = idx + self.feed_forward(self.layer_norm2(self_attn_heads_out))
    return ffwd_out

### Main model

In [20]:
class LLM(nn.Module):
    def __init__(self, vocab_size, context, embedded_dim):
      super(LLM, self).__init__()
      self.embedding = nn.Embedding(vocab_size, no_of_embeddings) # batches, block_size, embeddings
      self.position_embedding = nn.Embedding(block_size, no_of_embeddings) # block_size, embeddings

      self.blocks = nn.Sequential(
          Block(no_of_embeddings, no_of_heads),
          Block(no_of_embeddings, no_of_heads),
          Block(no_of_embeddings, no_of_heads),
          Block(no_of_embeddings, no_of_heads),
          Block(no_of_embeddings, no_of_heads),
          Block(no_of_embeddings, no_of_heads),
          nn.LayerNorm(no_of_embeddings)
      )
      self.language_model_head = nn.Linear(no_of_embeddings, vocab_size)

    def forward(self, idx, targets=None):
      batch, blocksize = idx.shape
      token_embedding = self.embedding(idx)
      block_size_values = torch.arange(blocksize, device=device)
      position_embedding = self.position_embedding(block_size_values)
      tokpos_embedding = token_embedding + position_embedding
      # print (tokpos_embedding.shape, token_embedding.shape, position_embedding.shape)
      self_attn_out = self.blocks(tokpos_embedding)
      logits = self.language_model_head(self_attn_out)

      if targets is None:
          loss = None
      else:
          B,T,C = logits.shape
          logits = logits.view(B*T,C)
          targets = targets.view(B*T)
          loss = F.cross_entropy(logits, targets)

      return logits, loss

    def generate(self, idx, max_new_tokens):
      for _ in range(max_new_tokens):
          idx_condition = idx[:, -block_size:]
          # print (idx_condition)
          logits, loss = self(idx_condition)
          logits = logits[:, -1,  :]
          probs = F.softmax(logits, dim=-1)
          idx_next = torch.multinomial(probs, num_samples=1)
          idx = torch.cat((idx, idx_next), dim=1)
      return idx

model = LLM(vocab_size, block_size, vocab_size).to(device)
logits, loss = model(samples_x, samples_y)
print (logits.shape, loss)

torch.Size([16384, 13698]) tensor(9.6857, device='cuda:0', grad_fn=<NllLossBackward0>)


In [21]:
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

### Training

In [22]:
for step in range(total_iters):
    if step % evaluation_iters == 0:
      losses = estimation_loss()
      print("Training data loss = ", losses['train'].item(), "Test data loss = ", losses['test'].item())
    samples_x, samples_y = get_data_for_processing(True)
    logits, loss = model(samples_x, samples_y)
    optimizer.zero_grad()

    loss.backward()
    optimizer.step()


Training data loss =  9.685654640197754 Test data loss =  9.68315315246582
Training data loss =  4.157632827758789 Test data loss =  5.438475131988525
Training data loss =  3.4296507835388184 Test data loss =  5.6463623046875
Training data loss =  2.558993339538574 Test data loss =  6.084308624267578
Training data loss =  1.604283094406128 Test data loss =  6.748580455780029
Training data loss =  0.9546141028404236 Test data loss =  7.504760265350342
Training data loss =  0.5714260935783386 Test data loss =  8.206672668457031
Training data loss =  0.38156673312187195 Test data loss =  8.755910873413086
Training data loss =  0.2747141420841217 Test data loss =  9.267433166503906
Training data loss =  0.21708902716636658 Test data loss =  9.593343734741211
Training data loss =  0.18563181161880493 Test data loss =  9.927550315856934
Training data loss =  0.16241146624088287 Test data loss =  10.208747863769531
Training data loss =  0.1444084495306015 Test data loss =  10.530871391296387


## Example generation

In [63]:
g = model.generate(torch.tensor([tokenized_data[8000:8256]], dtype=torch.long, device=device), max_new_tokens=256)

print ("INPUT sequence (256 chars)")
print (" ".join([l2w[i] for i in tokenized_data[8000:8256]]))

print ("Output generated (256 chars)")
print(" ".join([l2w[i] for i in g.cpu().numpy()[0][256:]]))



INPUT sequence (256 chars)
, I ’ ll bet you. ” He was gobbling mincemeat , meatbone , bread , cheese , and pork pie , all at once : staring distrustfully while he did so at the mist all round us , and often stopping—even stopping his jaws—to listen . Some real or fancied sound , some clink upon the river or breathing of beast upon the marsh , now gave him a start , and he said , suddenly , — “ You ’ re not a deceiving imp ? You brought no one with you ? ” “ No , sir ! No ! ” “ Nor giv ’ no one the office to follow you ? ” “ No ! ” “ Well , ” said he , “ I believe you . You ’ d be but a fierce young hound indeed , if at your time of life you could help to hunt a wretched warmint hunted as near death and dunghill as this poor wretched warmint is ! ” Something clicked in his throat as if he had works in him like a clock , and was going to strike . And he smeared his ragged rough sleeve over his eyes . Pitying his desolation , and watching him as he gradually settled down upon the pie , I 