This notebook follows the approach described in the paper [A Neural Probabilistic Language Model (Bengio et al., 2003)](https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf)

In [3]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [1]:
!wget https://raw.githubusercontent.com/suvigyajain0101/NLP/llms/LLMs/names.txt

--2023-05-06 21:48:17--  https://raw.githubusercontent.com/suvigyajain0101/NLP/llms/LLMs/names.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 228145 (223K) [text/plain]
Saving to: ‘names.txt’


2023-05-06 21:48:17 (8.31 MB/s) - ‘names.txt’ saved [228145/228145]



In [8]:
# Read the dataset, one name per line. The context manager closes the file
# handle as soon as the read finishes instead of leaving it open until
# garbage collection.
with open('names.txt') as f:
  words = f.read().splitlines()

# Peek at the first few names to sanity-check the load
words[0:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [9]:
# Number of names read from names.txt
len(words)

32033

In [11]:
# Character vocabulary: the distinct characters found across all names, sorted
chars = sorted(set(''.join(words)))
assert len(chars) == 26

# string -> integer: the sorted characters get 1..26, and index 0 is reserved for '.'
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0

# integer -> string: the inverse lookup, built by flipping every (char, index) pair
itos = {idx: ch for ch, idx in stoi.items()}

print(stoi)
print(itos)

{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '.': 0}
{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


### Data Preparation

In [12]:
# Context length: how many preceding characters are used to predict the next one
block_size = 3

# Collected as plain Python lists first, converted to tensors once at the end
contexts, targets = [], []
for w in words[:5]:

  print(w)
  # Every name starts from a window filled with index 0 (the '.' symbol)
  context = [0] * block_size
  for ch in w + '.':
    ix = stoi[ch]
    contexts.append(context)
    targets.append(ix)

    print(''.join(itos[i] for i in context), '------>', itos[ix])

    # Slide the window: drop the oldest index and append the one just seen.
    # A new list is created each time, so rows already stored are not affected.
    context = [*context[1:], ix]


# One row per (context, next character) pair
X = torch.tensor(contexts)
Y = torch.tensor(targets)


emma
... ------> e
..e ------> m
.em ------> m
emm ------> a
mma ------> .
olivia
... ------> o
..o ------> l
.ol ------> i
oli ------> v
liv ------> i
ivi ------> a
via ------> .
ava
... ------> a
..a ------> v
.av ------> a
ava ------> .
isabella
... ------> i
..i ------> s
.is ------> a
isa ------> b
sab ------> e
abe ------> l
bel ------> l
ell ------> a
lla ------> .
sophia
... ------> s
..s ------> o
.so ------> p
sop ------> h
oph ------> i
phi ------> a
hia ------> .


In [17]:
# One example from the dataset: a context of character indices and the index of the character that follows it
print(X[1], Y[1])

# Shapes and dtypes of the inputs (X) and labels (Y)
X.shape, X.dtype, Y.shape, Y.dtype

tensor([0, 0, 5]) tensor(13)


(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

#### Embedding Matrix

The awesomeness of Pytorch indexing!

In [18]:
# Embedding table: one row per symbol (26 letters + '.' = 27), each embedded in 2 dimensions.
# The values are drawn from a seeded generator so that the numbers displayed
# in the following cells are the same after every kernel restart.
g = torch.Generator().manual_seed(10)
C = torch.randn((27, 2), generator=g)

In [19]:
# To fetch the embedding of a single character from C there are two options:
# 1. Index the tensor directly, just like a Python list
# 2. Matrix multiplication: a one-hot row vector multiplied by C picks out the row of interest
# Indexing skips building the one-hot vector altogether, so we use it here and
# park the matrix-multiplication view for now
C[5]

tensor([0.4540, 0.2732])

In [21]:
# Instead of a single int, we want to index C with the whole integer tensor X at once.
# PyTorch supports this: it returns one row of C for every entry of X, so the
# result has X's shape with the embedding dimension appended.
C[X].shape

torch.Size([32, 3, 2])

Each of the 32 × 3 entries of the input is replaced by its 2-element row of the embedding matrix, giving a tensor of shape (32, 3, 2)

In [23]:
# Sanity check: looking up one input row by hand must give the same
# embeddings as the corresponding slice of the batched lookup

test_row = 3
test_output = C[X]

row_indices = X[test_row]
print('input row: ', row_indices)
print('embedding matrix row: ', C[row_indices])
print('Indexed row: ', test_output[test_row])

input row:  tensor([ 5, 13, 13])
embedding matrix row:  tensor([[ 0.4540,  0.2732],
        [ 1.8485, -0.3960],
        [ 1.8485, -0.3960]])
Indexed row:  tensor([[ 0.4540,  0.2732],
        [ 1.8485, -0.3960],
        [ 1.8485, -0.3960]])


Matches perfectly!

In [24]:
# Embeddings for every position of every context in the dataset
emb = C[X]
emb.shape

torch.Size([32, 3, 2])

#### Hidden layer

In [25]:
# Hidden-layer parameters, drawn from a seeded generator so that the values
# computed from them are the same on every run.
g = torch.Generator().manual_seed(11)

# 6 inputs because every context block of 3 chars contributes a 2-dim embedding (3 * 2)
# 100 hidden units is a hyper-parameter
W1 = torch.randn((6, 100), generator=g)

b1 = torch.randn(100, generator=g)

In [26]:
# To compute emb @ W1 + b1 we first need to reshape emb:
# emb has shape (32, 3, 2) while W1 has shape (6, 100), so they cannot be multiplied as they are
# PyTorch offers several ways to do the reshaping

In [30]:
# 1. Using cat: slice out each of the 3 context positions and concatenate them along dim 1
torch.cat([emb[:, 0, :] , emb[:, 1, :] , emb[:, 2, :]], 1).shape

# Drawback: the slices are hard-coded, so this line has to be edited every time block_size changes

torch.Size([32, 6])

In [33]:
# 2. Using unbind: split emb along dim 1 into a tuple of per-position tensors, then concatenate those along dim 1
torch.cat(torch.unbind(emb, 1), 1).shape

# No hard-coded positions, but costly, especially memory-wise

torch.Size([32, 6])

In [34]:
# 3. Re-arranging using torch internals: update the view of the tensor so it is read as 32 rows of 6 values
# NOTE: 32 is the number of examples in the current X; it is hard-coded here
emb.view(32, 6).shape

# Fastest and cheapest method of doing this

torch.Size([32, 6])

In [43]:
# Verify that all three flattening approaches produce the same tensor
# torch.all is True only when the condition holds for every element
via_slices = torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], 1)
via_unbind = torch.cat(torch.unbind(emb, 1), 1)
via_view = emb.view(32, 6)

print(torch.all(via_slices == via_unbind))
print(torch.all(via_slices == via_view))


tensor(True)
tensor(True)


Getting back to the hidden layer

In [45]:
# Flatten each example's embeddings into a row of 6 values, then apply the affine map and the tanh non-linearity
h0 = emb.view(-1, 6) @ W1 + b1 # With -1, PyTorch infers that dimension from the number of elements
h = torch.tanh(h0)

h.shape

torch.Size([32, 100])

#### Softmax layer and Loss

In [46]:
# Output-layer parameters: 100 hidden activations in, 27 scores out (one per symbol).
# Drawn from a seeded generator so that the loss values computed from them are
# the same on every run.
g = torch.Generator().manual_seed(12)
W2 = torch.randn((100, 27), generator=g)
b2 = torch.randn(27, generator=g)

In [47]:
# Output layer: one unnormalised score (logit) per symbol for every example
logits = h @ W2 + b2
logits.shape

torch.Size([32, 27])

In [53]:
# Softmax by hand: exponentiate the logits, then normalise every row so it sums to 1
counts = logits.exp()
probs = counts / counts.sum(1, keepdim=True)

# Notice the indexing of probs - for every row we extract the probability at that row's Y index.
# The row indices are derived from Y instead of being hard-coded to 32; with a
# hard-coded count this cell would silently average over only the first 32
# examples whenever it is run against a dataset of a different size.
loss = -probs[torch.arange(Y.shape[0]), Y].log().mean()
loss

tensor(17.3551)

In [54]:
# The manual computation above is for educational purposes only and is rarely used in practice;
# F.cross_entropy goes from the raw logits and the labels to the same loss value in a single call
loss = F.cross_entropy(logits, Y)
loss

tensor(17.3551)

A few reasons to prefer `F.cross_entropy`:
1. Efficient under-the-hood implementation. PyTorch does not create as many intermediate tensors, which saves memory. Both the forward and backward passes are also much more efficient.
2. Exp overflow - when an input grows large in the positive direction, `exp()` overflows to inf. Internally, PyTorch subtracts the maximum of the inputs before exponentiating, which avoids inf without changing the result (softmax is unchanged when the same constant is added to every logit, so [10, 15, 20, 25] gives the same output as [9, 14, 19, 24]).

#### Training Loop

Putting it all together

In [65]:
# Re-check the dataset shapes before training
X.shape, Y.shape

(torch.Size([32, 3]), torch.Size([32]))

In [66]:
# A single seeded generator feeds every draw below, so the initial weights are
# identical on each run. The draw order (C, W1, b1, W2, b2) determines the values.
g = torch.Generator().manual_seed(10)

# Layer sizes, named once instead of repeated as literals
vocab_size, emb_dim, n_hidden = 27, 2, 100
n_inputs = 3 * emb_dim  # 3 context characters, each contributing emb_dim values

C = torch.randn((vocab_size, emb_dim), generator=g)    # embedding table
W1 = torch.randn((n_inputs, n_hidden), generator=g)    # hidden-layer weights
b1 = torch.randn(n_hidden, generator=g)                # hidden-layer bias
W2 = torch.randn((n_hidden, vocab_size), generator=g)  # output-layer weights
b2 = torch.randn(vocab_size, generator=g)              # output-layer bias

# Everything that gets trained, gathered in one list
parameters = [C, W1, b1, W2, b2]

In [67]:
# Add up the number of scalar entries across all parameter tensors
print('Total trainable parameters: ', sum(p.nelement() for p in parameters))

Total trainable parameters:  3481


In [68]:
# Switch on gradient tracking for every parameter so that backward() fills in .grad
for p in parameters:
  p.requires_grad_(True)


In [69]:
# Step size of the gradient-descent update
learning_rate = 0.1

for k in range(10):
  # Forward Pass
  emb = C[X]
  # Flatten each example's embeddings into one row; -1 lets PyTorch infer the
  # row width, so nothing here is tied to a particular block_size or embedding size
  h0 = emb.view(emb.shape[0], -1) @ W1 + b1
  h = torch.tanh(h0)
  logits = h @ W2 + b2
  loss = F.cross_entropy(logits, Y)
  print(f'Epoch {k} -> Loss = {loss.item()} ')

  # Backward pass
  # Reset the gradients first: backward() accumulates into .grad
  for p in parameters:
    p.grad = None
  loss.backward()

  # Gradient Update
  # The in-place update runs under no_grad so that autograd does not record it;
  # this replaces writing through the legacy .data attribute
  with torch.no_grad():
    for p in parameters:
      p -= learning_rate * p.grad

Epoch 0 -> Loss = 19.009180068969727 
Epoch 1 -> Loss = 15.267385482788086 
Epoch 2 -> Loss = 12.383593559265137 
Epoch 3 -> Loss = 10.268889427185059 
Epoch 4 -> Loss = 8.576563835144043 
Epoch 5 -> Loss = 7.1071906089782715 
Epoch 6 -> Loss = 5.814657688140869 
Epoch 7 -> Loss = 4.639155387878418 
Epoch 8 -> Loss = 3.6986703872680664 
Epoch 9 -> Loss = 3.0502424240112305 


#### On Complete dataset 

In [81]:
# Context length: how many preceding characters are used to predict the next one
block_size = 3

# Collected as plain Python lists first, converted to tensors once at the end
contexts, targets = [], []
for w in words:
  # Every name starts from a window filled with index 0 (the '.' symbol)
  context = [0] * block_size
  for ch in w + '.':
    ix = stoi[ch]
    contexts.append(context)
    targets.append(ix)

    # Slide the window forward by one character
    context = [*context[1:], ix]

X = torch.tensor(contexts)
Y = torch.tensor(targets)

X.shape, Y.shape

(torch.Size([228146, 3]), torch.Size([228146]))

In [82]:
# Re-create the parameters from the same seed so this run starts from the same
# initial weights. The shapes are listed in draw order: C, W1, b1, W2, b2.
g = torch.Generator().manual_seed(10)
shapes = [(27, 2), (6, 100), (100,), (100, 27), (27,)]

# Draw every tensor from the shared generator, then bind the individual names
parameters = [torch.randn(shape, generator=g) for shape in shapes]
C, W1, b1, W2, b2 = parameters

In [83]:
# Add up the number of scalar entries across all parameter tensors
print('Total trainable parameters: ', sum(p.nelement() for p in parameters))

Total trainable parameters:  3481


In [84]:
# Switch on gradient tracking for every parameter so that backward() fills in .grad
for p in parameters:
  p.requires_grad_(True)

In [85]:
# Step size of the gradient-descent update
learning_rate = 0.1

for k in range(1000):
  # Forward Pass
  emb = C[X]
  # Flatten each example's embeddings into one row; -1 lets PyTorch infer the
  # row width, so nothing here is tied to a particular block_size or embedding size
  h0 = emb.view(emb.shape[0], -1) @ W1 + b1
  h = torch.tanh(h0)
  logits = h @ W2 + b2
  loss = F.cross_entropy(logits, Y)

  # Report the loss only every 100th iteration to keep the output short
  if k%100 == 0:
    print(f'Epoch {k} -> Loss = {loss.item()} ')

  # Backward pass
  # Reset the gradients first: backward() accumulates into .grad
  for p in parameters:
    p.grad = None
  loss.backward()

  # Gradient Update
  # The in-place update runs under no_grad so that autograd does not record it;
  # this replaces writing through the legacy .data attribute
  with torch.no_grad():
    for p in parameters:
      p -= learning_rate * p.grad

Epoch 0 -> Loss = 18.250364303588867 
Epoch 100 -> Loss = 3.490938901901245 
Epoch 200 -> Loss = 2.9825150966644287 
Epoch 300 -> Loss = 2.7816600799560547 
Epoch 400 -> Loss = 2.689009428024292 
Epoch 500 -> Loss = 2.6442503929138184 
Epoch 600 -> Loss = 2.613981008529663 
Epoch 700 -> Loss = 2.590808153152466 
Epoch 800 -> Loss = 2.573031187057495 
Epoch 900 -> Loss = 2.55879545211792 
