This notebook follows the approach described in this [paper](https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf) (Bengio et al., 2003, *A Neural Probabilistic Language Model*).

In [3]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [1]:
# Fetch the names dataset. -nc (no-clobber) skips the download when names.txt already exists,
# so re-running this cell does not pile up duplicate copies (names.txt.1, names.txt.2, ...).
!wget -nc https://raw.githubusercontent.com/suvigyajain0101/NLP/llms/LLMs/names.txt

--2023-05-06 21:48:17--  https://raw.githubusercontent.com/suvigyajain0101/NLP/llms/LLMs/names.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 228145 (223K) [text/plain]
Saving to: ‘names.txt’


2023-05-06 21:48:17 (8.31 MB/s) - ‘names.txt’ saved [228145/228145]



In [8]:
# Load the dataset: one name per line.
# The context manager closes the file handle deterministically instead of leaving it
# open until garbage collection; the explicit encoding avoids platform-dependent defaults.
with open('names.txt', encoding='utf-8') as f:
  words = f.read().splitlines()

# Peek at the first ten names
words[0:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [9]:
# Number of names (lines) read from names.txt
len(words)

32033

In [11]:
# Character vocabulary: every distinct character that occurs in the names, in sorted order
chars = sorted(set(''.join(words)))
assert len(chars) == 26

# Forward lookup (character -> index): letters take 1..26,
# index 0 is reserved for '.', the token that marks the boundary of a name
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0

# Reverse lookup (index -> character)
itos = {index: char for char, index in stoi.items()}

print(stoi)
print(itos)

{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '.': 0}
{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


### Data Preparation

In [12]:
# Context length: number of preceding characters used to predict the next one
block_size = 3

# Slide a fixed-size window over each name (only the first five names, for illustration)
# and collect (context indices, next-character index) pairs.
contexts, targets = [], []
for w in words[:5]:
  print(w)
  # Each name starts from a context made entirely of index 0, i.e. '.'
  context = [0] * block_size
  for ch in w + '.':
    ix = stoi[ch]
    contexts.append(context)
    targets.append(ix)

    window = ''.join(itos[i] for i in context)
    print(window, '------>', itos[ix])

    # Drop the oldest index and append the newest one; this builds a new list,
    # so the contexts already stored are left untouched
    context = [*context[1:], ix]

# Convert the collected examples to tensors
X = torch.tensor(contexts)
Y = torch.tensor(targets)


emma
... ------> e
..e ------> m
.em ------> m
emm ------> a
mma ------> .
olivia
... ------> o
..o ------> l
.ol ------> i
oli ------> v
liv ------> i
ivi ------> a
via ------> .
ava
... ------> a
..a ------> v
.av ------> a
ava ------> .
isabella
... ------> i
..i ------> s
.is ------> a
isa ------> b
sab ------> e
abe ------> l
bel ------> l
ell ------> a
lla ------> .
sophia
... ------> s
..s ------> o
.so ------> p
sop ------> h
oph ------> i
phi ------> a
hia ------> .


In [17]:
# One example from the dataset: the context indices and the index of the character that follows
print(X[1], Y[1])

# Shapes and dtypes of the input and label tensors
X.shape, X.dtype, Y.shape, Y.dtype

tensor([0, 0, 5]) tensor(13)


(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

#### Embedding Matrix

The awesomeness of PyTorch indexing!

In [18]:
# Embedding table: one row per character (26 letters + '.'), each embedded in 2 dimensions.
# A dedicated, seeded generator makes the random initialisation reproducible across
# kernel restarts without touching PyTorch's global RNG state.
init_gen = torch.Generator().manual_seed(2147483647)
C = torch.randn((27, 2), generator=init_gen)

In [19]:
# To fetch the embedding of a single character from C, there are two approaches:
# 1. Index the tensor directly, just like a Python list
# 2. Matrix multiplication -> a one-hot vector multiplied by C picks out the row of interest
# Indexing is much faster, so we use it here and park the matrix-multiplication idea for now
C[5]

tensor([0.4540, 0.2732])

In [21]:
# Instead of a single int, we want to look up every index in X (shape (32, 3)) at once
# PyTorch handles this: indexing C with an integer tensor selects one row of C per entry of X
C[X].shape

torch.Size([32, 3, 2])

Each of the 32 × 3 indices in the input is replaced by its 2-element row of the embedding matrix, giving a result of shape (32, 3, 2).

In [23]:
# Verify that indexing C with the whole tensor X agrees with indexing one example at a time

test_output = C[X]

test_row = 3
print('input row: ', X[test_row])
print('embedding matrix row: ', C[X[test_row]])
print('Indexed row: ', test_output[test_row])

# Check programmatically as well, so a mismatch fails loudly on re-run
# instead of relying on comparing the printed tensors by eye
assert torch.equal(C[X[test_row]], test_output[test_row])

input row:  tensor([ 5, 13, 13])
embedding matrix row:  tensor([[ 0.4540,  0.2732],
        [ 1.8485, -0.3960],
        [ 1.8485, -0.3960]])
Indexed row:  tensor([[ 0.4540,  0.2732],
        [ 1.8485, -0.3960],
        [ 1.8485, -0.3960]])


Matches perfectly!