In [1]:
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import torch
import torch.nn.functional as F

In [3]:
words = open('names.txt').read().splitlines()
len(words), words[:8]

(32033,
 ['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia'])

In [4]:
# let's only have one special token, and let's have it at index 0, offset others by 1
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
num_classes = len(stoi)

In [5]:
# Whole dataset

block_size = 3 # How many characters do we take to predict the next one : 3 chars to predict the 4th
X, Y, = [], [] # X, input | Y, labels

for w in words:
    context = [0] * block_size
    for ch in w + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        context = context[1:] + [ix] # crop and append moving window

X = torch.tensor(X)
Y = torch.tensor(Y)

X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([228146, 3]), torch.int64, torch.Size([228146]), torch.int64)

In [6]:
g = torch.Generator().manual_seed(2147483647)
C = torch.randn((27, 2), generator=g)
W1 = torch.randn((3*2, 100), generator=g)
b1 = torch.randn(100, generator=g)
W2 = torch.randn((100, 27), generator=g)
b2 = torch.randn(27, generator=g)
parameters = [C, W1, b1, W2, b2]


sum(p.nelement() for p in parameters) # total number of parameters

3481

In [7]:
# parameters learnable
for p in parameters:
    p.requires_grad = True

In [8]:
# we have now figured out that a good LR is 0.1 for this dataset from prev notebook
# lets crank up the number of iterations

In [9]:
lr = 0.1
steps = 100000

In [10]:
for i in range(steps):
    # minibatches
    bs = 32
    ix = torch.randint(0, X.shape[0], (bs,))
    
    # forward pass
    emb = C[X[ix]] # [32, 3, 2]
    h = torch.tanh(emb.view(-1, W1.shape[0]) @ W1 + b1) # [32, 100]
    logits = h @ W2 + b2 # [32, 27]
    loss = F.cross_entropy(logits, Y[ix]) # also Ys have to be at the same index
    if (i+1)%(steps/10) == 0:
        print(f"Iteration {i}: Minibatch loss : {loss.item()}")

    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # update
    for p in parameters:
        p.data += -lr * p.grad

Iteration 9999: Minibatch loss : 2.532820701599121
Iteration 19999: Minibatch loss : 2.8786001205444336
Iteration 29999: Minibatch loss : 2.597663640975952
Iteration 39999: Minibatch loss : 2.59043550491333
Iteration 49999: Minibatch loss : 2.40293288230896
Iteration 59999: Minibatch loss : 2.1534323692321777
Iteration 69999: Minibatch loss : 2.4668455123901367
Iteration 79999: Minibatch loss : 2.1520745754241943
Iteration 89999: Minibatch loss : 2.352787971496582
Iteration 99999: Minibatch loss : 2.2285616397857666


In [11]:
# lets now get loss for all of X
emb = C[X]
h = torch.tanh(emb.view(-1, W1.shape[0]) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Y)
loss

tensor(2.3382, grad_fn=<NllLossBackward0>)

In [12]:
# Remember that our bigram loss was ~ 2.45

In [13]:
# Now let's try a concept called learning rate decay
# Towards the end of our training loop, esp. when the loss starts to pleateau a bit
# we progressively decay our learning rate, so as to take smaller steps
# for the sake of loss stability

In [14]:
lr = 0.01
steps = 100000

In [15]:
for i in range(steps):
    # minibatches
    bs = 32
    ix = torch.randint(0, X.shape[0], (bs,))
    
    # forward pass
    emb = C[X[ix]] # [32, 3, 2]
    h = torch.tanh(emb.view(-1, W1.shape[0]) @ W1 + b1) # [32, 100]
    logits = h @ W2 + b2 # [32, 27]
    loss = F.cross_entropy(logits, Y[ix]) # also Ys have to be at the same index
    if (i+1)%(steps/10) == 0:
        print(f"Iteration {i}: Minibatch loss : {loss.item()}")

    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # update
    for p in parameters:
        p.data += -lr * p.grad

Iteration 9999: Minibatch loss : 2.143094539642334
Iteration 19999: Minibatch loss : 2.0366854667663574
Iteration 29999: Minibatch loss : 2.177292585372925
Iteration 39999: Minibatch loss : 2.514849901199341
Iteration 49999: Minibatch loss : 2.857970952987671
Iteration 59999: Minibatch loss : 2.5583720207214355
Iteration 69999: Minibatch loss : 2.4936392307281494
Iteration 79999: Minibatch loss : 2.499969482421875
Iteration 89999: Minibatch loss : 2.360161542892456
Iteration 99999: Minibatch loss : 2.292132616043091


In [16]:
# lets now get loss for all of X
emb = C[X]
h = torch.tanh(emb.view(-1, W1.shape[0]) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Y)
loss

tensor(2.2586, grad_fn=<NllLossBackward0>)

In [18]:
# Dramatic improvement over the bigram model ~= 2.45