In [1]:
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import torch
import torch.nn.functional as F

In [3]:
words = open('names.txt').read().splitlines()
len(words), words[:8]

(32033,
 ['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia'])

In [4]:
# let's only have one special token, and let's have it at index 0, offset others by 1
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
num_classes = len(stoi)

In [5]:
# Whole dataset

block_size = 3 # How many characters do we take to predict the next one : 3 chars to predict the 4th
X, Y, = [], [] # X, input | Y, labels

for w in words:
    context = [0] * block_size
    for ch in w + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        context = context[1:] + [ix] # crop and append moving window

X = torch.tensor(X)
Y = torch.tensor(Y)

X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([228146, 3]), torch.int64, torch.Size([228146]), torch.int64)

In [6]:
g = torch.Generator().manual_seed(2147483647)
C = torch.randn((27, 2), generator=g)
W1 = torch.randn((3*2, 100), generator=g)
b1 = torch.randn(100, generator=g)
W2 = torch.randn((100, 27), generator=g)
b2 = torch.randn(27, generator=g)
parameters = [C, W1, b1, W2, b2]


sum(p.nelement() for p in parameters) # total number of parameters

3481

In [7]:
# takes a while since we're doing the full loop through the whole dataset
# Enter MINIBATCHES
# Lets generate random integers so we can index into them to pick the minibatch
# random integer between 0 - dataset len
# number of batch members ; minibatch size; 32

In [8]:
torch.randint(0, X.shape[0], (32,))

tensor([194483,  22928,  66699, 216253,  29443, 161234,  12611,   1401,  94299,
         62581,  39780, 183158,  29152, 146424, 161355,  97137,  96009,  62925,
        159493,  88772,  78550, 179464, 198123, 138012,  63116,     76, 166575,
        108796, 120224, 190953,   1930, 107308])

In [9]:
_ix = torch.randint(0, X.shape[0], (32,))
C[X].shape, C[X[_ix]].shape

(torch.Size([228146, 3, 2]), torch.Size([32, 3, 2]))

In [10]:
# parameters learnable
for p in parameters:
    p.requires_grad = True

In [11]:
for i in range(1000):
    # minibatches
    bs = 32
    ix = torch.randint(0, X.shape[0], (bs,))
    
    # forward pass
    emb = C[X[ix]] # [32, 3, 2]
    h = torch.tanh(emb.view(-1, W1.shape[0]) @ W1 + b1) # [32, 100]
    logits = h @ W2 + b2 # [32, 27]
    loss = F.cross_entropy(logits, Y[ix]) # also Ys have to be at the same index
    if (i+1)%100 == 0:
        print(f"Iteration {i}: Minibatch loss : {loss.item()}")

    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # update
    for p in parameters:
        p.data += -0.1 * p.grad

Iteration 99: Minibatch loss : 4.2988104820251465
Iteration 199: Minibatch loss : 3.0233213901519775
Iteration 299: Minibatch loss : 2.5719573497772217
Iteration 399: Minibatch loss : 2.8726754188537598
Iteration 499: Minibatch loss : 2.424332618713379
Iteration 599: Minibatch loss : 2.704420804977417
Iteration 699: Minibatch loss : 2.407240867614746
Iteration 799: Minibatch loss : 2.705643653869629
Iteration 899: Minibatch loss : 2.7069225311279297
Iteration 999: Minibatch loss : 2.4282259941101074


In [12]:
# Now that we're only doing minibatches the quality of our gradient is lower
# the direction is not as reliable - not the actual gradient direction
# but it's good enough even when it's estimating only on 32 examples
# it's much better to have an approximate gradient and take more steps than to calculate exact gradient and take few steps

# lets now get loss for all of X
emb = C[X]
h = torch.tanh(emb.view(-1, W1.shape[0]) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Y)
loss

tensor(2.7056, grad_fn=<NllLossBackward0>)

In [13]:
# How do we determine that we are stepping with right learning rate ?
# 0.1 used above
# in the next nb we find a good learning rate