In [1]:
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import torch
import torch.nn.functional as F

In [3]:
words = open('names.txt').read().splitlines()
len(words), words[:8]

(32033,
 ['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia'])

In [4]:
# let's only have one special token, and let's have it at index 0, offset others by 1
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
num_classes = len(stoi)

In [5]:
g = torch.Generator().manual_seed(2147483647)
C = torch.randn((27, 2), generator=g)
W1 = torch.randn((3*2, 100), generator=g)
b1 = torch.randn(100, generator=g)
W2 = torch.randn((100, 27), generator=g)
b2 = torch.randn(27, generator=g)
parameters = [C, W1, b1, W2, b2]


sum(p.nelement() for p in parameters) # total number of parameters

3481

In [6]:
# As the capacity of the NN grows (the number of parameters)
# it becomes more and more capable of overfitting the training set
# that means that loss on the training set becomes very low
# the model basically starts memorizing the training set
# NOT what we want : we want the model to generalize to cases(within the same context) that it hasn't seen before
# If you sample from it, you'll get examples exactly from the training set, NOT very useful
# Loss will be very high if you try to evaluate it on a withheld set
# NOT a good model

In [7]:
# Standard is to split the dataset into 3 splits (80/10/10)
# Training set     : Used for training, get the loss, optimize the parameters
# Validation split : Used for calculating the metrics, even the loss but don't use the loss to optimize the model, instead regularize
# Test split       : Used towards the end of the project, used very sparingly, do not use the feedback of this to retrain, might overfit to test as well otherwise, not a good test then

In [8]:
def build_dataset(words):
    block_size = 3 # How many characters do we take to predict the next one : 3 chars to predict the 4th
    X, Y, = [], [] # X, input | Y, labels

    for w in words:
        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix] # crop and append moving window

    X = torch.tensor(X)
    Y = torch.tensor(Y)    
    return X, Y

In [9]:
# Splits
import random
random.seed(42)
random.shuffle(words)

n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr, Ytr = build_dataset(words[:n1])
Xva, Yva = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])

Xtr.shape, Ytr.shape, Xva.shape, Yva.shape, Xte.shape, Yte.shape

(torch.Size([182625, 3]),
 torch.Size([182625]),
 torch.Size([22655, 3]),
 torch.Size([22655]),
 torch.Size([22866, 3]),
 torch.Size([22866]))

In [10]:
# Lets train with traing splits now

In [11]:
# parameters learnable
for p in parameters:
    p.requires_grad = True

In [12]:
lr = 0.1
steps = 500000

In [13]:
for i in range(steps):
    # minibatches
    bs = 32
    ix = torch.randint(0, Xtr.shape[0], (bs,))
    
    # forward pass
    emb = C[Xtr[ix]] # [32, 3, 2]
    h = torch.tanh(emb.view(-1, W1.shape[0]) @ W1 + b1) # [32, 100]
    logits = h @ W2 + b2 # [32, 27]
    loss = F.cross_entropy(logits, Ytr[ix]) # also Ys have to be at the same index
    if (i+1)%(steps/10) == 0:
        print(f"Iteration {i}: Minibatch loss : {loss.item()}")

    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # update
    for p in parameters:
        p.data += -lr * p.grad

Iteration 49999: Minibatch loss : 2.114920139312744
Iteration 99999: Minibatch loss : 2.352707624435425
Iteration 149999: Minibatch loss : 2.2840514183044434
Iteration 199999: Minibatch loss : 1.9364819526672363
Iteration 249999: Minibatch loss : 2.3005363941192627
Iteration 299999: Minibatch loss : 2.2722997665405273
Iteration 349999: Minibatch loss : 2.257194995880127
Iteration 399999: Minibatch loss : 2.0225448608398438
Iteration 449999: Minibatch loss : 2.1056764125823975
Iteration 499999: Minibatch loss : 2.3153648376464844


In [14]:
# Lets now get Training loss
emb = C[Xtr]
h = torch.tanh(emb.view(-1, W1.shape[0]) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Ytr)
loss

tensor(2.2589, grad_fn=<NllLossBackward0>)

In [15]:
# Lets now get Validation loss
emb = C[Xva]
h = torch.tanh(emb.view(-1, W1.shape[0]) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Yva)
loss

tensor(2.2702, grad_fn=<NllLossBackward0>)

In [16]:
# We see that the training and validation losses are about equal
# So we're def. not overfitting, i.e. memorizing the training set
# We could even possibly be underfitting, because we sort of expect the training loss to be lower than validation
# after all the network has seen the training set, so it's expected to do somewhat better with it
# Good idea to start out first by overfitting a bit
# In this case, it means that our network is very tiny, it's can memorize the training set well enough
# We can improve performance by scaling up, increase the size of the NN, so it's more powerful so it can memorize a bit more

In [17]:
# That's for the next nb