In [47]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

In [48]:
# read all words
words = open('names.txt', 'r').read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [49]:
# build vocab of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.']=0
itos = {i:s for s,i in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [63]:
# build the dataset
block_size = 3 # context length: how many characters do we take to predict the next one ?
X, Y = [], []
for w in words:
    context = [0] * block_size
    for ch in w + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        # print(''.join(itos[i] for i in context), '--->', itos[ix])
        context = context[1:] + [ix]
X = torch.tensor(X)
Y = torch.tensor(Y)

In [64]:
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([228146, 3]), torch.int64, torch.Size([228146]), torch.int64)

In [77]:
# build the dataset
def build_dataset(words):

    block_size = 3 # context length: how many characters do we take to predict the next one ?
    X, Y = [], []
    for w in words:
        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            # print(''.join(itos[i] for i in context), '--->', itos[ix])
            context = context[1:] + [ix]
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X, Y

import random
random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))
Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])

torch.Size([182580, 3]) torch.Size([182580])
torch.Size([22767, 3]) torch.Size([22767])
torch.Size([22799, 3]) torch.Size([22799])


In [84]:
g = torch.Generator().manual_seed(2147483647)
C = torch.randn((27,2), generator = g)
W1 = torch.randn((6,300), generator = g)
b1 = torch.randn(300, generator = g)
W2 = torch.randn((300, 27), generator = g)
b2 = torch.randn(27, generator = g)
parameters = [C, W1, b1, W2, b2]

In [85]:
sum(p.nelement() for p in parameters)

10281

In [86]:
for p in parameters:
    p.requires_grad = True

In [68]:
lre = torch.linspace(-3,0,1000)
lrs = 10**lre

In [95]:
lri, lossi = [], []

for i in range(30000):
    
    #minibatch construct
    ix = torch.randint(0, Xtr.shape[0], (32,))

    emb = C[Xtr[ix]] #(32, 3, 2)
    h = torch.tanh(emb.view(-1, 6) @W1 + b1) #(32, 100)
    logits = h @ W2 + b2 #(32, 27)
    loss = F.cross_entropy(logits, Ytr[ix])
    print(loss.item())
    #backward pass
    lr = 0.05
    for p in parameters:
        p.grad = None
    loss.backward()
    #update
    for p in parameters:
        p.data += -lr * p.grad
    # lri.append(lr)
    # lossi.append(loss.item())

2.2425196170806885
2.2019970417022705
2.4545223712921143
2.7106192111968994
2.5698540210723877
2.3921825885772705
2.617460250854492
2.6487767696380615
2.6906039714813232
2.281003713607788
2.3687198162078857
2.848817825317383
2.260282039642334
2.545578956604004
2.699298620223999
2.5364465713500977
2.364370107650757
2.291623830795288
2.4340121746063232
2.3388848304748535
2.197533369064331
2.3290677070617676
2.55251145362854
2.303994655609131
2.4293205738067627
2.508007287979126
2.7027435302734375
2.653895854949951
2.3761627674102783
2.236894130706787
2.7624294757843018
2.4786064624786377
2.470118522644043
2.542712450027466
2.1761810779571533
2.2927286624908447
2.6530935764312744
2.400623083114624
2.633798122406006
2.570307493209839
2.4069747924804688
2.3272817134857178
2.7741379737854004
2.5524888038635254
2.7065224647521973
2.4177310466766357
2.5169029235839844
2.4600865840911865
2.4994120597839355
1.9876492023468018
2.430772304534912
2.303936243057251
2.7695624828338623
2.7236242294311

In [96]:
emb = C[Xdev] # (32, 3, 2)
h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (32, 100)
logits = h @ W2 + b2 # (32, 27)
loss = F.cross_entropy(logits, Ydev)
loss

tensor(2.4749, grad_fn=<NllLossBackward0>)

In [None]:
# training split, dev/validation split, test split
# 80%, 10%, 10%