In [163]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

In [164]:
# Read in all the words: one name per line of names.txt.
# The `with` block closes the file handle as soon as the contents are read,
# and the explicit encoding keeps the result independent of the platform default.
with open('names.txt', 'r', encoding='utf-8') as f:
    words = f.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [165]:
# total number of names read from names.txt
len(words)

32033

In [166]:
# Character vocabulary and the two lookup tables between characters and ints.
# Real characters get indices 1..len(chars) in sorted order; index 0 is
# reserved for '.', the marker used for word boundaries.
chars = sorted({ch for w in words for ch in w})
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0
# invert the mapping: int -> character
itos = {idx: ch for ch, idx in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [167]:
# Build the (context -> next character) training examples.
# block_size is the context length: how many previous characters are used
# to predict the next one.
block_size = 3
X, Y = [], []
for w in words[:5]:
    print(w)
    # every word starts from a context made only of index 0 (the '.' token)
    context = block_size * [0]
    for ch in w + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        print(''.join([itos[i] for i in context]), '---->', itos[ix])
        # slide the window: drop the oldest index, append the newest;
        # this creates a new list, so the one stored in X is never mutated
        context = [*context[1:], ix]

# convert the Python lists to integer tensors
X, Y = torch.tensor(X), torch.tensor(Y)

emma
... ----> e
..e ----> m
.em ----> m
emm ----> a
mma ----> .
olivia
... ----> o
..o ----> l
.ol ----> i
oli ----> v
liv ----> i
ivi ----> a
via ----> .
ava
... ----> a
..a ----> v
.av ----> a
ava ----> .
isabella
... ----> i
..i ----> s
.is ----> a
isa ----> b
sab ----> e
abe ----> l
bel ----> l
ell ----> a
lla ----> .
sophia
... ----> s
..s ----> o
.so ----> p
sop ----> h
oph ----> i
phi ----> a
hia ----> .


In [168]:
# inspect the dataset tensors: X has one row of block_size context indices per
# example, Y has the index of the character that follows each context
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

In [169]:
# embedding lookup table: one row per vocabulary index (27 entries in itos),
# each row a 2-value vector, initialised with torch.randn
C = torch.randn((27,2))

In [170]:
# row 5 of the table, i.e. the 2-value embedding of index 5 ('e' in itos)
C[5]

tensor([-1.3687,  0.8670])

In [171]:
# index C with the whole integer tensor X at once: every index in X is
# replaced by its row of C, which appends the embedding dimension to X's shape
emb = C[X]
emb.shape

torch.Size([32, 3, 2])

In [172]:
#torch.cat(torch.unbind(emb, 1), 1).shape # works, but is less efficient than .view(): concatenating creates a new tensor, which takes up extra memory
#emb.view(32, 6) == torch.cat(torch.unbind(emb, 1), 1) # this comparison is True for every element, showing that .view() produces the same result as unbind + cat; .view() only reinterprets the existing storage, so it uses no extra memory and is simply more efficient

In [173]:
# hidden layer parameters: 6 inputs (3 context positions x 2 embedding values
# each) mapped to 100 hidden units, plus one bias per hidden unit
W1 = torch.randn((6,100))
b1 = torch.randn(100)

In [174]:
# hidden layer: flatten the 3 embeddings of each example into one row of 6
# values, apply the affine map (b1 is broadcast over the rows), then tanh
# instead of emb.shape[0] you can put -1 and pytorch will infer the shape
h = torch.tanh(emb.view(emb.shape[0], 6) @ W1 + b1)

In [175]:
# display the hidden activations; they come out of tanh, so all lie in [-1, 1]
h

tensor([[ 0.9858, -0.9993,  0.9861,  ...,  0.5308, -0.9999,  0.9579],
        [ 0.7664, -0.9973, -0.9330,  ...,  0.9999, -0.9986,  0.9998],
        [ 0.8577,  0.9886, -0.8883,  ...,  0.6646, -0.9996, -0.8549],
        ...,
        [-0.9469, -0.9917,  0.6785,  ...,  0.9808, -0.7611,  0.9899],
        [ 0.6100,  0.6857, -0.1497,  ...,  0.9634, -0.9990,  0.5112],
        [ 0.4269,  0.9952,  0.9121,  ..., -0.9945, -0.9961,  0.9595]])

In [176]:
# one row of 100 hidden activations per example
h.shape

torch.Size([32, 100])

In [177]:
# output layer parameters: 100 hidden units mapped to 27 scores, one per
# vocabulary index, plus one bias per score
W2 = torch.randn((100, 27))
b2 = torch.randn(27)

In [178]:
# output layer: raw, unnormalised scores (logits) for every example
logits = h @ W2 + b2

In [179]:
# one row of 27 logits per example
logits.shape

torch.Size([32, 27])

In [180]:
# exponentiate the logits to get positive, unnormalised scores ("counts")
# NOTE(review): exp() of a large logit can overflow; the training loop further
# down uses F.cross_entropy instead, which is numerically better behaved
counts = logits.exp()

In [181]:
# normalise each row to sum to 1; keepdim=True keeps the row sums as a column
# so the division broadcasts across the 27 entries of every row
prob = counts / counts.sum(1, keepdim=True)

In [182]:
# same shape as logits: one probability per vocabulary index per example
prob.shape

torch.Size([32, 27])

In [183]:
# Average negative log-likelihood of the correct next character.
# prob[torch.arange(n), Y] picks, for every row i, the probability assigned to
# the true label Y[i]. The row count is taken from Y rather than hard-coded as
# 32, so this cell keeps working when the dataset is built from more words.
loss = -prob[torch.arange(Y.shape[0]), Y].log().mean()
loss

tensor(17.4266)

In [184]:
# reorganize everything above-------------------------------------------------------------------------------------------------

In [185]:
# recap before the reorganised version: X holds the integer contexts,
# Y the index of the character that follows each context
X.shape, Y.shape # dataset

(torch.Size([32, 3]), torch.Size([32]))

In [186]:
# All model parameters, drawn from one seeded generator so that the
# initialisation is reproducible. The order of the randn calls matters:
# each call consumes the generator's random stream in sequence.
g = torch.Generator().manual_seed(2147483647)
C = torch.randn(27, 2, generator=g)      # embedding table
W1 = torch.randn(6, 100, generator=g)    # hidden layer weights
b1 = torch.randn(100, generator=g)       # hidden layer bias
W2 = torch.randn(100, 27, generator=g)   # output layer weights
b2 = torch.randn(27, generator=g)        # output layer bias
parameters = [
    C,
    W1, b1,
    W2, b2,
]

In [187]:
# add up the element counts of all parameter tensors
sum(p.nelement() for p in parameters) # num params in total

3481

In [188]:
# switch on gradient tracking for every parameter so that loss.backward()
# fills in their .grad fields
for p in parameters:
    p.requires_grad_(True)

In [189]:
# Plain full-batch gradient descent on the whole dataset.
num_steps = 100       # number of gradient-descent iterations
learning_rate = 0.1   # step size of each update

for _ in range(num_steps):
    # forward pass
    emb = C[X] # (32, 3, 2)
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (32, 100)
    logits = h @ W2 + b2 # (32, 27)
    # F.cross_entropy(logits, Y) is equivalent to the manual computation
    #     counts = logits.exp()
    #     prob = counts / counts.sum(1, keepdim=True)
    #     loss = -prob[torch.arange(32), Y].log().mean()
    # but pytorch does not create all of those intermediate tensors, so the
    # forward/backward pass is more efficient (no extra memory is taken up)
    # and it is numerically more well behaved.
    loss = F.cross_entropy(logits, Y)
    print(loss.item())
    # backward pass: clear the old gradients, then backpropagate
    for p in parameters:
        p.grad = None
    loss.backward()
    # update: modify the parameters in place under no_grad so the step itself
    # is not recorded by autograd (preferred over writing through p.data,
    # which bypasses autograd's safety checks)
    with torch.no_grad():
        for p in parameters:
            p -= learning_rate * p.grad


17.769712448120117
13.656402587890625
11.298768997192383
9.452457427978516
7.984262466430664
6.891321182250977
6.1000142097473145


5.452036380767822
4.898151874542236
4.414664268493652
3.985849142074585
3.6028308868408203
3.262141704559326
2.961381196975708
2.6982972621917725
2.469712972640991
2.271660804748535
2.101283550262451
1.9571771621704102
1.8374855518341064
1.7380964756011963
1.6535117626190186
1.579089879989624
1.5117664337158203
1.449604868888855
1.3913120031356812
1.3359923362731934
1.283052682876587
1.2321912050247192
1.18338143825531
1.1367988586425781
1.092664361000061
1.0510926246643066
1.0120269060134888
0.9752706289291382
0.9405567049980164
0.9076126217842102
0.8761922717094421
0.8460891246795654
0.8171356916427612
0.78919917345047
0.7621746063232422
0.7359814047813416
0.7105579972267151
0.6858610510826111
0.6618654131889343
0.638565719127655
0.6159819960594177
0.5941659808158875
0.573210597038269
0.5532563328742981
0.5344882011413574
0.5171167850494385
0.5013313293457031
0.4872424304485321
0.47484028339385986
0.4639976918697357
0.4545143246650696
0.44617074728012085
0.4387662708759308
0.43213310