# makemore MLP
If we take more context (more characters), it quickly gets unmanageable, e.g. 3 characters of context = 27^3 = 19,683 possibilities

In [1]:
import torch, torch.nn.functional as F, matplotlib.pyplot as plt

In [2]:
# Load the dataset as a list with one entry per line. The `with` block closes
# the file handle as soon as the read finishes instead of leaving it open.
with open("../names.txt") as names_file:
    words = names_file.read().splitlines()

In [4]:
words[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [5]:
# Vocabulary: every distinct character found in `words` plus the '.' token
# that marks the start/end of a word, in sorted order.
chars = sorted([*set(''.join(words)), '.'])
# itos: integer index -> character; stoi: the inverse lookup.
itos = dict(enumerate(chars))
stoi = {ch: idx for idx, ch in itos.items()}

In [6]:
# build the dataset from the first five words, printing every example
block_size = 3  # context length: number of previous characters fed to the model
inputs, targets = [], []

for name in words[:5]:
    print(name)
    # every word starts from a window filled with index 0
    window = [0] * block_size
    for char in name + '.':
        target = stoi[char]
        inputs.append(window)
        targets.append(target)
        print(f"{''.join(itos[i] for i in window)} --> {itos[target]}")
        # slide the window: drop the oldest index, append the newest
        window = [*window[1:], target]

# X holds one row of `block_size` context indices per example, Y the index to predict
X = torch.tensor(inputs)
Y = torch.tensor(targets)

emma
... --> e
..e --> m
.em --> m
emm --> a
mma --> .
olivia
... --> o
..o --> l
.ol --> i
oli --> v
liv --> i
ivi --> a
via --> .
ava
... --> a
..a --> v
.av --> a
ava --> .
isabella
... --> i
..i --> s
.is --> a
isa --> b
sab --> e
abe --> l
bel --> l
ell --> a
lla --> .
sophia
... --> s
..s --> o
.so --> p
sop --> h
oph --> i
phi --> a
hia --> .


In [7]:
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

In [8]:
# tangent: create the embedding matrix. We are embedding 27 characters in a 2D embedding
C = torch.randn((27,2))
C.dtype

torch.float32

In [104]:
# Two equivalent ways to fetch the embedding of character index 5:
# plain row indexing, and a one-hot row vector multiplied into C.
# F.one_hot returns an int64 (long) tensor by default, so it is cast to float
# first because both matmul operands must share a dtype.
index_directly = C[5]
one_hot_method = F.one_hot(torch.tensor(5), num_classes=27).float() @ C
torch.equal(index_directly, one_hot_method)

True

In [105]:
C[5]

tensor([1.0599, 0.4453])

In [106]:
# index using lists
C[[5,6,7]]

tensor([[ 1.0599,  0.4453],
        [-0.7089, -0.5231],
        [-0.7756, -0.7479]])

In [107]:
# can index multiple times and get result in multiple
C[torch.tensor([5,6,7,7,7])]

tensor([[ 1.0599,  0.4453],
        [-0.7089, -0.5231],
        [-0.7756, -0.7479],
        [-0.7756, -0.7479],
        [-0.7756, -0.7479]])

In [108]:
C.shape

torch.Size([27, 2])

In [110]:
C[X].shape # gets embedding vector for each X

torch.Size([32, 3, 2])

In [113]:
# integer is 1
X[13,2]

tensor(1)

In [116]:
# check that the integer at that location of C is the same
C[X][13,2], C[1]

(tensor([-0.6016,  1.6488]), tensor([-0.6016,  1.6488]))

In [115]:
C[1]

tensor([-0.6016,  1.6488])

In [176]:
##########################
emb = C[X]
emb.shape

torch.Size([32, 3, 2])

In [177]:
# setup first layer
W1 = torch.rand((6,100))
b1 = torch.rand(100)

In [137]:
# tangent: a different way to use torch, and a look at torch internals.
# We want emb @ W1, but the shapes do not line up: (32, 3, 2) @ (6, 100).
# First pull out the embedding of each of the three context positions ...
first_char_embed, second_char_embed, third_char_embed = (emb[:, pos] for pos in range(3))

# ... then concatenate them across dim 1 so every example becomes one row of 6 values
torch.cat([first_char_embed, second_char_embed, third_char_embed], dim=1).shape

torch.Size([32, 6])

In [138]:
# the above works, but the code would have to change for a bigger block_size; unbind avoids that:
# unbind returns a tuple of all slices along a given dimension
torch.unbind(emb, 1)
torch.unbind(emb,1)[0].shape # 3 slices in the tuple (one per context position), each of shape (32, 2)

torch.Size([32, 2])

In [165]:
torch.cat(torch.unbind(emb, 1), dim=1).shape # but this is inefficient: a whole new tensor is created for this op, which uses more memory

torch.Size([32, 6])

In [141]:
# a better way:
a = torch.arange(18)
a

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17])

In [142]:
a.shape

torch.Size([18])

In [159]:
a.view(2,9), a.view(3,3, 2), a.view(9,2), a.view(-1,9) # -1 means whatever is left

(tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8],
         [ 9, 10, 11, 12, 13, 14, 15, 16, 17]]),
 tensor([[[ 0,  1],
          [ 2,  3],
          [ 4,  5]],
 
         [[ 6,  7],
          [ 8,  9],
          [10, 11]],
 
         [[12, 13],
          [14, 15],
          [16, 17]]]),
 tensor([[ 0,  1],
         [ 2,  3],
         [ 4,  5],
         [ 6,  7],
         [ 8,  9],
         [10, 11],
         [12, 13],
         [14, 15],
         [16, 17]]),
 tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8],
         [ 9, 10, 11, 12, 13, 14, 15, 16, 17]]))

In [149]:
a.storage() #physical storage 

 0
 1
 2
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
[torch.storage.TypedStorage(dtype=torch.int64, device=cpu) of size 18]

In [157]:
torch.cat(torch.unbind(emb, 1), dim=1) == emb.view(32,6)

tensor([[True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, T

In [158]:
###########################################
emb.view(32,6) @ W1 + b1


tensor([[ 2.9032,  4.1361,  4.5650,  ...,  1.9633,  2.6685,  4.3327],
        [ 2.5082,  3.9130,  4.3207,  ...,  1.9953,  2.3136,  4.0329],
        [ 0.7555,  2.4680,  3.0508,  ...,  2.0188,  0.9817,  2.4596],
        ...,
        [-0.8331, -1.2928, -0.7480,  ...,  0.3800,  1.0624,  0.5667],
        [-0.9798, -0.5661, -1.9993,  ..., -0.4268, -1.9186, -0.0536],
        [-0.6883,  0.9239,  0.4850,  ...,  1.1965,  0.1382,  1.4256]])

In [205]:
# how to deal with dim 0 being variable for X:
h1 = emb.view(32,6) @ W1 + b1
h2 = emb.view(emb.shape[0],6) @ W1 + b1
h3 = emb.view(-1,6) @ W1 + b1 #  -1 means whatever is left.
torch.equal(h1,h2), torch.equal(h1,h3)

(True, True)

In [206]:
################################## 
# back to the hidden layer:
h = torch.tanh(emb.view(-1,6) @ W1 + b1)
h.shape

torch.Size([32, 100])

In [207]:
# check bias broadcasting when adding:
b1.shape, (emb.view(-1,6) @ W1).shape
# emb @ w1 32, 100
# b1        1, 100 << start at trailing dim, prepend 1 to the dims of the tensor
# copied the b vertically to all the rows, which is what we want


(torch.Size([100]), torch.Size([32, 100]))

In [208]:
W2 = torch.rand((100,27))
b2 = torch.rand(27)

In [209]:
logits = h @ W2 + b2

In [210]:
logits.shape

torch.Size([32, 27])

In [213]:
counts = logits.exp()

In [214]:
prob = counts / counts.sum(1,keepdims=True)

In [215]:
prob.shape

torch.Size([32, 27])

In [217]:
Y

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0])

In [222]:
# Negative log-likelihood loss: for each example pick the probability assigned
# to its true next character, then average the negative logs. The row index is
# derived from Y instead of a hard-coded 32 so the line keeps working when the
# number of examples changes.
loss = -prob[torch.arange(Y.shape[0]), Y].log().mean()

In [10]:
# ------------------------- 
# bring the above together
X.shape, Y.shape # dataset

(torch.Size([32, 3]), torch.Size([32]))

In [9]:
# Seeded generator so every run draws the same initial parameters.
g = torch.Generator().manual_seed(2147483647)
# Draw order is C, W1, b1, W2, b2 and must stay fixed for reproducibility.
# NOTE(review): torch.rand samples uniformly from [0, 1), whereas the earlier
# exploratory cell used torch.randn for C; confirm uniform init is intended.
C, W1, b1, W2, b2 = (
    torch.rand(shape, generator=g)
    for shape in ((27, 2), (6, 100), (100,), (100, 27), (27,))
)
parameters = [C, W1, b1, W2, b2]


In [298]:
sum(p.nelement() for p in parameters) # number of parameters in total

3481

In [299]:
# Forward pass with a hand-written softmax and negative log-likelihood.
emb = C[X] # (32, 3, 2)
h = torch.tanh(emb.view(-1,6) @ W1 + b1) # (32, 100)
logits = h @ W2 + b2 # (32,27)
counts = logits.exp() # exponentiate the logits
prob = counts / counts.sum(1, keepdim=True) # normalise each row so it sums to 1
# Index the rows by Y's own length rather than a hard-coded 32, so this cell
# keeps working when X and Y hold a different number of examples.
loss = - prob[torch.arange(Y.shape[0]), Y].log().mean()
loss


tensor(7.3572)

In [258]:
# tangent: what using cross_entropy means:
# - the intermediate steps are done together using fused kernels,
# - the expressions can take a simpler form, similar to the forward and backward pass of tanh()
# - cross_entropy can be much better behaved numerically, i.e. when dealing with higher logits

F.cross_entropy(logits, Y)

tensor(7.3572)

In [271]:
# cross_entropy is much better behaved numerically. Because we exp() the logits, a high logit (100) can give nan, while -100 is ok:
logits = torch.tensor([-100, -3, 0, 100])
counts = logits.exp()
probs = counts / counts.sum()
probs

tensor([0., 0., 0., nan])

In [272]:
# you can add offset to this and get the same answer:
logits = torch.tensor([-5, -3, 0, 5]) - 10
counts = logits.exp()
probs = counts / counts.sum()
probs

tensor([4.5079e-05, 3.3309e-04, 6.6903e-03, 9.9293e-01])

In [273]:
# same as: 
logits = torch.tensor([-5, -3, 0, 5]) - 20
counts = logits.exp()
probs = counts / counts.sum()
probs

tensor([4.5079e-05, 3.3309e-04, 6.6903e-03, 9.9293e-01])

In [276]:
logits = torch.tensor([-100, -3, 0, 100]) - 100
counts = logits.exp()
probs = counts / counts.sum()
probs

tensor([0.0000e+00, 1.4013e-45, 3.7835e-44, 1.0000e+00])

In [301]:
for p in parameters:
    p.requires_grad = True

In [308]:
# ----------------------------------------
# Gradient descent on all of X / Y at once, 1000 steps.
for _ in range(1000):
    # forward pass: embedding lookup -> tanh hidden layer -> logits -> loss
    emb = C[X]                                  # (32, 3, 2)
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1)   # (32, 100)
    logits = h @ W2 + b2                        # (32, 27)
    loss = F.cross_entropy(logits, Y)

    # backward pass: clear the old gradients, then backpropagate
    for p in parameters:
        p.grad = None
    loss.backward()

    # update: step against the gradient with a step size of 0.1
    for p in parameters:
        p.data -= 0.1 * p.grad

# the printed loss is the one computed before the final update
print(loss.item())

0.2525448799133301


In [327]:
# return the max value and its index for each row, and compare the indices to the expected index.
# some match but others don't, e.g. index 0: "..." is followed by e, o, a, i or s in the training set, so all are possible.
# this means it's not possible to get the loss to 0, even when overfitting
logits.max(1),Y

(torch.return_types.max(
 values=tensor([ 43.6240,  64.4105,  58.8915,  68.1924,   0.9242,  43.6240,  17.9316,
          11.4024, -17.6828,  33.6481,  45.3112,   8.1901,  43.6240,  23.4733,
          43.4074, -12.2734,  43.6240,  41.4969,  60.3677,  27.2958,  12.0153,
          25.7188,  35.0194,  61.3420, -11.1273,  43.6240,  55.2428,  36.2484,
          31.1155,  21.6926,  57.4884,  13.5841], grad_fn=<MaxBackward0>),
 indices=tensor([ 1, 13, 13,  1,  0,  1, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  1, 19,
          1,  2,  5, 12, 12,  1,  0,  1, 15, 16,  8,  9,  1,  0])),
 tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
          1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0]))

In [328]:
# how max works
x = torch.arange(10)
y = x.view(-1,5)
y, y.max(1)

(tensor([[0, 1, 2, 3, 4],
         [5, 6, 7, 8, 9]]),
 torch.return_types.max(
 values=tensor([4, 9]),
 indices=tensor([4, 4])))

In [11]:
# build the full dataset (every word, no printing)
block_size = 3
X, Y = [], []

for w in words:
    # index sequence of the word plus its '.' terminator, left-padded with
    # `block_size` zeros so the first character has a full context
    padded = [0] * block_size + [stoi[ch] for ch in w + '.']
    for pos in range(len(padded) - block_size):
        X.append(padded[pos:pos + block_size])   # the context window
        Y.append(padded[pos + block_size])       # the index that follows it

X = torch.tensor(X)
Y = torch.tensor(Y)

In [12]:
X.shape, Y.shape

(torch.Size([228146, 3]), torch.Size([228146]))

In [13]:
# Re-create the parameters from the same seed so training on the full dataset
# starts from the same values as before.
g = torch.Generator().manual_seed(2147483647)
# Shapes in draw order: C, W1, b1, W2, b2.
parameters = [
    torch.rand(shape, generator=g)
    for shape in ((27, 2), (6, 100), (100,), (100, 27), (27,))
]
C, W1, b1, W2, b2 = parameters

In [14]:
sum(p.nelement() for p in parameters) # number of parameters in total

3481

In [15]:
for p in parameters:
    p.requires_grad = True

In [351]:
# Ten gradient-descent steps, each computed over the entire dataset.
for _ in range(10):
    # forward pass
    emb = C[X]                                  # (228146, 3, 2)
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1)   # (228146, 100)
    logits = h @ W2 + b2                        # (228146, 27)
    loss = F.cross_entropy(logits, Y)
    print(loss.item())

    # backward pass: clear the old gradients, then backpropagate
    for p in parameters:
        p.grad = None
    loss.backward()

    # update
    for p in parameters:
        p.data -= 0.1 * p.grad

7.668493747711182
5.1129469871521
3.8875772953033447
3.495539665222168
3.3378922939300537
3.225546360015869
3.1348354816436768
3.0562543869018555
2.988011598587036
2.932504177093506


In [352]:
# mini-batch to reduce training time, randomly select a batch from the training set and train on that:
torch.randint(0, X.shape[0], (32,)) # low, high and size (as tuple) 

tensor([119797, 135103,  68519,  78310,   1849,  10334, 175244,  22232,  84835,
        132659,  57062, 195604,  94142, 157832,  64927,  78522,  13348,  31850,
        201084, 144172,  40730,  14861,  38417, 140192,  83381, 157992,  29477,
        129486, 162059,  81486,  70888, 223125])

In [17]:
# why minibatches: a minibatch gives an approximate gradient rather than the exact one, and it is
# better to take many steps with an approximate gradient than a few steps with a more accurate one.
for _ in range(100):
    # minibatch construct: 32 row indices drawn at random from the dataset
    ix = torch.randint(0, X.shape[0], (32,))

    # forward pass on the selected rows only
    emb = C[X[ix]]                              # (32, 3, 2)
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1)   # (32, 100)
    logits = h @ W2 + b2                        # (32, 27)
    loss = F.cross_entropy(logits, Y[ix])       # labels picked with the same minibatch indices

    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # update
    for p in parameters:
        p.data -= 0.1 * p.grad
# loss of the last minibatch, computed before its update was applied
print(loss.item())

2.9648590087890625
2.6528027057647705
2.8100643157958984
2.866823196411133
2.707019567489624
2.848673105239868
3.189858913421631
2.8352770805358887
2.7900948524475098
3.111520528793335
2.859539031982422
3.046997547149658
3.02547287940979
2.5750844478607178
2.832350254058838
2.7211475372314453
3.0934629440307617
3.118736505508423
2.8435776233673096
2.912137508392334
3.017094850540161
2.841599464416504
3.002427101135254
2.844202756881714
2.8834824562072754
3.145188331604004
2.7380101680755615
2.755811929702759
2.7141408920288086
2.876512050628662
2.6551740169525146
3.136817693710327
2.709643840789795
2.808337450027466
3.1058692932128906
2.8435585498809814
2.694821357727051
2.602365732192993
2.595930337905884
2.8852570056915283
2.8886642456054688
2.5396482944488525
2.806668281555176
2.8390884399414062
2.9100406169891357
3.0885636806488037
2.7862789630889893
2.788360595703125
3.2058510780334473
2.909269332885742
3.024956226348877
2.8851728439331055
2.802968740463257
2.913724660873413
2.850

In [20]:
# loss over the full training set (every row of X, not a minibatch)
emb = C[X]
h = torch.tanh(emb.view(-1, 6) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(input=logits, target=Y)
loss

tensor(2.7607, grad_fn=<NllLossBackward0>)

In [48]:
# learning-rate check: reset the parameters from the fixed seed so each
# candidate rate starts from the same values
g = torch.Generator().manual_seed(2147483647)
# Shapes in draw order: C, W1, b1, W2, b2.
C, W1, b1, W2, b2 = parameters = [
    torch.rand(shape, generator=g)
    for shape in ((27, 2), (6, 100), (100,), (100, 27), (27,))
]

# enable gradient tracking on every parameter
for p in parameters:
    p.requires_grad = True


In [47]:
# find the lower and upper bounds of a search range for the learning rate:
# at 0.0001 the loss doesn't go down, but it does at 0.001

LEARNING_RATE = 1 # tried 0.001, 0.01, 0.1, 1, 10
for _ in range(100):
    # minibatch construct: 32 row indices drawn at random from the dataset
    ix = torch.randint(0, X.shape[0], (32,))

    # forward pass
    emb = C[X[ix]]                              # (32, 3, 2)
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1)   # (32, 100)
    logits = h @ W2 + b2                        # (32, 27)
    loss = F.cross_entropy(logits, Y[ix])       # labels picked with the same minibatch indices
    print(loss.item())

    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # update with the rate under test
    for p in parameters:
        p.data -= LEARNING_RATE * p.grad
print(loss.item())

8.227935791015625
11.546402931213379
33.976444244384766
19.65176010131836
19.65262794494629
15.977632522583008
11.893694877624512
14.985189437866211
13.15888500213623
13.031699180603027
10.335827827453613
10.280573844909668
6.444364547729492
6.754821300506592
6.7885637283325195
5.312748432159424
5.288608551025391
7.065597057342529
8.048784255981445
6.6396307945251465
10.49025821685791
13.85434341430664
8.11581802368164
5.430596828460693
6.216933727264404
8.831623077392578
8.386439323425293
8.574119567871094
9.745198249816895
6.018150806427002
8.387862205505371
8.014059066772461
6.881567478179932
6.885052680969238
6.732348442077637
5.167070388793945
6.403532981872559
4.921173572540283
3.718583106994629
3.827099323272705
3.861178398132324
4.794559955596924
5.1293864250183105
6.014892578125
3.825348138809204
6.906620502471924
11.068583488464355
8.912542343139648
5.730525970458984
8.777033805847168
6.629668712615967
6.366147041320801
8.535271644592285
8.418272972106934
4.983580112457275
4.

In [None]:
lre = torch.linspace(