# Makemore lesson 2

The task is to implement this network:

![MLP](images/MLP.png)


In [24]:
import torch
import torch.nn.functional as F
%matplotlib inline 

In [2]:
# Read the dataset, one entry per line. The context manager closes the file
# handle as soon as the contents are read instead of leaving it open until
# the garbage collector gets to it.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [3]:
# Number of lines read from names.txt (one entry per line).
len(words)

32033

In [4]:
# vocabulary: every distinct character occurring in the words, sorted
chars = sorted(set(''.join(words)))
# string to integer: characters are numbered from 1, index 0 is kept for '.'
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0
# integer to string (inverse of stoi)
itos = {idx: ch for ch, idx in stoi.items()}

26

In [15]:
# build the dataset

# context length: how many characters we take to predict the next one
block_size = 3

X, Y = [], []
for name in words[:5]:
    print(name)

    # every word starts from a window filled with index 0 (the '.' token)
    window = [0] * block_size
    for target in (stoi[c] for c in name + '.'):
        X.append(window)
        Y.append(target)
        print("".join(itos[i] for i in window), "--->", itos[target])
        # slide the window: drop the oldest index, append the one just seen
        window = window[1:] + [target]

# convert the collected Python lists to tensors
X, Y = torch.tensor(X), torch.tensor(Y)

emma
... ---> e
..e ---> m
.em ---> m
emm ---> a
mma ---> .
olivia
... ---> o
..o ---> l
.ol ---> i
oli ---> v
liv ---> i
ivi ---> a
via ---> .
ava
... ---> a
..a ---> v
.av ---> a
ava ---> .
isabella
... ---> i
..i ---> s
.is ---> a
isa ---> b
sab ---> e
abe ---> l
bel ---> l
ell ---> a
lla ---> .
sophia
... ---> s
..s ---> o
.so ---> p
sop ---> h
oph ---> i
phi ---> a
hia ---> .


X is now a 2-D integer tensor: each row holds the indices of the `block_size` characters that form the context of one training example, and Y holds the index of the character that follows that context.

In [16]:
# X has one row of block_size context indices per example; Y has one target index per example.
X.shape, Y.shape

(torch.Size([32, 3]), torch.Size([32]))

We create a lookup table (a matrix) C with one row per character in the vocabulary (27) and one column per embedding dimension (2).

In [12]:
# Seed PyTorch's global random number generator before the first random draw
# so that re-running the notebook from a fresh kernel reproduces the same
# initial parameters (this cell and every later torch.randn call).
torch.manual_seed(2147483647)

# Embedding lookup table: 27 rows (one per character index), 2 columns
# (the embedding size), filled with standard-normal values.
C = torch.randn(27, 2)
C.shape

torch.Size([27, 2])

Embed simultaneously all integers of X. 

We can see that the result is a tensor with the same leading shape as X plus a new trailing dimension of size 2, which holds the embedding of each character.

In [22]:
# Index the table C with the whole integer tensor X at once: every index in X
# is replaced by the corresponding row of C, which adds a trailing axis.
emb = C[X]
emb.shape

torch.Size([32, 3, 2])

In [21]:
# Embedding stored for example index 1, context position index 2.
emb[1, 2]

tensor([-0.7150,  2.8327])

Now let's initialize weights of the hidden layer

In [25]:
# Hidden-layer parameters. Each example feeds block_size embeddings of
# C.shape[1] values each into the layer, so the fan-in is derived from those
# two quantities instead of being hard-coded (3 * 2 = 6 with the current setup).
n_hidden = 100  # number of hidden units
W1 = torch.randn((block_size * C.shape[1], n_hidden))
b1 = torch.randn(n_hidden)

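Now let's multiply the weights by a 2-dimensional view of the embedded input: the three 2-dimensional character embeddings of each example are flattened into a single row of 6 values.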



In [30]:
# Flatten the per-position embeddings of each example into a single row, then
# apply the hidden layer's affine map. The batch axis is pinned and the width
# is inferred, so there is always exactly one row per example; a hard-coded
# width would silently regroup values across examples if the context length
# or the embedding size changed.
h_no_activation = emb.view(emb.shape[0], -1) @ W1 + b1
h_no_activation.shape

torch.Size([32, 100])

In [31]:
# Display the hidden layer's pre-activation values.
h_no_activation

tensor([[-0.6017, -3.5166,  0.0954,  ...,  0.6654,  0.1866, -0.5502],
        [ 0.9613, -1.4713,  0.4734,  ...,  1.2299, -1.2621, -1.1406],
        [ 3.1213,  3.2348,  1.0119,  ...,  1.4983, -3.4846, -1.2529],
        ...,
        [-1.5172, -3.7798,  0.2104,  ..., -2.8301,  1.5355, -1.7390],
        [ 2.3055, -6.3487,  2.7768,  ..., -2.5099, -0.0297, -4.2553],
        [ 6.8435, -4.0417,  2.1428,  ...,  0.6571, -0.1208, -1.3713]])

Next we apply the activation function tanh

In [32]:
# Element-wise tanh squashes every pre-activation into the range (-1, 1).
h = h_no_activation.tanh()
h

tensor([[-0.5382, -0.9982,  0.0951,  ...,  0.5819,  0.1845, -0.5006],
        [ 0.7449, -0.8998,  0.4409,  ...,  0.8426, -0.8516, -0.8146],
        [ 0.9961,  0.9969,  0.7666,  ...,  0.9048, -0.9981, -0.8491],
        ...,
        [-0.9082, -0.9990,  0.2074,  ..., -0.9931,  0.9114, -0.9401],
        [ 0.9803, -1.0000,  0.9923,  ..., -0.9869, -0.0297, -0.9996],
        [ 1.0000, -0.9994,  0.9728,  ...,  0.5765, -0.1202, -0.8790]])

Now we define the output layer which has number of rows equal to the size of the hidden layer (100) and number of columns equal to the number of characters in the dataset (27)

In [36]:
# Output-layer parameters: one row per hidden unit, one column per character
# in the vocabulary. Both sizes are read from the objects they must agree
# with (100 and 27 with the current setup) rather than repeated as literals.
W2 = torch.randn((W1.shape[1], len(stoi)))
b2 = torch.randn(len(stoi))

Now we calculate the logits by multiplying the outputs of the hidden layer with the output layer.

We can see that the shape aligns with rows (n observations) and columns (output units)

In [38]:
# Affine output layer: one raw score (logit) per character for every example.
logits = torch.matmul(h, W2) + b2
logits.shape

torch.Size([32, 27])

In [55]:
# Exponentiate the logits to obtain positive "counts". The row maximum is
# subtracted first: the normalized probabilities are unaffected by a per-row
# shift, while a raw exp() overflows to inf (and yields nan probabilities)
# once a logit becomes large.
counts = (logits - logits.max(dim=1, keepdim=True).values).exp()
counts.shape

torch.Size([32, 27])

We apply Softmax

In [45]:
# Divide every row of counts by its own total so that each row sums to 1.
row_totals = counts.sum(dim=1, keepdim=True)
prob = counts / row_totals
prob.shape

torch.Size([32, 27])

Next we pick, for every example, the probability that the model assigned to the correct next character.

In [53]:
# For every example, select the probability assigned to its target index.
# The row indices are derived from Y so the cell keeps working for any number
# of examples, not only the 32 produced by the five-word dataset built above.
probs = prob[torch.arange(Y.shape[0]), Y]
probs.shape

torch.Size([32])

The loss is the mean negative log-likelihood of these probabilities:

In [54]:
# Mean negative log of the probabilities selected for the targets.
loss = probs.log().mean().neg()
loss

tensor(18.9005)

This is actually equivalent to using the cross entropy loss in pytorch

In [59]:
# Built-in computation of the same loss, taken directly from the raw logits and the target indices.
F.cross_entropy(logits, Y)

tensor(18.9005)

Summarizing the parameters

In [56]:
# All parameter tensors: embedding table, hidden-layer weights and bias, output-layer weights and bias.
parameters = [C, W1, b1, W2, b2]

In [63]:
# Print the element count of every parameter tensor, then the grand total.
sizes = [p.nelement() for p in parameters]
for n in sizes:
    print(n)
s = sum(sizes)

print(f'sum of parameters: {s}')

54
600
100
2700
27
sum of parameters: 3481


## Back Propagation

In [64]:
# Turn on gradient tracking for every parameter (in-place setter form).
for p in parameters:
    p.requires_grad_(True)

In [65]:
# Plain gradient descent on the cross-entropy loss over the whole dataset.
lr = 0.1  # learning rate (step size of each update)
for i in range(1000):
    # forward pass
    emb = C[X]
    # one row per example; the flattened width is inferred rather than hard-coded
    h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y)

    # backward pass: clear stale gradients, then backpropagate
    for p in parameters:
        p.grad = None
    loss.backward()

    # update: performed under no_grad so the in-place step is not recorded by
    # autograd; this replaces writing through the discouraged `.data` attribute
    with torch.no_grad():
        for p in parameters:
            p -= lr * p.grad

# loss of the last forward pass (computed before the final update was applied)
print(loss.item())

0.2547752559185028
