# Building makemore Part 2: MLP

[Andrej Karpathy](https://karpathy.ai/)

[YouTube video link](https://youtu.be/TCH_1BHY58I?list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ)

> We implement a multilayer perceptron (MLP) character-level language model. In this video we also introduce many basics of machine learning (e.g. model training, learning rate tuning, hyperparameters, evaluation, train/dev/test splits, under/overfitting, etc.).

https://github.com/karpathy/makemore

Paper: [A Neural Probabilistic Language Model - Bengio, et al](https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf)

Requires the training file: `names.txt`

In [None]:
# Environment setup: run ONCE to update any new kernel instance.
# You MUST restart the kernel after updating.
# Lines starting with "!" are shell commands run by the notebook, not Python statements.
!pip install --upgrade pip
!pip install graphviz
!apt-get update
!apt-get install -y graphviz
!pip install torch
print('Complete! You\'re good-to-go!')

In [None]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# read in all the words from the training set (one word per line)
# the context manager closes the file handle as soon as the read finishes
with open('names.txt', 'r') as f:
    words = f.read().splitlines()
words[:8] # preview the first 8 words

In [None]:
len(words) # total number of words read from names.txt

In [None]:
# build the vocabulary of characters and mappings to/from integers
chars = sorted(set(''.join(words))) # every distinct character in the dataset, sorted
stoi = {s: i for i, s in enumerate(chars, start=1)} # characters are numbered from 1...
stoi['.'] = 0 # ...and index 0 is given to the '.' token
itos = dict((i, s) for s, i in stoi.items()) # inverse mapping: integer -> character
print(itos)

In [None]:
# build the dataset: one (context, next character) example per character of every word

block_size = 3 # context length: how many characters do we take to predict the next one?
X, Y = [], [] # X collects the contexts (inputs), Y the index of the character that follows each context
for w in words:
    
    #print(w)
    context = [0] * block_size # build out padded starting context ("...")
    for ch in w + '.': # the appended '.' makes the end of the word a prediction target too
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        #print(''.join(itos[i] for i in context), '--->', itos[ix])
        context = context[1:] + [ix] # crop and append (builds a new list, so the one stored in X is untouched)

X = torch.tensor(X) # shape (N, block_size)
Y = torch.tensor(Y) # shape (N,)

In [None]:
X.shape, X.dtype, Y.shape, Y.dtype # X: (N, block_size) context indices, Y: (N,) next-character indices

In [None]:
# build the dataset

def build_dataset(words, block_size=3):
    """Turn a list of words into (context, next-character) training examples.

    Args:
        words: iterable of strings; every character must be a key of the
            module-level ``stoi`` mapping.
        block_size: context length, i.e. how many preceding characters are
            used to predict the next one. Defaults to 3.

    Returns:
        A tuple ``(X, Y)`` of tensors. ``X`` has shape ``(N, block_size)`` and
        holds the context character indices; ``Y`` has shape ``(N,)`` and holds
        the index of the character that follows each context.

    Side effects:
        Prints the shapes of ``X`` and ``Y``.
    """
    X, Y = [], []
    for w in words:
        context = [0] * block_size # padded starting context ("..." when block_size is 3)
        for ch in w + '.': # the appended '.' makes the end of the word a prediction target too
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix] # crop and append (new list, so the one stored in X is untouched)

    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X, Y

# split data into 3 sets
import random
random.seed(42) # fixed seed so the shuffle below is reproducible
random.shuffle(words) # NOTE: shuffles `words` in place
n1 = int(0.8*len(words)) # 80%
n2 = int(0.9*len(words)) # 90%

Xtr, Ytr = build_dataset(words[:n1])     # training set
Xdev, Ydev = build_dataset(words[n1:n2]) # dev/validation set
Xte, Yte = build_dataset(words[n2:])     # test set

## Build the 1st (input) layer of the network

In [None]:
C = torch.randn((27, 2)) # embedding lookup table: each of the 27 characters gets a 2D embedding

In [None]:
# 2 ways to access the row vector in C for index 5
# The one-hot tensor: [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
# Matrix multiplying by C effectively plucks out row 5 of C (counting from 0).
C[5] # or...
F.one_hot(torch.tensor(5), num_classes=27).float() @ C

# also we can access multiple rows with a list
C[[5,6,7]]
# or a tensor (repetition is also supported)
C[torch.tensor([5,6,7,7,7])]

# multidimensional index tensors are also supported
C[X] # X has shape (N, 3), so the result has shape (N, 3, 2): one 2D embedding per context character

In [None]:
# using all this, build our embedding for the first layer of the network
emb = C[X] # look up the 2D embedding of every character in every context
emb.shape # (N, 3, 2): N examples, 3 context characters, 2 embedding dimensions

In [None]:
emb # display the embedded contexts

## Build the hidden layer of the network

In [None]:
W1 = torch.randn((6, 100)) # 6 inputs = context length * embedding dimensions = 3*2. 100 is the size of the hidden layer
b1 = torch.randn(100) # one bias per hidden unit

In [None]:
# Doesn't "just work":
# emb @ W1 + b1 ---> "RuntimeError: mat1 and mat2 shapes cannot be multiplied (96x2 and 6x100)"
# (the 96 in that message is from a 32-example emb, presumably 32 examples * 3 context characters)

# what we need to do is concatenate the 3 context positions of the embedding (shape = [N, 3, 2])
# grab the [N, 2] tensor for each of the 3 positions and join them side by side into [N, 6]
torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], dim=1) # <-- block_size is hardcoded to 3 here

In [None]:
# a more general way to do the same thing (concatenate across dimension 1)
# unbind(emb, 1) splits emb along dimension 1 into a tuple of tensors, whatever the context length is
torch.cat(torch.unbind(emb, 1), 1)

### Exploring PyTorch Internals

a.k.a. An even better (more efficient) way to do this concatenation...

In [None]:
# an even better (more efficient) way to do this concatenation...
# start with a small 1D example tensor to experiment on
a = torch.arange(18) # the integers 0..17
a

In [None]:
a.shape # a single dimension of 18 elements

In [None]:
# view() is extremely efficient
# the same 18 elements can be viewed under any shape whose sizes multiply to 18
a.view(2,9)
a.view(3, 3, 2)

In [None]:
# internally, a tensor is always stored as a 1D array, with offsets, strides, etc dictating access.
# untyped_storage() returns the tensor's underlying storage object
a.untyped_storage()

In [None]:
# MOST EFFICIENT concatenation using view()
# -1 lets PyTorch infer the number of rows, so this works for any number of
# examples in emb instead of only when emb has exactly 32 rows
emb.view(-1, 6)

# (emb.view(-1, 6) == torch.cat(torch.unbind(emb, 1), 1)).all() # <--- prove they are the same

### The "real" hidden layer

In [None]:
h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # pytorch will derive the number of rows when we pass -1, which makes this more general

# Note: we have to be careful w/ the "+ b1" addition and broadcasting
#  it works because broadcasting aligns shapes from the right and treats the 1D b1 (b1.shape = [100]) as a row:
#   N, 100 <-- result of emb.view(-1, 6) @ W1
#   1, 100 <-- b1

In [None]:
h.shape # (N, 100)... gives 100 activations for each of our N embedded contexts

In [None]:
h # display the hidden activations (tanh output, so every value lies in [-1, 1])

## Build the output layer

In [None]:
W2 = torch.randn((100, 27)) # 100 inputs from the hidden layer -> 27 outputs (one per character)
b2 = torch.randn(27) # one bias per output

In [None]:
logits = h @ W2 + b2 # (N, 27): one unnormalized score per character for each example

In [None]:
logits.shape # (N, 27)

In [None]:
counts = logits.exp() # exponentiate the logits to get positive, unnormalized "counts"

In [None]:
prob = counts / counts.sum(1, keepdims=True) # divide each row by its own sum so it becomes a probability distribution

In [None]:
prob.shape # same shape as logits: (N, 27)

In [None]:
prob[0].sum() # sanity check: a row of probabilities should sum to 1 (up to float rounding)

In [None]:
torch.arange(32) # the integers 0..31, an example of a row-index tensor

In [None]:
Y # the index of the correct next character for each example

In [None]:
# look at the predicted probabilities for each of the training set expected values (Y). Ideally, they would all be 1.0
# the row index is built from Y's own length, so it always pairs up with Y (a fixed arange(32) only works for exactly 32 examples)
prob[torch.arange(Y.shape[0]), Y]

In [None]:
# negative log likelihood
# mean over all examples of -log(probability assigned to the correct next character);
# the row index is built from Y's own length rather than a hard-coded 32, which only works for exactly 32 examples
loss = -prob[torch.arange(Y.shape[0]), Y].log().mean()
loss

## Making it all respectable

In [None]:
X.shape, Y.shape # dataset: X is (N, 3) contexts, Y is (N,) targets

In [None]:
g = torch.Generator().manual_seed(2147483647) # for reproducibility
# NOTE: all draws share the generator g, so the order of these calls determines the values
C = torch.randn((27, 2), generator=g) # embedding table: one 2D vector per character
W1 = torch.randn((6, 100), generator=g) # hidden layer: 6 inputs (3 context characters * 2 dims) -> 100 units
b1 = torch.randn(100, generator=g)
W2 = torch.randn((100, 27), generator=g) # output layer: 100 hidden units -> 27 logits
b2 = torch.randn(27, generator=g)
parameters = [C, W1, b1, W2, b2] # every tensor in the model

In [None]:
sum(p.nelement() for p in parameters) # number of parameters in total (element counts summed over all tensors)

### Side-tangent about cross_entropy

In [None]:
emb = C[X] # (N, 3, 2)
h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (N, 100)
logits = h @ W2 + b2 # (N, 27)

# Everything that follows is just classification so can use PyTorch cross_entropy()
#  counts = logits.exp()
#  prob = counts / counts.sum(1, keepdims=True)
#  loss = -prob[torch.arange(Y.shape[0]), Y].log().mean()
loss = F.cross_entropy(logits, Y) # <-- more efficient (no intermediate tensors, fused kernels simplify backward pass)
loss

In [None]:
# an example of why cross_entropy() is more numerically well-behaved
logits = torch.tensor([-100, -3, 0, 100]) # NOTE: this overwrites the `logits` computed in the previous cell
counts = logits.exp() # exp(100) is too large for float32, so the last count presumably overflows to inf
probs = counts / counts.sum() # with an inf in the sum, the hand-rolled normalization no longer gives usable probabilities
probs

In [None]:
counts # inspect the exponentiated values from the example above

## Let's train for real... or "back to the respectable part" :)

Note 1: Larger models with 100,000s of parameters can easily overfit the data. You will see this when the loss is very low on the training set but noticeably higher on the held-out dev/test sets.

Note 2: You should only evaluate on the test set a few times; otherwise you risk effectively training on the test set as well, because every decision you make after looking at its loss tunes the model to it.

### Data Splits
- Training set: 80%
- Dev/validation set: 10% (used to determine hyperparameters)
- Test set: 10%

In [None]:
Xtr.shape, Ytr.shape # training split of the dataset

In [None]:
g = torch.Generator().manual_seed(2147483647) # for reproducibility
# previous, smaller configuration (2D embeddings, 100 hidden units), kept for reference:
#C = torch.randn((27, 2), generator=g) # simple linear input layer
#W1 = torch.randn((6, 100), generator=g) # hidden layer
#b1 = torch.randn(100, generator=g)
#W2 = torch.randn((100, 27), generator=g) # output layer

# == Let's increase the size of the embeddings and hidden layer
C = torch.randn((27, 10), generator=g) # 10D embedding per character
W1 = torch.randn((30, 200), generator=g) # 30 inputs = 3 context characters * 10 dims; 200 hidden units
b1 = torch.randn(200, generator=g)
W2 = torch.randn((200, 27), generator=g) # 200 hidden units -> 27 logits

b2 = torch.randn(27, generator=g)
parameters = [C, W1, b1, W2, b2] # every tensor in the model

In [None]:
sum(p.nelement() for p in parameters) # number of parameters in total (element counts summed over all tensors)

In [None]:
# turn on gradient tracking so loss.backward() fills in p.grad for every parameter
for p in parameters:
    p.requires_grad = True

In [None]:
# we can create a 1D tensor of different learning rates to test for the best value
lre = torch.linspace(-3, 0, 1000) # 1000 evenly spaced exponents in [-3, 0]
lrs = 10**lre # the corresponding learning rates, from 0.001 to 1

lri = []   # learning rate exponent per step (only filled when lri.append is enabled in the training loop)
lossi = [] # loss value recorded at each step
stepi = [] # training steps

In [None]:
# Minibatch SGD training loop.
max_steps = 100000 # total number of training steps
batch_size = 64 # examples per minibatch. We may want to increase if training loss "noise" per step is too high

for i in range(max_steps):

    # minibatch construct: sample batch_size random row indices into the training set
    ix = torch.randint(0, Xtr.shape[0], (batch_size,))

    # forward pass
    emb = C[Xtr[ix]]
    h = torch.tanh(emb.view(-1, 30) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Ytr[ix])
    #print(loss.item())

    # backward pass: clear stale gradients, then backpropagate
    for p in parameters:
        p.grad = None
    loss.backward()

    # update
    #lr = lrs[i] # learning-rate sweep: needs max_steps <= len(lrs), together with lri.append below
    # step decay: 0.1 for the first half of training, 0.01 for the second half.
    # The threshold is derived from max_steps so that the decay is actually
    # reached (a threshold equal to the loop length never triggers it).
    lr = 0.1 if i < max_steps // 2 else 0.01
    for p in parameters:
        p.data += -lr * p.grad

    # track stats
    #lri.append(lre[i])
    stepi.append(i)
    lossi.append(loss.log10().item()) # log10 of the loss, which compresses the plot's vertical range

#print(loss.item())

In [None]:
# plot log loss: log10(minibatch loss) recorded at each training step
plt.plot(stepi, lossi)

In [None]:
# graph the learning rate exponents against the loss to find the optimal learning rate (answer: -1.0)
# lri is only filled during a learning-rate sweep (the "lr = lrs[i]" and
# "lri.append(lre[i])" lines enabled in the training loop). Otherwise its
# length differs from lossi and plt.plot raises a ValueError, so check first.
if lri and len(lri) == len(lossi):
    plt.plot(lri, lossi)
else:
    print('No learning-rate sweep recorded: lri and lossi have different lengths.')

In [None]:
# calculate the loss across the whole training set
with torch.no_grad(): # evaluation only: skip building an autograd graph over the full split
    emb = C[Xtr]
    h = torch.tanh(emb.view(-1, 30) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Ytr)
loss

In [None]:
# calculate the loss across the dev/validation set
with torch.no_grad(): # evaluation only: skip building an autograd graph over the full split
    emb = C[Xdev]
    h = torch.tanh(emb.view(-1, 30) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Ydev)
loss

In [None]:
# scatter-plot columns 0 and 1 of the embedding table C, labelling each point with its character
# (if C has more than 2 columns, only the first two are shown)
plt.figure(figsize=(8,8))
plt.scatter(C[:,0].data, C[:,1].data, s=200)
for i in range(C.shape[0]):
    plt.text(C[i,0].item(), C[i,1].item(), itos[i], ha='center', va='center', color='white')
# the first argument of plt.grid is the on/off flag, not the `which` selector;
# passing True explicitly gives the same major grid the string 'minor' produced by being truthy
plt.grid(True)

## Sample from the model

In [None]:
# dedicated generator so the sampled words are reproducible
g = torch.Generator().manual_seed(2147483647 + 10)

for _ in range(20):

    out = []
    context = [0] * block_size # init with all "..."
    ix = None # nothing sampled yet
    while ix != 0: # keep sampling until the '.' token (index 0) is drawn
        emb = C[torch.tensor([context])] # (1,block_size,d)
        h = torch.tanh(emb.view(1, -1) @ W1 + b1)
        logits = h @ W2 + b2
        probs = F.softmax(logits, dim=1)
        ix = torch.multinomial(probs, num_samples=1, generator=g).item()
        context = context[1:] + [ix] # slide the context window forward by one character
        out.append(ix) # the final 0 is recorded too, so the printed word ends with '.'

    print(''.join([itos[i] for i in out]))