## PART II: Multi Layer Perceptron (MLP)

This implementation follows the approach of [Bengio et al. 2003](https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf), applied here to characters rather than words.

### Reading and exploring the dataset

In [1]:
import json

# names.txt is stored as a JSON document; the list of names sits under
# payload -> blob -> rawLines. The context manager closes the file handle
# promptly instead of leaving it open until garbage collection.
with open("names.txt", "r", encoding="utf-8") as f:
    names = json.load(f)["payload"]["blob"]["rawLines"]
names[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [2]:
len(names)  # total number of names loaded from names.txt

32033

In [3]:
# build the vocabulary of characters and mappings to/from integers
chars = sorted(set("".join(names)))                       # unique characters, in sorted order
chtoi = {ch: i for i, ch in enumerate(chars, start=1)}    # indices start at 1; 0 is assigned below
chtoi["."] = 0                                            # "." gets the reserved index 0
itoch = {i: ch for ch, i in chtoi.items()}                # inverse mapping: index -> character

### Building the training dataset

In [4]:
import torch

context_len = 3     # how many preceding characters form one input window
X, y = [], []       # inputs (context windows) and targets (next-character indices)

# only the first name is processed here, with prints, to illustrate the windowing
for n in names[:1]:
    print(n)
    context = [0] * context_len         # start from a window filled with index 0 (".")

    for ch in n + ".":                  # the appended "." marks the end of the name
        ix = chtoi[ch]
        X.append(context)
        y.append(ix)
        print("".join(itoch[i] for i in context), "---->", itoch[ix])
        context = [*context[1:], ix]    # slide the window: drop the oldest index, append the newest
        print(context)

X = torch.tensor(X)
y = torch.tensor(y)

emma
... ----> e
[0, 0, 5]
..e ----> m
[0, 5, 13]
.em ----> m
[5, 13, 13]
emm ----> a
[13, 13, 1]
mma ----> .
[13, 1, 0]


In [5]:
# sanity-check the shapes and dtypes of the input and target tensors
X.shape, X.dtype, y.shape, y.dtype

(torch.Size([5, 3]), torch.int64, torch.Size([5]), torch.int64)

### Implementing the embedding lookup table

In [6]:
# embedding lookup table: one randomly-initialized 2-d vector per character (27 rows)
C = torch.randn((27, 2))
# indexing C with the integer tensor X swaps every character index for its row of C
emb = C[X]
emb

tensor([[[ 0.1671, -0.1747],
         [ 0.1671, -0.1747],
         [ 0.1671, -0.1747]],

        [[ 0.1671, -0.1747],
         [ 0.1671, -0.1747],
         [ 0.2937, -0.5093]],

        [[ 0.1671, -0.1747],
         [ 0.2937, -0.5093],
         [-0.0251, -0.5869]],

        [[ 0.2937, -0.5093],
         [-0.0251, -0.5869],
         [-0.0251, -0.5869]],

        [[-0.0251, -0.5869],
         [-0.0251, -0.5869],
         [ 0.5192, -1.9340]]])

### Implementing the tanh hidden layer

In [7]:
# hidden layer: 6 inputs (3 context characters x 2-d embeddings) feeding 100 tanh neurons
W1 = torch.randn((6, 100))      # randomly-initialized weights
b1 = torch.randn((100,))        # randomly-initialized biases
# view(-1, 6) lays each example's three 2-d embeddings side by side in one row
h = (emb.view(-1, 6) @ W1 + b1).tanh()
h.shape

torch.Size([5, 100])

### Implementing the output layer

In [8]:
# output layer: 100 hidden activations in, 27 logits out
W2 = torch.randn((100, 27))     # randomly-initialized weights
b2 = torch.randn((27,))         # randomly-initialized biases

logits = torch.matmul(h, W2) + b2
logits.shape

torch.Size([5, 27])

In [9]:
# softmax over each row of logits.
# Subtracting the per-row maximum first leaves the probabilities unchanged
# (softmax is invariant to adding a constant to a row) but keeps exp() from
# overflowing to inf, and the division from producing NaN, on large logits.
counts = (logits - logits.max(1, keepdim=True).values).exp()    # unnormalized, shifted "counts"
prob = counts / counts.sum(1, keepdim=True)                     # normalize each row to sum to 1
prob.shape

torch.Size([5, 27])

### Implementing the negative log likelihood loss

In [10]:
y  # target character indices, one per training example

tensor([ 5, 13, 13,  1,  0])

In [11]:
# negative log likelihood: for every example pick the probability assigned to
# its target index, take the log, and average the negated values
loss = -torch.log(prob[torch.arange(len(prob)), y]).mean()
loss

tensor(18.9712)

In [12]:
import torch.nn.functional as F

# same loss computed directly from the logits by the built-in cross-entropy
loss = F.cross_entropy(input=logits, target=y)
loss

tensor(18.9712)

### Summary of the full network

In [13]:
# every trainable tensor of the network, gathered in one list
parameters = [C, W1, b1, W2, b2]
# total number of scalar parameters across all tensors
sum(t.numel() for t in parameters)

3481

### Implementing the training loop

In [14]:
# enable gradient tracking on every parameter tensor
for p in parameters:
    p.requires_grad = True

lr = 0.01   # learning rate (step size) for plain gradient descent

for _ in range(100):
    # forward pass
    emb = C[X]                                      # embeddings for every context window
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1)       # hidden layer activations
    logits = h @ W2 + b2                            # output layer activations
    loss = F.cross_entropy(logits, y)

    # backward pass
    for p in parameters:
        p.grad = None       # discard gradients left over from the previous step
    loss.backward()

    # parameter updates
    # torch.no_grad() keeps the in-place update out of the autograd graph; it
    # replaces the legacy `.data` access, which sidesteps autograd's checks
    with torch.no_grad():
        for p in parameters:
            p -= lr * p.grad

# the value printed is the loss computed before the final parameter update
print("overfitted loss:", loss.item())

overfitted loss: 0.0331801138818264


The near-zero loss is a sign of overfitting: the network has 3481 parameters but only 5 training examples, so it can simply memorize them. We need more data to train this network.

### Training on the full dataset

In [24]:
X, y = [], []   # inputs and targets, rebuilt from every name this time

for n in names:
    context = [0] * context_len         # each name starts from a window filled with index 0 (".")
    for ch in n + ".":                  # the appended "." marks the end of the name
        ix = chtoi[ch]
        X.append(context)
        y.append(ix)
        context = [*context[1:], ix]    # slide the window by one character

X = torch.tensor(X)
y = torch.tensor(y)

# fresh randomly-initialized parameters for the full-dataset run
C = torch.randn((27, 2))        # embedding table
W1 = torch.randn((6, 100))      # hidden layer weights
b1 = torch.randn((100,))        # hidden layer biases
W2 = torch.randn((100, 27))     # output layer weights
b2 = torch.randn((27,))         # output layer biases
parameters = [C, W1, b1, W2, b2]

# enable gradient tracking on every parameter tensor
for p in parameters:
    p.requires_grad_(True)

lr = 0.01   # learning rate (step size) for plain gradient descent

for _ in range(10):
    # forward pass over the whole dataset at once
    emb = C[X]                                      # embeddings for every context window
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1)       # hidden layer activations
    logits = h @ W2 + b2                            # output layer activations
    loss = F.cross_entropy(logits, y)
    print("loss:", loss.item())

    # backward pass
    for p in parameters:
        p.grad = None       # discard gradients left over from the previous step
    loss.backward()

    # parameter updates
    # torch.no_grad() keeps the in-place update out of the autograd graph; it
    # replaces the legacy `.data` access, which sidesteps autograd's checks
    with torch.no_grad():
        for p in parameters:
            p -= lr * p.grad

loss: 16.304555892944336
loss: 16.18372917175293
loss: 16.063337326049805
loss: 15.943434715270996
loss: 15.824076652526855
loss: 15.705314636230469
loss: 15.587221145629883
loss: 15.469862937927246
loss: 15.353293418884277
loss: 15.23757266998291
