[A Neural Probabilistic Language Model](https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf)

[pytorch-internals](http://blog.ezyang.com/2019/05/pytorch-internals/)


In [1]:
import pandas as pd
import matplotlib.pyplot as plt 

import torch
import torch.nn.functional as F

%matplotlib inline
pd.set_option('display.max_columns', None)

In [3]:
# Read the names dataset (one name per line). The context manager closes the
# file handle deterministically instead of leaving it to the garbage collector.
with open("../../data/names.txt", "r") as f:
    words = f.read().splitlines()

# Preview the first few names and the total number of names.
words[:5], len(words)

(['emma', 'olivia', 'ava', 'isabella', 'sophia'], 32033)

In [4]:
# Vocabulary: every distinct character that occurs in the names, in sorted order.
chars = sorted(set("".join(words)))
# Characters are numbered from 1 upward; index 0 is reserved for the "." token
# that marks the start/end of a name.
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi["."] = 0
# Reverse mapping: integer index -> character.
itos = {idx: ch for ch, idx in stoi.items()}

In [6]:
# Context length: how many preceding characters are used to predict the next one.
block_size = 3
X, Y = [], []

for w in words[:2]:
    print(w)
    # Start every name with a context made entirely of "." (index 0).
    # Sized by block_size so changing the constant above actually takes effect.
    context = [0] * block_size
    for c in w + ".":
        ix = stoi[c]
        X.append(context)
        Y.append(ix)
        print([itos[c] for c in context], "-->", itos[ix])
        # Slide the window: drop the oldest character, append the new one.
        context = context[1:] + [ix]

# X: one row of block_size character indices per example; Y: the target index.
X = torch.tensor(X)
Y = torch.tensor(Y)

emma
['.', '.', '.'] --> e
['.', '.', 'e'] --> m
['.', 'e', 'm'] --> m
['e', 'm', 'm'] --> a
['m', 'm', 'a'] --> .
olivia
['.', '.', '.'] --> o
['.', '.', 'o'] --> l
['.', 'o', 'l'] --> i
['o', 'l', 'i'] --> v
['l', 'i', 'v'] --> i
['i', 'v', 'i'] --> a
['v', 'i', 'a'] --> .


In [7]:
X.shape

torch.Size([12, 3])

### Embedding C lookup table

Each of the 27 possible characters is embedded as a point in a 2-dimensional space.


In [11]:
# Embedding lookup table: 27 rows (one per character index), 2 columns
# (the embedding dimensions), initialized from a standard normal.
C = torch.randn(27, 2)

In [12]:
C[5]

tensor([-1.9639,  0.6679])

In [13]:
# Multiplying a one-hot row vector for index 5 by C picks out row 5 of C,
# so this yields the same values as the plain lookup C[5].
F.one_hot(torch.tensor(5), num_classes=27).float() @ C

tensor([-1.9639,  0.6679])

In [14]:
C[[5, 6, 7, 7]]

tensor([[-1.9639,  0.6679],
        [ 1.2873,  0.2881],
        [ 0.0747,  0.7510],
        [ 0.0747,  0.7510]])

In [15]:
X[4, 1]

tensor(13)

In [16]:
# Indexing C with the integer tensor X looks up one embedding per entry of X;
# the result has X's shape with the embedding dimension appended.
C[X].shape

torch.Size([12, 3, 2])

In [13]:
C[X][4, 1]

tensor([-0.6996,  0.8613])

In [14]:
C[13]

tensor([-0.6996,  0.8613])

---


In [15]:
C[X][4]

tensor([[-0.6996,  0.8613],
        [-0.6996,  0.8613],
        [ 0.5887, -0.5127]])

In [16]:
X[4]

tensor([13, 13,  1])

In [17]:
C[1]

tensor([ 0.5887, -0.5127])

In [18]:
C[13]

tensor([-0.6996,  0.8613])

In [19]:
# Embed every context position of every example in one indexing operation.
emb = C[X]
emb.shape

torch.Size([12, 3, 2])

In [20]:
# Hidden layer parameters: 6 inputs (3 context positions x 2 embedding dims)
# feeding 100 hidden units.
W1 = torch.randn(6, 100)
b1 = torch.randn(100)

In [21]:
# emb @ W1 + b1  # won't work: shapes [12, 3, 2] and [6, 100] are incompatible

In [22]:
emb[:, 0, :].shape

torch.Size([12, 2])

In [23]:
# Concatenate the embeddings of the three context positions along dim 1.
# This spells out each position by hand, so it is tied to a context length of 3.
torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], 1).shape

torch.Size([12, 6])

In [24]:
len(torch.unbind(emb, 1))

3

In [25]:
# Same concatenation without listing the positions by hand: unbind splits emb
# along dim 1 into a tuple of tensors, which cat then joins along dim 1.
torch.cat(torch.unbind(emb, 1), 1).shape

torch.Size([12, 6])

In [26]:
# Show the flat storage that backs a tensor.
# NOTE(review): the stray echoed source line in this cell's output looks like the
# tail of a warning — `.storage()` may be deprecated in this PyTorch version; confirm.
torch.arange(5).storage()

  torch.arange(5).storage()


 0
 1
 2
 3
 4
[torch.storage.TypedStorage(dtype=torch.int64, device=cpu) of size 5]

In [27]:
# Reinterpret emb as 12 rows of 6 values: each row is the three 2-d context
# embeddings laid side by side (same layout as the torch.cat results).
emb.view(12, 6)

tensor([[ 0.5953, -1.2872,  0.5953, -1.2872,  0.5953, -1.2872],
        [ 0.5953, -1.2872,  0.5953, -1.2872, -2.2058,  0.2553],
        [ 0.5953, -1.2872, -2.2058,  0.2553, -0.6996,  0.8613],
        [-2.2058,  0.2553, -0.6996,  0.8613, -0.6996,  0.8613],
        [-0.6996,  0.8613, -0.6996,  0.8613,  0.5887, -0.5127],
        [ 0.5953, -1.2872,  0.5953, -1.2872,  0.5953, -1.2872],
        [ 0.5953, -1.2872,  0.5953, -1.2872, -0.7440, -2.3250],
        [ 0.5953, -1.2872, -0.7440, -2.3250, -0.9170, -1.2680],
        [-0.7440, -2.3250, -0.9170, -1.2680,  1.1061, -1.7982],
        [-0.9170, -1.2680,  1.1061, -1.7982, -1.4039,  1.4524],
        [ 1.1061, -1.7982, -1.4039,  1.4524,  1.1061, -1.7982],
        [-1.4039,  1.4524,  1.1061, -1.7982,  0.5887, -0.5127]])

In [28]:
emb.view(12, 6).shape

torch.Size([12, 6])

### Hidden Layer

[12, 6] \* [6, 100] = [12, 100]

Passing `-1` to `view` tells PyTorch to infer that dimension from the others (here it is 12).


In [29]:
# Hidden-layer pre-activations: flatten each example's context embeddings to
# 6 values (-1 lets view infer the number of rows), then apply W1 and add b1.
h = emb.view(-1, 6) @ W1 + b1
h.shape

torch.Size([12, 100])

In [30]:
# tanh squashes every pre-activation into the range (-1, 1)
h = torch.tanh(emb.view(-1, 6) @ W1 + b1)
h.shape

torch.Size([12, 100])

### Final Layer

[12, 100] \* [100, 27] = [12, 27]


In [31]:
# Output layer parameters: 100 hidden units feeding 27 outputs
# (one logit per character index).
W2 = torch.randn(100, 27)
b2 = torch.randn(27)

In [32]:
# Unnormalized scores (logits): one row per example, one column per character.
logits = h @ W2 + b2
logits.shape

torch.Size([12, 27])

In [33]:
# Softmax written out by hand: exponentiate the logits into positive "counts",
# then divide each row by its own total so the row sums to 1.
counts = torch.exp(logits)
prob = counts / counts.sum(dim=1, keepdim=True)

In [34]:
prob[0].sum()

tensor(1.)

In [35]:
Y

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0])

In [36]:
# For each example (row), pick the probability the model assigned to the
# correct next character. The row index is derived from Y so this keeps
# working when the dataset has a different number of examples.
prob[torch.arange(Y.shape[0]), Y]

tensor([4.6439e-08, 4.9771e-10, 4.8357e-08, 1.1818e-01, 3.4079e-13, 1.4759e-10,
        6.4308e-08, 1.8508e-06, 8.8823e-06, 1.0907e-05, 1.8415e-07, 5.0378e-03])

### loss


In [37]:
# Negative log-likelihood: mean of -log(probability assigned to the correct
# character). The row index is derived from Y rather than hard-coded, so the
# cell does not silently break when more than the first two words are used.
loss = -prob[torch.arange(Y.shape[0]), Y].log().mean()
loss

tensor(15.1871)

In [38]:
# Built-in equivalent of the manual softmax + negative log-likelihood;
# it takes the raw logits and yields the same loss value.
F.cross_entropy(logits, Y)

tensor(15.1871)

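##### Why is F.cross_entropy better?

1. It computes the cross-entropy efficiently in both the forward and the backward pass.
2. It is numerically well-behaved: exponentiating a large logit overflows to `inf`, whereas subtracting the largest logit first (which does not change the softmax result) keeps every value finite — as the next two cells show.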


In [39]:
# Exponentiating a large value overflows: exp(100) comes out as inf.
torch.tensor([1, 2, 100]).exp()

tensor([2.7183, 7.3891,    inf])

In [40]:
# Subtracting the largest value before exponentiating keeps everything finite:
# the largest entry becomes exp(0) = 1 and the others become tiny positives.
(torch.tensor([1, 2, 100]) - 100).exp()

tensor([1.0089e-43, 2.7465e-43, 1.0000e+00])

### Respectable


In [41]:
# Seeded generator so that parameter initialization is reproducible.
# It must be passed to every randn call; creating it alone has no effect.
g = torch.Generator().manual_seed(2147483647)
C = torch.randn((27, 2), generator=g)  # embedding table: 27 characters x 2 dims
W1 = torch.randn((6, 100), generator=g)  # hidden layer weights: 6 inputs -> 100 units
b1 = torch.randn(100, generator=g)  # hidden layer bias
W2 = torch.randn((100, 27), generator=g)  # output layer weights: 100 units -> 27 logits
b2 = torch.randn(27, generator=g)  # output layer bias
# All trainable tensors, gathered so they can be looped over during training.
parameters = [C, W1, b1, W2, b2]

In [42]:
# Total number of scalar values across all parameter tensors.
sum(p.nelement() for p in parameters)

3481

In [43]:
# Turn on gradient tracking for every parameter so that loss.backward()
# fills in p.grad for each of them.
for p in parameters:
    p.requires_grad = True

The same context (e.g. `...`) can be followed by several different characters, so the loss cannot be driven all the way to 0 even when overfitting.


In [44]:
# Ten steps of full-batch gradient descent on the toy dataset.
for _ in range(10):
    # forward pass
    emb = C[X]  # look up the embedding of every context character
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1)  # hidden activations in (-1, 1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y)
    print(loss.item())

    # backward pass
    # Reset gradients first; backward() accumulates into .grad otherwise.
    for p in parameters:
        p.grad = None
    loss.backward()

    # update
    # Step against the gradient with learning rate 0.1. The in-place update runs
    # under no_grad so autograd does not record it, instead of reaching around
    # autograd through the legacy `.data` attribute.
    with torch.no_grad():
        for p in parameters:
            p -= 0.1 * p.grad

17.68153953552246
13.330147743225098
10.072994232177734
7.930679798126221
6.375665664672852
5.144957065582275
4.106070041656494
3.1789493560791016
2.4108657836914062
1.8663972616195679


In [45]:
Y

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0])

In [46]:
# Max over dim 1 returns, per example, the largest logit and its column index;
# the indices are the model's predicted characters, to be compared with Y.
logits.max(1)

torch.return_types.max(
values=tensor([ 7.1389,  4.8175,  7.2210, 12.9024, 18.8147,  7.1389,  5.3278,  6.6342,
         3.3612,  4.3872,  6.9367, 12.7887], grad_fn=<MaxBackward0>),
indices=tensor([15, 18, 19,  1,  0, 15,  9,  9, 18,  9,  1,  0]))

In [47]:
logits.shape

torch.Size([12, 27])

Bottleneck: the tanh hidden layer `h` is too big, while the embedding layer `C` is only 2-dimensional, so we are cramming way too many characters into 2 dimensions.
