Lecture 2

See [Bengio et al.](https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf)

In [None]:
import random
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

random.seed(42)

In [None]:
# read in all the words (one per line); the context manager makes sure the
# file handle is closed as soon as the contents have been read
with open('../../names.txt', 'r') as f:
    words = f.read().splitlines()
words[:8]

In [None]:
len(words)

In [None]:
# Character vocabulary and the two lookup tables between characters and
# integer ids. The sorted characters get ids 1..len(chars); id 0 is assigned
# to '.', which the dataset code uses as padding and end-of-word marker.
chars = sorted(set(''.join(words)))
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
itos = {idx: ch for ch, idx in stoi.items()}
print(itos)

In [None]:
# build the dataset (demo: first five words only, printing every example)

block_size = 3 # context length: how many characters do we take to predict the next one?
X, Y = [], []
for w in words[:5]:

    print(w)
    # every word starts from a context made entirely of index 0 ('.')
    context = [0] * block_size
    for ch in w + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        print(''.join(itos[i] for i in context), '--->', itos[ix])
        # slide the window: drop the oldest index, append the one just seen
        # (this builds a new list, so the one stored in X is not modified)
        context = context[1:] + [ix] # crop and append

X = torch.tensor(X)
Y = torch.tensor(Y)

- `X` is now a tensor holding the 3-character context of every example we have seen (32 examples; contexts can repeat, e.g. every word starts with `...`)
- `Y` is the next character (target)

In [None]:
X.shape, X.dtype, Y.shape, Y.dtype

In [None]:
# build the dataset
block_size = 3 # context length: how many characters do we take to predict the next one?

def build_dataset(words):
    """Turn a list of words into (context, next-character) training tensors.

    Each word is extended with a trailing '.', and for every character of the
    extended word one example is emitted: the ``block_size`` preceding
    character indices (padded with 0 at the start of the word) as the input,
    and the index of the character itself as the target. Characters are
    mapped to indices through the file-level ``stoi`` table.

    Prints the shapes of the two tensors and returns them as ``(X, Y)``.
    """
    contexts, targets = [], []
    for word in words:
        window = [0] * block_size
        for ch in word + '.':
            target = stoi[ch]
            contexts.append(window)
            targets.append(target)
            # slide the window by one; a new list is built each time, so the
            # list already stored in ``contexts`` is left untouched
            window = [*window[1:], target]

    X = torch.tensor(contexts)
    Y = torch.tensor(targets)
    print(X.shape, Y.shape)
    return X, Y

# Shuffle a copy of the word list (the original order of `words` is kept),
# then cut it at the 80% and 90% marks.
shuffled = words[:]
random.shuffle(shuffled)
n1, n2 = (int(frac * len(shuffled)) for frac in (0.8, 0.9))

# Training set: first 80% of the shuffled words
Xtr, Ytr = build_dataset(shuffled[:n1])
# Dev / validation set (next 10%) = for tuning, checking generalization
Xdev, Ydev = build_dataset(shuffled[n1:n2])
# Final test set: the remaining words
Xte, Yte = build_dataset(shuffled[n2:])


## Building the MLP

We're going to build an MLP similar to the one shown in Figure 1 of [Bengio et al.](https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf). They model sentences, with a vocabulary of 17k words; we model words, with a vocabulary of 27 characters and a context of 3 characters, but otherwise it is the same idea.

### Embeddings

Now we make the embeddings. We choose 2 dimensions, because... why not.

We assign random vectors to all 27 characters.

In [None]:
# Seed PyTorch's random number generation and keep the returned generator,
# so the random embedding table below is the same on every run.
gen = torch.random.manual_seed(42)
# Embedding table: 27 rows (one per character index), 2 values per row.
C = torch.randn((27, 2), generator=gen)

For each character-index in each trigram, fetch the 2-d vector.

This uses PyTorch's fancy indexing; `X` is a $N \times 3$ array of integers, where `N` is the number of inputs/examples. `C[i]` will fetch the 2D vector associated with character-index `i`. So `C[X]` fetches the 2D vector for every character-index in the $N \times 3$ array, yielding a $N \times 3 \times 2$ array of floats.

In [None]:
# Index the embedding table with the whole integer tensor X at once: every
# character index in X is replaced by its row of C, which appends C's
# trailing dimension (2) to X's shape.
emb = C[X]
emb.shape, emb.dtype

### Hidden layer

Now we make the 'hidden layer' of neurons. We choose to use 100 neurons. Each neuron needs one weight for each embedding dimension of each character in the trigram (3 characters × 2 dimensions), so six weights, plus a bias.

So - for each input example, each 'hidden layer' neuron has 6 inputs (3 characters, mapped to 2d space), and outputs 1 value.

In [None]:
# Re-seed so the hidden-layer initialization is reproducible on its own.
gen = torch.random.manual_seed(43)
# Weights: 6 inputs (3 characters x 2 embedding dimensions) by 100 neurons.
W1 = torch.randn((6, 100), generator=gen)
# One bias per hidden neuron.
b1 = torch.randn(100, generator=gen)


#### Torch array manipulation

Normally we would compute $\tanh\left(E w + b\right)$, where $E$ is $I\times d$ ($I$ is the number of input examples and $d$ is the number of input dimensions per example) and $w$ is $d\times N$, where $N$ is the number of neurons.

But... our embeddings `emb` is $I\times 3 \times 2$, rather than $I\times 6$, so we need to do some flattening.

In [None]:
# Take the embedding of each of the three context positions separately
# (emb[:,k,:] drops the middle dimension) and join them side by side along
# dim 1, so every example becomes one flat row of 6 values.
embcat = torch.cat([emb[:,0,:], emb[:,1,:], emb[:,2,:]], 1)
# Hidden-layer activations: one tanh output per example per neuron.
h = torch.tanh(embcat @ W1 + b1)
h.shape

##### Dimensions

For each example (32), each neuron (100) has one output. So we have an array of shape $\text{examples} \times \text{neuron outputs}$.

##### Generalizing a bit

In [None]:
# Same result as the explicit three-slice version, but without hard-coding
# the context length: torch.unbind(emb, 1) splits emb along dim 1 into one
# tensor per context position, and torch.cat joins them along dim 1.
emb_unbound = torch.cat(torch.unbind(emb, 1), 1)
h = torch.tanh(emb_unbound @ W1 + b1)
h.shape

##### But... there's an easier way

There's a simpler way: simply view it as already being $I \times 6$.

In [None]:
# Shape before flattening: I x 3 x 2.
print(emb.shape)
# View it as I x 6, instead of I x 3 x 2.
# The -1 means 'inferred' - as in, whatever size necessary to fit, which
# here is the number of examples I.
print(emb.view(-1, 6).shape)

##### Making the layer

So now we can do $\tanh \left( E w + b \right)$. Note that $b$ is broadcast.

In [None]:
# Flatten each example's 3 x 2 embeddings to 6 values with view, apply the
# hidden layer's weights and (broadcast) bias, then squash with tanh.
h = torch.tanh(emb.view(-1, 6) @ W1 + b1)
h.shape

In [None]:
h

In [None]:
h.shape

### Second (final) layer

Now we make the next layer of neurons. These are fully connected, so have 100 inputs; and they output 27 outputs, one for each 'character'

In [None]:
# Re-seed so the output-layer initialization is reproducible on its own.
gen = torch.random.manual_seed(44)
# Weights: 100 hidden activations in, 27 outputs (one per character index).
W2 = torch.randn((100, 27), generator=gen)
# One bias per output.
b2 = torch.randn(27, generator=gen)

So now let's get our initial output. The outputs are unnormalized log-probabilities (logits): one for each of the 26 letters, plus one for the `.` word-boundary token.

In [None]:
# Output layer: one row of 27 logits per example; b2 is broadcast over rows.
logits = h @ W2 + b2

In [None]:
logits.shape

For each example (row) $r$, we have logits `logits[r][i]` $= z_{r,i}$ for each class i.

$z_{r,i} = \log\left(Z_r\, p_{r,i}\right)$, where $\exp(z_{r,i})$ is a pseudo-count (unnormalized weight) for class $i$ in row $r$.

The inverse is $p_{r,i} = \frac{1}{Z_r} \exp(z_{r,i})$,

where $Z_r = \sum_j \exp(z_{r,j})$ is the per-row normalization constant—the sum of those pseudo-counts for row $r$.

In [None]:
# Exponentiate the logits element-wise to get positive pseudo-counts.
counts = logits.exp()

Here we do the normalization - this is a `softmax` operation, done manually.

In [None]:
# Divide every row by its own sum so each row adds up to 1. Summing over
# dim 1 with keepdims=True leaves a column of row sums, which broadcasts
# across the columns of counts.
prob = counts / counts.sum(1, keepdims=True)

In [None]:
prob.shape

In [None]:
# Mean negative log-likelihood: for every row pick the probability assigned
# to the correct next character (column Y[r]), take the log, average over
# rows and negate. The row index is built from Y's length instead of a
# hard-coded 32, so the cell keeps working if the number of examples changes.
loss = -prob[torch.arange(Y.shape[0]), Y].log().mean()
loss

Note that the three lines above, repeated here, can be replaced by a single function call (see the next cell):

```python
counts = logits.exp()
prob = counts / counts.sum(1, keepdims=True)
loss = -prob[torch.arange(32), Y].log().mean()
```

This does a softmax normalization to get a loss function. PyTorch has a function for this, `cross_entropy`, which does the same thing but more efficiently (fewer steps / in-memory matrices).

In [None]:
# Same loss as the manual exp / normalize / log / mean computation, taken
# directly from the raw logits and the integer targets.
loss = F.cross_entropy(logits, Y)
loss

## now made respectable :)

In [None]:
Xtr.shape, Ytr.shape # dataset

In [None]:
# Dedicated generator for parameter initialization. All five tensors below
# draw from it in order, so reordering them changes their initial values.
g = torch.Generator().manual_seed(2147483647) # for reproducibility

# First round parameters
number_of_characters = 27 # 26 + start/stop (.)
block_size = 3            # Number of characters in each 'block' (trigram => 3)
embedding_dimensions = 2  # Number of dimensions in the embeddings
hidden_units = 100        # Number of neurons in the model's hidden layer(s)
steps = 10000

# The above underfits; the training set loss is the same as that on the
# dev set (2.33). So the network is too small.

# Second attempt - more hidden units
# embedding_dimensions = 2
# hidden_units = 300
# steps = 30000

# Larger size - more embedding dimensions, more hidden units
# (these assignments override the first-round values above and are the ones
# actually used below)
embedding_dimensions = 10
hidden_units = 200
steps = 100000

Tdim = block_size * embedding_dimensions  # Dimensions of a trigram vector
# Embedding table: one row per character
C = torch.randn((number_of_characters, embedding_dimensions), generator=g)
# Hidden layer: flattened context vector in, hidden_units activations out
W1 = torch.randn((Tdim, hidden_units), generator=g)
b1 = torch.randn(hidden_units, generator=g)
# Output layer: hidden activations in, one logit per character out
W2 = torch.randn((hidden_units, number_of_characters), generator=g)
b2 = torch.randn(number_of_characters, generator=g)

# Everything the optimizer is allowed to change
parameters = [C, W1, b1, W2, b2]

In [None]:
sum(p.nelement() for p in parameters) # number of parameters in total

In [None]:
# Ask autograd to track every trainable tensor so that loss.backward()
# fills in their .grad fields.
for param in parameters:
    param.requires_grad = True

In [None]:
# 1000 exponents evenly spaced from -3 to 0 ...
lre = torch.linspace(-3, 0, 1000)
# ... turned into candidate learning rates from 0.001 up to 1.
# NOTE(review): lrs is reassigned right before the training loop, so these
# values look like a leftover from a learning-rate search - confirm.
lrs = 10**lre

In [None]:
# Per-step statistics collected while training.
# lri presumably held the learning-rate exponent used at each step - confirm.
lri = []
# lossi: minibatch loss at each step
lossi = []
# stepi: the step index itself (x axis for the loss plot)
stepi = []

Learning rate (step size)

I use a learning rate that decays logarithmically over the run, from 1 down to 0.001; Karpathy instead changes it manually.

In [None]:
# Learning-rate schedule: one rate per step, spaced logarithmically from
# 10**0 down to 10**-3 over the whole run.
lrs = torch.logspace(0, -3, steps)

# Report progress about every 1% of the run. max() keeps the interval at
# least 1 so a run shorter than 100 steps does not divide by zero.
report_every = max(1, steps // 100)

for i in range(steps):

    # minibatch construct: 32 random row indices into the training set
    ix = torch.randint(0, Xtr.shape[0], (32,))

    # forward pass
    emb = C[Xtr[ix]] # (32, block_size, embedding_dimensions)
    h = torch.tanh(emb.view(-1, Tdim) @ W1 + b1) # (32, hidden_units)
    logits = h @ W2 + b2 # (32, number_of_characters)
    loss = F.cross_entropy(logits, Ytr[ix])

    # backward pass: clear old gradients, then backpropagate this minibatch
    for p in parameters:
        p.grad = None
    loss.backward()

    # update: plain gradient-descent step, done under no_grad so the
    # in-place parameter change is not recorded by autograd
    # (manual alternatives tried: lr = 0.1 if i < 100000 else 0.01,
    #  lr = 0.2 if i < 100000 else 0.02)
    lr = lrs[i]
    with torch.no_grad():
        for p in parameters:
            p += -lr * p.grad

    # track stats
    #lri.append(lre[i])
    stepi.append(i)
    lossi.append(loss.item())

    if i % report_every == 0:
        # Loss over the whole training set, for reporting only; no_grad
        # avoids building an autograd graph over every training example.
        with torch.no_grad():
            emb_full = C[Xtr]
            h_full = torch.tanh(emb_full.view(-1, Tdim) @ W1 + b1)
            logits_full = h_full @ W2 + b2
            loss_full = F.cross_entropy(logits_full, Ytr)
        perc = i // report_every
        # columns: percent done, step, learning rate, minibatch loss, full loss
        print(f'{perc:2d}% - {i:6d} ({lr:9.3g}): {loss:8.4f} {loss_full:8.4f}')

#print(loss.item())

### Plotting

- I switched to plotting the actual losses, with a log axis, rather than plotting the log of the losses.
- I plotted with a moving average, so you can see the curve.

In [None]:
def moving_average(data_set, periods=3):
    """Smooth ``data_set`` with an equal-weight window of ``periods`` samples.

    The window is applied with ``np.convolve`` in ``'same'`` mode, so for a
    series at least as long as the window the result has as many samples as
    the input; values near both ends are computed from a partially
    overlapping window and are therefore pulled towards zero.
    """
    import numpy as np

    kernel = np.ones(periods) / periods
    smoothed = np.convolve(data_set, kernel, mode='same')
    return smoothed

def gaussian_moving_average(x, data, sigma=3):
    """Smooth ``data`` with a normalized Gaussian window.

    The window spans ``int(3 * sigma)`` samples on each side of its centre.
    Only positions where the window lies completely inside ``data`` are
    produced ('valid' convolution), so ``radius`` samples are trimmed from
    each end, and ``x`` is trimmed the same way to stay aligned.

    Args:
        x: positions belonging to the samples in ``data`` (same length).
        data: the series to smooth.
        sigma: standard deviation of the Gaussian, in samples. ``0`` means
            no smoothing (the series is returned as-is).

    Returns:
        ``(xs, smoothed)``: the trimmed positions and the smoothed values.
        Both are empty when ``data`` is shorter than the window.

    Raises:
        ValueError: if ``sigma`` is negative.
    """
    import numpy as np

    if sigma < 0:
        raise ValueError("sigma must be non-negative")

    radius = int(3 * sigma)
    if sigma == 0:
        # A zero-width Gaussian would evaluate 0/0; use the identity kernel.
        kernel = np.ones(1)
    else:
        offsets = np.arange(-radius, radius + 1)
        kernel = np.exp(-(offsets**2) / (2 * sigma**2))
        kernel /= kernel.sum()  # weights sum to 1, preserving the data scale

    positions = np.asarray(x)
    values = np.asarray(data)

    # No window fits entirely inside the data. np.convolve would silently
    # swap its operands here, so return an explicit empty result instead.
    if values.size < kernel.size:
        return positions[:0], np.empty(0)

    # An explicit end index is needed: with radius == 0 the slice
    # [radius:-radius] would be [0:0] and drop every position.
    xs = positions[radius:len(positions) - radius]

    return xs, np.convolve(values, kernel, mode="valid")


In [None]:
# Raw per-step minibatch loss ...
plt.plot(stepi, lossi)

# ... with a Gaussian-smoothed curve drawn on top in black. The smoothing
# width is 1/200 of the number of recorded steps.
smooth_x, smooth_y = gaussian_moving_average(stepi, lossi, len(stepi)//200)
plt.plot(smooth_x, smooth_y, color='black')

# Logarithmic y axis for the losses.
plt.yscale('log')

In [None]:
# Loss over the whole training split. This is evaluation only, so run it
# under no_grad to avoid building an autograd graph for every example.
with torch.no_grad():
    emb = C[Xtr] # (N, block_size, embedding_dimensions)
    h = torch.tanh(emb.view(-1, Tdim) @ W1 + b1) # (N, hidden_units)
    logits = h @ W2 + b2 # (N, number_of_characters)
    loss = F.cross_entropy(logits, Ytr)
loss

In [None]:
# Loss over the whole dev/validation split. This is evaluation only, so run
# it under no_grad to avoid building an autograd graph for every example.
with torch.no_grad():
    emb = C[Xdev] # (N, block_size, embedding_dimensions)
    h = torch.tanh(emb.view(-1, Tdim) @ W1 + b1) # (N, hidden_units)
    logits = h @ W2 + b2 # (N, number_of_characters)
    loss = F.cross_entropy(logits, Ydev)
loss

In [None]:
# visualize dimensions 0 and 1 of the embedding matrix C for all characters
# (any further embedding dimensions are not shown)
coords = C.detach()  # plain values, detached from autograd, for plotting
plt.figure(figsize=(8,8))
plt.scatter(coords[:,0], coords[:,1], s=200)
# label every point with the character it belongs to
for i, (x0, x1) in enumerate(coords[:, :2].tolist()):
    plt.text(x0, x1, itos[i], ha="center", va="center", color='white')
# The original passed 'minor' as the first positional argument, which is the
# on/off flag rather than the tick selector; it only worked because a
# non-empty string is truthy. Switch the grid on explicitly.
plt.grid(True)

In [None]:
# training split, dev/validation split, test split
# 80%, 10%, 10%

In [None]:
# Shape check for sampling: a single all-zero context wrapped in an extra
# list becomes a batch of one, so the lookup yields one embedding per
# context position for that single example.
context = [0] * block_size
C[torch.tensor([context])].shape

In [None]:
# sample from the model
g = torch.Generator().manual_seed(2147483647 + 10)

# Sampling is inference only: no_grad skips the autograd bookkeeping that
# would otherwise be recorded for every generated character.
with torch.no_grad():
    for _ in range(20):

        out = []
        context = [0] * block_size # initialize with all ...
        while True:
            emb = C[torch.tensor([context])] # (1,block_size,d)
            h = torch.tanh(emb.view(1, -1) @ W1 + b1)
            logits = h @ W2 + b2
            probs = F.softmax(logits, dim=1)
            # draw the next character index from the predicted distribution
            ix = torch.multinomial(probs, num_samples=1, generator=g).item()
            # slide the context window and record the sampled character
            context = context[1:] + [ix]
            out.append(ix)
            # index 0 is '.', the end-of-word marker
            if ix == 0:
                break

        print(''.join(itos[i] for i in out))