- logits almost at zero at initialization => init to smaller params values
- saturation of tanh => why we don't want tails distributions (dead neurons that never activate and update less frequently) [ occurs for activation function with flat tails ]
- activation gain: https://pytorch.org/docs/stable/nn.init.html
used to adjust the standard deviation of the weights due to the non-linear activation. This strategy cannot be used for deeper neural network because we don't really know the gain we should use, so we use batch normalization at each layer instead.
- initializing tensor with kaiming normal
- batch normalization with gaussian only at initialization: normalize + scale and shift => calculate running mean/std instead of overall mean/std to avoid having to re-train
- when applying normalization, we don't need to add bias because it doesn't do anything
- smaller training set requires smaller momentum 

https://github.com/karpathy/nn-zero-to-hero/blob/master/lectures/makemore/makemore_part3_bn.ipynb

In [1]:
import numpy as np
import torch
import matplotlib.pyplot as plt
import pandas as pd
import random

## 1. MLP Revisited

### 1.1. Making the dataset

In [None]:
# --- 0. reading the dataset
with open('names.txt', 'r') as f:
    names = f.readlines()
names = [name.strip() for name in names]

In [7]:
# letters dict
chrs = '.abcdefghijklmnopqrstuvwxyz'
ctoi = {c: i for i, c in enumerate(chrs)}
itoc = {i: c for i, c in enumerate(chrs)}

In [85]:
# --- 1. creating the dataset
block_size = 3
X, y = [], []
for name in names:
    context = [0] * block_size
    for char in name:
        idx = ctoi[char]
        context = context[1:] + [idx]
        X.append(context)
        y.append(idx)
X = np.array(X)
y = np.array(y)

In [86]:
# split into training and test dataset
n = X.shape[0]
training_split = 0.8
training_indexes = random.sample(range(n), int(n*training_split))
test_indexes = list(set(range(n)) - set(training_indexes))

# 
X_train, y_train = X[training_indexes], y[training_indexes]
X_test, y_test = X[test_indexes], y[test_indexes]

### 1.2. MLP 2.0

In [111]:
# --- 2. init mlp params with adjustment
n_emb = 10 # size of character embedding
n_hidden = 20 # number of neurons in hidden layer
vocab_size = len(chrs) # 27

# parameters: C, W1, b1, W2, b2, bnmean, bnstd
# first layer: tanh -> gain: 5/3 * std
# second layer: sigmoid -> gain: 1 * std
# std: sqrt(n_emb * block_size)
g = torch.Generator().manual_seed(420)
C = torch.randn((vocab_size, n_emb), generator=g)
W1 = torch.randn((n_emb * block_size, n_hidden), generator=g) * 5/3 * (n_emb * block_size)**2
b1 = torch.randn(n_hidden, generator=g) * 0.01
W2 = torch.randn((n_hidden, vocab_size), generator=g) * 0.01
b2 = torch.randn(vocab_size, generator=g) * 0.0

# Batch norm parameters
bngain = torch.ones((1, n_hidden))
bnbias = torch.zeros((1, n_hidden))
bn_mean_running = torch.zeros((1, n_hidden))
bn_std_running = torch.ones((1, n_hidden))

params = [C, W1, W2, b2, bngain, bnbias]
for param in params:
    param.requires_grad = True

In [112]:
sum(p.nelement() for p in params)

1477

In [114]:
# --- 3. train MLP with (1) batch gradient descent (2) batch normalization 
#        (3) decay learning rate
num_epochs = 200000
batch_size = 32

for i in range(num_epochs):
    # create batch size
    idxs = random.sample(range(X_train.shape[0]), batch_size)
    Xb, yb = torch.tensor(X_train[idxs]), torch.tensor(y_train[idxs])

    # -- forward pass
    emb = C[Xb]
    embcat = emb.view(emb.shape[0], -1)
    # linear layer
    h1 = embcat @ W1 # + b1
    # batch normalization
    bnmeani = h1.mean(0, keepdim=True)
    bnstdi = h1.std(0, keepdim=True)
    h2 = bngain * (h1 - bnmeani) / bnstdi + bnbias
    with torch.no_grad():
        bn_mean_running = 0.999 * bn_mean_running + 0.001 * bnmeani
        bn_std_running = 0.999 * bn_std_running + 0.001 * bnstdi
    # non-linear
    h3 = torch.tanh(h2)
    logits = h3 @ W2 + b2
    loss = torch.nn.functional.cross_entropy(logits, yb)

    # -- backward pass
    for p in params:
        p.grad = None
    loss.backward()

    # -- update weights
    lr = 0.1 if i < 100000 else 0.01
    for p in params:
        p.data += -lr * p.grad

    # -- print stats
    if i % 10000 == 0:
        print(f"Epoch {i}: {loss:4f}")

Epoch 0: 0.007345
Epoch 10000: 0.006155
Epoch 20000: 0.004254
Epoch 30000: 0.002095
Epoch 40000: 0.002000
Epoch 50000: 0.001576
Epoch 60000: 0.001404
Epoch 70000: 0.002732
Epoch 80000: 0.000926
Epoch 90000: 0.038557
Epoch 100000: 0.000888
Epoch 110000: 0.001859
Epoch 120000: 0.000832
Epoch 130000: 0.000341
Epoch 140000: 0.001595
Epoch 150000: 0.012919
Epoch 160000: 0.006076
Epoch 170000: 0.000597
Epoch 180000: 0.000756
Epoch 190000: 0.001005


In [None]:
# --- 4. make inference

### 2. MLP with PyTorch