## makemore: becoming a backprop ninja

In [None]:
# There is no change in the first several cells from the last lecture

In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt  # for making figures
% matplotlib inline

In [2]:
# download the names.txt file from github
!wget https: // raw.githubusercontent.com / karpathy / makemore / master / names.txt

dyld[59552]: Library not loaded: '/usr/local/opt/libunistring/lib/libunistring.2.dylib'
  Referenced from: '/usr/local/Cellar/wget/1.21.1/bin/wget'
  Reason: tried: '/usr/local/opt/libunistring/lib/libunistring.2.dylib' (no such file), '/usr/local/lib/libunistring.2.dylib' (no such file), '/usr/lib/libunistring.2.dylib' (no such file), '/usr/local/Cellar/libunistring/1.1/lib/libunistring.2.dylib' (no such file), '/usr/local/lib/libunistring.2.dylib' (no such file), '/usr/lib/libunistring.2.dylib' (no such file)


In [3]:
# read in all the words (one per line); the context manager closes the file handle
# as soon as the contents have been read
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()
print(len(words))  # number of words
print(max(len(w) for w in words))  # length of the longest word
print(words[:8])  # a small sample

32033
15
['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']


In [4]:
# character vocabulary and the two lookup tables (character <-> integer id)
chars = sorted(set(''.join(words)))
stoi = dict(zip(chars, range(1, len(chars) + 1)))  # ids start at 1 ...
stoi['.'] = 0  # ... because id 0 is reserved for the '.' token
itos = dict((i, s) for s, i in stoi.items())  # inverse mapping: id -> character
vocab_size = len(itos)
print(itos)
print(vocab_size)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
27


In [492]:
# build the dataset
block_size = 3  # context length: number of preceding characters used to predict the next one


def build_dataset(words):
    """Turn a list of words into (context, next-character) tensors.

    Each word is terminated with '.', and every character of the terminated
    word yields one example: the `block_size` character ids that precede it
    (left-padded with 0) as the input, and its own id as the target.

    Prints the shapes of both tensors and returns them as `(X, Y)`.
    """
    contexts, targets = [], []

    for word in words:
        window = [0] * block_size  # padding context at the start of every word
        for ch in word + '.':
            target = stoi[ch]
            contexts.append(window)
            targets.append(target)
            window = [*window[1:], target]  # slide the window one character forward

    X, Y = torch.tensor(contexts), torch.tensor(targets)
    print(X.shape, Y.shape)
    return X, Y


import random

# deterministic shuffle, then an 80% / 10% / 10% split by word
random.seed(42)
random.shuffle(words)
num_words = len(words)
n1 = int(0.8 * num_words)  # end of the training slice
n2 = int(0.9 * num_words)  # end of the dev slice

Xtr, Ytr = build_dataset(words[:n1])  # training split
Xdev, Ydev = build_dataset(words[n1:n2])  # dev / validation split
Xte, Yte = build_dataset(words[n2:])  # test split

torch.Size([182484, 3]) torch.Size([182484])
torch.Size([22869, 3]) torch.Size([22869])
torch.Size([22793, 3]) torch.Size([22793])


In [None]:
# ok boilerplate done, now we get to the action:

In [493]:
n_embd = 10  # the dimensionality of the character embedding vectors
n_hidden = 64  # the number of neurons in the hidden layer of the MLP

# Every random draw below goes through this seeded generator so that a
# Restart & Run All reproduces the same parameters and the same minibatch.
g = torch.Generator().manual_seed(2147483647)  # for reproducibility
C = torch.randn((vocab_size, n_embd), generator=g)  # embedding table

# Layer 1
W1 = (
    torch.randn((n_embd * block_size, n_hidden), generator=g) *
        (5/3) / ((n_embd * block_size) ** 0.5)
)
# using b1 just for fun, it's useless because of BN:
b1 = torch.randn(n_hidden, generator=g) * 0.1

# Layer 2
W2 = torch.randn((n_hidden, vocab_size), generator=g) * 0.1
b2 = torch.randn(vocab_size, generator=g) * 0.1
# BatchNorm parameters
# (drawn from `g` as well; without `generator=g` they would come from the
# unseeded global RNG and differ on every run)
bn_gain = torch.randn((1, n_hidden), generator=g) * 0.1 + 1.0
bn_bias = torch.randn((1, n_hidden), generator=g) * 0.1

# Note: I am initializing many of these parameters in non-standard ways
# because sometimes initializing with e.g. all zeros could mask an incorrect
# implementation of the backward pass.

parameters = [C, W1, b1, W2, b2, bn_gain, bn_bias]
print(sum(p.nelement() for p in parameters))  # number of parameters in total
for p in parameters:
    p.requires_grad = True

4137


In [494]:
# draw one random minibatch from the training split
batch_size = 32
n = batch_size  # short alias, used heavily in the manual backward pass
ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)  # random example indices
Xb = Xtr[ix]  # batch inputs
Yb = Ytr[ix]  # batch targets

In [495]:
# forward pass, "chunkated" into smaller steps that are possible to backward one at a time
# Every intermediate result gets its own name so that its gradient can be derived
# by hand and compared with the one autograd stores in `.grad`.
emb = C[Xb]  # embed the characters into vectors
emb_cat = emb.view(emb.shape[0], -1)  # concatenate the vectors
# Linear layer 1
h_pre_bn = emb_cat @ W1 + b1  # hidden layer pre-activation
# BatchNorm layer
bn_mean_i = 1 / n * h_pre_bn.sum(0, keepdim=True)  # per-feature mean over the batch
bn_diff = h_pre_bn - bn_mean_i  # centered pre-activations
bn_diff_squared = bn_diff ** 2
# note: Bessel's correction (dividing by n-1, not n)
bn_var = 1 / (n - 1) * bn_diff_squared.sum(0, keepdim=True)  # per-feature variance over the batch
bn_var_inv = (bn_var + 1e-5) ** -0.5  # 1 / sqrt(var + eps)
bn_raw = bn_diff * bn_var_inv  # normalized pre-activations
h_pre_act = bn_gain * bn_raw + bn_bias  # scale and shift
# Non-linearity
h = torch.tanh(h_pre_act)  # hidden layer
# Linear layer 2
logits = h @ W2 + b2  # output layer
# cross entropy loss (same as F.cross_entropy(logits, Yb))
logit_maxes = logits.max(1, keepdim=True).values  # row-wise maximum
norm_logits = logits - logit_maxes  # subtract max for numerical stability
counts = norm_logits.exp()
counts_sum = counts.sum(1, keepdims=True)  # row-wise normalizer
counts_sum_inv = counts_sum ** -1  # if I use (1.0 / counts_sum) instead then I can't get backprop to be a bit exact...
probs = counts * counts_sum_inv  # softmax probabilities
log_probs = probs.log()
loss = -log_probs[range(n), Yb].mean()  # mean negative log-probability of the targets

# PyTorch backward pass
for p in parameters:
    p.grad = None  # clear gradients left over from a previous run
# retain_grad() is called on every intermediate so that `.grad` is available on
# them after backward(); cmp() reads `t.grad` to check the manual gradients.
for t in [
    # afaik there is no cleaner way
    log_probs, probs, counts, counts_sum, counts_sum_inv,
    norm_logits, logit_maxes, logits, h, h_pre_act, bn_raw,
    bn_var_inv, bn_var, bn_diff_squared, bn_diff, h_pre_bn, bn_mean_i,
    emb_cat, emb
]:
    t.retain_grad()
loss.backward()
loss  # last expression of the cell: displays the loss value

tensor(3.5976, grad_fn=<NegBackward0>)

In [569]:
# utility function we will use later when comparing manual gradients to PyTorch gradients
def cmp(s, dt: torch.Tensor, t: torch.Tensor):
    """Print how closely a manually derived gradient matches autograd's.

    Args:
        s: label printed at the start of the report line.
        dt: the manually computed gradient for `t`.
        t: tensor whose `.grad` has been populated by a backward pass.

    Raises:
        ValueError: if `t.grad` is missing, or if `dt` and `t.grad` have
            different shapes. Without the shape check, broadcasting would let
            a gradient of the wrong shape (e.g. (1, k) instead of (k,)) be
            reported as an exact match.
    """
    if t.grad is None:
        raise ValueError(
            f'{s}: tensor has no .grad (run backward() first; non-leaf tensors need retain_grad())'
        )
    if dt.shape != t.grad.shape:
        raise ValueError(
            f'{s}: shape mismatch, manual gradient is {tuple(dt.shape)} '
            f'but autograd gradient is {tuple(t.grad.shape)}'
        )
    ex = torch.all(dt == t.grad).item()  # bit-exact equality
    app = torch.allclose(dt, t.grad)  # equality within torch.allclose's default tolerances
    diff = (dt - t.grad).abs()
    maxdiff_idx = torch.argmax(diff)  # flat index of the largest deviation
    print(
        f'{s:15s}'
        f' | exact: {str(ex):5s}'
        f' | approx: {str(app):5s}'
        f' | max diff: {diff.flatten()[maxdiff_idx]}'
        f' | max expected: {t.grad.flatten()[maxdiff_idx]}'
        f' | max calculated: {dt.flatten()[maxdiff_idx]}'
    )

# Exercise 1: backprop through the whole thing manually, backpropagating through exactly
# all the variables as they are defined in the forward pass above, one by one.
# Convention: d_<name> holds d(loss)/d(<name>) and always has the same shape as <name>.
# Each step multiplies the local derivative by the gradient that arrived from the
# step after it in the forward pass (chain rule).

# -----------------
# loss = -log_probs[range(n), Yb].mean()
# 1. x.mean() = x.sum() / size
# 2. x.sum() sums all elements in x. However, we are only interested in how a single element impacts
# the objective function. So for us, x.sum() = x1 + a1 + a2 + a3..., with a1, a2, a3, ... being constants
# that do not affect the gradient at all. So the gradient of x.sum() with respect to x1 is 1.0.
# 3. For x.mean(), we should divide that gradient by the total number of elements of x. So the effect of
# one element x1 is only its share in the array x, which is 1.0 / len(x).
# 4. So the gradient of loss with respect to one element in log_probs is: -1.0 / log_probs[range(n), Yb].nelement(),
# but only for those elements in log_probs[range(n), Yb]; for the rest of the elements in log_probs, it's 0.
# 5. For every element in log_probs, it would be:
d_log_probs = torch.zeros_like(log_probs)
size = log_probs[range(n), Yb].nelement()  # one selected entry per row of the batch
d_log_probs[range(n), Yb] = -1.0 / size
# cmp('log_probs', d_log_probs, log_probs)

# log_probs = probs.log()
# derivative of log(x) is 1/x
d_probs = probs**-1 * d_log_probs
# cmp('probs', d_probs, probs)

# probs = counts * counts_sum_inv
# counts_sum_inv is (32, 1), so its second dimension is broadcast to make it (32, 27).
# So this multiplication is in fact equivalent to something like:
# probs = counts * [counts_sum_inv, counts_sum_inv, counts_sum_inv, ...].
# The derivative of just the multiplication is just counts (times d_probs for the chain rule),
# and the derivative of broadcasting is accumulating the gradient 27 times (each element in
# counts_sum_inv impacts probs one time for each column in counts), so we can call
# .sum(dim=1, keepdim=True) to accumulate and make it finally the same shape as counts_sum_inv.
d_counts_sum_inv = (d_probs * counts).sum(dim=1, keepdim=True)
# cmp('counts_sum_inv', d_counts_sum_inv, counts_sum_inv)

# counts_sum_inv = counts_sum ** -1
# derivative of x**-1 is -x**-2
d_counts_sum = d_counts_sum_inv * -counts_sum**-2
# cmp('counts_sum', d_counts_sum, counts_sum)

# counts used twice:
# 1. counts_sum = counts.sum(1, keepdim=True)
# (a row sum passes its gradient unchanged to every element of that row)
d_counts1 = torch.ones_like(counts) * d_counts_sum
# 2. probs = counts * counts_sum_inv = counts * (counts_sum ** -1) = counts * (counts.sum(1, keepdim=True) ** -1)
d_counts2 = counts_sum_inv * d_probs
# ...so we calculate its derivative twice and sum the two contributions
d_counts = d_counts1 + d_counts2
# cmp('counts', d_counts, counts)

# counts = norm_logits.exp() -> d_counts /d_norm_logits = norm_logits.exp() = counts
d_norm_logits = counts * d_counts
# cmp('norm_logits', d_norm_logits, norm_logits)

# norm_logits = logits - logit_maxes  # subtract max for numerical stability
# logit_maxes is subtracted from every column of its row, hence the minus sign and the
# sum over dim=1 (keepdim=True keeps the same shape as logit_maxes).
d_logit_maxes = -d_norm_logits.sum(dim=1, keepdim=True)
# cmp('logit_maxes', d_logit_maxes, logit_maxes)

# logits is used twice, so its gradient is the sum of both contributions:
# 1. norm_logits = logits - logit_maxes  # subtract max for numerical stability
d_logits1 = d_norm_logits
# 2. logit_maxes = logits.max(1, keepdim=True).values
# Only the arg-max entry of each row feeds logit_maxes, so that row's gradient is
# routed to exactly that entry; a one-hot mask of the arg-max positions does this.
indices = logits.max(1).indices
d_logits2 = F.one_hot(indices, num_classes=logits.shape[1]) * d_logit_maxes
# equivalent scatter form (sized with n, not a hard-coded batch size):
#   d_logits2 = torch.zeros_like(logits)
#   d_logits2[range(n), indices] = d_logit_maxes.view(n)
# finally, summing up all impacts
d_logits = d_logits1 + d_logits2
# cmp('logits', d_logits, logits)

# logits = h @ W2 + b2  # output layer
# For a matrix product the gradients are again matrix products with the transposed
# operands, arranged so that each result has the shape of the tensor it belongs to.
dh = d_logits @ W2.T
# cmp('h', dh, h)

dW2 = h.T @ d_logits
# cmp('W2', dW2, W2)

# b2 is broadcast across the rows of the batch, so its gradient sums over them
db2 = d_logits.sum(0)
# cmp('b2', db2, b2)

# h = torch.tanh(h_pre_act)  # hidden layer
# d tanh(x)/dx = 1 - tanh(x)**2, and h already holds tanh(h_pre_act)
d_h_pre_act = (1 - h**2) * dh
# cmp('h_pre_act', d_h_pre_act, h_pre_act)

# h_pre_act = bn_gain * bn_raw + bn_bias
# bn_gain and bn_bias have a leading dimension of size 1 that is broadcast over the
# batch, so their gradients are summed over dim 0 (keepdim=True keeps that dimension).
d_bn_raw = bn_gain * d_h_pre_act
# cmp('bn_raw', d_bn_raw, bn_raw)

d_bn_bias = d_h_pre_act.sum(0, keepdim=True)
# cmp('bn_bias', d_bn_bias, bn_bias)

d_bn_gain = (bn_raw * d_h_pre_act).sum(0, keepdim=True)
# cmp('bn_gain', d_bn_gain, bn_gain)

# bn_raw = bn_diff * bn_var_inv
# bn_var_inv has one row that is broadcast over the batch, so its gradient is summed over dim 0.
# print(f'{bn_raw.shape=} = {bn_diff.shape=} * {bn_var_inv.shape=}')
d_bn_var_inv = (d_bn_raw * bn_diff).sum(0, keepdim=True)
# cmp('bn_var_inv', d_bn_var_inv, bn_var_inv)

# first of two contributions to bn_diff (the second one comes through bn_diff_squared below)
d_bn_diff1 = d_bn_raw * bn_var_inv

# bn_var_inv = (bn_var + 1e-5) ** -0.5
# derivative of x**-0.5 is -0.5 * x**-1.5
d_bn_var = -0.5 * (bn_var + 1e-5)**-1.5 * d_bn_var_inv
# cmp('bn_var', d_bn_var, bn_var)

# bn_var = 1 / (n - 1) * bn_diff_squared.sum(0, keepdim=True)
# a column sum passes its gradient unchanged to every element of the column,
# scaled here by the constant factor 1 / (n - 1)
# print(f'{bn_var.shape=}, {bn_diff_squared.shape=}')
# print(f'{torch.ones_like(bn_diff_squared).shape=}')
d_bn_diff_squared = torch.ones_like(bn_diff_squared) * 1 / (n - 1) * d_bn_var
# cmp('bn_diff_squared', d_bn_diff_squared, bn_diff_squared)

# bn_diff_squared = bn_diff ** 2
# derivative of x**2 is 2*x; this is the second contribution to bn_diff
d_bn_diff2 = 2 * bn_diff * d_bn_diff_squared
d_bn_diff = d_bn_diff1 + d_bn_diff2
# cmp('bn_diff', d_bn_diff, bn_diff)

# bn_diff = h_pre_bn - bn_mean_i
# bn_mean_i is broadcast over the batch and enters with a minus sign
# print(f'{h_pre_bn.shape=}, {bn_mean_i.shape=}, {bn_diff.shape=}')
d_bn_mean_i = -d_bn_diff.sum(0, keepdim=True)
# cmp('bn_mean_i', d_bn_mean_i, bn_mean_i)

# h_pre_bn is used twice:
# 1. bn_diff = h_pre_bn - bn_mean_i (gradient passes straight through)
d_h_pre_bn1 = d_bn_diff
# 2. bn_mean_i = 1 / n * h_pre_bn.sum(0, keepdim=True)
# print(f'{d_h_pre_bn1.shape=}, {h_pre_bn.shape=}, {d_bn_mean_i.shape=}')
d_h_pre_bn2 = torch.ones_like(h_pre_bn) * 1 / n * d_bn_mean_i
d_h_pre_bn = d_h_pre_bn1 + d_h_pre_bn2
# cmp('h_pre_bn', d_h_pre_bn, h_pre_bn)

# h_pre_bn = emb_cat @ W1 + b1  # hidden layer pre-activation
# same pattern as the output layer: transposed matrix products, and a sum over the
# batch dimension for the broadcast bias
# print(f'{h_pre_bn.shape=} = {emb_cat.shape=} @ {W1.shape=} + {b1.shape=}')
d_emb_cat = d_h_pre_bn @ W1.T
dW1 = emb_cat.T @ d_h_pre_bn
db1 = d_h_pre_bn.sum(0)
# print(f'{b1.shape=} {db1.shape=} {h_pre_bn.shape=}')
# cmp('emb_cat', d_emb_cat, emb_cat)
# cmp('W1', dW1, W1)
# cmp('b1', db1, b1)

# emb_cat = emb.view(emb.shape[0], -1)  # concatenate the vectors
# a view only re-arranges the shape, so the gradient is viewed back to emb's shape
d_emb = d_emb_cat.view(emb.shape)
# print(f'{emb.shape=}, {emb_cat.shape=}, {d_emb.shape=}')
# cmp('emb', d_emb, emb)

# emb = C[Xb]  # embed the characters into vectors
# equivalent to: emb = F.one_hot(Xb, num_classes=C.shape[0]).float() @ C
# so dC is the one-hot selector contracted with d_emb over the batch and position axes.
# The number of classes is read from C itself (instead of a hard-coded 27), and the
# one-hot tensor is built once and reused.
one_hot_Xb = F.one_hot(Xb, num_classes=C.shape[0])
print(f'{emb.shape=} = {one_hot_Xb.shape=} @ {C.shape=}')
print()
print(f'We have: {d_emb.shape=} and {one_hot_Xb.shape=}')
print(f'We need shape: {C.shape=}')
# move the class axis to the front so that the two trailing axes line up with d_emb's two leading axes
permuted = one_hot_Xb.permute(2, 0, 1)
print(f'First, we permute the one-hot vector: {permuted.shape=}')
print(f'Then we dot-multiply: {permuted.shape} @ {d_emb.shape}')
# dims=2 contracts the last two axes of `permuted` with the first two axes of `d_emb`
dC = torch.tensordot(permuted.float(), d_emb, dims=2)
print(f'Which gives: {dC.shape}')
assert dC.shape == C.shape
print(f'{dC.shape=}')
cmp('C', dC, C)

RuntimeError: The size of tensor a (200) must match the size of tensor b (64) at non-singleton dimension 1

In [None]:
# Exercise 2: backprop through cross_entropy but all in one go
# to complete this challenge look at the mathematical expression of the loss,
# take the derivative, simplify the expression, and just write it out

# forward pass

# before (the step-by-step version used in Exercise 1):
# logit_maxes = logits.max(1, keepdim=True).values
# norm_logits = logits - logit_maxes # subtract max for numerical stability
# counts = norm_logits.exp()
# counts_sum = counts.sum(1, keepdim=True)
# counts_sum_inv = counts_sum**-1 # if I use (1.0 / counts_sum) instead then I can't get backprop to be bit exact...
# probs = counts * counts_sum_inv
# log_probs = probs.log()
# loss = -log_probs[range(n), Yb].mean()

# now (a single library call), followed by its deviation from the step-by-step loss:
loss_fast = F.cross_entropy(input=logits, target=Yb)
loss_gap = loss_fast - loss
print(loss_fast.item(), 'diff:', loss_gap.item())

In [563]:
# backward pass

# -----------------
# YOUR CODE HERE :)
# loss_fast = F.cross_entropy(logits, Yb)
# d(loss)/d(logits) = (softmax(logits) - one_hot(Yb)) / n
# The division by n comes from the mean over the n rows of the batch.
# detach(): the manual gradient is a plain value, so it is taken out of the autograd
# graph before being edited in place.
d_logits = F.softmax(logits, 1).detach()
d_logits[torch.arange(n), Yb] -= 1  # subtract 1 at the target class of every row
d_logits /= n
print(f'{d_logits.shape=}, {probs.shape=}, {logits.shape=}')
# -----------------

cmp('logits', d_logits, logits) # I can only get approximate to be true, my maxdiff is 6e-9

d_logits.shape=torch.Size([32, 27]), probs.shape=torch.Size([32, 27]), logits.shape=torch.Size([32, 27])
logits          | exact: False | approx: True  | max diff: 7.2177499532699585e-09 | max expected: 0.003694217186421156 | max calculated: 0.003694224404171109


In [565]:
# Exercise 3: backprop through batchnorm but all in one go
# to complete this challenge look at the mathematical expression of the output of batchnorm,
# take the derivative w.r.t. its input, simplify the expression, and just write it out
# BatchNorm paper: https://arxiv.org/abs/1502.03167

# forward pass

# before (the step-by-step version used in Exercise 1):
# bn_mean_i = 1/n*h_pre_bn.sum(0, keepdim=True)
# bn_diff = h_pre_bn - bn_mean_i
# bn_diff_squared = bn_diff**2
# bn_var = 1/(n-1)*(bn_diff_squared).sum(0, keepdim=True) # note: Bessel's correction (dividing by n-1, not n)
# bn_var_inv = (bn_var + 1e-5)**-0.5
# bn_raw = bn_diff * bn_var_inv
# h_pre_act = bn_gain * bn_raw + bn_bias

print(f'{h_pre_act.shape=} {h_pre_bn.shape=}')
print(f'{bn_gain.shape=} {bn_bias.shape=}')

# now: the same computation written as one expression on top of mean() / var()
batch_mean = h_pre_bn.mean(0, keepdim=True)
batch_std = torch.sqrt(h_pre_bn.var(0, keepdim=True, unbiased=True) + 1e-5)
h_pre_act_fast = bn_gain * (h_pre_bn - batch_mean) / batch_std + bn_bias
print('max diff:', (h_pre_act_fast - h_pre_act).abs().max())

h_pre_act.shape=torch.Size([32, 64]) h_pre_bn.shape=torch.Size([32, 64])
bn_gain.shape=torch.Size([1, 64]) bn_bias.shape=torch.Size([1, 64])
max diff: tensor(4.7684e-07, grad_fn=<MaxBackward1>)


In [None]:
# backward pass

# before we had (step-by-step version, written with the lecture's variable names):
# dbnraw = bngain * dhpreact
# dbndiff = bnvar_inv * dbnraw
# dbnvar_inv = (bndiff * dbnraw).sum(0, keepdim=True)
# dbnvar = (-0.5*(bnvar + 1e-5)**-1.5) * dbnvar_inv
# dbndiff2 = (1.0/(n-1))*torch.ones_like(bndiff2) * dbnvar
# dbndiff += (2*bndiff) * dbndiff2
# dhprebn = dbndiff.clone()
# dbnmeani = (-dbndiff).sum(0)
# dhprebn += 1.0/n * (torch.ones_like(hprebn) * dbnmeani)

# calculate dhprebn given dhpreact (i.e. backprop through the batchnorm)
# (you'll also need to use some of the variables from the forward pass up above)
# NOTE: this cell reads d_h_pre_act, bn_raw, bn_var_inv, bn_gain and n from earlier
# cells, so those must have been run (with matching shapes) first.

# -----------------
# YOUR CODE HERE :)
# d_h_pre_bn = None  # TODO. my solution is 1 (long) line
# The three terms inside the inner parentheses are:
#   n * d_h_pre_act                      - the direct path through bn_raw
#   d_h_pre_act.sum(0)                   - the path through the batch mean
#   n/(n-1) * bn_raw * (...).sum(0)      - the path through the batch variance
#                                          (n - 1 because of Bessel's correction)
# and everything is scaled by bn_gain * bn_var_inv / n.
d_h_pre_bn = (
    bn_gain * bn_var_inv / n * (
        n * d_h_pre_act - d_h_pre_act.sum(0) - n / (n - 1) * bn_raw * (
            d_h_pre_act * bn_raw).sum(0)
    )
)
# -----------------

# I can only get approximate to be true, my maxdiff is 9e-10
cmp('hprebn', d_h_pre_bn, h_pre_bn)

In [617]:
# Exercise 4: putting it all together!
# Train the MLP neural net with your own backward pass

# init
# NOTE: this re-creates every parameter, now with n_hidden = 200. Tensors left over
# from the exercise cells above were built from the earlier parameters, so those
# cells have to be re-run from their own initialization before being used again.
n_embd = 10  # the dimensionality of the character embedding vectors
n_hidden = 200  # the number of neurons in the hidden layer of the MLP

# every random draw below goes through this seeded generator
g = torch.Generator().manual_seed(2147483647)  # for reproducibility
C = torch.randn((vocab_size, n_embd), generator=g)
# Layer 1
W1 = torch.randn((n_embd * block_size, n_hidden), generator=g) * (5 / 3) / (
        (n_embd * block_size) ** 0.5)
b1 = torch.randn(n_hidden, generator=g) * 0.1
# Layer 2
W2 = torch.randn((n_hidden, vocab_size), generator=g) * 0.1
b2 = torch.randn(vocab_size, generator=g) * 0.1
# BatchNorm parameters
# (drawn from `g` as well; without `generator=g` they would come from the
# unseeded global RNG and differ on every run)
bn_gain = torch.randn((1, n_hidden), generator=g) * 0.1 + 1.0
bn_bias = torch.randn((1, n_hidden), generator=g) * 0.1

parameters = [C, W1, b1, W2, b2, bn_gain, bn_bias]
print(sum(p.nelement() for p in parameters))  # number of parameters in total
for p in parameters:
    p.requires_grad = True

# same optimization as last time
max_steps = 200000
batch_size = 32
n = batch_size  # convenience
loss_i = []  # log10 of the loss at every step

# The whole loop can run under `with torch.no_grad():` once loss.backward() has been
# removed (TODO). Until then only the manual backward pass is wrapped, because it
# never needs autograd.

# kick off optimization
for i in range(max_steps):

    # minibatch construct
    ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
    Xb, Yb = Xtr[ix], Ytr[ix]  # batch X,Y

    # forward pass
    emb = C[Xb]  # embed the characters into vectors
    emb_cat = emb.view(emb.shape[0], -1)  # concatenate the vectors
    # Linear layer
    h_pre_bn = emb_cat @ W1 + b1  # hidden layer pre-activation
    # BatchNorm layer
    # -------------------------------------------------------------
    bn_mean = h_pre_bn.mean(0, keepdim=True)
    bn_var = h_pre_bn.var(0, keepdim=True, unbiased=True)
    bn_var_inv = (bn_var + 1e-5) ** -0.5
    bn_raw = (h_pre_bn - bn_mean) * bn_var_inv
    h_pre_act = bn_gain * bn_raw + bn_bias
    # -------------------------------------------------------------
    # Non-linearity
    h = torch.tanh(h_pre_act)  # hidden layer
    logits = h @ W2 + b2  # output layer
    loss = F.cross_entropy(logits, Yb)  # loss function

    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()  # use this for correctness comparisons, delete it later!

    # manual backprop! #swole_doge_meme
    # -----------------
    with torch.no_grad():  # the hand-written gradients must not be recorded by autograd
        # cross entropy: (softmax - one_hot(targets)) / n
        d_logits = F.softmax(logits, 1)
        d_logits[range(n), Yb] -= 1
        d_logits /= n

        # output layer and tanh
        dh = d_logits @ W2.T
        dW2 = h.T @ d_logits
        db2 = d_logits.sum(0)
        d_h_pre_act = dh * (1 - h**2)

        # batch norm (gain, bias, and the one-expression input gradient)
        d_bn_gain = (bn_raw * d_h_pre_act).sum(0, keepdim=True)
        d_bn_bias = d_h_pre_act.sum(0, keepdim=True)
        d_h_pre_bn = (
            bn_gain * bn_var_inv / n * (
                n * d_h_pre_act - d_h_pre_act.sum(0) - n / (n - 1) * bn_raw * (d_h_pre_act * bn_raw).sum(0)
            )
        )

        # first linear layer and embedding table
        d_emb_cat = d_h_pre_bn @ W1.T
        dW1 = emb_cat.T @ d_h_pre_bn
        db1 = d_h_pre_bn.sum(0)
        d_emb = d_emb_cat.view(emb.shape)
        # one-hot width follows the vocabulary size instead of a hard-coded 27
        permuted = F.one_hot(Xb, num_classes=vocab_size).permute(2, 0, 1)
        dC = torch.tensordot(permuted.float(), d_emb, dims=2)

    # same order as `parameters`
    grads = [dC, dW1, db1, dW2, db2, d_bn_gain, d_bn_bias]
    # -----------------

    # update
    lr = 0.1 if i < 100000 else 0.01  # step learning rate decay
    for p, grad in zip(parameters, grads):
        assert p.data.shape == grad.shape, (p.data.shape, grad.shape)
        p.data += -lr * p.grad  # old way of cheems doge (using PyTorch grad from .backward())
        # p.data += -lr * grad # new way of swole doge TODO: enable

    # track stats
    if i % 10000 == 0:  # print every once in a while
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
    loss_i.append(loss.log10().item())

    if i >= 2000:  # TODO: delete early breaking when you're ready to train the full net
        break

12297
      0/ 200000: 3.5973


In [618]:
# useful for checking your gradients
# (the loop variable is not called `g`: that name holds the torch.Generator used by
# the sampling cells, and overwriting it would break them when they are re-run)
for p, grad in zip(parameters, grads):
    cmp(str(tuple(p.shape)), grad, p)

(27, 10)        | exact: False | approx: True  | max diff: 1.210719347000122e-08 | max expected: -0.01540443953126669 | max calculated: -0.01540442742407322
(30, 200)       | exact: False | approx: True  | max diff: 1.1175870895385742e-08 | max expected: -0.04835912585258484 | max calculated: -0.048359137028455734
(200,)          | exact: False | approx: True  | max diff: 5.587935447692871e-09 | max expected: 4.6566128730773926e-09 | max calculated: -9.313225746154785e-10
(200, 27)       | exact: False | approx: True  | max diff: 1.30385160446167e-08 | max expected: -0.01817333698272705 | max calculated: -0.018173350021243095
(27,)           | exact: False | approx: True  | max diff: 7.450580596923828e-09 | max expected: 0.03223457559943199 | max calculated: 0.03223458305001259
(1, 200)        | exact: False | approx: True  | max diff: 2.7939677238464355e-09 | max expected: -0.013373886235058308 | max calculated: -0.013373889029026031
(1, 200)        | exact: False | approx: True  | ma

In [609]:
# calibrate the batch norm at the end of training:
# replace the per-batch statistics with statistics of the full training set

with torch.no_grad():
    # run the whole training set through the embedding and the first linear layer
    emb = C[Xtr]
    emb_cat = emb.view(len(emb), -1)
    h_pre_act = emb_cat @ W1 + b1
    # per-feature variance and mean over every training example
    bn_var = h_pre_act.var(dim=0, keepdim=True, unbiased=True)
    bn_mean = h_pre_act.mean(dim=0, keepdim=True)


In [610]:
# evaluate train and val loss

@torch.no_grad()  # this decorator disables gradient tracking
def split_loss(split):
    """Print the cross-entropy loss of the model on one named data split.

    `split` must be 'train', 'val' or 'test'; any other key raises KeyError.
    Batch norm is applied with the module-level `bn_mean` and `bn_var`.
    """
    datasets = {
        'train': (Xtr, Ytr),
        'val': (Xdev, Ydev),
        'test': (Xte, Yte),
    }
    inputs, targets = datasets[split]
    embedded = C[inputs]  # (N, block_size, n_embd)
    flat = embedded.view(embedded.shape[0], -1)  # concat into (N, block_size * n_embd)
    pre_bn = flat @ W1 + b1
    pre_act = bn_gain * (pre_bn - bn_mean) * (bn_var + 1e-5) ** -0.5 + bn_bias
    hidden = torch.tanh(pre_act)  # (N, n_hidden)
    scores = hidden @ W2 + b2  # (N, vocab_size)
    split_value = F.cross_entropy(scores, targets)
    print(split, split_value.item())


# report the loss on the training and validation splits
for split_name in ('train', 'val'):
    split_loss(split_name)

train 2.2159318923950195
val 2.2280664443969727


In [None]:
# I achieved:
# train 2.0718822479248047
# val 2.1162495613098145

In [611]:
# sample from the model
g = torch.Generator().manual_seed(2147483647 + 10)

for _ in range(20):
    out = []
    context = [0] * block_size  # initialize with all ...
    ix = -1  # sentinel so that the loop body runs at least once
    while ix != 0:  # id 0 is the '.' token, which ends the word
        # forward pass
        emb = C[torch.tensor([context])]  # (1,block_size,d)
        emb_cat = emb.view(emb.shape[0], -1)  # concat into (N, block_size * n_embd)
        h_pre_act = emb_cat @ W1 + b1
        h_pre_act = bn_gain * (h_pre_act - bn_mean) * (bn_var + 1e-5) ** -0.5 + bn_bias
        h = torch.tanh(h_pre_act)  # (N, n_hidden)
        logits = h @ W2 + b2  # (N, vocab_size)
        # sample the next character id from the predicted distribution
        probs = F.softmax(logits, dim=1)
        ix = torch.multinomial(probs, num_samples=1, generator=g).item()
        out.append(ix)
        context = context[1:] + [ix]  # shift the context window

    print(''.join(itos[i] for i in out))


carlah.
amille.
khirmili.
taty.
hacassie.
rahnen.
den.
rhc.
laqhi.
nellara.
chaiivon.
leigh.
ham.
pora.
quinn.
sulilea.
jambiron.
trogdiaryxian.
cen.
dus.
