# PART 4 - Backpropagation

In [3]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# read in all names, one name per line
# (the context manager closes the file handle as soon as the read is done)
with open('names.txt', 'r') as f:
    words = f.read().splitlines()
print('first 10 words:\n')
print(words[:10])
print('\nlen of words: ', len(words))

first 10 words:

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia', 'harper', 'evelyn']

len of words:  32033


In [5]:
# build vocab: every distinct character seen in the names, in sorted order
chars = sorted(set(''.join(words)))
# characters are numbered from 1 so that index 0 stays free for '.'
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
# reverse lookup: index -> character
itos = {idx: ch for ch, idx in stoi.items()}
print(itos)

vocab_size = len(itos)
print('vocab size: ', vocab_size)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
vocab size:  27


In [6]:
block_size = 3 # number of previous characters used to predict the next one

def build_dataset(words):
    """Turn a list of words into (context, next-character) training pairs.

    Each word is extended with a trailing '.' and scanned character by
    character. For every character one example is emitted: the indices of the
    `block_size` preceding characters (padded with index 0 at the start of the
    word) as input, and the index of the character itself as target.

    Args:
        words: iterable of strings whose characters (and '.') are keys of `stoi`.

    Returns:
        (X, Y): X is a tensor with one row of `block_size` indices per example,
        Y is a tensor with the matching target index per example.

    Side effect: prints the shapes of X and Y.
    """
    contexts, targets = [], []

    for word in words:
        # sliding window over the word, starting as all padding (index 0)
        window = [0] * block_size
        for char in word + '.':
            target = stoi[char]
            contexts.append(window)
            targets.append(target)
            # drop the oldest index, append the newest (builds a new list,
            # so rows already stored in `contexts` are not modified)
            window = window[1:] + [target]

    X = torch.tensor(contexts)
    Y = torch.tensor(targets)
    print(X.shape, Y.shape)

    return X, Y

import random

# shuffle the words in place with a fixed seed so the split is reproducible
random.seed(42)
random.shuffle(words)

# cut points at 80% and 90% of the shuffled list
n1, n2 = (int(fraction * len(words)) for fraction in (0.8, 0.9))

# train/dev/test split: first 80% / next 10% / last 10%
Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [26]:
def cmp(s, dt, t):
    """Compare a hand-derived gradient against the one autograd stored on a tensor.

    Prints one line reporting whether the two match exactly, whether they match
    approximately (torch.allclose), and the largest absolute difference.

    Args:
        s: label printed at the start of the line (padded to 15 characters).
        dt: the manually computed gradient tensor.
        t: the tensor whose `.grad` (filled in by autograd) is the reference.

    NOTE(review): shapes are not compared explicitly here, only values.
    """
    grad = t.grad # reference gradient from autograd
    ex = torch.all(dt == grad).item()
    app = torch.allclose(dt, grad)
    gap = (dt - grad).abs()
    maxdiff = gap.max().item()
    print(f'{s:15s} | exact: {str(ex):5s} | approx: {str(app):5s} | maxdiff: {maxdiff}')

In [15]:
n_embd = 10 # size of each character embedding vector
n_hidden = 64 # number of neurons in the hidden layer
fan_in = n_embd * block_size # width of the flattened context fed to layer 1

g = torch.Generator().manual_seed(2147483647) # fixed seed for reproducibility
# embedding table: one n_embd-sized row per vocab entry -> [vocab_size, n_embd]
C = torch.randn((vocab_size, n_embd), generator=g)
# layer 1: [fan_in, n_hidden], scaled by (5/3) / sqrt(fan_in)
W1 = torch.randn((fan_in, n_hidden), generator=g) * (5/3) * (fan_in ** -0.5)
# bias kept on purpose: not needed before batch norm, but useful to test its gradient
b1 = torch.randn(n_hidden, generator=g) * 0.1
# layer 2: [n_hidden, vocab_size], produces the logits
W2 = torch.randn((n_hidden, vocab_size), generator=g) * 0.1
# small random values rather than zeros: a 0 init in the bias could mask errors in the gradient
b2 = torch.randn(vocab_size, generator=g) * 0.1

# batch norm scale and shift, both [1, n_hidden]
bngain = torch.ones((1, n_hidden)) * 0.1 + 1.0
bnbias = torch.zeros((1, n_hidden)) * 0.1

# running batch norm statistics (not part of `parameters`, so no gradient is requested for them)
bnmean_running = torch.zeros((1, n_hidden))
bnstd_running = torch.ones((1, n_hidden))

parameters = [C, W1, b1, W2, b2, bngain, bnbias]
for param in parameters:
    param.requires_grad = True

# total number of params in network
n_params = sum(param.nelement() for param in parameters)
print('\nparams count: ', n_params)


params count:  4137


In [16]:
batch_size = 32
n = batch_size # short alias, used in the formulas of the forward/backward pass

# mini batch: draw batch_size random row indices from the training set
# using the seeded generator, then pick those rows
ix = torch.randint(low=0, high=Xtr.shape[0], size=(batch_size,), generator=g)
Xb = Xtr[ix]
Yb = Ytr[ix]

In [17]:
# expanded FORWARD PASS
# the computation is split into small named steps so that every intermediate
# tensor can get its own hand-derived gradient in the cells below
emb = C[Xb] # embed the chars into vectors
embcat = emb.view(emb.shape[0], -1) # flatten each example's context embeddings into a single row

# linear layer 1
hprebn = embcat @ W1 + b1 # hidden layer pre-activation
# batch norm layer, keep hidden state dist normal
bnmeani = 1/n * hprebn.sum(0, keepdim=True) # mean over the batch of the layer 1 output
bndiff = hprebn - bnmeani
bndiff2 = bndiff**2 # diff squared
bnvar = 1/(n-1) * (bndiff2).sum(0, keepdim=True) # batch variance, note: Bessel's correction, use n-1, not n
bnvar_inv = (bnvar + 1e-5)**-0.5 # 1/sqrt(var + epsilon), epsilon avoids divide by 0
bnraw = bndiff * bnvar_inv # normalized pre-activation
hpreact = bngain * bnraw + bnbias # scale and shift the normalized values

# non-linearity activation
h = torch.tanh(hpreact) # hidden state

# linear layer 2
logits = h @ W2 + b2 # output layer
# cross entropy loss, same as F.cross_entropy(logits, Yb)
logit_maxes = logits.max(1, keepdim=True).values
norm_logits = logits - logit_maxes # subtract max for numerical stability
counts = norm_logits.exp() # logits=log counts, exponentiate to get count
counts_sum = counts.sum(1, keepdim=True)
counts_sum_inv = counts_sum ** -1 # if use (1.0/count_sum) then cant get backprop to be exact
probs = counts * counts_sum_inv
logprobs = probs.log()
loss = -logprobs[range(n), Yb].mean() # negated mean log-prob of the correct label in each row

# pytorch backward pass
# clear any gradients left over from a previous run of this cell
for p in parameters:
    p.grad = None

# ask autograd to keep .grad on these intermediate tensors as well,
# so the cmp() helper can read t.grad for each of them
for t in [logprobs, probs, counts, counts_sum, counts_sum_inv,
          norm_logits, logit_maxes, logits, h, hpreact, bnraw,
          bnvar_inv, bnvar, bndiff2, bndiff, hprebn, bnmeani,
          embcat, emb]:
    t.retain_grad()

loss.backward()
loss # display the loss value as the cell output


tensor(3.3482, grad_fn=<NegBackward0>)

In [19]:
# exercise 1: backprop through the whole thing manually,
# going through the variables defined in the forward pass above one by one

# first variable: logprobs
# logprobs has shape [32, 27] = [batch_size, vocab_size]
print(logprobs.shape)

# dlogprobs = dloss/dlogprobs = how the loss changes with each entry of logprobs,
# so it has the same shape as logprobs

torch.Size([32, 27])


In [23]:
# print, in order: the labels, the full [batch_size, vocab_size] logprobs matrix,
# and the entry of each row that sits at that row's label index
for heading, value in (
    ('labeles (indices):\n', Yb),
    ('logprobs:\n', logprobs),
    ('prob for each char in batch:\n', logprobs[range(n), Yb]),
):
    print(heading, value, '\n')

labeles (indices):
 tensor([ 8, 14, 15, 22,  0, 19,  9, 14,  5,  1, 20,  3,  8, 14, 12,  0, 11,  0,
        26,  9, 25,  0,  1,  1,  7, 18,  9,  3,  5,  9,  0, 18]) 

logprobs:
 tensor([[-2.6153, -2.4396, -4.0066, -2.9607, -3.9532, -2.4704, -3.7684, -3.3390,
         -4.0580, -3.4449, -3.3145, -3.2976, -3.2953, -3.5589, -3.3670, -4.3238,
         -4.7673, -3.9655, -4.2126, -2.9093, -2.9708, -3.8661, -3.7024, -2.6296,
         -2.8401, -3.6538, -3.8384],
        [-2.9221, -2.8495, -2.3383, -2.9125, -3.3366, -3.4310, -3.9924, -3.0359,
         -3.9580, -3.7286, -2.9552, -3.0985, -3.0474, -3.5557, -3.0728, -3.2784,
         -3.6525, -4.1385, -3.8847, -3.2946, -4.0144, -3.8155, -4.2485, -2.7364,
         -3.7662, -3.3131, -3.7084],
        [-3.9722, -3.7847, -4.2938, -4.4023, -3.8066, -3.1617, -2.8325, -2.7736,
         -2.7888, -3.4810, -3.9044, -3.3975, -3.1220, -3.0080, -3.7930, -3.6750,
         -4.3177, -3.3865, -3.5933, -2.1774, -2.6969, -3.2878, -3.1831, -3.2233,
         -3.2701, -

In [27]:
# loss is the negated mean of the n picked log-probs; e.g. with 3 numbers
# (32 in our batch):
#   loss = -(a + b + c) / 3 = -a/3 - b/3 - c/3
# so dloss/da = -1/3, i.e. -1/n in general, where n = batch size
# every entry of logprobs that was not picked does not enter the loss,
# so its derivative is 0

# start from all zeros, then write -1/n at each row's label column
dlogprobs = torch.zeros_like(logprobs)
dlogprobs[torch.arange(n), Yb] = -1.0 / n

# check with autograd
cmp('logprobs', dlogprobs, logprobs)

logprobs        | exact: True  | approx: True  | maxdiff: 0.0


In [28]:
# logprobs = log(probs) elementwise, e.g. [log(a), log(b), log(c)] for probs = [a, b, c]
# so dlogprobs/dprobs = 1/probs elementwise, and by the chain rule:
# dloss/dprobs = dloss/dlogprobs * dlogprobs/dprobs = dlogprobs * (1/probs)
# (dlogprobs is 0 everywhere except at the label positions)
dprobs = dlogprobs * 1.0/probs

# check with autograd
cmp('dprobs', dprobs, probs)

dprobs          | exact: True  | approx: True  | maxdiff: 0.0


In [29]:
# inspect the shape of probs: [32, 27] = [batch_size, vocab_size]
probs.shape

torch.Size([32, 27])