In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read in all the words.
words = open('names.txt', 'r').read().splitlines()
print(f"Total amount of words: {len(words)}")

Total amount of words: 32033


In [3]:
# Build the vocabulary of characters and mappings to/from integers.
chars = sorted(list(set(''.join(words))))
stoi = {ch: ix + 1 for ix, ch in enumerate(chars)}
stoi['.'] = 0
itos = {ix: ch for ch, ix in stoi.items()}
vocab_size = len(itos)
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 27


In [4]:
def build_dataset(words: list[str], context_size: int = 3, logging: bool = False) -> tuple[torch.tensor]:
    """
    Given an list of string words, creates input and output targets.
    Args:
        words(list[int], shape=[n]): List of all words to sample examples from.
        context_size(int)          : How many characters are considered for prediction.
        logging(bool)              : Whether to print builded examples or not.
    Returns:
        X(torch.tensor, shape=[m, contex size]): Input tensor, where m is number of examples.
        Y(torch.tensor, shape=[m])             : Target tensor, where m is number of examples.
    """
    X, Y = [], []
    for word in words:
        context = [0] * context_size
        for ch in word + '.':
            X.append(context)
            Y.append(stoi[ch])
            context = context[1:] + [stoi[ch]]
    X, Y = torch.tensor(X), torch.tensor(Y)
    if logging:
        print(f'Input: {X.shape}, Output: {Y.shape}')
    return X, Y


In [5]:
import random
random.seed(13)
random.shuffle(words)
n1 = int(len(words) * 0.8)
n2 = int(len(words) * 0.9)


block_size = 3  # how many charecters do we take to predict the next one?
Xtr, Ytr = build_dataset(words[:n1], block_size, True)     # 80%
Xdev, Ydev = build_dataset(words[n1:n2], block_size, True) # 10%
Xte, Yte = build_dataset(words[n2:], block_size, True)     # 10%

Input: torch.Size([182597, 3]), Output: torch.Size([182597])
Input: torch.Size([22761, 3]), Output: torch.Size([22761])
Input: torch.Size([22788, 3]), Output: torch.Size([22788])


In [6]:
# ok biorplate done, now let's get to the action

In [7]:
# utility function we will use later when comparing manual gradients to PyTorch gradients.
def cmp(s: str, dt: torch.tensor, t: torch.tensor) -> None:
    """
    Args:
        s (str)          : Name of parameter, derivatives were computed with rescpect to.
        dt (torch.tensor): Derivative, computed by hand.
        t (torch.tensor) : Derivative, computed by Torch.
    """
    ex = torch.all(dt == t.grad).item()
    app = torch.allclose(dt, t.grad)
    maxdiff = (dt - t.grad).abs().max().item()
    print(f'{s:15s} | exact: {str(ex):5s} | approximate: {str(app):5s} | maxdiff: {maxdiff}')

In [31]:
n_embd = 10  # the dimensionality of the character embedding vectors
n_hidden = 200  # the number of neurons in the hidden layer of the MLP

g = torch.Generator().manual_seed(2147483647)
C = torch.randn((vocab_size, n_embd),             generator=g)
# Layer 1
W1 = torch.randn([n_embd * block_size, n_hidden], generator=g) * (5/3) / (n_embd * block_size)**0.5
b1 = torch.randn(n_hidden,                        generator=g) * 0.1  # using just for fun
# Layer 2
W2 = torch.randn((n_hidden, vocab_size),          generator=g) * 0.1
b2 = torch.randn(vocab_size,                      generator=g) * 0.1
# BatchNorm parameters
bngain = torch.randn((1, n_hidden)) * 0.1
bnbias = torch.randn((1, n_hidden)) * 0.1
# Note: I'm initializing many of these parameters in non-standart ways 
# so as nothing could mask an incorrect implementation of backward pass

parameters = [C, W1, b1, W2, b2, bngain, bnbias]
print("Total parametrs:", sum(p.nelement() for p in parameters))  # number of parameters in total
for p in parameters:
    p.requires_grad = True

Total parametrs: 12297


In [9]:
batch_size = 32
n = batch_size  # shorter variable also, for convenience
# Construct a minibatch
ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
Xb, Yb = Xtr[ix], Ytr[ix]  # batch X, Y

In [22]:
# Forward pass
emb = C[Xb]
embcat = emb.view(emb.shape[0], -1)
# Linear layer 1
hprebn = embcat @ W1 + b1
# BatchNorm Layer
bnmeani = 1/n * hprebn.sum(0, keepdim=True)
bndiff = hprebn - bnmeani
bndiff2 = bndiff**2
bnvar = 1/(n-1) * bndiff2.sum(0, keepdim=True)  # Note: Bessel's correction (dividing by n-1, not n)
bnvar_inv = (bnvar + 1e-5)**-0.5
bnraw = bndiff * bnvar_inv
hpreact = bngain * bnraw + bnbias
# Non-linearity
h = torch.tanh(hpreact)
# Linear layer 2
logits = h @ W2 + b2
# cross entropy loss
logit_maxes = logits.max(1, keepdim=True).values  # exclude gradients
norm_logits = logits - logit_maxes
counts = norm_logits.exp()
counts_sum = counts.sum(1, keepdim=True)
counts_sum_inv = counts_sum**-1
probs = counts * counts_sum_inv
logprobs = probs.log()
loss = -logprobs[range(n), Yb].mean()

# PyTorch backward pass
for p in parameters:
    p.grad = None
for t in [logprobs, probs, counts, counts_sum, counts_sum_inv, 
          norm_logits, logit_maxes, logits, h, hpreact, bnraw,
          bnvar_inv, bnvar, bndiff2, bndiff, hprebn, bnmeani, 
          embcat, emb]:
    t.retain_grad()
loss.backward()
loss

tensor(3.2769, grad_fn=<NegBackward0>)

In [12]:
# --- CrossEntropy ---
dlogprobs = torch.zeros_like(logprobs)
dlogprobs[range(n), Yb] = -1/n
# ---
dprobs = (1.0 / probs) * dlogprobs
# ---
dcounts = dprobs * counts_sum_inv                            # init "counts" gradient
# ---
dcounts_sum_inv = dprobs * counts
dcounts_sum_inv = dcounts_sum_inv.sum(1, keepdim=True)
# ---
dcounts_sum = dcounts_sum_inv * (-1 * counts_sum**-2)
# ---
dcounts += dcounts_sum * torch.ones(batch_size, vocab_size)  # add "conunts" gradient
# ---
dnorm_logits = dcounts * counts
# ---
dlogits = dnorm_logits.clone()                               # init "logits" gradient
dlogit_maxes = -dnorm_logits.sum(1, keepdim=True)
# ---
ix = torch.argmax(logits, 1)
dlogits[range(n), ix] += dlogit_maxes.view(-1)               # add "logits" gradient
# --- Forward Pass ---
dh = dlogits @ W2.T
dW2 = h.T @ dlogits
db2 = dlogits.sum(0)
# ---
dhpreact = dh * (1 - h**2)
# ---
dbngain = (bnraw * dhpreact).sum(0, keepdim=True)
dbnraw = dhpreact * bngain
dbnbias = (dhpreact).sum(0, keepdim=True)
# ---
dbnvar_inv = (bndiff * dbnraw).sum(0, keepdim=True)
dbndiff = dbnraw * bnvar_inv                               # init "bndiff" grad
# ---
dbnvar = dbnvar_inv * -1/2 * (bnvar + 1e-5)**(-3/2)
dbndiff2 = (dbnvar * (1/(n-1))).expand(n, n_hidden)
dbndiff += 2 * bndiff * dbndiff2                           # add "bndiff" grad
# ---
dhprebn = dbndiff.clone()  # deepcopy                      # init "hprebn" grad
dbnmeani = -dbndiff.sum(0, keepdim=True)
# ---
dhprebn += (dbnmeani * 1/n).expand(n, n_hidden)            # add "hprebn" grad
# ---
dembcat = dhprebn @ W1.T
dW1 = embcat.T @ dhprebn
db1 = dhprebn.sum(0) # AFTER_DEBUG: require gradient for b1 pls!!! Unable to compare
# ---
demb = dembcat.view(emb.shape[0], block_size, n_embd)
# ---
dC = torch.zeros_like(C)
for k, row in enumerate(Xb):
    for j, ix in enumerate(row):
        dC[ix] += demb[k, j]

In [13]:
cmp('C', dC, C)

C               | exact: True  | approximate: True  | maxdiff: 0.0


In [14]:
alt_dlogits = F.softmax(logits, 1) 
alt_dlogits[range(n), Yb] -= 1.0
alt_dlogits /= n
cmp('', alt_dlogits, logits)

                | exact: False | approximate: True  | maxdiff: 5.3551048040390015e-09


In [18]:
dhprebn = bngain * bnvar_inv / n * (n * dhpreact - dhpreact.sum(0) - n / (n - 1) * bnraw * (bnraw * dhpreact).sum(0))

In [19]:
cmp('hprebn', dhprebn, hprebn)

hprebn          | exact: False | approximate: True  | maxdiff: 2.3283064365386963e-10


In [33]:
max_steps = 200000
batch_size = 32
lossi = []

for i in range(max_steps):
    
    """Mini-batch construct"""
    ix = torch.randint(0, Xtr.shape[0], (batch_size,))
    Xb, Yb = Xtr[ix], Ytr[ix]  # batch X, Y
    # forward pass
    emb = C[Xb] # embed the characters into vectors
    embcat = emb.view(emb.shape[0], -1) # concatenate the vectors
    # Linear layer
    hprebn = embcat @ W1 + b1 # hidden layer pre-activation
    # BatchNorm layer
    # -------------------------------------------------------------
    bnmean = hprebn.mean(0, keepdim=True)
    bnvar = hprebn.var(0, keepdim=True, unbiased=True)
    bnvar_inv = (bnvar + 1e-5)**-0.5
    bnraw = (hprebn - bnmean) * bnvar_inv
    hpreact = bngain * bnraw + bnbias
    # -------------------------------------------------------------
    # Non-linearity
    h = torch.tanh(hpreact) # hidden layer
    logits = h @ W2 + b2 # output layer
    loss = F.cross_entropy(logits, Yb) # loss function    
    """Backward pass"""
    for p in parameters:
        p.grad = None
    # loss.backward()  # old way
    # --- new way---
    # Cross-entropy
    dlogits = F.softmax(logits, 1) 
    dlogits[range(n), Yb] -= 1.0
    dlogits /= n
    # 2-nd layer 
    dh = dlogits @ W2.T
    dW2 = h.T @ dlogits
    db2 = dlogits.sum(0)
    # Tanh()
    dhpreact = dh * (1 - h**2)
    # BatchNorm layer
    dbngain = (bnraw * dhpreact).sum(0, keepdim=True)
    dbnraw = dhpreact * bngain
    dbnbias = (dhpreact).sum(0, keepdim=True)
    dhprebn = bngain * bnvar_inv / n * (n * dhpreact - dhpreact.sum(0) - n / (n - 1) * bnraw * (bnraw * dhpreact).sum(0))
    # 1-st layer
    dembcat = dhprebn @ W1.T
    dW1 = embcat.T @ dhprebn
    db1 = dhprebn.sum(0)
    # Embedding
    demb = dembcat.view(emb.shape[0], block_size, n_embd)
    
    dC = torch.zeros_like(C)
    for k, row in enumerate(Xb):
        for j, ix in enumerate(row):
            dC[ix] += demb[k, j]
    # Save computed grads
    grads = [dC, dW1, db1, dW2, db2, dbngain, dbnbias]
    """Update"""
    lr = 0.1 if i < 100000 else 0.01  # learning step decay
    for p, grad in zip(parameters, grads):
        # p.data += -lr * p.grad  # old way
        p.data += -lr * grad  # manual way
    
    """Track stats"""
    if i % 10000 == 0:
        print(f"{i:7d}/{max_steps:7d}: {loss.item():.4f}")
    lossi.append(loss.log10().item())

    # if i > 100:
    #     break

      0/ 200000: 3.2965
  10000/ 200000: 2.2476
  20000/ 200000: 1.9275
  30000/ 200000: 2.6641
  40000/ 200000: 2.4417
  50000/ 200000: 2.2880
  60000/ 200000: 2.0486
  70000/ 200000: 2.1531
  80000/ 200000: 2.3181
  90000/ 200000: 2.1601
 100000/ 200000: 2.2190
 110000/ 200000: 2.2769
 120000/ 200000: 2.1896
 130000/ 200000: 1.6163
 140000/ 200000: 2.0624
 150000/ 200000: 1.8632
 160000/ 200000: 2.4450
 170000/ 200000: 2.3920
 180000/ 200000: 2.6192
 190000/ 200000: 2.2125


In [23]:
for p, g in zip(parameters, grads):
    cmp(str(tuple(p.shape)), g, p)

(27, 10)        | exact: False | approximate: True  | maxdiff: 1.5832483768463135e-08
(30, 200)       | exact: False | approximate: True  | maxdiff: 1.1175870895385742e-08
(200,)          | exact: False | approximate: True  | maxdiff: 6.51925802230835e-09
(200, 27)       | exact: False | approximate: True  | maxdiff: 1.4901161193847656e-08
(27,)           | exact: False | approximate: True  | maxdiff: 7.450580596923828e-09
(1, 200)        | exact: False | approximate: True  | maxdiff: 5.587935447692871e-09
(1, 200)        | exact: False | approximate: True  | maxdiff: 7.450580596923828e-09


In [24]:
with torch.no_grad():
    # Pass the training set through
    emb = C[Xtr]
    embcat = emb.view(emb.shape[0], -1)
    hpreact = embcat @ W1# + b1
    # Measure mean/std over the entire training set (get constant values for single example evaluation)
    bnmean = hpreact.mean(0, keepdim=True)
    bnstd = hpreact.std(0, keepdim=True)

In [25]:
# Sample from the model.
g = torch.Generator().manual_seed(13 + 10)
for _ in range (20):
    out = []
    context = [0] * block_size # initialize with all ...
    while True:
        # forward pass the neural net
        emb = C[torch.tensor([context])] # (1, block_size, n_embd)
        hpreact = emb.view(1, -1) @ W1
        hpreact = bngain * (hpreact - bnmean) / bnstd + bnbias
        h = torch.tanh(hpreact)
        logits = h @ W2 + b2
        probs = F.softmax(logits, dim=1)
        # sample from the distribution
        ix = torch.multinomial(probs, num_samples=1, generator=g).item()
        # shift the context window and track the samples
        context = context[1:] + [ix]
        out.append(ix)
        # if we sample the special '' token, break
        if ix == 0:
            break
    print("".join(itos[i] for i in out)) # decode and print the generated word

jamelle.
jlaryamiviyah.
brair.
haz.
maxeliaantrivo.
elianovan.
elanya.
ashna.
aur.
brextlyn.
sharson.
novan.
maledslee.
jodumaasikalarlin.
laz.
haisheer.
sareldance.
zazar.
kan.
grey.


In [29]:
Xb, Yb = Xdev, Ydev  # batch X, Y
# forward pass
emb = C[Xb] # embed the characters into vectors
embcat = emb.view(emb.shape[0], -1) # concatenate the vectors
# Linear layer
hprebn = embcat @ W1 + b1 # hidden layer pre-activation
# BatchNorm layer
# -------------------------------------------------------------
bnmean = hprebn.mean(0, keepdim=True)
bnvar = hprebn.var(0, keepdim=True, unbiased=True)
bnvar_inv = (bnvar + 1e-5)**-0.5
bnraw = (hprebn - bnmean) * bnvar_inv
hpreact = bngain * bnraw + bnbias
# -------------------------------------------------------------
# Non-linearity
h = torch.tanh(hpreact) # hidden layer
logits = h @ W2 + b2 # output layer
loss = F.cross_entropy(logits, Yb) # loss function 

In [30]:
print(loss)

tensor(2.1725, grad_fn=<NllLossBackward0>)
