# WS_follow_along_makemore.ipynb
# WESmith 06/06/23
## follow along with Karpathy video
##  https://www.youtube.com/watch?v=PaCmpygFfXo

In [None]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the dataset: one entry per line of ../names.txt.
# The with-block closes the file handle as soon as the text has been read
# (the bare open(...).read() form leaves it to the garbage collector).
with open('../names.txt', 'r') as f:
    words = f.read().splitlines()

In [None]:
# peek at the first five entries of words
words[:5]

In [None]:
# total number of entries in words
len(words)

In [None]:
# length of every word, then the shortest and longest length
dd = list(map(len, words))
min(dd), max(dd)

In [None]:
# count every adjacent pair of tokens, with explicit start/end markers around each word
b = {}
for w in words:
    tokens = ['<S>', *w, '<E>']
    for bigram in zip(tokens, tokens[1:]):
        b[bigram] = b.get(bigram, 0) + 1

In [None]:
# number of distinct keys (bigrams) in b
len(b)

In [None]:
# bigrams ordered from most to least frequent (ties keep their insertion order)
sorted(b.items(), key=lambda kv: kv[1], reverse=True)

In [None]:
# alphabet = every distinct character that appears in words, in sorted order
chars = sorted(set(''.join(words)))
# characters get indices 1..len(chars); index 0 is reserved for the '.' marker
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
# inverse lookup: index -> character
itos = {idx: ch for ch, idx in stoi.items()}
itos

In [None]:
# N[i, j] = number of times character j directly follows character i,
# with '.' (index 0) marking both the start and the end of a word
N = torch.zeros(27, 27, dtype=torch.int32)
for w in words:
    padded = '.' + w + '.'
    for ch1, ch2 in zip(padded, padded[1:]):
        N[stoi[ch1], stoi[ch2]] += 1

In [None]:
# draw the count matrix as a heat map; every cell is labelled with its
# two-character bigram (upper half) and its count (lower half)
plt.figure(figsize=(16, 16))
plt.imshow(N, cmap='Blues')
label_style = dict(ha='center', color='gray')
for row in range(27):
    for col in range(27):
        plt.text(col, row, itos[row] + itos[col], va='bottom', **label_style)
        plt.text(col, row, N[row, col].item(), va='top', **label_style)
plt.axis('off')

In [None]:
# row 0 of the count matrix: counts for bigrams whose first character is '.' (index 0)
N[0]

In [None]:
# turn the first row of counts into a probability distribution
p = N[0].float()
p = p / p.sum()
p

In [None]:
# seeded generator (same seed as in the video) so the draw is reproducible
g = torch.Generator()
g.manual_seed(2147483647)
# draw a single index from p and show the character it maps to
ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
itos[ix]

In [None]:
# three random numbers from g, rescaled so that they sum to 1
p = torch.rand(3, generator=g)
p /= p.sum()
p

In [None]:
# draw 100 indices (with replacement) from the distribution p using generator g
torch.multinomial(p, num_samples=100, replacement=True, generator=g)

In [None]:
# row-normalized bigram probabilities
# every count is bumped by 1 (model smoothing) so that no probability is zero
# and the log taken later cannot blow up
P = (N + 1).float()
# keepdim=True keeps the row sums as a column so the division broadcasts
# across each row: be careful with broadcasting in general
P = P / P.sum(dim=1, keepdim=True)

In [None]:
# shape of the row sums when keepdim=True, next to the row sums themselves
# (each should be close to 1 if P has been row-normalized)
P.sum(1, keepdim=True).shape, P.sum(1)

In [None]:
# generate 10 samples from the bigram table, using the same seed as in the video
g = torch.Generator().manual_seed(2147483647)

for _ in range(10):
    out = []
    ix = 0  # start from the '.' marker
    while True:
        p = P[ix]  # distribution over the next character, given the current one
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix])
        if ix == 0:
            # drew the '.' marker again: the sample is complete
            break
    print(''.join(out))

In [None]:
# average negative log-likelihood of all bigrams in words under the table P
log_likelihood = 0.0
n = 0
for w in words:
    padded = '.' + w + '.'
    for ch1, ch2 in zip(padded, padded[1:]):
        logprob = torch.log(P[stoi[ch1], stoi[ch2]])
        log_likelihood += logprob
        n += 1
nll = -log_likelihood.item() / n
print(f'{nll:.4f}')

In [None]:
# create the training set of bigrams (x, y), using only the first word
# each pair is (index of a character, index of the character that follows it),
# with '.' standing in for both the start and the end of the word
pairs = [
    (stoi[ch1], stoi[ch2])
    for w in words[:1]
    for ch1, ch2 in zip('.' + w, w + '.')
]

# torch.tensor (lowercase) is advised over torch.Tensor: video at 1h 09m
xs = torch.tensor([first for first, _ in pairs])
ys = torch.tensor([second for _, second in pairs])

In [None]:
# show the input indices and the target indices side by side
xs, ys

In [None]:
# one-hot encode the inputs; one_hot gives integers (int64, as is xs),
# so convert to float32 before feeding the NN
xenc = F.one_hot(xs, num_classes=27).to(torch.float32)
xenc

In [None]:
# random weights: interpret as 27 neurons, each with 27 inputs
W = torch.randn(27, 27)
# matrix multiplication of the encoded inputs with the weights;
# the result is called 'logits', historically interpreted as 'log counts'
logits = torch.matmul(xenc, W)
# exponentiate so every number is > 0 and can be read as a count
counts = torch.exp(logits)
# normalize each row into probabilities, as was done for the count table
probs = counts / counts.sum(dim=1, keepdim=True)
# the exponentiation plus normalization together are the 'softmax' operation
probs

In [None]:
# create the training set of bigrams (x, y) from every word
xs, ys = [], []

for w in words:
    padded = '.' + w + '.'
    for ch1, ch2 in zip(padded, padded[1:]):
        xs.append(stoi[ch1])
        ys.append(stoi[ch2])
# torch.tensor (lowercase) is advised over torch.Tensor: video at 1h 09m
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('number of examples: ', num)

In [None]:
# weight matrix for 27 neurons with 27 inputs each, drawn from a seeded
# generator so the initialization is reproducible; gradients are tracked
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn(27, 27, generator=g, requires_grad=True)

In [None]:
# gradient descent: 40 full-batch steps on the regularized negative log-likelihood

# the one-hot encoding of the inputs does not depend on W, so build it once
# instead of re-creating it on every iteration
xenc = F.one_hot(xs, num_classes=27).float()

for k in range(40):

    # forward pass: softmax over the logits gives the predicted next-char probabilities
    logits = xenc @ W
    counts = logits.exp()
    probs  = counts / counts.sum(1, keepdims=True)
    # loss = mean negative log-probability the NN assigns to each target char,
    # plus a small penalty on the squared weights (regularization)
    loss = -probs[torch.arange(num), ys].log().mean() + 0.001 * (W**2).mean()

    # backward pass
    W.grad = None  # clear the previous gradient so backward() starts fresh
    loss.backward()

    # update: move every weight against its gradient, scaled by the step size 50
    # (a scalar multiple of the gradient matrix is subtracted from W);
    # done under no_grad so the in-place change is not recorded by autograd
    with torch.no_grad():
        W -= 50 * W.grad
print(loss.item())

In [None]:
# sample from neural net (with same seed, get identical results to array result)
g = torch.Generator().manual_seed(2147483647)
for _ in range(10):
    out = []
    ix = 0  # start from the '.' marker
    while True:
        # forward pass for the single current character
        xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float()
        logits = torch.matmul(xenc, W)
        counts = torch.exp(logits)
        p = counts / counts.sum(dim=1, keepdim=True)  # prob for next char
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix])
        if ix == 0:
            # drew the '.' marker again: the sample is complete
            break
    print(''.join(out))