# WS_follow_along_makemore_2.ipynb
# WESmith 06/07/23
## follow along with Karpathy video
## https://www.youtube.com/watch?v=TCH_1BHY58I

In [None]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import random
%matplotlib inline

In [None]:
# Read the corpus, one name per line.  The context manager closes the file
# handle as soon as its contents have been read.
with open('../names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

In [None]:
# number of names read from the file (one per line)
len(words)

In [None]:
# Vocabulary: every distinct character that occurs in the corpus, sorted.
chars = sorted(set(''.join(words)))
# Characters are numbered from 1 so that index 0 can be given to '.'.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
# Reverse lookup (index -> character), in the same insertion order as stoi.
itos = dict(zip(stoi.values(), stoi.keys()))
print(itos)

In [None]:
def build_dataset(words, block_size=3):
    """Turn a list of words into (context, next-character) training pairs.

    Each word is terminated with '.' and scanned left to right.  For every
    character, the indices of the ``block_size`` preceding characters
    (left-padded with 0) form one row of X, and the character's own index is
    the matching entry of Y.  Characters are mapped to indices through the
    module-level ``stoi`` table.

    Args:
        words: iterable of strings whose characters are all keys of ``stoi``.
        block_size: number of context characters per example (positive int).

    Returns:
        (X, Y): int64 tensors of shape (N, block_size) and (N,), where N is
        the total number of characters plus one terminator per word.  An
        empty ``words`` gives shapes (0, block_size) and (0,).

    Raises:
        KeyError: if a word contains a character that is not in ``stoi``.
    """
    X, Y = [], []
    for w in words:
        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]  # crop and append
    # Explicit dtype and shape: with no examples torch.tensor([]) would
    # otherwise come back as a 1-D float tensor, which cannot be used to
    # index an embedding table.
    X = torch.tensor(X, dtype=torch.long).reshape(len(Y), block_size)
    Y = torch.tensor(Y, dtype=torch.long)
    print(X.shape, Y.shape)
    return X, Y

In [None]:
# build the dataset
# Shuffle a copy so that `words` keeps its order: an in-place shuffle would
# compound every time this cell is re-run and silently change which names
# land in the train / dev / test splits.
random.seed(42)
shuffled = list(words)
random.shuffle(shuffled)
n1 = int(0.8 * len(shuffled))  # end of the training split (80%)
n2 = int(0.9 * len(shuffled))  # end of the dev split (next 10%)

block_size = 3
Xtr,  Ytr  = build_dataset(shuffled[:n1],   block_size=block_size)
Xdev, Ydev = build_dataset(shuffled[n1:n2], block_size=block_size)
Xte,  Yte  = build_dataset(shuffled[n2:],   block_size=block_size)

In [None]:
# embedding table: 27 rows (one per character index), 2 dimensions each
C = torch.randn(27, 2)

In [None]:
# Pull row 5 out of C with a matrix product: a one-hot vector times C.
# one_hot() yields integers, so cast to float to match C before multiplying.
one_hot_5 = F.one_hot(torch.tensor(5), num_classes=27).float()
one_hot_5 @ C

In [None]:
# indexing with a tensor of indices returns the matching rows of C (here rows 5, 6 and 7)
C[torch.tensor([5, 6, 7])]

In [None]:
# X and Y are not assigned anywhere else at notebook scope (build_dataset only
# uses them as local names), so bind them here.  A small 32-example slice of
# the training split is enough for the shape walkthrough that follows.
X, Y = Xtr[:32], Ytr[:32]
X

In [None]:
# shapes of the example contexts and of the embedding table
X.shape, C.shape

In [None]:
# index C with the whole integer tensor X: every entry of X is replaced by its row of C
emb = C[X]
emb.shape

In [None]:
# Hidden layer with 100 neurons.  Each example supplies three letters with a
# 2-D embedding apiece, hence 6 inputs per neuron.
W1 = torch.randn(6, 100)
b1 = torch.randn(100)  # one bias per hidden neuron

In [None]:
# need to transform emb into an N x 6 array for the NN
# this is one way (that we won't use: it is inefficient with memory,
# because torch.cat builds a new tensor)
# unbind splits emb along dim 1; cat joins the pieces side by side along dim 1
dd = torch.cat(torch.unbind(emb, 1), 1)
dd.shape

In [None]:
# 'view' tutorial here: a view re-interprets the shape of an existing tensor
# without copying its data, so it is efficient
a = torch.arange(18)
a.shape

In [None]:
# the same 18 values seen as 2 rows of 9
a.view(2,9)

In [None]:
# the same 18 values seen as a 3 x 2 x 3 block
a.view(3,2,3)

In [None]:
#a.storage()

In [None]:
# need to transform emb into an N x 6 array for the NN
# -1 lets view() infer N from emb, so this works for any number of examples
emb.view(-1, 6).shape

In [None]:
# dimensions work; or can do emb.view(emb.shape[0], 6)
# also be careful that b1 broadcasts correctly:
# the (N, 100) product plus the (100,) bias adds b1 to every row
h = torch.tanh(emb.view(-1, 6) @ W1 + b1)
h.shape

In [None]:
# Output layer: 27 neurons, each reading the 100 hidden activations.
W2 = torch.randn(100, 27)
b2 = torch.randn(27)  # one bias per output neuron

In [None]:
# output-layer scores (logits): one row per example, one column per output neuron
logits = h @ W2 + b2
logits.shape

In [None]:
# softmax
# Subtract each row's maximum before exponentiating: the resulting
# probabilities are mathematically unchanged, but exp() can no longer
# overflow to inf (which would turn the whole row into nan).
counts = (logits - logits.max(1, keepdim=True).values).exp()
prob   = counts / counts.sum(1, keepdims=True)  # each row now sums to 1
prob.shape

In [None]:
# get negative log likelihood
# For each row pick the probability given to the correct next character.
# The row index is sized from prob instead of a hard-coded example count.
loss = -prob[torch.arange(prob.shape[0]), Y].log().mean()
loss

# CLEAN UP

In [None]:
# shapes of the training contexts and their targets
Xtr.shape, Ytr.shape

In [None]:
# parameters
block_size = 5  # number of characters to use for the prediction
n          = 200 # number of neurons in hidden layer (started at 100)
n_emb      = 15  # embedding dimension (was 2 to begin with)
batch_size = 64  # number of training examples drawn per optimisation step

In [None]:
# build the dataset
# Shuffle a copy so that `words` keeps its order: an in-place shuffle would
# compound every time this cell is re-run, moving names between the splits
# and leaking dev/test examples into a model that is already partly trained.
random.seed(42)
shuffled = list(words)
random.shuffle(shuffled)
n1 = int(0.8 * len(shuffled))  # end of the training split (80%)
n2 = int(0.9 * len(shuffled))  # end of the dev split (next 10%)

Xtr,  Ytr  = build_dataset(shuffled[:n1],   block_size=block_size)
Xdev, Ydev = build_dataset(shuffled[n1:n2], block_size=block_size)
Xte,  Yte  = build_dataset(shuffled[n2:],   block_size=block_size)

In [None]:
# Fresh model parameters, all drawn in a fixed order from one seeded generator
# so that the initialisation is repeatable.
g = torch.Generator()
g.manual_seed(2147483647)
ncharemb = block_size * n_emb  # width of one flattened context

C  = torch.randn(27, n_emb, generator=g)    # embedding table
W1 = torch.randn(ncharemb, n, generator=g)  # hidden-layer weights
b1 = torch.randn((n,), generator=g)         # hidden-layer biases
W2 = torch.randn(n, 27, generator=g)        # output-layer weights
b2 = torch.randn((27,), generator=g)        # output-layer biases
parameters = [C, W1, b1, W2, b2]

In [None]:
# add up the element counts of all tensors in `parameters`
sum(p.nelement() for p in parameters) # number of total parameters

In [None]:
# turn on gradient tracking so loss.backward() fills in .grad for each parameter
for p in parameters:
    p.requires_grad = True  # False by default

In [None]:
# 1000 candidate learning rates, evenly spaced in log10 space from 1e-3 to 1
lrexp = torch.linspace(start=-3, end=0, steps=1000)
lrs   = 10 ** lrexp

In [None]:
# Per-step history of the run (lri is only filled during a learning-rate sweep).
lri   = []
lossi = []
stepi = []

# Constant step size: did most training at 0.1, then reduced in end stages.
# For a learning-rate sweep, set `lr = lrs[i]` inside the loop instead (running
# no more steps than len(lrs)) and record it with `lri.append(lrexp[i])`.
lr = 0.02

for i in range(20000):

    # minibatch: batch_size random row indices into the training split
    ix = torch.randint(0, Xtr.shape[0], (batch_size,))

    # forward pass
    emb    = C[Xtr[ix]]  # (batch_size, block_size, n_emb)
    h      = torch.tanh(emb.view(-1, ncharemb) @ W1 + b1)  # (batch_size, n)
    logits = h @ W2 + b2  # (batch_size, 27)
    # cross_entropy does the softmax and the negative log likelihood in one
    # call: much more efficient than exponentiating and normalising by hand
    loss   = F.cross_entropy(logits, Ytr[ix])

    # backward pass: clear stale gradients, then back-propagate
    for p in parameters:
        p.grad = None
    loss.backward()

    # update: plain gradient-descent step, done in place under no_grad so
    # autograd does not record it (the supported alternative to poking .data)
    with torch.no_grad():
        for p in parameters:
            p -= lr * p.grad

    # track stats (log10 of the loss gives a flatter curve to plot)
    stepi.append(i)
    lossi.append(loss.log10().item())

loss.item()

In [None]:
# look at loss for full training dataset
# no_grad: this is evaluation only, so do not record an autograd graph over
# every training example
with torch.no_grad():
    emb    = C[Xtr]
    h      = torch.tanh(emb.view(-1, ncharemb) @ W1 + b1)
    logits = h @ W2 + b2
    loss   = F.cross_entropy(logits, Ytr)
loss.item()

In [None]:
# look at loss for dev or test dataset
# Karpathy's best in the video for this set is 2.1701
# no_grad: this is evaluation only, so do not record an autograd graph
with torch.no_grad():
    emb    = C[Xdev]
    h      = torch.tanh(emb.view(-1, ncharemb) @ W1 + b1)
    logits = h @ W2 + b2
    loss   = F.cross_entropy(logits, Ydev)
loss.item()

# sample from the model

In [None]:
# dedicated, seeded generator so that the sampled names are repeatable
sample_seed = 12345 + 10
g = torch.Generator().manual_seed(sample_seed)

In [None]:
# Generate 20 names, one character at a time, by sampling from the model.
with torch.no_grad():  # inference only: no autograd graph is needed
    for _ in range(20):
        out = []
        context = [0] * block_size  # start from a context of all index 0
        while True:
            emb = C[torch.tensor([context])]  # a batch holding one context
            h = torch.tanh(emb.view(1, ncharemb) @ W1 + b1)
            logits = h @ W2 + b2
            probs  = F.softmax(logits, dim=1)
            # draw the next character index from the predicted distribution
            ix     = torch.multinomial(probs, num_samples=1, generator=g).item()
            context = context[1:] + [ix]  # slide the window forward
            out.append(ix)
            if ix == 0:  # index 0 ends the name
                break
        print(''.join(itos[i] for i in out))