### Build makemore Bigram yay.

In [None]:
# Load the dataset: the file's whitespace-separated tokens become the list of
# training names. The `with` block closes the file handle as soon as the
# contents have been read instead of leaving it open for the whole session.
with open('../names.txt', 'r') as f:
    words = f.read().split()

In [None]:
import torch
import matplotlib.pyplot as plt
%matplotlib inline


# Character vocabulary. Index 0 is reserved for the '.' boundary token that is
# put both before and after every name; the sorted characters found in the
# data take indices 1 and up.
chars = sorted(set("".join(words)))
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
itos = {idx: ch for ch, idx in stoi.items()}

# N[a, b] counts how many times character b directly follows character a.
# The fixed 27x27 size assumes 26 distinct characters plus the boundary token.
N = torch.zeros((27, 27), dtype=torch.int32)

for w in words:
    padded = "." + w + "."
    # Pair every character with its successor, boundary tokens included.
    for ch1, ch2 in zip(padded, padded[1:]):
        N[stoi[ch1], stoi[ch2]] += 1

# Draw the count matrix as a heatmap and label every cell with its bigram
# (upper text, silver) and its raw count (lower text, black).
plt.figure(figsize=(16, 16))
plt.imshow(N, cmap='Blues')
for row in range(27):
    first = itos[row]
    for col in range(27):
        count = N[row, col].item()
        # imshow puts the column on the x axis and the row on the y axis.
        plt.text(col, row, first + itos[col], ha='center', va='bottom', color='silver')
        plt.text(col, row, count, ha='center', va='top', color='black')
plt.axis('off')

In [None]:
# P stores the bigram table as probabilities: each row of counts is turned
# into a distribution over the next character.
# Adding 1 to every count (smoothing) keeps every probability above zero.
P = (N + 1).float()
P = P / P.sum(dim=1, keepdim=True)

# Generate 10 names from the bigram model; the fixed seed makes the samples
# reproducible.
g = torch.Generator().manual_seed(2147483647)

for sample_no in range(10):
    sampled = []
    ix = 0  # every name starts from the '.' boundary token
    while True:
        # P was normalized once above, so each step is just a row lookup.
        ix = torch.multinomial(P[ix], num_samples=1, replacement=True, generator=g).item()
        sampled.append(itos[ix])
        if ix == 0:  # drawing '.' again ends the name
            break
    print("".join(sampled))


In [None]:
# Evaluate the bigram model on the training data.
# The measure is the average negative log-likelihood over all bigrams.
log_likehood = 0.0
n = 0

for w in words:
    padded = "." + w + "."
    for ch1, ch2 in zip(padded, padded[1:]):
        prob = P[stoi[ch1], stoi[ch2]]
        logprob = prob.log()
        log_likehood += logprob
        n += 1
        print(f"{ch1}->{ch2}: {prob:.4f} {logprob:.4f}")

# Normalize by the number of bigrams so the value does not grow with the
# size of the dataset.
avg_log_likelihood = log_likehood / n
print(f"Average log-likelihood: {avg_log_likelihood:.4f}")
nll = -avg_log_likelihood
print(f"Negative log-likelihood: {nll:.4f}")


In [None]:
# Evaluate one hand-picked name.
# Without smoothing, a bigram with probability 0 (the original note gives
# j -> q in "andrejq" as the example) has log-probability -inf, which drives
# the negative log-likelihood to inf. Adding a small value to the count matrix
# before normalizing avoids this.
# Swap the list below for ["andrejq"] to try that case.
log_likehood = 0.0
n = 0

for w in ["andrej"]:
    padded = "." + w + "."
    for ch1, ch2 in zip(padded, padded[1:]):
        prob = P[stoi[ch1], stoi[ch2]]
        logprob = prob.log()
        log_likehood += logprob
        n += 1
        print(f"{ch1}->{ch2}: {prob:.4f} {logprob:.4f}")

avg_log_likelihood = log_likehood / n
print(f"Average log-likelihood: {avg_log_likelihood:.4f}")
nll = -avg_log_likelihood
print(f"Negative log-likelihood: {nll:.4f}")

### Next, we use a neural network to solve the problem.

In [None]:
# Turn the names into a supervised training set for the neural network:
# for every bigram, the input x is the index of the first character and the
# target y is the index of the character that follows it.
xs, ys = [], []

for w in words:
    padded = "." + w + "."
    for ch1, ch2 in zip(padded, padded[1:]):
        xs.append(stoi[ch1])
        ys.append(stoi[ch2])
        print(f"{ch1}->{ch2}")

# torch.tensor (lowercase) infers the dtype from the data, so the indices stay
# integers; torch.Tensor would convert them to float32.
xs = torch.tensor(xs)
ys = torch.tensor(ys)

In [None]:
# One-hot encode the input indices: each index becomes a 27-wide row that is
# all zeros except for a single 1 at that index.
import torch.nn.functional as F

# Every class index must be smaller than num_classes, so 27 is the smallest
# value that works here. The result is cast to float so that it can be
# multiplied with the float weight matrix.
xenc = F.one_hot(xs, num_classes=27).to(torch.float32)
xenc

In [None]:
# Show the one-hot matrix as an image: one row per encoded input, 27 columns,
# with a single active cell in each row.
plt.imshow(xenc)

In [None]:
# Weights of a single layer of 27 neurons, each receiving 27 inputs, drawn
# from a standard normal distribution. The generator is seeded so the initial
# weights are reproducible; requires_grad lets autograd track W.
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True)

In [None]:
# Gradient descent on the single-layer network: 10 full-batch steps that
# minimize the average negative log-likelihood of the training bigrams.

# Row indices used to pick each example's probability for its target
# character; they do not change between iterations, so build them once.
rows = torch.arange(len(xs))

for k in range(10):

    # forward pass
    logits = xenc @ W   # log-counts
    counts = logits.exp()   # equivalent to N
    probs = counts / counts.sum(dim=1, keepdim=True)
    # btw, the last 2 lines here are together called a softmax

    # loss function: mean negative log-probability assigned to the true
    # next character of every example
    loss = -probs[rows, ys].log().mean()
    print(loss.item())

    # backward pass
    W.grad = None   # clear the gradient left over from the previous step
    loss.backward()

    # update weights
    # The step is done in-place under no_grad so that autograd does not record
    # it; this replaces the discouraged `W.data` access, which bypasses
    # autograd's correctness checks.
    with torch.no_grad():
        W -= 10 * W.grad


In [None]:
# Shape of the count-based probability table (built from the 27x27 matrix N).
P.shape

In [None]:
# Shape of the network's output probabilities from the last training step:
# one row per training example, one column per possible next character.
probs.shape

In [None]:
# Finally, sample 5 names from the neural-network model. The seed is the same
# one used for the count-based sampler, so the two outputs can be compared.
g = torch.Generator().manual_seed(2147483647)

for i in range(5):

    out = []
    ix = 0   # every name starts from the '.' boundary token

    while True:

        # Count-based model, for comparison:
        #     p = P[ix]
        #
        # Neural-network model: run the current character through the layer.
        # The intermediate tensors use their own names so this cell does not
        # overwrite the training-set `xenc`, `logits` and `counts`; clobbering
        # them would break a later re-run of the training loop.
        # no_grad: sampling needs no gradients, so skip building the graph.
        with torch.no_grad():
            x_one = F.one_hot(torch.tensor([ix]), num_classes=27).float()
            step_logits = x_one @ W   # predict log-counts
            step_counts = step_logits.exp()   # counts, equivalent to N
            p = step_counts / step_counts.sum(1, keepdim=True)   # probs for next char

        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix])
        if ix == 0:   # drawing '.' again ends the name
            break
    print("".join(out))