In [10]:
import torch
import torch.nn.functional as F

from matplotlib import pyplot as plt
%matplotlib inline

In [6]:
# Create the training set of bigrams: xs holds the index of the current
# character and ys the index of the character that follows it.
special_token = "."
bag = dict()  # not used by any visible cell; kept in case a later cell reads it
# Read through a context manager so the file handle is closed right away
# instead of lingering until garbage collection.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()
chars = sorted(set(''.join(words)))
# Index 0 is reserved for the start/end marker; real characters start at 1.
stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi[special_token] = 0
itos = {i: s for s, i in stoi.items()}

xs, ys = [], []

# Only the first word is used in this cell (words[:1]).
for w in words[:1]:
    chs = [special_token] + list(w) + [special_token]  # adding start and end tokens
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        xs.append(ix1)
        ys.append(ix2)

xs = torch.tensor(xs)
ys = torch.tensor(ys)

In [43]:
# Seed a dedicated generator so the random initialisation is repeatable.
g = torch.Generator()
g.manual_seed(2147483647)
# Square weight matrix with len(stoi) rows and len(stoi) columns.
W = torch.randn(len(stoi), len(stoi), generator=g)

In [44]:
# One-hot encode the input indices, cast to float32 for the matmul with W.
xenc = F.one_hot(xs, num_classes=len(stoi)).to(torch.float32)
logits = torch.matmul(xenc, W)
# Softmax by hand: exponentiate, then normalise each row so it sums to 1.
counts = torch.exp(logits)
probs = counts / counts.sum(dim=1, keepdim=True)

In [45]:
# Sanity check: one row per entry of xs, one column per symbol in stoi.
probs.shape

torch.Size([5, 27])

In [46]:
# Walk through every example, report what the current weights predict for it,
# and collect the negative log likelihood of each correct label.
nlls = torch.zeros(probs.shape[0])
for i in range(len(nlls)):
    x, y = xs[i].item(), ys[i].item()
    print(
        "-" * 10,
        f"example: {itos[x]}{itos[y]} (indexes {x}, {y})",
        f"Input: {x} ({itos[x]})",
        f"Output probabilities: {probs[i]}",
        f"Correct label: {y} ({itos[y]})",
        sep="\n",
    )
    # Probability assigned to the correct next character.
    p = probs[i, y]
    print(f"probability of the correct label with current weights: {p.item()}")
    logp = p.log()
    print(f"log likelihood: {logp.item()}")
    nll = -logp
    print(f"negative log likelihood: {nll.item()}")
    nlls[i] = nll

# The loss is the mean of the per-example negative log likelihoods.
print("=" * 20, f"average negative log likelihood (loss): {nlls.mean().item()}", sep="\n")

----------
example: .e (indexes 0, 5)
Input: 0 (.)
Output probabilities: tensor([0.0607, 0.0100, 0.0123, 0.0042, 0.0168, 0.0123, 0.0027, 0.0232, 0.0137,
        0.0313, 0.0079, 0.0278, 0.0091, 0.0082, 0.0500, 0.2378, 0.0603, 0.0025,
        0.0249, 0.0055, 0.0339, 0.0109, 0.0029, 0.0198, 0.0118, 0.1537, 0.1459])
Correct label: 5 (e)
probability of the correct label with current weights: 0.01228625513613224
log likelihood: -4.399273872375488
negative log likelihood: 4.399273872375488
----------
example: em (indexes 5, 13)
Input: 5 (e)
Output probabilities: tensor([0.0290, 0.0796, 0.0248, 0.0521, 0.1989, 0.0289, 0.0094, 0.0335, 0.0097,
        0.0301, 0.0702, 0.0228, 0.0115, 0.0181, 0.0108, 0.0315, 0.0291, 0.0045,
        0.0916, 0.0215, 0.0486, 0.0300, 0.0501, 0.0027, 0.0118, 0.0022, 0.0472])
Correct label: 13 (m)
probability of the correct label with current weights: 0.018050700426101685
log likelihood: -4.014570713043213
negative log likelihood: 4.014570713043213
----------
example: m

# Optimization with backprop

In [79]:
# Create the dataset: every adjacent character pair of every word becomes one
# (input index, target index) example.
special_token = "."
bag = dict()  # not used by any visible cell; kept in case a later cell reads it
# Read through a context manager so the file handle is closed right away
# instead of lingering until garbage collection.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()
chars = sorted(set(''.join(words)))
# Index 0 is reserved for the start/end marker; real characters start at 1.
stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi[special_token] = 0
itos = {i: s for s, i in stoi.items()}

xs, ys = [], []

for w in words:
    chs = [special_token] + list(w) + [special_token]  # adding start and end tokens
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        xs.append(ix1)
        ys.append(ix2)

xs = torch.tensor(xs)
ys = torch.tensor(ys)
num_examples = xs.nelement()

# init the "network": a single square weight matrix that tracks gradients
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((len(stoi), len(stoi)), generator=g, requires_grad=True)

In [85]:
alpha = 50  # learning rate
regularization_param = 0.01  # weight of the mean-squared-weight penalty

# Neither the one-hot inputs nor the row index depend on W, so build them once
# instead of re-allocating them on every epoch.
xenc = F.one_hot(xs, num_classes=len(stoi)).float()
example_idx = torch.arange(num_examples)

# gradient descent
for epoch in range(100):
    # forward pass
    logits = xenc @ W
    # softmax
    counts = logits.exp()
    probs = counts / counts.sum(dim=1, keepdims=True)
    # mean negative log likelihood of the correct labels + L2-style penalty on W
    loss = -probs[example_idx, ys].log().mean() + regularization_param * (W ** 2).mean()
    print(f"Loss: {loss.item():.4f}")
    # backward pass
    W.grad = None  # drop the gradient left over from the previous step
    loss.backward()

    # update parameters; no_grad keeps the in-place step out of the autograd
    # graph without going through the discouraged `.data` attribute
    with torch.no_grad():
        W += -alpha * W.grad

Loss: 2.4834
Loss: 2.4834
Loss: 2.4833
Loss: 2.4833
Loss: 2.4833
Loss: 2.4832
Loss: 2.4832
Loss: 2.4832
Loss: 2.4832
Loss: 2.4831
Loss: 2.4831
Loss: 2.4831
Loss: 2.4831
Loss: 2.4830
Loss: 2.4830
Loss: 2.4830
Loss: 2.4830
Loss: 2.4829
Loss: 2.4829
Loss: 2.4829
Loss: 2.4829
Loss: 2.4828
Loss: 2.4828
Loss: 2.4828
Loss: 2.4828
Loss: 2.4828
Loss: 2.4827
Loss: 2.4827
Loss: 2.4827
Loss: 2.4827
Loss: 2.4827
Loss: 2.4827
Loss: 2.4826
Loss: 2.4826
Loss: 2.4826
Loss: 2.4826
Loss: 2.4826
Loss: 2.4826
Loss: 2.4825
Loss: 2.4825
Loss: 2.4825
Loss: 2.4825
Loss: 2.4825
Loss: 2.4825
Loss: 2.4824
Loss: 2.4824
Loss: 2.4824
Loss: 2.4824
Loss: 2.4824
Loss: 2.4824
Loss: 2.4824
Loss: 2.4824
Loss: 2.4823
Loss: 2.4823
Loss: 2.4823
Loss: 2.4823
Loss: 2.4823
Loss: 2.4823
Loss: 2.4823
Loss: 2.4822
Loss: 2.4822
Loss: 2.4822
Loss: 2.4822
Loss: 2.4822
Loss: 2.4822
Loss: 2.4822
Loss: 2.4822
Loss: 2.4822
Loss: 2.4821
Loss: 2.4821
Loss: 2.4821
Loss: 2.4821
Loss: 2.4821
Loss: 2.4821
Loss: 2.4821
Loss: 2.4821
Loss: 2.4821

In [89]:
# sample: generate five names by repeatedly drawing the next character from
# the model's predicted distribution until the end marker (index 0) comes up
g = torch.Generator().manual_seed(2147483647)

# W tracks gradients, but sampling never backpropagates; disabling autograd
# avoids building a throwaway graph on every step.
with torch.no_grad():
    for i in range(5):
        out = []
        ix = 0  # start from the special start/end token
        while True:
            xenc = F.one_hot(torch.tensor([ix]), num_classes=len(stoi)).float()
            logits = xenc @ W
            # softmax
            counts = logits.exp()
            probs = counts / counts.sum(dim=1, keepdims=True)
            ix = torch.multinomial(probs, num_samples=1, replacement=True, generator=g).item()
            if ix == 0:
                break
            out.append(itos[ix])
        print(''.join(out))

junide
janasah
prelay
a
nn
