In [1]:
# Load the dataset: one entry per line of names.txt.
# The context manager closes the file handle as soon as the contents are read,
# instead of leaving an open handle around for the lifetime of the kernel.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()

In [None]:
# Peek at the first ten entries to sanity-check the load
words[:10]

In [None]:
# Total number of entries (lines) read from names.txt
len(words)

In [None]:
# Length of the shortest entry in the dataset
min(map(len, words))

In [None]:
# Length of the longest entry in the dataset
max(map(len, words))

In [6]:
# Count bigram occurrences in a dictionary keyed by (first_char, second_char).
# Each word is wrapped in explicit start/end tokens so that the bigrams also
# capture which characters begin and end a word.
b = {}
for w in words:
    chs = ['<S>', *w, '<E>']
    # Pair every token with its successor; zip stops at the shorter sequence
    for bigram in zip(chs, chs[1:]):
        ch1, ch2 = bigram
        b[bigram] = b.get(bigram, 0) + 1

In [None]:
# Rank the bigrams from most to least frequent.
# Sorting on the negated count gives descending order; the sort is stable,
# so bigrams with equal counts stay in their insertion order.
sorted(b.items(), key=lambda item: -item[1])

In [8]:
import torch  # type: ignore

In [22]:
# Integer table of bigram counts, indexed as N[first_char_id, second_char_id].
# 27 rows/columns: presumably 26 letters plus one boundary token — confirm
# against the alphabet actually found in the data.
N = torch.zeros(27, 27, dtype=torch.int32)

In [26]:
# Get the alphabet as a sorted list of single-character strings
chs = sorted(set("".join(words)))

# Build the character -> id and id -> character mappings.
# Ids for real characters start at 1 because id 0 is reserved for '.',
# the token used to mark both the start and the end of a word.
stoi = {s: i for i, s in enumerate(chs, start=1)}
stoi['.'] = 0
itos = {i: s for s, i in stoi.items()}

In [27]:
# Count bigram occurrences in a 2D tensor: N[ix, iy] is the number of times the
# character with id iy directly follows the character with id ix. The '.' token
# (id 0) is placed on both sides of each word to mark its start and its end.
#
# The table is rebuilt from zeros here, so re-running this cell yields the same
# counts instead of adding a second pass on top of the previous totals. Its size
# comes from the vocabulary rather than a hard-coded 27, so it always matches stoi.
N = torch.zeros((len(stoi), len(stoi)), dtype=torch.int32)
for w in words:
    chs = ['.'] + list(w) + ['.']
    # Pair every token with its successor
    for ch1, ch2 in zip(chs, chs[1:]):
        ix = stoi[ch1]
        iy = stoi[ch2]
        N[ix, iy] += 1

In [None]:
import matplotlib.pyplot as plt  # type: ignore
%matplotlib inline

# Draw the bigram count table as a heat map. Every cell is annotated with the
# bigram it stands for (top) and its count (bottom).
# The explicit fig/ax interface is used instead of the implicit pyplot state.
fig, ax = plt.subplots(figsize=(16, 16))
ax.imshow(N, cmap='Blues')
size = len(itos)
for i in range(size):
    for j in range(size):
        chstr = itos[i] + itos[j]
        # x is the column (second character), y is the row (first character)
        ax.text(j, i, chstr, ha='center', va='bottom', color='gray')
        ax.text(j, i, str(N[i, j].item()), ha='center', va='top', color='gray')
# Trailing semicolon keeps the axis-limits tuple out of the cell output
ax.axis('off');

In [None]:
# Row 0 of the count table: how often each character follows the '.' token,
# i.e. how often each character appears first in a word
N[0]

In [None]:
# Turn the counts in row 0 into a probability distribution:
# convert to float first, then divide by the row total so the entries sum to 1
p = N[0].to(torch.float32)
p = p / p.sum()
p

In [None]:
# Draw a single character id from the distribution p.
# A dedicated, seeded generator makes the draw reproducible.
g = torch.Generator()
g.manual_seed(13)
ix = torch.multinomial(p, 1, replacement=True, generator=g).item()
itos[ix]

In [209]:
# Add-one smoothing: add 1 to every count so no bigram ends up with probability 0
# (a zero probability would make its log-probability -inf).
P = (N + 1).to(torch.float32)
# Normalise each row to sum to 1. keepdim=True keeps the row sums as a column
# vector, so the division broadcasts across rows rather than across columns.
P = P / P.sum(dim=1, keepdim=True)

In [None]:
# Randomly generate five names by walking the bigram table
g = torch.Generator().manual_seed(13)  # fixed seed -> reproducible samples

for i in range(5):
    ix = 0  # start from the '.' boundary token
    out = []
    finished = False
    while not finished:
        # Distribution over the next character, given the current one
        p = P[ix]
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix])
        # Sampling '.' again marks the end of the name (the '.' is kept in the output)
        finished = ix == 0
    print(''.join(out))

In [None]:
# Evaluate the model with the average Negative Log Likelihood (NLL).
# likelihood      = a * b * c * ... * z        (product of the bigram probabilities)
# log(likelihood) = log(a) + log(b) + log(c) + ... + log(z)
# Summing logs avoids multiplying many small numbers together.

n = 0
log_likelihood = 0.0
print('| bigram | prob   | -log(prob)')
for w in ['yaroslav']:
    chs = ['.', *w, '.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix, iy = stoi[ch1], stoi[ch2]
        prob = P[ix, iy]
        logprob = prob.log()
        log_likelihood = log_likelihood + logprob
        n = n + 1
        print(f'|   {ch1}{ch2}   | {prob:0.4f} | {-logprob:0.4f}')

print(f'{log_likelihood = }')
# Negate and average over the number of bigrams: lower is better
nll = -log_likelihood / n
print(f'NLL (average): {nll.item()}')