In [None]:
# load the names dataset from file
from pathlib import Path

# the dataset directory sits two levels above the current working directory
data_dir = Path.cwd().joinpath("..", "..", "data")


def load_names(path: Path) -> list[str]:
    """Read a text file and return its lines.

    Args:
        path: location of the file to read.

    Returns:
        The lines of the file, in order, without their line terminators.
        An empty file yields an empty list.
    """
    # decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding
    return path.read_text(encoding="utf-8").splitlines()


# read every line of names.txt into memory, then preview the first four entries
words = load_names(data_dir / "names.txt")
words[:4]

In [None]:
# length of the shortest and of the longest word in the dataset
min_length = len(min(words, key=len))
max_length = len(max(words, key=len))
print(f"min len = {min_length}")
print(f"max_len = {max_length}")

**Counting Bigrams**

In [None]:
# global sentinel token: the same "." marks both the start and the end of a word
TOKEN_DOT = "."

In [None]:
# tally how often each (first, second) character pair occurs, with every
# word wrapped in the sentinel token on both sides
bigram_counts = {}
for name in words:
    tokens = [TOKEN_DOT, *name, TOKEN_DOT]
    for pair in zip(tokens[:-1], tokens[1:]):
        bigram_counts[pair] = bigram_counts.get(pair, 0) + 1

In [None]:
# the ten most frequent bigrams, highest count first
sorted(bigram_counts.items(), key=lambda kv: -kv[1])[:10]

**Converting to a Matrix Representation**

In [None]:
# LUT construction: every distinct character in the dataset, in sorted order
chars = sorted(set("".join(words)))

# string-to-index: characters start at 1, index 0 is kept for the sentinel
stoi = {c: i for i, c in enumerate(chars, start=1)}
stoi[TOKEN_DOT] = 0

# index-to-string: the inverse of stoi
itos = dict(zip(stoi.values(), stoi.keys()))

# the two tables must be the same size and round-trip every character
assert len(itos) == len(stoi), "broken invariant"
assert all(c == itos[stoi[c]] for c in chars), "broken invariant"

In [None]:
# transition to array-based representation of bigram counts (efficiency)
import torch

# one row/column per entry of the lookup table (every character plus the
# sentinel token); derived from stoi instead of hard-coding 27 so the matrix
# always matches the character set that was actually loaded
ALPHABET_SIZE = len(stoi)

# initialize counts to 0
N = torch.zeros((ALPHABET_SIZE, ALPHABET_SIZE), dtype=torch.int32)

for w in words:
    chs = [TOKEN_DOT] + list(w) + [TOKEN_DOT]
    for left, right in zip(chs, chs[1:]):
        # rows index the first character of the bigram, columns the second
        N[stoi[left], stoi[right]] += 1

In [None]:
# cross-check: each dictionary count must equal the matching matrix entry
for (first, second), count in bigram_counts.items():
    matrix_count = N[stoi[first], stoi[second]].item()
    assert matrix_count == count, "broken invariant"

In [None]:
import matplotlib.pyplot as plt

%matplotlib inline

# heatmap of the count matrix; each cell is annotated with its bigram
# (upper text) and its count (lower text)
plt.figure(figsize=(16, 16))
plt.imshow(N, cmap="Blues")
for row in range(ALPHABET_SIZE):
    for col in range(ALPHABET_SIZE):
        label = f"{itos[row]}{itos[col]}"
        cell_count = f"{N[row, col].item()}"
        plt.text(col, row, label, ha="center", va="bottom", color="gray")
        plt.text(col, row, cell_count, ha="center", va="top", color="gray")

plt.axis("off")

**Sampling from the Model**

In [None]:
# add 1 to every count so that no bigram ends up with probability zero, then
# divide each row by its sum to get one probability distribution per row
P = (N + 1).float()
P = P / P.sum(dim=1, keepdim=True)

In [None]:
def sample_one(model: torch.Tensor, g: torch.Generator) -> str:
    """Draw one word by repeatedly sampling the next character index.

    Starting from row 0 (the start token '.'), the row for the current index
    is used as sampling weights for the next index. Sampling index 0 again
    means the stop token was drawn and the word is complete.

    Args:
        model: tensor indexed as ``model[current, :]`` to obtain the sampling
            weights for the next index.
        g: generator passed to ``torch.multinomial``.

    Returns:
        The sampled characters joined into a string (empty if the stop token
        is drawn first).
    """
    pieces = []
    ix = 0  # row of the start token '.'
    while True:
        weights = model[ix, :].float()
        ix = torch.multinomial(weights, num_samples=1, generator=g).item()

        # index 0 is the stop token: the word is finished
        if ix == 0:
            break

        pieces.append(itos[ix])

    return "".join(pieces)


def sample(model: torch.Tensor, k: int = 1, seed: int = 1337):
    """Draw k words from the model using a freshly seeded generator.

    Args:
        model: tensor handed to ``sample_one`` for every draw.
        k: how many words to draw.
        seed: seed for the generator shared by all k draws.

    Returns:
        A list with the k sampled words, in the order they were drawn.
    """
    g = torch.Generator()
    g.manual_seed(seed)

    drawn = []
    for _ in range(k):
        drawn.append(sample_one(model, g))
    return drawn

In [None]:
# draw eight words from P; the default seed is used, so the generator starts from the same state on every run
samples = sample(P, k=8)
samples

**Loss Function**

In [None]:
import numpy as np
import math

# natural log over [0.1, 100]; np.log is applied to the whole array at once
# instead of calling math.log on each element in a Python loop
x = np.linspace(0.1, 100)
y = np.log(x)

plt.plot(x, y)

In [None]:
def loss(model: torch.Tensor, data: list[str]) -> float:
    """Compute the mean negative log-likelihood of ``data`` under ``model``.

    Each word is wrapped in the sentinel token on both sides and split into
    consecutive character pairs. The loss is the negated sum of
    ``log(model[first, second])`` over all pairs, divided by the number of
    pairs.

    Args:
        model: 2-D tensor indexed as ``model[first, second]`` with the
            indices produced by ``stoi``.
        data: the words to score.

    Returns:
        The average negative log-likelihood per bigram.

    Raises:
        KeyError: if a word contains a character that is missing from
            ``stoi``.
        ZeroDivisionError: if ``data`` is empty, since there are no bigrams
            to average over.
    """
    # take the log of the whole table once instead of once per bigram, and
    # convert to nested Python floats so the loop does no tensor work
    log_probs = torch.log(model).tolist()

    # the number of bigrams
    n = 0

    log_likelihood = 0.0
    for w in data:
        chs = [TOKEN_DOT] + list(w) + [TOKEN_DOT]
        for left, right in zip(chs, chs[1:]):
            log_likelihood += log_probs[stoi[left]][stoi[right]]
            n += 1

    # negate to get the negative log-likelihood, then average over bigrams
    return -log_likelihood / n

In [None]:
# mean negative log-likelihood per bigram over the entire training set
loss(P, words)

**Model Smoothing**

In [None]:
# score a single name; P was built from N + 1, so every bigram in it has a nonzero probability
loss(P, ["andrejq"])