In [None]:
import random
from typing import List, Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from tqdm.autonotebook import tqdm

%matplotlib inline

In [None]:
# Run on the GPU when PyTorch can see a CUDA device, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Read the corpus: each line of the file becomes one entry of `words`.
# The context manager closes the file handle as soon as the read finishes
# instead of leaving it open until the garbage collector gets to it.
with open("../../data/names.txt", "r") as names_file:
    words = names_file.read().splitlines()
words[:8]  # preview the first few entries

In [None]:
# Number of entries (one per line of the input file) held in `words`.
len(words)

In [None]:
# Character-level vocabulary: every distinct character found in `words` gets an
# integer id starting at 1 (in sorted order); "." is then assigned id 0.
chars = sorted(set("".join(words)))
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi["."] = 0
# Inverse lookup (id -> character), built in stoi's insertion order.
itos = dict((index, symbol) for symbol, index in stoi.items())
print(itos)

In [None]:
class WordTokensDataset(Dataset):
    """Next-character prediction examples built from a list of words.

    Every word is terminated with "." and scanned left to right. Each character
    produces one example: the ``block_size`` token ids that precede it
    (left-padded with id 0) as the input, and the character's own id as the
    target.

    Attributes are documented on the assignments in ``__init__``.
    """

    def __init__(
        self,
        words: List[str],
        block_size: int,
        token_to_index: Optional[dict] = None,
    ):
        """Build the context/target tensors.

        Args:
            words: words to convert into training examples.
            block_size: number of preceding token ids used as context.
            token_to_index: character -> integer id mapping. Defaults to the
                notebook-level ``stoi`` mapping when omitted.

        Raises:
            KeyError: if a character (or ".") is missing from the mapping.
        """
        mapping = stoi if token_to_index is None else token_to_index

        X, Y = [], []
        for w in words:
            context = [0] * block_size
            for ch in w + ".":
                ix = mapping[ch]
                X.append(context)
                Y.append(ix)
                # slide the window: drop the oldest id, append the newest
                context = context[1:] + [ix]

        # Explicit dtype and shape so that an empty word list still yields
        # integer tensors of shape (0, block_size) / (0,) rather than a float
        # tensor of shape (0,), which cannot be used as indices or targets.
        # X: (num_examples, block_size) context ids; Y: (num_examples,) targets.
        self.X = torch.tensor(X, dtype=torch.long).reshape(len(X), block_size)
        self.Y = torch.tensor(Y, dtype=torch.long)

    def __getitem__(self, idx):
        """Return the ``(context, target)`` pair stored at ``idx``."""
        return self.X[idx], self.Y[idx]

    def __len__(self):
        """Return the number of examples."""
        return self.X.shape[0]

In [None]:
random.seed(42)
block_size = 3

# Shuffle a copy rather than `words` itself. Shuffling in place made this cell
# non-idempotent: a second run would reshuffle the already-shuffled list and
# silently change which words land in the train/validation/test splits.
shuffled_words = list(words)
random.shuffle(shuffled_words)

# 80% train / 10% validation / 10% test, split at word granularity.
n1 = int(0.8 * len(shuffled_words))
n2 = int(0.9 * len(shuffled_words))

train_dataset = WordTokensDataset(shuffled_words[:n1], block_size)
validation_dataset = WordTokensDataset(shuffled_words[n1:n2], block_size)
test_dataset = WordTokensDataset(shuffled_words[n2:], block_size)

In [None]:
# Sanity check: X holds one row of `block_size` context ids per example and
# Y holds one target id per example, so the leading dimensions must match.
train_dataset.X.shape, train_dataset.Y.shape

In [None]:
class WordTokenModel(nn.Module):
    """Character-level MLP: embedding lookup -> tanh hidden layer -> logits.

    Parameters (all initialised from a standard normal distribution):
        C:  (token_count, embedding_layer_size) embedding table.
        W1: (embedding_layer_size * block_size, hidden_layer_size) weights.
        b1: (hidden_layer_size,) bias.
        W2: (hidden_layer_size, token_count) weights.
        b2: (token_count,) bias.
    """

    def __init__(
        self,
        token_count: int,
        block_size: int,
        embedding_layer_size: int,
        hidden_layer_size: int,
        generator: Optional[torch.Generator] = None,
    ):
        """Create and randomly initialise the parameters.

        Args:
            token_count: vocabulary size (number of distinct token ids).
            block_size: number of context tokens fed to the model.
            embedding_layer_size: width of each token embedding.
            hidden_layer_size: width of the hidden layer.
            generator: random generator used for initialisation; a new
                ``torch.Generator()`` is created when omitted.
        """
        super().__init__()

        if generator is None:
            generator = torch.Generator()

        # NOTE: parameters are drawn in a fixed order (C, W1, b1, W2, b2) so a
        # seeded generator always reproduces the same initial weights.
        self.C = nn.Parameter(
            torch.randn((token_count, embedding_layer_size), generator=generator)
        )
        self.W1 = nn.Parameter(
            torch.randn(
                (embedding_layer_size * block_size, hidden_layer_size),
                generator=generator,
            )
        )
        self.b1 = nn.Parameter(torch.randn(hidden_layer_size, generator=generator))
        self.W2 = nn.Parameter(
            torch.randn((hidden_layer_size, token_count), generator=generator)
        )
        self.b2 = nn.Parameter(torch.randn(token_count, generator=generator))

    def forward(self, X: torch.Tensor):
        """Map integer contexts of shape (batch, block_size) to logits.

        Returns:
            Tensor of shape (batch, token_count) with unnormalised scores.
        """
        emb = self.C[X]  # (batch, block_size, embedding_layer_size)
        # Concatenate the per-token embeddings. The flattened width is taken
        # from W1 instead of a hard-coded 30, so any combination of block_size
        # and embedding_layer_size works.
        emb_flat = emb.view(-1, self.W1.shape[0])
        h = torch.tanh(emb_flat @ self.W1 + self.b1)  # (batch, hidden_layer_size)
        logits = h @ self.W2 + self.b2  # (batch, token_count)
        return logits

In [None]:
# Dedicated, explicitly seeded generator so the initial weights are the same
# on every run.
init_generator = torch.Generator()
init_generator.manual_seed(2147483647)

model = WordTokenModel(
    len(stoi),
    block_size,
    embedding_layer_size=10,
    hidden_layer_size=200,
    generator=init_generator,
)
# Move the parameters to the selected device.
model = model.to(device)

In [None]:
# Larger batches and a larger sample budget when training on a GPU.
# Comparing device.type also recognises indexed devices such as "cuda:0".
on_gpu = device.type == "cuda"
BATCH_SIZE = 512 if on_gpu else 32
TOTAL_SAMPLES_TO_TRAIN = 40_000_000 if on_gpu else 5_000_000
EPOCHS = TOTAL_SAMPLES_TO_TRAIN // len(train_dataset)

train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    # Pinned host memory only matters for host -> GPU copies; requesting it on
    # a CPU-only machine just produces a warning.
    pin_memory=on_gpu,
)

# Per-epoch history: epoch index and log10 of the last mini-batch loss.
lossi = []
stepi = []


for epoch in tqdm(range(EPOCHS)):
    # Step schedule: 0.1 for the first half of training, 0.01 afterwards.
    # The value is constant within an epoch, so compute it once per epoch.
    lr = 0.1 if epoch < EPOCHS * 0.5 else 0.01

    for X_batch, Y_batch in train_dataloader:

        X_batch = X_batch.to(device)
        Y_batch = Y_batch.to(device)

        # forward pass (call the module, not .forward, so hooks are honoured)
        logits = model(X_batch)
        loss = F.cross_entropy(logits, Y_batch)

        # backward pass
        for p in model.parameters():
            p.grad = None
        loss.backward()

        # update: plain SGD step, performed outside of autograd tracking
        with torch.no_grad():
            for p in model.parameters():
                p += -lr * p.grad

    # track stats (loss of the final mini-batch of this epoch)
    stepi.append(epoch)
    lossi.append(loss.log10().item())

In [None]:
# One point per epoch: log10 of the loss of the last mini-batch in that epoch.
fig, ax = plt.subplots()
ax.plot(stepi, lossi)
ax.set_xlabel("epoch")
ax.set_ylabel("log10(loss) of last mini-batch")
ax.set_title("Training loss");  # trailing ';' hides the text repr in the cell output

In [None]:
def calculate_loss(model: "WordTokenModel", dataset: "WordTokensDataset") -> float:
    """Return the mean cross-entropy of ``model`` over the whole ``dataset``.

    The full ``dataset.X`` / ``dataset.Y`` tensors are evaluated in a single
    forward pass, so both must fit in memory on the model's device.

    Args:
        model: module mapping ``dataset.X`` to logits of shape (N, classes).
        dataset: object exposing tensors ``X`` (inputs) and ``Y`` (targets).

    Returns:
        The loss as a Python float.
    """
    # Evaluate on whatever device the model lives on, so the inputs can never
    # end up on a different device than the parameters.
    first_param = next(model.parameters(), None)
    target_device = dataset.X.device if first_param is None else first_param.device

    X = dataset.X.to(target_device)
    Y = dataset.Y.to(target_device)

    # Pure evaluation: skip building an autograd graph over the entire dataset.
    with torch.no_grad():
        logits = model(X)
        loss = F.cross_entropy(logits, Y)
    return loss.item()

In [None]:
# Full-split losses for the training and validation data.
training_loss = calculate_loss(model, train_dataset)
validation_loss = calculate_loss(model, validation_dataset)
# ".4f" prints four decimal places; "4f" would only request a minimum field
# width of 4 and still print the default six decimals.
print(f"{training_loss = :.4f}, {validation_loss = :.4f}")

In [None]:
# Scatter the first two embedding dimensions of C, one labelled point per token.
C_cpu = model.C.cpu()

plt.figure(figsize=(8, 8))
plt.scatter(C_cpu[:, 0].data, C_cpu[:, 1].data, s=200)

# Write each token's character centred on top of its marker.
label_style = dict(ha="center", va="center", color="white")
for i, row in enumerate(C_cpu.tolist()):
    plt.text(row[0], row[1], itos[i], **label_style)

# NOTE(review): "minor" is passed positionally, so it lands in grid()'s first
# parameter rather than `which` - confirm whether which="minor" was intended.
plt.grid("minor")

In [None]:
# sample from the model
# Dedicated seeded generator (on the model's device) so samples are repeatable.
g = torch.Generator(device).manual_seed(2147483647 + 10)

# Sampling is inference only: disable autograd so no graph is built per token.
with torch.no_grad():
    for _ in range(20):

        out = []
        context = [0] * block_size  # initialize with all ...
        while True:
            # call the module (not .forward) so registered hooks are honoured
            logits = model(torch.tensor([context], device=device))
            probs = F.softmax(logits, dim=1)
            ix = torch.multinomial(probs, num_samples=1, generator=g).item()
            context = context[1:] + [ix]  # slide the context window
            out.append(ix)
            if ix == 0:  # id 0 is "." - the end-of-word marker
                break

        print("".join(itos[i] for i in out))