In [None]:
import math
import os
import random
from datetime import datetime
from typing import List, Optional, Tuple

import optuna
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from tqdm.autonotebook import tqdm

from nn_zero_to_hero.datasets import WordTokensDataset
from nn_zero_to_hero.loss import calculate_loss
from nn_zero_to_hero.models import WordTokenModel
from nn_zero_to_hero.optimizers import StepBasedLrGDOptimizer
from nn_zero_to_hero.tokens import sample_from_model, tokens_to_int_mapping
from nn_zero_to_hero.vizs import plot_embeddings

%matplotlib inline

In [None]:
# Passed as `block_size` to the datasets, the model and the sampler below
# (presumably the number of context tokens per prediction - confirm in
# WordTokensDataset / WordTokenModel).
BLOCK_SIZE = 3

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Load the corpus, one entry per line. The context manager closes the file
# handle as soon as the contents have been read (the bare `open(...).read()`
# left it open until garbage collection).
with open("../../data/names.txt", "r") as names_file:
    words = names_file.read().splitlines()
words[:8]

In [None]:
# Number of lines read from the names file.
len(words)

In [None]:
# Vocabulary: every distinct character that occurs in the corpus, in sorted
# order, plus the lookup tables returned by tokens_to_int_mapping.
chars = sorted({char for word in words for char in word})
STOI, ITOS = tokens_to_int_mapping(chars)

print(ITOS)

In [None]:
# Shuffle a copy so that `words` itself is left untouched. Shuffling in place
# made this cell non-idempotent: re-running it reshuffled the already shuffled
# list and silently changed which words land in each split.
random.seed(42)
shuffled_words = list(words)
random.shuffle(shuffled_words)

# 80% train / 10% validation / 10% test
n1 = int(0.8 * len(shuffled_words))
n2 = int(0.9 * len(shuffled_words))

train_dataset = WordTokensDataset(shuffled_words[:n1], BLOCK_SIZE, STOI)
validation_dataset = WordTokensDataset(shuffled_words[n1:n2], BLOCK_SIZE, STOI)
test_dataset = WordTokensDataset(shuffled_words[n2:], BLOCK_SIZE, STOI)

In [None]:
# Sanity check: display the shapes of the training split's `X` and `Y`
# attributes (presumably inputs and targets - confirm in WordTokensDataset).
train_dataset.X.shape, train_dataset.Y.shape

In [None]:
# Baseline model with hand-picked sizes. The seeded generator is handed to the
# model (presumably for parameter initialisation - confirm in WordTokenModel).
model = WordTokenModel(
    generator=torch.Generator().manual_seed(2147483647),
    hidden_layer_size=100,
    embedding_layer_size=5,
    block_size=BLOCK_SIZE,
    token_count=len(STOI),
)
model = model.to(device)

In [None]:
def train_model(
    model: WordTokenModel,
    dataset: Dataset,
    optimizer: torch.optim.Optimizer,
    epochs: int,
    batch_size: int,
    *,
    trial: Optional[optuna.Trial] = None,
) -> Tuple[List[int], List[float]]:
    """Train `model` on `dataset` with shuffled mini-batches.

    Relies on the notebook-level `device` for batch placement.

    Args:
        model: Model to optimise; switched to training mode via `model.train()`.
        dataset: Dataset yielding `(X, Y)` pairs.
        optimizer: Optimizer already bound to the model's parameters.
        epochs: Number of full passes over `dataset`.
        batch_size: Samples per mini-batch.
        trial: Optional Optuna trial; when given, the mean batch loss of every
            epoch is reported to it as an intermediate value.

    Returns:
        `(stepi, lossi)`: the epoch indices and, for each epoch, log10 of the
        mean cross-entropy over that epoch's batches.

    Raises:
        ValueError: If `dataset` yields no batches.
    """
    model.train()

    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
        # Pinned host memory only helps host-to-GPU copies; skip it on CPU.
        pin_memory=device.type == "cuda",
    )
    batch_count = len(dataloader)
    if batch_count == 0:
        raise ValueError("dataset is empty: there are no batches to train on")

    lossi = []
    stepi = []

    for epoch in tqdm(range(epochs), leave=False):
        epoch_loss_sum = 0.0
        for X_batch, Y_batch in dataloader:

            X_batch = X_batch.to(device)
            Y_batch = Y_batch.to(device)

            logits = model.forward(X_batch)
            loss = F.cross_entropy(logits, Y_batch)
            epoch_loss_sum += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # track stats
        # TODO: Add validation loss
        stepi.append(epoch)
        # Average over the batches of this epoch. Dividing by `epoch + 1`
        # (the previous behaviour) made the value shrink with the epoch index
        # regardless of how well the model was actually doing.
        average_loss = epoch_loss_sum / batch_count
        lossi.append(math.log10(average_loss))
        if trial is not None:
            trial.report(average_loss, step=epoch)

    return stepi, lossi

In [None]:
batch_size = 32
epochs = 50

# train_model's DataLoader keeps the final partial batch, so the number of
# optimizer steps per epoch is the ceiling of the division, not the floor.
batches_by_epoch = math.ceil(len(train_dataset) / batch_size)

# Step-based learning-rate schedule: each entry pairs a step limit with a
# learning rate (presumably the rate applies until that step is reached -
# confirm in StepBasedLrGDOptimizer). The limits sit at 50% and 75% of the
# total number of training steps.
optimizer = StepBasedLrGDOptimizer(
    model.parameters(),
    max_step_to_lr=[
        (batches_by_epoch * epochs * 0.5, 0.1),
        (batches_by_epoch * epochs * 0.75, 0.01),
        (None, 0.001),
    ],
)

stepi, lossi = train_model(
    model=model,
    dataset=train_dataset,
    optimizer=optimizer,
    epochs=epochs,
    batch_size=batch_size,
)

In [None]:
# Training curve returned by train_model: one point per epoch. Labels make the
# figure readable on its own; the trailing `;` hides the matplotlib repr.
fig, ax = plt.subplots()
ax.plot(stepi, lossi)
ax.set(
    xlabel="epoch",
    ylabel="log10(average training loss)",
    title="Training loss per epoch",
);

In [None]:
# Evaluate the baseline model on the training and validation splits.
training_loss = calculate_loss(model, train_dataset, F.cross_entropy, device)
validation_loss = calculate_loss(model, validation_dataset, F.cross_entropy, device)
# `.4f` = four decimal places (the previous `4f` only set a minimum width).
print(f"{training_loss = :.4f}, {validation_loss = :.4f}")

In [None]:
# Visualise `model.C` with labels looked up through ITOS (presumably the
# learned embedding table - confirm in WordTokenModel / plot_embeddings).
plot_embeddings(model.C, ITOS)

In [None]:
# Draw 20 samples from the trained model, using a dedicated seeded generator
# created for `device` so the samples are repeatable.
g = torch.Generator(device)
g.manual_seed(2147483647 + 10)

for _ in range(20):
    s = sample_from_model(
        model, block_size=BLOCK_SIZE, device=device, itos=ITOS, generator=g
    )
    print(s)

### Hyperparameter tuning

In [None]:
# Epoch budget per tuning trial: longer runs on the GPU, shorter ones on CPU.
if device == torch.device("cuda"):
    epochs = 50
else:
    epochs = 20


def objective(trial: optuna.Trial) -> float:
    """Optuna objective: train a fresh model and return its validation loss.

    Samples the embedding size, hidden-layer size, starting learning rate and
    batch size from `trial`, trains a new `WordTokenModel` on `train_dataset`
    for the notebook-level `epochs`, and scores it on `validation_dataset`.

    Args:
        trial: Optuna trial used to sample hyperparameters; it is also passed
            to `train_model`, which reports per-epoch losses to it.

    Returns:
        Cross-entropy loss on the validation split (lower is better).
    """
    n_embedding_layer_size = trial.suggest_int("embedding_layer_size", 2, 15)
    n_hidden_layer_size = trial.suggest_int("hidden_layer_size", 100, 500, step=50)
    n_lr_start = trial.suggest_float("lr", 1e-2, 1e-1, step=1e-2)
    batch_size = trial.suggest_int("batch_size", 32, 512, step=32)

    # train_model's DataLoader keeps the final partial batch, so the number of
    # optimizer steps per epoch is the ceiling of the division, not the floor.
    batches_by_epoch = math.ceil(len(train_dataset) / batch_size)

    model = WordTokenModel(
        token_count=len(STOI),
        block_size=BLOCK_SIZE,
        embedding_layer_size=n_embedding_layer_size,
        hidden_layer_size=n_hidden_layer_size,
        generator=torch.Generator().manual_seed(2147483647),
    ).to(device)

    # The learning rate drops by 10x at 50% and again at 75% of all steps.
    optimizer = StepBasedLrGDOptimizer(
        model.parameters(),
        max_step_to_lr=[
            (batches_by_epoch * epochs * 0.5, n_lr_start),
            (batches_by_epoch * epochs * 0.75, n_lr_start * 0.1),
            (None, n_lr_start * 0.01),
        ],
    )

    train_model(
        model=model,
        dataset=train_dataset,
        optimizer=optimizer,
        epochs=epochs,
        batch_size=batch_size,
        trial=trial,
    )

    # Pass the loss function and device explicitly, exactly like every other
    # calculate_loss call in this notebook.
    validation_loss = calculate_loss(
        model, validation_dataset, F.cross_entropy, device
    )
    # Optuna expects a plain float as the objective value.
    return float(validation_loss)

In [None]:
# Persist the study in a local SQLite file. The timestamp in the study name
# makes every execution of this cell create a separate study.
current_dt_iso = datetime.now().isoformat()
db_file_path = os.path.abspath("../../optuna_db/db.sqlite3")

study = optuna.create_study(
    study_name="makemore-part2-" + current_dt_iso,
    storage="sqlite:///" + db_file_path,
    direction="minimize",
)
study.optimize(objective, n_trials=5)

# Keep the best trial around: the next cell retrains with its parameters.
trial = study.best_trial

print(f"Loss: {trial.value}")
print(f"Best hyperparameters: {trial.params}")

In [None]:
# Recorded output of a previous study run:
# Loss: 2.1235523223876953
# Best hyperparameters: {'embedding_layer_size': 15, 'hidden_layer_size': 350, 'lr': 0.09999999999999999, 'batch_size': 32}

# Retrain from scratch with the best hyperparameters found by the study
# (`trial` is the best trial selected in the previous cell).
n_embedding_layer_size = trial.params["embedding_layer_size"]
n_hidden_layer_size = trial.params["hidden_layer_size"]
n_lr_start = trial.params["lr"]
batch_size = trial.params["batch_size"]

epochs = 500
# train_model's DataLoader keeps the final partial batch, so the number of
# optimizer steps per epoch is the ceiling of the division, not the floor.
batches_by_epoch = math.ceil(len(train_dataset) / batch_size)

model = WordTokenModel(
    token_count=len(STOI),
    block_size=BLOCK_SIZE,
    embedding_layer_size=n_embedding_layer_size,
    hidden_layer_size=n_hidden_layer_size,
    generator=torch.Generator().manual_seed(2147483647),
).to(device)

# The learning rate drops by 10x at 20% and again at 30% of all steps.
optimizer = StepBasedLrGDOptimizer(
    model.parameters(),
    max_step_to_lr=[
        (batches_by_epoch * epochs * 0.2, n_lr_start),
        (batches_by_epoch * epochs * 0.3, n_lr_start * 0.1),
        (None, n_lr_start * 0.01),
    ],
)

_ = train_model(
    model=model,
    dataset=train_dataset,
    optimizer=optimizer,
    epochs=epochs,
    batch_size=batch_size,
)

In [None]:
# Final evaluation of the retrained model on all three splits.
training_loss = calculate_loss(model, train_dataset, F.cross_entropy, device)
validation_loss = calculate_loss(model, validation_dataset, F.cross_entropy, device)
test_loss = calculate_loss(model, test_dataset, F.cross_entropy, device)
# `.4f` = four decimal places (the previous `4f` only set a minimum width).
print(f"{training_loss = :.4f}, {validation_loss = :.4f}, {test_loss = :.4f}")
# Recorded output of a previous run, rounded to the new format:
# training_loss = 1.9842, validation_loss = 2.1218, test_loss = 2.1230