In [None]:
%reload_ext autoreload
%autoreload 2

import torch

# Seed PyTorch's global RNG once, up front. The value lives in a named
# constant so it is easy to find and change in one place.
SEED = 42
torch.manual_seed(SEED)


In [None]:
from data.names_data_source import NamesDataSource
from data.tokenizer import Tokenizer
from learning.names_generator.names_generator_dataset import NamesGeneratorDataset


# Tokenizer configured to use both a start token and an end token.
tokenizer = Tokenizer(use_end_token=True, use_start_token=True)

# Load the names data. The tokenizer is handed to the loader -- presumably so
# it can register the characters it encounters (confirm in NamesDataSource.load).
names_data_source = NamesDataSource.load(
    tokenizer=tokenizer,
    normalize_unicode=True,
    data_folder="../datasets/names",
)
print(tokenizer.token_to_index)

# Wrap the data source in the generator dataset and peek at its first sample.
names_dataset = NamesGeneratorDataset(names_data_source, tokenizer)
sample = names_dataset[0]
print(sample)


In [None]:
# Split 85% / 15% with a dedicated, explicitly seeded generator so the
# train/test partition is identical every time this cell runs, regardless of
# how much of the global RNG earlier cells have already consumed.
SPLIT_SEED = 42
train_dataset, test_dataset = torch.utils.data.random_split(
    names_dataset,
    [0.85, 0.15],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

In [None]:
from learning.names_generator.model import NamesGenerator
import torch
import torch.nn as nn


def generate_name(
    model: NamesGenerator, country_idx: int, max_length: int = 50
) -> str:
    """Sample one name from ``model`` conditioned on a country index.

    Generation starts from ``Tokenizer.START_TOKEN`` and feeds every sampled
    character back in as the next input. It stops as soon as a special token
    is sampled or after ``max_length`` steps, whichever comes first.

    Relies on the notebook-level ``names_dataset`` and ``tokenizer`` objects.

    Args:
        model: Generator invoked as ``model(category, input, hidden)`` and
            returning ``(output, hidden)``.
        country_idx: Index of the country to condition on; converted with
            ``names_dataset.country_index_to_one_hot``.
        max_length: Upper bound on the number of sampling steps. Defaults to
            50, the limit that was previously hard-coded.

    Returns:
        The generated name. Empty if a special token is sampled on the very
        first step.
    """
    # Sample in eval mode so train-only behaviour (e.g. dropout, if the model
    # has any) does not perturb the distribution. The caller's mode is
    # restored afterwards, even if sampling raises.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # shape: [1, C]
            category_tensor = names_dataset.country_index_to_one_hot(
                country_idx
            ).unsqueeze(0)

            # shape: [1, H]
            hidden_tensor = model.init_hidden()

            # shape: [1, V]
            input_tensor = tokenizer.to_one_hot(Tokenizer.START_TOKEN)

            output_names = []
            for _ in range(max_length):
                # shape: [1, V]
                output, hidden_tensor = model(
                    category_tensor, input_tensor, hidden_tensor
                )

                # softmax gives the same distribution whether the model emits
                # raw logits or log-probabilities, and unlike a bare exp() it
                # cannot overflow to inf (which multinomial rejects).
                probs = torch.softmax(output, dim=-1)
                idx = int(torch.multinomial(probs, num_samples=1).item())

                # Any special token ends the name.
                if tokenizer.is_special_idx(idx):
                    break

                next_char = tokenizer.index_to_token[idx]
                output_names.append(next_char)
                input_tensor = tokenizer.to_one_hot(next_char)

            return "".join(output_names)
    finally:
        model.train(was_training)


# Smoke test: draw ten names for country index 4 from a freshly constructed
# generator that has not been trained yet.
test_names_generator = NamesGenerator(
    num_classes=names_data_source.num_classes,
    num_vocab=tokenizer.vocab_size,
    hidden_size=128,
)
print(*(generate_name(test_names_generator, 4) for _ in range(10)), sep="\n")


In [None]:
import time
import math
import torch
from learning.names_generator.model import (
    Batch,
    ParallelBatchLearner,
    SequentialBatchLearner,
    NamesGenerator,
)
from learning.metrics import (
    ConfusionMatrixMetric,
)

from torch.utils.data import DataLoader
from torch import nn

# Hyperparameters for the training run below.
BATCH_SIZE = 64  # samples per DataLoader batch
LEARNING_RATE = 1e-4  # Adam step size
HIDDEN_SIZE = 256  # passed to NamesGenerator as hidden_size
NUM_EPOCHS = 500  # passed to learner.fit as num_epochs
PATIENCE = 10  # passed to learner.fit as patience

# Model sized from the loaded data: one class per country, one vocabulary
# entry per tokenizer token.
model = NamesGenerator(
    hidden_size=HIDDEN_SIZE,
    num_vocab=tokenizer.vocab_size,
    num_classes=names_data_source.num_classes,
)
print(model)

# Targets equal to the padding index are ignored by the loss, which is summed
# (not averaged) over the remaining elements.
pad_token_idx = tokenizer.token_to_index[Tokenizer.PAD_TOKEN]
criterion = nn.CrossEntropyLoss(ignore_index=pad_token_idx, reduction="sum")
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
print(criterion, optimizer, sep="\n")

# The learner bundles model, optimizer and loss, and is told the padding index.
learner = ParallelBatchLearner(
    padding_idx=pad_token_idx,
    criterion=criterion,
    optimizer=optimizer,
    model=model,
)

# One confusion-matrix metric per phase, keyed by the list of countries.
# NOTE(review): neither metric is passed to learner.fit in this cell (it
# receives empty metric lists) -- confirm whether that is intentional.
train_confusion_matrix_metric, eval_confusion_matrix_metric = (
    ConfusionMatrixMetric(classes=names_data_source.countries),
    ConfusionMatrixMetric(classes=names_data_source.countries),
)


# Training batches are reshuffled by the loader; samples are collated into a
# Batch via Batch.from_samples.
train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=BATCH_SIZE,
    collate_fn=Batch.from_samples,
)

# Evaluation batches keep the dataset order.
eval_dataloader = DataLoader(
    test_dataset,
    shuffle=False,
    batch_size=BATCH_SIZE,
    collate_fn=Batch.from_samples,
)

# A uniform guess over the vocabulary has cross-entropy log(V) per token.
# NOTE(review): the criterion above uses reduction="sum"; confirm the learner
# normalizes per token, otherwise the reported loss will not be near log(V).
print(
    "Starting training...\n"
    f"Expecting initial loss around {math.log(tokenizer.vocab_size)}"
)
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# durations; time.time() can jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()
train_losses, eval_losses = learner.fit(
    train_dataloader=train_dataloader,
    eval_dataloader=eval_dataloader,
    num_epochs=NUM_EPOCHS,
    patience=PATIENCE,
    train_metrics=[],
    eval_metrics=[],
)
elapsed_time = time.perf_counter() - start_time
print(f"Training completed. Elapsed time: {elapsed_time:.2f}s")


In [None]:
# Show the model, then draw ten names for country index 4 from it.
print(model)
print(*(generate_name(model, 4) for _ in range(10)), sep="\n")
