In [None]:
%reload_ext autoreload
%autoreload 2

import torch

# Seed PyTorch's global random number generator so that a fresh
# "Restart Kernel -> Run All" produces the same random draws each time.
torch.manual_seed(42)


In [None]:
from data.names_data_source import NamesDataSource, START_TOKEN, END_TOKEN
from learning.names_generator.names_generator_dataset import NamesGeneratorDataset


# Load the names data from disk. START_TOKEN / END_TOKEN are handed to the
# loader as prefix / suffix (presumably attached to every name so the model
# can learn where a name begins and ends - confirm in NamesDataSource).
names_data_source = NamesDataSource.load(
    data_folder="../datasets/names",
    normalize_unicode=True,
    prefix=START_TOKEN,
    suffix=END_TOKEN,
)

# Wrap the data source in the dataset type used for generator training.
names_dataset = NamesGeneratorDataset(names_data_source)

# Show the first sample as a quick sanity check of the dataset output.
sample = names_dataset[0]
print(sample)


In [None]:
# Hold out 15% of the samples for evaluation and train on the remaining 85%.
# The split uses its own explicitly seeded generator instead of the global
# RNG: re-running this cell alone (or after other cells consumed random
# numbers) then always yields the same partition, so evaluation samples can
# never drift into the training set between runs.
train_dataset, test_dataset = torch.utils.data.random_split(
    names_dataset,
    [0.85, 0.15],
    generator=torch.Generator().manual_seed(42),
)
print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

In [None]:
from learning.names_generator.model import NamesGenerator
import torch
import torch.nn as nn


def generate_name(model: nn.Module, country_idx: int, max_length: int = 50) -> str:
    """Sample one name from ``model`` for a given country, one token at a time.

    Generation starts from ``START_TOKEN`` and stops as soon as ``END_TOKEN``
    is sampled or after ``max_length`` sampling steps, whichever comes first.

    Args:
        model: Generator exposing ``init_hidden()`` and callable as
            ``model(category, input, hidden) -> (output, hidden)``.
        country_idx: Index of the country the generated name is conditioned on.
        max_length: Upper bound on the number of sampling steps (default 50).

    Returns:
        The sampled tokens joined into one string; the start and end tokens
        are not included.
    """
    # NOTE(review): the model's train/eval mode is left untouched here; if the
    # model contains dropout, callers may want model.eval() first - confirm.
    with torch.no_grad():
        # shape: [1, C]
        category_tensor = names_data_source.country_index_to_one_hot(
            country_idx
        ).unsqueeze(0)

        # shape: [1, H]
        # Take the initial hidden state from the model that is being sampled,
        # not from an unrelated notebook-level model instance.
        hidden_tensor = model.init_hidden()

        # shape: [1, V]
        input_tensor = names_data_source.name_to_one_hot(START_TOKEN).squeeze(1)

        output_names = []
        for _ in range(max_length):
            # shape: [1, V]
            output, hidden_tensor = model(category_tensor, input_tensor, hidden_tensor)

            # exp() turns the model output into sampling weights, i.e. the
            # output is treated as log-probabilities over the vocabulary.
            probs = torch.exp(output)
            idx = int(torch.multinomial(probs, num_samples=1).item())
            next_char = names_data_source.index_to_token[idx]

            if next_char == END_TOKEN:
                break

            output_names.append(next_char)
            # Feed the sampled token back in as the next step's input.
            input_tensor = names_data_source.name_to_one_hot(next_char).squeeze(1)

        return "".join(output_names)


# Baseline check: build a generator and sample from it straight away, with no
# training in between, to see what the raw model produces.
test_names_generator = NamesGenerator(
    num_classes=names_data_source.num_classes,
    num_vocab=names_data_source.num_vocab,
    hidden_size=128,
)

# Print ten sampled names for country index 4.
for _ in range(10):
    print(generate_name(test_names_generator, 4))


In [None]:
import time
import torch
from learning.names_generator.model import (
    Batch,
    ParallelBatchLearner,
    SequentialBatchLearner,
    NamesGenerator,
)
from learning.metrics import (
    ConfusionMatrixMetric,
)

from torch.utils.data import DataLoader
from torch import nn

# Training hyperparameters.
BATCH_SIZE = 16  # samples per DataLoader batch
LEARNING_RATE = 0.001  # Adam step size
HIDDEN_SIZE = 128  # hidden_size passed to NamesGenerator
NUM_EPOCHS = 50  # num_epochs passed to learner.fit
PATIENCE = 5  # patience passed to learner.fit

# The model to train, its loss and its optimizer.
model = NamesGenerator(
    hidden_size=HIDDEN_SIZE,
    num_vocab=names_data_source.num_vocab,
    num_classes=names_data_source.num_classes,
)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Echo the three components, one per line, for the notebook record.
print(model, criterion, optimizer, sep="\n")

# The learner bundles the model with the objects needed to optimize it.
learner = SequentialBatchLearner(
    criterion=criterion,
    optimizer=optimizer,
    model=model,
)

# One confusion-matrix metric per phase, both labelled with the country list.
train_confusion_matrix_metric = ConfusionMatrixMetric(
    classes=names_data_source.countries,
)
eval_confusion_matrix_metric = ConfusionMatrixMetric(
    classes=names_data_source.countries,
)

# Training batches are reshuffled by the loader; evaluation batches keep the
# dataset order. Both loaders build their batches with Batch.from_samples.
train_dataloader = DataLoader(
    train_dataset,
    collate_fn=Batch.from_samples,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
eval_dataloader = DataLoader(
    test_dataset,
    collate_fn=Batch.from_samples,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

print("Starting training...")
# perf_counter() is monotonic and high-resolution, so the measured duration
# cannot be distorted by system clock adjustments during a long training run.
start_time = time.perf_counter()
# fit() returns the recorded training and evaluation losses; no extra metrics
# are tracked in this run (both metric lists are empty).
train_losses, eval_losses = learner.fit(
    train_dataloader=train_dataloader,
    eval_dataloader=eval_dataloader,
    num_epochs=NUM_EPOCHS,
    patience=PATIENCE,
    train_metrics=[],
    eval_metrics=[],
)
elapsed_time = time.perf_counter() - start_time
print(f"Training completed. Elapsed time: {elapsed_time:.2f}s")


In [None]:
# Print ten names sampled from the trained model for country index 4, the
# same country index used for the untrained baseline earlier in the notebook.
for _ in range(10):
    print(generate_name(model, 4))
