In [None]:
%reload_ext autoreload
%autoreload 2

import torch
from data.names_dataset import NamesDataset
from torch.utils.data import DataLoader

# Seed PyTorch's global RNG before any random operation in this notebook so the
# random split below is repeatable when the notebook is run from the top.
torch.manual_seed(42)

# Load the names dataset; the folder path is relative to the current working directory.
names_dataset = NamesDataset(data_folder="../datasets/names")

# Randomly partition the dataset into 85% training and 15% test samples.
train_dataset, test_dataset = torch.utils.data.random_split(names_dataset, [0.85, 0.15])
print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

In [2]:
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence


class NamesClassifier(nn.Module):
    """LSTM sequence classifier: final hidden state -> dropout -> linear scores.

    Only the hidden state after the last time step is used for classification,
    so batches of variable-length sequences should be padded on the left (as
    `collate_fn` does) to keep the last step a real token rather than padding.

    Args:
        input_size: Number of features per time step of the input.
        hidden_size: Size of the LSTM hidden state.
        output_size: Number of scores produced per sequence.
        dropout: Probability of the dropout applied to the final hidden state
            before the output projection. Defaults to 0.5, the value that was
            previously hard-coded.
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        output_size: int,
        dropout: float = 0.5,
    ) -> None:
        # Zero-argument super() does not look the class up by its global name,
        # so it keeps working when this cell is re-run and the name is rebound.
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            batch_first=False,
        )
        self.h2o = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Score a batch of sequences.

        Args:
            x: Tensor of shape (seq_length, batch_size, input_size); the LSTM is
                built with batch_first=False, so time is the leading dimension.

        Returns:
            Tensor of shape (batch_size, output_size) holding the unnormalised
            output of the final linear layer.
        """
        _lstm_output, (hidden, _cell) = self.lstm(x)
        # hidden stacks one final state per LSTM layer; [-1] selects the last
        # layer's state, shape (batch_size, hidden_size).
        final_state = self.dropout(hidden[-1])
        return self.h2o(final_state)


def collate_fn(batch):
    inputs, labels = zip(*batch)
    inputs_padded = pad_sequence(list(inputs), batch_first=False, padding_side="left")
    labels = torch.stack(labels)
    return inputs_padded, labels

In [None]:
from common.learner import Learner
from common.metrics import (
    AccuracyMetric,
    ConfusionMatrixMetric,
    Metric,
    PrecisionMetric,
    RecallMetric,
    F1ScoreMetric,
)

# Hyperparameters for this training run.
BATCH_SIZE = 64
LEARNING_RATE = 0.001
HIDDEN_SIZE = 256
NUM_EPOCHS = 50
PATIENCE = 3


def _make_loader(dataset, shuffle):
    """Wrap `dataset` in a DataLoader using the shared batch size and collate_fn."""
    return DataLoader(
        dataset=dataset,
        batch_size=BATCH_SIZE,
        shuffle=shuffle,
        collate_fn=collate_fn,
    )


def _make_metrics():
    """Return a fresh [accuracy, precision, recall, F1] metric list, in that order."""
    return [
        AccuracyMetric(num_classes=num_classes),
        PrecisionMetric(num_classes=num_classes),
        RecallMetric(num_classes=num_classes),
        F1ScoreMetric(num_classes=num_classes),
    ]


# Only the training loader shuffles; the test and full-dataset loaders keep a fixed order.
train_dataloader = _make_loader(train_dataset, shuffle=True)
test_dataloader = _make_loader(test_dataset, shuffle=False)
full_dataloader = _make_loader(names_dataset, shuffle=False)

num_classes = len(names_dataset.countries)

# Model, loss and optimizer, bundled into the project's Learner.
rnn = NamesClassifier(
    input_size=len(names_dataset.index_to_token),
    hidden_size=HIDDEN_SIZE,
    output_size=num_classes,
)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=LEARNING_RATE)
learner = Learner(model=rnn, optimizer=optimizer, criterion=criterion)

# The individual metric objects keep their own names because the plotting cell
# further down refers to them directly.
train_metrics: list[Metric] = _make_metrics()
(
    train_accuracy_metric,
    train_precision_metric,
    train_recall_metric,
    train_f1_metric,
) = train_metrics

eval_metrics: list[Metric] = _make_metrics()
(
    eval_accuracy_metric,
    eval_precision_metric,
    eval_recall_metric,
    eval_f1_metric,
) = eval_metrics

# NOTE(review): the test split is the eval set passed together with `patience`.
# If `patience` drives early stopping inside Learner.fit, the test metrics also
# steer model selection -- confirm that this is intended.
print("Starting training...")
train_losses, eval_losses = learner.fit(
    train_loader=train_dataloader,
    eval_loader=test_dataloader,
    num_epochs=NUM_EPOCHS,
    patience=PATIENCE,
    train_metrics=train_metrics,
    eval_metrics=eval_metrics,
)
print("Training completed.")


In [None]:
import matplotlib.pyplot as plt

def _plot_metrics(axes, labelled_metrics):
    """Draw each (metric, label) pair onto `axes`, in the order given."""
    for metric, label in labelled_metrics:
        metric.plot(axes, label)


# NOTE(review): full_dataloader iterates the whole dataset, training samples
# included, so this loss and confusion matrix are not held-out estimates --
# confirm that evaluating on the full dataset is intended.
confusion_matrix_metric = ConfusionMatrixMetric(num_classes=num_classes, normalize=True)
final_loss = learner.final_eval(
    eval_loader=full_dataloader,
    eval_metrics=[confusion_matrix_metric],
)
print(f"Final loss: {final_loss}")

# Figure 1: accuracy curves together with the loss values returned by fit().
_, ax = plt.subplots(figsize=(10, 10))
ax.set(title="Loss and Accuracy", xlabel="Epoch")
_plot_metrics(
    ax,
    [
        (train_accuracy_metric, "Train Accuracy"),
        (eval_accuracy_metric, "Test Accuracy"),
    ],
)
ax.plot(train_losses, label="Train Loss")
ax.plot(eval_losses, label="Test Loss")
ax.legend()

# Figure 2: precision, recall and F1 for the train metrics, then the test metrics.
_, ax = plt.subplots(figsize=(10, 10))
ax.set(title="Precision and Recall", xlabel="Epoch")
_plot_metrics(
    ax,
    [
        (train_precision_metric, "Train Precision"),
        (train_recall_metric, "Train Recall"),
        (train_f1_metric, "Train F1"),
        (eval_precision_metric, "Test Precision"),
        (eval_recall_metric, "Test Recall"),
        (eval_f1_metric, "Test F1"),
    ],
)
ax.legend()

# Figure 3: confusion matrix from the final evaluation; the printed list maps
# each class index to its country name.
_, ax = plt.subplots(figsize=(15, 10))
confusion_matrix_metric.plot(ax, "Confusion Matrix")
print(list(enumerate(names_dataset.countries)))

In [None]:
# Sample name to classify; change it here to try another one.
query_name = "Hai"

# predict() returns a value that is used as an index into names_dataset.countries.
idx = learner.predict(names_dataset.name_to_tensor(query_name))
print(names_dataset.countries[idx])

# predict_topk() returns two parallel sequences for k=3: a value printed with
# two decimals and the matching index into names_dataset.countries.
likelihoods, indices = learner.predict_topk(
    names_dataset.name_to_tensor(query_name),
    k=3,
)
for likelihood, country_idx in zip(likelihoods, indices):
    print(f"{likelihood:.2f} {names_dataset.countries[country_idx]}")
