In [None]:
# Re-import edited project modules automatically before each cell executes.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures with the interactive widget backend.
%matplotlib widget

import torch

# Seed torch's global RNG so stochastic steps are repeatable on a fresh kernel.
torch.manual_seed(42)


In [366]:
from data.names_data_source import NamesDataSource
from data.tokenizer import Tokenizer
from learning.names_classifier.names_classifier_dataset import (
    NamesClassifierDataset,
)

# Shared tokenizer; its vocab_size is also fed into the model config further down.
tokenizer = Tokenizer()

# Load the names data source from the dataset folder (no device is involved here).
names_data_source = NamesDataSource.load(
    data_folder="../datasets/names",
    tokenizer=tokenizer,
    normalize_unicode=True,
)


In [None]:
# Both frequency overviews are drawn with the same figure size.
frequency_figsize = (10, 5)
names_data_source.plot_class_frequency(frequency_figsize)
names_data_source.plot_token_frequency(frequency_figsize)

In [368]:
from learning.names_classifier.model import Config

# Hyper-parameters and runtime settings shared by the dataset, model and learner.
config = Config(
    batch_size=2**10,
    learning_rate=1e-3,
    epochs=500,
    # patience / min_delta are forwarded to learner.fit in the training cell.
    patience=30,
    min_delta=1e-4,
    # Prefer the GPU, but fall back to the CPU so the notebook still runs on
    # machines without CUDA instead of failing at the first tensor allocation.
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    # Input/output sizes are derived from the loaded data rather than hard-coded.
    vocab_size=tokenizer.vocab_size,
    class_size=names_data_source.num_classes,
    hidden_size=64,
    num_layers=2,
    bidirectional=False,
    activation="relu",
    dropout=0.2,
)

In [None]:
# Wrap the data source in a dataset that is built for the configured device.
names_dataset = NamesClassifierDataset(
    names_data_source=names_data_source, tokenizer=tokenizer, device=config.device
)

# Inspect the first sample together with the shapes of its input and label.
sample = names_dataset[0]
print(sample, sample.input.shape, sample.label.shape, sep="\n")


In [None]:
from learning.names_classifier.model import NamesClassifierRNN


# Build the classifier from the shared config.
model = NamesClassifierRNN(config=config)
# Smoke test: ask the freshly built model for its top-1 prediction on the first
# dataset sample (the bare expression is shown as the cell output).
model.predict_topk(sample.input, k=1)


In [None]:
# Split 80/10/10 into train / validation / test.
# A dedicated, explicitly seeded generator keeps the split identical across
# re-runs; without it the split draws from the global RNG and therefore changes
# with whatever consumed random numbers earlier (e.g. model initialisation or
# re-executed cells).
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    names_dataset, [0.80, 0.10, 0.10], generator=split_generator
)

# The three subsets must partition the full dataset.
assert len(train_dataset) + len(val_dataset) + len(test_dataset) == len(names_dataset)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Val dataset size: {len(val_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

In [None]:
# Per-sample draw weights for the training sampler: each sample is weighted by
# class_frequency[label] ** -0.5, so samples of frequent classes get smaller weights.
class_frequency = names_data_source.class_frequency
train_sampler_weights = [
    class_frequency[int(train_sample.label.item())] ** -0.5
    for train_sample in train_dataset
]
print(train_sampler_weights[10000:10010])

# Per-class loss weights. With an exponent of 0.0 every raw weight equals 1, so
# the loss is currently unweighted; raise the exponent's magnitude to change that.
raw_criterion_weights = torch.tensor(class_frequency, device=config.device) ** 0.0
# Rescale: divide by the total, then multiply by num_classes.
criterion_weights = (
    raw_criterion_weights / raw_criterion_weights.sum() * names_data_source.num_classes
)
print(criterion_weights)


In [None]:
import math
import time
import torch
from learning.names_classifier.model import (
    Batch,
    ParallelBatchLearner,
)
from learning.metrics import (
    ConfusionMatrixMetric,
)

from torch.utils.data import WeightedRandomSampler, DataLoader
from torch import nn

# AdamW over all model parameters, using the learning rate from the config.
optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)
# Class-weighted cross-entropy; per-sample losses are summed rather than averaged.
criterion = nn.CrossEntropyLoss(weight=criterion_weights, reduction="sum")

# The learner ties model, loss, optimiser and config together.
learner = ParallelBatchLearner(
    config=config, model=model, criterion=criterion, optimizer=optimizer
)
print(config, learner, sep="\n")

# Draw training samples with replacement according to the per-sample weights;
# one epoch consists of len(train_dataset) draws.
train_sampler = WeightedRandomSampler(
    weights=train_sampler_weights,
    num_samples=len(train_dataset),
    replacement=True,
)

train_dataloader = DataLoader(
    dataset=train_dataset,
    sampler=train_sampler,
    batch_size=config.batch_size,
    collate_fn=Batch.from_samples,
)

# This loader is handed to learner.fit together with patience / min_delta
# (presumably to drive early stopping), so it has to read the validation split.
# Evaluating on test_dataset here would leak the test set into model selection;
# test_dataset stays untouched for a final, unbiased evaluation.
eval_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=config.batch_size,
    shuffle=False,
    collate_fn=Batch.from_samples,
)

# Two independent confusion-matrix trackers with identical settings: one is passed
# to fit() as a training metric, the other as an evaluation metric.
train_confusion_matrix_metric, eval_confusion_matrix_metric = (
    ConfusionMatrixMetric(classes=names_data_source.countries, device=config.device)
    for _ in range(2)
)

# Sanity check: ln(num_classes) is the cross-entropy of a uniform guess.
# NOTE(review): the criterion uses reduction="sum", so the value below only
# matches this figure if learner.eval normalises per sample — confirm.
print(f"Expecting initial loss around {math.log(names_data_source.num_classes)}")
# Evaluate on the training loader (which draws through the weighted sampler).
first_epoch_loss = learner.eval(dataloader=train_dataloader)
print(first_epoch_loss)

In [None]:
print("Starting training...\n")
# perf_counter is monotonic, so the measured duration cannot be distorted by
# wall-clock adjustments during a long training run.
start_time = time.perf_counter()
# fit returns the training and evaluation loss histories; the confusion-matrix
# metrics are handed in so they can be plotted afterwards.
train_losses, eval_losses = learner.fit(
    train_dataloader=train_dataloader,
    eval_dataloader=eval_dataloader,
    num_epochs=config.epochs,
    patience=config.patience,
    min_delta=config.min_delta,
    train_metrics=[train_confusion_matrix_metric],
    eval_metrics=[eval_confusion_matrix_metric],
)
elapsed_time = time.perf_counter() - start_time
print(f"Training completed. Elapsed time: {elapsed_time:.2f}s")


In [None]:
import matplotlib.pyplot as plt


# Confusion matrix of the evaluation metric (normalize=True is passed through).
_, ax = plt.subplots(figsize=(12, 8))
eval_confusion_matrix_metric.plot_confusion_matrix(ax, normalize=True)

# Loss histories and accuracies on one axis. The second curve is labelled
# "Eval Loss" to match eval_losses / eval_dataloader and the other "Eval" labels.
_, ax = plt.subplots(figsize=(5, 5))
ax.plot(train_losses, label="Train Loss")
ax.plot(eval_losses, label="Eval Loss")
train_confusion_matrix_metric.plot_accuracies(ax, "Train")
eval_confusion_matrix_metric.plot_accuracies(ax, "Eval")
ax.set_title("Loss and Accuracy")

# Per-class metrics: one figure each, train and eval drawn on the same axis.
# (title, train plotter, eval plotter)
per_class_plots = (
    (
        "Precision",
        train_confusion_matrix_metric.plot_class_precisions,
        eval_confusion_matrix_metric.plot_class_precisions,
    ),
    (
        "Recall",
        train_confusion_matrix_metric.plot_class_recalls,
        eval_confusion_matrix_metric.plot_class_recalls,
    ),
    (
        "F1 Score",
        train_confusion_matrix_metric.plot_class_f1_scores,
        eval_confusion_matrix_metric.plot_class_f1_scores,
    ),
)
for title, plot_train, plot_eval in per_class_plots:
    _, ax = plt.subplots(figsize=(5, 5))
    plot_train(ax, "Train")
    plot_eval(ax, "Eval")
    ax.set_title(title)


In [None]:
# Work on a detached CPU copy of the output layer's weights: torch.histogram is
# only implemented for CPU tensors and matplotlib needs host memory, so passing
# the tensor straight from a CUDA model fails.
fc_weights = model.fc.weight.detach().cpu()

print(model.fc)
print(fc_weights)

# Distribution of the weight values (default binning), plotted against the left bin edges.
hist, bin_edges = torch.histogram(fc_weights)
f, ax = plt.subplots(figsize=(5, 5))
ax.plot(bin_edges[:-1], hist)
ax.set_title("Output layer weight histogram")

# Boolean mask of weights whose magnitude exceeds 0.1.
plt.figure(figsize=(20, 10))
plt.imshow(fc_weights.abs() > 0.1)


In [None]:
# Ask the model for its most likely classes for a single example name.
query_name = "Hai"
top_k = 3
likelihoods, indices = model.predict_topk(tokenizer.to_one_hot(query_name), k=top_k)

# One line per returned candidate: likelihood followed by the country name.
for likelihood, country_idx in zip(likelihoods, indices):
    print(f"{likelihood:.2f} {names_data_source.countries[country_idx]}")

# Sum of the likelihoods returned for the top-k candidates.
print(f"Total likelihood: {likelihoods.sum().item():.2f}")
