In [None]:
%reload_ext autoreload
%autoreload 2

%matplotlib widget

import torch

# Seed PyTorch's global RNG so that random operations drawing from it are
# repeatable when the notebook is run top to bottom on a fresh kernel.
# The value is named so it can be found and changed in one place.
SEED = 42
torch.manual_seed(SEED)


In [None]:
from data.names_data_source import NamesDataSource
from learning.names_classifier.names_classifier_dataset import (
    NamesClassifierDataset,
)


# Load the names data source from the datasets folder (with
# normalize_unicode=True) and wrap it in the dataset class used by the
# classifier.
names_data_source = NamesDataSource.load(
    normalize_unicode=True,
    data_folder="../datasets/names",
)
names_dataset = NamesClassifierDataset(names_data_source)

# Print the first sample as a quick sanity check of the loaded data.
sample = names_dataset[0]
print(sample)


In [6]:
# Split the dataset 85% / 15% into train and test subsets.
# A dedicated, explicitly seeded generator keeps the split identical no matter
# how many random numbers earlier cells have drawn from the global RNG (for
# example when cells are re-run or executed out of order).
SPLIT_SEED = 42
train_dataset, test_dataset = torch.utils.data.random_split(
    names_dataset,
    [0.85, 0.15],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

Train dataset size: 17042
Test dataset size: 3007


In [None]:
import time
import torch
from learning.names_classifier.model import (
    Batch,
    ParallelBatchLearner,
    SequentialBatchLearner,
    NamesClassifierLSTM,
)
from learning.metrics import (
    ConfusionMatrixMetric,
)

from torch.utils.data import DataLoader
from torch import nn

# --- Hyperparameters for this run ---
NUM_EPOCHS = 50       # passed to learner.fit as num_epochs
PATIENCE = 5          # passed to learner.fit as patience
BATCH_SIZE = 16       # batch size used by every DataLoader in this notebook
HIDDEN_SIZE = 128     # hidden_size of the classifier model
LEARNING_RATE = 1e-3  # learning rate given to the Adam optimizer

# Build the classifier, its loss, and its optimizer. Each object is printed
# right after construction so the configuration of this run is recorded in the
# cell output.
model = NamesClassifierLSTM(
    hidden_size=HIDDEN_SIZE,
    input_size=names_data_source.num_vocab,
    output_size=names_data_source.num_classes,
)
print(model)

criterion = nn.CrossEntropyLoss()
print(criterion)

optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
print(optimizer)

# The learner bundles model, loss, and optimizer; later cells use it for
# fitting, evaluation, and prediction.
learner = ParallelBatchLearner(
    criterion=criterion,
    optimizer=optimizer,
    model=model,
)

# Two independent confusion-matrix trackers, one for the training pass and one
# for the evaluation pass; both use the data source's country list as classes.
train_confusion_matrix_metric, eval_confusion_matrix_metric = (
    ConfusionMatrixMetric(classes=names_data_source.countries) for _ in range(2)
)


# Training batches are drawn in shuffled order; evaluation batches keep the
# dataset order. Both collate samples with Batch.from_samples.
train_dataloader = DataLoader(
    train_dataset,
    collate_fn=Batch.from_samples,
    shuffle=True,
    batch_size=BATCH_SIZE,
)

eval_dataloader = DataLoader(
    test_dataset,
    collate_fn=Batch.from_samples,
    shuffle=False,
    batch_size=BATCH_SIZE,
)

# Fit the model and keep the per-epoch train/eval losses for the plots below.
# NOTE(review): `patience` is presumably an early-stopping patience -- confirm
# in the learner's fit() implementation.
# Elapsed time is measured with time.perf_counter(), a monotonic
# high-resolution clock, so the reported duration cannot be skewed by
# wall-clock adjustments during a long training run.
print("Starting training...")
start_time = time.perf_counter()
train_losses, eval_losses = learner.fit(
    train_dataloader=train_dataloader,
    eval_dataloader=eval_dataloader,
    num_epochs=NUM_EPOCHS,
    patience=PATIENCE,
    train_metrics=[train_confusion_matrix_metric],
    eval_metrics=[eval_confusion_matrix_metric],
)
elapsed_time = time.perf_counter() - start_time
print(f"Training completed. Elapsed time: {elapsed_time:.2f}s")


## LOG

Manual Seed = 42

### Experiment 1

First Experiment:
- model = RNN
- BATCH_SIZE = 64
- LEARNING_RATE = 0.001
- HIDDEN_SIZE = 64
- NUM_EPOCHS = 50
- PATIENCE = 5

Result
- 29/50 -- 3.66s 	Train loss 	0.4853 	Eval loss 	0.6321
- Recall for Portuguese, Scottish, and Vietnamese is really low

### Experiment 2

Changes:
- BATCH_SIZE = 128

Result
- 45/50 -- 3.73s 	Train loss 	0.4493 	Eval loss 	0.6522
- Korean recall for eval is much worse
- Vietnamese recall is better

### Experiment 3

Changes:
- BATCH_SIZE = 32

Result
- 27/50 -- 3.71s 	Train loss 	0.4340 	Eval loss 	0.6441
- Still bad at Scottish, but better than before

### Experiment 4

Changes
- HIDDEN_SIZE = 128

Result
- 21/50 -- 4.22s 	Train loss 	0.3574 	Eval loss 	0.6969
- Training completed. Elapsed time: 93.07s
- Bigger model fits better. Eval result is still horrible.

### Experiment 5

Changes:
- model = BiLSTM

Result
- 18/50 -- 64.10s 	Train loss 	0.3235 	Eval loss 	0.6190
- Training completed. Elapsed time: 1242.25s
- LSTM is really slow. Slightly better result.

### Experiment 6

Changes:
- model = BiLSTM, ParallelBatchLearner

Result
- 21/50 -- 8.67s 	Train loss 	0.2822 	Eval loss 	0.6491
- Training completed. Elapsed time: 187.26s
- Good speed up with decent metrics

### Experiment 7

Changes:
- BATCH_SIZE = 16

Result
- 15/50 -- 9.96s 	Train loss 	0.3147 	Eval loss 	0.6260
- Training completed. Elapsed time: 158.88s
- It seems a smaller batch size helps with learning from an imbalanced dataset


In [None]:
import matplotlib.pyplot as plt


# Evaluate the trained model over the complete dataset, in dataset order.
# NOTE(review): names_dataset also contains the samples the model was trained
# on (train_dataset is a random split of it), so the resulting "Final" metrics
# are not a held-out estimate.
full_confusion_matrix_metric = ConfusionMatrixMetric(
    classes=names_data_source.countries
)
full_dataloader = DataLoader(
    names_dataset,
    collate_fn=Batch.from_samples,
    shuffle=False,
    batch_size=BATCH_SIZE,
)
final_loss = learner.final_eval(
    metrics=[full_confusion_matrix_metric],
    dataloader=full_dataloader,
)

# Figure: confusion matrix of the full-dataset evaluation (normalize=True).
_, ax = plt.subplots(figsize=(12, 8))
full_confusion_matrix_metric.plot_confusion_matrix(ax, normalize=True)

# Figure: loss curves returned by learner.fit, with the train/eval accuracies
# drawn on the same axes.
_, ax = plt.subplots(figsize=(5, 5))
for curve, curve_label in (
    (train_losses, "Train Loss"),
    (eval_losses, "Test Loss"),
):
    ax.plot(curve, label=curve_label)
for metric, split_name in (
    (train_confusion_matrix_metric, "Train"),
    (eval_confusion_matrix_metric, "Eval"),
):
    metric.plot_accuracies(ax, split_name)
ax.set_title("Loss and Accuracy")

# One figure per per-class score. Each figure overlays the train, eval, and
# full-dataset ("Final") confusion-matrix metrics on the same axes.
# The three figures were previously three copy-pasted blocks that differed only
# in the title and the plotting method; driving them from a table keeps them in
# sync. Ending the cell on a loop also avoids echoing the title's Text repr as
# cell output.
for title, plot_method in (
    ("Precision", "plot_class_precisions"),
    ("Recall", "plot_class_recalls"),
    ("F1 Score", "plot_class_f1_scores"),
):
    _, ax = plt.subplots(figsize=(5, 5))
    for metric, split_name in (
        (train_confusion_matrix_metric, "Train"),
        (eval_confusion_matrix_metric, "Eval"),
        (full_confusion_matrix_metric, "Final"),
    ):
        # Look up the plotting method by name, e.g. metric.plot_class_recalls.
        getattr(metric, plot_method)(ax, split_name)
    ax.set_title(title)


In [None]:
# Inspect the model's `fc` layer after training.
print(model.fc)
fc_weights = model.fc.weight.detach()  # same values, detached from autograd
print(fc_weights)
# Sanity check: .grad is only populated once a backward pass has reached this
# layer.
assert model.fc.weight.grad is not None

# Histogram of the weight values (torch.histogram's default binning), plotted
# as count against the left edge of each bin.
hist, bin_edges = torch.histogram(fc_weights)
f, ax = plt.subplots(figsize=(5, 5))
ax.plot(bin_edges[:-1], hist)

# Boolean mask showing which weights have a magnitude above 0.03.
plt.figure(figsize=(10, 5))
plt.imshow(fc_weights.abs() > 0.03)


In [None]:
# Ask the learner for its three most likely countries for the name "Albert".
name_tensor = names_data_source.name_to_one_hot("Albert")
likelihoods, indices = learner.predict_topk(name_tensor, k=3)

# Print one line per prediction: likelihood followed by the country name.
for country_idx, likelihood in zip(indices, likelihoods):
    country = names_data_source.countries[country_idx]
    print(f"{likelihood:.2f} {country}")

# Sum of the k returned likelihoods only.
total_likelihood = likelihoods.sum().item()
print(f"Total likelihood: {total_likelihood:.2f}")
