In [None]:
import string
import unicodedata
import torch
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence

# Some names contain spaces, commas, or apostrophes, so those characters are
# part of the vocabulary alongside the ASCII letters.
characters = set(string.ascii_letters + " ,'")

# Assign indices in sorted character order. Iterating the set directly follows
# its hash order, which for strings changes between interpreter runs (hash
# randomization), so a given one-hot column would not refer to the same
# character after a kernel restart.
token_to_index: dict[str, int] = {c: i for i, c in enumerate(sorted(characters))}

# Inverse lookup: vocabulary index -> character.
index_to_token: dict[int, str] = {i: c for c, i in token_to_index.items()}


def t2i(token: str) -> int:
    """Return the vocabulary index stored for ``token`` in ``token_to_index``.

    Raises:
        KeyError: if ``token`` is not a key of ``token_to_index``.
    """
    index = token_to_index[token]
    return index


def i2t(index: int) -> str:
    """Return the character stored for ``index`` in ``index_to_token``.

    Raises:
        KeyError: if ``index`` is not a key of ``index_to_token``.
    """
    token = index_to_token[index]
    return token


def unicode_to_ascii(s) -> str:
    """Reduce ``s`` to the characters present in the ``characters`` vocabulary.

    The string is NFD-normalized first, so accented letters are split into a
    base letter plus combining marks. Combining marks (Unicode category "Mn")
    are dropped, and so is any remaining character not in ``characters``.
    """
    kept = []
    for ch in unicodedata.normalize("NFD", s):
        # Skip nonspacing combining marks left over from the decomposition.
        if unicodedata.category(ch) == "Mn":
            continue
        if ch in characters:
            kept.append(ch)
    return "".join(kept)


# Returns a one-hot encoded tensor for a name.
def name_to_tensor(name: str) -> torch.Tensor:
    """One-hot encode a name, one row per character.

    Args:
        name: Characters to encode; each one is looked up with ``t2i``.

    Returns:
        A float tensor of shape ``(1, len(name), len(characters))``. An empty
        name yields shape ``(1, 0, len(characters))``.

    Raises:
        KeyError: propagated from ``t2i`` for a character that is not in the
            vocabulary.
    """
    # Pin the dtype: torch.tensor([]) would otherwise default to a float
    # tensor, and F.one_hot only accepts integer (int64) index tensors, so an
    # empty name used to fail with a dtype error.
    indices = torch.tensor([t2i(c) for c in name], dtype=torch.long)
    one_hot = F.one_hot(indices, num_classes=len(characters))
    # unsqueeze(0) adds the leading dimension of size 1.
    return one_hot.unsqueeze(0).float()


def label_index_to_tensor(label_index: int, num_label: int) -> torch.Tensor:
    """One-hot encode a class index as a float tensor of shape (1, num_label)."""
    index = torch.tensor(label_index)
    one_hot = F.one_hot(index, num_classes=num_label)
    # Add the leading dimension of size 1, then convert the integer one-hot
    # values to float.
    return one_hot.unsqueeze(0).float()


def tensor_to_label_index(tensor: torch.Tensor) -> int:
    """Return the position of the largest value along dim 1 as a Python int.

    ``.item()`` is called on the arg-max result, so that result must hold
    exactly one element (e.g. a single row of scores).
    """
    best = torch.argmax(tensor, dim=1)
    return int(best.item())


# Quick check of the encoder on a name containing an apostrophe: show the
# encoded tensor, its size, and its dtype (one item per line).
tensor = name_to_tensor("O'Connor")
print(tensor, tensor.size(), tensor.dtype, sep="\n")

In [None]:
from data.names_dataset import NamesDataset

# Instantiate NamesDataset with data_folder pointing at ../datasets/names and
# the helper functions defined earlier in this notebook as its transforms.
# NOTE(review): NamesDataset is defined outside this notebook, so what each
# transform_* callback receives cannot be confirmed from here.
names_dataset = NamesDataset(
    data_folder="../datasets/names",
    transform_input=unicode_to_ascii,
    transform_output=name_to_tensor,
    # NOTE(review): the label transform is the per-character name encoder,
    # while label_index_to_tensor is never used and train() takes
    # labels.argmax(dim=1) as the class target — confirm this is intended.
    transform_label=name_to_tensor,
)

In [None]:
def collate_fn(batch) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
    """Regroup a batch of (input, label) pairs into two parallel lists.

    No padding or stacking is performed: every sample's input and label stay
    separate list elements, in the order they appear in ``batch``.
    """
    inputs, labels = map(list, zip(*batch))
    return inputs, labels

In [None]:
from torch.utils.data import DataLoader

BATCH_SIZE = 64
# Seed for the train/test split. Without an explicit generator the split is
# drawn from the global RNG, so a different set of samples lands in the test
# subset on every run and test metrics are not comparable between runs.
SPLIT_SEED = 42

# 85% / 15% split, drawn from a dedicated seeded generator.
train_dataset, test_dataset = torch.utils.data.random_split(
    names_dataset,
    [0.85, 0.15],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

# Only the training loader shuffles; the test loader keeps dataset order.
# Both use collate_fn to assemble their batches.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)

# Pull one training batch and show, one item per line: the inputs, the labels,
# how many of each there are, and the size of the first input and first label.
train_features, train_labels = next(iter(train_dataloader))
print(
    train_features,
    train_labels,
    len(train_features),
    len(train_labels),
    train_features[0].size(),
    train_labels[0].size(),
    sep="\n",
)


In [None]:
import torch.nn as nn


class NamesClassifier(nn.Module):
    """RNN followed by a linear layer and a log-softmax over the outputs.

    ``forward`` feeds the input through ``nn.RNN`` (``batch_first=True``),
    projects the returned hidden state with ``h2o``, and returns
    log-probabilities along dim 1.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Recurrent layer; batch_first=True puts the batch axis first in x.
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        # Hidden-to-output projection: one score per output class.
        self.h2o = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x is of shape (batch_size, seq_length, input_size)
        # The per-step outputs are not needed, only the returned hidden state.
        _, final_hidden = self.rnn(x)
        # Index 0 selects the first entry along the hidden state's leading axis.
        scores = self.h2o(final_hidden[0])
        return F.log_softmax(scores, dim=1)


# Instantiate the classifier with one input feature per vocabulary character
# and one output per entry in names_dataset.countries, then print its layers.
rnn = NamesClassifier(
    input_size=len(characters),
    hidden_size=128,
    output_size=len(names_dataset.countries),
)
print(rnn)

# Run one encoded name through the freshly constructed (not yet trained)
# model and print the raw output plus the country at the arg-max index.
tensor = name_to_tensor("O'Connor")
print(f"{tensor.size()=}")
print(tensor.dtype)
output = rnn(tensor)
print(output)
print(names_dataset.countries[tensor_to_label_index(output)])

# Fetch one batch from the training loader and print the size of its first
# input and its first label.
inputs, labels = next(iter(train_dataloader))
print(f"{inputs[0].size()=}")
print(f"{labels[0].size()=}")

In [None]:
def train(
    model: nn.Module,
    dataloader: DataLoader,
    optimizer: torch.optim.Optimizer,
    criterion: nn.Module,
):
    """Run one pass over ``dataloader``, updating ``model`` after every batch.

    For each ``(inputs, labels)`` batch the gradients are cleared, the model
    is evaluated on ``inputs``, and ``criterion`` is applied to the outputs
    and ``labels.argmax(dim=1)``. The loss is then back-propagated and the
    optimizer stepped. ``labels`` must therefore support ``.argmax(dim=1)``.

    Returns:
        The sum of the per-batch loss values divided by ``len(dataloader)``.
    """
    model.train()
    running_loss = 0.0
    for batch_inputs, batch_labels in dataloader:
        optimizer.zero_grad()
        predictions = model(batch_inputs)
        # Labels are reduced to class indices along dim 1 for the criterion.
        targets = batch_labels.argmax(dim=1)
        batch_loss = criterion(predictions, targets)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
    batch_count = len(dataloader)
    return running_loss / batch_count


def evaluate(model: nn.Module, dataloader: DataLoader, criterion: nn.Module):
    model.eval()
    total_loss = 0.0
    correct = 0
    with torch.no_grad():
        for inputs, labels in dataloader:
            outputs = model(inputs)
            loss = criterion(outputs, labels.argmax(dim=1))
            total_loss += loss.item()
            predictions = outputs.argmax(dim=1)
            correct += (predictions == labels.argmax(dim=1)).sum().item()