In [11]:
import torch
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
class MultiLayerPerceptron(torch.nn.Module):
    """Two fully connected layers with a ReLU between them.

    Args:
        input_dim: number of input features fed to the first linear layer.
        hidden_dim: number of units produced by the first linear layer.
        output_dim: number of values produced by the second linear layer.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = torch.nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.fc2 = torch.nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x_in, apply_softmax = False):
        """Run the network on ``x_in``.

        Args:
            x_in: input tensor handed to ``fc1``.
            apply_softmax: when true, a softmax over dimension 1 is applied
                to the output of ``fc2``.

        Returns:
            The raw ``fc2`` output, or its softmax when ``apply_softmax`` is set.
        """
        scores = self.fc2(F.relu(self.fc1(x_in)))
        if not apply_softmax:
            return scores
        return F.softmax(scores, dim=1)

In [8]:
from argparse import Namespace

# Toy hyperparameters used only to inspect the MultiLayerPerceptron layout.
train_args = Namespace(batch_size=2, input_dim=3, hidden_dim=100, output_dim=4)

# Build the model from the toy settings and show its layer summary.
mlp = MultiLayerPerceptron(
    input_dim=train_args.input_dim,
    hidden_dim=train_args.hidden_dim,
    output_dim=train_args.output_dim,
)
display(mlp)

MultiLayerPerceptron(
  (fc1): Linear(in_features=3, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=4, bias=True)
)

In [18]:
class SurnameDataset(torch.utils.data.Dataset):
    """Dataset over a surname DataFrame partitioned by its "split" column.

    Rows are grouped into "train", "val" and "test" subsets; one subset is
    active at a time (see ``set_split``) and drives ``len()`` and indexing.
    """

    def __init__(self, surname_df, vectorizer):
        """Store the frame and vectorizer, then activate the "train" split.

        Args:
            surname_df: DataFrame with "split", "surname" and "nationality" columns.
            vectorizer: object providing ``vectorize(surname)`` and a
                ``nationality_vocab`` with ``lookup_token``.
        """
        self.surname_df = surname_df
        self._vectorizer = vectorizer

        split_column = self.surname_df["split"]

        self.train_df = self.surname_df[split_column == "train"]
        self.val_df = self.surname_df[split_column == "val"]
        self.test_df = self.surname_df[split_column == "test"]

        self.train_size = len(self.train_df)
        self.val_size = len(self.val_df)
        self.test_size = len(self.test_df)

        # split name -> (rows of that split, number of rows)
        self._lookup_dict = {
            "train": (self.train_df, self.train_size),
            "val": (self.val_df, self.val_size),
            "test": (self.test_df, self.test_size)
        }

        self.set_split("train")

    @classmethod
    def load_dataset_and_make_vectorizer(cls, surname_csv):
        """Read ``surname_csv`` and build a dataset with a vectorizer fitted on it."""
        frame = pd.read_csv(surname_csv)
        fitted_vectorizer = SurnameVectorizer.from_dataframe(frame)
        return cls(frame, fitted_vectorizer)

    def get_vectorizer(self):
        """Return the vectorizer this dataset was built with."""
        return self._vectorizer

    def set_split(self, split="train"):
        """Make ``split`` the active subset; raises KeyError for unknown names."""
        self._target_split = split
        active_frame, active_size = self._lookup_dict[split]
        self._target_df = active_frame
        self._target_size = active_size

    def __len__(self):
        """Number of rows in the active split."""
        return self._target_size

    def __getitem__(self, index):
        """Return the vectorized surname and nationality index of row ``index``.

        The row is taken positionally from the active split.
        """
        record = self._target_df.iloc[index]
        vectorizer = self._vectorizer
        return {
            "x_data": vectorizer.vectorize(record.surname),
            "y_data": vectorizer.nationality_vocab.lookup_token(record.nationality),
        }

    def get_num_batches(self, batch_size):
        """Number of complete batches of ``batch_size`` in the active split."""
        full_batches, _remainder = divmod(len(self), batch_size)
        return full_batches


The `Vocabulary` class is exactly the same, with these important methods:

- `add_token()`: adds a token to the `Vocabulary`
- `lookup_token()`: looks up a token and returns its index
- `lookup_index()`: looks up an index and returns its token

In [10]:
class Vocabulary():
    """Two-way mapping between tokens and consecutive integer indices.

    Args:
        token_to_idx: optional existing token -> index mapping to start from.
            The dict is used as-is (not copied).
        add_unk: when true, ``unk_token`` is registered and its index is
            returned by ``lookup_token`` for tokens that are not present.
        unk_token: the token that stands in for unknown tokens.
    """

    def __init__(self, token_to_idx = None, add_unk = True, unk_token = "<UNK>"):
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
        self._idx_to_token = {idx: token for token, idx in token_to_idx.items()}

        self._add_unk = add_unk
        self._unk_token = unk_token
        # -1 marks "no unknown token registered".
        self.unk_idx = -1
        if add_unk:
            self.unk_idx = self.add_token(unk_token)

    def to_serializable(self):
        """Return a dict of constructor arguments that recreates this vocabulary."""
        return {
            "token_to_idx": self._token_to_idx,
            "add_unk": self._add_unk,
            "unk_token": self._unk_token
        }

    @classmethod
    def from_serializable(cls, contents):
        """Build a vocabulary from the dict produced by ``to_serializable``."""
        return cls(**contents)

    def add_token(self, token):
        """Register ``token`` if it is new and return its index.

        An already-known token keeps its existing index. The index is
        returned in both cases; ``__init__`` relies on this to set ``unk_idx``.
        """
        if token in self._token_to_idx:
            return self._token_to_idx[token]

        index = len(self._token_to_idx)
        self._token_to_idx[token] = index
        self._idx_to_token[index] = token
        return index

    def lookup_token(self, token):
        """Return the index of ``token``.

        With ``add_unk`` enabled, unknown tokens map to ``unk_idx``;
        otherwise an unknown token raises KeyError.
        """
        if self._add_unk:
            return self._token_to_idx.get(token, self.unk_idx)
        return self._token_to_idx[token]

    def lookup_index(self, index):
        """Return the token stored at ``index``; raises KeyError if absent."""
        if index not in self._idx_to_token:
            raise KeyError(f"The index {index} is not found in the vocabulary.")
        return self._idx_to_token[index]

    def __str__(self):
        return "<Vocabulary(size = %d)>" % len(self)

    def __len__(self):
        """Number of tokens in the vocabulary (including the unknown token, if any)."""
        return len(self._token_to_idx)


In [12]:
class SurnameVectorizer():
    """Turns surnames into fixed-length vectors using a character vocabulary.

    Args:
        surname_vocab: vocabulary of surname characters; must support
            ``len()`` and ``lookup_token``.
        nationality_vocab: vocabulary of nationality labels.
    """

    def __init__(self, surname_vocab, nationality_vocab):
        self.surname_vocab = surname_vocab
        self.nationality_vocab = nationality_vocab

    def vectorize(self, surname):
        """Encode ``surname`` as a float32 vector of length ``len(surname_vocab)``.

        Every position whose index belongs to a character of ``surname`` is
        set to 1; all other positions stay 0. Repeated characters and
        character order are not represented.
        """
        vocab = self.surname_vocab
        one_hot = np.zeros(len(vocab), dtype = np.float32)
        for token in surname:
            one_hot[vocab.lookup_token(token)] = 1
        return one_hot

    # Backward-compatible alias: the method was originally published under
    # this name, while SurnameDataset.__getitem__ calls ``vectorize``.
    vectorizer = vectorize

    @classmethod
    def from_dataframe(cls, surname_df):
        """Build a vectorizer whose vocabularies cover every row of ``surname_df``.

        The surname vocabulary uses "@" as its unknown token; the nationality
        vocabulary has no unknown token.
        """
        surname_vocab = Vocabulary(unk_token = "@")
        nationality_vocab = Vocabulary(add_unk = False)

        # Walk the two columns directly instead of materialising a Series per row.
        for surname, nationality in zip(surname_df["surname"], surname_df["nationality"]):
            for letter in surname:
                surname_vocab.add_token(letter)
            nationality_vocab.add_token(nationality)

        return cls(surname_vocab, nationality_vocab)
    

In [13]:
class SurnameClassifier(torch.nn.Module):
    """Linear -> ReLU -> Linear network used to score surnames.

    Args:
        input_dim: number of input features fed to ``fc1``.
        hidden_dim: number of units between ``fc1`` and ``fc2``.
        output_dim: number of values produced by ``fc2``.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = torch.nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.fc2 = torch.nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x_in, apply_softmax = False):
        """Compute the network output for ``x_in``.

        Args:
            x_in: input tensor handed to ``fc1``.
            apply_softmax: when true, the ``fc2`` output is passed through a
                softmax over dimension 1 before being returned.

        Returns:
            The ``fc2`` output, optionally softmax-normalised.
        """
        hidden = F.relu(self.fc1(x_in))
        prediction = self.fc2(hidden)
        return F.softmax(prediction, dim=1) if apply_softmax else prediction

In [14]:
# Run configuration: file locations first, then model size, then training knobs.
args = Namespace(
    # Data and output paths
    surname_csv="data/surnames/surnames_with_splits.csv",
    vectorizer_file="vectorizer.json",
    model_state_file="model.pth",
    save_dir="model_storage/ch4/surname_mlp",
    # Model size
    hidden_dim=300,
    # Training settings
    seed=42,
    num_epochs=100,
    early_stopping_criteria=5,
    learning_rate=0.001,
    batch_size=64,
)


In [15]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [21]:
# Apply the configured seed before any weights are created or batches are
# shuffled, so that repeated runs start from the same random state.
np.random.seed(args.seed)
torch.manual_seed(args.seed)

# Load the surname CSV and fit the vectorizer on it.
dataset = SurnameDataset.load_dataset_and_make_vectorizer(args.surname_csv)
vectorizer = dataset.get_vectorizer()

# Input width = number of known characters; output width = number of nationalities.
classifier = SurnameClassifier(
    input_dim = len(vectorizer.surname_vocab),
    hidden_dim = args.hidden_dim,
    output_dim = len(vectorizer.nationality_vocab)
)

classifier = classifier.to(device)
# Unweighted loss; per-class weights can be supplied through the `weight` argument.
loss_func = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(classifier.parameters(), lr = args.learning_rate)

In [22]:
from torch.utils.data import DataLoader

def generate_batches(dataset, batch_size, shuffle = True, drop_last = True, device = "cpu"):
    """Yield batches from ``dataset`` with every value moved to ``device``.

    A ``DataLoader`` is built over ``dataset`` with the given ``batch_size``,
    ``shuffle`` and ``drop_last`` settings. Each batch it produces is expected
    to be a dict; a new dict with the same keys is yielded, in which every
    value has been passed through ``.to(device)``.
    """
    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
    )

    for batch in loader:
        yield {field: values.to(device) for field, values in batch.items()}

In [23]:
def make_train_state(args):
    """Return a fresh dict for tracking training progress.

    The dict holds the current epoch index, four empty per-epoch history
    lists (train/val loss and accuracy) and initial test loss/accuracy
    values. ``args`` is accepted but not read.
    """
    state = {'epoch_index': 0}
    for history_key in ('train_loss', 'train_acc', 'val_loss', 'val_acc'):
        state[history_key] = []
    state['test_loss'] = 1
    state['test_acc'] = 1
    return state

In [24]:
# Start from an empty training record; the epoch loop appends one loss and
# one accuracy value per epoch to its train/val lists.
train_state = make_train_state(args)

In [None]:
# One pass over the training split followed by one pass over the validation
# split per epoch; the per-epoch mean loss/accuracy are appended to train_state.
# NOTE(review): compute_accuracy is not defined in the cells shown here; it must
# be defined in an earlier cell before this loop runs.
for epoch_index in range(args.num_epochs):
    train_state["epoch_index"] = epoch_index

    # ---- training pass ----
    dataset.set_split("train")
    # `device` is the module-level value; `args` has no `device` attribute.
    batch_generator = generate_batches(dataset, batch_size = args.batch_size, device = device)

    running_loss = 0.0
    running_acc = 0.0

    classifier.train()
    for batch_index, batch_dict in enumerate(batch_generator):
        optimizer.zero_grad()
        y_pred = classifier(x_in = batch_dict["x_data"].float())

        # CrossEntropyLoss takes integer class indices as targets; a float
        # target is treated as class probabilities and must match y_pred's shape.
        loss = loss_func(y_pred, batch_dict["y_data"])
        # .item() keeps the running mean a plain float, so no autograd graph
        # is retained across batches.
        loss_batch = loss.item()
        running_loss += (loss_batch - running_loss) / (batch_index + 1)

        loss.backward()
        optimizer.step()

        acc_batch = compute_accuracy(y_pred.detach(), batch_dict["y_data"])
        running_acc += (acc_batch - running_acc) / (batch_index + 1) # batch_index starts at 0

    train_state["train_loss"].append(running_loss)
    train_state["train_acc"].append(running_acc)

    # ---- validation pass ----
    dataset.set_split("val")
    batch_generator = generate_batches(dataset, batch_size = args.batch_size, device = device)

    running_loss = 0.0
    running_acc = 0.0

    classifier.eval()
    # No parameters are updated here, so gradient tracking is switched off.
    with torch.no_grad():
        for batch_index, batch_dict in enumerate(batch_generator):
            y_pred = classifier(x_in = batch_dict["x_data"].float())
            loss = loss_func(y_pred, batch_dict["y_data"])

            loss_batch = loss.item()
            running_loss += (loss_batch - running_loss) / (batch_index + 1)

            acc_batch = compute_accuracy(y_pred.detach(), batch_dict["y_data"])
            running_acc += (acc_batch - running_acc) / (batch_index + 1)

    train_state["val_loss"].append(running_loss)
    train_state["val_acc"].append(running_acc)

