In [2]:
import torch
import torch.nn.functional as F

In [4]:
class MultiLayerPerceptron(torch.nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    Args:
        input_dim: number of features fed to the first linear layer.
        hidden_dim: number of features produced by the first linear layer
            and consumed by the second.
        output_dim: number of features produced by the second linear layer.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Layers are created in fc1 -> fc2 order so parameter initialisation
        # consumes the RNG in a fixed sequence.
        self.fc1 = torch.nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.fc2 = torch.nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x_in, apply_softmax = False):
        """Run the forward pass.

        Args:
            x_in: input tensor passed to ``fc1``.
            apply_softmax: when True, the output of ``fc2`` is passed through
                a softmax over dimension 1 before being returned.

        Returns:
            The raw output of ``fc2``, or its softmax over dimension 1 when
            ``apply_softmax`` is True.
        """
        hidden = torch.relu(self.fc1(x_in))
        logits = self.fc2(hidden)

        if not apply_softmax:
            return logits
        return F.softmax(logits, dim = 1)

In [8]:
from argparse import Namespace

# Toy hyperparameters used to instantiate and inspect the model.
train_args = Namespace(batch_size=2, input_dim=3, hidden_dim=100, output_dim=4)

# Build the network from the configured layer sizes and show its structure.
mlp = MultiLayerPerceptron(
    input_dim=train_args.input_dim,
    hidden_dim=train_args.hidden_dim,
    output_dim=train_args.output_dim,
)
display(mlp)

MultiLayerPerceptron(
  (fc1): Linear(in_features=3, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=4, bias=True)
)

In [9]:
class SurnameDataset(torch.utils.data.Dataset):
    """Dataset over a surname DataFrame partitioned into train/val/test splits.

    The class reads three columns from the frame: ``split`` (to partition the
    rows), ``surname`` (the model input) and ``nationality`` (the target).
    Exactly one split is active at a time; ``len()`` and indexing operate on
    the active split only.
    """

    def __init__(self, surname_df, vectorizer):
        """
        Args:
            surname_df: DataFrame with ``split``, ``surname`` and
                ``nationality`` columns; ``split`` values of "train", "val"
                and "test" select the respective partitions.
            vectorizer: object exposing ``vectorize(surname)`` and
                ``nationality_vocab.lookup_token(nationality)``.
        """
        self.surname_df = surname_df
        self._vectorizer = vectorizer

        # Partition the frame that was actually stored on the instance.
        self.train_df = self.surname_df[self.surname_df["split"] == "train"]
        self.train_size = len(self.train_df)

        self.val_df = self.surname_df[self.surname_df["split"] == "val"]
        self.val_size = len(self.val_df)

        self.test_df = self.surname_df[self.surname_df["split"] == "test"]
        self.test_size = len(self.test_df)

        # Maps a split name to its (frame, row count) pair for set_split().
        self._lookup_dict = {
            "train": (self.train_df, self.train_size),
            "val": (self.val_df, self.val_size),
            "test": (self.test_df, self.test_size)
        }

        self.set_split("train")

    @classmethod
    def load_dataset_and_make_vectorizer(cls, surname_csv):
        """Read a CSV and build a dataset with a vectorizer derived from it.

        NOTE(review): relies on ``pd`` (pandas) and ``SurnameVectorizer``
        being defined elsewhere in the notebook - confirm both are in scope
        before this cell is called.

        Args:
            surname_csv: path (or buffer) passed to ``pd.read_csv``.

        Returns:
            A new instance of ``cls``.
        """
        surname_df = pd.read_csv(surname_csv)
        return cls(surname_df, SurnameVectorizer.from_dataframe(surname_df))

    def get_vectorizer(self):
        """Return the vectorizer supplied at construction time."""
        return self._vectorizer

    def set_split(self, split="train"):
        """Select the active split.

        Args:
            split: one of "train", "val" or "test".

        Raises:
            KeyError: if ``split`` is not one of the known split names.
        """
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        """Number of rows in the active split."""
        return self._target_size

    def __getitem__(self, index):
        """Return the vectorized example at a positional index of the active split.

        Args:
            index: positional (``iloc``) index into the active split.

        Returns:
            dict with ``"x_data"`` (result of ``vectorizer.vectorize`` on the
            row's surname) and ``"y_data"`` (index of the row's nationality in
            ``vectorizer.nationality_vocab``).
        """
        row = self._target_df.iloc[index]
        surname_vector = self._vectorizer.vectorize(row.surname)
        nationality_index = self._vectorizer.nationality_vocab.lookup_token(
            row.nationality)

        return {"x_data": surname_vector, "y_data": nationality_index}

    def get_num_batches(self, batch_size):
        """Number of full batches of ``batch_size`` in the active split.

        Uses floor division, so a trailing partial batch is not counted.
        """
        return len(self) // batch_size


The `Vocabulary` class is exactly the same, with these important methods:

- `add_token()`: adds a token to the `Vocabulary`
- `lookup_token()`: looks up a token and returns its index
- `lookup_index()`: looks up an index and returns its token

In [None]:
class Vocabulary():
    """Bidirectional mapping between tokens and integer indices."""

    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):
        """
        Args:
            token_to_idx: optional existing token -> index mapping to start
                from. It is copied, so the caller's dict is never mutated.
            add_unk: when True, ``unk_token`` is added to the vocabulary and
                its index is returned by ``lookup_token`` for unknown tokens.
            unk_token: the token used to represent unknown tokens.
        """
        if token_to_idx is None:
            token_to_idx = {}
        # Copy so that add_token() does not write into the caller's mapping.
        self._token_to_idx = dict(token_to_idx)
        self._idx_to_token = {
            idx: token for token, idx in self._token_to_idx.items()
        }

        self._add_unk = add_unk
        self._unk_token = unk_token
        # -1 marks "no unknown token"; replaced by the real index below.
        self.unk_idx = -1
        if add_unk:
            self.unk_idx = self.add_token(unk_token)

    def to_serializable(self):
        """Return a dict of constructor keyword arguments describing this vocabulary."""
        return {
            "token_to_idx": self._token_to_idx,
            "add_unk": self._add_unk,
            "unk_token": self._unk_token
        }

    @classmethod
    def from_serializable(cls, contents):
        """Build a vocabulary from a dict produced by ``to_serializable``."""
        return cls(**contents)

    def add_token(self, token):
        """Add ``token`` if it is not already present.

        Args:
            token: the token to add.

        Returns:
            The index of ``token`` (the existing index if it was already
            present, otherwise the newly assigned one).
        """
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            # New tokens receive the next free index.
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def lookup_token(self, token):
        """Return the index of ``token``.

        When the vocabulary was built with ``add_unk=True``, unknown tokens
        map to ``unk_idx``.

        Raises:
            KeyError: if ``token`` is unknown and ``add_unk`` is False.
        """
        if self._add_unk:
            return self._token_to_idx.get(token, self.unk_idx)
        return self._token_to_idx[token]

    def lookup_index(self, index):
        """Return the token stored at ``index``.

        Raises:
            KeyError: if ``index`` is not in the vocabulary.
        """
        if index not in self._idx_to_token:
            raise KeyError(f"The index {index} is not found in the vocabulary.")
        return self._idx_to_token[index]

    def __str__(self):
        return "<Vocabulary(size = %d)>" % len(self)

    def __len__(self):
        """Number of tokens in the vocabulary (including the unknown token, if any)."""
        return len(self._token_to_idx)
