### Name Classification at character Level

In [1]:
import io
import os 
import unicodedata
import string
import glob
import torch
import random

# alphabet small
ALL_LETTERS = string.ascii_lowercase
N_LETTERS = len(ALL_LETTERS)

def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn' and c in ALL_LETTERS
    )

def load_data():
    # Build the category_lines dictionary, a list of names per language
    category_lines = {}
    all_categories = []
    
    def find_files(path):
        return glob.glob(path)
    
    # Read a file and split into lines
    def read_lines(filename):
        lines = io.open(filename, encoding='utf-8').read().strip().split('\n')
        return [unicode_to_ascii(line.lower()) for line in lines]
    
    for filename in find_files('data/names/*.txt'):
        category = os.path.splitext(os.path.basename(filename))[0]
        all_categories.append(category)
        lines = read_lines(filename)
        category_lines[category] = lines
        
    return category_lines, all_categories  

    def line_to_tensor(line):
        tensor = torch.zeros(len(line), 1, N_LETTERS)
        for i, letter in enumerate(line):
            tensor[i][0][letter_to_index(letter)] = 1
        return tensor

In [2]:
class NameDataset(torch.utils.data.Dataset):
    def __init__(self, category_lines):
        super(NameDataset, self).__init__()      
        ALL_LETTERS = string.ascii_lowercase
        category_list = []
        name_list = []
        for i, (category, names) in  enumerate(category_lines.items()):
            for name in names:
                name_list.append(self.__name_to_sequence(name))
                category_list.append(i)
        self.x = torch.nn.utils.rnn.pad_sequence(name_list, batch_first=True)
        self.y = torch.tensor(category_list).unsqueeze(dim=1)
        self.n_samples = self.x.size(0)
       
    def __letter_to_index(self, letter):
        return ALL_LETTERS.find(letter)

    def __name_to_sequence(self, name):
        sequence = []
        for letter in name:
            sequence.append(self.__letter_to_index(letter))
        return torch.tensor(sequence)    
    
    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]
    
    def __len__(self):
        return self.n_samples        

In [3]:
from torch.utils.data.dataset import random_split
from torch.utils.data import DataLoader

# Load dataset as a dictionary
# category_lines: {caetoryname: list_of_names}
category_lines, all_categories = load_data()

name_dataset = NameDataset(category_lines)
# split dataset in train and test set
num_train = int(len(name_dataset)*0.95)
train_dataset, test_dataset = random_split(name_dataset, [num_train, len(name_dataset)-num_train])
train_loader = DataLoader(train_dataset, batch_size=10, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=10, shuffle=False)

In [4]:
class RNN(torch.nn.Module):
    def __init__(self, num_classes, hidden_size=50, embed_dim=5,num_layers=2):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = torch.nn.Embedding(num_embeddings=27, 
                                            embedding_dim=embed_dim, 
                                            padding_idx=0,
                                            sparse=False)
        self.rnn = torch.nn.RNN(embed_dim, hidden_size, num_layers, batch_first=True)
        self.linear = torch.nn.Linear(hidden_size, num_classes)
        
    def forward(self, x):
        embedded = self.embedding(x)
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
        out, _ = self.rnn(embedded, h0) 
        out = out[:, -1, :]
        out = self.linear(out)
        return out

In [5]:
class LSTM(torch.nn.Module):
    def __init__(self, num_classes, hidden_size=50, embed_dim=5,num_layers=2):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = torch.nn.Embedding(num_embeddings=27, 
                                            embedding_dim=embed_dim, 
                                            padding_idx=0,
                                            sparse=False)
        self.lstm = torch.nn.LSTM(embed_dim, hidden_size, num_layers, batch_first=True)
        self.linear = torch.nn.Linear(hidden_size, num_classes)
        
    def forward(self, x):
        embedded = self.embedding(x)
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
        out, _ = self.lstm(embedded, (h0, c0)) 
        out = out[:, -1, :]
        out = self.linear(out)
        return out

In [6]:
class GRU(torch.nn.Module):
    def __init__(self, num_classes, hidden_size=50, embed_dim=5,num_layers=2):
        super(GRU, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = torch.nn.Embedding(num_embeddings=27, 
                                            embedding_dim=embed_dim, 
                                            padding_idx=0,
                                            sparse=False)
        self.gru = torch.nn.GRU(embed_dim, hidden_size, num_layers, batch_first=True)
        self.linear = torch.nn.Linear(hidden_size, num_classes)
        
    def forward(self, x):
        embedded = self.embedding(x)
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
        out, _ = self.gru(embedded, h0) 
        out = out[:, -1, :]
        out = self.linear(out)
        return out

In [7]:
def train(dataloader, model):
    n_samples, n_accurates = 0, 0
    total_batches = len(dataloader)
    for idx, (text, label) in enumerate(dataloader):
        optimizer.zero_grad()
        outputs = model(text)
        outputs = outputs.type(torch.FloatTensor)
        label = label.reshape(-1).type(torch.LongTensor)
        loss = criterion(outputs, label)
        loss.backward()
        optimizer.step()
        n_accurates += (outputs.argmax(1) == label).sum().item()
        n_samples += label.size(0)
        acc = 100*n_accurates/n_samples
        if (idx+1)%(total_batches/2) == 0:
            print(f"| epoch: {epoch} | batches: {idx+1}/{total_batches} | train_accuracy: {acc: .3f}")

            
def evaluate(dataloader, model):
    n_samples, n_accurates = 0, 0
    with torch.no_grad():
        for idx, (text, label) in enumerate(dataloader):
            outputs = model(text)
            outputs = outputs.type(torch.FloatTensor)
            label = label.reshape(-1).type(torch.LongTensor)
            n_accurates += (outputs.argmax(1) == label).sum().item()
            n_samples += label.size(0)
            return n_accurates/n_samples

In [8]:
LR = 0.0005
N_EPOCHS = 50
n_classes = len(set(all_categories))

rnn_model = RNN(n_classes)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(rnn_model.parameters(), lr=LR)


for epoch in range(1, N_EPOCHS + 1):
    train(train_loader, rnn_model)

# Test with test set
accu_test = evaluate(test_loader, rnn_model)
print('='*60)
print(f"Test Accuracy: {accu_test: .3f}")

| epoch: 1 | batches: 1907/1907 | train_accuracy:  47.834
| epoch: 2 | batches: 1907/1907 | train_accuracy:  53.450
| epoch: 3 | batches: 1907/1907 | train_accuracy:  55.160
| epoch: 4 | batches: 1907/1907 | train_accuracy:  59.261
| epoch: 5 | batches: 1907/1907 | train_accuracy:  60.635
| epoch: 6 | batches: 1907/1907 | train_accuracy:  61.877
| epoch: 7 | batches: 1907/1907 | train_accuracy:  62.926
| epoch: 8 | batches: 1907/1907 | train_accuracy:  63.928
| epoch: 9 | batches: 1907/1907 | train_accuracy:  64.751
| epoch: 10 | batches: 1907/1907 | train_accuracy:  65.558
| epoch: 11 | batches: 1907/1907 | train_accuracy:  66.508
| epoch: 12 | batches: 1907/1907 | train_accuracy:  67.242
| epoch: 13 | batches: 1907/1907 | train_accuracy:  67.792
| epoch: 14 | batches: 1907/1907 | train_accuracy:  68.175
| epoch: 15 | batches: 1907/1907 | train_accuracy:  68.962
| epoch: 16 | batches: 1907/1907 | train_accuracy:  69.046
| epoch: 17 | batches: 1907/1907 | train_accuracy:  69.549
| epoc

In [9]:
LR = 0.0005
N_EPOCHS = 50
n_classes = len(set(all_categories))

lstm_model = LSTM(n_classes)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lstm_model.parameters(), lr=LR)


for epoch in range(1, N_EPOCHS + 1):
    train(train_loader, lstm_model)

# Test with test set
accu_test = evaluate(test_loader, lstm_model)
print('='*60)
print(f"Test Accuracy: {accu_test: .3f}")

| epoch: 1 | batches: 1907/1907 | train_accuracy:  48.317
| epoch: 2 | batches: 1907/1907 | train_accuracy:  54.095
| epoch: 3 | batches: 1907/1907 | train_accuracy:  57.247
| epoch: 4 | batches: 1907/1907 | train_accuracy:  61.096
| epoch: 5 | batches: 1907/1907 | train_accuracy:  63.026
| epoch: 6 | batches: 1907/1907 | train_accuracy:  64.672
| epoch: 7 | batches: 1907/1907 | train_accuracy:  66.104
| epoch: 8 | batches: 1907/1907 | train_accuracy:  67.745
| epoch: 9 | batches: 1907/1907 | train_accuracy:  69.376
| epoch: 10 | batches: 1907/1907 | train_accuracy:  70.724
| epoch: 11 | batches: 1907/1907 | train_accuracy:  72.166
| epoch: 12 | batches: 1907/1907 | train_accuracy:  73.173
| epoch: 13 | batches: 1907/1907 | train_accuracy:  73.896
| epoch: 14 | batches: 1907/1907 | train_accuracy:  74.976
| epoch: 15 | batches: 1907/1907 | train_accuracy:  75.711
| epoch: 16 | batches: 1907/1907 | train_accuracy:  76.397
| epoch: 17 | batches: 1907/1907 | train_accuracy:  76.744
| epoc

In [11]:
LR = 0.0005
N_EPOCHS = 40
n_classes = len(set(all_categories))

gru_model = GRU(n_classes)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(gru_model.parameters(), lr=LR)


for epoch in range(1, N_EPOCHS + 1):
    train(train_loader, gru_model)

# Test with test set
accu_test = evaluate(test_loader, gru_model)
print('='*60)
print(f"Test Accuracy: {accu_test: .3f}")

| epoch: 1 | batches: 1907/1907 | train_accuracy:  51.982
| epoch: 2 | batches: 1907/1907 | train_accuracy:  58.453
| epoch: 3 | batches: 1907/1907 | train_accuracy:  61.500
| epoch: 4 | batches: 1907/1907 | train_accuracy:  63.833
| epoch: 5 | batches: 1907/1907 | train_accuracy:  66.135
| epoch: 6 | batches: 1907/1907 | train_accuracy:  68.291
| epoch: 7 | batches: 1907/1907 | train_accuracy:  70.509
| epoch: 8 | batches: 1907/1907 | train_accuracy:  72.590
| epoch: 9 | batches: 1907/1907 | train_accuracy:  74.316
| epoch: 10 | batches: 1907/1907 | train_accuracy:  75.485
| epoch: 11 | batches: 1907/1907 | train_accuracy:  76.256
| epoch: 12 | batches: 1907/1907 | train_accuracy:  77.079
| epoch: 13 | batches: 1907/1907 | train_accuracy:  77.850
| epoch: 14 | batches: 1907/1907 | train_accuracy:  78.249
| epoch: 15 | batches: 1907/1907 | train_accuracy:  78.867
| epoch: 16 | batches: 1907/1907 | train_accuracy:  79.266
| epoch: 17 | batches: 1907/1907 | train_accuracy:  79.733
| epoc