The goal is to predict the nationality of a name using a vanilla RNN.

https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html

In [None]:
!pip install Unidecode

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os
import string
import unidecode
import re

import torch

Downloading dataset

In [None]:
!curl -O https://download.pytorch.org/tutorial/data.zip; unzip data.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 2814k  100 2814k    0     0  1134k      0  0:00:02  0:00:02 --:--:-- 1134k
Archive:  data.zip
   creating: data/
  inflating: data/eng-fra.txt        
   creating: data/names/
  inflating: data/names/Arabic.txt   
  inflating: data/names/Chinese.txt  
  inflating: data/names/Czech.txt    
  inflating: data/names/Dutch.txt    
  inflating: data/names/English.txt  
  inflating: data/names/French.txt   
  inflating: data/names/German.txt   
  inflating: data/names/Greek.txt    
  inflating: data/names/Irish.txt    
  inflating: data/names/Italian.txt  
  inflating: data/names/Japanese.txt  
  inflating: data/names/Korean.txt   
  inflating: data/names/Polish.txt   
  inflating: data/names/Portuguese.txt  
  inflating: data/names/Russian.txt  
  inflating: data/names/Scottish.txt  
  inflating: data/names/Spanish.txt  
  inflating

Helpers

In [None]:
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def to_t(tensor):
    """Return `tensor` placed on the globally selected `device`."""
    return tensor.to(device=device)

Language to id mapping

In [None]:
# Keep only the per-language '*.txt' files: os.listdir can also return stray
# entries (e.g. a '.ipynb_checkpoints' directory) and gives no ordering
# guarantee, so filter and sort to get a stable language -> id assignment.
filenames = sorted(f for f in os.listdir('data/names') if f.endswith('.txt'))

# Map each language name (file name up to the first '.') to a 0-dim tensor
# holding its class index, placed on the active device.
lang_to_id = {
    filename.split('.')[0]: to_t(torch.tensor(i))
    for i, filename in enumerate(filenames)
}

In [None]:
# Display the language -> id mapping for inspection
print(lang_to_id)

{'Portuguese': tensor(0, device='cuda:0'), 'Scottish': tensor(1, device='cuda:0'), 'German': tensor(2, device='cuda:0'), 'Irish': tensor(3, device='cuda:0'), 'Arabic': tensor(4, device='cuda:0'), 'Vietnamese': tensor(5, device='cuda:0'), 'Spanish': tensor(6, device='cuda:0'), 'Japanese': tensor(7, device='cuda:0'), 'Dutch': tensor(8, device='cuda:0'), 'French': tensor(9, device='cuda:0'), 'Czech': tensor(10, device='cuda:0'), 'English': tensor(11, device='cuda:0'), 'Russian': tensor(12, device='cuda:0'), 'Greek': tensor(13, device='cuda:0'), 'Chinese': tensor(14, device='cuda:0'), 'Korean': tensor(15, device='cuda:0'), 'Polish': tensor(16, device='cuda:0'), 'Italian': tensor(17, device='cuda:0')}


Character to id mapping

In [None]:
# Vocabulary: the 26 lowercase ASCII letters followed by space, hyphen and
# apostrophe; each character's id is its position in this string.
_vocab = string.ascii_lowercase + " -'"
char_to_id = dict(zip(_vocab, range(len(_vocab))))

In [None]:
# Display the character -> id mapping for inspection
print(char_to_id)

{'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5, 'g': 6, 'h': 7, 'i': 8, 'j': 9, 'k': 10, 'l': 11, 'm': 12, 'n': 13, 'o': 14, 'p': 15, 'q': 16, 'r': 17, 's': 18, 't': 19, 'u': 20, 'v': 21, 'w': 22, 'x': 23, 'y': 24, 'z': 25, ' ': 26, '-': 27, "'": 28}


Functions to convert names and languages to tensors

In [None]:
# Characters stripped from names before encoding.
# NOTE(review): the trailing 'ÃŸ' looks like a mis-encoded 'ß' -- confirm
# against the original notebook before changing the pattern.
regex = re.compile('[1/,:ÃŸ]')


def name_to_tensor(name):
    """One-hot encode `name` character by character.

    The name is lower-cased, characters matched by `regex` are removed, and
    the result is transliterated to ASCII with unidecode.

    Returns a tensor of shape (len(cleaned_name), 1, len(char_to_id)) on the
    active device, with a single 1 per row at the character's id.

    Raises KeyError if a cleaned character is not in `char_to_id`.
    """
    # Clean first, then allocate: the cleaned name can be shorter (characters
    # removed) or longer (transliteration expanding one character to several)
    # than the raw input, so the tensor must be sized from the cleaned text.
    cleaned = unidecode.unidecode(regex.sub('', name.lower()))
    name_tensor = to_t(torch.zeros(len(cleaned), 1, len(char_to_id)))

    for i, char in enumerate(cleaned):
        name_tensor[i][0][char_to_id[char]] = 1

    return name_tensor

In [None]:
def lang_to_tensor(lang):
    """Build the one-hot target vector for language `lang`.

    The vector has one slot per entry in `lang_to_id`; the slot at
    `lang_to_id[lang]` is set to 1 and all others stay 0.
    Raises KeyError if `lang` is not a key of `lang_to_id`.
    """
    num_langs = len(lang_to_id)
    one_hot = to_t(torch.zeros(num_langs))
    lang_index = lang_to_id[lang]
    one_hot[lang_index] = 1
    return one_hot

### Create Datasets

Loading names and languages into lists of one-hot encoded tensors.

In [None]:
# one hot encoded tensors of names
x_names = []
# one hot encoded tensors of the corresponding languages
# (y_langs[i] is the language of the name stored at x_names[i])
y_langs = []

for filename in filenames:
    # language label is the file name up to the first '.'
    lang = filename.split('.')[0]

    # Read explicitly as UTF-8 (as the PyTorch tutorial does) instead of
    # relying on the platform default encoding. Split on line breaks only:
    # splitting on any whitespace would break multi-word names into
    # separate samples.
    with open('data/names/' + filename, encoding='utf-8') as f:
        names = [line.strip() for line in f.read().splitlines()]

    for name in names:
        if not name:
            # skip blank lines
            continue
        x_names.append(name_to_tensor(name))
        y_langs.append(lang_to_tensor(lang))
            

In [None]:
class NameDataset(torch.utils.data.Dataset):
    """Dataset pairing each name with its language by position."""

    def __init__(self, names, langs):
        # names[i] and langs[i] describe the same sample
        self.names, self.langs = names, langs

    def __len__(self):
        # dataset size is the number of names
        return len(self.names)

    def __getitem__(self, i):
        # one sample as a dict with 'name' and 'lang' entries
        return {'name': self.names[i], 'lang': self.langs[i]}
        

Creating dataset and splitting into training and test sets.

In [None]:
dataset = NameDataset(x_names, y_langs)

# 90% of the samples go to training, the remainder to testing;
# random_split assigns samples to the two subsets at random.
train_size = int(len(dataset) * 0.9)
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset, lengths=[train_size, test_size]
)


Data loaders

In [None]:
# One sample per batch, reshuffled on every pass, for both splits.
train_data_loader = torch.utils.data.DataLoader(
    dataset=train_dataset, batch_size=1, shuffle=True
)
test_data_loader = torch.utils.data.DataLoader(
    dataset=test_dataset, batch_size=1, shuffle=True
)

# Look up a loader by split name
data_loaders = dict(train=train_data_loader, test=test_data_loader)


### Nationality Model

Vanilla RNN model architecture from https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html.

In [None]:
class NationalityModel(torch.nn.Module):
    """Single-step recurrent cell for name classification.

    Each call to `forward` consumes one input vector plus the previous hidden
    state and returns log-probabilities over the output classes together with
    the next hidden state. Callers loop over the characters of a name and
    thread the hidden state through.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the two linear layers.

        input_size: width of one input vector.
        hidden_size: width of the hidden state.
        output_size: number of output classes.
        """
        super().__init__()

        self.hidden_size = hidden_size
        # both layers read the concatenation [input, hidden]
        self.input_to_hidden = torch.nn.Linear(input_size + hidden_size, hidden_size)
        self.input_to_output = torch.nn.Linear(input_size + hidden_size, output_size)
        self.softmax = torch.nn.LogSoftmax(dim=1)

    def forward(self, x, hidden_state):
        """Run one step.

        x: tensor of shape (batch, input_size).
        hidden_state: tensor of shape (batch, hidden_size).
        Returns (output, hidden): log-softmax scores of shape
        (batch, output_size) and the new hidden state of shape
        (batch, hidden_size).
        """
        combined = torch.cat((x, hidden_state), 1)
        hidden = self.input_to_hidden(combined)
        output = self.softmax(self.input_to_output(combined))

        return output, hidden

    def init_hidden(self):
        """Return an all-zero hidden state of shape (1, hidden_size).

        The tensor takes its device and dtype from the model's own weights,
        so it always matches the model wherever it has been moved or cast,
        instead of depending on a module-level device helper.
        """
        return self.input_to_hidden.weight.new_zeros(1, self.hidden_size)


In [None]:
# Build the classifier (one input slot per character, 128 hidden units, one
# output per language) and move it to the GPU when available.
model = NationalityModel(
    input_size=len(char_to_id),
    hidden_size=128,
    output_size=len(lang_to_id),
).to(device)

# Loss function and optimiser used by the training loop
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

Training model

In [None]:
num_epochs = 1

for epoch in range(num_epochs):
    # make sure the model is in training mode even if an evaluation cell
    # (which calls model.eval()) was run before this one
    model.train()

    # running totals so the reported loss is the epoch average rather than
    # the loss of whichever sample happened to come last
    total_loss = 0.0
    num_trained = 0

    for train_data in data_loaders['train']:
        # pair every name in the batch with its own target
        for name, lang in zip(train_data['name'], train_data['lang']):
            if len(name) == 0:
                # nothing to feed the model; no output would be produced
                continue

            # every name starts from a fresh hidden state and fresh gradients
            hidden_state = model.init_hidden()
            optimizer.zero_grad()

            for char in name:
                # feed each char of the name and the current hidden state to model
                output, hidden_state = model(char, hidden_state)

            # cross entropy between the final output and this name's target;
            # unsqueeze restores the batch dimension removed by iterating
            loss = criterion(output, lang.unsqueeze(0))

            # calculate gradients of the loss function
            loss.backward()
            # step model using calculated gradients to minimize loss
            optimizer.step()

            total_loss += loss.item()
            num_trained += 1

    mean_loss = total_loss / num_trained if num_trained else float('nan')
    print(
        f"Epoch [{epoch + 1}/{num_epochs}], "
        f"Loss: {mean_loss:.4f}"
    )

Epoch [1/1], Loss: 0.3329


### Results

Testing model accuracy

In [None]:
num_correct = 0
num_samples = len(test_dataset)

# evaluation mode, and no gradient tracking while scoring the test set
model.eval()
with torch.no_grad():
    for test_data in data_loaders['test']:
        hidden_state = model.init_hidden()
        # loader batches hold a single sample, so index 0 is the name
        for char in test_data['name'][0]:
            output, hidden_state = model(char, hidden_state)

        # predicted class vs. the position of the 1 in the one-hot target
        pred = torch.argmax(output).item()
        target = torch.argmax(test_data['lang']).item()
        num_correct += int(pred == target)

print(f"Accuracy: {num_correct / num_samples * 100:.4f}%")

Accuracy: 68.6974%


The model is overfitting. I think this is because of the small amount of data used in training, as well as the fact that the names are unevenly distributed among languages.

In [None]:
def predict_nationality(name):
    """Return the language the model considers most likely for `name`.

    The name is encoded with `name_to_tensor`, fed to the model one character
    at a time, and the index of the highest final score is mapped back to
    its language through `lang_to_id`. Puts the model in evaluation mode.

    Raises ValueError if the encoded name has no characters, or if the
    predicted index has no entry in `lang_to_id`.
    """
    name_tensor = name_to_tensor(name)
    if len(name_tensor) == 0:
        # without at least one character the model never produces an output
        raise ValueError("cannot predict the nationality of an empty name")

    model.eval()
    with torch.no_grad():
        hidden_state = model.init_hidden()
        for char in name_tensor:
            output, hidden_state = model(char, hidden_state)

    pred = torch.argmax(output).item()

    # reverse lookup: find the language whose id equals the predicted index
    for lang, lang_id in lang_to_id.items():
        if int(lang_id) == pred:
            return lang

    raise ValueError(f"{pred} is not a known language id")
        

In [None]:
# Spot-check the trained model on a few hand-picked names
print(predict_nationality('Lucchese'))
print(predict_nationality('Muhammad'))
print(predict_nationality('Xiu'))

Italian
Arabic
Chinese
