# Classification Model to Determine the nationality of a given 'name'

In [1]:
%load_ext autoreload
%autoreload 2
# Download the dataset
#!curl -O https://download.pytorch.org/tutorial/data.zip; 

#! unzip data.zip

In [2]:
# Install dependancies
import os
import random
from string import ascii_letters

import torch
from torch import nn
import torch.nn.functional as F
from unidecode import unidecode


torch.manual_seed(2)

data_dir = "data/names"
arabic_file = os.path.join(data_dir, 'Arabic.txt')

In [3]:
all_files = os.listdir(data_dir)
num_langs = len(all_files)
lang2label  = {file_name.split('.')[0]: torch.tensor([i], dtype = torch.long) for i, file_name in enumerate(all_files)}

#Output: {'Czech': tensor(0),
         #'German': tensor(1),...}
print(f"Total Languages (classes): {num_langs}")    

Total Languages (classes): 18


In [4]:
vocab = dict()
for i, letter in enumerate(ascii_letters + " .,:;-'"):
    vocab.update({letter:i})

# Vocab
vocab_size = len(vocab) 

print(f'Total Characters in the Vocab: {vocab_size}')

Total Characters in the Vocab: 59


We see that there are a total of 59 tokens in our character vocabulary. This includes spaces and punctuations, such as ` .,:;-‘. <br>
This also means that each name will now be expressed as a tensor of size (num_char, 59). <br>
One-hot vector for each character. so if 5 characters in a name. 5 one-hot vectors.

### Function to create one-hot vectors for name

In [5]:
def name2tensor(name,vocab):
    '''
    Converts a name to a tensor of size (len(name), len(vocab))
    '''
    base_tensor = torch.zeros(len(name),1,  len(vocab))
    #*the extra dimension in the above tensor is bcos pytorch expects everything in a batch.
    for i, chars in enumerate(name):
        idx = vocab[chars]
        base_tensor[i][0][idx] = 1 
        
    return base_tensor
        


### Creating the dataset

In [6]:
def create_dataset(data_dir, all_files):
    names = 0
    c = 0
    tensor_names= []
    tensor_labels = []
    for file in all_files:
        with open(os.path.join(data_dir, file)) as f:
            lang = file.split('.')[0]
            names = [unidecode(name.rstrip()) for name in f]
            for name in names:
                c += 1
                try:
                    tensor_names.append(name2tensor(name, vocab)) # This is a one-hot vector for every character
                    tensor_labels.append(lang2label[lang])  #These are integer labels
                except KeyError:
                    print('Key Not Present')
                    print(name)
                    pass
    print(f'Total Names in all files: {c}')
    return tensor_names, tensor_labels
            
tensor_names, tensor_labels = create_dataset(data_dir, all_files)
print("Done")

Key Not Present
Maxa/B
Key Not Present
Rafaj1
Key Not Present
Urbanek1
Key Not Present
Whitmire1
Total Names in all files: 20074
Done


#### Split the Dataset

In [7]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(tensor_names, tensor_labels, test_size = 0.1, shuffle= True, stratify = tensor_labels)

train_set = list(zip(xtrain, ytrain))
test_set = list(zip(xtest, ytest))

print(f"Total Training Examples: {len(train_set)}")
print(f"Total Testing Examples: {len(test_set)}")

Total Training Examples: 18063
Total Testing Examples: 2007


### Training

In [10]:
from model_rnn import Classification_RNN

# Initialize hyperparameters and modules
hidden_size = 256
learning_rate = 1e-3
model = Classification_RNN(vocab_size, hidden_size, num_langs)

criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)




In [19]:
c = 0
num_epochs = 3
print_interval = 3000
for epoch in range(num_epochs):
    for i, (name, label) in enumerate(train_set):
        c += 1
        hidden_state = model.init_hidden()
        for char in name: #Looping through one-hot vectors
            output, hidden_state = model(char, hidden_state)
 
        loss = criterion(output, label) # output: 18-dim tensor , label: 1-dim tensor
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()
        
        if (i+1) % print_interval == 0:
            print(f"Epoch: [{epoch+1}/{num_epochs}], "
                  f"Step [{i + 1}/{len(train_set)}], "
                  f"Loss: {loss.item():.4f}"
            )
            
num_correct = 0
num_samples = len(test_set)

model.eval()

with torch.no_grad():
    for name, label in test_set:
        hidden_state = model.init_hidden()
        for char in name:
            output, hidden_state = model(char, hidden_state)
        _, pred = torch.max(output, dim=1)
        num_correct += bool(pred == label)

print(f"Accuracy: {num_correct / num_samples * 100:.4f}%")
        

Epoch: [1/3], Step [3000/18063], Loss: 0.1314
Epoch: [1/3], Step [6000/18063], Loss: 4.7553
Epoch: [1/3], Step [9000/18063], Loss: 0.0037
Epoch: [1/3], Step [12000/18063], Loss: 0.0015
Epoch: [1/3], Step [15000/18063], Loss: 0.0586
Epoch: [1/3], Step [18000/18063], Loss: 0.0000
Epoch: [2/3], Step [3000/18063], Loss: 0.0112
Epoch: [2/3], Step [6000/18063], Loss: 5.9061
Epoch: [2/3], Step [9000/18063], Loss: 0.0000
Epoch: [2/3], Step [12000/18063], Loss: 0.0000
Epoch: [2/3], Step [15000/18063], Loss: 0.0293
Epoch: [2/3], Step [18000/18063], Loss: 0.0000
Epoch: [3/3], Step [3000/18063], Loss: 0.0068
Epoch: [3/3], Step [6000/18063], Loss: 5.8815
Epoch: [3/3], Step [9000/18063], Loss: 0.0000
Epoch: [3/3], Step [12000/18063], Loss: 0.0000
Epoch: [3/3], Step [15000/18063], Loss: 0.0142
Epoch: [3/3], Step [18000/18063], Loss: 0.0000
Accuracy: 73.8416%


In [23]:
label2lang = {label.item():lang for lang, label in lang2label.items()}
def predict(name):
    model.eval()
    name_tensor = name2tensor(name, vocab)
    
    with torch.no_grad():
        hidden_state = model.init_hidden()
        for char in name_tensor:
            output, hidden_state = model(char, hidden_state)
        print(f'output: {output}')
        a, pred = torch.max(output, 1)
        print(f"a: {a}")    
        print(f"pred: {pred}")
        
    model.train()
    
    return label2lang[pred.item()]


name = 'Mike'
print(f"Ethnicity of {name} is '{predict(name)}'")    

output: tensor([[ -5.2481,  -2.1307,  -7.1686,  -1.2919,  -2.9700,  -6.0612,  -3.1607,
          -3.5493,  -4.0597,  -0.3481,  -5.9803,  -9.0225,  -6.2125,  -8.1270,
          -5.1937,  -3.6715,  -4.4746, -10.1209]])
a: tensor([-0.3481])
pred: tensor([9])
Ethnicity of Mike is 'English'


In [30]:
# Save Model state_dict
torch.save(model.state_dict(), 'models/classification_rnn.pt')
