# Classification Model to Determine the nationality of a given 'name'

In [57]:
%load_ext autoreload
%autoreload 2

# Downloads the ready-dataset
#!curl -O https://download.pytorch.org/tutorial/data.zip; 
#! unzip data.zip

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [73]:
# Install dependancies
import os
import random
from string import ascii_letters
from tqdm import tqdm

import torch
from torch import nn
import torch.nn.functional as F
from unidecode import unidecode
from sklearn.model_selection import train_test_split

from utils import *
from model_rnn import Classification_RNN

torch.manual_seed(2)

data_dir = "data/names"
arabic_file = os.path.join(data_dir, 'Arabic.txt')
all_files = os.listdir(data_dir)

In [74]:
#Create Vocab
vocab, vocab_size = get_vocab(ascii_letters)

#Create language-label dict
lang2label, num_langs = get_lang2label(data_dir)

#Create Dataset
actual_names, tensor_names, tensor_labels = create_dataset(data_dir, all_files, vocab, lang2label)

#Split into train and test
train_names, xtrain,ytrain, test_names,xtest, ytest = split_train_test(actual_names, tensor_names, tensor_labels, test_size=0.10)
train_set = list(zip(xtrain, ytrain))
test_set = list(zip(xtest, ytest))

print(f"Total Training Examples: {len(train_set)}")
print(f"Total Testing Examples: {len(test_set)}")
print("Done")

Total Vocab Size: 59
Total Languages (classes): 18
TOtal names in all languages: 20074
Key Not Present
Maxa/B
Key Not Present
Rafaj1
Key Not Present
Urbanek1
Key Not Present
Whitmire1
Total Names in all files: 20074
3
Total Training Examples: 18063
Total Testing Examples: 2007
Done


We see that there are a total of 59 tokens in our character vocabulary. This includes spaces and punctuations, such as ` .,:;-‘. <br>
This also means that each name will now be expressed as a tensor of size (num_char, 59). <br>
One-hot vector for each character. so if 5 characters in a name. 5 one-hot vectors. therefore tensor shape is -> (5,59) or (5, 1, 59) with a batch dimensions

### Training

In [75]:
# Initialize hyperparameters and modules
hidden_size = 256
learning_rate = 1e-3
model = Classification_RNN(vocab_size, hidden_size, num_langs)

criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)


In [76]:
c = 0
num_epochs = 5
print_interval = 3000
for epoch in tqdm(range(num_epochs)):
    for i, (name, label) in enumerate(train_set):
        c += 1
        hidden_state = model.init_hidden()
        for char in name: #Looping through one-hot vectors of 59 each
            output, hidden_state = model(char, hidden_state)
 
        loss = criterion(output, label) # output: 18-dim tensor , label: 1-dim tensor
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()
        
        if (i+1) % print_interval == 0:
            print(f"Epoch: [{epoch+1}/{num_epochs}], "
                  f"Step [{i + 1}/{len(train_set)}], "
                  f"Loss: {loss.item():.4f}"
            )
            
num_correct = 0
num_samples = len(test_set)

model.eval()

with torch.no_grad():
    for name, label in test_set:
        hidden_state = model.init_hidden()
        for char in name:
            output, hidden_state = model(char, hidden_state)
        _, pred = torch.max(output, dim=1)
        num_correct += bool(pred == label)

print(f"Accuracy: {num_correct / num_samples * 100:.4f}%")
        

Epoch: [1/5], Step [3000/18063], Loss: 0.0092
Epoch: [1/5], Step [6000/18063], Loss: 1.9305
Epoch: [1/5], Step [9000/18063], Loss: 4.3423
Epoch: [1/5], Step [12000/18063], Loss: 0.3467
Epoch: [1/5], Step [15000/18063], Loss: 0.0875
Epoch: [1/5], Step [18000/18063], Loss: 0.0004
Epoch: [2/5], Step [3000/18063], Loss: 0.0000
Epoch: [2/5], Step [6000/18063], Loss: 0.7335
Epoch: [2/5], Step [9000/18063], Loss: 4.6068
Epoch: [2/5], Step [12000/18063], Loss: 0.1576
Epoch: [2/5], Step [15000/18063], Loss: 0.0484
Epoch: [2/5], Step [18000/18063], Loss: 0.0000
Epoch: [3/5], Step [3000/18063], Loss: 0.0000
Epoch: [3/5], Step [6000/18063], Loss: 0.7671
Epoch: [3/5], Step [9000/18063], Loss: 4.0403
Epoch: [3/5], Step [12000/18063], Loss: 0.1258
Epoch: [3/5], Step [15000/18063], Loss: 0.0193
Epoch: [3/5], Step [18000/18063], Loss: 0.0000
Epoch: [4/5], Step [3000/18063], Loss: 0.0000
Epoch: [4/5], Step [6000/18063], Loss: 1.2322
Epoch: [4/5], Step [9000/18063], Loss: 3.8525
Epoch: [4/5], Step [12000

In [23]:
label2lang = {label.item():lang for lang, label in lang2label.items()}
def predict(name):
    model.eval()
    name_tensor = name2tensor(name, vocab)
    
    with torch.no_grad():
        hidden_state = model.init_hidden()
        for char in name_tensor:
            output, hidden_state = model(char, hidden_state)
        print(f'output: {output}')
        a, pred = torch.max(output, 1)
        print(f"a: {a}")    
        print(f"pred: {pred}")
        
    model.train()
    
    return label2lang[pred.item()]


name = 'Mike'
print(f"Ethnicity of {name} is '{predict(name)}'")    

output: tensor([[ -5.2481,  -2.1307,  -7.1686,  -1.2919,  -2.9700,  -6.0612,  -3.1607,
          -3.5493,  -4.0597,  -0.3481,  -5.9803,  -9.0225,  -6.2125,  -8.1270,
          -5.1937,  -3.6715,  -4.4746, -10.1209]])
a: tensor([-0.3481])
pred: tensor([9])
Ethnicity of Mike is 'English'


In [30]:
# Save Model state_dict
#torch.save(model.state_dict(), 'models/classification_rnn.pt')

## THE END