# Name Classification Using RNN

<p>Salam Alikom everyone! In this project we build a classification model using an RNN with the PyTorch framework.<br>
The model's role is to predict which language a given name belongs to. The dataset covers 18 different languages <i>(Arabic, English, French, etc.)</i>.</p>


#### Imports

In [2]:
import os
import string
import unicodedata
import random

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence

from tqdm import tqdm

#### Constants

In [3]:
# Characters the model can represent: ASCII letters plus a few punctuation marks.
ALL_LETTERS = string.ascii_letters + " .,;'"
# Width of each one-hot character vector.
N_LETTERS = len(ALL_LETTERS)
# Folder holding the names files; the part of each file name before the first
# '.' is used as the language label.
DATA_FOLDER = "./data/names/"
# os.listdir() returns entries in arbitrary order, so sort them to keep the
# language -> label index mapping stable across runs and machines.
FILE_NAMES = sorted(os.listdir(DATA_FOLDER))

## Preparing Data

In [4]:
def unicode_to_ascii(name):
    """Return `name` reduced to the characters listed in ALL_LETTERS.

    The string is NFD-normalized so accented letters split into a base
    character plus combining marks; combining marks (category 'Mn') and any
    character outside ALL_LETTERS are then dropped.
    """
    kept = []
    for char in unicodedata.normalize('NFD', name):
        if unicodedata.category(char) == 'Mn':
            continue
        if char in ALL_LETTERS:
            kept.append(char)
    return ''.join(kept)

#### Loading & Preprocessing

In [5]:
# Read every names file into `data` as [list_of_characters, language] pairs
# and collect the distinct languages in first-seen order.
data = []
languages = []

for file_name in FILE_NAMES:
    file_path = os.path.join(DATA_FOLDER, file_name)
    if not os.path.isfile(file_path):
        # Skip sub-directories (e.g. notebook checkpoint folders) that
        # os.listdir() also reports; open() would fail on them.
        continue

    # The label is the part of the file name before the first '.'; it is the
    # same for every line, so compute and register it once per file.
    language = file_name.split('.')[0]
    if language not in languages:
        languages.append(language)

    with open(file_path, 'r', encoding='utf8') as file:
        # Stream the file line by line instead of loading it with readlines().
        for line in file:
            name = unicode_to_ascii(line.strip())
            if not name:
                # Blank lines (or names with no representable character)
                # would become empty sequences that cannot be stacked or
                # padded later on.
                continue
            data.append([list(name), language])

# Map each language to its integer class index.
language_label = {language: index for index, language in enumerate(languages)}

In [6]:
# 18 different languages
language_label.values()

dict_values([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17])

In [7]:
# Count how many names each language contributes, in label order.
labels_list = list(language_label)
# Tally in a single pass over the data instead of rescanning it per label.
tally = {}
for _, sample_language in data:
    tally[sample_language] = tally.get(sample_language, 0) + 1
# Languages without any sample still get an entry, with a count of 0.
labels_count = [[label, tally.get(label, 0)] for label in labels_list]

In [8]:
# Display the [language, sample count] pairs.
labels_count

[['Arabic', 2000],
 ['Chinese', 268],
 ['Czech', 519],
 ['Dutch', 297],
 ['English', 3668],
 ['French', 277],
 ['German', 724],
 ['Greek', 203],
 ['Irish', 232],
 ['Italian', 709],
 ['Japanese', 991],
 ['Korean', 94],
 ['Polish', 139],
 ['Portuguese', 74],
 ['Russian', 9408],
 ['Scottish', 100],
 ['Spanish', 298],
 ['Vietnamese', 73]]

In [9]:
# Shuffle the samples in place. No seed is set in the visible cells, so the
# resulting order differs from run to run.
random.shuffle(data)

In [10]:
# Peek at the first ten samples after shuffling.
data[0:10]

[[['M', 'a', 'l', 'a', 'f', 'a'], 'Czech'],
 [['K', 'a', 'l', 'a', 'm', 'k', 'a', 'r', 'y', 'a', 'n'], 'Russian'],
 [['K', 'o', 'u', 'r', 'i'], 'Arabic'],
 [['K', 'a', 'p', 'p', 'e', 'l'], 'German'],
 [['M', 'u', 's', 't', 'a', 'f', 'a'], 'Arabic'],
 [['F', 'i', 'o', 'h', 'i', 'n'], 'Russian'],
 [['D', 'u', 'b', 'n', 'i', 'k', 'o', 'v'], 'Russian'],
 [['S', 'h', 'a', 'l', 'a', 'g', 'a', 'e', 'v'], 'Russian'],
 [['J', 'u', 'k'], 'Russian'],
 [['K', 'a', 'l', 'i', 'n', 'k', 'o'], 'Russian']]

#### Encoding

In [11]:
# One-hot encode every name: row i of the identity matrix is the vector for
# ALL_LETTERS[i].
one_hot = torch.eye(N_LETTERS)
# Build the letter -> index table once instead of calling ALL_LETTERS.index()
# (a linear scan) for every character of every name.
letter_index = {letter: index for index, letter in enumerate(ALL_LETTERS)}

# Each entry is [list of one-hot vectors (one per character), integer label].
encoded_data = []
for letters, language in data:
    encoded_name = [one_hot[letter_index[letter]] for letter in letters]
    encoded_data.append([encoded_name, language_label[language]])

In [12]:
# Stack each name's character vectors into one tensor per name, then zero-pad
# all names to the longest one so they form a single batch-first tensor
# instead of a list of tensors.
name_tensors = [torch.stack(vectors) for vectors, _ in encoded_data]
padded_tensor = pad_sequence(name_tensors, batch_first=True, padding_value=0)

In [21]:
# Inspect position 10 of the first name; it is all zeros when the name is
# shorter than that, i.e. when the position is padding.
padded_tensor[0][10]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [23]:
# X = data, y = labels
# padded_tensor is already one batch-first tensor, so re-stacking its rows
# would only loop over every sample in Python to build an identical copy.
X = padded_tensor
y = torch.tensor([label for _, label in encoded_data])

In [24]:
# Sanity check: X and y should hold the same number of samples.
X.size()[0], y.size()[0]

(20074, 20074)

#### Splitting 

In [25]:
# Pair every padded name tensor with its label.
dataset = TensorDataset(X, y)

In [26]:
# 70% train / 20% validation; the test split takes whatever remains, so the
# three sizes always add up to len(dataset).
n_samples = len(dataset)
train_size = int(0.7 * n_samples)
val_size = int(0.2 * n_samples)
test_size = n_samples - train_size - val_size

In [27]:
# Randomly partition the dataset into train / validation / test subsets.
split_sizes = [train_size, val_size, test_size]
train_set, val_set, test_set = random_split(dataset, split_sizes)

In [28]:
# Only the training loader reshuffles its samples; validation and test keep a
# fixed order.
BATCH_SIZE = 32
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False)

In [29]:
# Pull a single batch from the training loader to check the tensor shapes
# that will be fed to the model.
batch = next(iter(train_loader))
input_data, labels = batch

print("Input data shape:", input_data.shape)
print("Labels shape:", labels.shape)

Input data shape: torch.Size([32, 19, 57])
Labels shape: torch.Size([32])


## Creating Model

In [30]:
NUM_EPOCHS = 20  # Number of passes over the training set
LR = 0.0001  # Learning rate given to the optimizer
# Number of classes: derived from the label mapping rather than hard-coded, so
# it stays correct if the set of language files changes.
CLASSES = len(language_label)
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [31]:
class RNN(nn.Module):
    """Single-layer RNN classifier for zero-padded, batch-first sequences.

    Args:
        input_size: Size of each per-step input vector.
        output_size: Number of classes; one logit per class is returned.

    The hidden state is ``input_size * 8`` wide.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = input_size * 8
        self.rnn = nn.RNN(input_size, self.hidden_size, batch_first=True)
        self.fc = nn.Linear(self.hidden_size, output_size)

    def forward(self, x, lengths=None):
        """Return class logits of shape (batch, output_size).

        Args:
            x: Tensor of shape (batch, seq_len, input_size). Sequences shorter
                than seq_len are expected to be zero-padded at the end.
            lengths: Optional 1-D integer tensor holding the true length of
                each sequence. When omitted, each length is inferred as the
                position of the last time step containing a non-zero value.
        """
        batch_size, seq_len = x.size(0), x.size(1)
        # Create the initial state on the input's own device/dtype instead of
        # a global device, so the model works wherever its inputs live.
        h0 = x.new_zeros(1, batch_size, self.hidden_size)
        out, _ = self.rnn(x, h0)

        if lengths is None:
            steps = torch.arange(1, seq_len + 1, device=x.device)
            has_input = x.ne(0).any(dim=2)
            lengths = (has_input * steps).max(dim=1).values
        # An all-padding row has length 0; read step 0 instead of wrapping
        # around to index -1.
        last_step = (lengths.to(x.device) - 1).clamp(min=0)

        # Classify from the state after the last real step. out[:, -1, :]
        # would read the state after the RNN has also consumed the padding,
        # making the prediction depend on how much padding a name received.
        rows = torch.arange(batch_size, device=x.device)
        return self.fc(out[rows, last_step])

In [32]:
# Instantiate the classifier (one-hot width in, one logit per class out) and
# move its parameters to the selected device.
model = RNN(N_LETTERS, CLASSES)
model.to(DEVICE)

RNN(
  (rnn): GRU(57, 456, batch_first=True)
  (fc): Linear(in_features=456, out_features=18, bias=True)
)

#### Training the model

In [33]:
# Cross-entropy loss on the model's raw outputs, optimized with Adam at the
# learning rate LR.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

In [None]:
best_accuracy = 0.0
all_losses = []  # Summed batch loss of each epoch

# torch.save() does not create missing directories, so make sure the
# checkpoint folder exists before the first save.
os.makedirs("./models", exist_ok=True)

# Training loop
for epoch in range(NUM_EPOCHS):
    # Make sure training mode is active even if an evaluation cell ran before.
    model.train()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    # Create tqdm progress bar (the total is taken from len(train_loader))
    loop = tqdm(train_loader, leave=False)
    # Epochs are counted from 0; show them 1-based so the last one reads N/N.
    loop.set_description(f"Epoch[{epoch + 1}/{NUM_EPOCHS}]")

    # Iterate over batches of training data
    for inputs, labels in loop:
        inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)

        # Reset gradients accumulated by the previous step
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Compute accuracy; argmax yields plain class indices without going
        # through the discouraged `.data` attribute.
        predicted = outputs.argmax(dim=1)
        total_predictions += labels.size(0)
        correct_predictions += (predicted == labels).sum().item()

        # Read the scalar loss once and reuse it for the sum and the display.
        batch_loss = loss.item()
        running_loss += batch_loss

        # Update tqdm progress bar
        loop.set_postfix(loss=batch_loss)

    all_losses.append(running_loss)

    # Guard against an empty loader so the epoch summary cannot divide by zero.
    accuracy = correct_predictions / total_predictions if total_predictions else 0.0
    print(f"Accuracy = {accuracy}")

    # NOTE(review): checkpoints are selected on *training* accuracy, which
    # favours overfitted weights; consider selecting on validation accuracy.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        torch.save(model.state_dict(), f"./models/best_model2_{epoch}.pth")
        print("Best model saved.")


In [35]:
# Validation loop
model.eval()  # Set model to evaluation mode
val_loss = 0.0  # Sum of the per-batch losses
val_correct_predictions = 0
val_total_predictions = 0
val_batches = 0

# Disable gradient calculation
with torch.no_grad():
    for inputs, labels in val_loader:
        inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Compute accuracy; argmax yields plain class indices without going
        # through the discouraged `.data` attribute.
        predicted = outputs.argmax(dim=1)
        val_total_predictions += labels.size(0)
        val_correct_predictions += (predicted == labels).sum().item()

        val_loss += loss.item()
        val_batches += 1

# Guard against an empty validation set so the summary cannot divide by zero.
val_accuracy = (
    val_correct_predictions / val_total_predictions
    if val_total_predictions else 0.0
)
print(f"Validation Accuracy = {val_accuracy}")
# The loss was accumulated but never reported; show its per-batch mean.
print(f"Validation Loss = {val_loss / val_batches if val_batches else 0.0}")

# Switch back to training mode for any further training.
model.train()