Refer to this [article](https://jaketae.github.io/study/pytorch-rnn/).


# Step 1. Download Data

In [1]:
import os
import random
from string import ascii_letters

import torch
from torch import nn
import torch.nn.functional as F
from unidecode import unidecode

_ = torch.manual_seed(42)
device = torch.device(
    "cuda" if torch.cuda.is_available() 
    else "cpu"
)


In [2]:
# Step 1.1.
# Prepare labels
data_dir = "./data/data/names"



lang2label = {
    file_name.split(".")[0]: torch.tensor([i], dtype=torch.long)
    for i, file_name in enumerate(os.listdir(data_dir))
}

label2lang = {
    label.item(): lang
    for lang, label in lang2label.items()
}




num_langs = len(lang2label)
label2lang

{0: 'Czech',
 1: 'German',
 2: 'Arabic',
 3: 'Japanese',
 4: 'Chinese',
 5: 'Vietnamese',
 6: 'Russian',
 7: 'French',
 8: 'Irish',
 9: 'English',
 10: 'Spanish',
 11: 'Greek',
 12: 'Italian',
 13: 'Portuguese',
 14: 'Scottish',
 15: 'Dutch',
 16: 'Korean',
 17: 'Polish'}

# Step 2. Preprocessing

In [3]:
unidecode("Ślusàrski")


'Slusarski'

In [4]:
char2idx = {
    letter: i 
    for i, letter in enumerate(ascii_letters + " .,:;-'")
}

idx2char = {
    i: letter
    for letter, i in char2idx.items()
}

num_letters = len(char2idx); 


print(f'num_letters: {num_letters}')

num_letters: 59


In [5]:
# Step 3.1
# Convert name string to a list one-hot encoding
def name2tensor(name):
    tensor = torch.zeros(len(name), 1, num_letters)
    for i, char in enumerate(name):
        tensor[i][0][char2idx[char]] = 1
    return tensor

name2tensor('abc')

tensor([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0.]]])

# Step 4. Dataset Creation


In [6]:
# Step 4.1
# From files
# Prepare name tenor and label tensor

tensor_names = []
target_langs = []

for file in os.listdir(data_dir):
    with open(os.path.join(data_dir, file)) as f:
        lang = file.split(".")[0]
        names = [unidecode(line.rstrip()) for line in f]
        for name in names:
            try:
                tensor_names.append(name2tensor(name))
                target_langs.append(lang2label[lang])
            except KeyError:
                pass


In [7]:
# Step 4.2
# split tensors into 
# - training set 
# - and test set

from sklearn.model_selection import train_test_split

train_idx, test_idx = train_test_split(
    range(len(target_langs)), 
    test_size=0.1, 
    shuffle=True, 
    stratify=target_langs
)

train_dataset = [
    (tensor_names[i], target_langs[i])
    for i in train_idx
]

test_dataset = [
    (tensor_names[i], target_langs[i])
    for i in test_idx
]

print(f"Train: {len(train_dataset)}")
print(f"Test: {len(test_dataset)}")


Train: 18063
Test: 2007


In [8]:
# Step 4.3 
# Reconstruct 1 name string from a list of one-hot encodings

name = ''
for vector in train_dataset[0][0]:
    
    
    idx = torch.multinomial(
        vector[0],
        num_samples=1, 
        replacement=True).item()
    name+=idx2char[idx]
    
print(name)




Prigov


# Step 5. Model

## Simple RNN

1. A simple diagram of RNN

    ![01-simple-rnn-diagram](./images/01-simple-rnn-diagram.png)


2. RNN predict language of a vocabulary

    ![02-rnn-predict-language-of-a-vocabulary](./images/02-rnn-predict-language-of-a-vocabulary.png)

In [9]:
# Step 5.1 
# Define model
class MyRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MyRNN, self).__init__()
        self.hidden_size = hidden_size
        self.in2hidden = nn.Linear(input_size + hidden_size, hidden_size)
        self.in2output = nn.Linear(input_size + hidden_size, output_size)
    
    def forward(self, x, hidden_state):
        combined = torch.cat((x, hidden_state), 1)
        hidden = torch.sigmoid(self.in2hidden(combined))
        output = self.in2output(combined)
        return output, hidden
    
    def init_hidden(self):
        return nn.init.kaiming_uniform_(torch.empty(1, self.hidden_size))

In [10]:
# Step 5.2
# - Initialize model
# - Prepare Loss function
# - Prepare gradient descent function

hidden_size = 256
learning_rate = 0.001

model = MyRNN(num_letters, hidden_size, num_langs)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [26]:
name, label = train_dataset[0]

hidden_state = model.init_hidden()

for char in name:
    output, hidden_state = model(char, hidden_state)

print(output)
loss = criterion(output, label)

label

tensor([[-17.9011, -16.1696, -37.0570, -25.3422, -31.3286, -25.8627, -21.8149,
         -16.1815, -21.9011,  -9.0964, -19.3889, -20.1110, -25.4340, -30.0159,
         -15.5548, -17.2972, -22.6951, -23.5569]], grad_fn=<AddmmBackward0>)


tensor([9])

In [11]:
# Step 5.3 
# Training
num_epochs = 10
print_interval = 3000

for epoch in range(num_epochs):
    random.shuffle(train_dataset)
    for i, (name, label) in enumerate(train_dataset):
        hidden_state = model.init_hidden()
        for char in name:
            output, hidden_state = model(char, hidden_state)
        loss = criterion(output, label)

        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()
        
        if (i + 1) % print_interval == 0:
            print(
                f"Epoch [{epoch + 1}/{num_epochs}], "
                f"Step [{i + 1}/{len(train_dataset)}], "
                f"Loss: {loss.item():.4f}"
            )

Epoch [1/10], Step [3000/18063], Loss: 4.3870
Epoch [1/10], Step [6000/18063], Loss: 0.6024
Epoch [1/10], Step [9000/18063], Loss: 1.7229
Epoch [1/10], Step [12000/18063], Loss: 3.7423
Epoch [1/10], Step [15000/18063], Loss: 3.2060
Epoch [1/10], Step [18000/18063], Loss: 0.2994
Epoch [2/10], Step [3000/18063], Loss: 0.2163
Epoch [2/10], Step [6000/18063], Loss: 0.2159
Epoch [2/10], Step [9000/18063], Loss: 0.0942
Epoch [2/10], Step [12000/18063], Loss: 2.0435
Epoch [2/10], Step [15000/18063], Loss: 0.0017
Epoch [2/10], Step [18000/18063], Loss: 0.3208
Epoch [3/10], Step [3000/18063], Loss: 3.2071
Epoch [3/10], Step [6000/18063], Loss: 0.0001
Epoch [3/10], Step [9000/18063], Loss: 0.0205
Epoch [3/10], Step [12000/18063], Loss: 0.0002
Epoch [3/10], Step [15000/18063], Loss: 6.1180
Epoch [3/10], Step [18000/18063], Loss: 1.4174
Epoch [4/10], Step [3000/18063], Loss: 4.9398
Epoch [4/10], Step [6000/18063], Loss: 0.3027
Epoch [4/10], Step [9000/18063], Loss: 0.2370
Epoch [4/10], Step [12000

In [12]:
num_correct = 0
num_samples = len(test_dataset)

model.eval()

with torch.no_grad():
    for name, label in test_dataset:
        hidden_state = model.init_hidden()
        for char in name:
            output, hidden_state = model(char, hidden_state)
        _, pred = torch.max(output, dim=1)
        num_correct += bool(pred == label)

print(f"Accuracy: {num_correct / num_samples * 100:.4f}%")

Accuracy: 81.3154%


In [13]:
def myrnn_predict(name):
    model.eval()
    tensor_name = name2tensor(name)
    with torch.no_grad():
        hidden_state = model.init_hidden()
        for char in tensor_name:
            output, hidden_state = model(char, hidden_state)
        _, pred = torch.max(output, dim=1)
    model.train()    
    return label2lang[pred.item()]

In [14]:
myrnn_predict("Mike")


'Japanese'

In [15]:
myrnn_predict("Qin")


'Chinese'

In [16]:
myrnn_predict("Slaveya")


'Czech'