Refer to this [article](https://jaketae.github.io/study/pytorch-rnn/).


# Step 1. Download Data

In [1]:
import os
import random
from string import ascii_letters

import torch
from torch import nn
import torch.nn.functional as F
from unidecode import unidecode

_ = torch.manual_seed(42)
device = torch.device(
    "cuda" if torch.cuda.is_available() 
    else "cpu"
)


In [23]:
# Step 1.1.
# Prepare labels
data_dir = "./data/data/names"



lang2label = {
    file_name.split(".")[0]: torch.tensor([i], dtype=torch.long)
    for i, file_name in enumerate(os.listdir(data_dir))
}

num_langs = len(lang2label)

display(lang2label)
print(f'num_langs: {num_langs}')

{'Czech': tensor([0]),
 'German': tensor([1]),
 'Arabic': tensor([2]),
 'Japanese': tensor([3]),
 'Chinese': tensor([4]),
 'Vietnamese': tensor([5]),
 'Russian': tensor([6]),
 'French': tensor([7]),
 'Irish': tensor([8]),
 'English': tensor([9]),
 'Spanish': tensor([10]),
 'Greek': tensor([11]),
 'Italian': tensor([12]),
 'Portuguese': tensor([13]),
 'Scottish': tensor([14]),
 'Dutch': tensor([15]),
 'Korean': tensor([16]),
 'Polish': tensor([17])}

num_langs: 18


# Step 2. Preprocessing

In [24]:
unidecode("Ślusàrski")


'Slusarski'

In [77]:
char2idx = {
    letter: i 
    for i, letter in enumerate(ascii_letters + " .,:;-'")
}

idx2char = {
    i: letter
    for letter, i in char2idx.items()
}

num_letters = len(char2idx); 


print(f'num_letters: {num_letters}')

num_letters: 59


In [32]:
# Step 3.1
# Convert name string to a list one-hot encoding
def name2tensor(name):
    tensor = torch.zeros(len(name), 1, num_letters)
    for i, char in enumerate(name):
        tensor[i][0][char2idx[char]] = 1
    return tensor

name2tensor('abc')

tensor([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0.]]])

# Step 4. Dataset Creation


In [92]:
# Step 4.1
# From files
# Prepare name tenor and label tensor

tensor_names = []
target_langs = []

for file in os.listdir(data_dir):
    with open(os.path.join(data_dir, file)) as f:
        lang = file.split(".")[0]
        names = [unidecode(line.rstrip()) for line in f]
        for name in names:
            try:
                tensor_names.append(name2tensor(name))
                target_langs.append(lang2label[lang])
            except KeyError:
                pass


In [91]:
# Step 4.2
# split tensors into 
# - training set 
# - and test set

from sklearn.model_selection import train_test_split

train_idx, test_idx = train_test_split(
    range(len(target_langs)), 
    test_size=0.1, 
    shuffle=True, 
    stratify=target_langs
)

train_dataset = [
    (tensor_names[i], target_langs[i])
    for i in train_idx
]

test_dataset = [
    (tensor_names[i], target_langs[i])
    for i in test_idx
]



In [115]:
# Step 4.3 
# Reconstruct 1 name string from a list of one-hot encodings

name = ''
for vector in train_dataset[0][0]:
    
    
    idx = torch.multinomial(
        vector[0],
        num_samples=1, 
        replacement=True).item()
    name+=idx2char[idx]
    
print(name)




Dale
