In [3]:
import torch
from torch import nn
from torch.nn import functional as F
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
import unicodedata 
import string

In [6]:
# all_letters = string.ascii_letters + " .,;'"
# n_letters = len(all_letters)

In [4]:
# decrease dimension
all_letters = string.ascii_lowercase # use it
n_letters = len(all_letters)

In [5]:
# converting uncode surnames to ascii
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

def findFiles(path): return glob.glob(path)

In [6]:
category_names = {}
all_categories = []

In [7]:
def read_names(infile):
    with open(infile, encoding="utf-8") as file:
        lines = file.readlines()
        
    return [unicodeToAscii(line) for line in lines]
    
# # Read a file and split into lines
# def readLines(filename):
#     lines = open(filename, encoding='utf-8').read().strip().split('\n')
#     return [unicodeToAscii(line) for line in lines]

In [8]:
read_names("data/names/Spanish.txt")[:5]

['bana', 'bano', 'barca', 'baroa', 'bascal']

In [9]:
for filename in findFiles("data/names/*.txt"):
    cat = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(cat)
    names = read_names(filename)
    category_names[cat] = names


In [10]:
n_categories = len(all_categories)

In [11]:
# turning words to tensors

def letter_to_index(letter):
    return all_letters.index(letter)

def name_to_tensor(line):
    tensor = torch.zeros(len(line), 1, n_letters) # initially zero tensor; 1 batch at a time
    for ind, letter in enumerate(line):
        tensor[ind][0][letter_to_index(letter)] = 1
    return tensor

In [12]:
name_to_tensor("hello").size()

torch.Size([5, 1, 26])

In [13]:
class RNN(nn.Module):
    
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        
        self.hidden_size = hidden_size
        
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        # логарифм вероятности
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, input, hidden):
        
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.softmax(self.i2o(combined))
        
        return output, hidden
    
    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

In [14]:
n_hidden = 128

rnn = RNN(n_letters, n_hidden, n_categories)

In [15]:
def most_possible(output):
    top_cat, top_ind = torch.Tensor.topk(output, 1)
    cat_i = top_ind[0].item()
    
    return all_categories[cat_i], cat_i

In [92]:
# define measure of erros 
criterion = nn.CrossEntropyLoss() # or NLLoss

In [93]:
import random

def randomChoice(l):
    return l[random.randint(0, len(l) - 1)]

def randomTrainingExample():
    category = randomChoice(all_categories)
    line = randomChoice(category_names[category])
    category_tensor = torch.tensor([all_categories.index(category)], dtype=torch.long)
    line_tensor = name_to_tensor(line)
    return category, line, category_tensor, line_tensor

In [94]:
lr = 0.001 

def train(category_tensor, line_tensor, lr):
    # zero initialization
    hidden = rnn.initHidden()
    rnn.zero_grad()
    
    for i in range(line_tensor.size(0)):
        output, hidden = rnn(line_tensor[i], hidden)
    
    loss = criterion(output, category_tensor)
    loss.backward()
    
    for p in rnn.parameters():
        p.data.add_(-lr, p.grad.data)

    return output, loss.item()

In [95]:
n_epoches = 20 # it seems 100_000 not enough
losses = [] # in order to accumulate losses over training

for ep in range(n_epoches):
    cat, line, cat_tensor, line_tensor = randomTrainingExample()
    output, loss = train(cat_tensor, line_tensor, lr)

    losses.append(loss)
#     if ep % 999 == 0 and ep != 0:
#         guess, _ = most_possible(output)
#         print("epoch: {.3f}%, loss is {.3f}, name is {}, guess is {}, correct is {}".format((ep+1)*100/n_epoches, 
#                                                                                      loss,
#                                                                                      line,
#                                                                                      guess, 
#                                                                                      cat))

In [96]:
# Just return an output given a line
def evaluate(line_tensor):
    hidden = rnn.initHidden()

    for i in range(line_tensor.size(0)):
        output, hidden = rnn(line_tensor[i], hidden)

    return output

In [97]:
def predict(input_line, n_predictions=3):
    input_line = input_line.lower() # because i decrease dimension
    print('\n> %s' % input_line)
    with torch.no_grad():
        output = evaluate(name_to_tensor(input_line))

        # Get top N categories
        topv, topi = output.topk(n_predictions, 1, True)
        predictions = []

        for i in range(n_predictions):
            value = topv[0][i].item()
            category_index = topi[0][i].item()
            print('(%.2f) %s' % (value, all_categories[category_index]))
            predictions.append([value, all_categories[category_index]])

In [98]:
predict("hinton")


> hinton
(-1.51) English
(-1.66) Scottish
(-1.71) Irish
