### This is from a [tutorial](https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html) on the PyTorch website.

### In this notebook, we will be building and training a basic RNN to classify words. 
### This seems to be a good start for any of you who wish to learn how to implement RNN using PyTorch.
### With an RNN, the basic idea is that the network outputs a prediction and a 'hidden state' at each step, and feeds that hidden state into the next step.
### For this task, we'll train on a few thousand surnames from 18 languages of origin, and predict which language a name is from based on the spelling.
### We will use some helper code from the tutorial to convert the names from Unicode to ASCII and build a dictionary of lists of names per language.

In [2]:
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os

def findFiles(path):
    """Return the list of filesystem paths that match the glob pattern ``path``."""
    matches = glob.glob(path)
    return matches

# Root folder of the project; the names data set lives in <home>/data/names/.
# The default is the original author's local checkout. Set the PYTORCH_GUIDE_HOME
# environment variable to run the notebook from a different location without
# editing this cell.
home = os.path.join(
    os.environ.get(
        'PYTORCH_GUIDE_HOME',
        '/Users/kimhyunbin/Documents/Python/My own project (Python)/PyTorch_Guide/',
    ),
    '',  # guarantees a trailing separator, which `home + 'data/...'` relies on
)

# Sanity check: list the per-language name files that were found.
print(findFiles(home+'data/names/*.txt'))

import unicodedata
import string

# Alphabet the model works with: ASCII letters plus a few punctuation characters.
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Strip accents from ``s`` and drop every character not in ``all_letters``.

    The string is NFD-normalized so accented letters split into a base letter
    plus combining marks; the combining marks (category 'Mn') are discarded and
    only characters present in ``all_letters`` are kept.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            continue  # combining accent mark
        if ch not in all_letters:
            continue  # outside the supported alphabet
        kept.append(ch)
    return ''.join(kept)

print(unicodeToAscii('Ślusàrski'))

# Build the category_lines dictionary, a list of names per language
category_lines = {}
all_categories = []

# Read a file and split into lines
def readLines(filename):
    """Read a UTF-8 text file and return its lines converted to plain ASCII.

    Leading/trailing whitespace of the whole file is stripped before splitting
    on newlines, and every line is passed through ``unicodeToAscii``.

    Args:
        filename: path of the text file to read.

    Returns:
        A list with one ASCII string per line of the file.
    """
    # The context manager closes the file handle as soon as the content is
    # read instead of leaving it open until garbage collection.
    with open(filename, encoding='utf-8') as names_file:
        lines = names_file.read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

# Populate all_categories / category_lines from every names file under `home`.
# NOTE(review): the file listing printed earlier in this notebook is not
# alphabetical, so the index of each category in all_categories follows whatever
# order glob returns on this machine — confirm before relying on fixed indices.
for filename in findFiles(home+'data/names/*.txt'):
    # The category (language) is the file name without directory or extension.
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines

# Number of categories, i.e. one per names file that was read.
n_categories = len(all_categories)

['/Users/kimhyunbin/Documents/Python/My own project (Python)/PyTorch_Guide/data/names/Czech.txt', '/Users/kimhyunbin/Documents/Python/My own project (Python)/PyTorch_Guide/data/names/German.txt', '/Users/kimhyunbin/Documents/Python/My own project (Python)/PyTorch_Guide/data/names/Arabic.txt', '/Users/kimhyunbin/Documents/Python/My own project (Python)/PyTorch_Guide/data/names/Japanese.txt', '/Users/kimhyunbin/Documents/Python/My own project (Python)/PyTorch_Guide/data/names/Chinese.txt', '/Users/kimhyunbin/Documents/Python/My own project (Python)/PyTorch_Guide/data/names/Vietnamese.txt', '/Users/kimhyunbin/Documents/Python/My own project (Python)/PyTorch_Guide/data/names/Russian.txt', '/Users/kimhyunbin/Documents/Python/My own project (Python)/PyTorch_Guide/data/names/French.txt', '/Users/kimhyunbin/Documents/Python/My own project (Python)/PyTorch_Guide/data/names/Irish.txt', '/Users/kimhyunbin/Documents/Python/My own project (Python)/PyTorch_Guide/data/names/English.txt', '/Users/kimh

### We have *category_lines*, a dictionary mapping each category (language) to a list of lines (names).
### *all_categories* is just a list of languages and *n_categories* is the number of languages we have.

In [3]:
print(category_lines['Italian'][:5])

['Abandonato', 'Abatangelo', 'Abatantuono', 'Abate', 'Abategiovanni']


# Turning names into Tensors

### For this task, we would use a one-hot vector of size <1 x n_letters>.
### It would be a sparse matrix overall with binary representations of the presence of the respective letters.
### The eventual joined representation of a word would be a 3D tensor of shape <line_length x 1 x n_letters>.

In [7]:
import torch 

def letterToIndex(letter):
    """Return the position of ``letter`` in ``all_letters``, or -1 if absent."""
    position = all_letters.find(letter)
    return position

def letterToTensor(letter):
    """Encode a single letter as a one-hot tensor of size <1 x n_letters>.

    Args:
        letter: the character to encode.

    Returns:
        A ``torch`` tensor of zeros with a 1 at the letter's index. A character
        that is not part of ``all_letters`` yields an all-zero tensor.
    """
    tensor = torch.zeros(1, n_letters)
    index = letterToIndex(letter)
    # letterToIndex returns -1 for unknown characters; used as an index, -1
    # would silently mark the last column instead of "no letter".
    if index >= 0:
        tensor[0][index] = 1
    return tensor

def lineToTensor(word):
    """Encode a word as a tensor of size <len(word) x 1 x n_letters>.

    Each position along the first dimension holds the one-hot encoding of the
    corresponding letter.

    Args:
        word: the string to encode.

    Returns:
        A ``torch`` tensor of zeros with a single 1 per known letter. Letters
        that are not part of ``all_letters`` are left as all-zero rows.
    """
    tensor = torch.zeros(len(word), 1, n_letters)
    for li, letter in enumerate(word):
        index = letterToIndex(letter)
        # letterToIndex returns -1 for unknown characters; used as an index,
        # -1 would silently mark the last column instead of "no letter".
        if index >= 0:
            tensor[li][0][index] = 1
    return tensor

print(letterToTensor('J'))

print(lineToTensor('Jones').size())

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.]])
torch.Size([5, 1, 57])


### You can read this as follows: the word 'Jones' has 5 letters, and each letter is represented by a tensor with a single row and 57 columns, one column for every character in all_letters.

# Creating the Network

In [11]:
import torch.nn as nn 

class RNN(nn.Module):
    """Minimal recurrent cell built from two linear layers.

    At every step the current input is concatenated with the previous hidden
    state. One linear layer maps that to the next hidden state and another,
    followed by a log-softmax, maps it to the output scores.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()

        self.hidden_size = hidden_size

        # Both layers consume the concatenation [input, hidden].
        combined_size = input_size + hidden_size
        self.i2h = nn.Linear(combined_size, hidden_size)
        self.i2o = nn.Linear(combined_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one step; return ``(log-softmax output, next hidden state)``."""
        combined = torch.cat([input, hidden], dim=1)
        next_hidden = self.i2h(combined)
        log_probs = self.softmax(self.i2o(combined))
        return log_probs, next_hidden

    def initHidden(self):
        """Return an all-zero hidden state of size <1 x hidden_size>."""
        return torch.zeros(1, self.hidden_size)

# Size of the hidden state carried from one letter to the next.
n_hidden = 128
# Input size is the alphabet size, output size is the number of categories.
rnn = RNN(n_letters, n_hidden, n_categories)

### This network will be run as follows.
### We pass in an input (the tensor for the current letter) and the previous hidden state (initialized to zeros at first).
### We get back the output (the log-probability of each language) and the next hidden state (for the next time step).

In [16]:
input = letterToTensor('A')
hidden = torch.zeros(1, n_hidden)
output, next_hidden = rnn(input, hidden)
output.size(), next_hidden.size()

(torch.Size([1, 18]), torch.Size([1, 128]))

### In practice, we would convert the whole word into a tensor once with lineToTensor and feed it to the model one letter (slice) at a time.

In [17]:
input = lineToTensor('Albert')
hidden = torch.zeros(1, n_hidden)

output, next_hidden = rnn(input[0], hidden)
output

tensor([[-2.9647, -2.8111, -2.7743, -2.9133, -2.8510, -3.0001, -2.8762, -2.7799,
         -2.9853, -2.9864, -2.8899, -2.9542, -2.9563, -2.8634, -2.9135, -2.9159,
         -2.7841, -2.8538]], grad_fn=<LogSoftmaxBackward0>)

### The output is a tensor of log-probabilities, one per category (language).
### The higher the value (i.e. the closer to zero), the more likely it is that the name is from that specific language.

# Training

### Let's create a helper function to be able to interpret the output of the network.

In [18]:
def categoryFromOutput(input):
    """Translate a network output into its most likely category.

    Args:
        input: output tensor of the network, one score per category in its
            last dimension (the notebook passes a <1 x n_categories> tensor).

    Returns:
        A tuple ``(category_name, category_index)`` for the highest score.
    """
    # Read the argument, not the notebook-level `output` variable; otherwise
    # the result ignores whatever the caller passes in.
    _, top_i = input.topk(1)
    category_i = top_i[0].item()
    return all_categories[category_i], category_i

print(categoryFromOutput(output))

('Arabic', 2)
