In [13]:
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os
import random
import time
import math
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from matplotlib import pyplot as plt
import pandas as pd
import string
import unicodedata
import torch.nn as nn
from collections import defaultdict

In [14]:
def parseCSV(file):
    """Read the CSV at ``file`` with pandas and hand back the DataFrame.

    ``file`` is forwarded unchanged to ``pd.read_csv``, so anything that
    function accepts (a path or a file-like object) works here.
    """
    return pd.read_csv(file)

In [15]:
# Load the dataset as a list of rows (one list of cell values per CSV row).
csv_data = parseCSV("language_dataset.csv").to_dict('split')['data']
# Map each category (column 1 of a row) to the list of lines (column 0)
# that belong to it.
category_lines = defaultdict(list)
for entry in csv_data:
    category_lines[entry[1]].append(entry[0])

# Category names in the order they first appear in the CSV; a category's
# position in this list is the class index used for training targets.
all_categories = list(category_lines.keys())

n_categories = len(all_categories)

hi


In [4]:
# The model's alphabet: ASCII letters plus a handful of punctuation marks.
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

# Find letter index from all_letters, e.g. "a" = 0
def letterToIndex(letter):
    """Return the position of ``letter`` in ``all_letters``, or -1 if absent.

    Characters outside the alphabet (accented letters such as "é" or "ō",
    brackets, digits, ...) yield -1. Callers must not use that value as a
    tensor index: -1 addresses the LAST column, i.e. the slot of "'".
    """
    return all_letters.find(letter)

# TODO: might need to use char2vec (learned character embeddings) instead of
# one-hot vectors.

# Just for demonstration, turn a letter into a <1 x n_letters> Tensor
def letterToTensor(letter):
    """One-hot encode a single character as a <1 x n_letters> tensor.

    A character that is not in ``all_letters`` produces an all-zero row
    instead of being silently aliased to the last alphabet entry.
    """
    tensor = torch.zeros(1, n_letters)
    index = letterToIndex(letter)
    if index >= 0:  # -1 means "unknown"; indexing with it would hit column -1
        tensor[0][index] = 1
    return tensor

# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def lineToTensor(line):
    """One-hot encode every character of ``line``.

    Returns a <len(line) x 1 x n_letters> tensor. Characters outside
    ``all_letters`` keep an all-zero vector, so they stay distinguishable
    from a genuine apostrophe (the last alphabet entry). An empty ``line``
    yields a tensor with zero rows.
    """
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        index = letterToIndex(letter)
        if index >= 0:  # skip unknown characters rather than wrap to -1
            tensor[li][0][index] = 1
    return tensor

# Sanity check: a single letter encodes to one <1 x n_letters> row.
print(letterToTensor('J'))

# A 5-character line encodes to a <5 x 1 x n_letters> tensor.
print(lineToTensor('Jones').size())

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.]])
torch.Size([5, 1, 57])


In [5]:
def categoryFromOutput(output):
    """Translate a network output row into ``(category_name, category_index)``.

    The highest-scoring entry of ``output`` is selected with ``topk(1)`` and
    its index is looked up in ``all_categories``.
    """
    _, best_indices = output.topk(1)
    winner = best_indices[0].item()
    return all_categories[winner], winner

def randomChoice(l):
    """Return one element of the sequence ``l`` drawn uniformly at random."""
    last_position = len(l) - 1
    position = random.randint(0, last_position)  # both ends inclusive
    return l[position]

def randomTrainingExample():
    """Draw one random (category, line) pair and its tensor encodings.

    A category is picked first, then a line within that category.
    Returns ``(category, line, category_tensor, line_tensor)`` where
    ``category_tensor`` is a 1-element long tensor holding the category's
    index in ``all_categories`` and ``line_tensor`` is ``lineToTensor(line)``.
    """
    category = randomChoice(all_categories)
    line = randomChoice(category_lines[category])
    label = all_categories.index(category)
    return (
        category,
        line,
        torch.tensor([label], dtype=torch.long),
        lineToTensor(line),
    )

In [6]:
class RNN(nn.Module):
    """Minimal recurrent classifier built from two linear layers.

    Each step concatenates the current input with the previous hidden state
    and feeds that vector to two heads: ``i2h`` produces the next hidden
    state (no activation is applied to it) and ``i2o`` produces class scores
    that are turned into log-probabilities by ``LogSoftmax``.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size

        # Both heads consume the concatenated [input, hidden] vector.
        joint_size = input_size + hidden_size
        self.i2h = nn.Linear(joint_size, hidden_size)
        self.i2o = nn.Linear(joint_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one time step; returns ``(log_probs, next_hidden)``."""
        joint = torch.cat([input, hidden], dim=1)
        next_hidden = self.i2h(joint)
        log_probs = self.softmax(self.i2o(joint))
        return log_probs, next_hidden

    def initHidden(self):
        """Return the all-zero <1 x hidden_size> starting hidden state."""
        return torch.zeros(1, self.hidden_size)

# Width of the recurrent hidden state.
n_hidden = 128
# One input unit per alphabet character, one output unit per category.
rnn = RNN(input_size=n_letters, hidden_size=n_hidden, output_size=n_categories)

In [7]:
def train_iteration_CharRNN(learning_rate, category_tensor, line_tensor, model=None):
    """Run one plain-SGD training step on a single example.

    Args:
        learning_rate: step size used for the manual parameter update.
        category_tensor: 1-element long tensor holding the target class index.
        line_tensor: per-character input; it is indexed along its first
            dimension and each slice is fed to the model in order.
        model: network to train. Defaults to the notebook-level ``rnn`` so
            existing three-argument calls keep working.

    Returns:
        ``(output, loss_value)`` where ``output`` is the model output after
        the last character and ``loss_value`` is the NLL loss as a float.

    Raises:
        ValueError: if ``line_tensor`` has no rows; there would be no output
            to compute a loss on.
    """
    net = rnn if model is None else model

    n_steps = line_tensor.size()[0]
    if n_steps == 0:
        raise ValueError("line_tensor must contain at least one character")

    criterion = nn.NLLLoss()
    hidden = net.initHidden()
    net.zero_grad()

    # The forward process: feed the characters one by one, carrying the
    # hidden state from step to step.
    for i in range(n_steps):
        output, hidden = net(line_tensor[i], hidden)

    # The backward process
    loss = criterion(output, category_tensor)
    loss.backward()

    # Manual SGD update: p <- p - learning_rate * grad.
    with torch.no_grad():
        for p in net.parameters():
            # A gradient is None when the parameter did not contribute to the
            # loss, e.g. the hidden-state layer on a one-character line,
            # because the final hidden state is discarded.
            if p.grad is None:
                continue
            # Keyword form; the positional add_(alpha, tensor) overload is deprecated.
            p.add_(p.grad, alpha=-learning_rate)

    return output, loss.item()


def train_charRNN(n_iters, learning_rate):
    """Train the notebook-level ``rnn`` on ``n_iters`` random examples.

    Every 1000 iterations a progress line (iteration, percent done, elapsed
    time, last loss, sampled line, guess, correctness mark) and the average
    loss since the previous report are printed. When the loop finishes the
    whole ``rnn`` object is saved to 'char-rnn-classification.pt'.
    """
    print_every = 1000

    def timeSince(since):
        # Elapsed wall-clock time formatted as minutes and seconds.
        elapsed = time.time() - since
        minutes = math.floor(elapsed / 60)
        seconds = elapsed - minutes * 60
        return '%dm %ds' % (minutes, seconds)

    start = time.time()
    running_loss = 0

    for step in range(1, n_iters + 1):
        category, line, category_tensor, line_tensor = randomTrainingExample()
        output, loss = train_iteration_CharRNN(learning_rate, category_tensor, line_tensor)
        running_loss += loss

        if step % print_every != 0:
            continue

        # Report: iter number, loss, name and guess
        guess, _ = categoryFromOutput(output)
        if guess == category:
            correct = '✓'
        else:
            correct = '✗ (%s)' % category
        report = (step, step / n_iters * 100, timeSince(start), loss, line, guess, correct)
        print('%d %d%% (%s) %.4f %s / %s %s' % report)
        print('Average loss: %.4f' % (running_loss / print_every))
        running_loss = 0

    torch.save(rnn, 'char-rnn-classification.pt')

In [8]:
def predict(input_line, n_predictions=7):
    """Print and return the top category predictions for ``input_line``.

    Args:
        input_line: text to classify; must contain at least one character.
        n_predictions: how many of the best categories to report. Values
            larger than the number of model outputs are clamped, so asking
            for more categories than exist no longer raises.

    Returns:
        A list of ``[score, category_name]`` pairs, best first, where
        ``score`` is the raw value from the model output (a log-probability
        when the model ends in ``LogSoftmax``, as ``RNN`` does).

    Raises:
        ValueError: if ``input_line`` is empty.
    """
    # Encode the input line the same way training examples are encoded.
    line_tensor = lineToTensor(input_line)
    n_steps = line_tensor.size()[0]
    if n_steps == 0:
        raise ValueError("input_line must contain at least one character")

    print("Predition for %s:" % input_line)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        hidden = rnn.initHidden()
        for i in range(n_steps):
            output, hidden = rnn(line_tensor[i], hidden)

    # Value and index of the top-k predictions (largest first).
    k = min(n_predictions, output.size()[1])
    topv, topi = output.topk(k, 1, True)
    # The scores are already log-probabilities, so exp() recovers the actual
    # probabilities. Applying LogSoftmax again would print negative numbers.
    top_prob = topv.exp()

    predictions = []
    for i in range(k):
        value = topv[0][i].item()
        prob = top_prob[0][i].item()
        category_index = topi[0][i].item()
        print('%s Probability: (%.2f), Score: (%.2f)' % (all_categories[category_index], prob, value))
        predictions.append([value, all_categories[category_index]])
    return predictions

In [11]:
# Train on 15000 randomly sampled examples with learning rate 0.005, then
# classify a sample word; the cell's displayed value is predict()'s return.
train_charRNN(15000, 0.005)
predict("event")

1000 6% (0m 1s) 1.2088 Lass / German ✗ (Simlish)
Average loss: 0.9619
2000 13% (0m 3s) 1.0212 caccia / Spanish ✗ (Italian)
Average loss: 0.9289
3000 20% (0m 5s) 0.0407 Sugnorg / Simlish ✓
Average loss: 0.9911
4000 26% (0m 7s) 0.2714 vetro / Italian ✓
Average loss: 0.8501
5000 33% (0m 9s) 0.0282 wō / Tolkien Elvish ✓
Average loss: 0.8927
6000 40% (0m 10s) 0.3136 joy / English ✓
Average loss: 0.8589
7000 46% (0m 12s) 0.0097 kirtē / Tolkien Elvish ✓
Average loss: 0.9140
8000 53% (0m 14s) 0.5481 énergie / French ✓
Average loss: 0.8555
9000 60% (0m 16s) 0.5057 vestido / Spanish ✓
Average loss: 0.8393
10000 66% (0m 18s) 1.3809 besoin / English ✗ (French)
Average loss: 0.9118
11000 73% (0m 19s) 0.5760 queue / French ✓
Average loss: 0.9134
12000 80% (0m 21s) 0.0007 wegō(n) / Tolkien Elvish ✓
Average loss: 0.9036
13000 86% (0m 23s) 2.0227 Firma / Simlish ✗ (German)
Average loss: 0.8203
14000 93% (0m 25s) 0.2864 chick / English ✓
Average loss: 0.8867
15000 100% (0m 27s) 0.2091 sapnā / Tolkien El

[[-0.5009896159172058, 'French'],
 [-1.315126895904541, 'English'],
 [-2.6837406158447266, 'Italian'],
 [-3.3664541244506836, 'Spanish'],
 [-4.228370666503906, 'German'],
 [-4.813390254974365, 'Tolkien Elvish'],
 [-9.082365036010742, 'Simlish']]