In [13]:
import torch 
from torch import nn
from torch import optim
import matplotlib.pyplot as plt
import numpy as np
import requests
import pprint
from bs4 import BeautifulSoup
import re
import string

In [14]:
# Alphabet the encoder understands: the 26 lowercase ASCII letters.
all_letters = string.ascii_lowercase
# Alphabet size; passed to RNN(...) below as the input size.
n_letters = len(all_letters)

In [20]:
def read_words(infile):
    """Read a word list from a text file that holds one word per line.

    Args:
        infile: Path of the text file to read.

    Returns:
        list[str]: The non-empty lines of the file, in file order, with the
        trailing newline removed.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Pin the encoding: without it the result depends on the platform's
    # default locale encoding.
    with open(infile, encoding="utf-8") as file:
        stripped = (line.rstrip("\n") for line in file)
        # Skip blank lines: an empty "word" encodes to a zero-length tensor,
        # and the per-letter loops downstream never assign an output for it.
        return [word for word in stripped if word]

In [21]:
# Peek at the first four entries of the French word list.
read_words("data/french_words.txt")[:4]

['france', 'wikip', 'document', 'documentelement']

In [85]:
# Languages the classifier distinguishes. A language's position in this list
# is its class index (see all_langs.index(...) / all_langs[...] further down).
all_langs = ['english', 'spanish', 'croatian', 'french', 'german', 'polish', 'italian']
# Maps language name -> list of words; populated after the word files are read.
category_words = {}
# Number of languages.
n_langs = len(all_langs)

In [86]:
# Load one word list per language from the data directory.
(en_read, sp_read, cr_read, fr_read, ge_read, po_read, it_read) = [
    read_words(word_file)
    for word_file in (
        "data/english_words.txt",
        "data/spanish_words.txt",
        "data/croatian_words.txt",
        "data/french_words.txt",
        "data/german_words.txt",
        "data/polish_words.txt",
        "data/italian_words.txt",
    )
]


In [87]:
# Register each language's word list under its name, in the same order as
# all_langs.
category_words.update({
    'english': en_read,
    'spanish': sp_read,
    'croatian': cr_read,
    'french': fr_read,
    'german': ge_read,
    'polish': po_read,
    'italian': it_read,
})

In [88]:
# words to tensor

def letter_to_index(letter):
    """Return the 0-based position of ``letter`` within ``all_letters``.

    Raises:
        ValueError: Propagated from ``str.index`` when ``letter`` does not
            occur in ``all_letters``.
    """
    position = all_letters.index(letter)
    return position

In [89]:
def word_to_tensor(word):
    """One-hot encode ``word``, one row per character.

    Returns:
        torch.Tensor: Tensor of shape ``(len(word), 1, len(all_letters))``
        that is zero everywhere except for a single 1 per character, placed
        at that character's index in ``all_letters``.
    """
    alphabet_size = len(all_letters)
    encoded = torch.zeros(len(word), 1, alphabet_size)
    for position, char in enumerate(word):
        encoded[position, 0, letter_to_index(char)] = 1
    return encoded

In [90]:
# Shape check: expect (number of characters, 1, alphabet size).
word_to_tensor("hello").size()

torch.Size([5, 1, 26])

In [91]:
class RNN(nn.Module):
    """Minimal recurrent cell that is applied to one sequence element per call.

    Each call concatenates the current input with the previous hidden state
    and feeds that through two independent linear layers: one produces the
    next hidden state, the other produces log-probabilities over the output
    classes.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the two linear layers and the log-softmax.

        Args:
            input_size: Width of one input row.
            hidden_size: Width of the hidden state.
            output_size: Number of output classes.
        """
        super().__init__()
        self.hidden_size = hidden_size

        combined_size = input_size + hidden_size
        self.i2h = nn.Linear(combined_size, hidden_size)
        self.i2o = nn.Linear(combined_size, output_size)
        self.soft = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Advance the cell by one step.

        Args:
            input: Tensor whose dim 1 has ``input_size`` entries.
            hidden: Hidden state from the previous step (or ``init_hidden()``).

        Returns:
            tuple: ``(log_probs, next_hidden)`` where ``log_probs`` is the
            log-softmax (over dim 1) of the output layer.
        """
        joined = torch.cat([input, hidden], dim=1)
        next_hidden = self.i2h(joined)
        log_probs = self.soft(self.i2o(joined))
        return log_probs, next_hidden

    def init_hidden(self):
        """Return an all-zero hidden state of shape ``(1, hidden_size)``."""
        return torch.zeros(1, self.hidden_size)

In [92]:
# Width of the hidden state carried from one letter to the next.
n_hidden = 128

# Input size = alphabet size; output size = number of languages.
net = RNN(n_letters, n_hidden, len(all_langs))

In [93]:

def most_possible(output):
    """Pick the highest-scoring language from a network output.

    Args:
        output: Score tensor as returned by the network; only the first row
            is consulted.

    Returns:
        tuple: ``(language_name, class_index)`` for the top-scoring entry.
    """
    _, best_indices = output.topk(1)
    best = best_indices[0].item()
    return all_langs[best], best

In [94]:
# The network's forward() already ends in LogSoftmax, so its outputs are
# log-probabilities. NLLLoss is the criterion that consumes log-probabilities;
# CrossEntropyLoss expects raw scores and would apply log-softmax a second
# time (harmless numerically, but redundant and misleading).
criterion = nn.NLLLoss()

In [95]:
import random

def randomChoice(l):
    """Return one element of the sequence ``l``, chosen uniformly at random.

    Raises:
        ValueError: If ``l`` is empty.
    """
    picked = random.randrange(len(l))
    return l[picked]

def randomTrainingExample():
    """Draw one random training pair.

    A language is drawn first, then a word from that language's list.

    Returns:
        tuple: ``(category, word, category_tensor, word_tensor)`` where
        ``category_tensor`` is a one-element long tensor holding the
        language's index in ``all_langs`` and ``word_tensor`` is the result
        of ``word_to_tensor(word)``.
    """
    category = randomChoice(all_langs)
    word = randomChoice(category_words[category])

    label = all_langs.index(category)
    category_tensor = torch.tensor([label], dtype=torch.long)

    return category, word, category_tensor, word_to_tensor(word)

In [96]:
# Step size for the manual gradient-descent update performed in train().
lr = 0.001

def train(category_tensor, word_tensor, lr):
    """Run one gradient-descent step on a single (word, language) example.

    The word is fed to ``net`` one row at a time; only the output of the
    final step is scored against the target.

    Args:
        category_tensor: Target class index tensor for ``criterion``.
        word_tensor: Encoded word; iterated along dim 0, one row per step.
        lr: Learning rate applied to every parameter's gradient.

    Returns:
        tuple: ``(output, loss_value)`` -- the network output after the last
        step and the loss as a Python float.

    Raises:
        ValueError: If ``word_tensor`` has no rows.
    """
    # Without at least one step there is no output to score.
    if word_tensor.size(0) == 0:
        raise ValueError("word_tensor is empty; cannot train on a zero-length word")

    hidden = net.init_hidden()

    net.zero_grad()

    for i in range(word_tensor.size(0)):
        output, hidden = net(word_tensor[i], hidden)

    loss = criterion(output, category_tensor)

    loss.backward()

    # Plain SGD: p <- p - lr * grad. The keyword form of add_ replaces the
    # deprecated positional add_(alpha, tensor) overload; no_grad keeps the
    # in-place update out of the autograd graph.
    with torch.no_grad():
        for p in net.parameters():
            if p.grad is not None:
                p.add_(p.grad, alpha=-lr)

    return output, loss.item()

In [101]:
# NOTE: despite the name, n_epoches counts single-example updates, not passes
# over the data set -- each iteration trains on one randomly drawn word.
n_epoches = 1000
# Loss value of every training step, in order.
losses = []

for ep in range(n_epoches):
    cat, word, cat_tensor, word_tensor = randomTrainingExample()
    out, loss = train(cat_tensor, word_tensor, lr)
    
    losses.append(loss)

In [102]:
def evaluate(word_tensor):
    """Feed an encoded word through ``net`` and return the final output.

    Starts from a fresh hidden state and applies the network once per row of
    ``word_tensor``; the output of the last step is returned. No parameters
    are updated.
    """
    hidden = net.init_hidden()

    steps = word_tensor.size(0)
    for step in range(steps):
        output, hidden = net(word_tensor[step], hidden)

    return output

In [103]:
def predict(input_word, n_predictions=2):
    """Print and return the most likely languages for a word.

    Args:
        input_word: Word to classify; it is lower-cased before encoding.
        n_predictions: How many top languages to report. Capped at the
            number of classes the network outputs.

    Returns:
        list: ``[score, language]`` pairs, best first. ``score`` is the
        network's output value for that language (a log-probability, since
        the network ends in a log-softmax).
    """
    input_word = input_word.lower() # to lowercase

    print('\n> %s' % input_word)

    with torch.no_grad():
        output = evaluate(word_to_tensor(input_word))

    # topk raises when asked for more entries than there are classes.
    k = min(n_predictions, output.size(1))
    topv, topi = output.topk(k, 1, True)

    predictions = []
    for i in range(k):
        value = topv[0][i].item()
        category_index = topi[0][i].item()
        print('(%.2f) %s' % (value, all_langs[category_index]))
        predictions.append([value, all_langs[category_index]])

    # Previously the list was built and then dropped; hand it to the caller.
    return predictions

In [113]:
# Spot check on a known English word. The recorded scores (-1.84 .. -1.96)
# sit close to log(1/7) ~= -1.95, i.e. the output is almost uniform over the
# seven languages.
predict("english", n_predictions=5)


> english
(-1.84) italian
(-1.88) croatian
(-1.94) french
(-1.96) english
(-1.96) polish


In [109]:
# Number of entries in the English word list.
len(en_read)

11122

In [114]:
# Number of entries in the Italian word list.
len(it_read)

17225