In [1]:
from __future__ import unicode_literals, print_function, division
import os, glob, random
from io import open
import unicodedata, string
# thanks to https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html

In [2]:
def find_files(path): 
    return glob.glob(path)
print(find_files('../dataset/data/names/*.txt'))

def unicode_to_ascii(s, all_letters):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn' and c in all_letters
    )

all_letters = string.ascii_letters + ".,;''"
n_letters = len(all_letters)
print(unicode_to_ascii('Ślusàrski', all_letters))

['../dataset/data/names/Czech.txt', '../dataset/data/names/German.txt', '../dataset/data/names/Arabic.txt', '../dataset/data/names/Japanese.txt', '../dataset/data/names/Chinese.txt', '../dataset/data/names/Vietnamese.txt', '../dataset/data/names/Russian.txt', '../dataset/data/names/French.txt', '../dataset/data/names/Irish.txt', '../dataset/data/names/English.txt', '../dataset/data/names/Spanish.txt', '../dataset/data/names/Greek.txt', '../dataset/data/names/Italian.txt', '../dataset/data/names/Portuguese.txt', '../dataset/data/names/Scottish.txt', '../dataset/data/names/Dutch.txt', '../dataset/data/names/Korean.txt', '../dataset/data/names/Polish.txt']
Slusarski


In [3]:
import collections


def read_lines(filename):
    f = open(filename, encoding='utf-8')
    content = f.read().strip().split('\n')  # good trick to make document into list, strip() remove leading and trailing space.  
    res = [unicode_to_ascii(name, all_letters) for name in content]
    return res

def categorize(path = '../dataset/data/names/*.txt'):
    category_lines = collections.defaultdict(list)
  
    pathes = find_files(path)
    for filename in pathes:
        basename = os.path.basename(filename)      # output Chinese.txt, this is removing leading directory. 
        category = os.path.splitext(basename)[0]   # this splits the filename and the extension. 
        all_categories.append(category)
        lines = read_lines(filename)
        category_lines[category] = lines
    return category_lines
all_categories = []
category_lines = categorize()
# print(category_lines)
n_categories = len(all_categories)
print(n_categories)                               
print(category_lines["Italian"][:5]) 
print(all_categories)

18
['Abandonato', 'Abatangelo', 'Abatantuono', 'Abate', 'Abategiovanni']
['Czech', 'German', 'Arabic', 'Japanese', 'Chinese', 'Vietnamese', 'Russian', 'French', 'Irish', 'English', 'Spanish', 'Greek', 'Italian', 'Portuguese', 'Scottish', 'Dutch', 'Korean', 'Polish']


In [4]:
import torch
def letter2idx(letter): return all_letters.find(letter)
def letter2tensor(letter):
    tensor = torch.zeros(1, n_letters)
    tensor[0][letter2idx(letter)] = 1
    return tensor
def line2tensor(line):
    tensor = torch.zeros(len(line), 1, n_letters)  # tensor treate everything with batch
    for i, char in enumerate(line):
        tensor[i][0][letter2idx(char)] = 1
    return tensor

print(letter2tensor('J'))
print(line2tensor("adfadf"))



tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.]])
tensor([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.

In [5]:
import torch.nn as nn
activations = nn.ModuleDict([
    ['lrelu', nn.LeakyReLU()],
    ['relu', nn.ReLU()],
    ['selu', nn.SELU(inplace=True)],
    ['tanh', nn.Tanh()],
    ['softmax', nn.LogSoftmax(dim=1)],
    ['none', nn.Identity()],
])

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
#         self.activation = nn.LogSoftmax(dim=1)
        self.activation = activations['softmax']
    
    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined) 
        output = self.i2o(combined)
        output = self.activation(output)
        return output, hidden
    
    def initHidden(self):
        return torch.zeros(1, self.hidden_size)
    
n_hidden = 128
rnn = RNN(n_letters, n_hidden, n_categories)
# input = letter2tensor('A')
# hidden = torch.zeros(1, n_hidden)
# output, next_hidden = rnn(input, hidden)
# print(output, next_hidden)

input = line2tensor('Albert')
hidden = torch.zeros(1, n_hidden)
output, next_hidden = rnn(input[0], hidden)
print(output, next_hidden)




tensor([[-2.8094, -2.9531, -2.8433, -2.9077, -2.8186, -2.9677, -2.7931, -2.9278,
         -2.8185, -2.8538, -2.9698, -2.8808, -2.9851, -2.7302, -2.9611, -2.9959,
         -2.9263, -2.9360]], grad_fn=<LogSoftmaxBackward>) tensor([[ 0.0700, -0.0099, -0.0380, -0.0079,  0.0454, -0.0385, -0.0488, -0.0493,
         -0.0119, -0.0605, -0.0175, -0.0221, -0.0409, -0.0326, -0.0334,  0.0094,
          0.0897, -0.0034, -0.0822, -0.0094,  0.0217,  0.0517,  0.0023,  0.0040,
          0.0338, -0.1036,  0.0050, -0.0205, -0.0647,  0.0565, -0.0561,  0.0600,
          0.0065, -0.0128, -0.0742, -0.0069, -0.0237,  0.0272,  0.0834,  0.1100,
         -0.0913,  0.0543, -0.0183, -0.0182, -0.1311, -0.0282, -0.1155, -0.0439,
          0.1229,  0.0881, -0.0957, -0.0043, -0.1153,  0.0068,  0.0647,  0.0514,
         -0.0083,  0.0761,  0.1220,  0.1306,  0.0141, -0.0938, -0.0024, -0.1040,
         -0.0674, -0.0522, -0.0859, -0.0322,  0.0233, -0.0327, -0.0321, -0.0084,
          0.0194, -0.0675,  0.0229, -0.0165,  0.06

In [6]:
def category_from_output(output):
    top_n, top_i = output.topk(1)
    category_i = top_i[0].item()
    return all_categories[category_i], category_i
print(category_from_output(output))

def random_choice(arr):
    return arr[random.randint(0, len(arr) - 1)]

def get_training_sample():
    category = random_choice(all_categories)
    line = random_choice(category_lines[category])
    line_tensor = line2tensor(line)
    category_tensor = torch.tensor([all_categories.index(category)], dtype=torch.long)
    return category, line, category_tensor, line_tensor

for i in range(10):
    category, line, category_tensor, line_tensor = get_training_sample()
    print('category =', category, '/ line = ', line )

('Portuguese', 13)
category = Japanese / line =  Kijimuta
category = Chinese / line =  Jing
category = Japanese / line =  Yamada
category = German / line =  Bumgarner
category = French / line =  Sauvage
category = Spanish / line =  Araullo
category = Greek / line =  Antimisiaris
category = French / line =  Segal
category = Scottish / line =  Mclean
category = Korean / line =  So


In [11]:
criterion = nn.NLLLoss()
learning_rate = 0.005
optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)


def train(line_tensor, category_tensor):
    hidden = rnn.initHidden()   # note: hidden layer needs to initilize each time trian with a word, get each letter and pass sequencly to with hidden layer. 
    rnn.train()
#     rnn.zero_grad()  # note, need to zero_grad outside the for loop, one zero when finish one word
    optimizer.zero_grad()
    for i in range(line_tensor.size()[0]):  # number of letters in this word
        output, hidden = rnn(line_tensor[i], hidden)
    
    loss = criterion(output, category_tensor)
    loss.backward()
    
#     for p in rnn.parameters():   # these two line can replace optimizer
#         p.data.add_(p.grad.data, alpha=-learning_rate)
    
    optimizer.step()
    return output, loss.item()

import time, math
n_iters = 100000
print_every = 500
plot_every = 100

def time_elapse(since):
    now = time.time()
    s = now - since
    m = math.floor(s/60)
    s -= m * 60
    return '%dm %ds' %(m, s)
current_loss = 0
all_losses = []
start = time.time()

for iter in range(1, n_iters + 1):
    
    category, line, category_tensor, line_tensor = get_training_sample()
    output, loss = train(line_tensor, category_tensor)
    current_loss += loss
    
    if iter % print_every == 0:
        pred, pred_idx = category_from_output(output)
        correct = '✓' if pred == category else '✗ (%s)' % category
        print('%d %d%% (%s) %.4f %s / %s %s' % (iter, iter / n_iters * 100, 
                    time_elapse(start), loss, line, pred, correct))
        
    if iter % plot_every == 0:
        all_losses.append(current_loss / plot_every)
        current_loss = 0
        
    
    

500 0% (0m 1s) 40.3232 Wray / Polish ✗ (English)
1000 1% (0m 3s) 272931.8750 Sternberg / Japanese ✗ (German)
1500 1% (0m 5s) 398.3838 Honda / Dutch ✗ (Japanese)
2000 2% (0m 6s) 11612.6338 Legrand / Arabic ✗ (French)
2500 2% (0m 8s) 262.2949 Gosse / Russian ✗ (French)
3000 3% (0m 10s) 6402746.0000 Awinowitski / Dutch ✗ (Russian)
3500 3% (0m 11s) 11985252.0000 Lichtenberg / Dutch ✗ (German)
4000 4% (0m 13s) 10.3745 Han / French ✗ (Chinese)
4500 4% (0m 15s) 525.7008 Doble / German ✗ (English)
5000 5% (0m 17s) 8.2079 Phi / Chinese ✗ (Vietnamese)
5500 5% (0m 18s) 595939.4375 Ishinomori / Portuguese ✗ (Japanese)
6000 6% (0m 20s) 0.0000 Wen / Chinese ✓
6500 6% (0m 22s) 0.0000 Romao / Portuguese ✓
7000 7% (0m 23s) 10049.0117 Agthoven / Chinese ✗ (Dutch)
7500 7% (0m 25s) 319.6494 Simon / Polish ✗ (French)
8000 8% (0m 27s) 3254420.0000 Goldschmidt / Greek ✗ (German)
8500 8% (0m 28s) 1668.0435 Minami / Dutch ✗ (Japanese)
9000 9% (0m 31s) 18771.1133 Hamilton / English ✗ (Scottish)
9500 9% (0m 33s)

73500 73% (4m 34s) 238.4543 Kaluza / Dutch ✗ (Polish)
74000 74% (4m 36s) 930.2970 Abreu / Spanish ✗ (Portuguese)
74500 74% (4m 38s) 0.0000 Varela / Spanish ✓
75000 75% (4m 39s) 16.7465 Baz / Vietnamese ✗ (Arabic)
75500 75% (4m 41s) 12850.8525 Hakimi / Spanish ✗ (Arabic)
76000 76% (4m 43s) 51549.9727 Tritten / Greek ✗ (German)
76500 76% (4m 45s) 8610.1426 Kruger / Dutch ✗ (German)
77000 77% (4m 46s) 0.0000 Aart / Dutch ✓
77500 77% (4m 49s) 84.9270 Chun / English ✗ (Korean)
78000 78% (4m 50s) 149946.9844 Jeronkin / Italian ✗ (Russian)
78500 78% (4m 52s) 67163912.0000 Bartolomei / English ✗ (Italian)
79000 79% (4m 54s) 83245.5469 Michel / English ✗ (Spanish)
79500 79% (4m 56s) 45953636.0000 Breitbarth / Polish ✗ (German)
80000 80% (4m 57s) 185.6261 Gong / Arabic ✗ (Chinese)
80500 80% (4m 59s) 83187.9922 Salazar / Dutch ✗ (Portuguese)
81000 81% (5m 1s) 13333.8623 Piontek / Dutch ✗ (Polish)
81500 81% (5m 2s) 389.0654 Negri / English ✗ (Italian)
82000 82% (5m 4s) 11604.2930 Salomon / Dutch ✗

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
# print(all_losses)
plt.figure()
plt.plot(all_losses)
plt.imshow()

In [10]:
confusion = torch.zeros(n_categories, n_categories)
n_confusion = 10000

def evalute(line_tensor):
    hidden = rnn.initHidden()
    rnn.eval()
    with torch.no_grad():
        for i in range(line_tensor.size()[0]):
            output, hidden = rnn(line_tensor[i], hidden)
    return output

for i in range(n_confusion):
    category, line, category_tensor, line_tensor = get_training_sample()
    output = evalute(line_tensor)
#     guess, guess_i = category_from_output(output)
#     category_i = all_categories.index(category)
#     confusion[category_i][guess_i] += 1
    
# for i in range(n_categoreis):
#     confusion[i] = confusion[i] / confusion[i].sum()
    
# fig = plt.figure()
# ax = fig.add_subplot(111)
# cax = ax.matshow(confusion.numpy())
# fig.colorbar(cax)
# ax.set_xticklabels([''] + all_categories, rotation=90)
# ax.set_yticklabels([''] + all_categories)

# # Force label at every tick
# ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
# ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

# # sphinx_gallery_thumbnail_number = 2
# plt.show()
    
    

SyntaxError: invalid syntax (<ipython-input-10-05de63a09154>, line 7)