### 1. 文本的基本处理

In [1]:
from io import open
import glob
import os
import unicodedata
import string
import torch
import torch.nn as nn
import random
import time
import math
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# List every file on disk whose path matches the glob pattern.
def findFiles(path):
    """Return the paths matching the glob pattern ``path``, sorted by name.

    ``glob.glob`` returns its matches in arbitrary, OS-dependent order.
    Sorting makes the result reproducible across runs and platforms.

    Args:
        path: A glob pattern such as ``'../data/names/*.txt'``.

    Returns:
        A list of matching path strings in ascending order; an empty list
        when nothing matches.
    """
    return sorted(glob.glob(path))

# Glob pattern selecting the name files; the count and the matched paths
# are printed as a quick check that the data directory was found.
path = '../data/names/*.txt'
print(len(findFiles(path)))
print(findFiles(path))



18
['../data/names\\Arabic.txt', '../data/names\\Chinese.txt', '../data/names\\Czech.txt', '../data/names\\Dutch.txt', '../data/names\\English.txt', '../data/names\\French.txt', '../data/names\\German.txt', '../data/names\\Greek.txt', '../data/names\\Irish.txt', '../data/names\\Italian.txt', '../data/names\\Japanese.txt', '../data/names\\Korean.txt', '../data/names\\Polish.txt', '../data/names\\Portuguese.txt', '../data/names\\Russian.txt', '../data/names\\Scottish.txt', '../data/names\\Spanish.txt', '../data/names\\Vietnamese.txt']


In [3]:
# Peek at the first seven lines of one data file to see the raw format.
filename = '../data/names/Chinese.txt'
content = []
# Decode explicitly as UTF-8 (as readLines does) instead of relying on the
# platform's default encoding, which differs between operating systems.
with open(filename, encoding='utf-8') as f:
    for i, c in enumerate(f):
        content.append(c)
        if i > 5:  # stop once the 7th line (index 6) has been collected
            break

In [4]:
# Display the sampled lines; each one still carries its trailing newline.
content

['Ang\n', 'Au-Yong\n', 'Bai\n', 'Ban\n', 'Bao\n', 'Bei\n', 'Bian\n']

In [5]:
# Alphabet the encoders can represent: the ASCII letters followed by
# space, period, comma, semicolon and apostrophe.
all_letters = string.ascii_letters + " .,;'"
print(all_letters)

abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .,;'


In [6]:
def unicodeToAscii(s):
    """Reduce a Unicode string to the characters found in ``all_letters``.

    The string is NFD-normalized first, which splits an accented character
    into its base letter plus combining marks. Every character that is not
    in ``all_letters`` (the combining marks included) is then dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    return ''.join(ch for ch in decomposed if ch in all_letters)

# Demo: accented letters come out as their plain ASCII base letters.
print(unicodeToAscii('Ślusàrski'))

Slusarski


In [7]:
# Maps a category name to the list of names read from that category's file.
category_lines = {}
# Category names in the order their files were read; a category's position
# in this list is used as its class index.
all_categories = []


In [8]:

def readLines(filename):
    """Read a UTF-8 text file and return its lines converted to ASCII.

    Leading and trailing whitespace of the whole file is stripped before
    the text is split on newlines; each resulting line is passed through
    ``unicodeToAscii``.
    """
    with open(filename, encoding='utf-8') as handle:
        raw_text = handle.read()
    names = raw_text.strip().split('\n')
    return list(map(unicodeToAscii, names))

# Build the category list and the per-category name lists, one file each.
for filename in findFiles(path):
    # The category is the file's base name without its extension,
    # e.g. '../data/names/Italian.txt' -> 'Italian'.
    category = os.path.splitext(os.path.basename(filename))[0]
    # Register each category only once, so re-running this cell does not
    # append duplicates to all_categories and inflate the class count.
    if category not in all_categories:
        all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines



In [9]:
# Spot-check the first five names loaded for one category.
print(category_lines['Italian'][:5])

['Abandonato', 'Abatangelo', 'Abatantuono', 'Abate', 'Abategiovanni']


In [10]:
# Width of a one-hot letter vector.
n_letters = len(all_letters)
# Number of entries in all_categories, i.e. the number of target classes.
n_categories = len(all_categories)

In [11]:

def letterToIndex(letter):
    """Return the position of ``letter`` within ``all_letters``.

    Returns -1 when ``letter`` does not occur in ``all_letters``.
    """
    try:
        return all_letters.index(letter)
    except ValueError:
        return -1


def letterToTensor(letter):
    """One-hot encode a single letter as a ``(1, n_letters)`` float tensor.

    A letter that is not in ``all_letters`` yields an all-zero row. Without
    this guard, the -1 returned by ``letterToIndex`` would index the last
    column and silently encode the unknown letter as the final character
    of the alphabet.
    """
    tensor = torch.zeros(1, n_letters)
    index = letterToIndex(letter)
    if index >= 0:
        tensor[0][index] = 1
    return tensor

def lineToTensor(line):
    """One-hot encode a string as a ``(1, len(line), n_letters)`` tensor.

    Row ``li`` of the single sequence is the one-hot vector of the letter
    at position ``li``. A letter that is not in ``all_letters`` is left as
    an all-zero row. Without this guard, the -1 returned by
    ``letterToIndex`` would index the last column and silently encode the
    unknown letter as the final character of the alphabet.
    """
    tensor = torch.zeros(1, len(line), n_letters)
    for li, letter in enumerate(line):
        index = letterToIndex(letter)
        if index >= 0:
            tensor[0][li][index] = 1
    return tensor


In [12]:
# Demo: a single letter's one-hot row, and the shape of an encoded name.
print(letterToTensor('J'))
print(lineToTensor('Jones').size())

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.]])
torch.Size([1, 5, 57])


In [13]:
# Pick a random sample.
def randomChoice(l):
    """Return one element of the sequence ``l``, chosen uniformly at random."""
    position = random.randrange(len(l))
    return l[position]

def randomTrainingExample():
    """Draw a random category, then a random name from that category.

    Returns:
        A tuple ``(category, line, category_tensor, line_tensor)``:
        the category name, the name string, a 1-element long tensor holding
        the category's index in ``all_categories``, and the one-hot
        encoding of the name produced by ``lineToTensor``.
    """
    category = randomChoice(all_categories)
    line = randomChoice(category_lines[category])
    label = all_categories.index(category)
    return (
        category,
        line,
        torch.tensor([label], dtype=torch.long),
        lineToTensor(line),
    )

# Draw ten random examples to sanity-check the sampler.
for i in range(10):
    category, line, category_tensor, line_tensor = randomTrainingExample()
    print('category =', category, '/ line =', line)


category = Greek / line = Bouloukos
category = Irish / line = O'Doherty
category = Russian / line = Jdan
category = Dutch / line = Aarle
category = Irish / line = Mohan
category = Dutch / line = Venn
category = Italian / line = Salvai
category = Vietnamese / line = Thai
category = Vietnamese / line = Lieu
category = Korean / line = Youj


### 2. 用全连接的神经网络判断文本

In [14]:
# Width of the model input: one slot per letter of the alphabet.
input_size = len(all_letters)
# Number of stacked LSTM layers (passed to the RNN model only).
num_layers = 1
# One output score per category.
num_classes = len(all_categories)
# Size of the hidden layer / LSTM hidden state.
hidden_size = 128
# Number of training iterations; each uses one randomly drawn name.
n_iters = 100000
# Report the mean loss every this many iterations.
print_every = 5000
# NOTE(review): not referenced by the code visible here.
plot_every = 1000
# Step size handed to the Adam optimizer.
learning_rate=0.001

In [15]:

class NeuralNet(nn.Module):
    """Fully connected classifier: Linear -> ReLU -> Linear."""

    def __init__(self, input_size, hidden_size, num_classes):
        """Create the two linear layers and the activation between them."""
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the unnormalized class scores for the input ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

In [16]:
# Instantiate the fully connected classifier.
model = NeuralNet(input_size, hidden_size, num_classes)

In [17]:
# Loss and optimizer: cross-entropy over the model's class scores, with
# Adam updating the parameters of the current `model`.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  


In [18]:

# Running sum of per-example losses since the last report.
current_loss = 0
# Mean loss of each reporting window, in order.
all_losses = []

# Train on one randomly drawn name per iteration.
for i in range(1, n_iters + 1):
    category, line, category_tensor, line_tensor = randomTrainingExample()

    # Summing the one-hot rows over the letter-position dimension gives a
    # fixed-size (1, n_letters) letter-count vector for the dense network.
    outputs = model(line_tensor.sum(1))
    labels = category_tensor
    loss = criterion(outputs, labels)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # .item() extracts the scalar loss as a Python float. It replaces the
    # discouraged `.data` access and the round-trip through NumPy.
    current_loss += loss.item()

    if i % print_every == 0:
        mean_loss = current_loss / print_every
        print('%d %d%% %.4f ' % (i, i / n_iters * 100, mean_loss))
        all_losses.append(mean_loss)
        current_loss = 0



5000 5% 2.0241 
10000 10% 1.6030 
15000 15% 1.4564 
20000 20% 1.3278 
25000 25% 1.2801 
30000 30% 1.1811 
35000 35% 1.1481 
40000 40% 1.0595 
45000 45% 1.0054 
50000 50% 1.0080 
55000 55% 0.9447 
60000 60% 0.9200 
65000 65% 0.8641 
70000 70% 0.8444 
75000 75% 0.8296 
80000 80% 0.7990 
85000 85% 0.7836 
90000 90% 0.7636 
95000 95% 0.7367 
100000 100% 0.7432 


### 3. 使用循环神经网络处理文本

In [20]:
class RNN(nn.Module):
    """LSTM sequence classifier: LSTM, then a linear layer on the last step."""

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        """Create the batch-first LSTM and the linear output layer."""
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return class scores computed from the LSTM output at the last step.

        ``x`` is laid out batch-first; no initial state is passed to the
        LSTM, so it starts from its default state.
        """
        sequence_out, _state = self.lstm(x)
        # Only the final time step feeds the classifier.
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)


In [22]:

# Rebind `model` to the LSTM classifier, replacing the dense network.
model = RNN(input_size, hidden_size, num_layers, num_classes)

# Fresh loss and optimizer, bound to the new model's parameters.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


In [23]:

# Running sum of per-example losses since the last report.
current_loss = 0
# Mean loss of each reporting window, in order.
all_losses = []

# Train on one randomly drawn name per iteration.
for i in range(1, n_iters + 1):
    category, line, category_tensor, line_tensor = randomTrainingExample()

    # The model receives the whole (1, len(line), n_letters) sequence.
    outputs = model(line_tensor)
    labels = category_tensor
    loss = criterion(outputs, labels)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # .item() extracts the scalar loss as a Python float. It replaces the
    # discouraged `.data` access and the round-trip through NumPy.
    current_loss += loss.item()

    if i % print_every == 0:
        mean_loss = current_loss / print_every
        print('%d %d%% %.4f ' % (i, i / n_iters * 100, mean_loss))
        all_losses.append(mean_loss)
        current_loss = 0


5000 5% 2.0559 
10000 10% 1.4343 
15000 15% 1.2078 
20000 20% 1.0029 
25000 25% 0.8625 
30000 30% 0.7747 
35000 35% 0.6664 
40000 40% 0.5825 
45000 45% 0.5491 
50000 50% 0.5229 
55000 55% 0.4711 
60000 60% 0.4371 
65000 65% 0.4155 
70000 70% 0.3666 
75000 75% 0.3698 
80000 80% 0.3861 
85000 85% 0.3255 
90000 90% 0.3515 
95000 95% 0.3097 
100000 100% 0.3034 


In [24]:
def predict(input_line, n_predictions=3):
    """Print and return the top category guesses for a name.

    Args:
        input_line: The name to classify.
        n_predictions: How many of the highest-scoring categories to report.

    Returns:
        A list of ``[score, category]`` pairs, best first. The scores are
        the model's raw outputs, not probabilities.

    Raises:
        ValueError: If no character of ``input_line`` is in ``all_letters``.
    """
    print('\n> %s' % input_line)
    # Normalize the input the same way the training names were normalized,
    # so accented or unsupported characters are not mis-encoded.
    ascii_line = unicodeToAscii(input_line)
    if not ascii_line:
        raise ValueError(
            'input_line has no characters the model can encode: %r' % (input_line,)
        )

    with torch.no_grad():
        output = model(lineToTensor(ascii_line))
        # Largest scores along the class dimension, sorted best first.
        topv, topi = output.topk(n_predictions, 1, True)

    predictions = []
    for i in range(n_predictions):
        value = topv[0][i].item()
        category_index = topi[0][i].item()
        print('(%.2f) %s' % (value, all_categories[category_index]))
        predictions.append([value, all_categories[category_index]])
    # The original built this list but never returned it.
    return predictions

# Try the trained model on two sample names.
predict('Dovesky')
predict('Satoshi')


> Dovesky
(10.41) Russian
(6.98) Czech
(4.06) English

> Satoshi
(10.84) Japanese
(5.06) Arabic
(4.84) Russian
