In [1]:
from IPython.display import clear_output

In [2]:
!pip install pycodestyle pycodestyle_magic
!pip install flake8
clear_output()

In [3]:
%load_ext pycodestyle_magic

In [4]:
!wget https://download.pytorch.org/tutorial/data.zip
!unzip data.zip
!rm *.zip
!rm -r data/names
clear_output()

In [5]:
import re
import unicodedata
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from tqdm.notebook import tqdm
from datetime import datetime, timedelta

%matplotlib inline

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
pd.options.display.max_colwidth = 100

In [8]:
plt.switch_backend('agg')

# Data preparation

In [9]:
df = pd.read_csv('data/eng-fra.txt', sep='\t', header=None)

In [10]:
df.head(15)

Unnamed: 0,0,1
0,Go.,Va !
1,Run!,Cours !
2,Run!,Courez !
3,Wow!,Ça alors !
4,Fire!,Au feu !
5,Help!,À l'aide !
6,Jump.,Saute.
7,Stop!,Ça suffit !
8,Stop!,Stop !
9,Stop!,Arrête-toi !


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135842 entries, 0 to 135841
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   0       135842 non-null  object
 1   1       135842 non-null  object
dtypes: object(2)
memory usage: 2.1+ MB


In [12]:
!rm -r data

In [13]:
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427

def unicode_to_ascii(string):
    return ''.join(
        char for char in unicodedata.normalize('NFD', string)
        if unicodedata.category(char) != 'Mn'
        )

In [14]:
df[0] = df[0].apply(
    lambda x: re.sub(r'[^a-z .?!]+', '',
                     unicode_to_ascii(x.lower().replace("'", ' ')))
    )
df[1] = df[1].apply(
    lambda x: re.sub(r'[^a-z .?!]+', '',
                     unicode_to_ascii(x.lower().replace("'", ' ')))
    )
df[0] = df[0].apply(lambda x: re.sub(r'([.!?])', r' \1 ', x))
df[1] = df[1].apply(lambda x: re.sub(r'([.!?])', r' \1 ', x))
df[0] = df[0].apply(lambda x: re.sub(r'\s\s+', ' ', x))
df[1] = df[1].apply(lambda x: re.sub(r'\s\s+', ' ', x))

In [15]:
df.sample(n=15)

Unnamed: 0,0,1
22088,you can t back out .,vous ne pouvez pas vous dedire .
32696,he was chosen captain .,il a ete choisi comme capitaine .
100740,this isn t supposed to be happening .,ce n est pas suppose arriver .
75982,my credit cards are maxed out .,j ai excede le plafond de mes cartes de credit .
108829,the problem disappeared as if by magic .,le probleme disparut comme par magie .
94158,tom tried to control his emotions .,tom a essaye de controler ses emotions .
73160,all the sails were taken down .,toutes les voiles furent affalees .
31779,are you in a bad mood ?,estu de mauvaise humeur ?
28422,i know you re scared .,je sais que vous avez peur .
97327,the mountain was covered with snow .,la montagne etait recouverte de neige .


In [16]:
pairs = list(zip(df[0].tolist(), df[1].tolist()))

In [17]:
del df

Отфильтруем пары.

In [18]:
MAX_LEN = 10

In [19]:
eng_prefixes = (
    'i am ', 'i m',
    'he is', 'he s',
    'she is', 'she s',
    'you are', 'you re',
    'we are', 'we re',
    'they are', 'they re'
)

pairs = [
    pair for pair in pairs
    if len(pair[0].split()) <= MAX_LEN and len(pair[1].split()) <= MAX_LEN
    and pair[0].startswith(eng_prefixes)
    ]

In [20]:
len(pairs)

12965

In [21]:
pairs[np.random.randint(12965)]

('we re not sure . ', 'nous n en sommes pas surs . ')

Составим словари для английского и французского.

In [22]:
eng_words = ['SOS', 'EOS'] + sorted(
    set(' '.join([pair[0] for pair in pairs]).split())
    )

fr_words = ['SOS', 'EOS'] + sorted(
    set(' '.join([pair[1] for pair in pairs]).split())
    )

In [23]:
len(eng_words)

3463

In [24]:
len(fr_words)

5330

In [25]:
print(eng_words[:10])

['SOS', 'EOS', '!', '.', '?', 'a', 'ability', 'able', 'aboard', 'about']


In [26]:
print(fr_words[:10])

['SOS', 'EOS', '!', '.', '?', 'a', 'aapprendre', 'abandonne', 'abandonner', 'abandonnons']


In [27]:
eng_to_idx = {elem: i for i, elem in enumerate(eng_words)}
fr_to_idx = {elem: i for i, elem in enumerate(fr_words)}

In [28]:
def tensors_from_pair(pair):
    eng_idx = [0] + [eng_to_idx[w] for w in pair[0].split()[:MAX_LEN]] + [1]
    fr_idx = [0] + [fr_to_idx[w] for w in pair[1].split()[:MAX_LEN]] + [1]

    eng_tensor = torch.tensor(eng_idx, dtype=torch.long,
                              device=device).view(-1, 1)
    fr_tensor = torch.tensor(fr_idx, dtype=torch.long,
                             device=device).view(-1, 1)

    return (eng_tensor, fr_tensor)

# Encoder





In [29]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, rnn_type='gru', rnn_n=1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.rnn_n = rnn_n

        self.embedding = nn.Embedding(input_size, hidden_size)
        if rnn_type == 'gru':
            self.rnn = nn.GRU(hidden_size, hidden_size)
        elif rnn_type == 'lstm':
            self.rnn = nn.LSTM(hidden_size, hidden_size)
        if self.rnn_n == 2:
            if rnn_type == 'gru':
                self.rnn_2 = nn.GRU(hidden_size, hidden_size)
            elif rnn_type == 'lstm':
                self.rnn_2 = nn.LSTM(hidden_size, hidden_size)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output, hidden = self.rnn(output, hidden)
        if self.rnn_n == 2:
            output, hidden = self.rnn_2(output, hidden)
        return output, hidden

    def initHidden(self):
        if isinstance(self.rnn, nn.LSTM):
            return (torch.zeros(1, 1, self.hidden_size, device=device),
                    torch.zeros(1, 1, self.hidden_size, device=device))
        return torch.zeros(1, 1, self.hidden_size, device=device)

# Decoder




In [30]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, rnn_type='gru', rnn_n=1):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.rnn_n = rnn_n

        self.embedding = nn.Embedding(output_size, hidden_size)
        if rnn_type == 'gru':
            self.rnn = nn.GRU(hidden_size, hidden_size)
        elif rnn_type == 'lstm':
            self.rnn = nn.LSTM(hidden_size, hidden_size)
        if self.rnn_n == 2:
            if rnn_type == 'gru':
                self.rnn_2 = nn.GRU(hidden_size, hidden_size)
            elif rnn_type == 'lstm':
                self.rnn_2 = nn.LSTM(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.rnn(output, hidden)
        if self.rnn_n == 2:
            output, hidden = self.rnn_2(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        if isinstance(self.rnn, nn.LSTM):
            return (torch.zeros(1, 1, self.hidden_size, device=device),
                    torch.zeros(1, 1, self.hidden_size, device=device))
        return torch.zeros(1, 1, self.hidden_size, device=device)

# Функции для обучения и оценки качества

In [31]:
teacher_forcing_ratio = 0.5

In [32]:
def train(input_tensor, target_tensor, encoder, decoder,
          encoder_optimizer, decoder_optimizer, criterion, max_len=MAX_LEN):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    encoder_outputs = torch.zeros(max_len + 2, encoder.hidden_size,
                                  device=device)
    loss = 0

    for i in range(input_tensor.size(0)):
        encoder_output, encoder_hidden = encoder(input_tensor[i],
                                                 encoder_hidden)
        encoder_outputs[i] = encoder_output[0, 0]

    decoder_input = torch.tensor([[0]], device=device)
    decoder_hidden = encoder_hidden

    if np.random.random() < teacher_forcing_ratio:
        use_teacher_forcing = True
    else:
        use_teacher_forcing = False

    for i in range(target_tensor.size(0)):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        loss += criterion(decoder_output, target_tensor[i])
        if use_teacher_forcing:
            # teacher forcing: feed the target as the next input
            decoder_input = target_tensor[i]
        else:
            # use its own predictions as the next input
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()
            if decoder_input.item() == 1:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_tensor.size(0)

Функция `train_epochs()` возвращает `losses`, т. к., сделать так, чтобы в одной и той же ячейке ноутбука выводился и прогресс обучения, и график лоссов, у меня не получилось (даже с `plt.show()`, которую в лекционном ноутбуке почему-то вообще забыли).

In [33]:
def train_epochs(encoder, decoder, n_epochs=75000, learning_rate=0.01):
    start = datetime.now()
    losses = []
    print_loss = 0  # reset every 5000 epochs
    plot_loss = 0  # reset every 100 epochs

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensors_from_pair(pairs[np.random.randint(12965)])
                      for epoch in range(n_epochs)]
    criterion = nn.NLLLoss()

    for epoch in tqdm(range(n_epochs)):
        training_pair = training_pairs[epoch]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder, decoder,
                     encoder_optimizer, decoder_optimizer, criterion)
        print_loss += loss
        plot_loss += loss

        if (epoch + 1) % 5000 == 0:
            print_loss /= 5000
            delta = datetime.now() - start
            stats = f'''
            Epoch {epoch + 1} ({(epoch + 1) * 100 / n_epochs:.1f}%)
            Time: {delta.seconds // 60}m {delta.seconds % 60}s
            Loss: {print_loss:.4f}
            '''
            print(stats)
            print_loss = 0

        if (epoch + 1) % 100 == 0:
            plot_loss /= 100
            losses.append(plot_loss)
            plot_loss = 0

    return losses

In [34]:
def plot_loss(losses):
    plt.figure(figsize=(14, 12))
    fig, ax = plt.subplots()
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(losses)
    plt.grid()
    plt.xlabel('Epoch', fontsize='xx-large')
    plt.ylabel('Loss function', fontsize='xx-large')
    plt.show()

In [42]:
def evaluate(encoder, decoder, sent, max_len=MAX_LEN):
    with torch.no_grad():
        idx = [0] + [eng_to_idx[w] for w in sent.split()[:max_len]] + [1]
        input_tensor = torch.tensor(idx, dtype=torch.long,
                                    device=device).view(-1, 1)
        encoder_hidden = encoder.initHidden()
        encoder_outputs = torch.zeros(max_len + 2, encoder.hidden_size,
                                      device=device)

        for i in range(input_tensor.size()[0]):
            encoder_output, encoder_hidden = encoder(input_tensor[i],
                                                     encoder_hidden)
            encoder_outputs[i] += encoder_output[0, 0]

        decoder_input = torch.tensor([[0]], device=device)
        decoder_hidden = encoder_hidden
        decoded_words = []

        for i in range(max_len):
            decoder_output, decoder_hidden = decoder(decoder_input,
                                                     decoder_hidden)
            topv, topi = decoder_output.data.topk(1)
            decoded_words.append(fr_words[topi.item()])
            if topi.item() == 1:
                break
            decoder_input = topi.squeeze().detach()

        return decoded_words

In [43]:
def evaluate_random(encoder, decoder, n=10):
    for i in range(n):
        pair = pairs[np.random.randint(12965)]
        res = f'''
        Eng:  {pair[0]}
        Fr:   {pair[1]}
        Pred: {" ".join(evaluate(encoder, decoder, pair[0])[1:-1])}
        '''
        print(res)

# Обучение и оценка качества

In [37]:
hidden_size = 256

In [38]:
encoder = EncoderRNN(len(eng_words), hidden_size).to(device)
decoder = DecoderRNN(hidden_size, len(fr_words)).to(device)

In [39]:
losses = train_epochs(encoder, decoder)

HBox(children=(FloatProgress(value=0.0, max=75000.0), HTML(value='')))


            Epoch 5000 (6.7%)
            Time: 10m 0s
            Loss: 3.2755
            

            Epoch 10000 (13.3%)
            Time: 20m 18s
            Loss: 2.7835
            

            Epoch 15000 (20.0%)
            Time: 30m 32s
            Loss: 2.4998
            

            Epoch 20000 (26.7%)
            Time: 40m 53s
            Loss: 2.2698
            

            Epoch 25000 (33.3%)
            Time: 51m 7s
            Loss: 2.1089
            

            Epoch 30000 (40.0%)
            Time: 61m 30s
            Loss: 1.9528
            

            Epoch 35000 (46.7%)
            Time: 71m 33s
            Loss: 1.7949
            

            Epoch 40000 (53.3%)
            Time: 81m 35s
            Loss: 1.6950
            

            Epoch 45000 (60.0%)
            Time: 91m 36s
            Loss: 1.5859
            

            Epoch 50000 (66.7%)
            Time: 101m 33s
            Loss: 1.4910
            

            Epoch 55000 (73.3%)


Лосс очень маленький!

In [40]:
plot_loss(losses)

In [44]:
evaluate_random(encoder, decoder)


        Eng:  she isn t afraid of death . 
        Fr:   elle n a pas peur de la mort . 
        Pred: elle n craint pas peur de la mort
        

        Eng:  i am so sorry to have kept you waiting . 
        Fr:   je suis vraiment desole de t avoir fait attendre . 
        Pred: je suis tellement desolee de t avoir fait
        

        Eng:  i m trustworthy . 
        Fr:   on peut me faire confiance . 
        Pred: nous sommes fiable .
        

        Eng:  i m not scared of anybody . 
        Fr:   je ne crains personne . 
        Pred: je ne crains pas . .
        

        Eng:  i m very impressed . 
        Fr:   je suis fort impressionnee . 
        Pred: je suis fort impressionnee .
        

        Eng:  you re very emotional . 
        Fr:   tu es fort emotive . 
        Pred: vous etes tres emotif .
        

        Eng:  they re dead . 
        Fr:   elles sont decedees . 
        Pred: ils sont decedes .
        

        Eng:  you re wasting both of our time . 


Предсказания почти совпадают с французскими предложениями!