# Импорты


In [None]:
!pip3 install -U torchtext

In [2]:
import spacy
import random
import math
import time
import torch
import torch.nn as nn


from torch import optim
from torchtext.legacy.data import Field, BucketIterator, TabularDataset

In [3]:
# Seed Python's RNG and PyTorch's CPU/CUDA generators with the same value and
# ask cuDNN for deterministic kernels so repeated runs behave the same way.
random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.backends.cudnn.deterministic = True

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Подготовка данных


In [4]:
# Download the small English spaCy pipeline (loaded later via spacy.load).
! python3 -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 8.5 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [5]:
# Download the small German spaCy pipeline (imported later as a package).
! python3 -m spacy download de_core_news_sm

Collecting de_core_news_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9 MB)
[K     |████████████████████████████████| 14.9 MB 6.9 MB/s 
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.2.5-py3-none-any.whl size=14907055 sha256=42572164a1dea09a0da1612869a99597b2e88302a54e99692bd8756257c50fd2
  Stored in directory: /tmp/pip-ephem-wheel-cache-e6kaqgy2/wheels/00/66/69/cb6c921610087d2cab339062345098e30a5ceb665360e7b32a
Successfully built de-core-news-sm
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')


На данном этапе создадим токенизаторы. Токенизаторы используются для преобразования исходной строки в список токенов (слов, n-грамм, символов).
>Например, предложение ***На столе стоит кружка.*** преобразуется в следующий список: **['На', 'столе', 'стоит', 'кружка', '.']**




In [6]:
# The German pipeline is imported as a Python package and loaded through its
# own load() helper, while the English one is loaded by name via spacy.load().
# NOTE(review): presumably the asymmetry works around the freshly downloaded
# German model not being discoverable by name in the running kernel - confirm
# before unifying the two.
import de_core_news_sm

ENG_TOKENIZER = spacy.load('en_core_web_sm')  # English spaCy pipeline
DE_TOKENIZER = de_core_news_sm.load()  # German spaCy pipeline

Создадим функцию для токенизации предложений на конкретном языке. Функция будет принимать текст на определённом языке и токенизатор для этого языка и возвращать текст в виде списка токенов.

In [7]:
def tokenize_text(text, tokenizer):
    """
    Split a sentence into its token strings.

    Only the pipeline's ``tokenizer`` component is invoked on the text.

    :param text: sentence in the source or target language
    :param tokenizer: language pipeline exposing a ``tokenizer`` callable
    :return: list with the text of every token, in order
    """
    pieces = []
    for tok in tokenizer.tokenizer(text):
        pieces.append(tok.text)
    return pieces

Объект класса `Field` определяет, каким образом данные должны быть обработаны.



In [8]:
# Markers wrapped around every sentence.
SOS_TOKEN, EOS_TOKEN = '<sos>', '<eos>'

# Options shared by the source and target fields: add the start/end markers
# and lowercase every token.
_FIELD_OPTIONS = dict(init_token=SOS_TOKEN, eos_token=EOS_TOKEN, lower=True)


def _tokenize_source(text):
    """Tokenize an English (source) sentence."""
    return tokenize_text(text, ENG_TOKENIZER)


def _tokenize_target(text):
    """Tokenize a German (target) sentence."""
    return tokenize_text(text, DE_TOKENIZER)


SRC_LANG = Field(tokenize=_tokenize_source, **_FIELD_OPTIONS)
TRG_LANG = Field(tokenize=_tokenize_target, **_FIELD_OPTIONS)

На данном этапе скачаем параллельный англо-немецкий корпус и разобьём выборку на *train*, *validation* и *test* датасеты.

Датасет можно загрузить [тут](https://tatoeba.org/ru/downloads).


In [9]:
# Parse the tab-separated corpus: first column -> 'src' (English field),
# second column -> 'trg' (German field).
data = TabularDataset('eng-de.tsv', format='tsv', fields=[('src', SRC_LANG), ('trg', TRG_LANG)])

# Split first so that the vocabularies below are fitted on training sentences
# only; building them on the full corpus leaks validation/test tokens.
train_data, valid_data, test_data = data.split([0.6, 0.2, 0.2])

# Keep tokens seen at least twice, capped at 4000 entries per language
# (special tokens are added on top of that cap).
SRC_LANG.build_vocab(train_data, min_freq=2, max_size=4000)
TRG_LANG.build_vocab(train_data, min_freq=2, max_size=4000)

In [10]:
# Report how many sentence pairs ended up in each split.
print(f"Количество примеров в обучающей выборке: {len(train_data.examples)}")
print(f"Количество примеров в валидационной выборке: {len(valid_data.examples)}")
print(f"Количество примеров в тестовой выборке: {len(test_data.examples)}")

Количество примеров в обучающей выборке: 30592
Количество примеров в валидационной выборке: 10198
Количество примеров в тестовой выборке: 10197


In [11]:
# Report the vocabulary size of each language field.
print(f"Количество уникальных токенов для английского языка: {len(SRC_LANG.vocab)}")
print(f"Количество уникальных токенов для немецкого языка: {len(TRG_LANG.vocab)}")

Количество уникальных токенов для английского языка: 4004
Количество уникальных токенов для немецкого языка: 4004


Последний этап подготовки данных - создание итератора.

In [12]:
BATCH_SIZE = 128

# BucketIterator can only group sentences of similar length when it is given a
# sort_key; without one (and with sort=False) batches are drawn in arbitrary
# order and padded to the longest sentence they happen to contain.
# sort=False keeps the datasets themselves unsorted; sort_within_batch=True
# buckets by source length and orders each batch by that key.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
    sort=False,
    sort_key=lambda example: len(example.src),
    sort_within_batch=True,
)

# Модель Seq2Seq

## Encoder

In [13]:
class Encoder(nn.Module):
    """Embeds source token ids and encodes them with a multi-layer GRU.

    Args:
        input_dim: number of rows in the embedding table (source vocabulary size).
        emb_dim: size of each embedding vector.
        hidden_dim: GRU hidden state size.
        n_layers: number of stacked GRU layers.
        dropout: dropout probability, used on the embeddings and between GRU layers.
    """

    def __init__(self, input_dim, emb_dim, hidden_dim, n_layers, dropout):
        super().__init__()

        # Keep the hyper-parameters on the module for later inspection.
        self.input_dim, self.emb_dim = input_dim, emb_dim
        self.hidden_dim, self.n_layers = hidden_dim, n_layers

        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.rnn = nn.GRU(input_size=emb_dim, hidden_size=hidden_dim,
                          num_layers=n_layers, dropout=dropout)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Encode a batch of token ids.

        The GRU is built with its default sequence-first layout, so ``src`` is
        expected as (seq_len, batch).

        Returns:
            The GRU's ``(outputs, final_hidden)`` pair.
        """
        token_vectors = self.embedding(src)
        return self.rnn(self.dropout(token_vectors))


## Decoder

In [14]:
class Decoder(nn.Module):
    """Single-step GRU decoder that maps the previous token to vocabulary scores.

    Args:
        output_dim: number of rows in the embedding table and size of the
            output layer (target vocabulary size).
        emb_dim: size of each embedding vector.
        hidden_dim: GRU hidden state size.
        n_layers: number of stacked GRU layers.
        dropout: dropout probability, used on the embeddings and between GRU layers.
    """

    def __init__(self, output_dim, emb_dim, hidden_dim, n_layers, dropout):
        super().__init__()

        # Keep the hyper-parameters on the module; Seq2Seq reads output_dim.
        self.output_dim, self.emb_dim = output_dim, emb_dim
        self.hidden_dim, self.n_layers = hidden_dim, n_layers

        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
        self.rnn = nn.GRU(input_size=emb_dim, hidden_size=hidden_dim,
                          num_layers=n_layers, dropout=dropout)
        self.out = nn.Linear(in_features=hidden_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src, hidden, encoder_states):
        """Run one decoding step.

        Args:
            src: 1-D tensor with the previous token id of every batch element.
            hidden: GRU hidden state carried over from the previous step.
            encoder_states: accepted for interface compatibility; not used here.

        Returns:
            ``(prediction, hidden)``: per-token vocabulary scores and the
            updated hidden state.
        """
        # Add a leading length-1 time axis so the GRU sees a one-step sequence.
        step_tokens = src.unsqueeze(0)
        step_vectors = self.dropout(self.embedding(step_tokens))
        step_output, new_hidden = self.rnn(step_vectors, hidden)

        # Drop the time axis again before projecting onto the vocabulary.
        scores = self.out(step_output.squeeze(0))
        return scores, new_hidden


## Seq2Seq

In [41]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one token at a time.

    Args:
        encoder: module returning ``(encoder_states, hidden)`` for a source batch.
        decoder: module exposing ``output_dim`` and returning
            ``(prediction, hidden)`` for one decoding step.
        device: device on which the output tensor is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ration=0.5):
        """Decode ``trg.shape[0] - 1`` steps and return the per-step scores.

        Args:
            src: source batch passed straight to the encoder.
            trg: target batch indexed as (time, batch); ``trg[0]`` is fed as
                the first decoder input.
            teacher_forcing_ration: probability of feeding the ground-truth
                token instead of the model's own prediction at each step
                (the parameter name is kept as-is for existing callers).

        Returns:
            Tensor of shape (max_len, batch, vocab). Row 0 is never written and
            stays zero, which is why callers slice ``[1:]`` before the loss.
        """
        max_len, batch_size = trg.shape[0], trg.shape[1]
        trg_vocab_size = self.decoder.output_dim

        outputs = torch.zeros(max_len, batch_size, trg_vocab_size, device=self.device)

        encoder_states, hidden = self.encoder(src)

        decoder_input = trg[0, :]  # <SOS> token

        for i in range(1, max_len):
            decoder_prediction, hidden = self.decoder(decoder_input, hidden, encoder_states)

            outputs[i] = decoder_prediction
            # Highest-scoring token for every batch element.
            top1 = decoder_prediction.argmax(1)

            use_teacher_forcing = random.random() < teacher_forcing_ration
            decoder_input = trg[i] if use_teacher_forcing else top1

        return outputs

## Обучение

In [42]:
# Architecture hyper-parameters shared by the encoder and the decoder.
EMB_DIM, HIDDEN_DIM = 256, 512
N_LAYERS = 2
DROPOUT = 0.45

# Embedding/output sizes follow the vocabularies built for each language.
INPUT_DIM, OUTPUT_DIM = len(SRC_LANG.vocab), len(TRG_LANG.vocab)

encoder = Encoder(input_dim=INPUT_DIM, emb_dim=EMB_DIM, hidden_dim=HIDDEN_DIM,
                  n_layers=N_LAYERS, dropout=DROPOUT)
encoder = encoder.to(device)

decoder = Decoder(output_dim=OUTPUT_DIM, emb_dim=EMB_DIM, hidden_dim=HIDDEN_DIM,
                  n_layers=N_LAYERS, dropout=DROPOUT)
decoder = decoder.to(device)

model = Seq2Seq(encoder, decoder, device)
model = model.to(device)

In [43]:
# Adam with its default settings over every parameter of the seq2seq model.
optimizer = optim.Adam(model.parameters())

В качестве функции потерь будем использовать кросс-энтропию.

In [44]:
# Index of the padding token in the target vocabulary.
PAD_IDX = TRG_LANG.vocab.stoi['<pad>']

# Target positions equal to PAD_IDX are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

На данном этапе необходимо определить цикл обучения.

In [45]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one optimisation pass over ``iterator`` and return the mean batch loss.

    Args:
        model: callable as ``model(src, trg)``; its result is indexed with
            time as the first axis and vocabulary scores as the last.
        iterator: sized iterable of batches exposing ``.src`` and ``.trg``.
        optimizer: optimizer stepping the model's parameters.
        criterion: loss taking (scores, target indices).
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.train()

    running_loss = 0
    for batch in iterator:
        source, target = batch.src, batch.trg

        optimizer.zero_grad()
        logits = model(source, target)

        # Drop the first time step from both tensors, then flatten time and
        # batch into a single axis for the loss.
        vocab_size = logits.shape[-1]
        flat_logits = logits[1:].view(-1, vocab_size)
        flat_target = target[1:].view(-1)

        batch_loss = criterion(flat_logits, flat_target)
        batch_loss.backward()

        # Guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(iterator)

Метод для оценки на валидационных данных

In [46]:
def evaluate(model, iterator, criterion):
    """Compute the mean batch loss over ``iterator`` without updating the model.

    The model is switched to eval mode and called with a teacher-forcing
    ratio of 0, so every decoder input after the first is the model's own
    prediction.

    Args:
        model: callable as ``model(src, trg, teacher_forcing_ratio)``.
        iterator: sized iterable of batches exposing ``.src`` and ``.trg``.
        criterion: loss taking (scores, target indices).

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.eval()

    epoch_loss = 0

    with torch.no_grad():
        for batch in iterator:
            src = batch.src
            trg = batch.trg

            # Third positional argument is the teacher-forcing ratio: off here.
            output = model(src, trg, 0)

            # Drop the first time step, then flatten time and batch together.
            output = output[1:].view(-1, output.shape[-1])
            trg = trg[1:].view(-1)

            epoch_loss += criterion(output, trg).item()

    return epoch_loss / len(iterator)

In [47]:
def translate_sentence(sentence, src_field, trg_field, model, device, max_len):
    """Greedily translate one sentence with the trained model.

    Args:
        sentence: raw string (tokenized with the source field's own tokenizer)
            or an already tokenized list of strings.
        src_field: source-language field providing ``tokenize``, ``init_token``,
            ``eos_token`` and ``vocab.stoi``.
        trg_field: target-language field providing ``init_token``,
            ``eos_token``, ``vocab.stoi`` and ``vocab.itos``.
        model: seq2seq model exposing ``encoder`` and ``decoder``.
        device: device on which the input tensors are created.
        max_len: upper bound on decoding; at most ``max_len - 1`` tokens are
            generated.

    Returns:
        Predicted target tokens without the leading start token. The end token
        is included as the last element only if the model produced it before
        the length limit was hit.
    """
    model.eval()

    # Strings are tokenized with the field's tokenizer (the previous code
    # referenced an undefined ``spacy_en`` and crashed on string input).
    raw_tokens = src_field.tokenize(sentence) if isinstance(sentence, str) else sentence
    tokens = [src_field.init_token] + [token.lower() for token in raw_tokens] + [src_field.eos_token]

    src_indices = [src_field.vocab.stoi[token] for token in tokens]
    # Shape (seq_len, 1): a batch containing just this sentence.
    src_tensor = torch.tensor(src_indices, dtype=torch.long, device=device).unsqueeze(1)

    trg_stoi = trg_field.vocab.stoi
    eos_idx = trg_stoi[trg_field.eos_token]
    trg_indices = [trg_stoi[trg_field.init_token]]

    # Inference only: keep both encoding and decoding out of autograd.
    with torch.no_grad():
        encoder_states, hidden = model.encoder(src_tensor)

        for _ in range(1, max_len):
            step_input = torch.tensor([trg_indices[-1]], dtype=torch.long, device=device)
            output, hidden = model.decoder(step_input, hidden, encoder_states)

            pred_token = output.argmax(1).item()
            trg_indices.append(pred_token)

            if pred_token == eos_idx:
                break

    return [trg_field.vocab.itos[i] for i in trg_indices[1:]]

  

In [None]:
import datetime

N_EPOCHS = 30
CLIP = 1  # maximum gradient norm passed to train()

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    # Checkpoint only when the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'model.pt')

    diff = end_time - start_time
    # ``epoch`` is zero-based, so epoch + 1 epochs are done. The previous
    # expression ``N_EPOCHS - epoch + 1`` over-counted by two epochs.
    epochs_left = N_EPOCHS - (epoch + 1)
    remaining_secs = epochs_left * diff
    print(f"Epoch: {epoch + 1} | Epoch time: {datetime.timedelta(seconds=diff)} | Remaining time: {datetime.timedelta(seconds=remaining_secs)}")
    print(f"\tTrain loss: {train_loss:.3f}")
    print(f"\tValid loss: {valid_loss:.3f}")
    print("----------------------------------------------------------------------")



In [None]:
# Restore the checkpoint with the best validation loss. map_location lets a
# checkpoint saved on a GPU load on whatever device this run is using.
model.load_state_dict(torch.load('model.pt', map_location=device))

test_loss = evaluate(model, test_iterator, criterion)

print(f"Test loss: {test_loss:.3f}")

In [None]:
import numpy as np

def get_example():
    """Print a random test-set sentence, its reference and the model's translation.

    The example is drawn from ``test_data``. Previously the index was sampled
    from the test set's length but the sentence was read from ``train_data``.
    """
    example_idx = np.random.choice(np.arange(len(test_data)))
    example = test_data.examples[example_idx]
    src = vars(example)['src']
    trg = vars(example)['trg']

    translation = translate_sentence(src, SRC_LANG, TRG_LANG, model, device, max_len = 50)

    # Strip the end-of-sentence marker only when decoding actually produced it;
    # if the length limit was hit first, the last token is a real word.
    if translation and translation[-1] == TRG_LANG.eos_token:
        translation = translation[:-1]

    src_text = " ".join(src)
    trg_text = " ".join(trg)
    prediction_text = " ".join(translation)

    print(f"Source string: {src_text}\n")
    print(f"Target string: {trg_text}\n")
    print(f"Predicted string: {prediction_text}")

# Print one randomly chosen example with its reference and model translation.
get_example()

Функция для подсчета [BLEU](https://en.wikipedia.org/wiki/BLEU)

In [None]:
from torchtext.data.metrics import bleu_score

def bleu(data, model, src_lang, trg_lang, device):
    """Compute the corpus-level BLEU score of the model's translations over ``data``.

    Args:
        data: iterable of examples exposing tokenized ``src`` and ``trg``.
        model: seq2seq model passed to ``translate_sentence``.
        src_lang: source-language field.
        trg_lang: target-language field (also supplies the end-of-sentence token).
        device: device used for inference.

    Returns:
        The value returned by ``bleu_score`` for all predictions against their
        single references.
    """
    targets = []
    outputs = []

    for example in data:
        src = vars(example)['src']
        trg = vars(example)['trg']

        prediction = translate_sentence(src, src_lang, trg_lang, model, device, max_len = 50)

        # Strip the end-of-sentence marker only if it is present; a translation
        # cut off at max_len ends with a real token that must be kept.
        if prediction and prediction[-1] == trg_lang.eos_token:
            prediction = prediction[:-1]

        # bleu_score expects a list of references per candidate.
        targets.append([trg])
        outputs.append(prediction)

    return bleu_score(outputs, targets)

# Score the model on the held-out test split and report BLEU as a percentage.
score = bleu(test_data, model, SRC_LANG, TRG_LANG, device)

print(f"BLEU score: {score * 100:.2f}")