# Environment setup

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
folder_path = '/content/drive/MyDrive/models/yandex/'

In [None]:
!pip install torch --upgrade
!pip install torchtext --upgrade
!pip install git+https://github.com/aatimofeev/spacy_russian_tokenizer.git
!pip install pymorphy2

In [3]:
import os
import math
import dill
import spacy
import random
import warnings
import argparse
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from collections import OrderedDict
%matplotlib inline

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext import data
from torchtext import datasets
from torchtext.vocab import Vectors
from torchtext.data import Dataset, Example, Field
from torchtext.data.iterator import BucketIterator
from torchtext.data.metrics import bleu_score
from torchtext.datasets import TranslationDataset


from spacy.lang.ru import Russian
from spacy_russian_tokenizer import RussianTokenizer, MERGE_PATTERNS

In [4]:
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

SEED = 546
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {DEVICE}')

Device: cuda


# Models

In [5]:
class EncRNN(nn.Module):
    def __init__(self, vsz, embed_dim, hidden_dim, n_layers, use_birnn, dout):
        super(EncRNN, self).__init__()
        self.embed = nn.Embedding(vsz, embed_dim)
        self.rnn = nn.LSTM(embed_dim, hidden_dim, n_layers,
                           bidirectional=use_birnn)
        self.dropout = nn.Dropout(dout)

    def forward(self, inputs):
        embs = self.dropout(self.embed(inputs))
        enc_outs, hidden = self.rnn(embs)
        return self.dropout(enc_outs), hidden


class Attention(nn.Module):
    def __init__(self, hidden_dim, method):
        super(Attention, self).__init__()
        self.method = method
        self.hidden_dim = hidden_dim

        if method == 'general':
            self.w = nn.Linear(hidden_dim, hidden_dim)
        elif method == 'concat':
            self.w = nn.Linear(hidden_dim*2, hidden_dim)
            self.v = torch.nn.Parameter(torch.FloatTensor(hidden_dim))

    def forward(self, dec_out, enc_outs):
        if self.method == 'dot':
            attn_energies = self.dot(dec_out, enc_outs)
        elif self.method == 'general':
            attn_energies = self.general(dec_out, enc_outs)
        elif self.method == 'concat':
            attn_energies = self.concat(dec_out, enc_outs)
        return F.softmax(attn_energies, dim=0)

    def dot(self, dec_out, enc_outs):
        return torch.sum(dec_out*enc_outs, dim=2)

    def general(self, dec_out, enc_outs):
        energy = self.w(enc_outs)
        return torch.sum(dec_out*energy, dim=2)

    def concat(self, dec_out, enc_outs):
        dec_out = dec_out.expand(enc_outs.shape[0], -1, -1)
        energy = torch.cat((dec_out, enc_outs), 2)
        return torch.sum(self.v * self.w(energy).tanh(), dim=2)


class DecRNN(nn.Module):
    def __init__(self, vsz, embed_dim, hidden_dim, n_layers, use_birnn, 
                 dout, attn, tied):
        super(DecRNN, self).__init__()
        hidden_dim = hidden_dim*2 if use_birnn else hidden_dim

        self.embed = nn.Embedding(vsz, embed_dim)
        self.rnn = nn.LSTM(embed_dim, hidden_dim , n_layers)

        self.w = nn.Linear(hidden_dim*2, hidden_dim)
        self.attn = Attention(hidden_dim, attn)

        self.out_projection = nn.Linear(hidden_dim, vsz)
        if tied: 
            if embed_dim != hidden_dim:
                raise ValueError(
                    f"when using the tied flag, embed-dim:{embed_dim} \
                    must be equal to hidden-dim:{hidden_dim}")
            self.out_projection.weight = self.embed.weight
        self.dropout = nn.Dropout(dout)

    def forward(self, inputs, hidden, enc_outs):
        inputs = inputs.unsqueeze(0)
        embs = self.dropout(self.embed(inputs))
        dec_out, hidden = self.rnn(embs, hidden)

        attn_weights = self.attn(dec_out, enc_outs).transpose(1, 0)
        enc_outs = enc_outs.transpose(1, 0)
        context = torch.bmm(attn_weights.unsqueeze(1), enc_outs)
        cats = self.w(torch.cat((dec_out, context.transpose(1, 0)), dim=2))
        pred = self.out_projection(cats.tanh().squeeze(0))
        return pred, hidden


class Seq2seqAttn(nn.Module):
    def __init__(self, args, fields, device):
        super().__init__()
        self.src_field, self.tgt_field = fields
        self.src_vsz = len(self.src_field.vocab.itos)
        self.tgt_vsz = len(self.tgt_field.vocab.itos)
        self.encoder = EncRNN(self.src_vsz, args.embed_dim, args.hidden_dim, 
                              args.n_layers, args.bidirectional, args.dropout)
        self.decoder = DecRNN(self.tgt_vsz, args.embed_dim, args.hidden_dim, 
                              args.n_layers, args.bidirectional, args.dropout,
                              args.attn, args.tied)
        self.device = device
        self.n_layers = args.n_layers
        self.hidden_dim = args.hidden_dim
        self.use_birnn = args.bidirectional

    def forward(self, srcs, tgts=None, maxlen=100, tf_ratio=0.0):
        slen, bsz = srcs.size()
        tlen = tgts.size(0) if isinstance(tgts, torch.Tensor) else maxlen
        tf_ratio = tf_ratio if isinstance(tgts, torch.Tensor) else 0.0
       
        enc_outs, hidden = self.encoder(srcs)

        dec_inputs = torch.ones_like(srcs[0]) * 2 # <eos> is mapped to id=2
        outs = []

        if self.use_birnn:
            def trans_hidden(hs):
                hs = hs.view(self.n_layers, 2, bsz, self.hidden_dim)
                hs = torch.stack([torch.cat((h[0], h[1]), 1) for h in hs])
                return hs
            hidden = tuple(trans_hidden(hs) for hs in hidden)

        for i in range(tlen):
            preds, hidden = self.decoder(dec_inputs, hidden, enc_outs)
            outs.append(preds)
            use_tf = random.random() < tf_ratio
            dec_inputs = tgts[i] if use_tf else preds.max(1)[1]
        return torch.stack(outs)


# Parameters

In [6]:
def train_opts(parser):
    group = parser.add_argument_group('Training')
    group.add_argument('--train', default='./sample_data/sample_train.tsv',
        help='path to a train data')
    group.add_argument('--valid', default='./sample_data/sample_valid.tsv',
        help='path to a validation data')
    group.add_argument('--batch-size', type=int, default=32, 
        help='batch size')
    group.add_argument('--savedir', default='./checkpoints', 
        help='path to save models')
    group.add_argument('--max-epoch', type=int, default=0, 
        help='number of epochs')
    group.add_argument('--max-update', type=int, default=0,
        help='number of updates')
    group.add_argument('--lr', type=float, default=0.25,
        help='learning rate')
    group.add_argument('--min-lr', type=float, default=1e-5, 
        help='minimum learning rate')
    group.add_argument('--clip', type=float, default=0.1,
        help='gradient cliping')
    group.add_argument('--tf-ratio', type=float, default=0.5,
        help='teaching force ratio')
    group.add_argument('--gpu', action='store_true',
        help='whether gpu is used')
    return group


def translate_opts(parser):
    group = parser.add_argument_group('Translation')
    group.add_argument('--model', default='./checkpoints/checkpoint_best.pt',
        help='model file for translation')
    group.add_argument('--input', default='./sample_data/sample_test.txt',
        help='input file')
    group.add_argument('--batch-size', type=int, default=32,
        help='batch size')
    group.add_argument('--maxlen', type=int, default=100,
        help='maximum length of output sentence')
    group.add_argument('--gpu', action='store_true',
        help='whether gpu is used')
    return group
    

def model_opts(parser):
    group = parser.add_argument_group('Model\'s hyper-parameters')
    group.add_argument('--embed-dim', type=int, default=200,
        help='dimension of word embeddings')
    group.add_argument('--src_min-freq', type=int, default=0,
        help='''map words of source side appearing less than 
                threshold times to unknown''')
    group.add_argument('--tgt_min-freq', type=int, default=0,
        help='''map words of target side appearing less than
              threshold times to unknown''')
    group.add_argument('--rnn', choices=['lstm'], default='lstm',
        help='rnn\'s architechture')
    group.add_argument('--hidden-dim', type=int, default=1024,
        help='number of hidden units per layer')
    group.add_argument('--n-layers', type=int, default=2,
        help='number of LSTM layers')
    group.add_argument('--bidirectional', action='store_true',
        help='whether use bidirectional LSTM for encoder')
    group.add_argument('--attn', choices=['dot', 'general', 'concat'],
        default='dot', help='attention type')
    group.add_argument('--dropout', type=float, default=0.2,
        help='dropout applied to layers (0 means no dropout)')
    group.add_argument('--tied', action='store_true',
        help='tie the word embedding and softmax weight')
    return group

# Dataset loading

In [7]:
en_tok = spacy.load('en')

def tokenize_en(sentence):
    return [tok.text for tok in en_tok.tokenizer(sentence)]

In [8]:
ru_nlp = Russian()
russian_tokenizer = RussianTokenizer(ru_nlp, MERGE_PATTERNS)
ru_nlp.add_pipe(russian_tokenizer, name='russian_tokenizer')

def tokenize_ru(sentence):
    return [tok.text for tok in ru_nlp(sentence)]

In [9]:
EN_TEXT = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True, batch_first=False)
RU_TEXT = Field(tokenize=tokenize_ru, init_token='<sos>', eos_token='<eos>', lower=True, batch_first=False)

In [11]:
dataset = TranslationDataset(
    path=f'{folder_path}corpus/news-commentary-v12.ru-en', exts=('.en', '.ru'),
    fields=(EN_TEXT, RU_TEXT))

In [12]:
train_data, valid_data, test_data = dataset.split(split_ratio=[0.7, 0.2, 0.1])

In [13]:
print(f'train set size: {len(train_data.examples):,}')
print(f'valid set size: {len(valid_data.examples):,}')
print(f'test set size: {len(test_data.examples):,}')
print(vars(train_data.examples[0]))

train set size: 154,598
valid set size: 22,085
test set size: 44,171
{'src': ['it', 'makes', 'sense', 'for', 'the', 'west', ',', 'particularly', 'the', 'european', 'union', ',', 'to', 'seek', 'cooperation', 'with', 'the', 'sco', ',', 'as', 'this', 'would', 'also', 'help', 'counter', 'russia', '’s', 'attempts', 'to', 'use', 'it', 'as', 'a', 'tool', 'for', 'its', 'anti', '-', 'western', 'policies', '.'], 'trg': ['мировой', 'финансовый', 'кризис', '2008', 'года', 'поначалу', 'лишь', 'ещё', 'больше', 'укрепил', 'репутацию', 'центральных', 'банков', '.']}


In [14]:
%%time
MIN_COUNT = 2
EN_TEXT.build_vocab(train_data, min_freq=MIN_COUNT)
RU_TEXT.build_vocab(train_data, min_freq=MIN_COUNT)
print(f'Length of EN vocabulary: {len(EN_TEXT.vocab):,}')
print(f'Length of RU vocabulary: {len(RU_TEXT.vocab):,}')

Length of EN vocabulary: 38,019
Length of RU vocabulary: 80,913
CPU times: user 3.08 s, sys: 13 ms, total: 3.1 s
Wall time: 3.1 s


In [15]:
torch.save(dataset.examples, f'{folder_path}/examples.pkl', pickle_module=dill)
torch.save(dataset.fields, f'{folder_path}/fields.pkl', pickle_module=dill)

# Training

In [18]:
class Trainer(object):
    def __init__(
        self, model, criterion, optimizer, scheduler, clip):
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.clip = clip
        self.n_updates = 0

    def get_lr(self):
        return self.optimizer.param_groups[0]['lr']

    def step(self, samples, tf_ratio):
        self.optimizer.zero_grad()
        bsz = samples.src.size(1)
        
        outs = self.model(samples.src, samples.trg, tf_ratio)
        loss = self.criterion(outs.view(-1, outs.size(2)), samples.trg.view(-1))

        if self.model.training:
            loss.backward()
            nn.utils.clip_grad_norm_(self.model.parameters(), self.clip)
            self.optimizer.step()
            self.n_updates += 1
        return loss


def save_model(save_vars, filename):
    model_path = os.path.join(args.savedir, filename)
    torch.save(save_vars, model_path)
    print(f'model saved: {model_path}')


def save_vocab(savedir, fields):
    name, field = fields
    save_path = os.path.join(savedir, f"{name}_vocab.txt")
    with open(save_path, 'w') as fout:
        for w in field.vocab.itos:
            fout.write(w + '\n')


def save_field(savedir, fields):
    name, field = fields
    save_path = os.path.join(savedir, f"{name}.field")
    with open(save_path, 'wb') as fout:
        dill.dump(field, fout)
    

def train(args):
    device = torch.device('cuda' if args.gpu  else 'cpu')

    train_iter, valid_iter = data.BucketIterator.splits(
        (train_data, valid_data), 
        batch_size=args.batch_size,
        sort_within_batch=True,
        sort_key= lambda x: len(x.src),
        repeat=False,
        device=device
    )
    
    model = Seq2seqAttn(args, (EN_TEXT, RU_TEXT), device).to(device)
    print(model)
    print('')

    criterion = nn.CrossEntropyLoss(ignore_index=RU_TEXT.vocab.stoi['<pad>'])
    optimizer = optim.SGD(model.parameters(), lr=args.lr)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min')
    trainer = Trainer(model, criterion, optimizer, scheduler, args.clip)
   
    epoch = 1
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    best_loss = math.inf

    while epoch < max_epoch and trainer.n_updates < max_update \
        and args.min_lr < trainer.get_lr():

        # training
        with tqdm(train_iter, dynamic_ncols=True) as pbar:
            train_loss = 0.0
            trainer.model.train()
            for samples in pbar:
                bsz = samples.src.size(1)
                loss = trainer.step(samples, args.tf_ratio)
                train_loss += loss.item()

                pbar.set_description(f"epoch {str(epoch).zfill(3)}")
                progress_state = OrderedDict(
                    loss=loss.item(),
                    ppl=math.exp(loss.item()),
                    bsz=len(samples),
                    lr=trainer.get_lr(), 
                    clip=args.clip, 
                    num_updates=trainer.n_updates)
                pbar.set_postfix(progress_state)
        train_loss /= len(train_iter)

        print(f"| epoch {str(epoch).zfill(3)} | train ", end="") 
        print(f"| loss {train_loss:.{4}} ", end="")
        print(f"| ppl {math.exp(train_loss):.{4}} ", end="")
        print(f"| lr {trainer.get_lr():.1e} ", end="")
        print(f"| clip {args.clip} ", end="")
        print(f"| num_updates {trainer.n_updates} |")
        
        valid_loss = 0.0
        trainer.model.eval()
        for samples in valid_iter:
            bsz = samples.src.size(1)
            loss = trainer.step(samples, tf_ratio=0.0)
            valid_loss += loss.item()

        valid_loss /= len(valid_iter)

        print(f"| epoch {str(epoch).zfill(3)} | valid ", end="") 
        print(f"| loss {valid_loss:.{4}} ", end="")
        print(f"| ppl {math.exp(valid_loss):.{4}} ", end="")
        print(f"| lr {trainer.get_lr():.1e} ", end="")
        print(f"| clip {args.clip} ", end="")
        print(f"| num_updates {trainer.n_updates} |")

        # saving model
        save_vars = {"train_args": args, 
                     "state_dict": model.state_dict()}

        if valid_loss < best_loss:
            best_loss = valid_loss
            save_model(save_vars, 'checkpoint_best.pt')
        save_model(save_vars, "checkpoint_last.pt")

        # update
        trainer.scheduler.step(valid_loss)
        epoch += 1

In [19]:
dataset_path = f'/content/drive/MyDrive/datasets/'
parser = argparse.ArgumentParser('')
train_opts(parser)
model_opts(parser)
args = parser.parse_args([
                  "--train", f"{dataset_path}sample_data/sample_train.tsv",
                  "--val",   f"{dataset_path}sample_data/sample_valid.tsv",
                  "--savedir", f"{dataset_path}/model",
                  "--gpu",
])
train(args)

  0%|          | 0/4832 [00:00<?, ?it/s]

Seq2seqAttn(
  (encoder): EncRNN(
    (embed): Embedding(38019, 200)
    (rnn): LSTM(200, 1024, num_layers=2)
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (decoder): DecRNN(
    (embed): Embedding(80913, 200)
    (rnn): LSTM(200, 1024, num_layers=2)
    (w): Linear(in_features=2048, out_features=1024, bias=True)
    (attn): Attention()
    (out_projection): Linear(in_features=1024, out_features=80913, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
  )
)



epoch 001: 100%|██████████| 4832/4832 [1:22:35<00:00,  1.03s/it, loss=7.3, ppl=1.48e+3, bsz=32, lr=0.25, clip=0.1, num_updates=4832]


| epoch 001 | train | loss 7.698 | ppl 2.205e+03 | lr 2.5e-01 | clip 0.1 | num_updates 4832 |
| epoch 001 | valid | loss 7.306 | ppl 1.49e+03 | lr 2.5e-01 | clip 0.1 | num_updates 4832 |
model saved: /content/drive/MyDrive/datasets//model/checkpoint_best.pt


  0%|          | 0/4832 [00:00<?, ?it/s]

model saved: /content/drive/MyDrive/datasets//model/checkpoint_last.pt


epoch 002:   1%|▏         | 71/4832 [01:12<1:21:16,  1.02s/it, loss=7.46, ppl=1.73e+3, bsz=32, lr=0.25, clip=0.1, num_updates=4903]


KeyboardInterrupt: ignored

In [22]:
def load_field(path):
    with open(path, 'rb') as f:
        return dill.load(f)


def id2w(pred, field):
    sentence = [field.vocab.itos[i] for i in pred]
    if '<eos>' in sentence:
        return ' '.join(sentence[:sentence.index('<eos>')])
    return ' '.join(sentence)
 

def translate(args):
    device = torch.device('cuda' if args.gpu  else 'cpu')

    load_vars = torch.load(args.model)
    train_args = load_vars['train_args']
    model_params = load_vars['state_dict']

    dirname = os.path.dirname(args.model)

    test_iter = data.Iterator(test_data, batch_size=args.batch_size,
                    train=False, shuffle=False, sort=False, device=device) 
 
    model = Seq2seqAttn(train_args, (EN_TEXT, RU_TEXT), device).to(device)
    model.load_state_dict(model_params)

    model.eval()
    ins_a, outs_a = [], []
    for samples in test_iter:
        preds = model(samples.src, tgts=None, maxlen=args.maxlen, tf_ratio=0.0)
        preds = preds.max(2)[1].transpose(1, 0)
        outs = [id2w(pred, RU_TEXT) for pred in preds]
        ins = [id2w(item, EN_TEXT) for item in samples.src]
        ins_a.extend(ins)
        outs_a.extend(outs)
        # print('\n'.join(outs))
    return zip(ins_a, outs_a)


In [None]:
parser = argparse.ArgumentParser()
translate_opts(parser)
args = parser.parse_args([
                          "--model", f"{dataset_path}model/checkpoint_best.pt",
                          "--input", f'{dataset_path}sample_data/sample_test.txt',
                          "--gpu"
])
res = translate(args)