<b>Sequence To Sequence Model with Multi-Head Attention</b>

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

import random

from tqdm import tqdm_notebook
import numpy as np
import math

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import torchtext
from torchtext.data import Field

from lib.checkpoint import *
from lib.stopping import Stopping
from lib.tools import *
from lib.trainlogger import *
from lib.utilities import *

logger = logging.getLogger(__name__)

%load_ext watermark
%watermark -a "tb" -d -v -m -p sys,numpy,pandas,sklearn,torch,IPython
gpu_stat()

tb 2019-01-20 

CPython 3.6.4
IPython 6.2.1

sys 3.6.4 |Anaconda custom (64-bit)| (default, Jan 16 2018, 12:04:33) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
numpy 1.14.2
pandas 0.22.0
sklearn 0.19.2
torch 1.0.0a0+1e45e7a
IPython 6.2.1

compiler   : GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 17.5.0
machine    : x86_64
processor  : i386
CPU cores  : 24
interpreter: 64bit
GPU Name: TITAN Xp
GPU Memory: 12.0GB
CUDA Version: (9, 1, 0)
GPU Free/Total Memory: 89%


In [2]:
# torch.cuda.is_available = lambda : False
# torch.backends.cudnn.enabled=False
torch.backends.cudnn.deterministic = True
# torch.backends.cudnn.benchmark = True

In [3]:
H = HYPERPARAMETERS({
    'EXPERIMENT': 'Eng2Ger',
    'DESCRIPTION': 'Sequence To Sequence model',
    'TIMESTAMP': HYPERPARAMETERS.create_timestamp(),

    'MODEL_NAME': 'Eng2Ger_ENC_DEC',

    'PRELOAD_MODEL_PATH': None, #'Eng2Ger_ENC_DEC.tar',

    'ROOT_DIR': 'data',

    'TARGET_ENCODING': 'sts',  # ' ctc

    'BATCH_SIZE': 128,
    'NUM_WORKERS': 8,

    'EMBEDDING_SIZE': 256,
    'EMBEDDING_DROPOUT': 0.2,
    'RNN_HIDDEN_SIZE': 256,
    'RNN_NUM_LAYERS': 2,
    'RNN_DROPOUT': 0.2,
    'BIDIRECTIONAL': True,

    'LR': 0.001,
    'LR_LAMBDA': lambda epoch: max(math.pow(0.78, math.floor((1 + epoch) / 200.0)), 0.01),
    'WEIGHT_DECAY': 0,
    'MOMENTUM': 0.9,
    'NESTEROV': True,

    'TEACHER_FORCING_RATIO': 0.5,

    'LABEL_SMOOTHING' : 0.2,

    'MAX_GRAD_NORM': 400,

    'MAX_EPOCHS': 30,

    'STOPPING_PATIENCE': 80,

    'CHECKPOINT_INTERVAL': 10,
    'CHECKPOINT_RESTORE': False,

    'USE_CUDA': torch.cuda.is_available(),

    'SEED': 123456,
    
    'SEQ_MAX_LEN' :         50,
    'SRC_VOCAB_MAX_SIZE' :  50000,
    'TGT_VOCAB_MAX_SIZE' :  50000,

})

In [4]:
random.seed(H.SEED)
np.random.seed(H.SEED)
torch.manual_seed(H.SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(H.SEED)
    torch.cuda.manual_seed_all(H.SEED)

In [5]:
SYM_SOS = '<sos>'
SYM_EOS = '<eos>'
SYM_PAD = '<pad>'
IDX_SOS = -1
IDX_EOS = -1
IDX_PAD = -1

In [6]:
import spacy

spacy_en = spacy.load('en')
spacy_de = spacy.load('de')

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer( text )]
    return text.split()

def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(text)]
    return text.split()

preproc = lambda seq: seq + [SYM_EOS]

src = Field(sequential=True, tokenize=tokenize_en, lower=True, batch_first=True, 
            include_lengths=True)
tgt = Field(sequential=True, tokenize=tokenize_de, lower=True, batch_first=True, init_token= SYM_SOS,
            include_lengths=True, preprocessing=preproc)

In [7]:
def len_filter(example):
    return len(example.src) <= H.SEQ_MAX_LEN and len(example.tgt) <= H.SEQ_MAX_LEN

path = os.path.join(H.ROOT_DIR, "eng-ger-data.tsv")
SRC_FIELD_NAME = 'src'
TGT_FIELD_NAME = 'tgt'


train_data, valid_data, test_data= torchtext.data.TabularDataset(
    path=path, format='tsv',
    fields=[(SRC_FIELD_NAME, src), (TGT_FIELD_NAME, tgt)],
    filter_pred=len_filter
    ).split(split_ratio=[0.8, 0.1, 0.1])

In [8]:
class Vocabulary(object):
    def __init__(self, vocab):
        self.vocab = vocab
        
    def __call__(self, val):
        if isinstance(val, str):
            res = self.vocab.stoi[val] if val in self.vocab.stoi else None
        elif isinstance(val, int):
            res = self.vocab.itos[val] if val <= self.__len__() else None
        else:
            raise RuntimeError
        return res   
    
    def __len__(self):
        return len(self.vocab.itos)
    
    def __repr__(self):
        return 'Vocab(size=' + str(len(self.vocab.itos)) + ')'

In [9]:
src.build_vocab(train_data, max_size=H.SRC_VOCAB_MAX_SIZE, min_freq=2)
tgt.build_vocab(train_data, max_size=H.TGT_VOCAB_MAX_SIZE, min_freq=2)

input_vocab = Vocabulary(src.vocab)
output_vocab = Vocabulary(tgt.vocab)

print(input_vocab, output_vocab)

IDX_PAD = output_vocab(SYM_PAD)
IDX_SOS = output_vocab(SYM_SOS)
IDX_EOS = output_vocab(SYM_EOS)

IDX_PAD, IDX_SOS, IDX_EOS

Vocab(size=9510) Vocab(size=15657)


(1, 2, 3)

In [10]:
output_vocab(3)

'<eos>'

In [11]:
train_iter, valid_iter, test_iter = torchtext.data.BucketIterator.splits(
                                (train_data, valid_data, test_data), 
                                batch_size=H.BATCH_SIZE, repeat=False, 
                                sort=False, sort_within_batch=True, 
                                sort_key=lambda x: len(x.src))


batch = next(train_iter.__iter__())
input_variables = getattr(batch, 'src')
target_variables = getattr(batch, 'tgt')

len(train_iter), len(valid_iter), len(test_iter)

(1058, 133, 133)

In [12]:
for idx_batch, batch in enumerate(train_iter):
    inputs_cpu, input_sizes_cpu = getattr(batch, SRC_FIELD_NAME)
    labels_cpu, label_sizes_cpu = getattr(batch, TGT_FIELD_NAME)
    break

In [13]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1, bidirectional=False, dropout=0.5):
        super(RNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.dropout = dropout if num_layers > 1 else 0.0

        self.rnn = nn.GRU(self.input_size, self.hidden_size, self.num_layers, batch_first=True, bias=True,
                          dropout=self.dropout, bidirectional=self.bidirectional)

    def forward(self, inputs, lengths):
        pack_seq = nn.utils.rnn.pack_padded_sequence(inputs, lengths, batch_first=True)
        pack_seq, hidden = self.rnn(pack_seq)
        outputs, lengths = nn.utils.rnn.pad_packed_sequence(pack_seq, batch_first=True)
        return outputs, lengths, hidden


In [14]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size=128, num_layers=1, 
                 embedding_dropout=0, rnn_dropout=0, bidirectional=True, initialize=None):
        super(Encoder, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.embedding_dropout = embedding_dropout
        self.rnn_dropout = rnn_dropout
        self.initialize = initialize

        self.embedding = nn.Sequential(
            nn.Embedding(self.vocab_size, self.embedding_size),
            nn.Dropout(self.embedding_dropout) 
        )        
        
        self.rnn = RNN(self.embedding_size, self.hidden_size, self.num_layers, 
                       dropout=self.rnn_dropout, bidirectional=self.bidirectional)

        if self.initialize is not None:
            self.initialize(self)

    def forward(self, inputs, input_sizes):

        outputs = self.embedding(inputs)

        outputs, output_lengths, hidden = self.rnn(outputs, input_sizes)
        
        hidden = self._cat(hidden)

        return outputs, output_lengths, hidden

    def _cat(self, h):
        """
        (#directions * #layers, #batch, hidden_size) -> (#layers, #batch, #directions * hidden_size)
        """
        if self.bidirectional:
            h = torch.cat([h[0:h.size(0):2], h[1:h.size(0):2]], 2)
        return h

In [15]:
encoder_cpu = Encoder(len(input_vocab), H.EMBEDDING_SIZE, H.RNN_HIDDEN_SIZE, 
                         num_layers=H.RNN_NUM_LAYERS, rnn_dropout=H.RNN_DROPOUT, 
                         embedding_dropout=H.EMBEDDING_DROPOUT)

enc_outputs_cpu, enc_output_sizes_cpu, enc_hidden_cpu = encoder_cpu(inputs_cpu, input_sizes_cpu)

print(model_summary(encoder_cpu))

enc_outputs_cpu.shape, enc_output_sizes_cpu.shape, enc_hidden_cpu.shape

Summary for model: Encoder
________________________________________________________________________________
Layer (type)                        Shape                           Param #     
embedding.0 (Embedding)             ((9510, 256),)                  2434560     
________________________________________________________________________________
embedding.1 (Dropout)               ()                              0           
________________________________________________________________________________
rnn.rnn (GRU)                       ((768, 256), (768, 256), (768,) 1972224     
Total params:         4,406,784
Trainable params:     4,406,784
________________________________________________________________________________



(torch.Size([128, 7, 512]), torch.Size([128]), torch.Size([2, 128, 512]))

In [16]:
class MultiHeadAttention(nn.Module):
    def __init__(self, h, d_model, p):
        super(MultiHeadAttention, self).__init__()
        self.h = h
        self.d = d_model
        self.d_head = d_model // h
        self.fc_query = nn.Linear(d_model, h * self.d_head, bias=False)
        self.fc_key = nn.Linear(d_model, h * self.d_head, bias=False)
        self.fc_value = nn.Linear(d_model, h * self.d_head, bias=False)
        self.fc_concat = nn.Linear(h * self.d_head, d_model, bias=False)
        self.sm = nn.Softmax(dim=1)
        self.dropout = nn.Dropout(p)
        self.attn_dropout = nn.Dropout(p)
        self.layernorm = nn.LayerNorm(d_model)

    def _prepare_proj(self, x):
        """Reshape the projectons to apply softmax on each head

        """
        b, l, d = x.size()
        return x.view(b, l, self.h, self.d_head).transpose(1, 2).contiguous().view(b * self.h, l, self.d_head)

    def forward(self, query, key, value, mask):
        b, len_query = query.size(0), query.size(1)
        len_key = key.size(1)

        # project inputs to multi-heads
        proj_query = self.fc_query(query)  # batch_size x len_query x h*d_head
        proj_key = self.fc_key(key)  # batch_size x len_key x h*d_head
        proj_value = self.fc_value(value)  # batch_size x len_key x h*d_head

        # prepare the shape for applying softmax
        proj_query = self._prepare_proj(proj_query)  # batch_size*h x len_query x d_head
        proj_key = self._prepare_proj(proj_key)  # batch_size*h x len_key x d_head
        proj_value = self._prepare_proj(proj_value)  # batch_size*h x len_key x d_head

        # get dotproduct softmax attns for each head
        attns = torch.bmm(proj_query, proj_key.transpose(1, 2))  # batch_size*h x len_query x len_key
        attns = attns / math.sqrt(self.d_head)
        attns = attns.view(b, self.h, len_query, len_key)
#         attns = attns.masked_fill_(mask.unsqueeze(1), -float('inf'))
        attns = self.sm(attns.view(-1, len_key))
    
        # return mean attention from all heads as coverage 
        coverage = torch.mean(attns.view(b, self.h, len_query, len_key), dim=1)
        attns = self.attn_dropout(attns)
        attns = attns.view(b * self.h, len_query, len_key)

        # apply attns on value
        out = torch.bmm(attns, proj_value)  # batch_size*h x len_query x d_head
        out = out.view(b, self.h, len_query, self.d_head).transpose(1, 2).contiguous()
        out = self.fc_concat(out.view(b, len_query, self.h * self.d_head))
        out = self.layernorm(query + self.dropout(out))
        return out, coverage

In [17]:
class Decoder(nn.Module):
    def __init__(self, vocab, max_seq_length, embedding_size=256, hidden_size=256, num_layers=1, embedding_dropout=0.5, 
                 rnn_dropout=0.5, teacher_forcing_ratio=0.5,
                 initialize=None):
        super(Decoder, self).__init__()
        self.vocab = vocab
        self.max_seq_length = max_seq_length
        self.vocab_size = len(self.vocab)
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding_dropout = embedding_dropout
        self.rnn_dropout = rnn_dropout if num_layers > 1 else 0.0
        self.teacher_forcing_ratio = teacher_forcing_ratio
        self.initialize = initialize

        self.embedding = nn.Sequential(
            nn.Embedding(self.vocab_size, self.embedding_size, padding_idx=IDX_PAD),
            nn.Dropout(self.embedding_dropout)
        )

        self.attn = MultiHeadAttention(32, self.hidden_size, 0.2)
        
        self.rnn = nn.GRU(self.embedding_size, self.hidden_size, self.num_layers, batch_first=True, bias=True,
                          dropout=self.rnn_dropout)

        self.fc = nn.Linear(self.hidden_size, self.vocab_size)

        if self.initialize is not None:
            self.initialize(self)


    def forward(self, enc_inputs, enc_input_sizes, hidden, labels=None, label_sizes=None):

        if self.training:
            assert labels is not None and label_sizes is not None, "Need labels in trainings mode."

        use_cuda = next(self.parameters()).is_cuda

        batch_size = enc_inputs.size(0)
        inputs = torch.LongTensor([self.vocab("<sos>")] * batch_size).view(batch_size, 1)
        inputs = inputs.cuda() if use_cuda else inputs

        max_length = labels.size(1) if labels is not None else self.max_seq_length + 1

#         mask = self.get_mask(enc_input_sizes).unsqueeze(1)
#         mask = mask.cuda() if use_cuda else mask
        mask = None

        dec_output_sizes = torch.LongTensor(batch_size).fill_(max_length)
        dec_output_sizes = dec_output_sizes.cuda() if use_cuda else dec_output_sizes

        dec_outputs = []
        for t in range(max_length):

            outputs, hidden = self.step(inputs, hidden, enc_inputs, mask)
            dec_outputs.append(outputs)

            inputs = outputs.topk(1)[1].view(batch_size, 1)

            dec_output_sizes[inputs.squeeze(1).eq(self.vocab('<eos>')) * dec_output_sizes.gt(t)] = t
            if labels is None and dec_output_sizes.le(t + 1).all():
                break

            if self.training and random.random() < self.teacher_forcing_ratio:
                inputs = labels[:, t].view(batch_size, 1)

        dec_outputs = torch.cat(dec_outputs, dim=1)

        return dec_outputs, dec_output_sizes

    def step(self, inputs, hidden, enc_inputs, mask):
        batch_size, output_size = inputs.size(0), inputs.size(1)

        embeddings = self.embedding(inputs)

        outputs, hidden = self.rnn(embeddings, hidden)

        outputs, _ = self.attn(outputs, enc_inputs, enc_inputs, mask)

        outputs = self.fc(outputs.contiguous().squeeze(1))

        outputs = torch.log_softmax(outputs, dim=1).view(batch_size, output_size, -1)

        return outputs, hidden

    @staticmethod
    def get_mask(lengths):
        batch_size = lengths.numel()
        mask = (torch.arange(0, lengths.max()).type_as(lengths).repeat(batch_size, 1).gt(lengths.unsqueeze(1)))
        return mask

In [18]:
decoder_cpu = Decoder(output_vocab, H.SEQ_MAX_LEN, H.RNN_HIDDEN_SIZE*2, H.RNN_HIDDEN_SIZE*2,
                         num_layers=H.RNN_NUM_LAYERS, rnn_dropout=H.RNN_DROPOUT, 
                         embedding_dropout=H.EMBEDDING_DROPOUT, teacher_forcing_ratio=H.TEACHER_FORCING_RATIO)

decoder_cpu.train()
decoder_output_cpu = decoder_cpu(enc_outputs_cpu, enc_output_sizes_cpu, enc_hidden_cpu, labels_cpu, label_sizes_cpu)

print(model_summary(decoder_cpu))

inputs_cpu[0].shape, len(decoder_output_cpu), decoder_output_cpu[0].shape

Summary for model: Decoder
________________________________________________________________________________
Layer (type)                        Shape                           Param #     
embedding.0 (Embedding)             ((15657, 512),)                 8016384     
________________________________________________________________________________
embedding.1 (Dropout)               ()                              0           
________________________________________________________________________________
attn.fc_query (Linear)              ((512, 512),)                   262144      
________________________________________________________________________________
attn.fc_key (Linear)                ((512, 512),)                   262144      
________________________________________________________________________________
attn.fc_value (Linear)              ((512, 512),)                   262144      
________________________________________________________________________________
a

(torch.Size([7]), 2, torch.Size([128, 13, 15657]))

In [19]:
class NeuralMachineTranslator(nn.Module):
    def __init__(self, src_vocab, tgt_vocab, max_seq_length, embedding_size=256, rnn_hidden_size=256, 
                 rnn_num_layers=1, rnn_dropout=0.5, embedding_dropout=0.5, teacher_forcing_ratio=0.5, 
                 initialize=None):
        super(NeuralMachineTranslator, self).__init__()
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.max_seq_length = max_seq_length
        self.embedding_size = embedding_size
        self.rnn_hidden_size = rnn_hidden_size
        self.rnn_num_layers = rnn_num_layers
        self.embedding_dropout = embedding_dropout
        self.rnn_dropout = rnn_dropout
        self.teacher_forcing_ratio = teacher_forcing_ratio
        self.initialize = initialize

        self.enc = Encoder(len(src_vocab), embedding_size=self.embedding_size, 
                           hidden_size=self.rnn_hidden_size, num_layers=self.rnn_num_layers, 
                           embedding_dropout=self.embedding_dropout, rnn_dropout=self.rnn_dropout, 
                           bidirectional=True, initialize=self.initialize)

        self.dec = Decoder(tgt_vocab, max_seq_length=self.max_seq_length, embedding_size=self.rnn_hidden_size *2,
                           hidden_size=self.rnn_hidden_size * 2, num_layers=self.rnn_num_layers, 
                           embedding_dropout=self.embedding_dropout, rnn_dropout=self.rnn_dropout, 
                           teacher_forcing_ratio=self.teacher_forcing_ratio, initialize=self.initialize)

    def forward(self, inputs, input_sizes, labels=None, label_sizes=None):
        outputs, output_sizes, hidden = self.enc(inputs, input_sizes)

        outputs, output_sizes = self.dec(outputs, output_sizes, hidden, labels, label_sizes)

        return outputs, output_sizes

In [20]:
class Trainer(object):
    def __init__(self, model, loader, optimizer, scheduler, criterion, decoder, scorer, max_grade_norm=None):
        self.model = model
        self.loader = loader
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.criterion = criterion
        self.decoder = decoder
        self.scorer = scorer
        self.max_grade_norm = max_grade_norm
        self.use_cuda = next(self.model.parameters()).is_cuda

    def __call__(self, epoch):
        self.model.train(True)

        self.scheduler.step(epoch)

        train_lr = [float(param_group['lr']) for param_group in self.optimizer.param_groups][0]

        total_size, total_loss, total_score = 0, 0.0, 0.0
        for idx_batch, batch in enumerate(self.loader):
            inputs, input_sizes = getattr(batch, SRC_FIELD_NAME)
            labels, label_sizes = getattr(batch, TGT_FIELD_NAME)
            if next(self.model.parameters()).is_cuda:
                inputs, labels = inputs.cuda(), labels.cuda()

            outputs, output_sizes = self.model(inputs, input_sizes, labels, label_sizes)

            loss = self.criterion(outputs, output_sizes, labels, label_sizes)
            total_loss += loss.item()

#             preds_seq, label_seq = self.decoder(outputs, output_sizes, labels, label_sizes)
#             total_score += self.scorer(preds_seq, label_seq)
            total_score += 1
    
            total_size += inputs.size(0)

            self.optimizer.zero_grad()
            loss.backward()
            if self.max_grade_norm is not None:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grade_norm)
            self.optimizer.step()

            del outputs
            del loss

        return total_loss / total_size, 1.0 - min(1.0, total_score / total_size), train_lr
    

In [21]:
class Evaluator(object):
    def __init__(self, model, loader, criterion, decoder, scorer):
        self.model = model
        self.loader = loader
        self.criterion = criterion
        self.decoder = decoder
        self.scorer = scorer
        self.use_cuda = next(self.model.parameters()).is_cuda

    def __call__(self):
        self.model.eval()

        with torch.no_grad():
            total_size, total_loss, total_score = 0, 0.0, 0.0
            for idx_batch, batch in enumerate(self.loader):
                inputs, input_sizes = getattr(batch, SRC_FIELD_NAME)
                labels, label_sizes = getattr(batch, TGT_FIELD_NAME)
                if next(self.model.parameters()).is_cuda:
                    inputs, labels = inputs.cuda(), labels.cuda()

                outputs, output_sizes = self.model(inputs, input_sizes, labels, label_sizes)

#                 loss = self.criterion(outputs, output_sizes, labels, label_sizes)
#                 total_loss += loss.item()
                total_loss += 1

                preds_seq, label_seq = self.decoder(outputs, output_sizes, labels, label_sizes)
                total_score += self.scorer(preds_seq, label_seq)

                total_size += inputs.size(0)

                del outputs
#                 del loss

            return total_loss / total_size, 1.0 - min(1.0, total_score / total_size)


In [22]:
class LabelSmoothingLoss(nn.Module):
    def __init__(self, padding_idx, label_smoothing=0.0):
        super(LabelSmoothingLoss, self).__init__()
        self.criterion = nn.KLDivLoss(reduction='sum')
        self.padding_idx = padding_idx
        self.confidence = 1.0 - label_smoothing
        self.smoothing = label_smoothing

    def __call__(self, inputs, input_sizes, labels, label_sizes):
        return self.forward(inputs, input_sizes, labels, label_sizes)

    def forward(self, inputs, input_sizes, labels, label_sizes):
        b, t, c = inputs.size()
        inputs = inputs.view(b * t, c)

        b, t = labels.size()
        labels = labels.view(b * t)

        true_dist = inputs.data.clone()
        true_dist.fill_(self.smoothing / (inputs.size(1) - 2))
        true_dist.scatter_(1, labels.data.unsqueeze(1), self.confidence)
        true_dist[:, self.padding_idx] = 0

        mask = torch.nonzero(labels.data == self.padding_idx)
        if mask.dim() > 0:
            true_dist.index_fill_(0, mask.squeeze(), 0.0)

        return self.criterion(inputs, true_dist.detach())


In [23]:
class STSDecoder(object):
    def __init__(self, vocab):
        self.vocab = vocab

    @staticmethod
    def decode_labels(labels, label_sizes, vocab):
        idx_sos, idx_eos, idx_pad = vocab('<sos>'), vocab('<eos>'), vocab('<pad>')

        lseq = []
        for seq, size in zip(labels, label_sizes):
            lseq.append(' '.join([vocab(c.item()) for c in seq[0:size - 1] if c not in [idx_sos, idx_eos, idx_pad]]))

        return lseq

    @staticmethod
    def decode_probas(probas, probas_sizes, vocab, probabilities=False):
        max_vals, max_indices = torch.max(probas, 2)
        idx_sos, idx_eos, idx_pad = vocab('<sos>'), vocab('<eos>'), vocab('<pad>')

        decoded_seq = []
        for seq_idx, seq_len, seq_proba in zip(max_indices.cpu(), probas_sizes, max_vals):
            txt, probas = '', []

            for i in range(min(seq_len, len(seq_idx))):
                c = seq_idx[i].item()
                if c in [idx_sos, idx_eos, idx_pad]:
                    continue
                txt += vocab(c) + ' '
                probas.append(math.exp(seq_proba[i].item()))

            if probabilities:
                decoded_seq.append((txt.strip(), stats.mean(probas) if len(probas) > 0 else 0))
            else:
                decoded_seq.append(txt.strip())
        return decoded_seq

    def __call__(self, inputs, inputs_sizes, labels=None, label_sizes=None, probabilities=False):

        decoder_seq = self.decode_probas(inputs, inputs_sizes, self.vocab, probabilities=probabilities)

        label_seq = None
        if labels is not None and label_sizes is not None:
            label_seq = self.decode_labels(labels, label_sizes, self.vocab)

        return decoder_seq, label_seq

In [24]:
from lib.scorer import Scorer
from lib.stopping import Stopping

m = Metric([('train_loss', np.inf), ('train_score', np.inf), ('valid_loss', np.inf), ('valid_score', 0),
            ('train_lr', 0), ('valid_cer', np.inf)])

model = NeuralMachineTranslator(input_vocab, output_vocab, H.SEQ_MAX_LEN, 
                                embedding_size=H.EMBEDDING_SIZE, rnn_hidden_size=H.RNN_HIDDEN_SIZE,
                                rnn_num_layers=H.RNN_NUM_LAYERS, rnn_dropout=H.RNN_DROPOUT, 
                                embedding_dropout=H.EMBEDDING_DROPOUT, 
                                teacher_forcing_ratio=H.TEACHER_FORCING_RATIO,
                                initialize=torch_weight_init)
if H.USE_CUDA:
    model.cuda()

logging.info(model_summary(model, line_length=100))

if H.PRELOAD_MODEL_PATH:
    path = os.path.join(H.EXPERIMENT, H.PRELOAD_MODEL_PATH)
    state = torch.load(path)
    model.load_state_dict(state)
    logging.info("Preloaded model: {}".format(path))

criterion = LabelSmoothingLoss(padding_idx=IDX_PAD, label_smoothing=H.LABEL_SMOOTHING)

sts_decoder = STSDecoder(output_vocab)

scorer = Scorer()

optimizer = optim.Adam(list(filter(lambda p: p.requires_grad, model.parameters())),
                       amsgrad=False,
                       betas=(0.9, 0.999),
                       eps=1e-08,
                       lr=H.LR,
                       weight_decay=H.WEIGHT_DECAY)

stopping = Stopping(model, patience=H.STOPPING_PATIENCE)

scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=[H.LR_LAMBDA])

tlogger = TensorboardLogger(root_dir=H.EXPERIMENT, experiment_dir=H.TIMESTAMP)  # PytorchLogger()

checkpoint = Checkpoint(model, optimizer, stopping, m,
                        root_dir=H.EXPERIMENT, experiment_dir=H.TIMESTAMP, restore_from=-1,
                        interval=H.CHECKPOINT_INTERVAL, verbose=0)

trainer = Trainer(model, train_iter, optimizer, scheduler, criterion, sts_decoder, scorer, H.MAX_GRAD_NORM)

evaluator = Evaluator(model, valid_iter, criterion, sts_decoder, scorer)

In [25]:
epoch_start = 1
if H.CHECKPOINT_RESTORE:
    epoch_start = checkpoint.restore() + 1
    train_loader.batch_sampler.shuffle(epoch_start)

epoch = epoch_start
try:
    epoch_itr = tlogger.set_itr(range(epoch_start, H.MAX_EPOCHS + 1))

    for epoch in epoch_itr:

        with DelayedKeyboardInterrupt():

            m.train_loss, m.train_score, m.train_lr = trainer(epoch)

            m.valid_loss, m.valid_score = evaluator()

            if checkpoint:
                checkpoint.step(epoch)

            stopping_flag = stopping.step(epoch, m.valid_loss, m.valid_score)

            epoch_itr.log_values(m.train_loss, m.train_score, m.train_lr, m.valid_loss, m.valid_score,
                                 stopping.best_score_epoch, stopping.best_score)

            if stopping_flag:
                logger.info(
                    "Early stopping at epoch: %d, score %f" % (stopping.best_score_epoch, stopping.best_score))
                break

#             train_loader.batch_sampler.shuffle(epoch)

except KeyboardInterrupt:
    logger.info("Training interrupted at: {}".format(epoch))
    pass

checkpoint.create(epoch)

model.load_state_dict(stopping.best_score_state)
torch.save(model.state_dict(), os.path.join(H.EXPERIMENT, H.MODEL_NAME + '.tar'))

logger.info(repr(tlogger))
logger.info(repr(stopping))
logger.info(repr(checkpoint))

logger.info("Training end.")

In [27]:
model_pre = NeuralMachineTranslator(input_vocab, output_vocab, H.SEQ_MAX_LEN, 
                                    embedding_size=H.EMBEDDING_SIZE, rnn_hidden_size=H.RNN_HIDDEN_SIZE,
                                    rnn_num_layers=H.RNN_NUM_LAYERS, rnn_dropout=H.RNN_DROPOUT, 
                                    embedding_dropout=H.EMBEDDING_DROPOUT, 
                                    teacher_forcing_ratio=H.TEACHER_FORCING_RATIO,
                                    initialize=torch_weight_init)
if H.USE_CUDA:
    model_pre.cuda()

path = os.path.join(H.EXPERIMENT, H.MODEL_NAME + '.tar')
state = torch.load(path)
model_pre.load_state_dict(state)

sts_decoder = STSDecoder(output_vocab)
scorer = Scorer()


In [28]:
class Recognizer(object):
    def __init__(self, model, decoder, loader, probabilities=False):
        self.model = model
        self.decoder = decoder
        self.loader = loader
        self.probabilities = probabilities

        self.use_cuda = next(self.model.parameters()).is_cuda

    def __call__(self):

        self.model.eval()
        with torch.no_grad():

            decoder_seq = []
            for idx_batch, batch in enumerate(self.loader):
                inputs, input_sizes = getattr(batch, SRC_FIELD_NAME)

                if next(self.model.parameters()).is_cuda:
                    inputs = inputs.cuda()

                logits, logit_sizes = self.model(inputs, input_sizes)

                seq, _ = self.decoder(logits, logit_sizes, None, None, probabilities=self.probabilities)

                decoder_seq.extend(seq)

                del logits

        return decoder_seq

In [29]:
recognizer = Recognizer(model_pre, sts_decoder, test_iter)

hypotheses = recognizer()


In [30]:
transcripts = []
for idx_batch, batch in enumerate(test_iter):
    labels, label_sizes = getattr(batch, TGT_FIELD_NAME)

    label_seq = STSDecoder.decode_labels(labels, label_sizes, output_vocab)
    transcripts.extend(label_seq)


In [31]:
transcripts[0:5]

['ich mag hunde .',
 'hört bitte damit auf .',
 "kontrollieren sie 's einfach .",
 'tom war angewidert .',
 'hallo , alle miteinander !']

In [32]:
hypotheses[0:5]

['ich mag hunde .',
 'unterlassen sie es bitte !',
 'kontrollier sie einfach .',
 'tom war angewidert .',
 'hallo zusammen !']

In [33]:
from lib.scorer import Scorer

recognizer = Recognizer(model_pre, sts_decoder, test_iter)

hypotheses = recognizer()

transcripts = []
for idx_batch, batch in enumerate(test_iter):
    target_variables, target_lengths = getattr(batch, TGT_FIELD_NAME)    
    label_seq = STSDecoder.decode_labels(target_variables, target_lengths, output_vocab)
    transcripts.extend(label_seq)

bleu = Scorer.get_moses_multi_bleu(hypotheses, transcripts, lowercase=False)
wer, cer = Scorer.get_wer_cer(hypotheses, transcripts)
acc = Scorer.get_acc(hypotheses, transcripts)

print('Test Summary \n'
        'Bleu: {bleu:.3f}\n'
        'WER:  {wer:.3f}\n'
        'CER:  {cer:.3f}\n'
        'ACC:  {acc:.3f}'.format(bleu=bleu, wer=wer * 100, cer=cer * 100, acc=acc * 100))


Test Summary 
Bleu: 41.110
WER:  35.228
CER:  33.395
ACC:  17.702


In [34]:
def beam_search(model, vocab, encoder_outputs, encoder_output_sizes, encoder_hidden,
                beam_size=3, alpha=0.1, beta=0.3, max_seq_len=64):
    vocab_size = len(vocab)
    idx_sos, idx_eos, idx_pad = vocab('<sos>'), vocab('<eos>'), vocab('<pad>')
    use_cuda = next(model.parameters()).is_cuda

    batch_size = encoder_outputs.size(0)
    inputs = torch.LongTensor([vocab("<sos>")] * batch_size).view(batch_size, 1)
    inputs = inputs.cuda() if use_cuda else inputs

    #         mask = self.get_mask(enc_input_sizes).unsqueeze(1)
    #         mask = mask.cuda() if use_cuda else mask
    mask = None

    decode_outputs, decode_hidden = model.dec.step(inputs, encoder_hidden, encoder_outputs, mask)

    search_outputs = []

    for batch_idx in range(decode_outputs.size(0)):
        probas = []
        preds = []

        dec_outputs = decode_outputs[batch_idx].unsqueeze(0).contiguous()
        dec_hidden = decode_hidden[:, batch_idx, :].unsqueeze(1).contiguous()
        enc_outputs = encoder_outputs[batch_idx, :, :].unsqueeze(0).contiguous()

        scores, scores_idx = dec_outputs.view(-1).topk(beam_size)
        scores_idx = scores_idx.fmod(vocab_size).view(beam_size, 1)

        dec_inputs = scores_idx.view(1, beam_size, 1)
        scores = scores.view(beam_size)
        dec_hidden = dec_hidden.repeat(1, beam_size, 1)
        enc_outputs = enc_outputs.repeat(beam_size, 1, 1)

        remaining_beams = beam_size
        for step in range(max_seq_len):
            dec_outputs, dec_hidden = model.dec.step(dec_inputs[-1], dec_hidden, enc_outputs, mask)

            dec_outputs = scores.unsqueeze(1) + dec_outputs[:, -1, :]
            scores, scores_idx = dec_outputs.view(-1).topk(remaining_beams)

            scores_idx = scores_idx.fmod(vocab_size).view(remaining_beams, 1)

            dec_inputs = torch.cat((dec_inputs, scores_idx.unsqueeze(0)), 0)

            index = (scores_idx[:, -1].eq(idx_eos) + scores_idx[:, -1].eq(idx_pad)).flatten()

            finished, continue_idx = index.nonzero().flatten(), (index ^ 1).nonzero().flatten()

            if preds == [] and (step + 1) == max_seq_len:
                finished = continue_idx
                remaining_beams = 0
                
            for idx in finished:
                probas.append(scores[idx].item())
                preds.append(dec_inputs[:, idx].flatten().tolist())
                remaining_beams -= 1

            if remaining_beams <= 0:
                break

            if len(continue_idx) > 0:
                dec_inputs = dec_inputs.index_select(1, continue_idx)
                dec_hidden = dec_hidden.index_select(1, continue_idx)
                enc_outputs = enc_outputs.index_select(0, continue_idx)
                scores = scores.index_select(0, continue_idx)

        len_penalties = [math.pow(len(pred), alpha) for pred in preds]
        final_scores = [probas[i] / len_penalties[i] for i in range(len(preds))]
        sorted_scores_arg = sorted(range(len(preds)), key=lambda i: -final_scores[i])
        best_beam = sorted_scores_arg[0]

        search_outputs.append(preds[best_beam])

    return search_outputs

In [35]:
%%time

model_pre.eval()
with torch.no_grad():

    hypotheses = []
    references = []
    for idx_batch, batch in enumerate(test_iter):
        inputs, input_sizes = getattr(batch, SRC_FIELD_NAME)
        labels, label_sizes = getattr(batch, TGT_FIELD_NAME)
        if next(model_pre.parameters()).is_cuda:
            inputs, labels = inputs.cuda(), labels.cuda()

        enc_outputs, enc_output_sizes, enc_hidden = model_pre.enc(inputs, input_sizes)
        max_seq_len = labels.size(1) if labels is not None else H.MAX_SEQ_LENGTH
         
        results = beam_search(model_pre, output_vocab, 
                              enc_outputs, enc_output_sizes, enc_hidden, 
                              beam_size=20, alpha=0.1, beta=0.3, max_seq_len=64)

        for row in results:
            hypotheses.append(' '.join([output_vocab(t) for t in row if t not in [IDX_PAD, IDX_SOS, IDX_EOS]]))

        references.extend(STSDecoder.decode_labels(labels[:,1:], label_sizes-1, output_vocab))

CPU times: user 7min 48s, sys: 619 ms, total: 7min 49s
Wall time: 7min 48s


In [36]:
hypotheses[0:5]

['ich mag hunde .',
 'unterlassen sie es bitte',
 'kontrollier sie einfach .',
 'tom war angewidert .',
 'hallo zusammen !']

In [37]:
references[0:5]

['ich mag hunde .',
 'hört bitte damit auf .',
 "kontrollieren sie 's einfach .",
 'tom war angewidert .',
 'hallo , alle miteinander !']

In [38]:
from lib.scorer import Scorer

bleu = Scorer.get_moses_multi_bleu(hypotheses, references, lowercase=False)
wer, cer = Scorer.get_wer_cer(hypotheses, references)
acc = Scorer.get_acc(hypotheses, references)

print('Test Summary \n'
        'Bleu: {bleu:.3f}\n'
        'WER:  {wer:.3f}\n'
        'CER:  {cer:.3f}\n'
        'ACC:  {acc:.3f}'.format(bleu=bleu, wer=wer * 100, cer=cer * 100, acc=acc * 100))


Test Summary 
Bleu: 38.210
WER:  37.487
CER:  35.314
ACC:  17.135


In [39]:
while True:
    seq_str = input("Type in a source sequence:")
    print(">> ", seq_str)
    if not len(seq_str):
        break
    #seq = seq_str.strip().lower().split()
    seq = tokenize_en(seq_str.strip().lower())
    print(seq)

    seq_id = [input_vocab(tok) for tok in seq]

    model_pre.eval()
    with torch.no_grad():

        src_id_seq = torch.LongTensor(seq_id).view(1, -1)
        src_id_seq = src_id_seq.cuda() if torch.cuda.is_available() else src_id_seq
        
        src_id_length = torch.LongTensor([len(seq_id)])

        outputs, outputs_sizes = model_pre(src_id_seq, src_id_length)
        tgt_seq = STSDecoder.decode_probas(outputs, outputs_sizes, output_vocab)
        print("<< ", ' '.join(tgt_seq))

print("Finished.")

Type in a source sequence:I am at home.
>>  I am at home.
['i', 'am', 'at', 'home', '.']
<<  ich bin zu hause .
Type in a source sequence:
>>  
Finished.
