<b>Transformer</b>

adapted from: https://github.com/zhangxiangnick/Transformer-py

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

import random

from tqdm import tqdm_notebook
import numpy as np
import math, copy, time

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import torchtext
from torchtext.data import Field

from lib.checkpoint import *
from lib.stopping import Stopping
from lib.tools import *
from lib.trainlogger import *
from lib.utilities import *

logger = logging.getLogger(__name__)

%load_ext watermark
%watermark -a "tb" -d -v -m -p sys,numpy,pandas,sklearn,torch,IPython
gpu_stat()

tb 2019-02-01 

CPython 3.6.4
IPython 6.2.1

sys 3.6.4 |Anaconda custom (64-bit)| (default, Jan 16 2018, 12:04:33) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
numpy 1.14.2
pandas 0.22.0
sklearn 0.19.2
torch 1.0.0a0+1e45e7a
IPython 6.2.1

compiler   : GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 17.5.0
machine    : x86_64
processor  : i386
CPU cores  : 24
interpreter: 64bit
GPU Name: TITAN Xp
GPU Memory: 12.0GB
CUDA Version: (9, 1, 0)
GPU Free/Total Memory: 96%


In [2]:
# Debug switches (left disabled): force CPU execution / turn cuDNN off entirely.
# torch.cuda.is_available = lambda : False
# torch.backends.cudnn.enabled=False
# Ask cuDNN to pick deterministic algorithms so seeded runs are repeatable.
torch.backends.cudnn.deterministic = True
# torch.backends.cudnn.benchmark = True

In [3]:
# Central experiment configuration. HYPERPARAMETERS comes from the project's
# `lib` package; in this notebook its entries are read as attributes
# (e.g. H.BATCH_SIZE, H.SEED).
H = HYPERPARAMETERS({
    # --- experiment bookkeeping -------------------------------------------
    'EXPERIMENT': 'Eng2Ger',
    'DESCRIPTION': 'Transformer model',
    'TIMESTAMP': HYPERPARAMETERS.create_timestamp(),

    'MODEL_NAME': 'Eng2Ger_TRANSFORMER',

    # Relative path (under EXPERIMENT) of a state dict to preload; None = train from scratch.
    'PRELOAD_MODEL_PATH': None,

    'ROOT_DIR': 'data',

    'TARGET_ENCODING': 'sts',  # presumably 'ctc' is the alternative value - TODO confirm

    # --- data loading -------------------------------------------------------
    'BATCH_SIZE': 64,
    'NUM_WORKERS': 8,

    # --- RNN-style settings -------------------------------------------------
    # NOTE(review): none of the EMBEDDING_*/RNN_*/BIDIRECTIONAL entries are read by
    # any cell visible in this notebook; the Transformer sizes are passed literally
    # where the model is constructed.
    'EMBEDDING_SIZE': 256,
    'EMBEDDING_DROPOUT': 0.2,
    'RNN_HIDDEN_SIZE': 256,
    'RNN_NUM_LAYERS': 2,
    'RNN_DROPOUT': 0.2,
    'BIDIRECTIONAL': True,

    # --- optimisation -------------------------------------------------------
    'LR': 0.0003,
    # Multiplicative LR factor: decays by 0.78 every 200 epochs, floored at 0.01.
    'LR_LAMBDA': lambda epoch: max(math.pow(0.78, math.floor((1 + epoch) / 200.0)), 0.01),
    'WEIGHT_DECAY': 0,
    # NOTE(review): MOMENTUM and NESTEROV are not read by any visible cell (Adam is used).
    'MOMENTUM': 0.9,
    'NESTEROV': True,

    'LABEL_SMOOTHING' : 0.2,

    # Gradient-norm clipping threshold used in the training loop.
    'MAX_GRAD_NORM': 1,

    'MAX_EPOCHS': 30,

    'STOPPING_PATIENCE': 80,

    'CHECKPOINT_INTERVAL': 10,
    'CHECKPOINT_RESTORE': False,

    'USE_CUDA': torch.cuda.is_available(),

    'SEED': 123456,
    
    # --- data limits --------------------------------------------------------
    # Examples whose source or target exceeds SEQ_MAX_LEN tokens are filtered out.
    'SEQ_MAX_LEN' :         50,
    'SRC_VOCAB_MAX_SIZE' :  50000,
    'TGT_VOCAB_MAX_SIZE' :  50000,

})

In [4]:
# Seed every random number generator the notebook relies on (Python, NumPy,
# PyTorch CPU and, when present, all CUDA devices) with the configured seed.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(H.SEED)
if torch.cuda.is_available():
    for _seed_fn in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        _seed_fn(H.SEED)

In [5]:
# Special tokens added to target sequences / used for padding.
SYM_SOS = '<sos>'
SYM_EOS = '<eos>'
SYM_PAD = '<pad>'
# Placeholder indices; the real values are assigned from the target
# vocabulary once it has been built (see the vocabulary cell below).
IDX_SOS = -1
IDX_EOS = -1
IDX_PAD = -1

In [6]:
import spacy

# Load the English and German spaCy pipelines; only their tokenizers are used below.
# NOTE(review): the 'en' / 'de' shortcut names need the corresponding spaCy models
# to be installed and linked - confirm against the installed spaCy version.
spacy_en = spacy.load('en')
spacy_de = spacy.load('de')

def tokenize_en(text):
    """Tokenize an English string with the spaCy tokenizer.

    Args:
        text: raw input string.

    Returns:
        list of token strings, in order.
    """
    # The former fallback `return text.split()` sat after this return and could
    # never run; it has been removed.
    return [tok.text for tok in spacy_en.tokenizer(text)]

def tokenize_de(text):
    """Tokenize a German string with the spaCy tokenizer.

    Args:
        text: raw input string.

    Returns:
        list of token strings, in order.
    """
    # The former fallback `return text.split()` sat after this return and could
    # never run; it has been removed.
    return [tok.text for tok in spacy_de.tokenizer(text)]

# Wrap a tokenized target sequence in start/end-of-sequence markers.
preproc = lambda seq: [SYM_SOS] + seq + [SYM_EOS]

# Source (English) and target (German) fields. Both are lower-cased, batch-first and
# return (tensor, lengths) pairs because include_lengths=True. Only the target field
# gets the <sos>/<eos> wrapping.
src = Field(sequential=True, tokenize=tokenize_en, lower=True, batch_first=True, 
            include_lengths=True)
tgt = Field(sequential=True, tokenize=tokenize_de, lower=True, batch_first=True, 
            include_lengths=True, preprocessing=preproc)

In [7]:
def len_filter(example):
    """Keep an example only if both its source and target fit within H.SEQ_MAX_LEN tokens."""
    if len(example.src) > H.SEQ_MAX_LEN:
        return False
    return len(example.tgt) <= H.SEQ_MAX_LEN

# Tab-separated parallel corpus: column 0 = source sentence, column 1 = target sentence.
# NOTE(review): `os` is not imported explicitly in this notebook; presumably it is
# provided by one of the `from lib.* import *` star imports - confirm.
path = os.path.join(H.ROOT_DIR, "eng-ger-data.tsv")
SRC_FIELD_NAME = 'src'
TGT_FIELD_NAME = 'tgt'

# Load the corpus, drop over-long pairs via len_filter, then split 80/10/10.
# NOTE(review): the unpacking assumes .split() returns the parts in
# train/valid/test order for a three-element ratio - confirm against torchtext.
train_data, valid_data, test_data = torchtext.data.TabularDataset(
    path=path, format='tsv',
    fields=[(SRC_FIELD_NAME, src), (TGT_FIELD_NAME, tgt)],
    filter_pred=len_filter
    ).split(split_ratio=[0.8, 0.1, 0.1])

In [8]:
class Vocabulary(object):
    """Thin bidirectional lookup wrapper around a vocab object.

    The wrapped object must expose ``stoi`` (token -> index mapping) and
    ``itos`` (index -> token sequence).
    """

    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, val):
        """Translate between tokens and indices.

        Args:
            val: a token string (returns its index) or an integer index
                (returns its token).

        Returns:
            The index/token, or ``None`` when the token is unknown or the
            index lies outside the vocabulary.

        Raises:
            RuntimeError: if ``val`` is neither ``str`` nor ``int``.
        """
        if isinstance(val, str):
            # Membership test (rather than plain indexing) so that unknown tokens
            # yield None even when `stoi` is a defaultdict.
            return self.vocab.stoi[val] if val in self.vocab.stoi else None
        if isinstance(val, int):
            size = len(self)
            # Valid positions are -size .. size-1. The previous `val <= size` check
            # let `val == size` through and raised IndexError instead of returning None.
            return self.vocab.itos[val] if -size <= val < size else None
        raise RuntimeError('Vocabulary lookup expects str or int, got {}'.format(type(val).__name__))

    def __len__(self):
        return len(self.vocab.itos)

    def __repr__(self):
        return 'Vocab(size=' + str(len(self.vocab.itos)) + ')'

In [9]:
# Build both vocabularies from the training split only; tokens seen fewer than
# two times are left out, and the size is capped by the configured maximum.
src.build_vocab(train_data, max_size=H.SRC_VOCAB_MAX_SIZE, min_freq=2)
tgt.build_vocab(train_data, max_size=H.TGT_VOCAB_MAX_SIZE, min_freq=2)

input_vocab = Vocabulary(src.vocab)
output_vocab = Vocabulary(tgt.vocab)

print(input_vocab, output_vocab)

# Replace the -1 placeholders with the real indices from the *target* vocabulary.
# NOTE(review): IDX_PAD is later also passed to the encoder as padding index, which
# assumes <pad> has the same index in the source vocabulary - confirm.
IDX_PAD = output_vocab(SYM_PAD)
IDX_SOS = output_vocab(SYM_SOS)
IDX_EOS = output_vocab(SYM_EOS)

IDX_PAD, IDX_SOS, IDX_EOS

Vocab(size=9510) Vocab(size=15657)


(1, 3, 2)

In [10]:
# Batch iterators over the three splits. Batches are sorted internally by source
# length (sort_within_batch=True with the sort_key below); repeat=False makes each
# iterator stop after one pass over its split.
train_iter, valid_iter, test_iter = torchtext.data.BucketIterator.splits(
                                (train_data, valid_data, test_data), 
                                batch_size=H.BATCH_SIZE, repeat=False, 
                                sort=False, sort_within_batch=True, 
                                sort_key=lambda x: len(x.src))


# Peek at one batch. Each attribute is a (tensor, lengths) pair because the
# fields were created with include_lengths=True.
# NOTE(review): input_variables / target_variables are not used by any later visible cell.
batch = next(train_iter.__iter__())
input_variables = getattr(batch, 'src')
target_variables = getattr(batch, 'tgt')

# Number of batches per split.
len(train_iter), len(valid_iter), len(test_iter)

(2115, 265, 265)

In [11]:
# Grab the first training batch (padded index tensors plus their lengths) and stop;
# it is used below for a CPU smoke test of the model.
for idx_batch, batch in enumerate(train_iter):
    inputs_cpu, input_sizes_cpu = getattr(batch, SRC_FIELD_NAME)
    labels_cpu, label_sizes_cpu = getattr(batch, TGT_FIELD_NAME)
    break

In [12]:
import math

import numpy as np
import torch
import torch.nn as nn


class PositionalEncoding(nn.Module):
    """Scale embeddings by sqrt(dim) and add fixed sinusoidal position signals.

    Args:
        dim: embedding size (even columns receive sine, odd columns cosine).
        dropout: dropout probability applied to the summed result.
        max_len: number of positions precomputed in the lookup table.
    """

    def __init__(self, dim, dropout=0.0, max_len=5000):
        super().__init__()
        self.dim = dim

        # angle[pos, i] = pos / 10000^(2i / dim)
        positions = torch.arange(0.0, max_len).unsqueeze(1).float()
        inv_freq = torch.exp((torch.arange(0.0, dim, 2) * -(math.log(10000.0) / dim)))
        angles = positions * inv_freq

        table = torch.zeros(max_len, dim)
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)

        # Registered as a buffer: follows the module across devices and is stored
        # in the state dict under 'pe', but is not a trainable parameter.
        self.register_buffer('pe', table.unsqueeze(0))
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Return dropout(x * sqrt(dim) + pe[:, :x.size(1)])."""
        scaled = x * math.sqrt(self.dim)
        return self.dropout(scaled + self.pe[:, :x.size(1)])


class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention with residual connection and LayerNorm.

    Args:
        num_heads: number of attention heads.
        d_model: model width; each head works on ``d_model // num_heads`` features.
        droput: dropout probability (parameter name kept as spelled for callers),
            applied to the attention weights and to the output projection.
    """

    def __init__(self, num_heads, d_model, droput):
        super().__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        self.droput = droput

        self.d_head = d_model // num_heads
        inner_dim = num_heads * self.d_head

        self.fc_query = nn.Linear(d_model, inner_dim, bias=False)
        self.fc_key = nn.Linear(d_model, inner_dim, bias=False)
        self.fc_value = nn.Linear(d_model, inner_dim, bias=False)

        self.fc_concat = nn.Linear(inner_dim, d_model, bias=False)

        # applied to a (rows, len_key) view, so dim=1 normalises over the keys
        self.softmax = nn.Softmax(dim=1)

        self.attn_dropout = nn.Dropout(droput)
        self.dropout = nn.Dropout(droput)

        self.norm = nn.LayerNorm(d_model)

    def _prepare_proj(self, x):
        """Reshape (batch, len, heads*d_head) into (batch*heads, len, d_head)."""
        batch, length, _ = x.size()
        per_head = x.view(batch, length, self.num_heads, self.d_head).transpose(1, 2)
        return per_head.contiguous().view(batch * self.num_heads, length, self.d_head)

    def forward(self, query, key, value, mask):
        """Attend from ``query`` over ``key``/``value``.

        Args:
            query: batch_size x len_query x d_model
            key, value: batch_size x len_key x d_model
            mask: tensor broadcastable (after a head axis is inserted at dim 1) to
                batch_size x heads x len_query x len_key; non-zero entries are blocked.

        Returns:
            (out, coverage): ``out`` is batch_size x len_query x d_model; ``coverage``
            is the attention averaged over heads, batch_size x len_query x len_key.
        """
        batch, len_query = query.size(0), query.size(1)
        len_key = key.size(1)
        heads = self.num_heads

        # project and split into heads: batch_size*h x len x d_head
        q = self._prepare_proj(self.fc_query(query))
        k = self._prepare_proj(self.fc_key(key))
        v = self._prepare_proj(self.fc_value(value))

        # scaled dot-product scores, blocked positions set to -inf before the softmax
        scores = torch.bmm(q, k.transpose(1, 2)) / math.sqrt(self.d_head)
        scores = scores.view(batch, heads, len_query, len_key)
        scores = scores.masked_fill_(mask.unsqueeze(1), -float('inf'))
        weights = self.softmax(scores.view(-1, len_key))

        # mean attention over heads, taken before dropout
        coverage = weights.view(batch, heads, len_query, len_key).mean(dim=1)

        weights = self.attn_dropout(weights).view(batch * heads, len_query, len_key)

        # weighted sum of values, then merge the heads back together
        mixed = torch.bmm(weights, v).view(batch, heads, len_query, self.d_head)
        mixed = mixed.transpose(1, 2).contiguous().view(batch, len_query, heads * self.d_head)

        out = self.fc_concat(mixed)
        out = self.dropout(out).add_(query)  # residual connection
        return self.norm(out), coverage

class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward block with residual connection and LayerNorm.

    Args:
        d_model: input/output width.
        d_ff: hidden width of the inner layer.
        dropout: dropout probability used inside and after the block.
    """

    def __init__(self, d_model, d_ff, dropout):
        super().__init__()
        self.d_model, self.d_ff, self.dropout = d_model, d_ff, dropout

        expand = nn.Linear(d_model, d_ff)
        project = nn.Linear(d_ff, d_model)
        # Sequential positions (0 and 3 hold the linears) define the state-dict keys.
        self.fc = nn.Sequential(expand, nn.ReLU(inplace=True), nn.Dropout(dropout), project)
        self.drop = nn.Dropout(dropout)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, inputs):
        """Return LayerNorm(inputs + dropout(fc(inputs)))."""
        hidden = self.fc(inputs)
        return self.norm(self.drop(hidden).add_(inputs))


class EncoderLayer(nn.Module):
    """One encoder layer: multi-head attention followed by the feed-forward block."""

    def __init__(self, num_heads, d_model, dropout, d_ff):
        super().__init__()
        self.num_heads, self.d_model = num_heads, d_model
        self.d_ff, self.dropout = d_ff, dropout

        self.attention = MultiHeadAttention(num_heads, d_model, dropout)
        self.ff = PositionwiseFeedForward(d_model, d_ff, dropout)

    def forward(self, query, key, value, mask):
        """Run attention (its coverage output is discarded) and the feed-forward block."""
        attended, _coverage = self.attention(query, key, value, mask)
        return self.ff(attended)


class DecoderLayer(nn.Module):
    """One decoder layer: target self-attention, source attention, feed-forward."""

    def __init__(self, num_heads, d_model, dropout, d_ff):
        super().__init__()
        self.num_heads, self.d_model = num_heads, d_model
        self.d_ff, self.dropout = d_ff, dropout

        self.attention_tgt = MultiHeadAttention(num_heads, d_model, dropout)
        self.attention_src = MultiHeadAttention(num_heads, d_model, dropout)
        self.ff = PositionwiseFeedForward(d_model, d_ff, dropout)

    def forward(self, query, key, value, context, mask_tgt, mask_src):
        """Return (output, coverage); coverage comes from the source attention."""
        self_attended, _ = self.attention_tgt(query, key, value, mask_tgt)
        cross_attended, coverage = self.attention_src(self_attended, context, context, mask_src)
        return self.ff(cross_attended), coverage


class Encoder(nn.Module):
    """Embedding + positional encoding followed by a stack of encoder layers."""

    def __init__(self, vocab_size, num_heads, d_model, dropout, d_ff, num_layers=6, padding_idx=1):
        super().__init__()
        self.vocab_size, self.num_heads, self.d_model = vocab_size, num_heads, d_model
        self.padding_idx, self.num_layers = padding_idx, num_layers
        self.d_ff, self.dropout = d_ff, dropout

        self.embeddings = nn.Embedding(vocab_size, d_model, padding_idx=padding_idx)
        self.pos_emb = PositionalEncoding(d_model, dropout, max_len=512)

        stack = []
        for _ in range(num_layers):
            stack.append(EncoderLayer(num_heads, d_model, dropout, d_ff))
        self.layers = nn.ModuleList(stack)

    def forward(self, src):
        """Encode a batch of source index sequences.

        Args:
            src: batch_size x len_src tensor of token indices.

        Returns:
            (context, mask_src): context is batch_size x len_src x d_model;
            mask_src is batch_size x 1 x len_src and marks padding positions.
        """
        mask_src = src.data.eq(self.padding_idx).unsqueeze(1)
        context = self.pos_emb(self.embeddings(src))  # batch_size x len_src x d_model
        for layer in self.layers:
            context = layer(context, context, context, mask_src)
        return context, mask_src


class Decoder(nn.Module):
    """Embedding + positional encoding, a stack of decoder layers and a tied output projection.

    Args:
        vocab_size: size of the target vocabulary.
        num_heads, d_model, dropout, d_ff: forwarded to every DecoderLayer.
        num_layers: number of decoder layers.
        padding_idx: index of the padding token in the target vocabulary.
        max_len: longest target sequence supported; sizes both the positional
            table and the causal mask.
    """

    def __init__(self, vocab_size, num_heads, d_model, dropout, d_ff, num_layers=6, padding_idx=1,
                 max_len=512):
        super(Decoder, self).__init__()
        self.vocab_size = vocab_size
        self.num_heads = num_heads
        self.d_model = d_model
        self.padding_idx = padding_idx
        self.num_layers = num_layers
        self.d_ff = d_ff
        self.dropout = dropout
        self.max_len = max_len

        self.embedding = nn.Embedding(self.vocab_size, self.d_model, padding_idx=self.padding_idx)

        self.pos_emb = PositionalEncoding(self.d_model, self.dropout, max_len=max_len)

        self.layers = nn.ModuleList(
            [DecoderLayer(self.num_heads, self.d_model, self.dropout, self.d_ff) for _ in range(self.num_layers)]
        )

        self.fc = nn.Linear(self.d_model, self.vocab_size, bias=True)

        # tie weight between word embedding and generator
        self.fc.weight = self.embedding.weight

        self.logsoftmax = nn.LogSoftmax(dim=1)

        # Pre-save a strictly-upper-triangular mask that hides future positions in the
        # decoder self-attention. Kept as a buffer so it follows the module to the GPU
        # instead of being rebuilt on every call.
        # The mask is sized by the maximum sequence length. It used to be sized by
        # d_model, which only worked because d_model happened to be >= the longest
        # target; any smaller d_model broke forward() for longer sequences.
        # NOTE: the buffer is part of the state dict, so checkpoints saved with
        # d_model != max_len will report a size mismatch for 'mask' when loaded.
        mask = torch.ByteTensor(np.triu(np.ones((max_len, max_len)), k=1).astype('uint8'))
        self.register_buffer('mask', mask)

    def forward(self, tgt, context, mask_src):
        """Decode target prefixes against the encoder output.

        Args:
            tgt: batch_size x len_tgt tensor of target token indices.
            context: encoder output, batch_size x len_src x d_model.
            mask_src: source padding mask as returned by the encoder.

        Returns:
            (log_probs, coverage): log_probs is (batch_size*len_tgt) x vocab_size;
            coverage is the source-attention coverage of the last decoder layer.

        Raises:
            ValueError: if the target is longer than ``max_len``.
        """
        len_tgt = tgt.size(1)
        if len_tgt > self.mask.size(0):
            raise ValueError('target length {} exceeds max_len {}'.format(len_tgt, self.mask.size(0)))

        out = self.embedding(tgt)  # batch_size x len_tgt x d_model

        out = self.pos_emb(out)

        # A position is blocked when the key is padding OR lies in the future.
        mask_tgt = tgt.data.eq(self.padding_idx).unsqueeze(1) + self.mask[:len_tgt, :len_tgt]
        mask_tgt = torch.gt(mask_tgt, 0)
        for _, layer in enumerate(self.layers):
            out, coverage = layer(out, out, out, context, mask_tgt, mask_src)  # batch_size x len_tgt x d_model

        out = self.fc(out)  # batch_size x len_tgt x vocab_size

        out = self.logsoftmax(out.view(-1, self.vocab_size))
        return out, coverage


class Transformer(nn.Module):
    """Encoder-decoder Transformer with greedy and beam-search decoding helpers.

    Args:
        src_vocab: source vocabulary; only ``len()`` is used here.
        tgt_vocab: target vocabulary; must support ``len()`` and be callable with
            '<sos>' / '<eos>' / '<pad>' to obtain their indices.
        num_heads, d_model, dropout, d_ff, num_layers: forwarded to Encoder and Decoder.
        padding_idx: padding index, shared by the encoder and the decoder.
    """

    def __init__(self, src_vocab, tgt_vocab, num_heads, d_model, dropout, d_ff, num_layers=6, padding_idx=1):
        super(Transformer, self).__init__()
        self.src_vocab = src_vocab
        self.src_vocab_size = len(src_vocab)
        self.tgt_vocab = tgt_vocab
        self.tgt_vocab_size = len(tgt_vocab)
        self.num_heads = num_heads
        self.d_model = d_model
        self.d_ff = d_ff
        self.num_layers = num_layers
        self.dropout = dropout
        self.padding_idx = padding_idx

        self.encode = Encoder(self.src_vocab_size, self.num_heads, self.d_model, self.dropout, self.d_ff,
                              self.num_layers, self.padding_idx)
        self.decode = Decoder(self.tgt_vocab_size, self.num_heads, self.d_model, self.dropout, self.d_ff,
                              self.num_layers, self.padding_idx)

    def forward(self, src, src_sizes, tgt, tgt_sizes, ):
        """Teacher-forced pass.

        ``src_sizes`` is accepted but not used. Returns the decoder log-probabilities
        reshaped to (batch_size, -1, tgt_vocab_size) together with ``tgt_sizes - 1``.
        """
        context, mask_src = self.encode(src)
        outputs, _ = self.decode(tgt, context, mask_src)

        probas = outputs.view(src.size(0), -1, self.tgt_vocab_size)
        
        return probas, tgt_sizes-1

    def decode_greedy(self, inputs, max_seq_length=50, fixed_length=False):
        """Greedy decoding: feed the arg-max token back in at every step.

        Args:
            inputs: batch of source index sequences.
            max_seq_length: maximum number of decoding steps.
            fixed_length: when True, always run all ``max_seq_length`` steps instead
                of stopping once every sequence has produced '<eos>'.

        Returns:
            (dec_outputs, dec_output_sizes): per-step log-probabilities concatenated
            along dim 1, and a per-sequence size tensor.

        Note:
            Switches the module to eval mode and does not switch it back.
        """

        self.eval()
        with torch.no_grad():
            
            idx_sos, idx_eos = self.tgt_vocab('<sos>'), self.tgt_vocab('<eos>')

            context, mask_src = self.encode(inputs)

            batch_size = inputs.size(0)
            # every sequence starts with a single <sos> token
            decode_input = torch.ones(batch_size, 1).fill_(idx_sos).type_as(inputs)

            # sizes start at the maximum and are lowered when <eos> is first predicted
            dec_output_sizes = torch.LongTensor(batch_size).fill_(max_seq_length).type_as(inputs)

            dec_outputs = []
            for step in range(max_seq_length):
                # the whole prefix is re-decoded at every step
                outputs, _ = self.decode(decode_input, context, mask_src)
                outputs = outputs.view(batch_size, -1, self.tgt_vocab_size)

                # decode_input holds step + 1 tokens, so index `step` is the newest position
                dec_outputs.append(outputs[:, step, :].unsqueeze(1))

                preds = torch.max(outputs[:, -1, :], dim=1)[1]

                # record the step of the first <eos>; the gt(step) guard keeps earlier hits
                dec_output_sizes[preds.eq(idx_eos) * dec_output_sizes.gt(step)] = step
                # NOTE(review): on the final step this condition holds for every sequence
                # (sizes never exceed max_seq_length), so sizes are incremented there too;
                # with fixed_length=True they are never incremented - confirm intended.
                if not fixed_length and dec_output_sizes.le(step + 1).all():
                    dec_output_sizes += 1
                    break

                decode_input = torch.cat([decode_input, preds.unsqueeze(1)], dim=1)

            dec_outputs = torch.cat(dec_outputs, dim=1)

        return dec_outputs, dec_output_sizes

    def decode_beam(self, inputs, labels=None, max_seq_length=50, beam_size=64, alpha=0.1, beta=0.3):
        """Beam-search decoding, one source sequence at a time.

        Args:
            inputs: batch of source index sequences.
            labels: optional target batch; only its length (dim 1) is used, as the
                maximum decoding length in place of ``max_seq_length``.
            max_seq_length: maximum decoding length when ``labels`` is None.
            beam_size, alpha, beta: forwarded to ``beam_search``.

        Returns:
            list with one entry per input sequence, each the first element of what
            ``beam_search`` returns for that sequence.

        Note:
            Unlike decode_greedy this neither switches to eval mode nor disables
            gradient tracking; the caller has to do both.
        """

        context, mask_src = self.encode(inputs)

        max_seq_len = labels.size(1) if labels is not None else max_seq_length

        dec_outputs = []
        for idx in range(context.size(0)):
            target, _ = beam_search(self, self.tgt_vocab, context[idx].unsqueeze(0), mask_src[idx].unsqueeze(0),
                                    beam_size=beam_size, alpha=alpha, beta=beta, max_seq_len=max_seq_len)
            dec_outputs.append(target)

        return dec_outputs


def beam_search(model, vocab, context, mask_src, beam_size=64, alpha=0.1, beta=0.3, max_seq_len=64):
    """Beam search for a single source sequence.

    Args:
        model: object exposing ``decode(tgt, context, mask_src)`` that returns
            (log-probabilities of shape (beams*len_tgt, vocab_size), coverage of
            shape (beams, len_tgt, len_src)).
        vocab: target vocabulary; supports ``len()`` and is callable with
            '<sos>', '<eos>' and '<pad>' to obtain their indices.
        context: encoder output for ONE sequence, 1 x len_src x d_model.
        mask_src: source padding mask for that sequence (broadcast over the beams).
        beam_size: number of hypotheses kept alive.
        alpha: exponent of the length penalty ``len(pred) ** alpha``.
        beta: weight of the coverage penalty (computed, but currently not part of
            the final score).
        max_seq_len: maximum number of expansion steps after the first token.

    Returns:
        (tokens, coverage): the best hypothesis as a list of token indices
        (including the leading <sos>) and the coverage tensor recorded for it.
        If no hypothesis ends in <eos>/<pad> within ``max_seq_len`` steps, the best
        unfinished hypothesis is returned instead of raising IndexError.
    """
    probas = []              # accumulated log-probability of each recorded hypothesis
    preds = []               # token indices of each recorded hypothesis
    probs = []               # coverage of each recorded hypothesis
    coverage_penalties = []  # kept for the optional coverage-penalised score

    def _record(row, scores, decode_inputs, coverage):
        """Store hypothesis `row` together with its score, coverage and penalty."""
        probas.append(scores[row].item())
        preds.append(decode_inputs[row, :].tolist())
        probs.append(coverage[row, :, :])

        atten_prob = torch.sum(coverage[row, :, :], dim=0)
        coverage_penalty = torch.log(atten_prob.masked_select(atten_prob.le(1)))
        coverage_penalties.append(beta * torch.sum(coverage_penalty).item())

    vocab_size = len(vocab)
    idx_sos, idx_eos, idx_pad = vocab('<sos>'), vocab('<eos>'), vocab('<pad>')

    # Start from a single <sos> token, created on the same device as the encoder output.
    decode_inputs = torch.tensor([[idx_sos]], dtype=torch.long, device=context.device)

    decode_outputs, coverage = model.decode(decode_inputs, context, mask_src)

    scores, scores_idx = decode_outputs.view(-1).topk(beam_size)
    # Flat index -> (parent beam, token). Floor division and modulo keep both integral;
    # `/` is true division on current PyTorch and produced float indices.
    beam_idx = scores_idx // vocab_size
    pred_idx = (scores_idx % vocab_size).view(beam_size, -1)

    decode_inputs = torch.cat((decode_inputs.repeat(beam_size, 1), pred_idx), 1)
    coverage = coverage.index_select(0, beam_idx)
    context = context.repeat(beam_size, 1, 1)

    remaining_beams = beam_size
    for step in range(max_seq_len):
        decode_outputs, coverage = model.decode(decode_inputs, context, mask_src)

        decode_outputs = decode_outputs.view(remaining_beams, -1, vocab_size)
        decode_outputs = scores.unsqueeze(1) + decode_outputs[:, -1, :]
        scores, scores_idx = decode_outputs.view(-1).topk(remaining_beams)

        beam_idx = scores_idx // vocab_size
        pred_idx = (scores_idx % vocab_size).view(remaining_beams, -1)

        decode_inputs = torch.cat((decode_inputs[beam_idx], pred_idx), 1)
        # Coverage rows belong to the parent beams; reorder them like the hypotheses so
        # that row i describes hypothesis i. (All context rows are identical copies of
        # the same sentence, so the context needs no reordering.)
        coverage = coverage.index_select(0, beam_idx)

        last_tokens = decode_inputs[:, -1]
        finished_mask = last_tokens.eq(idx_eos) | last_tokens.eq(idx_pad)
        finished = finished_mask.nonzero().flatten()
        continue_idx = finished_mask.eq(0).nonzero().flatten()

        for row in finished.tolist():
            _record(row, scores, decode_inputs, coverage)
            remaining_beams -= 1

        if remaining_beams <= 0:
            break

        # keep only the still-open hypotheses for the next step
        scores = scores.index_select(0, continue_idx)
        decode_inputs = decode_inputs.index_select(0, continue_idx)
        coverage = coverage.index_select(0, continue_idx)
        context = context.index_select(0, continue_idx)

    if not preds:
        # Nothing finished within max_seq_len steps: fall back to the open hypotheses
        # so the caller still gets a best guess.
        for row in range(decode_inputs.size(0)):
            _record(row, scores, decode_inputs, coverage)

    len_penalties = [math.pow(len(pred), alpha) for pred in preds]
    # Alternative score including the coverage penalty:
    #     final_scores = [probas[i] / len_penalties[i] + coverage_penalties[i] for i in range(len(preds))]
    final_scores = [probas[i] / len_penalties[i] for i in range(len(preds))]

    best_beam = max(range(len(preds)), key=lambda i: final_scores[i])

    return preds[best_beam], probs[best_beam]


In [13]:
# CPU smoke test: build a model and push the first training batch through it.
model_cpu = Transformer(input_vocab, output_vocab, num_heads=8, d_model=512, 
                        dropout=0.1, d_ff=1024, num_layers=6, padding_idx=IDX_PAD)

# Transformer.forward returns a (probas, sizes) pair, so outputs_cpu is a tuple here.
outputs_cpu = model_cpu(inputs_cpu, input_sizes_cpu, labels_cpu, label_sizes_cpu)

# Disabled experiments below. NOTE(review): they would need updating before use -
# decode_greedy takes a maximum length (not the label tensor) as its second
# argument, and decode_beam returns a single list rather than a pair.

# print(outputs_cpu.shape, output_sizes_cpu.shape)

# outputs_cpu, output_sizes_cpu = model_cpu.decode_greedy(inputs_cpu, labels_cpu)

# print(outputs_cpu.shape, output_sizes_cpu.shape)

# outputs_cpu, output_sizes_cpu = model_cpu.decode_beam(inputs_cpu, labels_cpu)

# print(outputs_cpu.shape, output_sizes_cpu.shape)

In [14]:
class STSDecoder(object):
    """Turn label tensors and per-step log-probabilities into space-joined token strings."""

    def __init__(self, vocab):
        self.vocab = vocab

    @staticmethod
    def decode_labels(labels, label_sizes, vocab):
        """Decode reference index sequences.

        For every row the first ``size - 1`` entries are read; <sos>, <eos> and
        <pad> are dropped and the remaining tokens joined with single spaces.
        """
        specials = [vocab('<sos>'), vocab('<eos>'), vocab('<pad>')]

        decoded = []
        for seq, size in zip(labels, label_sizes):
            words = []
            for tok in seq[0:size - 1]:
                if tok.item() not in specials:
                    words.append(vocab(tok.item()))
            decoded.append(' '.join(words))
        return decoded

    @staticmethod
    def decode_probas(probas, probas_sizes, vocab, probabilities=False):
        """Decode the arg-max path of a (batch, time, vocab) log-probability tensor.

        Returns a list of strings, or of (string, mean token probability) pairs
        when ``probabilities`` is True.
        """
        best_vals, best_ids = torch.max(probas, 2)
        specials = [vocab('<sos>'), vocab('<eos>'), vocab('<pad>')]

        decoded_seq = []
        for seq_idx, seq_len, seq_proba in zip(best_ids.cpu(), probas_sizes, best_vals):
            pieces, confidences = [], []

            for i in range(min(seq_len, len(seq_idx))):
                tok_id = seq_idx[i].item()
                if tok_id in specials:
                    continue
                pieces.append(vocab(tok_id) + ' ')
                # values are log-probabilities; exponentiate to get probabilities
                confidences.append(math.exp(seq_proba[i].item()))

            text = ''.join(pieces).strip()
            if not probabilities:
                decoded_seq.append(text)
            else:
                decoded_seq.append((text, stats.mean(confidences) if len(confidences) > 0 else 0))
        return decoded_seq

    def __call__(self, inputs, inputs_sizes, labels=None, label_sizes=None, probabilities=False):
        """Decode predictions and, when both label arguments are given, the references.

        Returns (decoded predictions, decoded labels or None).
        """
        decoder_seq = self.decode_probas(inputs, inputs_sizes, self.vocab, probabilities=probabilities)

        if labels is None or label_sizes is None:
            return decoder_seq, None
        return decoder_seq, self.decode_labels(labels, label_sizes, self.vocab)

In [15]:
class LabelSmoothingLoss(nn.Module):
    """KL-divergence loss against a label-smoothed target distribution.

    The true class receives ``1 - label_smoothing``; the smoothing mass is spread
    evenly over all other classes except the padding class. Rows whose target is
    the padding index contribute nothing.

    Args:
        tgt_vocab_size: number of target classes (must be greater than 2).
        label_smoothing: smoothing mass in [0, 1]; 0 reduces to plain one-hot targets.
        padding_idx: class index to ignore.
    """

    def __init__(self, tgt_vocab_size, label_smoothing=0.0, padding_idx=0):
        super(LabelSmoothingLoss, self).__init__()
        # 0.0 is accepted: the previous strict `0.0 < label_smoothing` check rejected
        # the constructor's own default value.
        assert 0.0 <= label_smoothing <= 1.0, 'label_smoothing must lie in [0, 1]'
        assert tgt_vocab_size > 2, 'tgt_vocab_size must be greater than 2'
        self.ignore_index = padding_idx

        # spread over every class except the true one and the padding class
        smoothing_value = label_smoothing / (tgt_vocab_size - 2)
        one_hot = torch.full((tgt_vocab_size,), smoothing_value)
        one_hot[self.ignore_index] = 0
        self.register_buffer('one_hot', one_hot.unsqueeze(0))

        self.confidence = 1.0 - label_smoothing

    def forward(self, outputs, output_sizes, targets, target_sizes):
        """Compute the summed KL divergence.

        Args:
            outputs: log-probabilities, batch x time x classes.
            output_sizes: unused; kept for a uniform criterion signature.
            targets: class indices, batch x time.
            target_sizes: unused; kept for a uniform criterion signature.

        Returns:
            scalar tensor: KL divergence summed over all non-padding positions.
        """
        b, t, c = outputs.size()
        outputs = outputs.view(b * t, c)

        b, t = targets.size()
        targets = targets.view(b * t)

        # smoothed target distribution, one row per position
        model_prob = self.one_hot.repeat(targets.size(0), 1)
        model_prob.scatter_(1, targets.unsqueeze(1), self.confidence)
        # zero out rows that correspond to padding
        model_prob.masked_fill_((targets == self.ignore_index).unsqueeze(1), 0)

        return F.kl_div(outputs, model_prob, reduction='sum')

In [16]:
class NoamOptimizer(optim.Adam):
    """Adam whose learning rate is overwritten at every step by a warm-up schedule.

    The rate rises linearly for ``warmup_steps`` steps and then decays with the
    inverse square root of the step number (see ``lrate``).

    Args:
        params: parameters (or parameter groups) to optimise.
        d_model: model width used to scale the rate.
        factor: constant multiplier of the schedule.
        warmup_steps: number of linear warm-up steps.
        betas, eps: forwarded to Adam.
    """

    def __init__(self, params, d_model, factor=2, warmup_steps=4000, betas=(0.9, 0.98), eps=1e-9):
        super(NoamOptimizer, self).__init__(params, betas=betas, eps=eps)
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        self.lr = 0
        self.step_num = 0
        self.factor = factor

    def step(self, closure=None):
        """Advance the schedule, set the new rate on every group and take an Adam step.

        Args:
            closure: optional callable that re-evaluates the model and returns the loss.

        Returns:
            whatever ``Adam.step`` returns (the closure's loss, or None).
        """
        self.step_num += 1
        self.lr = self.lrate(self.step_num)
        for group in self.param_groups:
            group['lr'] = self.lr
        # Forward the closure and hand back its result; both were previously dropped.
        return super(NoamOptimizer, self).step(closure)

    def lrate(self, epoch):
        """Learning rate for step number ``epoch`` (the argument is a step count, 1-based)."""
        return self.factor * self.d_model ** (-0.5) * min(epoch ** (-0.5), epoch * self.warmup_steps ** (-1.5))


In [17]:
from torch import nn

class AccuracyScorer(nn.Module):
    """Token-level accuracy that ignores padding positions.

    Args:
        pad_index: target index that marks padding.
    """

    def __init__(self, pad_index=0):
        super(AccuracyScorer, self).__init__()

        self.pad_index = pad_index

    def forward(self, outputs, output_sizes, targets, target_sizes):
        """Return the fraction of non-padding targets matched by the arg-max prediction.

        Args:
            outputs: scores, batch x time x vocabulary.
            output_sizes: unused; kept for a uniform scorer signature.
            targets: class indices, batch x time.
            target_sizes: unused; kept for a uniform scorer signature.

        Returns:
            float in [0, 1]; 0.0 when the batch holds no non-padding target
            (this case previously raised ZeroDivisionError).
        """
        batch_size, seq_len, vocabulary_size = outputs.size()

        outputs = outputs.view(batch_size * seq_len, vocabulary_size)
        targets = targets.view(batch_size * seq_len)

        valid = targets != self.pad_index
        count = valid.sum().item()
        if count == 0:
            return 0.0

        # a position counts only if it is a real target and the prediction matches
        corrects = (outputs.argmax(dim=1) == targets) & valid

        return corrects.sum().item() / float(count)

In [18]:
# https://discuss.pytorch.org/t/implementation-of-function-like-numpy-roll/964/8
def roll(x, shift, dim=-1, fill_pad = None):
    """Cyclically shift ``x`` along ``dim`` (like ``numpy.roll``), optionally padding.

    Args:
        x: input tensor.
        shift: number of positions; negative shifts move elements towards index 0.
        dim: dimension to roll along.
        fill_pad: when not None, the elements that would wrap around are replaced
            by this value instead.

    Returns:
        ``x`` itself when ``shift`` is 0, otherwise a new tensor.
    """
    if shift == 0:
        return x

    size = x.size(dim)
    leftward = shift < 0
    # index at which the tensor is cut into a leading and a trailing part
    split = -shift if leftward else size - shift

    head = x.index_select(dim, torch.arange(split).to(x.device))
    tail = x.index_select(dim, torch.arange(split, size).to(x.device))

    # the part that crosses the boundary: the head for a left shift, the tail otherwise
    wrapped, kept = (head, tail) if leftward else (tail, head)
    if fill_pad is not None:
        wrapped = fill_pad * torch.ones_like(wrapped, device=x.device)

    pieces = [kept, wrapped] if leftward else [wrapped, kept]
    return torch.cat(pieces, dim=dim)

In [19]:
from lib.scorer import Scorer
from lib.stopping import Stopping

# Metric container with initial values; Metric comes from the project's `lib` package.
m = Metric([('train_loss', np.inf), ('train_score', np.inf), ('valid_loss', np.inf), ('valid_score', 0),
            ('train_lr', 0), ('valid_cer', np.inf)])

model = Transformer(input_vocab, output_vocab, num_heads=8, d_model=512,  dropout=0.1, d_ff=1024, 
                    num_layers=6, padding_idx=IDX_PAD)

# Re-initialise every parameter with more than one dimension (weight matrices,
# embeddings). torch_weight_init is a project helper; its scheme is not visible here.
for p in model.parameters():
    if p.dim() > 1:
        torch_weight_init(p)

if H.USE_CUDA:
    model.cuda()

logging.info(model_summary(model, line_length=100))

# Optionally start from previously saved weights.
if H.PRELOAD_MODEL_PATH:
    path = os.path.join(H.EXPERIMENT, H.PRELOAD_MODEL_PATH)
    state = torch.load(path)
    model.load_state_dict(state)
    logging.info("Preloaded model: {}".format(path))    
    
    
criterion = LabelSmoothingLoss(len(output_vocab), label_smoothing=H.LABEL_SMOOTHING, padding_idx=IDX_PAD)
if H.USE_CUDA:
    criterion.cuda()
    
# Converts model outputs and label tensors into strings for the string-based scorer.
sts_decoder = STSDecoder(output_vocab)

scorer = Scorer()
# scorer = AccuracyScorer(pad_index=IDX_PAD)

optimizer = optim.Adam(list(filter(lambda p: p.requires_grad, model.parameters())),
                       amsgrad=False,
                       betas=(0.9, 0.999),
                       eps=1e-08,
                       lr=H.LR,
                       weight_decay=H.WEIGHT_DECAY)

# Alternative warm-up schedule (disabled).
# NOTE(review): it passes d_model=256 while the model above is built with d_model=512.
# optimizer = NoamOptimizer(list(filter(lambda p:p.requires_grad, model.parameters())),
#                           d_model=256, factor=2, warmup_steps=20000, betas=(0.9, 0.98), eps=1e-9)

stopping = Stopping(model, patience=H.STOPPING_PATIENCE)

# NOTE(review): the scheduler is created here, but the only scheduler.step() call in
# the training loop is commented out, so H.LR_LAMBDA has no effect after construction.
scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=[H.LR_LAMBDA])

tlogger = TensorboardLogger(root_dir=H.EXPERIMENT, experiment_dir=H.TIMESTAMP)  # PytorchLogger()

checkpoint = Checkpoint(model, optimizer, stopping, m,
                        root_dir=H.EXPERIMENT, experiment_dir=H.TIMESTAMP, restore_from=-1,
                        interval=H.CHECKPOINT_INTERVAL, verbose=0)


In [20]:
# path = os.path.join(H.EXPERIMENT, H.MODEL_NAME + '.tar')
# state = torch.load(path)
# model.load_state_dict(state)

In [21]:
# Main training loop: one teacher-forced pass over the training split and one
# greedy-decoded pass over the validation split per epoch, followed by
# checkpointing and early stopping.
epoch_start = 1
if H.CHECKPOINT_RESTORE:
    epoch_start = checkpoint.restore() + 1
#     train_loader.batch_sampler.shuffle(epoch_start)

epoch = epoch_start
try:
    epoch_itr = tlogger.set_itr(range(epoch_start, H.MAX_EPOCHS + 1))

    for epoch in epoch_itr:
        
#         with DelayedKeyboardInterrupt():

        model.train(True)

#         scheduler.step()
    
        # learning rate of the first parameter group, recorded for logging
        train_lr = [float(param_group['lr']) for param_group in optimizer.param_groups][0]

        total_size, total_loss, total_score = 0, 0.0, 0.0
        for idx_batch, batch in enumerate(train_iter):
            inputs, input_sizes = getattr(batch, SRC_FIELD_NAME)
            labels, label_sizes = getattr(batch, TGT_FIELD_NAME)
            if next(model.parameters()).is_cuda:
                inputs, labels = inputs.cuda(), labels.cuda()

            probas, proba_sizes = model(inputs, input_sizes, labels, label_sizes) 

            # Targets are the labels shifted left by one (the vacated last slot is filled
            # with padding), so the output at position t is scored against token t + 1.
            loss = criterion(probas, proba_sizes, roll(labels, -1, dim=-1, fill_pad=IDX_PAD), label_sizes-1)
            total_loss += loss.item()      
            
            preds_seq, label_seq = sts_decoder(probas, proba_sizes, labels, label_sizes)
            total_score += scorer(preds_seq, label_seq)
#             total_score += scorer(probas, proba_sizes, labels, label_sizes)
            
            total_size += inputs.size(0)

            optimizer.zero_grad()
            loss.backward()
            
            if H.MAX_GRAD_NORM is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), H.MAX_GRAD_NORM)
            optimizer.step()

            del probas
            del loss
            
        # The criterion sums over the batch, so dividing by the number of sentences
        # gives a per-sentence average.
        m.train_loss = total_loss / total_size
        # NOTE(review): presumably the scorer returns an error measure that is turned
        # into a "higher is better" score here - confirm against lib.scorer.
        m.train_score = 1.0 - min(1.0, total_score / total_size)
        m.train_lr = train_lr
    
        #-----------------------------------------------------------
        
        model.eval()
        
        with torch.no_grad():

            # NOTE(review): hypotheses / references are initialised but never filled in this cell.
            hypotheses = []
            references = []
            total_size, total_loss, total_score = 0, 0.0, 0.0
            for idx_batch, batch in enumerate(valid_iter):
                inputs, input_sizes = getattr(batch, SRC_FIELD_NAME)
                labels, label_sizes = getattr(batch, TGT_FIELD_NAME)
                if next(model.parameters()).is_cuda:
                    inputs, labels = inputs.cuda(), labels.cuda()

                # Validation uses free-running greedy decoding (no teacher forcing), forced
                # to run for exactly labels.size(1) steps so the output lines up with the
                # shifted labels in the loss below.
                probas, proba_sizes = model.decode_greedy(inputs, labels.size(1), fixed_length=True)
                
                loss = criterion(probas, proba_sizes, roll(labels, -1, dim=-1, fill_pad=IDX_PAD), label_sizes-1)
                total_loss += loss.item()      

                preds_seq, label_seq = sts_decoder(probas, proba_sizes, labels, label_sizes)
                total_score += scorer(preds_seq, label_seq)
#                 total_score += scorer(probas, proba_sizes, labels, label_sizes)

                total_size += inputs.size(0)
                
            # NOTE(review): these run once after the loop and raise NameError if
            # valid_iter yields no batches.
            del probas
            del loss

        m.valid_loss = total_loss / total_size
        m.valid_score = 1.0 - min(1.0, total_score / total_size)

        if checkpoint:
            checkpoint.step(epoch)

        stopping_flag = stopping.step(epoch, m.valid_loss, m.valid_score)

        epoch_itr.log_values(m.train_loss, m.train_score, m.train_lr, m.valid_loss, m.valid_score,
                             stopping.best_score_epoch, stopping.best_score)

        if stopping_flag:
            logger.info(
                "Early stopping at epoch: %d, score %f" % (stopping.best_score_epoch, stopping.best_score))
            break

#             train_loader.batch_sampler.shuffle(epoch)

except KeyboardInterrupt:
    # A manual interrupt ends training but still falls through to the save logic below.
    logger.info("Training interrupted at: {}".format(epoch))
    pass

checkpoint.create(epoch)

# Restore the best weights tracked by the early-stopping helper and save them.
model.load_state_dict(stopping.best_score_state)
torch.save(model.state_dict(), os.path.join(H.EXPERIMENT, H.MODEL_NAME + '.tar'))

logger.info(repr(tlogger))
logger.info(repr(stopping))
logger.info(repr(checkpoint))

logger.info("Training end.")

In [22]:
# NOTE(review): presumably a deliberate error to halt "Run All" before the
# evaluation / scratch cells below - confirm, and consider removing it.
1/0

ZeroDivisionError: division by zero

In [None]:
# Rebuild the architecture with the same settings as the training cell and load the
# saved weights for evaluation.
model_pre = Transformer(input_vocab, output_vocab, num_heads=8, d_model=512,  dropout=0.1, d_ff=1024, 
                    num_layers=6, padding_idx=IDX_PAD)


if H.USE_CUDA:
    model_pre.cuda()

# Same file the training cell writes (H.MODEL_NAME is 'Eng2Ger_TRANSFORMER').
path = os.path.join(H.EXPERIMENT, 'Eng2Ger_TRANSFORMER' + '.tar')
state = torch.load(path)
model_pre.load_state_dict(state)

scorer = Scorer()


In [None]:
# Decode only the first test batch so it can be inspected in the cells below.
hypotheses = []
references = []
for idx_batch, batch in enumerate(test_iter):
    inputs, input_sizes = getattr(batch, SRC_FIELD_NAME)
    labels, label_sizes = getattr(batch, TGT_FIELD_NAME)
    if next(model_pre.parameters()).is_cuda:
        inputs, labels = inputs.cuda(), labels.cuda()

    # labels.size(1) is passed as the maximum number of decoding steps
    probas, proba_sizes = model_pre.decode_greedy(inputs, labels.size(1))
    break


In [None]:
probas.shape

In [None]:
labels.shape

In [None]:
labels[0]

In [None]:
probas.max(2)[1][0]

In [None]:
proba_sizes[0]

In [None]:
label_sizes

In [None]:
label_sizes += 1

In [None]:
probas.max(2)[1][0]

In [None]:
roll(labels, -1, dim=-1, fill_pad=IDX_PAD)[0]

In [None]:
preds_seq, label_seq = sts_decoder(probas, proba_sizes, labels, label_sizes)

In [None]:
preds_seq[0], label_seq[0]

In [None]:
_, max_indices = torch.max(probas, 2)

In [None]:
max_indices[0]

In [None]:
torch.LongTensor(max_indices.size(0)).fill_(max_indices.size(1))

In [None]:
for i in rangezx,wzxcvgbhhyfdszasxdcvbnjmk,l.;/''
for row in max_indices.eq(output_vocab('<eos>')).nonzero()

In [None]:
mask = torch.arange(0, max_indices.size(1)).repeat(max_indices.size(0), 1).to(max_indices.device)

In [None]:
mask

In [None]:
max_indices.eq(output_vocab('<eos>')).long() * mask

In [None]:
max_indices.index_select(1, max_indices.eq(output_vocab('<eos>')).nonzero())


In [None]:
max_indices.eq(output_vocab('<eos>')), dim=1

In [None]:
probas.shape

In [None]:
%%time 

# Greedy-decode the whole test split and collect hypothesis / reference strings.
model_pre.eval()
with torch.no_grad():

    hypotheses = []
    references = []
    for idx_batch, batch in enumerate(test_iter):
        inputs, input_sizes = getattr(batch, SRC_FIELD_NAME)
        labels, label_sizes = getattr(batch, TGT_FIELD_NAME)
        if next(model_pre.parameters()).is_cuda:
            inputs, labels = inputs.cuda(), labels.cuda()

        # decode for at most H.SEQ_MAX_LEN steps, stopping early once all sequences hit <eos>
        probas, proba_sizes = model_pre.decode_greedy( inputs, H.SEQ_MAX_LEN)
        
        preds_seq, label_seq = sts_decoder(probas, proba_sizes, labels, label_sizes)

        hypotheses.extend(preds_seq)
        references.extend(label_seq)
        

In [None]:
CPU times: user 1min 7s, sys: 77.7 ms, total: 1min 7s
Wall time: 1min 7s

In [None]:
hypotheses[0:15]

In [None]:
references[0:15]

In [None]:
# Print every hypothesis that differs from its reference (hypothesis first).
# NOTE(review): this prints one line per mismatch over the whole test split.
for h, r in zip(hypotheses, references):
    if h != r:
        print(h, r)

In [None]:
from lib.scorer import Scorer

# Score whatever is currently held in `hypotheses` / `references`.
bleu = Scorer.get_moses_multi_bleu(hypotheses, references, lowercase=False)
wer, cer = Scorer.get_wer_cer(hypotheses, references)
acc = Scorer.get_acc(hypotheses, references)

# WER, CER and ACC are scaled by 100 for display; BLEU is printed as returned.
print(
    ('Test Summary \n'
     'Bleu: {bleu:.3f}\n'
     'WER:  {wer:.3f}\n'
     'CER:  {cer:.3f}\n'
     'ACC:  {acc:.3f}').format(bleu=bleu, wer=100 * wer, cer=100 * cer, acc=100 * acc)
)


In [None]:
%%time

model_pre.eval()
with torch.no_grad():

    hypotheses = []
    references = []
    for idx_batch, batch in enumerate(test_iter):
        inputs, input_sizes = getattr(batch, SRC_FIELD_NAME)
        labels, label_sizes = getattr(batch, TGT_FIELD_NAME)
        if next(model.parameters()).is_cuda:
            inputs, labels = inputs.cuda(), labels.cuda()

        context, mask_src = model_pre.encode(inputs)
        
        max_seq_len = labels.size(1) if labels is not None else H.MAX_SEQ_LENGTH

        outputs = model_pre.decode_beam(inputs, None, max_seq_len, beam_size=20, alpha=0.1, beta=0.3)

        for entry in outputs:
            hypotheses.append(' '.join([output_vocab(t) for t in entry if t not in [IDX_PAD, IDX_SOS, IDX_EOS]]))

        references.extend(STSDecoder.decode_labels(labels[:,1:], label_sizes-1, output_vocab)) 


In [None]:
from lib.scorer import Scorer

# Recompute the metrics on the current contents of `hypotheses` / `references`.
bleu = Scorer.get_moses_multi_bleu(hypotheses, references, lowercase=False)
wer, cer = Scorer.get_wer_cer(hypotheses, references)
acc = Scorer.get_acc(hypotheses, references)

# Only WER, CER and ACC are turned into percentages before printing.
report = ('Test Summary \n'
          'Bleu: {bleu:.3f}\n'
          'WER:  {wer:.3f}\n'
          'CER:  {cer:.3f}\n'
          'ACC:  {acc:.3f}')
print(report.format(bleu=bleu, wer=100 * wer, cer=100 * cer, acc=100 * acc))


In [None]:
# Interactive loop: read a source sentence, tokenize it, greedy-decode it with
# model_pre and print the decoded tokens. An empty (or whitespace-only) line
# ends the session.
model_pre.eval()  # evaluation mode is set once, not on every iteration
while True:
    seq_str = input("Type in a source sequence:")
    print(">> ", seq_str)
    # Whitespace-only input would tokenize to nothing and produce an empty
    # (1, 0) tensor, so it is treated like an empty line.
    if not seq_str.strip():
        break
    seq = tokenize_en(seq_str.strip().lower())
    print(seq)

    seq_id = [input_vocab(tok) for tok in seq]

    with torch.no_grad():

        # Batch of one: shape (1, number of tokens).
        src_id_seq = torch.LongTensor(seq_id).view(1, -1)
        # Place the input on the model's device. Checking only
        # torch.cuda.is_available() breaks when CUDA exists but the model was
        # kept on the CPU.
        if next(model_pre.parameters()).is_cuda:
            src_id_seq = src_id_seq.cuda()

        # NOTE(review): other cells call decode_greedy(inputs, <max length>)
        # positionally; confirm that the `labels=None` keyword is accepted.
        probas, proba_sizes = model_pre.decode_greedy(src_id_seq, labels=None)

        tgt_seq = STSDecoder.decode_probas(probas, proba_sizes, output_vocab)

        print("<< ", ' '.join(tgt_seq))

print("Finished.")

In [None]:
# Build the Transformer used by the evaluation cells and restore its weights
# from the checkpoint stored under the experiment directory.
# NOTE(review): the architecture arguments are hard-coded here and must match
# the ones the checkpoint was trained with.
model_pre = Transformer(input_vocab, output_vocab, num_heads=8, d_model=512, dropout=0.1, d_ff=1024,
                        num_layers=6, padding_idx=IDX_PAD)

if H.USE_CUDA:
    model_pre.cuda()

path = os.path.join(H.EXPERIMENT, 'Eng2Ger_TRANSFORMER' + '.tar')
# Deserialize onto the CPU so a checkpoint written on a GPU machine also loads
# when CUDA is unavailable; load_state_dict then copies the values into the
# model's parameters on whatever device they live.
state = torch.load(path, map_location='cpu')
model_pre.load_state_dict(state)

scorer = Scorer()
