### Refs
* https://towardsdatascience.com/how-to-use-torchtext-for-neural-machine-translation-plus-hack-to-make-it-5x-faster-77f3884d95
* https://towardsdatascience.com/how-to-code-the-transformer-in-pytorch-24db27c8f9ec
* https://github.com/SamLynnEvans/Transformer

In [None]:
pip install spacy torchtext pandas

In [None]:
import torchtext
torchtext.__version__

###  Download the European Parliament Proceedings Parallel Corpus 1996–2011

In [None]:
!wget https://www.statmt.org/europarl/v7/fr-en.tgz

In [None]:
!tar zxvf fr-en.tgz

In [None]:
!spacy download en_core_web_sm  # https://spacy.io/models/en
!spacy download fr_core_news_sm

### Prepare datasets

In [None]:
# Read both sides of the parallel corpus, one sentence per line.
# `with` closes each file handle as soon as its content has been read.
with open('europarl-v7.fr-en.en', encoding='utf-8') as file_en:
    europarl_en = file_en.read().split('\n')
with open('europarl-v7.fr-en.fr', encoding='utf-8') as file_fr:
    europarl_fr = file_fr.read().split('\n')

In [None]:
import pandas as pd
# Pair every English line with the French line at the same position.
raw_data = {'English': list(europarl_en), 'French': list(europarl_fr)}
df = pd.DataFrame(raw_data, columns=["English", "French"])
# Sentence length is approximated by the number of spaces.
df['eng_len'] = df['English'].str.count(' ')
df['fr_len'] = df['French'].str.count(' ')
# Drop very long sentences ...
is_short = (df['fr_len'] < 80) & (df['eng_len'] < 80)
df = df[is_short]
# ... and pairs whose two lengths are not within a factor of 1.5 of each other.
is_balanced = (df['fr_len'] < df['eng_len'] * 1.5) & (df['fr_len'] * 1.5 > df['eng_len'])
df = df[is_balanced]
df

In [None]:
from sklearn.model_selection import train_test_split
# Create the train and validation sets (90% / 10%).
# A fixed random_state makes the split, and therefore train.csv / vali.csv,
# identical on every rerun of the notebook.
train, vali = train_test_split(df, test_size=0.1, random_state=42)
train.to_csv("train.csv", index=False)
vali.to_csv("vali.csv", index=False)

### Showcase the construction of a custom Dataset 

In [None]:
from torch.utils.data import IterableDataset, DataLoader
class MyIterableDataset(IterableDataset):
    """Toy dataset yielding (letter, index-as-string) pairs for 'a' .. 'j'."""

    def __iter__(self):
        letters = 'abcdefghij'
        positions = map(str, range(len(letters)))
        pairs = list(zip(letters, positions))
        return iter(pairs)

print(list(DataLoader(MyIterableDataset(), batch_size=4)))

### Tokenization

In [1]:
import pandas as pd

df_train = pd.read_csv("train.csv", usecols=['English','French'])# , nrows=200) 
df_vali  = pd.read_csv( "vali.csv", usecols=['English','French'])# , nrows=100) 
display(df_train.head(8)) # df_train['fr_len'].max()

import spacy # a language-aware tokenizer library

spacy_en = spacy.load('en_core_web_sm') # https://spacy.io/models/en      initialize an english tokenizer 
spacy_fr = spacy.load('fr_core_news_sm')

def tokenizer_en(sentence):
    """Return the text of each token that `spacy_en.tokenizer` produces for `sentence`."""
    tokens = []
    for tok in spacy_en.tokenizer(sentence):
        tokens.append(tok.text)
    return tokens


def tokenizer_fr(sentence):
    """Return the text of each token that `spacy_fr.tokenizer` produces for `sentence`."""
    tokens = []
    for tok in spacy_fr.tokenizer(sentence):
        tokens.append(tok.text)
    return tokens

from collections import Counter
from torchtext.vocab import vocab 
import torch 

def build_vocab():
    """Build the English and French vocabularies and save them to disk.

    Tokens are counted over both `df_train` and `df_vali`. The four special
    tokens come first, so '<UNK>' is index 0 and is used as the default index
    for out-of-vocabulary lookups. The vocabularies are written to
    'vocab_en.pth' and 'vocab_fr.pth'.
    """
    # Count tokens sentence by sentence instead of first collecting every token
    # of the corpus in one list. A Counter keeps first-occurrence order, so the
    # resulting token indices are the same as with the list-based approach.
    counter_en = Counter()
    counter_fr = Counter()
    for frame in (df_train, df_vali):
        for sentence_en, sentence_fr in frame.itertuples(index=False, name=None):
            counter_en.update(tokenizer_en(sentence_en))
            counter_fr.update(tokenizer_fr(sentence_fr))

    specials = ('<UNK>', '<BOS>', '<EOS>', '<PAD>')
    vocab_en = vocab(counter_en, specials=specials)
    vocab_fr = vocab(counter_fr, specials=specials)
    vocab_en.set_default_index(vocab_en['<UNK>'])
    vocab_fr.set_default_index(vocab_fr['<UNK>'])
    torch.save(vocab_en, 'vocab_en.pth')
    torch.save(vocab_fr, 'vocab_fr.pth')

    # Reload one file as a round-trip check; print sizes only, because dumping
    # the complete token -> index mapping floods the cell output.
    vocab_obj = torch.load('vocab_en.pth')
    print(f"vocab_en: {len(vocab_obj)} tokens, vocab_fr: {len(vocab_fr)} tokens")
    
#build_vocab()    

import torch 
torch.set_printoptions(threshold=100_000)
vocab_en = torch.load('vocab_en.pth')
vocab_fr = torch.load('vocab_fr.pth') 

def text_transform_en(x):
    """Encode an English sentence as `vocab_en` ids framed by <BOS> and <EOS>."""
    ids = [vocab_en['<BOS>']]
    ids.extend(vocab_en[token] for token in tokenizer_en(x))
    ids.append(vocab_en['<EOS>'])
    return ids


def text_transform_fr(x):
    """Encode a French sentence as `vocab_fr` ids framed by <BOS> and <EOS>."""
    ids = [vocab_fr['<BOS>']]
    ids.extend(vocab_fr[token] for token in tokenizer_fr(x))
    ids.append(vocab_fr['<EOS>'])
    return ids

# Encode one English sentence and decode the ids back to tokens.
# The "input" line prints exactly the sentence that is transformed.
sample_en = "here is an example aaa bbb."
sample_en_ids = text_transform_en(sample_en)
print("input to the text_transform_en:      ", sample_en)
print("output of the text_transform_en:     ", sample_en_ids)
print("original text:                       ", ' '.join([vocab_en.lookup_token(word_index) for word_index in sample_en_ids]))
# Same for a French sentence: input first, then the output of text_transform_fr.
sample_fr = 'Merci beaucoup, Monsieur de Silguy ccc ddd.'
print("input to the text_transform_fr:      ", sample_fr)
print("output of the text_transform_fr:     ", text_transform_fr(sample_fr))

Unnamed: 0,English,French
0,"That is not something we can prevent, and we s...",Nous ne pouvons l'empêcher et nous tenons à re...
1,"Thank you, Mr de Silguy.","Merci beaucoup, Monsieur de Silguy."
2,A mere trifle like leaving out the digits '19'...,Le seul fait d'avoir omis les chiffres «19» du...
3,"As parliamentarians, we should be reimbursed t...",Les députés doivent toucher des indemnités qui...
4,Since the EU was enlarged just over a year ago...,L’importance de notre puissant voisin de l’Est...
5,Today I would like to say here that if the Com...,"Aujourd'hui, j'aimerais indiquer que si la Com..."
6,"IDABC supports, initiates and manages the deli...","Le programme IDABC soutient, lance et gère la ..."
7,What I am describing here is the situation now...,Je vous décris là la situation actuelle et non...


input to the text_transform_en:       here is an example
output of the text_transform_en:      [1, 83, 5, 255, 378, 0, 0, 30, 2]
original text:                        <BOS> here is an example <UNK> <UNK> . <EOS>
output of the text_transform_fr:      [1, 28, 29, 30, 31, 32, 33, 0, 0, 27, 2]
input to the text_transform_fr:       Merci beaucoup, Monsieur de Silguy.


In [2]:
import torch 
from torch.utils.data import IterableDataset, DataLoader

class train(IterableDataset):
    """Iterates over the rows of `df_train`, one list of column values per row."""

    def __iter__(self):
        rows = df_train.to_dict('split')['data']
        return iter(rows)


class vali(IterableDataset):
    """Iterates over the rows of `df_vali`, one list of column values per row."""

    def __iter__(self):
        rows = df_vali.to_dict('split')['data']
        return iter(rows)

from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch):
    """Turn a list of (English, French) sentence pairs into two padded id tensors.

    Each sentence is encoded with text_transform_en / text_transform_fr and the
    sequences are padded to the longest one in the batch. `pad_sequence` is
    used with its default layout, so each returned tensor has shape
    (longest_sequence, batch_size).

    Returns:
        (coll_en, coll_fr): padded English and French id tensors.
    """
    coll_en, coll_fr = [], []
    for sentence_en, sentence_fr in batch:
        coll_en.append(torch.tensor(text_transform_en(sentence_en)))
        coll_fr.append(torch.tensor(text_transform_fr(sentence_fr)))
    # Look the padding id up in each vocabulary instead of hard-coding 3, so the
    # padding stays correct if the order of the special tokens ever changes.
    coll_en = pad_sequence(coll_en, padding_value=vocab_en['<PAD>'])
    coll_fr = pad_sequence(coll_fr, padding_value=vocab_fr['<PAD>'])
    return coll_en, coll_fr
 
train_dl = DataLoader(train(), batch_size=8, collate_fn=collate_batch) 
vali_dl  = DataLoader( vali(), batch_size=8, collate_fn=collate_batch) 

tensor_en, _ =next(iter(train_dl))
print(tensor_en.shape)
tensor_en

torch.Size([36, 8])


tensor([[  1,   1,   1,   1,   1,   1,   1,   1],
        [  4,  31,  36,  55,  64,  79, 102, 122],
        [  5,  32,  37,  56,  42,  80, 103,  80],
        [  6,  11,  38,  11,  65,  81,  11, 123],
        [  7,  33,  39,   8,  66,  39, 104, 124],
        [  8,  34,  40,  57,  67,  26,  12,  83],
        [  9,  35,  41,  58,  17,  82, 105,   5],
        [ 10,  30,  42,  59,  68,  83,  42,  42],
        [ 11,   2,  43,  42,  50,  15, 106,  54],
        [ 12,   3,  44,  60,  47,  84,  96, 125],
        [  8,   3,  45,  61,  69,  42, 107,  11],
        [ 13,   3,  44,  62,  11,  85,  52,   6],
        [ 14,   3,  46,  63,  27,  12, 108,  42],
        [ 15,   3,  42,  30,  70,  86, 109,  54],
        [ 16,   3,  47,   2,  71,  87, 110, 126],
        [ 11,   3,  48,   3,  22,  88, 111,  42],
        [ 17,   3,  49,   3,  42,  89,  50, 127],
        [ 18,   3,  26,   3,  72,  90, 112, 128],
        [  8,   3,  50,   3,  11,  91,  26,  30],
        [ 19,   3,  51,   3,  73,  76, 113,   2],


<img title="a title" alt="Alt text" src="https://miro.medium.com/max/380/1*2vyKzFlzIHfSmOU_lnQE4A.png">

### Embedding

In [3]:
from torch import nn

embedding_dim = 512 # length of embedding vector
vocab_size_src = len(vocab_en)
vocab_size_trg = len(vocab_fr)

class Embedder(nn.Module):
    """Lookup table that maps token ids to dense vectors.

    vocab_size_src is the number of rows in the table (one per token id) and
    embedding_dim is the length of each vector.
    """

    def __init__(self, vocab_size_src, embedding_dim):
        super().__init__()
        self.embed = nn.Embedding(vocab_size_src, embedding_dim)

    def forward(self, x):
        token_vectors = self.embed(x)
        return token_vectors
    
embedder = Embedder(vocab_size_src, embedding_dim)

In [4]:
tensor_after_embedding = embedder(tensor_en)
print(tensor_en.shape, next(embedder.parameters()).shape, tensor_after_embedding.shape)

# check that the embedding returned for one token is exactly the row of the embedding matrix indexed by that token's id
torch.equal(next(embedder.parameters())[tensor_en[23,4]], tensor_after_embedding[23,4]) 

torch.Size([36, 8]) torch.Size([109946, 512]) torch.Size([36, 8, 512])


True

### Positional encoding

In [5]:
# unsqueeze: add a new dim to the tensor. 
# For example, torch.unsqueeze(x, 0) can change the shape of a tensor from [3,4] to [1,3,4].
# torch.unsqueeze(x, 1) can change the shape of a tensor from [2,3] to [2,1,3].
# torch.unsqueeze(x, 2) can change the shape of a tensor from [2,3] to [2,3,1].
x = torch.tensor([[ 1,  2,  3,  4],
                  [11, 12, 13, 14],
                  [21, 22, 23, 24]])
x.shape,  torch.unsqueeze(x, 0).shape,  torch.unsqueeze(x, 1).shape,  torch.unsqueeze(x, 2).shape

(torch.Size([3, 4]),
 torch.Size([1, 3, 4]),
 torch.Size([3, 1, 4]),
 torch.Size([3, 4, 1]))

In [58]:
# broadcasting: a1 (2x1x3) and b1 (1x3x3) are expanded to a2 and b2 (both 2x3x3) before adding, so a1 + b1 equals a2 + b2
a1 = torch.tensor(
    [[[1, 2, 3]],
     [[3, 4, 5]]]) 
a2 = torch.tensor(
    [[[1, 2, 3],
      [1, 2, 3],
      [1, 2, 3]],
     [[3, 4, 5],
      [3, 4, 5],
      [3, 4, 5]], 
    ])  
b1 = torch.tensor(
    [[[1, 2, 3],
      [4, 5, 6],
      [7, 8, 9]]]) 
b2 = torch.tensor(
    [[[1, 2, 3],
      [4, 5, 6],
      [7, 8, 9]],
     [[1, 2, 3],
      [4, 5, 6],
      [7, 8, 9]]])

print(torch.equal(a1 + b1, a2 + b2))
a1 + b1

True


tensor([[[ 2,  4,  6],
         [ 5,  7,  9],
         [ 8, 10, 12]],

        [[ 4,  6,  8],
         [ 7,  9, 11],
         [10, 12, 14]]])

In [7]:
import math
class PositionalEncoder(nn.Module):
    """Adds fixed sinusoidal position information to sequence-first embeddings.

    The table `pe` has shape (max_seq_len, 1, embedding_dim): one row per
    position, and a middle dimension of size 1 so the same encoding is
    broadcast to every sentence of the batch. `forward` slices it by
    `x.size(0)`, so inputs must have the position in dimension 0, i.e.
    (seq_len, batch, embedding_dim).

    Args:
        embedding_dim: length of each embedding vector.
        max_seq_len: number of positions encoded; longer inputs are rejected.
    """

    def __init__(self, embedding_dim, max_seq_len = 80):
        super().__init__()
        self.embedding_dim = embedding_dim

        # PE(pos, 2k)   = sin(pos / 10000^(2k / embedding_dim))
        # PE(pos, 2k+1) = cos(pos / 10000^(2k / embedding_dim))
        # Each sin/cos pair shares one frequency. The column index of the even
        # column is already 2k, so it must not be multiplied by 2 again.
        position = torch.arange(max_seq_len, dtype=torch.float32).unsqueeze(1)
        even_columns = torch.arange(0, embedding_dim, 2, dtype=torch.float32)
        div_term = torch.exp(even_columns * (-math.log(10000.0) / embedding_dim))
        angles = position * div_term

        pe = torch.zeros(max_seq_len, embedding_dim)
        pe[:, 0::2] = torch.sin(angles)
        # For an odd embedding_dim there is one cos column fewer than sin columns.
        pe[:, 1::2] = torch.cos(angles[:, : embedding_dim // 2])

        # A buffer follows the module on .to(device) / .cuda() and never
        # receives gradients. persistent=False keeps it out of state_dict.
        self.register_buffer('pe', pe.unsqueeze(1), persistent=False)

    def forward(self, x):
        seq_len = x.size(0)
        if seq_len > self.pe.size(0):
            raise RuntimeError(
                f"sequence length {seq_len} exceeds max_seq_len {self.pe.size(0)} "
                "of the positional encoding"
            )

        # make embeddings relatively larger
        x = x * math.sqrt(self.embedding_dim)

        # add the constant encoding of the first seq_len positions
        return x + self.pe[:seq_len]

posEncoder = PositionalEncoder(embedding_dim)

In [8]:
tensor_after_posEncoding = posEncoder(tensor_after_embedding) 
tensor_after_embedding.shape, posEncoder.pe.shape, tensor_after_posEncoding.shape

(torch.Size([36, 8, 512]), torch.Size([80, 1, 512]), torch.Size([36, 8, 512]))

### Masking

In [33]:
tensor_input, tensor_output = next(iter(train_dl)) 
print(tensor_input.shape, tensor_output.shape)

torch.Size([36, 8]) torch.Size([38, 8])


In [34]:
tensor_input = tensor_input.transpose(0,1)
print(tensor_input[0])

tensor([ 1,  4,  5,  6,  7,  8,  9, 10, 11, 12,  8, 13, 14, 15, 16, 11, 17, 18,
         8, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,  2,  3,  3,  3,  3])


In [35]:
# creates mask with 0s wherever there is padding in the input
mask_input = (tensor_input != vocab_en['<PAD>']).unsqueeze(1)
print(mask_input.shape)
print(mask_input[0])

torch.Size([8, 1, 36])
tensor([[ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True, False, False, False, False]])


In [36]:
# create mask as before
tensor_output = tensor_output.transpose(0,1) 
print(tensor_output[0])

size = tensor_output.size(1) # get seq_len for matrix
print(size)

mask_output = (tensor_output != vocab_fr['<PAD>']).unsqueeze(1)
print(mask_output.shape)
print(mask_output[0])

tensor([ 1,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 12, 15, 16, 17, 18, 10,
         7, 19, 20, 21, 22, 23, 24, 25, 26, 27,  2,  3,  3,  3,  3,  3,  3,  3,
         3,  3])
38
torch.Size([8, 1, 38])
tensor([[ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True, False,
         False, False, False, False, False, False, False, False]])


In [37]:
nopeak_mask = torch.triu(torch.ones(1, size, size, dtype=int), diagonal=1)
nopeak_mask = (nopeak_mask == 0)
print(nopeak_mask.shape)
print(nopeak_mask) 

torch.Size([1, 38, 38])
tensor([[[ True, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False],
         [ True,  True, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False],
         [ True,  True,  True, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False],
         [ True,  True,  True,  True, False, False, Fal

In [38]:
mask_output  = mask_output & nopeak_mask
print(mask_output.shape) 
print(mask_output) 

torch.Size([8, 38, 38])
tensor([[[ True, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False],
         [ True,  True, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False],
         [ True,  True,  True, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False],
         [ True,  True,  True,  True, False, False, Fal

### MultiHeadAttention

In [None]:
# A bare URL is not valid Python, so render the image linked for this section instead.
from IPython.display import Image
Image(url="https://miro.medium.com/max/224/1*15E9qKg9bKnWdSRWCyY2iA.png")

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    `forward` reads the batch size from `q.size(0)`, so q, k and v are
    batch-first: (batch, seq_len, embedding_dim).

    Args:
        h: number of attention heads; must divide embedding_dim.
        embedding_dim: size of the input and output vectors.
        dropout: dropout probability applied to the attention weights.

    Raises:
        ValueError: if embedding_dim is not divisible by h.
    """

    def __init__(self, h, embedding_dim, dropout = 0.1):
        super().__init__()

        # Without this check the .view() in forward either fails or silently
        # mixes values across positions.
        if embedding_dim % h != 0:
            raise ValueError(
                f"embedding_dim ({embedding_dim}) must be divisible by the number of heads ({h})"
            )

        self.embedding_dim = embedding_dim
        self.d_k           = embedding_dim // h
        self.h             = h

        self.q_linear = nn.Linear(embedding_dim, embedding_dim)
        self.v_linear = nn.Linear(embedding_dim, embedding_dim)
        self.k_linear = nn.Linear(embedding_dim, embedding_dim)
        self.dropout  = nn.Dropout(dropout)
        self.out      = nn.Linear(embedding_dim, embedding_dim)

    def attention(self, q, k, v, d_k, mask=None, dropout=None):
        """Return softmax(q @ k^T / sqrt(d_k)) @ v.

        Positions where `mask` is 0/False are filled with -1e9 before the
        softmax. `mask` gets an extra dimension at index 1 so that it
        broadcasts over the heads.
        """
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)

        if mask is not None:
            mask   = mask.unsqueeze(1)
            scores = scores.masked_fill(mask == 0, -1e9)

        scores = torch.softmax(scores, dim=-1)

        if dropout is not None:
            scores = dropout(scores)

        return torch.matmul(scores, v)

    def forward(self, q, k, v, mask=None):
        bs = q.size(0)

        # linear projection, split into h heads, then move the head dimension
        # forward: bs * h * seq_len * d_k
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k).transpose(1, 2)

        # `attention` is a method, so it has to be called through self
        scores = self.attention(q, k, v, self.d_k, mask, self.dropout)

        # concatenate the heads back to embedding_dim and apply the final linear layer
        concat = scores.transpose(1, 2).contiguous().view(bs, -1, self.embedding_dim)
        return self.out(concat)

In [None]:
# build an encoder layer with one multi-head attention layer and one feed-forward layer
class EncoderLayer(nn.Module):
    """Encoder block: self-attention, then feed-forward.

    Each sub-layer is applied to a normalised copy of its input and the
    result, after dropout, is added back to the input.

    Args:
        d_model: size of the vectors flowing through the layer.
        heads: number of attention heads.
        dropout: dropout probability, used for the residual branches and
            forwarded to the attention and feed-forward sub-layers.
    """

    def __init__(self, d_model, heads, dropout = 0.1):
        super().__init__()
        self.norm_1    = Norm(d_model)
        self.norm_2    = Norm(d_model)
        # Forward `dropout` so a non-default value is not silently ignored by
        # the sub-layers.
        self.attn      = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff        = FeedForward(d_model, dropout=dropout)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn(x2, x2, x2, mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.ff(x2))
        return x

### Encoder

In [47]:
a = torch.tensor(
    [[[1, 2, 3]],
     [[3, 4, 5]]])

In [None]:
class Encoder(nn.Module):
    """Embeds the source ids, adds positional encoding and applies N encoder layers.

    Args:
        vocab_size_src: number of rows of the source embedding table.
        embedding_dim: size of the vectors flowing through the encoder.
        N: number of stacked encoder layers.
        h: number of attention heads per layer.
    """

    def __init__(self, vocab_size_src, embedding_dim, N, h):
        super().__init__()
        self.N = N
        self.embedder   = Embedder(vocab_size_src, embedding_dim)
        self.posEncoder = PositionalEncoder(embedding_dim)
        self.layers     = get_clones(EncoderLayer(embedding_dim, h), N)
        self.norm = Norm(embedding_dim)

    def forward(self, src, mask):
        # the attribute is named `embedder`; `self.embed` does not exist
        x = self.embedder(src)
        # PositionalEncoder indexes its table with x.size(0) (position first),
        # while the attention layers read the batch size from dimension 0.
        # Swap to position-first for the encoding only, then swap back.
        x = self.posEncoder(x.transpose(0, 1)).transpose(0, 1)
        # iterate over this module's own layers instead of relying on a global N
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

### Feed-Forward Network, Layer Normalisation, Decoder and Transformer

In [None]:
class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: size of the input and output vectors.
        d_ff: size of the hidden layer (2048 by default).
        dropout: dropout probability applied after the ReLU.
    """

    def __init__(self, d_model, d_ff=2048, dropout = 0.1):
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        # `F` is never imported in this notebook, so reach relu through `nn`
        x = self.dropout(nn.functional.relu(self.linear_1(x)))
        x = self.linear_2(x)
        return x

In [None]:
class Norm(nn.Module):
    """Normalises the last dimension, then applies a learnable scale and shift."""

    def __init__(self, d_model, eps = 1e-6):
        super().__init__()
        self.size = d_model
        self.eps = eps
        # two learnable parameters that calibrate the normalisation
        self.alpha = nn.Parameter(torch.ones(d_model))
        self.bias = nn.Parameter(torch.zeros(d_model))

    def forward(self, x):
        centered = x - x.mean(dim=-1, keepdim=True)
        spread = x.std(dim=-1, keepdim=True) + self.eps
        return self.alpha * centered / spread + self.bias

In [None]:

# build a decoder layer with two multi-head attention layers and
# one feed-forward layer
class DecoderLayer(nn.Module):
    """Decoder block: masked self-attention, attention over the encoder output, feed-forward.

    Each sub-layer is applied to a normalised copy of its input and the
    result, after dropout, is added back to the input.

    Args:
        d_model: size of the vectors flowing through the layer.
        heads: number of attention heads.
        dropout: dropout probability, used for the residual branches and
            forwarded to the attention and feed-forward sub-layers.
    """

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)

        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)

        self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
        # No hard-coded .cuda(): it fails on machines without a GPU and would
        # put this sub-layer on a different device from the rest of the layer.
        # Move the whole model with model.to(device) instead.
        self.ff     = FeedForward(d_model, dropout=dropout)

    def forward(self, x, e_outputs, src_mask, trg_mask):
        x2 = self.norm_1(x)
        x  = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask))
        x2 = self.norm_2(x)
        x  = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, src_mask))
        x2 = self.norm_3(x)
        x  = x + self.dropout_3(self.ff(x2))
        return x
# We can then build a convenient cloning function that can generate multiple layers:
def get_clones(module, N):
    """Return an nn.ModuleList holding N independent deep copies of `module`."""
    # `copy` is not imported anywhere else in the notebook
    import copy
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

In [None]:

class Decoder(nn.Module):
    """Embeds the target ids, adds positional encoding and applies N decoder layers.

    Args:
        vocab_size: number of rows of the target embedding table.
        d_model: size of the vectors flowing through the decoder.
        N: number of stacked decoder layers.
        h: number of attention heads per layer.
    """

    def __init__(self, vocab_size, d_model, N, h):
        super().__init__()
        self.N      = N
        self.embed  = Embedder(vocab_size, d_model)
        self.pe     = PositionalEncoder(d_model)
        self.layers = get_clones(DecoderLayer(d_model, h), N)
        self.norm   = Norm(d_model)

    def forward(self, trg, e_outputs, src_mask, trg_mask):
        x = self.embed(trg)
        # PositionalEncoder indexes its table with x.size(0) (position first),
        # while the attention layers read the batch size from dimension 0.
        # Swap to position-first for the encoding only, then swap back.
        x = self.pe(x.transpose(0, 1)).transpose(0, 1)
        for layer in self.layers:
            x = layer(x, e_outputs, src_mask, trg_mask)
        return self.norm(x)

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder model followed by a linear projection of size trg_vocab.

    The projection output is returned as-is; no softmax is applied here.
    """

    def __init__(self, src_vocab, trg_vocab, d_model, N, h):
        super().__init__()
        self.encoder = Encoder(src_vocab, d_model, N, h)
        self.decoder = Decoder(trg_vocab, d_model, N, h)
        self.out = nn.Linear(d_model, trg_vocab)

    def forward(self, src, trg, src_mask, trg_mask):
        encoded = self.encoder(src, src_mask)
        decoded = self.decoder(trg, encoded, src_mask, trg_mask)
        return self.out(decoded)
    
# we don't perform softmax on the output as this will be handled 
# automatically by our loss function

In [None]:

h = 8  # number of attention heads
N = 6  # number of stacked encoder / decoder layers
# `d_model` is not defined in this notebook; the model width is `embedding_dim`.
model = Transformer(vocab_size_src, vocab_size_trg, embedding_dim, N, h)
# Xavier-uniform initialisation of every parameter with more than one dimension.
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

In [None]:
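# NOTE: the Xavier-uniform initialisation applied to the model's parameters when
# the model was built is very important! It initialises the parameters with a
# range of values that stops the signal fading or getting too big.
# See this blog for a mathematical explanation. (TODO: add the missing link.)
# The line below only creates the Adam optimiser.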
optim = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

def _create_masks(src, trg_input):
    """Build the boolean attention masks for one batch-first batch.

    Returns:
        src_mask: (batch, 1, src_len), False where the source is padding.
        trg_mask: (batch, trg_len, trg_len), False where the target is padding
            or where a position would look at a later position.
    """
    src_mask = (src != vocab_en['<PAD>']).unsqueeze(1)

    trg_len = trg_input.size(1)
    nopeak_mask = torch.triu(
        torch.ones(1, trg_len, trg_len, device=trg_input.device), diagonal=1
    ) == 0
    trg_mask = (trg_input != vocab_fr['<PAD>']).unsqueeze(1) & nopeak_mask
    return src_mask, trg_mask


def train_model(epochs=0, print_every=100):
    """Train the global `model` on `train_dl` with the global optimiser `optim`.

    Args:
        epochs: number of passes over `train_dl` (the default 0 does nothing).
        print_every: print the average loss after every `print_every` batches.
    """
    # `time` and `datetime` are not imported elsewhere in the notebook
    import datetime
    import time

    model.train()
    target_pad = vocab_fr['<PAD>']

    ts = time.time()
    total_loss = 0
    for epoch in range(epochs):
        for i, (batch_en, batch_fr) in enumerate(train_dl):
            # the data loader yields (seq_len, batch); the model is batch-first
            src = batch_en.transpose(0, 1)
            trg = batch_fr.transpose(0, 1)
            # the French sentence we input has all words except
            # the last, as it is using each word to predict the next
            trg_input = trg[:, :-1]
            # the words we are trying to predict
            targets = trg[:, 1:].contiguous().view(-1)

            src_mask, trg_mask = _create_masks(src, trg_input)

            preds = model(src, trg_input, src_mask, trg_mask)
            # padding positions do not contribute to the loss
            loss = nn.functional.cross_entropy(
                preds.view(-1, preds.size(-1)), targets, ignore_index=target_pad
            )

            optim.zero_grad()
            loss.backward()
            optim.step()

            # loss is a 0-dim tensor: .item() replaces the old `.data[0]`
            total_loss += loss.item()
            # report after every full window of print_every batches, so the
            # average is always taken over exactly print_every losses
            if (i + 1) % print_every == 0:
                loss_avg = total_loss / print_every
                ts_now = datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
                print(f"[{ts_now}][{(time.time()-ts)/60:.1f}m] epoch_{epoch + 1} iter_{i + 1} loss_{loss_avg} ")
                total_loss = 0
                
train_model(epochs=3)

In [None]:
def translate(model, src, max_len = 80, custom_string=False):
    """Greedily decode a French translation with `model`.

    Args:
        model: module exposing `encoder`, `decoder` and `out`.
        src: raw English text when `custom_string` is True; otherwise a tensor
            of vocab_en ids (assumed batch-first with shape (1, src_len) —
            TODO confirm against callers).
        max_len: size of the output buffer, including the leading <BOS>.
        custom_string: whether `src` is text that still has to be encoded.

    Returns:
        The generated tokens joined by spaces. The string starts with the
        <BOS> marker and stops before <EOS>.
    """
    model.eval()
    # run on whatever device the model lives on instead of forcing .cuda()
    device = next(model.parameters()).device

    if custom_string:
        src = torch.tensor([text_transform_en(src)], dtype=torch.long)
    src = src.to(device)
    src_mask = (src != vocab_en['<PAD>']).unsqueeze(-2)

    eos = vocab_fr['<EOS>']
    outputs = torch.zeros(max_len, dtype=src.dtype, device=device)
    outputs[0] = vocab_fr['<BOS>']
    length = 1  # number of valid entries in `outputs`

    with torch.no_grad():
        # the encoder runs for both kinds of input, not only for custom strings
        e_outputs = model.encoder(src, src_mask)

        for i in range(1, max_len):
            # position j may only attend to positions <= j
            trg_mask = torch.triu(torch.ones(1, i, i, device=device), diagonal=1) == 0

            out = model.out(model.decoder(outputs[:i].unsqueeze(0), e_outputs, src_mask, trg_mask))
            # softmax is monotonic, so the argmax of the raw scores is the
            # most probable next token
            next_token = out[0, -1].argmax().item()
            if next_token == eos:
                break

            outputs[i] = next_token
            length = i + 1

    # slicing by `length` keeps the last generated token even when no <EOS>
    # was produced within max_len steps
    return ' '.join(vocab_fr.lookup_token(ix) for ix in outputs[:length].tolist())