# Transformer by PyTorch (Attention Is All You Need)

![Transformer](fig/transformer.png)

In [1]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

SEED = 515
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

## Preparing Data

In [2]:
import spacy
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

def tokenize_de(text):
    """
    Tokenize German text. 
    """
    return [tok.text for tok in spacy_de.tokenizer(text)]

def tokenize_en(text):
    """
    Tokenize English text.
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [3]:
from torchtext.data import Field, BucketIterator

# Set `batch_first=False` in the `Field`.
SRC = Field(tokenize=tokenize_de, init_token='<sos>', eos_token='<eos>', 
            lower=True, include_lengths=True, batch_first=False)
TRG = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', 
            lower=True, include_lengths=True, batch_first=False)

In [4]:
from torchtext.datasets import Multi30k

train_data, valid_data, test_data = Multi30k.splits(exts=['.de', '.en'], 
                                                    # fields=[SRC, TRG], 
                                                    fields=[('src', SRC), ('trg', TRG)], 
                                                    root="../assets/data")

In [5]:
print(train_data[0].src)
print(train_data[0].trg)

['zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.']
['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']


In [6]:
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

len(SRC.vocab), len(TRG.vocab)

(7854, 5893)

In [7]:
BATCH_SIZE = 128

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size=BATCH_SIZE, device=device)

In [8]:
for batch in train_iterator:
    batch_src, batch_src_lens = batch.src
    batch_trg, batch_trg_lens = batch.trg
    break
print(batch_src)
print(batch_src_lens)
print(batch_trg)
print(batch_trg_lens)

tensor([[  2,   2,   2,  ...,   2,   2,   2],
        [  5,   5,  43,  ...,   5,  18,  18],
        [ 13,  13, 253,  ...,  13,  30,   0],
        ...,
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1]], device='cuda:0')
tensor([14, 17, 12, 11, 17, 21, 12, 16, 14, 11, 23, 23,  8, 11,  9, 14, 19, 20,
        12, 16,  9, 11, 13, 20, 21, 29, 13, 22, 14, 16, 10,  9, 15, 12, 17, 10,
        14, 22, 17, 20, 23, 23, 12, 17, 15, 19, 17, 15, 16,  7, 14, 15, 16, 12,
        17, 14, 18, 18, 14, 14, 17, 21, 12, 12,  9, 19, 12, 14, 12, 11, 10, 13,
        18, 14,  9, 11, 10, 12, 10, 25, 14, 18, 15, 16, 15, 18, 13,  9, 21, 11,
        20, 12, 13, 14, 14, 17, 10, 13, 18, 30, 14, 12, 13,  9, 10, 15, 13, 10,
        12, 15, 13, 18, 17, 13, 11, 12, 10, 16, 12, 13, 24, 14, 19, 19, 10, 20,
        12, 11], device='cuda:0')
tensor([[   2,    2,    2,  ...,    2,    2,    2],
        [   4,    4,   48,  ...,    4,   16,   

## Building the Model
### Multi-Head Attention

`nn.MultiheadAttention.forward`
* `key_padding_mask` is an binary mask - when the value is `True`, the corresponding value on the (`trg_step` * `src_step`) energy matrix will be filled with `-inf` before passing to `softmax`.  
* `attn_mask` is an additive mask (i.e. the values will be added to the energy matrix before `softmax`). Hence, the value being `-inf` means "masked", and the value being `0` means "not-masked". 
    * This mask aims to prevent attention to certain positions.  
    * A 2D mask will be broadcasted for all the batches while a 3D mask allows to specify a different mask for the entries of each batch. 

In [9]:
SRC_IN_DIM = len(SRC.vocab)
TRG_IN_DIM = len(TRG.vocab)
HID_DIM = 256
ENC_LAYERS = 3
DEC_LAYERS = 3
ENC_HEADS = 8
DEC_HEADS = 8
ENC_PF_DIM = 512
DEC_PF_DIM = 512
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1
ENC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
DEC_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]


attention = nn.MultiheadAttention(embed_dim=HID_DIM, num_heads=ENC_HEADS, 
                                  dropout=ENC_DROPOUT).to(device)
src_emb = nn.Embedding(SRC_IN_DIM, HID_DIM, padding_idx=ENC_PAD_IDX).to(device)
trg_emb = nn.Embedding(TRG_IN_DIM, HID_DIM, padding_idx=DEC_PAD_IDX).to(device)
# The dropout may cause the sum of attention not equaling 1. 
attention.eval()
src_emb.eval()
trg_emb.eval()

# mask: (batch, src_step)
mask = (batch_src == src_emb.padding_idx).T
# K: (src_step, batch, hid_dim)
K = src_emb(batch_src)
# Q: (trg_step, batch, hid_dim)
Q = trg_emb(batch_trg)

# attened_values: (trg_step, batch, hid_dim)
# attens: (batch, trg_step, src_step)
# attens is the average attention weights over heads
attened_values, attens = attention(Q, K, K, key_padding_mask=mask)

print(batch_src.size())
print(batch_trg.size())
print(attens.size())
print(attened_values.size())

torch.Size([30, 128])
torch.Size([29, 128])
torch.Size([128, 29, 30])
torch.Size([29, 128, 256])


In [10]:
print((attens.sum(dim=-1) - 1).abs().max())
print(((attens == 0) == mask.unsqueeze(1)).all())

tensor(1.1921e-07, device='cuda:0', grad_fn=<MaxBackward1>)
tensor(True, device='cuda:0')


### Encoder

In [11]:
encoder_layer = nn.TransformerEncoderLayer(d_model=HID_DIM, nhead=ENC_HEADS, dim_feedforward=ENC_PF_DIM, dropout=ENC_DROPOUT).to(device)

# mask: (batch, src_step)
mask = (batch_src == src_emb.padding_idx).T
# outs: (src_step, batch, hid_dim)
outs = encoder_layer(K, src_key_padding_mask=mask)

print(batch_src.size())
print(outs.size())

torch.Size([30, 128])
torch.Size([30, 128, 256])


In [12]:
encoder = nn.TransformerEncoder(encoder_layer, num_layers=ENC_LAYERS).to(device)

enc_outs = encoder(K, src_key_padding_mask=mask)

print(batch_src.size())
print(enc_outs.size())

torch.Size([30, 128])
torch.Size([30, 128, 256])


### Decoder

In [13]:
# Use `torch.triu` to create the masking matrix
# This is an additive masking matrix
(torch.ones(5, 5) * -np.inf).triu(1)

tensor([[0., -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0.]])

In [14]:
decoder_layer = nn.TransformerDecoderLayer(d_model=HID_DIM, nhead=DEC_HEADS, dim_feedforward=DEC_PF_DIM, dropout=DEC_DROPOUT).to(device)
decoder = nn.TransformerDecoder(decoder_layer, num_layers=DEC_LAYERS).to(device)

# src_mask: (batch, src_step)
src_mask = (batch_src == src_emb.padding_idx).T
# trg_mask: (trg_step, trg_step)
trg_mask = (torch.ones(batch_trg.size(0), batch_trg.size(0), device=device) * -np.inf).triu(1)

dec_outs = decoder(Q, enc_outs, tgt_mask=trg_mask, memory_key_padding_mask=src_mask)

print(batch_src.size())
print(batch_trg.size())
print(dec_outs.size())

torch.Size([30, 128])
torch.Size([29, 128])
torch.Size([29, 128, 256])


### Transformer

In [15]:
transformer = nn.Transformer(d_model=HID_DIM, custom_encoder=encoder, custom_decoder=decoder).to(device)

trans_outs = transformer(K, Q, src_key_padding_mask=src_mask, memory_key_padding_mask=src_mask, tgt_mask=trg_mask)

print(batch_src.size())
print(batch_trg.size())
print(trans_outs.size())

torch.Size([30, 128])
torch.Size([29, 128])
torch.Size([29, 128, 256])


In [16]:
transformer.eval()
print(encoder.training, decoder.training)

enc_outs = encoder(K, src_key_padding_mask=src_mask)
dec_outs = decoder(Q, enc_outs, tgt_mask=trg_mask, memory_key_padding_mask=src_mask)

trans_outs = transformer(K, Q, src_key_padding_mask=src_mask, memory_key_padding_mask=src_mask, tgt_mask=trg_mask)

print((dec_outs == trans_outs).all())

False False
tensor(True, device='cuda:0')


### Seq2Seq

In [17]:
class Seq2Seq(nn.Module):
    def __init__(self, transformer: nn.Transformer, src_voc_dim: int, trg_voc_dim: int, 
                 src_pad_idx: int, trg_pad_idx: int, dropout: float, max_len: int=100):
        super().__init__()
        self.transformer = transformer
        hid_dim = transformer.d_model

        self.src_tok_emb = nn.Embedding(src_voc_dim, hid_dim, padding_idx=src_pad_idx)
        self.src_pos_emb = nn.Embedding(max_len, hid_dim)
        self.trg_tok_emb = nn.Embedding(trg_voc_dim, hid_dim, padding_idx=trg_pad_idx)
        self.trg_pos_emb = nn.Embedding(max_len, hid_dim)

        self.fc = nn.Linear(hid_dim, trg_voc_dim)
        self.dropout = nn.Dropout(dropout)
        self.scale = hid_dim ** 0.5

    def forward(self, src: torch.Tensor, trg: torch.Tensor):
        # src: (src_step, batch)
        # trg: (trg_step-1, batch)
        # For the target sequence, the `<eos>` token should be sliced off before passing to the decoder. 
        # As there are no more tokens to be predicted after `<eos>`. 
        trg = trg[:-1]

        # src_mask: (batch, src_step)
        src_mask = (src == self.src_tok_emb.padding_idx).T
        # trg_mask: (trg_step-1, trg_step-1)
        trg_mask = (torch.ones(trg.size(0), trg.size(0), device=trg.device) * -np.inf).triu(1)
        
        # src_embedded: (src_step, batch, hid_dim)
        src_pos = torch.arange(src.size(0), device=src.device).unsqueeze(-1).repeat(1, src.size(1))
        src_embedded = self.dropout(self.src_tok_emb(src)*self.scale + self.src_pos_emb(src_pos))
        # trg_embedded: (trg_step-1, batch, hid_dim)
        trg_pos = torch.arange(trg.size(0), device=trg.device).unsqueeze(-1).repeat(1, trg.size(1))
        trg_embedded = self.dropout(self.trg_tok_emb(trg)*self.scale + self.trg_pos_emb(trg_pos))

        # trans_outs: (trg_step-1, batch, hid_dim)
        trans_outs = transformer(src_embedded, trg_embedded, src_key_padding_mask=src_mask, memory_key_padding_mask=src_mask, tgt_mask=trg_mask)
        # preds: (trg_step-1, batch, trg_voc_dim)
        return self.fc(trans_outs)

    def translate(self, src: torch.Tensor, sos: int, trg_max_len: int=50):
        # Ensure `dropout` off, or the result would change randomly. 
        self.eval()
        # src: (src_step, batch)
        # src_mask: (batch, src_step)
        src_mask = (src == self.src_tok_emb.padding_idx).T

        # src_embedded: (src_step, batch, hid_dim)
        src_pos = torch.arange(src.size(0), device=src.device).unsqueeze(-1).repeat(1, src.size(1))
        src_embedded = self.dropout(self.src_tok_emb(src)*self.scale + self.src_pos_emb(src_pos))

        # Create an target sequence. 
        # trg: (trg_step, batch)
        trg = torch.ones(trg_max_len, src.size(1), 
                         dtype=torch.long, device=src.device) * sos
        trg_pos = torch.arange(trg.size(0), device=trg.device).unsqueeze(-1).repeat(1, trg.size(1))

        # The inference would be slow, since there is much repeated computation. 
        for t in range(1, trg.size(0)):
            # (2) The input target sequence should be `trg[:(t+1)]`; with `<eos>` token sliced off, 
            # it becomes `trg[:t]`. Note that the elements in step `t-1` is just predicted in the 
            # last loop.  
            # trg_mask_t: (trg_step-1=t, trg_step-1=t)
            trg_mask_t = (torch.ones(t, t, device=trg.device) * -np.inf).triu(1)

            # trg_embedded_t: (trg_step-1=t, batch, hid_dim)
            trg_embedded_t = self.dropout(self.trg_tok_emb(trg[:t])*self.scale + self.trg_pos_emb(trg_pos[:t]))

            # trans_outs_t: (trg_step-1=t, batch, hid_dim)
            trans_outs_t = transformer(src_embedded, trg_embedded_t, src_key_padding_mask=src_mask, memory_key_padding_mask=src_mask, tgt_mask=trg_mask_t)

            # preds_t: (batch, trg_step-1=t, trg_out_dim)
            preds_t = self.fc(trans_outs_t)

            # top1: (batch, )
            top1 = preds_t[-1].argmax(dim=-1)
            trg[t] = top1
            
        # The decoder's output at the last step is the desired result (over all steps). 
        # `trg[1:]` equals `preds_t.argmax(dim=-1)` - if the `dropout` off
        assert (trg[1:] == preds_t.argmax(dim=-1)).all().item()
        return trg[1:]

In [18]:
model = Seq2Seq(transformer, SRC_IN_DIM, TRG_IN_DIM, ENC_PAD_IDX, DEC_PAD_IDX, ENC_DROPOUT).to(device)
preds = model(batch_src, batch_trg)

print(batch_src.size())
print(batch_trg.size())
print(preds.size())

torch.Size([30, 128])
torch.Size([29, 128])
torch.Size([28, 128, 5893])


In [19]:
DEC_SOS_IDX = TRG.vocab.stoi[TRG.init_token]
DEC_EOS_IDX = TRG.vocab.stoi[TRG.eos_token]
pred_indexes = model.translate(batch_src, DEC_SOS_IDX)

print(pred_indexes.size())

torch.Size([49, 128])


In [20]:
# Check if data are mixed across different samples in a batch.
model.eval()
preds_012 = model(batch_src[:, 0:3], batch_trg[:, 0:3])
preds_123 = model(batch_src[:, 1:4], batch_trg[:, 1:4])
(preds_012[:, 1:] == preds_123[:, :2]).all()

tensor(True, device='cuda:0')

## Training the Model

In [21]:
# This is important...
def init_weights(m: nn.Module):
    for name, param in m.named_parameters():
        if 'weight' in name and param.dim() > 1:
            nn.init.xavier_uniform_(param.data)

def count_parameters(model: nn.Module):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# encoder_layer = nn.TransformerEncoderLayer(d_model=HID_DIM, nhead=ENC_HEADS, dim_feedforward=ENC_PF_DIM, dropout=ENC_DROPOUT).to(device)
# encoder = nn.TransformerEncoder(encoder_layer, num_layers=ENC_LAYERS).to(device)
# decoder_layer = nn.TransformerDecoderLayer(d_model=HID_DIM, nhead=DEC_HEADS, dim_feedforward=DEC_PF_DIM, dropout=DEC_DROPOUT).to(device)
# decoder = nn.TransformerDecoder(decoder_layer, num_layers=DEC_LAYERS).to(device)
# transformer = nn.Transformer(d_model=HID_DIM, custom_encoder=encoder, custom_decoder=decoder).to(device)

transformer = nn.Transformer(d_model=HID_DIM, nhead=ENC_HEADS, num_encoder_layers=ENC_LAYERS, num_decoder_layers=DEC_LAYERS, dim_feedforward=ENC_PF_DIM, dropout=ENC_DROPOUT).to(device)
model = Seq2Seq(transformer, SRC_IN_DIM, TRG_IN_DIM, ENC_PAD_IDX, DEC_PAD_IDX, ENC_DROPOUT).to(device)

model.apply(init_weights)
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 9,039,621 trainable parameters


In [22]:
# Initialize Embeddings 
ENC_UNK_IDX = SRC.vocab.stoi[SRC.unk_token]
DEC_UNK_IDX = TRG.vocab.stoi[TRG.unk_token]

model.src_tok_emb.weight.data[ENC_UNK_IDX].zero_()
model.src_tok_emb.weight.data[ENC_PAD_IDX].zero_()
model.trg_tok_emb.weight.data[DEC_UNK_IDX].zero_()
model.trg_tok_emb.weight.data[DEC_PAD_IDX].zero_()

print(model.src_tok_emb.weight[:5, :8])
print(model.trg_tok_emb.weight[:5, :8])

tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.0184, -0.0200, -0.0165,  0.0215, -0.0038,  0.0216, -0.0175,  0.0079],
        [-0.0094,  0.0131, -0.0214, -0.0269,  0.0110, -0.0021, -0.0092,  0.0062],
        [ 0.0064,  0.0035, -0.0241,  0.0014, -0.0258,  0.0052,  0.0032,  0.0126]],
       device='cuda:0', grad_fn=<SliceBackward>)
tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0137, -0.0169, -0.0053,  0.0119, -0.0081,  0.0172, -0.0046,  0.0161],
        [-0.0128,  0.0117, -0.0254,  0.0119, -0.0251,  0.0178, -0.0033,  0.0144],
        [ 0.0173, -0.0076, -0.0297, -0.0124, -0.0151,  0.0155,  0.0272, -0.0047]],
       device='cuda:0', grad_fn=<SliceBackward>)


In [23]:
loss_func = nn.CrossEntropyLoss(ignore_index=DEC_PAD_IDX, reduction='mean')
# The `lr` is important...
optimizer = optim.AdamW(model.parameters(), lr=0.0005)

In [24]:
def train_epoch(model, iterator, optimizer, loss_func, clip):
    model.train()
    epoch_loss = 0
    for batch in iterator:
        # Forward pass
        batch_src, batch_src_lens = batch.src
        batch_trg, batch_trg_lens = batch.trg
        # preds: (batch, trg_step-1, trg_out_dim)
        preds = model(batch_src, batch_trg)
        
        # Calculate loss
        preds_flattened = preds.view(-1, preds.size(-1))
        batch_trg_flattened = batch_trg[1:].flatten()
        loss = loss_func(preds_flattened, batch_trg_flattened)

        # Backward propagation
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)

        # Update weights
        optimizer.step()
        # Accumulate loss
        epoch_loss += loss.item()
    return epoch_loss/len(iterator)

def eval_epoch(model, iterator, loss_func):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for batch in iterator:
            # Forward pass
            batch_src, batch_src_lens = batch.src
            batch_trg, batch_trg_lens = batch.trg
            # preds: (batch, trg_step-1, trg_out_dim)
            preds = model(batch_src, batch_trg)
            
            # Calculate loss
            preds_flattened = preds.view(-1, preds.size(-1))
            batch_trg_flattened = batch_trg[1:].flatten()
            loss = loss_func(preds_flattened, batch_trg_flattened)
            
            # Accumulate loss and acc
            epoch_loss += loss.item()
    return epoch_loss/len(iterator)

In [25]:
import time
N_EPOCHS = 10
CLIP = 1
best_valid_loss = np.inf

for epoch in range(N_EPOCHS):
    t0 = time.time()
    train_loss = train_epoch(model, train_iterator, optimizer, loss_func, CLIP)
    valid_loss = eval_epoch(model, valid_iterator, loss_func)
    epoch_secs = time.time() - t0

    epoch_mins, epoch_secs = int(epoch_secs // 60), int(epoch_secs % 60)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "models/tut7-model.pt")
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {np.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {np.exp(valid_loss):7.3f}')

Epoch: 01 | Epoch Time: 0m 13s
	Train Loss: 4.023 | Train PPL:  55.855
	 Val. Loss: 2.689 |  Val. PPL:  14.717
Epoch: 02 | Epoch Time: 0m 13s
	Train Loss: 2.536 | Train PPL:  12.626
	 Val. Loss: 2.112 |  Val. PPL:   8.262
Epoch: 03 | Epoch Time: 0m 13s
	Train Loss: 2.030 | Train PPL:   7.615
	 Val. Loss: 1.862 |  Val. PPL:   6.439
Epoch: 04 | Epoch Time: 0m 13s
	Train Loss: 1.717 | Train PPL:   5.570
	 Val. Loss: 1.707 |  Val. PPL:   5.514
Epoch: 05 | Epoch Time: 0m 13s
	Train Loss: 1.489 | Train PPL:   4.434
	 Val. Loss: 1.633 |  Val. PPL:   5.120
Epoch: 06 | Epoch Time: 0m 13s
	Train Loss: 1.310 | Train PPL:   3.708
	 Val. Loss: 1.600 |  Val. PPL:   4.955
Epoch: 07 | Epoch Time: 0m 14s
	Train Loss: 1.166 | Train PPL:   3.209
	 Val. Loss: 1.597 |  Val. PPL:   4.936
Epoch: 08 | Epoch Time: 0m 13s
	Train Loss: 1.043 | Train PPL:   2.839
	 Val. Loss: 1.592 |  Val. PPL:   4.914
Epoch: 09 | Epoch Time: 0m 14s
	Train Loss: 0.938 | Train PPL:   2.554
	 Val. Loss: 1.622 |  Val. PPL:   5.065
E

In [26]:
model.load_state_dict(torch.load("models/tut7-model.pt", map_location=device))

valid_loss = eval_epoch(model, valid_iterator, loss_func)
test_loss = eval_epoch(model, test_iterator, loss_func)

print(f'Val. Loss: {valid_loss:.3f} |  Val. PPL: {np.exp(valid_loss):7.3f}')
print(f'Test Loss: {test_loss:.3f} |  Test PPL: {np.exp(test_loss):7.3f}')

Val. Loss: 1.592 |  Val. PPL:   4.914
Test Loss: 1.646 |  Test PPL:   5.187


## Check Embeddings
* The Embeddings of `<unk>` and `<pad>` tokens
    * Because the `padding_idx` has been passed to `nn.Embedding`, so the `<pad>` embedding will remain zeros throughout training.  
    * While the `<unk>` embedding will be learned.

In [27]:
print(model.src_tok_emb.weight[:5, :8])
print(model.trg_tok_emb.weight[:5, :8])

tensor([[ 0.0145, -0.0028, -0.0133, -0.0288, -0.0227,  0.0068,  0.0158, -0.0114],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.0326, -0.0211, -0.0186,  0.0078, -0.0133,  0.0187, -0.0087, -0.0062],
        [ 0.0179,  0.0209, -0.0283, -0.0350,  0.0054,  0.0049, -0.0063, -0.0033],
        [ 0.0119,  0.0186, -0.0199, -0.0004, -0.0388,  0.0182,  0.0030, -0.0090]],
       device='cuda:0', grad_fn=<SliceBackward>)
tensor([[-0.0110,  0.0086,  0.0037, -0.0252, -0.0145, -0.0162,  0.0091, -0.0139],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0040, -0.0098,  0.0058,  0.0036, -0.0020,  0.0027,  0.0047,  0.0114],
        [-0.0127,  0.0116, -0.0252,  0.0118, -0.0249,  0.0176, -0.0033,  0.0143],
        [ 0.0193, -0.0043, -0.0161,  0.0036, -0.0015,  0.0110,  0.0005, -0.0074]],
       device='cuda:0', grad_fn=<SliceBackward>)


## Inference

In [28]:
def translate(model, src_tokens):
    """
    Single sentence translation.
    """
    model.eval()
    src_indexes = [SRC.vocab.stoi[tok] for tok in src_tokens]
    src = torch.tensor(src_indexes, dtype=torch.long, device=device).unsqueeze(-1)

    with torch.no_grad():
        # pred_indexes: (batch, trg_step-1)
        pred_indexes = model.translate(src, DEC_SOS_IDX)

    trans_tokens = []
    for idx in pred_indexes.flatten():
        tok = TRG.vocab.itos[idx.item()]
        trans_tokens.append(tok)
        if tok == TRG.eos_token:
            break
    return trans_tokens

In [29]:
ex_idx = 0
src_tokens = train_data[ex_idx].src
trg_tokens = train_data[ex_idx].trg
src_tokens = [SRC.init_token] + src_tokens + [SRC.eos_token]
trans_tokens = translate(model, src_tokens)

print(" ".join(src_tokens))
print(" ".join(trans_tokens))
print(" ".join(trg_tokens))

<sos> zwei junge weiße männer sind im freien in der nähe vieler büsche . <eos>
two young white men are outside near many bushes . <eos>
two young , white males are outside near many bushes .


In [30]:
ex_idx = 1
src_tokens = train_data[ex_idx].src
trg_tokens = train_data[ex_idx].trg
src_tokens = [SRC.init_token] + src_tokens + [SRC.eos_token]
trans_tokens = translate(model, src_tokens)

print(" ".join(src_tokens))
print(" ".join(trans_tokens))
print(" ".join(trg_tokens))

<sos> mehrere männer mit schutzhelmen bedienen ein antriebsradsystem . <eos>
several men in hard hats are operating a <unk> . <eos>
several men in hard hats are operating a giant pulley system .


In [31]:
ex_idx = 2
src_tokens = train_data[ex_idx].src
trg_tokens = train_data[ex_idx].trg
src_tokens = [SRC.init_token] + src_tokens + [SRC.eos_token]
trans_tokens = translate(model, src_tokens)

print(" ".join(src_tokens))
print(" ".join(trans_tokens))
print(" ".join(trg_tokens))

<sos> ein kleines mädchen klettert in ein spielhaus aus holz . <eos>
a little girl climbs into a wooden playhouse . <eos>
a little girl climbing into a wooden playhouse .


In [32]:
ex_idx = 3
src_tokens = train_data[ex_idx].src
trg_tokens = train_data[ex_idx].trg
src_tokens = [SRC.init_token] + src_tokens + [SRC.eos_token]
trans_tokens = translate(model, src_tokens)

print(" ".join(src_tokens))
print(" ".join(trans_tokens))
print(" ".join(trg_tokens))

<sos> ein mann in einem blauen hemd steht auf einer leiter und putzt ein fenster . <eos>
a man in a blue shirt stands on a ladder cleaning a window . <eos>
a man in a blue shirt is standing on a ladder cleaning a window .


## BLEU

*BLEU* (Bilingual Evaluation Understudy) is a metric measuring the quality of translation.  
BLEU looks at the overlap in the predicted and actual target sequences in terms of their *n-grams*.  
BLEU gives a number between 0 and 1 for each sequence, and a higher BLEU suggests better translation quality.  
BLEU being 1 means a perfect overlap, i.e., a perfect translation.  

In [33]:
from torchtext.data.metrics import bleu_score

def calc_bleu(data):
    trg_data = []
    trans_data = []
    for ex in data:
        src_tokens = ex.src
        trg_tokens = ex.trg
        src_tokens = [SRC.init_token] + src_tokens + [SRC.eos_token]
        trans_tokens = translate(model, src_tokens)

        # The groudtruth may contain multiple actual sentences (right translations). 
        trg_data.append([trg_tokens])
        trans_data.append(trans_tokens[:-1])  # Cut off the <eos> token

    return bleu_score(trans_data, trg_data)

In [34]:
bleu = calc_bleu(test_data)

print(f'BLEU score is: {bleu*100:.2f}')

BLEU score is: 36.00
