# NLP LAB 2 - Encoder-decoder model

Authors:
* Aurelien ROUXEL
* Ethan MACHAVOINE
* Jonathan POELGER

## Tutorial

In [1]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import multi30k, Multi30k
from typing import Iterable, List


# We need to modify the URLs for the dataset since the links to the original dataset are broken
# Refer to https://github.com/pytorch/text/issues/1756#issuecomment-1163664163 for more info
multi30k.URL["train"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz"
multi30k.URL["valid"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz"

SRC_LANGUAGE = 'de'
TGT_LANGUAGE = 'en'

# Place-holders
token_transform = {}
vocab_transform = {}

In [2]:
token_transform[SRC_LANGUAGE] = get_tokenizer('spacy', language='de_core_news_sm')
token_transform[TGT_LANGUAGE] = get_tokenizer('spacy', language='en_core_web_sm')


# helper function to yield list of tokens
def yield_tokens(data_iter: Iterable, language: str) -> List[str]:
    language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}

    for data_sample in data_iter:
        yield token_transform[language](data_sample[language_index[language]])

# Define special symbols and indices
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['< unk >', '<pad>', '<bos>', '<eos>']

for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    # Training data Iterator
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    # Create torchtext's Vocab object
    vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_iter, ln),
                                                    min_freq=1,
                                                    specials=special_symbols,
                                                    special_first=True)

# Set ``UNK_IDX`` as the default index. This index is returned when the token is not found.
# If not set, it throws ``RuntimeError`` when the queried token is not found in the Vocabulary.
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
  vocab_transform[ln].set_default_index(UNK_IDX)

2023-05-22 18:27:01.491522: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-22 18:27:03.069388: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-22 18:27:03.074338: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-22 18:27:03.074597: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least on

In [3]:
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

# Seq2Seq Network
class Seq2SeqTransformer(nn.Module):
    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = Transformer(d_model=emb_size,
                                       nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(
            emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        return self.transformer.encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.transformer.decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)

In [4]:
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask


def create_mask(src, tgt):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=DEVICE).type(torch.bool)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

In [5]:
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask


def create_mask(src, tgt):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=DEVICE).type(torch.bool)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

In [6]:
torch.manual_seed(0)

SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 128
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(DEVICE)

loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

In [7]:
from torch.nn.utils.rnn import pad_sequence

# helper function to club together sequential operations
def sequential_transforms(*transforms):
    def func(txt_input):
        for transform in transforms:
            txt_input = transform(txt_input)
        return txt_input
    return func

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int]):
    return torch.cat((torch.tensor([BOS_IDX]),
                      torch.tensor(token_ids),
                      torch.tensor([EOS_IDX])))

# ``src`` and ``tgt`` language text transforms to convert raw strings into tensors indices
text_transform = {}
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    text_transform[ln] = sequential_transforms(token_transform[ln], #Tokenization
                                               vocab_transform[ln], #Numericalization
                                               tensor_transform) # Add BOS/EOS and create tensor


# function to collate data samples into batch tensors
def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch:
        src_batch.append(text_transform[SRC_LANGUAGE](src_sample.rstrip("\n")))
        tgt_batch.append(text_transform[TGT_LANGUAGE](tgt_sample.rstrip("\n")))

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return src_batch, tgt_batch

In [8]:
from torch.utils.data import DataLoader

def train_epoch(model, optimizer):
    model.train()
    losses = 0
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in train_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        optimizer.zero_grad()

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()

    return losses / len(list(train_dataloader))


def evaluate(model):
    model.eval()
    losses = 0

    val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in val_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        losses += loss.item()

    return losses / len(list(val_dataloader))

In [9]:
from timeit import default_timer as timer
NUM_EPOCHS = 18

for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(transformer, optimizer)
    end_time = timer()
    val_loss = evaluate(transformer)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))


# function to generate output sequence using greedy algorithm
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys


# actual function to translate input sentence into target language
def translate(model: torch.nn.Module, src_sentence: str):
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")



Epoch: 1, Train loss: 5.343, Val loss: 4.113, Epoch time = 43.071s
Epoch: 2, Train loss: 3.762, Val loss: 3.323, Epoch time = 42.175s
Epoch: 3, Train loss: 3.158, Val loss: 2.891, Epoch time = 42.501s
Epoch: 4, Train loss: 2.767, Val loss: 2.633, Epoch time = 42.569s
Epoch: 5, Train loss: 2.480, Val loss: 2.444, Epoch time = 42.618s
Epoch: 6, Train loss: 2.247, Val loss: 2.303, Epoch time = 42.641s
Epoch: 7, Train loss: 2.054, Val loss: 2.201, Epoch time = 42.794s
Epoch: 8, Train loss: 1.894, Val loss: 2.116, Epoch time = 42.846s
Epoch: 9, Train loss: 1.754, Val loss: 2.055, Epoch time = 43.865s
Epoch: 10, Train loss: 1.630, Val loss: 1.995, Epoch time = 43.958s
Epoch: 11, Train loss: 1.520, Val loss: 1.961, Epoch time = 44.006s
Epoch: 12, Train loss: 1.419, Val loss: 1.948, Epoch time = 43.913s
Epoch: 13, Train loss: 1.334, Val loss: 1.956, Epoch time = 44.014s
Epoch: 14, Train loss: 1.252, Val loss: 1.953, Epoch time = 43.750s
Epoch: 15, Train loss: 1.171, Val loss: 1.939, Epoch time

In [10]:
print(translate(transformer, "Eine Gruppe von Menschen steht vor einem Iglu ."))

 A group of people stand in an auditorium . 


Result:
* A group of people stand in an auditorium .

## 1. Theoretical questions

#### In the positional encoding, why are we using a combination of sinus and cosinus ?

 * The combination of sinus and cosinus functions in the positional encodings is used to ensures that the positional information is represented in a continuous and smooth manner. This allows the model to generalize well to sequences of different lengths and enables the model to learn token positions better.

#### In the Seq2SeqTransformer class, what is the parameter nhead for ?
 
 * The nhead parameter in the Seq2SeqTransformer class corresponds to the number of attention heads in the Transformer model. Each attention head will learn to focus on different aspects of the input sequence, attending to different positions and learning different relationships.

#### In the Seq2SeqTransformer class, what is the point of the generator ?

 * The purpose of the generator layer is to transform the hidden states into a probability distribution over the target vocabulary. Each element of the output tensor represents the likelihood of the corresponding word in the vocabulary being the next word in the output sequence.

#### Describe the goal of the `create_mask` function. Why does it handle differently the source and target masks ?

* The create_mask function is responsible for creating masks that indicate in the input sequence and in the output sequence which positions should be attended to and which positions should be ignored during the self-attention calculation in the Transformer model. The goal of the create_mask function is to ensure that the model attends only to the relevant positions and prevents attending to future positions during training and prediction. It handles the source and the target differently because:
    * The source mask is used to prevent the model from attending to the padding tokens in the input sequence.
    * The target mask is used in both the encoder and decoder parts of the Transformer. In the encoder, it serves the same purpose as the source mask but, in the decoder, the target mask has an additional role, which is to prevent attending to future positions during training.

## 2. Decoding functions

#### Top-k sampling with temperature.

In [11]:
def top_k_decode(model, src, src_mask, max_len, start_symbol, top_k = 1, temperature = 1.0):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out[-1, :] / temperature
        filtered_logits, indices = torch.topk(out, top_k, dim=-1)
        probabilities = torch.nn.functional.softmax(filtered_logits, dim=-1)
        next_word = torch.multinomial(probabilities, num_samples=1).item()

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys


In [12]:
def translate_topk(model: torch.nn.Module, src_sentence: str, topk, temperature):
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = top_k_decode(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX, top_k = topk, temperature = temperature).flatten()
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")

In [13]:
print(translate_topk(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", 40, 0.8))

 an woman . black at . boy blue a  of <unk> 


Result:
*    an woman . black at . boy blue a  of $< unk >$ 

#### 

#### Top-p sampling with temperature.

In [24]:
def top_p_decode(model, src, src_mask, max_len, start_symbol, top_p = 1, temperature = 1.0):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out[-1, :] / temperature
        sorted_probs, sorted_indices = torch.sort(out, descending=True)
        cumulative_probs = torch.cumsum(torch.nn.functional.softmax(sorted_probs, dim=-1), dim=-1)
        sorted_indices_to_remove = cumulative_probs > top_p
        sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone()
        sorted_indices_to_remove[:, 0] = 0
        indices_to_remove = sorted_indices_to_remove.scatter(dim=1, index=sorted_indices, src=sorted_indices_to_remove)
        out[indices_to_remove] = float('-inf')
        out = torch.nn.functional.softmax(out, dim=-1)
        next_word = torch.multinomial(out, num_samples=1).item()

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys

In [25]:
def translate_topp(model: torch.nn.Module, src_sentence: str, topp, temperature):
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = top_p_decode(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX, top_p = topp, temperature = temperature).flatten()
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")

In [28]:
print(translate_topp(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", 0.6, 0.8))

 brick women it hands sign helmet watching sweatshirt all climbing workers young a food workers


Result:
* brick women it hands sign helmet watching sweatshirt all climbing workers young a food workers

#### Play with the k, p and temperature parameters, and compare a few translation samples for each approach.

In [30]:
parameters_k = [10, 20, 40]
parameters_p = [0.3, 0.5, 0.7]
parameters_temp = [0.5, 0.8, 1.2]
sentences = ["Wie lange lebst du schon in Stuttgart ?", "Eine Gruppe von Menschen steht vor einem Iglu .", "Möglichst ganze Wörter eingeben, die im Artikeltext, insbesondere aber im Lemma vorkommen sollen. "]
translations = ["How long have you been living in Stuttgart?", "A group of people standing in front of an igloo .", "If possible, enter whole words that should appear in the article text, but especially in the lemma."]
for temp in parameters_temp:
    for k, p in zip(parameters_k, parameters_p):
        print("Top-K: " + str(k) + " Top-P: " + str(p) + " Temperature: " + str(temp))
        for sentence, translation in zip(sentences, translations):
            print("Sentence: " + sentence)
            print("Translation Top-K: " + translate_topk(transformer, sentence, k, temp))
            print("Translation Top-P: " + translate_topp(transformer, sentence, p, temp))
            print("Translation Greedy: " + translate(transformer, sentence))
            print("True Translation: " + translation)
            print("")

Top-K: 10 Top-P: 0.3 Temperature: 0.5
Sentence: Wie lange lebst du schon in Stuttgart ?
Translation Top-K:   <unk> .  <pad> <unk> <pad> . on A A <pad>  <unk>
Translation Top-P:  arm very run Men fire fire stone left Men adult fire fire fire fire
Translation Greedy:  Large long - haired great Dane can be using matching factory ? 
True Translation: How long have you been living in Stuttgart?

Sentence: Eine Gruppe von Menschen steht vor einem Iglu .
Translation Top-K:  . the <unk> <unk> <unk> <unk> <pad> 
Translation Top-P:  putting gray gray gray gray gray gray photo photo between down down down down down
Translation Greedy:  A group of people stand in an auditorium . 
True Translation: A group of people standing in front of an igloo .

Sentence: Möglichst ganze Wörter eingeben, die im Artikeltext, insbesondere aber im Lemma vorkommen sollen. 
Translation Top-K:  a a <pad>  on <unk> in <unk> .  <pad>  on on A <unk> A <pad>  a <pad> 
Translation Top-P:  fishing cross cross both playing b

Result:

Top-K: 10 Top-P: 0.3 Temperature: 0.5
* Sentence: Wie lange lebst du schon in Stuttgart ?
    
    * Translation Top-K:   < unk > .  < pad > < unk >  < pad > . on A A < pad >  < unk >
    * Translation Top-P:  arm very run Men fire fire stone left Men adult fire fire fire fire
    * Translation Greedy:  Large long - haired great Dane can be using matching factory ? 
    * True Translation: How long have you been living in Stuttgart?


* Sentence: Eine Gruppe von Menschen steht vor einem Iglu .
    
    * Translation Top-K:  . the < unk > < unk > < unk > < unk > < pad > 
    * Translation Top-P:  putting gray gray gray gray gray gray photo photo between down down down down down
    * Translation Greedy:  A group of people stand in an auditorium . 
    * True Translation: A group of people standing in front of an igloo .


* Sentence: Möglichst ganze Wörter eingeben, die im Artikeltext, insbesondere aber im Lemma vorkommen sollen. 
    
    * Translation Top-K:  a a < pad >  on < unk > in < unk > .  < pad >  on on A < unk > A < pad >  a < pad > 
    * Translation Top-P:  fishing cross cross both playing both playing African eating full cross does does does sand does does sand their crowded climbing while
    * Translation Greedy:  A female snow athlete with matching belt , who is crouched together in the snowy moment . 
    * True Translation: If possible, enter whole words that should appear in the article text, but especially in the lemma.


Top-K: 20 Top-P: 0.5 Temperature: 0.5
* Sentence: Wie lange lebst du schon in Stuttgart ?
    
    * Translation Top-K:  a < pad > < unk > < unk > < pad > in < unk > 
    * Translation Top-P:  head old court crowd cross leaning leaning floor Men floor Men climbing climbing while
    * Translation Greedy:  Large long - haired great Dane can be using matching factory ? 
    * True Translation: How long have you been living in Stuttgart?


* Sentence: Eine Gruppe von Menschen steht vor einem Iglu .
    
    * Translation Top-K:  < pad > < pad > < unk > < unk >  < pad >  on 
    * Translation Top-P:  sign cart cart cart cart smiling climbing smiling climbing An climbing climbing climbing An down
    * Translation Greedy:  A group of people stand in an auditorium . 
    * True Translation: A group of people standing in front of an igloo .


* Sentence: Möglichst ganze Wörter eingeben, die im Artikeltext, insbesondere aber im Lemma vorkommen sollen. 
    
    * Translation Top-K:  woman A < pad > on man < pad > 
    * Translation Top-P:  fishing cross old African both stage fire fire watching fire fire fire cowboy fire men men fire fire climbing stage stage their
    * Translation Greedy:  A female snow athlete with matching belt , who is crouched together in the snowy moment . 
    * True Translation: If possible, enter whole words that should appear in the article text, but especially in the lemma.


Top-K: 40 Top-P: 0.7 Temperature: 0.5
* Sentence: Wie lange lebst du schon in Stuttgart ?
    
    * Translation Top-K:  and < unk > with < unk > shirt < pad >  A people . < unk > girl , < unk >
    * Translation Top-P:  volleyball hat old women fire front Men computer fire while floor fire fire football
    * Translation Greedy:  Large long - haired great Dane can be using matching factory ? 
    * True Translation: How long have you been living in Stuttgart?


* Sentence: Eine Gruppe von Menschen steht vor einem Iglu .
    
    * Translation Top-K:  white . in < pad > < unk > white girl < pad > his < pad > < unk > < pad > the are ,
    * Translation Top-P:  wave cart cart cart gray smiling smiling cart left climbing down climbing climbing down climbing
    * Translation Greedy:  A group of people stand in an auditorium . 
    * True Translation: A group of people standing in front of an igloo .


* Sentence: Möglichst ganze Wörter eingeben, die im Artikeltext, insbesondere aber im Lemma vorkommen sollen. 
    
    * Translation Top-K:  < unk > in is < unk > is to  are  
    * Translation Top-P:  fishing cross old both full cross both water stage pushing stage fire fire men dog men men fire men climbing both while
    * Translation Greedy:  A female snow athlete with matching belt , who is crouched together in the snowy moment . 
    * True Translation: If possible, enter whole words that should appear in the article text, but especially in the lemma.


Top-K: 10 Top-P: 0.3 Temperature: 0.8
* Sentence: Wie lange lebst du schon in Stuttgart ?
    
    * Translation Top-K:  < unk > < pad > . in A < pad > in < unk > 
    * Translation Top-P:  running old court rocks fire football Men fire floor fire fire fire fire while
    * Translation Greedy:  Large long - haired great Dane can be using matching factory ? 
    * True Translation: How long have you been living in Stuttgart?


* Sentence: Eine Gruppe von Menschen steht vor einem Iglu .
    
    * Translation Top-K:   the < pad > < unk > A < unk > 
    * Translation Top-P:  posing drink climbing working ready An working smiling climbing climbing workers between climbing while climbing
    * Translation Greedy:  A group of people stand in an auditorium . 
    * True Translation: A group of people standing in front of an igloo .


* Sentence: Möglichst ganze Wörter eingeben, die im Artikeltext, insbesondere aber im Lemma vorkommen sollen. 
    
    * Translation Top-K:  < unk > < unk > < pad > 
    * Translation Top-P:  fishing cross leaning fence front have scarf stage crowded arms left fire fire watching their day stage climbing their appears men climbing
    * Translation Greedy:  A female snow athlete with matching belt , who is crouched together in the snowy moment . 
    * True Translation: If possible, enter whole words that should appear in the article text, but especially in the lemma.


Top-K: 20 Top-P: 0.5 Temperature: 0.8
* Sentence: Wie lange lebst du schon in Stuttgart ?
    
    * Translation Top-K:  < unk > A and are the < pad > is man woman < unk > on < pad > of 
    * Translation Top-P:  car full floor - face computer motorcycle hat motorcycle aged full talking aged lined
    * Translation Greedy:  Large long - haired great Dane can be using matching factory ? 
    * True Translation: How long have you been living in Stuttgart?


* Sentence: Eine Gruppe von Menschen steht vor einem Iglu .
   
    * Translation Top-K:  woman 
    * Translation Top-P:  ramp who bridge Man Many wave between smiling Men short men soccer woods helmet short
    * Translation Greedy:  A group of people stand in an auditorium . 
    * True Translation: A group of people standing in front of an igloo .


* Sentence: Möglichst ganze Wörter eingeben, die im Artikeltext, insbesondere aber im Lemma vorkommen sollen. 
    
    * Translation Top-K:  are and in  man Two < unk > the to man < pad > . Two the < pad > man < unk > woman of  < unk > < pad >
    * Translation Top-P:  where leaning cross its glasses others baseball crowded sky fire on fish leaning green crowded walks subway while tall bus adult jumping
    * Translation Greedy:  A female snow athlete with matching belt , who is crouched together in the snowy moment . 
    * True Translation: If possible, enter whole words that should appear in the article text, but especially in the lemma.


Top-K: 40 Top-P: 0.7 Temperature: 0.8
* Sentence: Wie lange lebst du schon in Stuttgart ?
    
    * Translation Top-K:  < pad > is man his < unk > black man men at of 
    * Translation Top-P:  jersey cross glasses leaning motorcycle Men Men jeans while instrument while colorful Men Men
    * Translation Greedy:  Large long - haired great Dane can be using matching factory ? 
    * True Translation: How long have you been living in Stuttgart?


* Sentence: Eine Gruppe von Menschen steht vor einem Iglu .
    * Translation Top-K:  , shirt girl Two white on < unk > people shirt < pad > young the < unk > 
    * Translation Top-P:  posing who over down left climbing he snowy down down stairs orange down fire several
    * Translation Greedy:  A group of people stand in an auditorium . 
    * True Translation: A group of people standing in front of an igloo .


* Sentence: Möglichst ganze Wörter eingeben, die im Artikeltext, insbesondere aber im Lemma vorkommen sollen. 
    * Translation Top-K:  < pad > at < unk > young of at  black , blue woman and the of  
    * Translation Top-P:  trying stage stage colorful shirts waiting tan playing jump skateboard bench background between she dog men attire while performs basketball crowded climbing
    * Translation Greedy:  A female snow athlete with matching belt , who is crouched together in the snowy moment . 
    * True Translation: If possible, enter whole words that should appear in the article text, but especially in the lemma.


Top-K: 10 Top-P: 0.3 Temperature: 1.2
* Sentence: Wie lange lebst du schon in Stuttgart ?

    * Translation Top-K:  < unk > . 
    * Translation Top-P:  fishing African rocks leaning colorful other court hat piece Men climbing fire climbing while
    * Translation Greedy:  Large long - haired great Dane can be using matching factory ? 
    * True Translation: How long have you been living in Stuttgart?


* Sentence: Eine Gruppe von Menschen steht vor einem Iglu .
   
    * Translation Top-K:  < pad > < pad > < pad > < pad > 
    * Translation Top-P:  putting cart toward family sidewalk Man gray gray food climbing sweatshirt paper riding Children window
    * Translation Greedy:  A group of people stand in an auditorium . 
    * True Translation: A group of people standing in front of an igloo .


* Sentence: Möglichst ganze Wörter eingeben, die im Artikeltext, insbesondere aber im Lemma vorkommen sollen. 
   
    * Translation Top-K:   in a < unk > < unk > the the . A  A < pad > A the < unk > < unk > . < pad > < pad > a < unk > 
    * Translation Top-P:  edge shoes fire motorcycle scarf pants cross People cellphone on attire watching both scarf their couple green men men between catch An
    * Translation Greedy:  A female snow athlete with matching belt , who is crouched together in the snowy moment . 
    * True Translation: If possible, enter whole words that should appear in the article text, but especially in the lemma.


Top-K: 20 Top-P: 0.5 Temperature: 1.2
* Sentence: Wie lange lebst du schon in Stuttgart ?
 
    * Translation Top-K:  < unk > . < pad > to the . A < unk > < unk > < unk > < pad > of < unk > the
    * Translation Top-P:  head cross old band line event backpack between stone aged leaning while - paper
    * Translation Greedy:  Large long - haired great Dane can be using matching factory ? 
    * True Translation: How long have you been living in Stuttgart?


* Sentence: Eine Gruppe von Menschen steht vor einem Iglu .
    
    * Translation Top-K:  are , are a a of to a  the a Two 
    * Translation Top-P:  left sign going going between enjoying suit smiling smiling Men he event left Men while
    * Translation Greedy:  A group of people stand in an auditorium . 
    * True Translation: A group of people standing in front of an igloo .


* Sentence: Möglichst ganze Wörter eingeben, die im Artikeltext, insbesondere aber im Lemma vorkommen sollen. 
    
    * Translation Top-K:  
    * Translation Top-P:  yard run tree setting does playing table cross dog drinking full colorful jersey This while stage at filled football African African cap
    * Translation Greedy:  A female snow athlete with matching belt , who is crouched together in the snowy moment . 
    * True Translation: If possible, enter whole words that should appear in the article text, but especially in the lemma.


Top-K: 40 Top-P: 0.7 Temperature: 1.2
* Sentence: Wie lange lebst du schon in Stuttgart ?
    
    * Translation Top-K:  are .  sitting and A woman man young of standing at boy a
    * Translation Top-P:  inside fishing worker fishing making music leaning outdoor other leaning like elderly hat blue
    * Translation Greedy:  Large long - haired great Dane can be using matching factory ? 
    * True Translation: How long have you been living in Stuttgart?


* Sentence: Eine Gruppe von Menschen steht vor einem Iglu .
    
    * Translation Top-K:  sitting man wearing  his < pad > of at < pad > of black the at in black
    * Translation Top-P:  building gray hats person doing where floor something This tank camera < unk > orange ground construction
    * Translation Greedy:  A group of people stand in an auditorium . 
    * True Translation: A group of people standing in front of an igloo .


* Sentence: Möglichst ganze Wörter eingeben, die im Artikeltext, insbesondere aber im Lemma vorkommen sollen. 
    
    * Translation Top-K:  his . blue A man dog shirt boy < pad > 
    * Translation Top-P:  driving hands run stage " outfit their tree track stone both poses fire crowded does takes baby basketball cutting fence tall elderly
    * Translation Greedy:  A female snow athlete with matching belt , who is crouched together in the snowy moment . 
    * True Translation: If possible, enter whole words that should appear in the article text, but especially in the lemma.

## 3. Compute the BLEU score of the model

In [46]:
from sacrebleu.metrics import BLEU, CHRF, TER
bleu = BLEU()
predictions_topp = []
predictions_topk = []
predictions_greedy = []
for sentence in sentences:
    predictions_topp.append(translate_topp(transformer, sentence, 0.8, 0.8))
    predictions_topk.append(translate_topk(transformer, sentence, 60, 0.8))
    predictions_greedy.append(translate(transformer, sentence))

In [47]:
bleu.corpus_score(predictions_topp, translations)

BLEU = 0.00 0.0/0.0/0.0/0.0 (BP = 1.000 ratio = 17.000 hyp_len = 51 ref_len = 3)

Result:
* BLEU = 0.00 0.0/0.0/0.0/0.0 (BP = 1.000 ratio = 17.000 hyp_len = 51 ref_len = 3)

In [48]:
bleu.corpus_score(predictions_topk, translations)

BLEU = 0.57 1.5/0.8/0.4/0.2 (BP = 1.000 ratio = 22.333 hyp_len = 67 ref_len = 3)

Result:
* BLEU = 0.57 1.5/0.8/0.4/0.2 (BP = 1.000 ratio = 22.333 hyp_len = 67 ref_len = 3)

In [49]:
bleu.corpus_score(predictions_greedy, translations)

BLEU = 0.00 0.0/0.0/0.0/0.0 (BP = 1.000 ratio = 12.667 hyp_len = 38 ref_len = 3)

Result: 
* BLEU = 0.00 0.0/0.0/0.0/0.0 (BP = 1.000 ratio = 12.667 hyp_len = 38 ref_len = 3)