# Neural Machine Translation - Introduction to Deep Learning Homework

We will train Transformer-based Neural Machine Translation models on a translation task using two types of normalization, and compare their performance on the training and validation sets.

## Install requirements and import libraries

In [2]:
!pip -qq install sacrebleu subword-nmt sacremoses googletrans==3.1.0a0 wandb 

In [1]:
import copy
import os
import re
import sys
import time
from functools import partial

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import wandb
from subword_nmt.apply_bpe import BPE
from torch import optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader
from tqdm import tqdm as tqdm

import nmt_dataset
import nnet_models
%matplotlib inline

## Download the Training data

In [2]:
# Directory the BPE codes and the train/valid/test corpora are read from.
data_dir = 'data'
# Translation direction: from source_lang into target_lang.
source_lang, target_lang = 'en', 'fr'
# Per-language-pair directory holding the dictionaries and the validation outputs.
model_dir = 'models/{}-{}'.format(source_lang, target_lang)
!bash download-data.sh

## Preparing the dataset

### Load the BPE Model for tokenization

Byte-Pair Encoding (BPE) was introduced in *Neural Machine Translation of Rare Words with Subword Units* (Sennrich et al., 2015). BPE relies on a pre-tokenizer that splits the training data into words. Pre-tokenization can be as simple as splitting on spaces.

BPE creates a base vocabulary consisting of all symbols that occur in the set of unique words and learns merge rules to form a new symbol from two symbols of the base vocabulary. It does so until the vocabulary has attained the desired vocabulary size.

Source: [HuggingFace](https://huggingface.co/docs/transformers/tokenizer_summary)

In [3]:
def reset_seed(seed=1234):
    """Seed the NumPy and PyTorch random number generators with `seed`."""
    # NumPy first, then PyTorch.
    for seed_fn in (np.random.seed, torch.manual_seed):
        seed_fn(seed)

# Path of the BPE codes file inside data_dir.
bpe_path = os.path.join(data_dir, 'bpecodes.de-en-fr')

# Read the codes explicitly as UTF-8: the platform default encoding may differ
# and would corrupt (or fail on) non-ASCII merge rules.
with open(bpe_path, encoding='utf-8') as bpe_codes:
    bpe_model = BPE(bpe_codes)

def preprocess(line, is_source=True, source_lang=None, target_lang=None):
    """Lowercase `line` and segment it with the module-level BPE model.

    is_source, source_lang and target_lang are accepted but not used here.
    """
    lowercased = line.lower()
    return bpe_model.segment(lowercased)

def postprocess(line):
    """Undo the BPE segmentation of `line` by removing the '@@' continuation markers.

    Removes every '@@ ' joining two subword units, and also a dangling '@@' at
    the end of the line (e.g. when the output ends in the middle of a word),
    which a plain replace of '@@ ' would leave behind.
    """
    return re.sub(r'@@( |$)', '', line)

def load_data(source_lang, target_lang, split='train', max_size=None):
    """Load one split of the parallel corpus for a language pair.

    The corpus path is '<data_dir>/<split>.<l1>-<l2>' where l1 and l2 are the
    two language codes in sorted order. Every line goes through `preprocess`.

    max_size: max number of sentence pairs in the training corpus (None = all);
              set it to 10000 for fast debugging.
    """
    first_lang, second_lang = sorted([source_lang, target_lang])
    corpus_name = '{}.{}-{}'.format(split, first_lang, second_lang)
    return nmt_dataset.load_dataset(
        os.path.join(data_dir, corpus_name),
        source_lang,
        target_lang,
        preprocess=preprocess,
        max_size=max_size,
    )


### Load the parallel corpora for the language pair

The script will load a corpus and tokenize it with the BPE model!

In [4]:
# Training corpus, capped at 10000 sentence pairs for fast debugging
# (pass max_size=None to train on the whole corpus).
train_data = load_data(source_lang, target_lang, 'train', max_size=10000)
# Validation and test corpora are loaded without a size limit.
valid_data = load_data(source_lang, target_lang, 'valid')
test_data = load_data(source_lang, target_lang, 'test')

In [13]:
# Peek at the first five test pairs (raw and tokenized columns).
test_data.head()

Unnamed: 0,source_data,target_data,source_tokenized,target_tokenized
0,all tom needed was time .,"tout ce que tom avait besoin , c' était du tem...","[all, tom, needed, was, time, .]","[tout, ce, que, tom, avait, besoin, ,, c', éta..."
1,i 'm waiting for my friend .,j' attends mon amie .,"[i, 'm, waiting, for, my, friend, .]","[j', attends, mon, amie, .]"
2,why do you need a knife ?,pourquoi as-tu besoin d' un couteau ?,"[why, do, you, need, a, knife, ?]","[pourquoi, as-tu, besoin, d', un, couteau, ?]"
3,he lives off the grid .,il vit en marge de la société .,"[he, lives, off, the, gri@@, d, .]","[il, vit, en, mar@@, ge, de, la, société, .]"
4,the bridge is built of wood .,ce pont est fait en bois .,"[the, bridge, is, built, of, wo@@, od, .]","[ce, pont, est, fait, en, bois, .]"


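Why do some tokens end with the symbol `@@`? BPE uses `@@` to mark a subword that continues into the next token: `gri@@ d` is the single word `grid` split into two pieces. The `postprocess` function removes these markers to restore the original words. (À explorer — to be explored further.)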

### Load the dictionaries

In [16]:
# One dictionary file per language, stored inside model_dir.
source_dict_path = os.path.join(model_dir, 'dict.{}.txt'.format(source_lang))
target_dict_path = os.path.join(model_dir, 'dict.{}.txt'.format(target_lang))

# Source-side dictionary, built from the tokenized training sentences
# (presumably loaded from source_dict_path instead when that file exists and reset=False — confirm in nmt_dataset).
source_dict = nmt_dataset.load_or_create_dictionary(
    source_dict_path,
    train_data['source_tokenized'],
    minimum_count=10,
    reset=False    # set reset to True if you're changing the data or the preprocessing
)

# Target-side dictionary, same settings as the source side.
target_dict = nmt_dataset.load_or_create_dictionary(
    target_dict_path,
    train_data['target_tokenized'],
    minimum_count=10,
    reset=False
)

# Directory of the pretrained de-en-fr model.
multi_model_dir = os.path.join('pretrained_models', 'de-en-fr')

# Dictionary stored with the pretrained model. dataset=None passes no data to build it from,
# so the dict.txt file presumably has to exist already — TODO confirm.
multi_dict = nmt_dataset.load_or_create_dictionary(
        os.path.join(multi_model_dir, 'dict.txt'),
        dataset=None,
        minimum_count=10,
        reset=False
)

### Use the dictionary to map each token into a binarized token 

In [17]:
# Binarize with the same dictionaries the models are built from: the encoder/decoder
# are sized with len(source_dict)/len(target_dict) and the model decodes with target_dict,
# so indices taken from another vocabulary (multi_dict) would not match.
# Only the training data is sorted.
nmt_dataset.binarize(train_data, source_dict, target_dict, sort=True)
nmt_dataset.binarize(valid_data, source_dict, target_dict, sort=False)
nmt_dataset.binarize(test_data, source_dict, target_dict, sort=False)

### Build the batch dataset

Here we create our dataset, which consists of batches of word sequences of the same length, ready to be fed to our model!

In [18]:
max_len = 30       # maximum 30 tokens per sentence (longer sequences will be truncated)
batch_size = 512   # maximum 512 tokens per batch (decrease if you get OOM errors, increase to speed up training)

# Re-seed NumPy and PyTorch before the iterators are created.
reset_seed()

# Only the training iterator is created with shuffle=True; validation and test are not shuffled.
train_iterator = nmt_dataset.BatchIterator(train_data, source_lang, target_lang, batch_size=batch_size, max_len=max_len, shuffle=True)
valid_iterator = nmt_dataset.BatchIterator(valid_data, source_lang, target_lang, batch_size=batch_size, max_len=max_len, shuffle=False)
test_iterator = nmt_dataset.BatchIterator(test_data, source_lang, target_lang, batch_size=batch_size, max_len=max_len, shuffle=False)

## Training our models

### Model definition

Here we will create **4** models:
+ A shallow model with the residual normalization 
+ A shallow model with the classic normalization
+ A deep model with the residual normalization
+ A deep model with the classic normalization

In [23]:
# Shallow (single-layer) Transformer built with normalize_before=True (the "preLN" variant).
shallow_transformer_encoder_preLN = nnet_models.TransformerEncoder(
    input_size=len(source_dict),
    hidden_size=512,
    num_layers=1,
    dropout=0.0,
    heads=4,
    normalize_before=True
)
shallow_transformer_decoder_preLN = nnet_models.TransformerDecoder(
    output_size=len(target_dict),
    hidden_size=512,
    num_layers=1,
    heads=4,
    dropout=0.0,
    normalize_before=True
)

shallow_transformer_model_preLN = nnet_models.EncoderDecoder(
    shallow_transformer_encoder_preLN,
    shallow_transformer_decoder_preLN,
    lr=0.001,
    # Request CUDA only when a GPU is actually present, so the notebook
    # also runs on CPU-only machines instead of failing at model creation.
    use_cuda=torch.cuda.is_available(),
    target_dict=target_dict
)

In [24]:
def save_model(model, checkpoint_path):
    """Serialize `model` to `checkpoint_path` with torch.save.

    The parent directory of the checkpoint is created first when the path
    has one; a bare file name is written to the current directory.
    """
    parent_dir, _ = os.path.split(checkpoint_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    torch.save(model, checkpoint_path)

def train_model(
        train_iterator,
        valid_iterators,
        model,
        checkpoint_path,
        epochs=1,
        validation_frequency=1
    ):
    """
    Train `model`, validate it periodically and checkpoint the best version.

    train_iterator: instance of nmt_dataset.BatchIterator or nmt_dataset.MultiBatchIterator
    valid_iterators: list of nmt_dataset.BatchIterator (validation is skipped when it is empty)
    model: instance of nnet_models.EncoderDecoder
    checkpoint_path: path of the model checkpoint
    epochs: iterate this many times over train_iterator
    validation_frequency: validate the model every N epochs

    Side effects: for each evaluated epoch, the validation translations are written
    to 'valid.<src>-<tgt>.<epoch>.out' inside model_dir, and the model is saved to
    checkpoint_path whenever the (average) validation BLEU improves.
    """

    reset_seed()

    best_bleu = -1
    for epoch in range(1, epochs + 1):

        start = time.time()
        running_loss = 0

        print('Epoch: [{}/{}]'.format(epoch, epochs))

        # Iterate over training batches for one epoch
        for batch in tqdm(train_iterator, total=len(train_iterator)):
            running_loss += model.train_step(batch)

        # Average training loss for this epoch (max() avoids dividing by zero on an empty iterator)
        epoch_loss = running_loss / max(len(train_iterator), 1)

        print("loss={:.3f}, time={:.2f}".format(epoch_loss, time.time() - start))
        sys.stdout.flush()

        # Evaluate and save the model; nothing to evaluate without validation sets
        if valid_iterators and epoch % validation_frequency == 0:
            bleu_scores = []

            # The translations are written to model_dir, which may not exist yet
            os.makedirs(model_dir, exist_ok=True)

            # Compute BLEU over all validation sets
            for valid_iterator in valid_iterators:
                src, tgt = valid_iterator.source_lang, valid_iterator.target_lang
                translation_output = model.translate(valid_iterator, postprocess)
                bleu_score = translation_output.score
                output = translation_output.output

                # Explicit UTF-8: the translations contain non-ASCII characters
                output_path = os.path.join(model_dir, 'valid.{}-{}.{}.out'.format(src, tgt, epoch))
                with open(output_path, 'w', encoding='utf-8') as f:
                    f.writelines(line + '\n' for line in output)

                print('{}-{}: BLEU={}'.format(src, tgt, bleu_score))
                sys.stdout.flush()
                bleu_scores.append(bleu_score)

            # Average the validation BLEU scores
            bleu_score = round(sum(bleu_scores) / len(bleu_scores), 2)
            if len(bleu_scores) > 1:
                print('BLEU={}'.format(bleu_score))

            # Update the model's learning rate based on current performance.
            # This scheduler divides the learning rate by 10 if BLEU does not improve.
            model.scheduler_step(bleu_score)

            # Save a model checkpoint if it has the best validation BLEU so far
            if bleu_score > best_bleu:
                best_bleu = bleu_score
                save_model(model, checkpoint_path)

        print('=' * 50)

    print("Training completed. Best BLEU is {}".format(best_bleu))

In [25]:
# checkpoint_path must name a file: 'models' is the directory that contains model_dir,
# so saving the checkpoint to it would fail. Store the checkpoint inside model_dir instead.
train_model(
    train_iterator,
    [valid_iterator],
    shallow_transformer_model_preLN,
    epochs=1,
    checkpoint_path=os.path.join(model_dir, 'shallow_transformer_preLN.pt')
)

  0%|          | 0/20465 [00:00<?, ?it/s]

Epoch: [1/1]





ValueError: too many values to unpack (expected 2)