# Improving NMT using Pre-Trained Embeddings: French-English Case Study

#### Welcome to our project! From this moment on, you will be diving into a journey through transformer models. 
First of all, we will walk you through the simple single-head, single-layer Transformer encoder we created for the CS 4644 Deep Learning homework; then you'll move on to the full Transformer architecture defined in the seminal paper "Attention Is All You Need" by Ashish Vaswani et al. (2017). 
To conclude, you will find the focus of our research, namely understanding the effect of using Pre-trained Language Models to create powerful embeddings.

In [1]:
!git clone https://github.com/susannapaoli/NLP-final-project.git

Cloning into 'NLP-final-project'...
remote: Enumerating objects: 350, done.[K
remote: Counting objects: 100% (52/52), done.[K
remote: Compressing objects: 100% (52/52), done.[K
remote: Total 350 (delta 30), reused 0 (delta 0), pack-reused 298[K
Receiving objects: 100% (350/350), 195.40 KiB | 3.69 MiB/s, done.
Resolving deltas: 100% (215/215), done.


In [2]:
%cd /content/NLP-final-project

/content/NLP-final-project


# Importing libraries

In [3]:
import math
import time
import io
import numpy as np
import csv
from IPython.display import Image
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
import nltk

from torchtext.datasets import Multi30k
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import vocab
from torchtext.utils import download_from_url, extract_archive
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm_notebook, tqdm
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import corpus_bleu

In [4]:
!python -m spacy download en_core_web_sm
!python -m spacy download fr_core_news_sm

2023-04-21 21:39:38.246424: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
2023-04-21 21:39:52.081068: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical ope

## Import necessary models

In [5]:
!pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m52.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m79.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [6]:
from Transformer_baseline import TransformerTranslatorBaseline
from Transformer import TransformerTranslator
from Transformer_BERT import TransformerTranslatorBERT
from Transformer_GPT import TransformerTranslatorGPT
from Transformer_XLNet import TransformerTranslatorXLNET

In [7]:
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
# This module-level `device` is also read directly by translate() and
# translate_improved().
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("You are using device: %s" % device)

You are using device: cpu


# Data Pre-processing Utils

First of all, we wrote some pre-processing functions for our data

In [8]:
def build_vocab(filepath, tokenizer):
  """Build a vocabulary from the token frequencies of one text file.

  Args:
    filepath: Path of a UTF-8 text file that is read line by line.
    tokenizer: Callable turning one line into an iterable of tokens.

  Returns:
    The object produced by ``vocab(...)`` with the special tokens
    ``<unk>``, ``<pad>``, ``<sos>``, ``<eos>`` and ``min_freq=2``.
  """
  # NOTE(review): lines are counted with their original casing, whereas
  # data_process() lower-cases each sentence before looking tokens up --
  # confirm this mismatch is intended.
  with io.open(filepath, encoding="utf8") as handle:
    token_counts = Counter(
        token for line in handle for token in tokenizer(line))
  special_tokens = ['<unk>', '<pad>', '<sos>', '<eos>']
  return vocab(token_counts, specials=special_tokens, min_freq=2)

def data_process(filepaths):
  """Convert a parallel French/English corpus into pairs of index tensors.

  Both files are read in lockstep, one sentence per line. Each line is
  lower-cased, tokenized with the module-level ``fr_tokenizer`` /
  ``en_tokenizer`` and mapped to indices through ``fr_vocab`` / ``en_vocab``.
  A pair is kept only when both sides have at most ``MAX_LEN - 2`` tokens.

  Args:
    filepaths: Sequence whose item 0 is the French file path and item 1 the
      English file path (both opened as UTF-8).

  Returns:
    List of ``(fr_tensor, en_tensor)`` tuples of dtype ``torch.long``.
  """
  # Two positions are reserved per sentence; generate_batch() later adds
  # one SOS_IDX and one EOS_IDX to every sequence.
  max_tokens = MAX_LEN - 2
  data = []
  # `with` guarantees both files are closed (the handles used to be leaked).
  with io.open(filepaths[0], encoding="utf8") as fr_file, \
       io.open(filepaths[1], encoding="utf8") as en_file:
    for raw_fr, raw_en in zip(fr_file, en_file):
      fr_ids = [fr_vocab[token] for token in fr_tokenizer(raw_fr.lower())]
      en_ids = [en_vocab[token] for token in en_tokenizer(raw_en.lower())]
      if len(fr_ids) <= max_tokens and len(en_ids) <= max_tokens:
        data.append((torch.tensor(fr_ids, dtype=torch.long),
                     torch.tensor(en_ids, dtype=torch.long)))
  return data

def generate_batch(data_batch):
    """Collate ``(fr, en)`` index tensors into two equally long padded batches.

    Every sentence is wrapped in ``SOS_IDX`` / ``EOS_IDX``; each side is padded
    with ``PAD_IDX`` to its longest sentence and then both sides are extended
    with ``PAD_IDX`` rows to a common length of at least ``MAX_LEN``.

    Args:
        data_batch: Iterable of ``(fr_item, en_item)`` 1-D index tensors.

    Returns:
        Tuple ``(fr_batch, en_batch)``; each tensor has one column per
        sentence (sequence dimension first).
    """
    sos = torch.tensor([SOS_IDX])
    eos = torch.tensor([EOS_IDX])
    fr_seqs, en_seqs = [], []
    for fr_item, en_item in data_batch:
        en_seqs.append(torch.cat([sos, en_item, eos], dim=0))
        fr_seqs.append(torch.cat([sos, fr_item, eos], dim=0))
    en_batch = pad_sequence(en_seqs, padding_value=PAD_IDX)
    fr_batch = pad_sequence(fr_seqs, padding_value=PAD_IDX)

    # Pad explicitly instead of stacking a float dummy tensor through a second
    # pad_sequence call: no implicit float->long cast, no 3x allocation.
    rows = max(MAX_LEN, fr_batch.shape[0], en_batch.shape[0])
    return _pad_rows(fr_batch, rows), _pad_rows(en_batch, rows)


def _pad_rows(batch, rows):
    """Append PAD_IDX rows to ``batch`` until it has ``rows`` rows."""
    missing = rows - batch.shape[0]
    if missing <= 0:
        return batch
    filler = batch.new_full((missing, batch.shape[1]), PAD_IDX)
    return torch.cat([batch, filler], dim=0)

Next, we define the training function for the baseline model, together with an evaluation function used to monitor the loss during training.


In [9]:
def train(model, dataloader, optimizer, criterion, scheduler=None, device='cpu'):
    """Train ``model`` for one pass over ``dataloader``.

    The model is called as ``model(source)``; its output is flattened to
    ``(-1, last_dim)`` and scored against the flattened target.

    Args:
        model: Module to optimise; switched to training mode here.
        dataloader: Iterable of ``(source, target)`` batches. Dimensions 0 and
            1 of both tensors are swapped before use.
        optimizer: Optimiser stepped once per batch.
        criterion: Loss called as ``criterion(predictions, target)``.
        scheduler: Unused here; kept so existing call sites remain valid.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(total_loss, total_loss / len(dataloader))``.
    """
    # tqdm.tqdm_notebook is deprecated and warns on every call;
    # tqdm.notebook.tqdm is the replacement named by that warning.
    from tqdm.notebook import tqdm as notebook_tqdm

    model.train()

    # Sum of the per-batch losses
    total_loss = 0.

    # Keep a handle on the bar so its description can be updated per batch
    progress_bar = notebook_tqdm(dataloader, ascii=True)

    # Mini-batch training
    for batch_idx, data in enumerate(progress_bar):
        source = data[0].transpose(1, 0).to(device)
        target = data[1].transpose(1, 0).to(device)

        translation = model(source)
        translation = translation.reshape(-1, translation.shape[-1])
        target = target.reshape(-1)

        optimizer.zero_grad()
        loss = criterion(translation, target)
        loss.backward()
        # Clip the gradient norm to 0.5 before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        progress_bar.set_description_str(
            "Batch: %d, Loss: %.4f" % ((batch_idx + 1), loss.item()))

    return total_loss, total_loss / len(dataloader)


def evaluate(model, dataloader, criterion, device='cpu'):
    """Compute the loss of ``model`` over ``dataloader`` without training.

    The model is called as ``model(source)``; its output is flattened to
    ``(-1, last_dim)`` and scored against the flattened target.

    Args:
        model: Module to evaluate; switched to evaluation mode here.
        dataloader: Iterable of ``(source, target)`` batches. Dimensions 0 and
            1 of both tensors are swapped before use.
        criterion: Loss called as ``criterion(predictions, target)``.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(total_loss, total_loss / len(dataloader))``.
    """
    # tqdm.tqdm_notebook is deprecated and warns on every call;
    # tqdm.notebook.tqdm is the replacement named by that warning.
    from tqdm.notebook import tqdm as notebook_tqdm

    # Evaluation mode; gradient tracking is disabled by torch.no_grad() below
    # and no optimizer step is taken, so the weights are left untouched.
    model.eval()
    total_loss = 0.
    with torch.no_grad():
        progress_bar = notebook_tqdm(dataloader, ascii=True)
        for batch_idx, data in enumerate(progress_bar):
            source = data[0].transpose(1, 0).to(device)
            target = data[1].transpose(1, 0).to(device)

            translation = model(source)
            translation = translation.reshape(-1, translation.shape[-1])
            target = target.reshape(-1)

            loss = criterion(translation, target)
            total_loss += loss.item()
            progress_bar.set_description_str(
                "Batch: %d, Loss: %.4f" % ((batch_idx + 1), loss.item()))

    avg_loss = total_loss / len(dataloader)
    return total_loss, avg_loss

Next, we implement the translate function and the clean_sentences function, which strips the special tokens (`<sos>`, `<eos>`, `<pad>`), newlines and full stops from the decoded sentences.

In [10]:
def translate(model, dataloader):
    """Run ``model(source)`` on every batch and collect targets and outputs.

    Batches are moved to the module-level ``device`` after dimensions 0 and 1
    are swapped.

    Args:
        model: Module called as ``model(source)``; put in evaluation mode.
        dataloader: Iterable of ``(source, target)`` batches.

    Returns:
        Tuple ``(targets, outputs)``: two lists holding one tensor per batch.
    """
    model.eval()
    references, outputs = [], []
    with torch.no_grad():
        for batch in tqdm(dataloader, ascii = True):
            source = batch[0].transpose(1,0).to(device)
            target = batch[1].transpose(1,0).to(device)

            prediction = model(source)
            references.append(target)
            outputs.append(prediction)
    return references, outputs

def clean_sentences(source):
  """Drop special tokens from tokenized sentences.

  The tokens ``<sos>``, ``<eos>``, ``<pad>``, ``"\\n"`` and ``"."`` are
  removed from every sentence.

  Args:
    source: Iterable of sentences, each an iterable of string tokens.

  Returns:
    Tuple ``(reference, tokenized)``: a NumPy array with the remaining tokens
    of each sentence joined by single spaces, and the list of remaining-token
    lists.
  """
  dropped = ("<sos>", "<eos>", "<pad>", "\n", ".")
  tokenized = [[word for word in sent if word not in dropped]
               for sent in source]
  reference = np.array([' '.join(words) for words in tokenized])
  return reference, tokenized

In addition, a function for the BLEU score computation has been introduced, so that we get a sense of the quality of the translations obtained with the model

In [11]:
def calculate_bleu(tokenized_ref, tokenized_can):
  """Average sentence-level BLEU over paired references and candidates.

  Candidate ``i`` is scored against the single reference ``i`` by
  ``sentence_bleu`` with weights ``(1/2, 1/2, 0, 0)``.

  Args:
    tokenized_ref: Sequence of reference token lists.
    tokenized_can: Sequence of candidate token lists, indexed in parallel.

  Returns:
    Mean of the per-sentence scores, or ``0.0`` when there are no references
    (this used to raise ZeroDivisionError).
  """
  n_sentences = len(tokenized_ref)
  if n_sentences == 0:
    return 0.0
  weights = (1/2, 1/2, 0, 0)
  total = 0
  for i, reference in enumerate(tokenized_ref):
    total += sentence_bleu([reference], tokenized_can[i], weights)
  return total / n_sentences

In the following section, we define the train, evaluate and translate functions for the improved model.

In [12]:
def train_improved(model, dataloader, optimizer, criterion, scheduler=None, device='cpu'):
    """Train a source+target model for one pass over ``dataloader``.

    The model is called as ``model(source, target)``; its output is flattened
    to ``(-1, last_dim)`` and scored against the flattened target.

    Args:
        model: Module to optimise; switched to training mode here.
        dataloader: Iterable of ``(source, target)`` batches. Dimensions 0 and
            1 of both tensors are swapped before use.
        optimizer: Optimiser stepped once per batch.
        criterion: Loss called as ``criterion(predictions, target)``.
        scheduler: Unused here; kept so existing call sites remain valid.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(total_loss, total_loss / len(dataloader))``.
    """
    # tqdm.tqdm_notebook is deprecated and warns on every call;
    # tqdm.notebook.tqdm is the replacement named by that warning.
    from tqdm.notebook import tqdm as notebook_tqdm

    model.train()

    # Sum of the per-batch losses
    total_loss = 0.

    # Keep a handle on the bar so its description can be updated per batch
    progress_bar = notebook_tqdm(dataloader, ascii=True)

    # Mini-batch training
    for batch_idx, data in enumerate(progress_bar):
        source = data[0].transpose(1, 0).to(device)
        target = data[1].transpose(1, 0).to(device)

        # NOTE(review): the same `target` is fed to the model and used as the
        # label below. Unless the model shifts/masks it internally, each
        # position can see the token it must predict -- confirm in the model.
        translation = model(source, target)
        translation = translation.reshape(-1, translation.shape[-1])
        target = target.reshape(-1)

        optimizer.zero_grad()
        loss = criterion(translation, target)
        loss.backward()
        # Clip the gradient norm to 0.5 before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        progress_bar.set_description_str(
            "Batch: %d, Loss: %.4f" % ((batch_idx + 1), loss.item()))

    return total_loss, total_loss / len(dataloader)

def evaluate_improved(model, dataloader, criterion, device='cpu'):
    """Compute the loss of a source+target model without training it.

    The model is called as ``model(source, target)``; its output is flattened
    to ``(-1, last_dim)`` and scored against the flattened target.

    Args:
        model: Module to evaluate; switched to evaluation mode here.
        dataloader: Iterable of ``(source, target)`` batches. Dimensions 0 and
            1 of both tensors are swapped before use.
        criterion: Loss called as ``criterion(predictions, target)``.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(total_loss, total_loss / len(dataloader))``.
    """
    # tqdm.tqdm_notebook is deprecated and warns on every call;
    # tqdm.notebook.tqdm is the replacement named by that warning.
    from tqdm.notebook import tqdm as notebook_tqdm

    # Evaluation mode; gradient tracking is disabled by torch.no_grad() below
    # and no optimizer step is taken, so the weights are left untouched.
    model.eval()
    total_loss = 0.
    with torch.no_grad():
        progress_bar = notebook_tqdm(dataloader, ascii=True)
        for batch_idx, data in enumerate(progress_bar):
            source = data[0].transpose(1, 0).to(device)
            target = data[1].transpose(1, 0).to(device)

            translation = model(source, target)
            translation = translation.reshape(-1, translation.shape[-1])
            target = target.reshape(-1)

            loss = criterion(translation, target)
            total_loss += loss.item()
            progress_bar.set_description_str(
                "Batch: %d, Loss: %.4f" % ((batch_idx + 1), loss.item()))

    avg_loss = total_loss / len(dataloader)
    return total_loss, avg_loss

def translate_improved(model, dataloader):
    """Run ``model(source, target)`` on every batch and collect the results.

    Batches are moved to the module-level ``device`` after dimensions 0 and 1
    are swapped.

    Args:
        model: Module called as ``model(source, target)``; put in evaluation
            mode.
        dataloader: Iterable of ``(source, target)`` batches.

    Returns:
        Tuple ``(targets, outputs)``: two lists holding one tensor per batch.
    """
    model.eval()
    references, outputs = [], []
    with torch.no_grad():
        for batch in tqdm(dataloader, ascii = True):
            source = batch[0].transpose(1,0).to(device)
            target = batch[1].transpose(1,0).to(device)

            # NOTE(review): the reference target is handed to the model here,
            # so outputs are conditioned on the ground truth rather than
            # decoded freely; scores computed from them may be optimistic --
            # confirm against the model implementation.
            prediction = model(source, target)
            references.append(target)
            outputs.append(prediction)

    return references, outputs

Finally, we introduce a plotting function to trace the value of the loss across epochs.

In [13]:
def plot_loss(loss, model, title):
  """Show a 12x8 figure with one loss value per epoch drawn as a black line.

  Args:
    loss: Sequence of loss values; the x axis is ``range(len(loss))``.
    model: Not used by this function.
    title: Title placed on the figure.
  """
  x_axis = range(len(loss))
  plt.figure(figsize = (12,8))
  plt.title(title)
  plt.plot(x_axis, loss, color='black')
  plt.show()

# Data preprocessing

In [14]:
# Sentence pairs with more than MAX_LEN - 2 tokens on either side are dropped
# by data_process(); generate_batch() pads batches to at least MAX_LEN rows.
MAX_LEN = 20
# Each tuple is (French file, English file), the order data_process() expects.
url_base = 'https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/'
train_urls = ('train.fr.gz', 'train.en.gz')
val_urls = ('val.fr.gz', 'val.en.gz')
test_urls = ('test_2016_flickr.fr.gz', 'test_2016_flickr.en.gz')

# Download each archive, extract it and keep the first extracted path.
train_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in train_urls]
val_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in val_urls]
test_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in test_urls]

# spaCy tokenizers backed by the models downloaded earlier in the notebook.
fr_tokenizer = get_tokenizer('spacy', language='fr_core_news_sm')
en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

100%|██████████| 604k/604k [00:00<00:00, 15.4MB/s]
100%|██████████| 569k/569k [00:00<00:00, 13.4MB/s]
100%|██████████| 23.0k/23.0k [00:00<00:00, 9.35MB/s]
100%|██████████| 21.6k/21.6k [00:00<00:00, 10.5MB/s]
100%|██████████| 22.3k/22.3k [00:00<00:00, 10.0MB/s]
100%|██████████| 21.1k/21.1k [00:00<00:00, 9.96MB/s]


In [15]:
# Vocabularies are built from the training split only.
fr_vocab = build_vocab(train_filepaths[0], fr_tokenizer)
en_vocab = build_vocab(train_filepaths[1], en_tokenizer)
# Tokens missing from a vocabulary are looked up as '<unk>'.
fr_vocab.set_default_index(fr_vocab['<unk>'])
en_vocab.set_default_index(en_vocab['<unk>'])

In [16]:
# Each split becomes a list of (French tensor, English tensor) pairs.
train_data = data_process(train_filepaths)
val_data = data_process(val_filepaths)
test_data = data_process(test_filepaths)

In [17]:
BATCH_SIZE = 128
# The special-token indices are read from the French vocabulary and reused
# for the English side in generate_batch().
# NOTE(review): both vocabularies are built with the same `specials` list,
# so the indices presumably coincide -- verify.
PAD_IDX = fr_vocab['<pad>']
SOS_IDX = fr_vocab['<sos>']
EOS_IDX = fr_vocab['<eos>']

In [18]:
# All three loaders collate with generate_batch.
# NOTE(review): the training loader is not shuffled (shuffle=False), so every
# epoch sees the batches in the same order -- confirm this is intended.
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE,
                        shuffle=False, collate_fn=generate_batch)
valid_loader = DataLoader(val_data, batch_size=BATCH_SIZE,
                        shuffle=False, collate_fn=generate_batch)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE,
                       shuffle=False, collate_fn=generate_batch)

In [19]:
# Vocabulary sizes: French is the model input, English the model output.
input_size = len(fr_vocab)
output_size = len(en_vocab)
print (input_size,output_size)

6556 6192


In [20]:
fr_vocab['<sos>']

2

# Train Baseline Transformer

## Train Transformer Baseline

In [31]:
# Hyper-parameters of the baseline run.
learning_rate = 1e-4
EPOCHS = 20
hidden_dim=400
num_heads=2
dim_feedforward=2096
# dim_k / dim_v / dim_q are defined here but not passed to the constructor.
dim_k=64
dim_v=64
dim_q=64
# NOTE(review): max_length (50) differs from MAX_LEN (20), the minimum length
# generate_batch() pads batches to -- confirm the model accepts both.
max_length=50

trans_model_baseline = TransformerTranslatorBaseline(input_size, output_size, device, num_heads = num_heads, max_length = max_length, hidden_dim = hidden_dim ,dim_feedforward = dim_feedforward).to(device)

optimizer = torch.optim.Adam(trans_model_baseline.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)
# Targets equal to PAD_IDX are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [32]:
# Average validation loss of every epoch (training losses are only printed).
avg_loss_trans_baseline = []
for epoch_idx in range(EPOCHS):
    print("-----------------------------------")
    print("Epoch %d" % (epoch_idx+1))
    print("-----------------------------------")

    train_loss, avg_train_loss = train(trans_model_baseline, train_loader, optimizer, criterion, device=device)
    # NOTE(review): the plateau scheduler is stepped on the summed training
    # loss, before validation runs -- confirm that validation loss was not
    # the intended signal.
    scheduler.step(train_loss)

    val_loss, avg_val_loss = evaluate(trans_model_baseline, valid_loader, criterion, device=device)
    avg_loss_trans_baseline.append(avg_val_loss)
    print("Training Loss: %.4f. Validation Loss: %.4f. " % (avg_train_loss, avg_val_loss))
    # Perplexity is reported as exp(average loss).
    print("Training Perplexity: %.4f. Validation Perplexity: %.4f. " % (np.exp(avg_train_loss), np.exp(avg_val_loss)))

-----------------------------------
Epoch 1
-----------------------------------


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  progress_bar = tqdm_notebook(dataloader, ascii=True)


  0%|          | 0/176 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  progress_bar = tqdm_notebook(dataloader, ascii=True)


  0%|          | 0/6 [00:00<?, ?it/s]

Training Loss: 4.7053. Validation Loss: 3.7601. 
Training Perplexity: 110.5340. Validation Perplexity: 42.9512. 
-----------------------------------
Epoch 2
-----------------------------------


  0%|          | 0/176 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

Training Loss: 3.6679. Validation Loss: 3.5334. 
Training Perplexity: 39.1712. Validation Perplexity: 34.2388. 
-----------------------------------
Epoch 3
-----------------------------------


  0%|          | 0/176 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

Training Loss: 3.4478. Validation Loss: 3.3631. 
Training Perplexity: 31.4308. Validation Perplexity: 28.8795. 
-----------------------------------
Epoch 4
-----------------------------------


  0%|          | 0/176 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

Training Loss: 3.2772. Validation Loss: 3.2447. 
Training Perplexity: 26.5016. Validation Perplexity: 25.6547. 
-----------------------------------
Epoch 5
-----------------------------------


  0%|          | 0/176 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

Training Loss: 3.1429. Validation Loss: 3.1502. 
Training Perplexity: 23.1716. Validation Perplexity: 23.3408. 
-----------------------------------
Epoch 6
-----------------------------------


  0%|          | 0/176 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

Training Loss: 3.0282. Validation Loss: 3.0699. 
Training Perplexity: 20.6591. Validation Perplexity: 21.5397. 
-----------------------------------
Epoch 7
-----------------------------------


  0%|          | 0/176 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

Training Loss: 2.9264. Validation Loss: 3.0001. 
Training Perplexity: 18.6600. Validation Perplexity: 20.0874. 
-----------------------------------
Epoch 8
-----------------------------------


  0%|          | 0/176 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

Training Loss: 2.8352. Validation Loss: 2.9407. 
Training Perplexity: 17.0340. Validation Perplexity: 18.9290. 
-----------------------------------
Epoch 9
-----------------------------------


  0%|          | 0/176 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

Training Loss: 2.7520. Validation Loss: 2.8897. 
Training Perplexity: 15.6737. Validation Perplexity: 17.9875. 
-----------------------------------
Epoch 10
-----------------------------------


  0%|          | 0/176 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

Training Loss: 2.6751. Validation Loss: 2.8447. 
Training Perplexity: 14.5135. Validation Perplexity: 17.1965. 
-----------------------------------
Epoch 11
-----------------------------------


  0%|          | 0/176 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

Training Loss: 2.6031. Validation Loss: 2.8039. 
Training Perplexity: 13.5059. Validation Perplexity: 16.5090. 
-----------------------------------
Epoch 12
-----------------------------------


  0%|          | 0/176 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

Training Loss: 2.5353. Validation Loss: 2.7667. 
Training Perplexity: 12.6200. Validation Perplexity: 15.9055. 
-----------------------------------
Epoch 13
-----------------------------------


  0%|          | 0/176 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

Training Loss: 2.4710. Validation Loss: 2.7329. 
Training Perplexity: 11.8345. Validation Perplexity: 15.3771. 
-----------------------------------
Epoch 14
-----------------------------------


  0%|          | 0/176 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

Training Loss: 2.4099. Validation Loss: 2.7020. 
Training Perplexity: 11.1333. Validation Perplexity: 14.9099. 
-----------------------------------
Epoch 15
-----------------------------------


  0%|          | 0/176 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

Training Loss: 2.3517. Validation Loss: 2.6743. 
Training Perplexity: 10.5030. Validation Perplexity: 14.5020. 
-----------------------------------
Epoch 16
-----------------------------------


  0%|          | 0/176 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

Training Loss: 2.2959. Validation Loss: 2.6490. 
Training Perplexity: 9.9330. Validation Perplexity: 14.1400. 
-----------------------------------
Epoch 17
-----------------------------------


  0%|          | 0/176 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

Training Loss: 2.2427. Validation Loss: 2.6264. 
Training Perplexity: 9.4190. Validation Perplexity: 13.8245. 
-----------------------------------
Epoch 18
-----------------------------------


  0%|          | 0/176 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

Training Loss: 2.1921. Validation Loss: 2.6065. 
Training Perplexity: 8.9538. Validation Perplexity: 13.5516. 
-----------------------------------
Epoch 19
-----------------------------------


  0%|          | 0/176 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

Training Loss: 2.1435. Validation Loss: 2.5889. 
Training Perplexity: 8.5295. Validation Perplexity: 13.3147. 
-----------------------------------
Epoch 20
-----------------------------------


  0%|          | 0/176 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

Training Loss: 2.0969. Validation Loss: 2.5734. 
Training Perplexity: 8.1405. Validation Perplexity: 13.1098. 


## Translation Transformer Baseline

In [33]:
target_tb, translation_tb = translate(trans_model_baseline, test_loader)

100%|##########| 7/7 [00:00<00:00, 22.55it/s]


In [34]:
# Map every target index back to its English token. The index-to-token list
# is fetched once instead of once per token.
en_itos = en_vocab.get_itos()
raw_tb = np.array([[en_itos[idx] for idx in sentence] for batch in target_tb for sentence in batch])

In [35]:
reference_tb, tokenized_ref_tb = clean_sentences(raw_tb)

In [36]:
reference_tb[:20]

array(['a man in an orange hat starring at something',
       'a <unk> terrier is running on lush green grass in front of a white fence',
       'a girl in karate uniform breaking a stick with a front kick',
       'people are fixing the roof of a house',
       'a group of people standing in front of an igloo',
       'a guy works on a building',
       'a man in a vest is sitting in a chair and holding magazines',
       'a mother and her young song enjoying a beautiful day outside',
       'a woman holding a bowl of food in a kitchen',
       'man sitting using tool at a table in his home',
       'three people sit in a cave',
       'a girl in a jean dress is walking along a raised balance beam',
       'a blond holding hands with a guy in the sand',
       'the person in the striped shirt is mountain climbing',
       'two men pretend to be <unk> while women look on',
       'people standing outside of a building',
       'a teenager plays her trumpet on the field at a game',
    

In [37]:
# Pick the highest-scoring vocabulary index at every position of every batch.
token_trans_tb = [np.argmax(batch.cpu().numpy(), axis = 2) for batch in translation_tb]
# Map the indices back to English tokens. The index-to-token list is fetched
# once instead of once per token.
en_itos = en_vocab.get_itos()
translated_tb = np.array([[en_itos[idx] for idx in sentence] for batch in token_trans_tb for sentence in batch])

In [38]:
candidate_tb, tokenized_can_tb = clean_sentences(translated_tb)

In [39]:
candidate_tb[:20]

array(['a man in an orange hat looking looking something something',
       'a terrier of runs runs on the grass in in a front',
       'a girl in a gear a a a stick a with',
       'people are are to the of a',
       'a group of people standing in front a a a',
       'a guy works working on building',
       'a man in a sits in in a chair holding holding holding',
       'a mother and young young are enjoying a a nice',
       'a woman holding a dish food in a kitchen kitchen kitchen',
       'a man sitting at a table at a , train',
       'three people are sitting in a',
       'a girl in a dress dress jeans dress is a beam dress <unk>',
       'a blond giving hand a a a in the sand sand',
       'the person in a striped shirt is a',
       'two men are pretending <unk> be their while while women women look',
       'people standing in outside front of building building',
       'a teenage plays playing the the field the field during game game',
       'a woman is a a on a a trampo

In [40]:
bleu_tb  = calculate_bleu(tokenized_ref_tb, tokenized_can_tb)
print(f"BLEU score for Baseline Transformer is: {bleu_tb}")

BLEU score for Baseline Transformer is: 0.4227558425546378


# Train Improved Transformer

## Train Transformer Improved model

In [None]:
# Hyper-parameters of the improved-Transformer run (a single epoch).
learning_rate = 1e-4
EPOCHS = 1
hidden_dim=400
num_heads=16
dim_feedforward=2096
# dim_k / dim_v / dim_q are defined here but not passed to the constructor.
dim_k=96
dim_v=96
dim_q=96
# NOTE(review): max_length (50) differs from MAX_LEN (20), the minimum length
# generate_batch() pads batches to -- confirm the model accepts both.
max_length=50
N_layers = 6

trans_model = TransformerTranslator(input_size, output_size, device, pad_idx = PAD_IDX, N_layers = N_layers, batch=BATCH_SIZE, num_heads = num_heads, max_length = max_length, hidden_dim = hidden_dim ,dim_feedforward = dim_feedforward).to(device)

optimizer = torch.optim.Adam(trans_model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)
# Targets equal to PAD_IDX are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [None]:
# Average validation loss of every epoch (training losses are only printed).
avg_loss_trans = []
for epoch_idx in range(EPOCHS):
    print("-----------------------------------")
    print("Epoch %d" % (epoch_idx+1))
    print("-----------------------------------")

    train_loss, avg_train_loss = train_improved(trans_model, train_loader, optimizer, criterion, device=device)
    # NOTE(review): the plateau scheduler is stepped on the summed training
    # loss, before validation runs -- confirm that validation loss was not
    # the intended signal.
    scheduler.step(train_loss)

    val_loss, avg_val_loss = evaluate_improved(trans_model, valid_loader, criterion, device=device)
    avg_loss_trans.append(avg_val_loss)
    print("Training Loss: %.4f. Validation Loss: %.4f. " % (avg_train_loss, avg_val_loss))
    # Perplexity is reported as exp(average loss).
    print("Training Perplexity: %.4f. Validation Perplexity: %.4f. " % (np.exp(avg_train_loss), np.exp(avg_val_loss)))

-----------------------------------
Epoch 1
-----------------------------------


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  progress_bar = tqdm_notebook(dataloader, ascii=True)


  0%|          | 0/176 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  progress_bar = tqdm_notebook(dataloader, ascii=True)


  0%|          | 0/6 [00:00<?, ?it/s]

Training Loss: 2.9883. Validation Loss: 1.1636. 
Training Perplexity: 19.8517. Validation Perplexity: 3.2015. 


## Translation Improved model

In [None]:
target_t, translation_t = translate_improved(trans_model, test_loader)

100%|##########| 7/7 [00:04<00:00,  1.50it/s]


In [None]:
# Map every target index back to its English token. The index-to-token list
# is fetched once instead of once per token.
en_itos = en_vocab.get_itos()
raw_t = np.array([[en_itos[idx] for idx in sentence] for batch in target_t for sentence in batch])

In [None]:
reference_t, tokenized_ref_t = clean_sentences(raw_t)

In [None]:
reference_t[:20]

array(['a man in an orange hat starring at something',
       'a <unk> terrier is running on lush green grass in front of a white fence',
       'a girl in karate uniform breaking a stick with a front kick',
       'people are fixing the roof of a house',
       'a group of people standing in front of an igloo',
       'a guy works on a building',
       'a man in a vest is sitting in a chair and holding magazines',
       'a mother and her young song enjoying a beautiful day outside',
       'a woman holding a bowl of food in a kitchen',
       'man sitting using tool at a table in his home',
       'three people sit in a cave',
       'a girl in a jean dress is walking along a raised balance beam',
       'a blond holding hands with a guy in the sand',
       'the person in the striped shirt is mountain climbing',
       'two men pretend to be <unk> while women look on',
       'people standing outside of a building',
       'a teenager plays her trumpet on the field at a game',
    

In [None]:
# Pick the highest-scoring vocabulary index at every position of every batch.
token_trans_t = [np.argmax(batch.cpu().numpy(), axis = 2) for batch in translation_t]
# Map the indices back to English tokens. The index-to-token list is fetched
# once instead of once per token.
en_itos = en_vocab.get_itos()
translated_t = np.array([[en_itos[idx] for idx in sentence] for batch in token_trans_t for sentence in batch])

In [None]:
candidate_t, tokenized_can_t = clean_sentences(translated_t)

In [None]:
candidate_t[:20]

array(['a man in an orange hat at something',
       'a <unk> an is running on , green grass in front of a white fence',
       'a girl in street uniform picture a stick with a front ball',
       'people are the a of a house',
       'a group of people standing in front of an orange',
       'a guy works on a building',
       'a man in a boy is sitting in a chair and holding',
       'a and and her young little enjoying a beautiful day outside',
       'a woman holding a are of food in a kitchen',
       'man sitting using two at a table in his',
       'three people sit in a blue',
       'a girl in a sitting dress is walking along a race',
       'a blond holding hands with a guy in the sand',
       'the person in the striped shirt is mountain climbing',
       'two men of to be <unk> while women look on',
       'people standing outside of a building',
       'a a plays her a on the field at a game',
       'a woman does a and on a , on the beach',
       'a man is standing by a 

In [None]:
bleu_t  = calculate_bleu(tokenized_ref_t, tokenized_can_t)
print(f"BLEU score for improved transformer is: {bleu_t}")

BLEU score for improved transformer is: 0.7694534075375278


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


# Train BERT transformer

## Train Transformer BERT model

In [None]:
# Hyper-parameters of the BERT-embedding run (a single epoch).
learning_rate = 1e-4
EPOCHS = 1
# NOTE(review): 768 presumably matches the hidden size of the pre-trained
# BERT checkpoint loaded by the model -- confirm in Transformer_BERT.
hidden_dim=768
num_heads=12
dim_feedforward=2200
# dim_k / dim_v / dim_q are defined here but not passed to the constructor.
dim_k=96
dim_v=96
dim_q=96
max_length=50
N_layers = 5

trans_bert = TransformerTranslatorBERT(input_size, output_size, device, pad_idx = PAD_IDX, N_layers = N_layers, batch=BATCH_SIZE, num_heads = num_heads, max_length = max_length, hidden_dim = hidden_dim ,dim_feedforward = dim_feedforward).to(device)

optimizer = torch.optim.Adam(trans_bert.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)
# Targets equal to PAD_IDX are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/672M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# NOTE(review): avg_loss_trans is re-initialised here, replacing the list
# filled by the improved-Transformer loop earlier in the notebook -- consider
# a BERT-specific name if both histories are needed later.
avg_loss_trans = []
for epoch_idx in range(EPOCHS):
    print("-----------------------------------")
    print("Epoch %d" % (epoch_idx+1))
    print("-----------------------------------")

    train_loss, avg_train_loss = train_improved(trans_bert, train_loader, optimizer, criterion, device=device)
    # NOTE(review): the plateau scheduler is stepped on the summed training
    # loss, before validation runs -- confirm that validation loss was not
    # the intended signal.
    scheduler.step(train_loss)

    val_loss, avg_val_loss = evaluate_improved(trans_bert, valid_loader, criterion, device=device)
    avg_loss_trans.append(avg_val_loss)
    print("Training Loss: %.4f. Validation Loss: %.4f. " % (avg_train_loss, avg_val_loss))
    # Perplexity is reported as exp(average loss).
    print("Training Perplexity: %.4f. Validation Perplexity: %.4f. " % (np.exp(avg_train_loss), np.exp(avg_val_loss)))

-----------------------------------
Epoch 1
-----------------------------------


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  progress_bar = tqdm_notebook(dataloader, ascii=True)


  0%|          | 0/176 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  progress_bar = tqdm_notebook(dataloader, ascii=True)


  0%|          | 0/6 [00:00<?, ?it/s]

Training Loss: 1.8817. Validation Loss: 0.3591. 
Training Perplexity: 6.5646. Validation Perplexity: 1.4320. 


## Translation BERT model

In [None]:
# Run the BERT-embedding model over the test loader. The helper returns two
# values, unpacked here as the targets and the model outputs; both are decoded
# to text in the cells that follow.
target_b, translation_b = translate_improved(trans_bert, test_loader)

100%|##########| 7/7 [00:10<00:00,  1.52s/it]


In [None]:
# Convert every target token id into its English vocabulary string, one row
# per sentence (batches are flattened into a single list of sentences).
# The index-to-string table is fetched once up front instead of calling
# en_vocab.get_itos() again for every single token.
en_itos = en_vocab.get_itos()
raw_b = np.array([[en_itos[token_id] for token_id in sentence] for batch in target_b for sentence in batch])

In [None]:
# clean_sentences returns a pair, unpacked as the reference sentences and their
# tokenized form; tokenized_ref_b is what the BLEU cell further down consumes.
reference_b, tokenized_ref_b = clean_sentences(raw_b)

In [None]:
# Display the first 20 reference entries (must stay the last expression of the cell).
reference_b[:20]

array(['a man in an orange hat starring at something',
       'a <unk> terrier is running on lush green grass in front of a white fence',
       'a girl in karate uniform breaking a stick with a front kick',
       'people are fixing the roof of a house',
       'a group of people standing in front of an igloo',
       'a guy works on a building',
       'a man in a vest is sitting in a chair and holding magazines',
       'a mother and her young song enjoying a beautiful day outside',
       'a woman holding a bowl of food in a kitchen',
       'man sitting using tool at a table in his home',
       'three people sit in a cave',
       'a girl in a jean dress is walking along a raised balance beam',
       'a blond holding hands with a guy in the sand',
       'the person in the striped shirt is mountain climbing',
       'two men pretend to be <unk> while women look on',
       'people standing outside of a building',
       'a teenager plays her trumpet on the field at a game',
    

In [None]:
# For each batch of model outputs, take the arg-max index along axis 2
# (presumably the vocabulary dimension - TODO confirm) to get predicted ids.
token_trans_b = [np.argmax(batch.cpu().numpy(), axis=2) for batch in translation_b]

# Map predicted ids to English vocabulary strings, one row per sentence.
# The index-to-string table is fetched once instead of once per token.
en_itos = en_vocab.get_itos()
translated_b = np.array([[en_itos[token_id] for token_id in sentence] for batch in token_trans_b for sentence in batch])

In [None]:
# Same helper as for the references: unpack the returned pair as the candidate
# sentences and their tokenized form (tokenized_can_b feeds the BLEU cell).
candidate_b, tokenized_can_b = clean_sentences(translated_b)

In [None]:
# Display the first 20 candidate entries for a side-by-side look at the references.
candidate_b[:20]

array(['a man in an orange hat surfer at something',
       'a <unk> terrier is running on of green grass in front of a white fence',
       'a girl in karate uniform plants a stick with a front kick',
       'people are fixing the roof of a house',
       'a group of people standing in front of an of',
       'a guy works on a building',
       'a man in a vest is sitting in a chair and holding between',
       'a mother and her young song enjoying a beautiful day outside',
       'a woman holding a bowl of food in a kitchen',
       'man sitting using tool at a table in his home',
       'three people sit in a cave',
       'a girl in a jean dress is walking along a raised balance beam',
       'a blond holding hands with a guy in the sand',
       'the person in the striped shirt is mountain climbing',
       'two men shacking to be <unk> while women look on',
       'people standing outside of a building',
       'a teenager plays her trumpet on the field at a game',
       'a woma

In [None]:
# Score the BERT run: tokenized references vs. tokenized candidates.
# NOTE(review): the recorded output shows NLTK "0 counts of 4-gram overlaps"
# warnings, so calculate_bleu presumably uses NLTK BLEU without a
# SmoothingFunction - confirm against its definition.
bleu_b  = calculate_bleu(tokenized_ref_b, tokenized_can_b)
print(f"BLEU score for improved transformer with BERT embeddings is: {bleu_b}")

BLEU score for improved transformer with BERT embeddings is: 0.9565196866929444


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


# Train XLNet Transformer

## Train Transformer XLNet model

In [21]:
# --- Hyperparameters for the XLNet-embedding transformer ---
learning_rate = 1e-4
EPOCHS = 1
N_layers = 3
num_heads = 8
hidden_dim = 768
dim_feedforward = 2200
max_length = 50
# Not passed to the constructor below; kept as notebook-level globals.
dim_q = 96
dim_k = 96
dim_v = 96

# Build the model and move it to the target device.
trans_xlnet = TransformerTranslatorXLNET(
    input_size,
    output_size,
    device,
    pad_idx=PAD_IDX,
    N_layers=N_layers,
    batch=BATCH_SIZE,
    num_heads=num_heads,
    max_length=max_length,
    hidden_dim=hidden_dim,
    dim_feedforward=dim_feedforward,
).to(device)

# Loss skips PAD_IDX targets; Adam plus a plateau-based LR scheduler.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = torch.optim.Adam(trans_xlnet.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)

Downloading (…)lve/main/config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetModel: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
# Train the XLNet-embedding transformer for EPOCHS epochs.
# avg_loss_trans collects the `avg_val_loss` value returned by
# evaluate_improved for each epoch.
avg_loss_trans = []
for epoch_idx in range(EPOCHS):
    print("-----------------------------------")
    print("Epoch %d" % (epoch_idx+1))
    print("-----------------------------------")

    train_loss, avg_train_loss = train_improved(trans_xlnet, train_loader, optimizer, criterion, device=device)
    val_loss, avg_val_loss = evaluate_improved(trans_xlnet, valid_loader, criterion, device=device)

    # ReduceLROnPlateau is designed to watch a held-out metric and must be
    # stepped after validation. Stepping on the training loss (which keeps
    # falling while a model overfits) would delay or prevent LR reductions.
    scheduler.step(avg_val_loss)

    avg_loss_trans.append(avg_val_loss)
    print("Training Loss: %.4f. Validation Loss: %.4f. " % (avg_train_loss, avg_val_loss))
    print("Training Perplexity: %.4f. Validation Perplexity: %.4f. " % (np.exp(avg_train_loss), np.exp(avg_val_loss)))

-----------------------------------
Epoch 1
-----------------------------------


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  progress_bar = tqdm_notebook(dataloader, ascii=True)


  0%|          | 0/176 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  progress_bar = tqdm_notebook(dataloader, ascii=True)


  0%|          | 0/6 [00:00<?, ?it/s]

Training Loss: 2.0586. Validation Loss: 0.4627. 
Training Perplexity: 7.8352. Validation Perplexity: 1.5884. 


## Translation XLNet model

In [23]:
# Run the XLNet-embedding model over the test loader. The helper returns two
# values, unpacked here as the targets and the model outputs; both are decoded
# to text in the cells that follow.
target_x, translation_x = translate_improved(trans_xlnet, test_loader)

100%|##########| 7/7 [00:11<00:00,  1.60s/it]


In [24]:
# Convert every target token id into its English vocabulary string, one row
# per sentence (batches are flattened into a single list of sentences).
# The index-to-string table is fetched once up front instead of calling
# en_vocab.get_itos() again for every single token.
en_itos = en_vocab.get_itos()
raw_x = np.array([[en_itos[token_id] for token_id in sentence] for batch in target_x for sentence in batch])

In [25]:
# clean_sentences returns a pair, unpacked as the reference sentences and their
# tokenized form; tokenized_ref_x is what the BLEU cell further down consumes.
reference_x, tokenized_ref_x = clean_sentences(raw_x)

In [26]:
# Display the first 20 reference entries (must stay the last expression of the cell).
reference_x[:20]

array(['a man in an orange hat starring at something',
       'a <unk> terrier is running on lush green grass in front of a white fence',
       'a girl in karate uniform breaking a stick with a front kick',
       'people are fixing the roof of a house',
       'a group of people standing in front of an igloo',
       'a guy works on a building',
       'a man in a vest is sitting in a chair and holding magazines',
       'a mother and her young song enjoying a beautiful day outside',
       'a woman holding a bowl of food in a kitchen',
       'man sitting using tool at a table in his home',
       'three people sit in a cave',
       'a girl in a jean dress is walking along a raised balance beam',
       'a blond holding hands with a guy in the sand',
       'the person in the striped shirt is mountain climbing',
       'two men pretend to be <unk> while women look on',
       'people standing outside of a building',
       'a teenager plays her trumpet on the field at a game',
    

In [27]:
# For each batch of model outputs, take the arg-max index along axis 2
# (presumably the vocabulary dimension - TODO confirm) to get predicted ids.
token_trans_x = [np.argmax(batch.cpu().numpy(), axis=2) for batch in translation_x]

# Map predicted ids to English vocabulary strings, one row per sentence.
# The index-to-string table is fetched once instead of once per token.
en_itos = en_vocab.get_itos()
translated_x = np.array([[en_itos[token_id] for token_id in sentence] for batch in token_trans_x for sentence in batch])

In [28]:
# Same helper as for the references: unpack the returned pair as the candidate
# sentences and their tokenized form (tokenized_can_x feeds the BLEU cell).
candidate_x, tokenized_can_x = clean_sentences(translated_x)

In [29]:
# Display the first 20 candidate entries for a side-by-side look at the references.
candidate_x[:20]

array(['a man in an orange hat kids at something',
       'a <unk> is is running on while green grass in front of a white fence',
       'a girl in karate uniform people a stick with a front kick',
       'people are fixing the roof of a house',
       'a group of people standing in front of an',
       'a guy works on a building',
       'a man in a vest is sitting in a chair and holding rock',
       'a mother and her young into enjoying a beautiful day outside',
       'a woman holding a bowl of food in a kitchen',
       'man sitting using the at a table in his home',
       'three people sit in a',
       'a girl in a dress is walking along a raised balance uniform',
       'a blond holding hands with a guy in the sand',
       'the person in the striped shirt is mountain climbing',
       'two men crowd to be <unk> while women look on',
       'people standing outside of a building',
       'a teenager plays her trumpet on the field at a game',
       'a woman does a that on a tr

In [41]:
# Score the XLNet run: tokenized references vs. tokenized candidates.
bleu_x  = calculate_bleu(tokenized_ref_x, tokenized_can_x)
print(f"BLEU score for improved transformer with XLNet embeddings is: {bleu_x}")

BLEU score for improved transformer with XLNet embeddings is: 0.9145379181591919


# Congratulations! You made it through the "Journey to the center of Transformers"

# Extra analysis: GPT2 

## Train Transformer GPT model

In [None]:
# --- Hyperparameters for the GPT-embedding transformer ---
learning_rate = 1e-4
EPOCHS = 1
N_layers = 5
num_heads = 12
hidden_dim = 768
dim_feedforward = 3000
max_length = 60
# Not passed to the constructor below; kept as notebook-level globals.
# NOTE(review): hidden_dim / num_heads is 768 / 12 = 64, not 96 - confirm
# whether these values are read anywhere else.
dim_q = 96
dim_k = 96
dim_v = 96

# Build the model and move it to the target device.
trans_gpt = TransformerTranslatorGPT(
    input_size,
    output_size,
    device,
    pad_idx=PAD_IDX,
    N_layers=N_layers,
    batch=BATCH_SIZE,
    num_heads=num_heads,
    max_length=max_length,
    hidden_dim=hidden_dim,
    dim_feedforward=dim_feedforward,
).to(device)

# Loss skips PAD_IDX targets; Adam plus a plateau-based LR scheduler.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = torch.optim.Adam(trans_gpt.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)

In [None]:
# Train the GPT-embedding transformer for EPOCHS epochs.
# avg_loss_trans collects the `avg_val_loss` value returned by
# evaluate_improved for each epoch.
avg_loss_trans = []
for epoch_idx in range(EPOCHS):
    print("-----------------------------------")
    print("Epoch %d" % (epoch_idx+1))
    print("-----------------------------------")

    train_loss, avg_train_loss = train_improved(trans_gpt, train_loader, optimizer, criterion, device=device)
    val_loss, avg_val_loss = evaluate_improved(trans_gpt, valid_loader, criterion, device=device)

    # ReduceLROnPlateau is designed to watch a held-out metric and must be
    # stepped after validation. Stepping on the training loss (which keeps
    # falling while a model overfits) would delay or prevent LR reductions.
    scheduler.step(avg_val_loss)

    avg_loss_trans.append(avg_val_loss)
    print("Training Loss: %.4f. Validation Loss: %.4f. " % (avg_train_loss, avg_val_loss))
    print("Training Perplexity: %.4f. Validation Perplexity: %.4f. " % (np.exp(avg_train_loss), np.exp(avg_val_loss)))

-----------------------------------
Epoch 1
-----------------------------------


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  progress_bar = tqdm_notebook(dataloader, ascii=True)


  0%|          | 0/176 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  progress_bar = tqdm_notebook(dataloader, ascii=True)


  0%|          | 0/6 [00:00<?, ?it/s]

Training Loss: 2.9501. Validation Loss: 1.1587. 
Training Perplexity: 19.1075. Validation Perplexity: 3.1859. 


## Translation GPT model

In [None]:
# Run the GPT-embedding model over the test loader. The helper returns two
# values, unpacked here as the targets and the model outputs; both are decoded
# to text in the cells that follow.
target_g, translation_g = translate_improved(trans_gpt, test_loader)

100%|##########| 7/7 [00:12<00:00,  1.81s/it]


In [None]:
# Convert every target token id into its English vocabulary string, one row
# per sentence (batches are flattened into a single list of sentences).
# The index-to-string table is fetched once up front instead of calling
# en_vocab.get_itos() again for every single token.
en_itos = en_vocab.get_itos()
raw_g = np.array([[en_itos[token_id] for token_id in sentence] for batch in target_g for sentence in batch])

In [None]:
# clean_sentences returns a pair, unpacked as the reference sentences and their
# tokenized form; tokenized_ref_g is what the BLEU cell further down consumes.
reference_g, tokenized_ref_g = clean_sentences(raw_g)

In [None]:
# Display the first 20 reference entries (must stay the last expression of the cell).
reference_g[:20]

array(['a man in an orange hat starring at something',
       'a <unk> terrier is running on lush green grass in front of a white fence',
       'a girl in karate uniform breaking a stick with a front kick',
       'people are fixing the roof of a house',
       'a group of people standing in front of an igloo',
       'a guy works on a building',
       'a man in a vest is sitting in a chair and holding magazines',
       'a mother and her young song enjoying a beautiful day outside',
       'a woman holding a bowl of food in a kitchen',
       'man sitting using tool at a table in his home',
       'three people sit in a cave',
       'a girl in a jean dress is walking along a raised balance beam',
       'a blond holding hands with a guy in the sand',
       'the person in the striped shirt is mountain climbing',
       'two men pretend to be <unk> while women look on',
       'people standing outside of a building',
       'a teenager plays her trumpet on the field at a game',
    

In [None]:
# For each batch of model outputs, take the arg-max index along axis 2
# (presumably the vocabulary dimension - TODO confirm) to get predicted ids.
token_trans_g = [np.argmax(batch.cpu().numpy(), axis=2) for batch in translation_g]

# Map predicted ids to English vocabulary strings, one row per sentence.
# The index-to-string table is fetched once instead of once per token.
en_itos = en_vocab.get_itos()
translated_g = np.array([[en_itos[token_id] for token_id in sentence] for batch in token_trans_g for sentence in batch])

In [None]:
# Same helper as for the references: unpack the returned pair as the candidate
# sentences and their tokenized form (tokenized_can_g feeds the BLEU cell).
candidate_g, tokenized_can_g = clean_sentences(translated_g)

In [None]:
# Display the first 20 candidate entries for a side-by-side look at the references.
candidate_g[:20]

array(['a man in an orange hat race at something down down down down down down down',
       'a <unk> down is running on male green grass in front of a white fence down',
       'a girl in from in a stick with a front look down down down',
       'people are two the baby of a their down down down down down down down down',
       'a group of people standing in front of an male down down down down down down',
       'a guy old on a building down down down down down down down down',
       'a man in a hat is sitting in a chair and holding bus down down down',
       'a man and her young off enjoying a to day outside down down down down down',
       'a woman holding a down of food in a and down down down down down',
       'man sitting using off at a table in his down down down down down down',
       'three people sit in a down down down down down down down down down down',
       'a girl in a race dress is walking along a from behind man down down down',
       'a blond holding hands w

In [None]:
# Score the GPT run: tokenized references vs. tokenized candidates.
# NOTE(review): the recorded output shows NLTK zero n-gram-overlap warnings,
# so calculate_bleu presumably uses NLTK BLEU without a SmoothingFunction -
# confirm against its definition.
bleu_g = calculate_bleu(tokenized_ref_g, tokenized_can_g)
# The label now names the embedding type, matching the BERT and XLNet cells;
# the old text ("improved transformer") did not identify this as the GPT run.
print(f"BLEU score for improved transformer with GPT2 embeddings is: {bleu_g}")

BLEU score for improved transformer is: 0.4903291636490951


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
