# Fall 2022: DS-GA 1011 NLP with Representation Learning
## Homework 2
## Part 3: Neural Machine Translation (30 pts)
In this part, you will implement a Transformer encoder for Neural Machine Translation (NMT), using a sequence-to-sequence (seq2seq) model for English-to-French translation with PyTorch.

---
### 1 Transformer Encoder (18 pts)

In [1]:
# Add utilities path
import sys

# Directory (relative to the notebook) that holds the helper modules imported below.
path_to_utils = 'pyfiles'
# Guard the append so re-running this cell does not stack duplicate entries on sys.path.
if path_to_utils not in sys.path:
    sys.path.append(path_to_utils)

In [2]:
# Import custom modules
import global_variables
import nmt_dataset
import nnet_models_new

In [3]:
# Load data
import os

# Translation direction: language codes used as file suffixes and in directory names.
source_name, target_name = 'en', 'fr'

# Model artefacts are stored under <base>/<source>2<target>.
base_saved_models_dir = '.'
saved_models_dir = os.path.join(base_saved_models_dir, '{}2{}'.format(source_name, target_name))

main_data_path = './data/'

# Each split maps 'source' / 'target' to <main_data_path><split>.<language code>.
_language_by_side = {'source': source_name, 'target': target_name}
path_to_train_data = {side: '{}train.{}'.format(main_data_path, code)
                      for side, code in _language_by_side.items()}
path_to_val_data = {side: '{}valid.{}'.format(main_data_path, code)
                    for side, code in _language_by_side.items()}

In [4]:
# Directory passed to every LanguagePair as its language-object path.
saved_language_model_dir = os.path.join(saved_models_dir, 'lang_obj')

# Build the train split first and the validation split second; both receive the
# same language-object directory and the same minimum word count.
_files_by_split = (('train', path_to_train_data), ('val', path_to_val_data))
dataset_dict = {
    split: nmt_dataset.LanguagePair(
        source_name=source_name,
        target_name=target_name,
        filepath=split_files,
        lang_obj_path=saved_language_model_dir,
        minimum_count=1,
    )
    for split, split_files in _files_by_split
}

In [5]:
# Sequence-length cap: the 99.99th percentile of the training 'source_len' column
# (the original author noted this evaluates to 32).
train_source_lengths = dataset_dict['train'].main_df['source_len']
MAX_LEN = int(train_source_lengths.quantile(0.9999))

# Mini-batch size used by the data loaders.
batchSize = 64

In [6]:
from functools import partial
from torch.utils.data import DataLoader

# Both splits are batched the same way: the project's collate function bound to
# MAX_LEN, shuffling enabled, and loading done in the main process.
_collate_batch = partial(nmt_dataset.vocab_collate_func, MAX_LEN=MAX_LEN)
dataloader_dict = {
    split: DataLoader(
        dataset_dict[split],
        batch_size=batchSize,
        collate_fn=_collate_batch,
        shuffle=True,
        num_workers=0,
    )
    for split in ('train', 'val')
}

In [13]:
# Configuration
import torch  # imported here because the CUDA availability check below needs it

# Language objects of the training split; their n_words give the vocabulary sizes.
source_lang_obj = dataset_dict['train'].source_lang_obj
target_lang_obj = dataset_dict['train'].target_lang_obj

source_vocab = source_lang_obj.n_words
target_vocab = target_lang_obj.n_words
hidden_size = 512
enc_layers = 1
lr = 0.25  # try 0.01 later
longest_label = 1
gradient_clip = 0.3
# Request the GPU only when one is actually present, so the notebook also runs
# on CPU-only machines instead of failing when the model is moved to CUDA.
use_cuda = torch.cuda.is_available()

num_epochs = 20

#### 1.1 Encoder (9 pts)

In [160]:
# add library / configuration for positional embeddings
import torch
from torch import nn
from math import sqrt, sin, cos

## add config
# Encoder hyperparameters: model/embedding width, number of attention heads, and
# the number of positions in the positional-embedding table.
d_model, n_heads, seq_len = 512, 2, 512

In [177]:
# Add transformer as encoder in seq2seq model

# code below can help you to start it, but feel free to start from scratch

class EncoderTransformer(nn.Module):
    """Transformer encoder used as the encoder half of the seq2seq NMT model.

    Token embeddings are summed with fixed (non-trainable) sinusoidal positional
    embeddings and passed through a stack of ``nn.TransformerEncoderLayer``s.

    Args:
        n_vocab: Number of rows in the token embedding table.
        d_model: Embedding / model width.
        seq_len: Number of positions in the positional table; longer inputs
            cannot be looked up.
        enc_layers: Number of stacked encoder layers.
        n_heads: Number of attention heads per layer.
    """

    def __init__(self, n_vocab, d_model, seq_len, enc_layers, n_heads):
        super().__init__()

        self.embed = nn.Embedding(n_vocab, d_model)

        # Absolute sinusoidal position encodings from "Attention Is All You Need"
        # (https://arxiv.org/pdf/1706.03762.pdf). The table lives in an
        # nn.Embedding for lookup, and its *weight* is frozen: setting
        # `requires_grad` on the module itself would have no effect.
        self.pos_embed = nn.Embedding(seq_len, d_model)
        with torch.no_grad():
            self.pos_embed.weight.copy_(self._sinusoidal_table(seq_len, d_model))
        self.pos_embed.weight.requires_grad_(False)

        # forward() feeds (batch, seq, d_model) tensors, so the layers must be
        # batch-first; with the default layout attention would run across the
        # batch dimension and mix unrelated sentences.
        encoder_layer = nn.TransformerEncoderLayer(d_model, n_heads, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=enc_layers)

    @staticmethod
    def _sinusoidal_table(seq_len, d_model):
        """Return the (seq_len, d_model) table: sin on even columns, cos on odd ones."""
        # Computed in float64 and cast on copy to keep the values accurate.
        positions = torch.arange(seq_len, dtype=torch.float64).unsqueeze(1)
        even_dims = torch.arange(0, d_model, 2, dtype=torch.float64)
        angles = positions / torch.pow(10000.0, even_dims / d_model)

        table = torch.zeros(seq_len, d_model, dtype=torch.float64)
        table[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns.
        table[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        return table

    def forward(self, text_vec):
        """Encode a batch of token ids.

        Args:
            text_vec: Integer tensor of shape (batch, seq) holding token ids.

        Returns:
            A tuple ``(output, hidden_encoder)`` where ``output`` has shape
            (batch, seq, d_model) and ``hidden_encoder`` has shape
            (1, batch, d_model), the sum of ``output`` over the sequence axis.
        """
        _, seq_len = text_vec.size()
        embeddings = self.embed(text_vec)

        # Positions 0..seq-1, broadcast over the batch dimension.
        positions = torch.arange(seq_len, device=text_vec.device)
        pos_embeddings = self.pos_embed(positions).unsqueeze(dim=0)
        embeddings = embeddings + pos_embeddings

        # NOTE(review): no src_key_padding_mask is passed, so if batches contain
        # padding tokens they are attended to and included in the sum below.
        # Confirm how the collate function pads before relying on this.
        output = self.transformer(embeddings)

        # Collapse the sequence axis and move it in front: (1, batch, d_model).
        hidden_encoder = torch.sum(output, 1, True).permute(1, 0, 2)

        return output, hidden_encoder

In [170]:
# Shape sanity check: reducing over the length-3 axis yields the same values
# whether the tensor is permuted before the sum (c) or summed directly (d).
a = torch.randn(1, 3, 4)
b = a.permute(1, 0, 2)
c = torch.sum(b, dim=0, keepdim=True)
d = torch.sum(a, dim=1, keepdim=True)

for shown in (a, b, c, d):
    print(shown, shown.shape)
    print()



tensor([[[-0.9330, -0.1128,  0.9978, -0.4210],
         [ 0.1949,  0.4050,  0.0911, -0.1466],
         [ 1.8391,  2.0360, -0.5387,  0.0095]]]) torch.Size([1, 3, 4])

tensor([[[-0.9330, -0.1128,  0.9978, -0.4210]],

        [[ 0.1949,  0.4050,  0.0911, -0.1466]],

        [[ 1.8391,  2.0360, -0.5387,  0.0095]]]) torch.Size([3, 1, 4])

tensor([[[ 1.1009,  2.3282,  0.5502, -0.5581]]]) torch.Size([1, 1, 4])

tensor([[[ 1.1009,  2.3282,  0.5502, -0.5581]]]) torch.Size([1, 1, 4])



In [178]:
# Instantiate the Transformer encoder from the configuration values defined earlier.
encoder = EncoderTransformer(
    n_vocab=source_vocab,
    d_model=d_model,
    seq_len=seq_len,
    enc_layers=enc_layers,
    n_heads=n_heads,
)

#### 1.2 Decoder(s) (9 pts)

In [179]:
# Basic RNN decoder (no attention)
decoder_rnn_1 = nnet_models_new.DecoderRNN(target_vocab, hidden_size, enc_layers)

# Options forwarded to the seq2seq wrapper that joins the encoder and decoder.
_seq2seq_options = dict(
    lr=lr,
    use_cuda=use_cuda,
    hiddensize=hidden_size,
    numlayers=enc_layers,
    target_lang=dataset_dict['train'].target_lang_obj,
    longest_label=longest_label,
    clip=gradient_clip,
)
full_model_1 = nnet_models_new.seq2seq(encoder, decoder_rnn_1, **_seq2seq_options)

In [180]:
# RNN Decoder with Encoder attention

In [181]:
# RNN Decoder with Encoder & Self attention

#### Training & Evaluation

In [184]:
'''
Reasonable range:
Basic RNN decoder (no attention): around 10
RNN decoder with encoder attention: around 15
RNN decoder with encoder attention and self-attention: around 25
'''

from tqdm import notebook
import time

def train_model(dataloader, nmt, num_epochs=20, val_every=1, saved_model_path='.', enc_type='rnn'):
    """Train ``nmt`` and record loss / BLEU on the train and validation splits.

    Args:
        dataloader: Mapping with ``'train'`` and ``'val'`` entries; each must be
            iterable over batches and support ``len()``.
        nmt: Model wrapper exposing ``train_step``, ``get_bleu_score``,
            ``scheduler_step``, ``train`` and ``eval``.
        num_epochs: Number of passes over the training split.
        val_every: Validation BLEU is computed on epochs divisible by this value.
        saved_model_path: Currently unused (checkpoint saving is disabled below).
        enc_type: Currently unused (checkpoint saving is disabled below).

    Returns:
        dict with the per-epoch histories ``'train_loss'``, ``'train_bleu'``,
        ``'val_loss'``, ``'val_bleu'`` (one entry per evaluated epoch) and the
        scalar ``'best_bleu'``, so the curves can be plotted afterwards.
    """
    # Histories of loss and BLEU score, kept for plotting in later questions.
    train_loss_list, train_bleu_list = [], []
    val_loss_list, val_bleu_list = [], []
    best_bleu = -1

    for epoch in range(num_epochs):
        start = time.time()
        train_loss = 0

        print('Epoch: [{}/{}]'.format(epoch, num_epochs))

        # --- training ---
        # The validation pass of the previous epoch leaves the model in eval
        # mode; switch back explicitly so every epoch trains in train mode.
        # (Assumes nmt is an nn.Module-like object, as the eval() call implies.)
        nmt.train()

        # Wrap the loader itself so tqdm can report the total number of batches.
        for data in notebook.tqdm(dataloader['train']):
            _, curr_loss = nmt.train_step(data)
            train_loss += curr_loss

        train_loss = train_loss / len(dataloader['train'])
        print('training loss and bleu score:')
        print("epoch {} loss = {}, time = {}".format(epoch, train_loss,
                                                        time.time() - start))

        train_bleu_score = nmt.get_bleu_score(dataloader['train'])
        print('training bleu:', train_bleu_score)
        print()

        train_loss_list.append(train_loss)
        train_bleu_list.append(train_bleu_score)

        sys.stdout.flush()

        # --- validation ---
        val_start = time.time()
        val_loss = 0
        nmt.eval()

        # NOTE(review): this still calls train_step on validation batches. If
        # train_step runs backward()/an optimizer step it will either fail under
        # no_grad or fit the validation data; swap in a loss-only evaluation
        # method if the seq2seq wrapper provides one.
        with torch.no_grad():
            for data in notebook.tqdm(dataloader['val']):
                _, curr_loss = nmt.train_step(data)
                val_loss += curr_loss

            val_loss = val_loss / len(dataloader['val'])
            val_loss_list.append(val_loss)
            print('validation loss and bleu score:')
            # Report the duration of the validation pass only.
            print("epoch {} loss = {}, time = {}".format(epoch, val_loss,
                                                            time.time() - val_start))

        if epoch % val_every == 0:
            val_bleu_score = nmt.get_bleu_score(dataloader['val'])
            val_bleu_list.append(val_bleu_score)
            print('validation bleu: ', val_bleu_score)
            sys.stdout.flush()

            nmt.scheduler_step(val_bleu_score)

            if val_bleu_score > best_bleu:
                best_bleu = val_bleu_score
                # Checkpointing is disabled:
                # save_models(nmt, saved_model_path, enc_type)

        print('='*50)

    print("Training completed. Best BLEU is {}".format(best_bleu))

    return {
        'train_loss': train_loss_list,
        'train_bleu': train_bleu_list,
        'val_loss': val_loss_list,
        'val_bleu': val_bleu_list,
        'best_bleu': best_bleu,
    }

In [185]:
# Train the Transformer-encoder / basic-RNN-decoder model.
train_model(
    dataloader_dict,
    full_model_1,
    num_epochs=num_epochs,
    saved_model_path=saved_models_dir,
    enc_type='rnn_test',
)

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch: [0/20]


0it [00:00, ?it/s]

KeyboardInterrupt: 

---
### 2 Attention visualization (12 pts)

In [None]:
# The model was trained for ~2 hours, so you can expect the attention maps
# to look quite 'hard' (less soft spreading), i.e. attending to one particular token in the input.