# CPSC532S Assignment 3:  RNNs for Language Modeling

In [None]:
from collections import Counter
from collections import defaultdict
from gensim.models import Word2Vec
from random import random
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from torch import nn
from torch.autograd import Variable

import numpy as np
import torch
import torch.nn.functional as F
import json
import itertools
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.ticker as ticker
import matplotlib.pyplot as plt

# Data Acquisition


The goal of this assignment is to translate English to Pig Latin. For this assignment, you must download the data and extract it into `data/`. The dataset contains four files, each containing a single caption on each line. There are two files for training (English vs Pig Latin) and two files for validation. We should have 20,000 sentences (one sentence per image in Assignment 2) in the training captions and 500 sentences in the validation captions (five sentences per image in Assignment 2).

In [None]:
from google.colab import drive

drive.mount('/content/drive')


def _load_json(path):
    """Parse the JSON file at *path* and close the file handle once it is read."""
    # A context manager guarantees the descriptor is released even if parsing
    # fails; the bare json.load(open(...)) form leaves it to the garbage collector.
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)


# Load the data into memory.
mscoco_train = _load_json("/content/drive/My Drive/Colab Notebooks/train_captions.json")
mscoco_val = _load_json('/content/drive/My Drive/Colab Notebooks/val_captions.json')

mscoco_piglatin_train = _load_json('/content/drive/My Drive/Colab Notebooks/piglatin_train_captions.json')
mscoco_piglatin_val = _load_json('/content/drive/My Drive/Colab Notebooks/piglatin_val_captions.json')

# Each file holds an 'annotations' list whose entries carry the text under 'caption'.
train_sentences = [entry['caption'] for entry in mscoco_train['annotations']]
val_sentences = [entry['caption'] for entry in mscoco_val['annotations']]

piglatin_train_sentences = [entry['caption'] for entry in mscoco_piglatin_train['annotations']]
piglatin_val_sentences = [entry['caption'] for entry in mscoco_piglatin_val['annotations']]

# Sanity check: dataset sizes followed by the first English / Pig Latin pair of each split.
print(len(train_sentences))
print(len(val_sentences))
print(len(piglatin_train_sentences))
print(len(piglatin_val_sentences))
print(train_sentences[0])
print(piglatin_train_sentences[0])
print(val_sentences[0])
print(piglatin_val_sentences[0])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
20000
500
20000
500
A very clean and well decorated empty bathroom
Away eryvay eanclay andway ellway ecoratedday emptyway athroombay
Set of bananas hanging off of a banana tree.
Etsay ofway ananasbay anginghay offway ofway away ananabay eetray.


# Preprocessing

The code provided below creates word embeddings for you to use. After creating the vocabulary, we construct both one-hot embeddings and word2vec embeddings. 


In [None]:
import nltk
nltk.download('punkt')

sentences = train_sentences
piglatin_sentences = piglatin_train_sentences

# Lower-case the sentence, tokenize them and add <SOS> and <EOS> tokens
sentences = [["<SOS>"] + word_tokenize(sentence.lower()) + ["<EOS>"] for sentence in sentences]
piglatin_sentences = [["<SOS>"] + word_tokenize(sentence.lower()) + ["<EOS>"] for sentence in piglatin_sentences]

# Create the vocabulary. Note that we add an <UNK> token to represent words not in our vocabulary.
# English and Pig Latin share one vocabulary, so their counts are merged before
# the most frequent words are selected. Index 0 is reserved for <UNK>.
vocabularySize = 2000
word_counts = Counter(word for sentence in sentences for word in sentence)
piglatin_word_counts = Counter(word for sentence in piglatin_sentences for word in sentence)
word_counts = word_counts + piglatin_word_counts
vocabulary = ["<UNK>"] + [word for word, _ in word_counts.most_common(vocabularySize - 1)]
word2index = {word: index for index, word in enumerate(vocabulary)}

# Build the one hot embeddings
one_hot_embeddings = np.eye(vocabularySize)


# Build the word2vec embeddings
wordEncodingSize = 300
# Words outside the vocabulary are dropped (not mapped to <UNK>) in the filtered corpora.
filtered_sentences = [[word for word in sentence if word in word2index] for sentence in sentences]
piglatin_filtered_sentences = [[word for word in sentence if word in word2index] for sentence in piglatin_sentences]
all_filtered_sentences = filtered_sentences + piglatin_filtered_sentences
try:
    # gensim >= 4.0 names the dimensionality argument `vector_size`.
    w2v = Word2Vec(all_filtered_sentences, min_count=0, vector_size=wordEncodingSize)
except TypeError:
    # gensim 3.x only knows the older `size` keyword.
    w2v = Word2Vec(all_filtered_sentences, min_count=0, size=wordEncodingSize)
# Row i of the matrix must be the vector of vocabulary[i]. gensim keeps its own
# internal word ordering, which need not match `vocabulary` (e.g. when counts tie),
# so each vector is looked up by word instead of copying the raw weight matrix.
# Row 0 (<UNK>) stays all zeros.
w2v_embeddings = np.concatenate((
    np.zeros((1, wordEncodingSize)),
    np.stack([w2v.wv[word] for word in vocabulary[1:]]),
))

# Define the max sequence length to be the longest sentence in the training data
# (taken over both languages, <SOS>/<EOS> tokens included).
maxSequenceLength = max(len(sentence) for sentence in sentences)
piglatin_maxSequenceLength = max(len(sentence) for sentence in piglatin_sentences)
maxSequenceLength = max(maxSequenceLength, piglatin_maxSequenceLength)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.




# Utilities functions


Please look through the functions provided below carefully, as you will need to use all of them at some point in your assignment.

In [None]:
def preprocess_numberize(sentence):
    """
    Convert a raw sentence string into a list of vocabulary indices.

    The sentence is lower-cased and tokenized, then wrapped in <SOS>/<EOS>
    markers. Any token missing from the vocabulary is mapped to index 0,
    the slot reserved for <UNK>.
    """
    tokens = ["<SOS>", *word_tokenize(sentence.lower()), "<EOS>"]
    return [word2index.get(token, 0) for token in tokens]

def preprocess_one_hot(sentence):
    """
    Convert a raw sentence string into a numpy array of one-hot vectors,
    one row per token (including the added <SOS>/<EOS> markers).
    """
    # Indexing the identity matrix with the token indices selects one row per token.
    return one_hot_embeddings[preprocess_numberize(sentence)]

def preprocess_word2vec(sentence):
    """
    Convert a raw sentence string into a numpy array of word2vec embeddings,
    one row per token (including the added <SOS>/<EOS> markers).
    """
    # Look up the word2vec row for every token index of the sentence.
    return w2v_embeddings[preprocess_numberize(sentence)]

def compute_bleu(reference_sentence, predicted_sentence):
    """
    Compute the BLEU similarity between a reference sentence and a predicted
    sentence. Both strings are lower-cased and tokenized before scoring.
    """
    reference_tokens, predicted_tokens = (
        word_tokenize(text.lower())
        for text in (reference_sentence, predicted_sentence)
    )
    # sentence_bleu expects a list of references, hence the single-element list.
    return sentence_bleu([reference_tokens], predicted_tokens)

%matplotlib inline
def showAttention(input_sentence, output_words, attentions):
    """
    Plot an attention matrix as a heat map with a colorbar.

    The x axis is labelled with the space-separated words of `input_sentence`
    and the y axis with the space-separated words of `output_words`; '<EOS>'
    is appended to both. `attentions` must support `.numpy()`.
    """
    # Heat map plus colorbar
    figure = plt.figure()
    axes = figure.add_subplot(111)
    heatmap = axes.matshow(attentions.numpy(), cmap='bone')
    figure.colorbar(heatmap)

    # Tick labels; the leading '' occupies the first tick label slot
    column_labels = [''] + input_sentence.split(' ') + ['<EOS>']
    row_labels = [''] + output_words.split(' ') + ['<EOS>']
    axes.set_xticklabels(column_labels, rotation=90)
    axes.set_yticklabels(row_labels)

    # One tick (and therefore one label) per cell on both axes
    for axis in (axes.xaxis, axes.yaxis):
        axis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()

# Sanity check of compute_bleu: a caption scored against itself, then against
# a different caption (training caption 5).
score1 = compute_bleu("<SOS>" + train_sentences[0], "<SOS>" + train_sentences[0])
score2 = compute_bleu("<SOS>" + train_sentences[0], "<SOS>" + train_sentences[5])

print("".join([
    'BLEU score distnace between \n  "', train_sentences[0],
    '" \nand\n  "', train_sentences[0],
    '" \nis: ', str(score1), '\n\n',
]))
print("".join([
    'BLEU score distnace between \n  "', train_sentences[0],
    '" \nand\n  "', train_sentences[5],
    '" \nis: ', str(score2), '\n\n',
]))


BLEU score distnace between 
  "A very clean and well decorated empty bathroom" 
and
  "A very clean and well decorated empty bathroom" 
is: 1.0


BLEU score distnace between 
  "A very clean and well decorated empty bathroom" 
and
  "A few people sit on a dim transportation system. " 
is: 0.1933853138176172




# Part 1: Encoder-Decoder Language Translation with Teacher-Forcing


## 1.1 Building a Language Decoder

We now implement a language decoder. For now, we will have the decoder take a single training sample at a time (as opposed to batching). For our purposes, we will also avoid defining the embeddings as part of the model and instead pass in embedded inputs. While this is sometimes useful, as it learns/tunes the embeddings, we avoid doing it for the sake of simplicity and speed.

Remember to use LSTM hidden units!

In [None]:
class DecoderLSTM(nn.Module):
    """
    Skeleton of the Part 1 LSTM language decoder.

    Only the sizes are fixed here; the layers, the initial-state helpers and
    the forward pass are placeholders still to be written. As it stands,
    `init_hidden`/`init_cell` return None and `forward` references an
    undefined `output`.
    """
    def __init__(self):
        super(DecoderLSTM, self).__init__()
        
        # Size of the LSTM hidden state.
        self.hidden_dim = 300
        # NOTE(review): this local shadows the module-level wordEncodingSize (300).
        # 2000 equals vocabularySize, so it is presumably the width of the
        # one-hot input vectors -- confirm before reusing it for word2vec inputs.
        wordEncodingSize = 2000
        
        # Your code goes here (~4 lines or less)

    def init_hidden(self):
        """Return the initial hidden state (placeholder: currently returns None)."""
        return # Your code goes here (1 line)


    def init_cell(self):
        """Return the initial cell state (placeholder: currently returns None)."""
        return # Your code goes here (1 line)

    def forward(self, input_sentence, hidden, cell):
      """
      Run the decoder on `input_sentence` starting from the given `hidden` and
      `cell` states and return `(output, hidden, cell)`.

      `output` must be produced by the code filled in below.
      """
      # Your code goes here (~4 lines or less)
      
      return output, hidden, cell


## 1.2.  Building Language Encoder

We now build a language encoder, which will encode an input word by word, and ultimately output a hidden state that can then be used by our decoder.

In [None]:
class EncoderLSTM(nn.Module):
    """
    Skeleton of the LSTM language encoder.

    Only the sizes are fixed here; the layers, the initial-state helpers and
    the forward pass are placeholders still to be written. As it stands,
    `init_hidden`/`init_cell` return None and `forward` references an
    undefined `output`.
    """
    # Your code goes here
    def __init__(self):
        super(EncoderLSTM, self).__init__()
        
        # Size of the LSTM hidden state.
        self.hidden_dim = 300
        # NOTE(review): this local shadows the module-level wordEncodingSize (300).
        # 2000 equals vocabularySize, so it is presumably the width of the
        # one-hot input vectors -- confirm before reusing it for word2vec inputs.
        wordEncodingSize = 2000
        
        # Your code goes here (~3 lines)
       
    def init_hidden(self):
        """Return the initial hidden state (placeholder: currently returns None)."""
        return # Your code goes here (1 line)

    def init_cell(self):
        """Return the initial cell state (placeholder: currently returns None)."""
        return # Your code goes here (1 line)

    def forward(self, input_sentence, hidden, cell):
        """
        Run the encoder on `input_sentence` starting from the given `hidden`
        and `cell` states and return `(output, hidden, cell)`.

        `output` must be produced by the code filled in below.
        """
        # Your code goes here (~2 lines of code)
        
        return output, hidden , cell
            

## 1.3. Connecting Encoder to Decoder and Train End-to-End and Train with Teacher Forcing

We now connect our newly created encoder with our decoder, to train an end-to-end seq2seq architecture. 

For the purposes of Part 1, the only interaction between the encoder and the decoder is that the *last hidden state of the encoder is used as the initial hidden state of the decoder*. This will be different for Parts 2 and 3, where we will extend this function.

In [None]:
def train(input_sentence, output_sentence, encoder,
          decoder, encoder_optimizer,
          decoder_optimizer, 
          criterion, 
          teacher_forcing_ratio = 1,
          decoderType = "LSTM",
          embeddings = one_hot_embeddings): 
    """
    Given a single training sample, go through a single step of training.

    Skeleton: every branch body below is a placeholder, so this function does
    not parse until the placeholders are filled in.

    Arguments:
        input_sentence: source sample. The training cells in this file pass one
            token list from `filtered_sentences`.
        output_sentence: target sample. The training cells pass the matching
            token list from `piglatin_filtered_sentences`.
        encoder, decoder: the models to train.
        encoder_optimizer, decoder_optimizer: one optimizer per model.
        criterion: loss function (the training cells pass nn.CrossEntropyLoss).
        teacher_forcing_ratio: probability of using teacher forcing for this
            sample; with the default of 1 it is always used.
        decoderType: selects the decoding branch: "LSTM", "AttentionLSTM" or
            "Transformer".
        embeddings: embedding matrix, defaulting to the one-hot embeddings.
            NOTE(review): presumably indexed by vocabulary index -- confirm.

    Returns:
        final_loss, which must be set by the code filled in below. The training
        cells format it with "%f" and sum it with np.sum.
    """
    # Teacher forcing is decided once per sample by comparing a uniform draw
    # against the ratio.
    use_teacher_forcing = True if np.random.rand() < teacher_forcing_ratio else False
    # Your code goes here

    if decoderType == "LSTM": 
        if use_teacher_forcing: 
            # Your code goes here
        elif not use_teacher_forcing:
            # Your code goes here

    if decoderType == "AttentionLSTM": 
        if use_teacher_forcing: 
            # Your code goes here
        elif not use_teacher_forcing:
            # Your code goes here

    if decoderType == "Transformer":
        # Your code goes here (you can assume transformer uses teacher_forcing_ratio = 1)

    # Your code goes here
    return final_loss

In [None]:
# Train the model and monitor the loss. Remember to use Adam optimizer and CrossEntropyLoss
encoder = EncoderLSTM()
decoder = DecoderLSTM()
encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=0.0005)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=0.0005)
criterion = nn.CrossEntropyLoss()
epochs = 5

print("Start training end-to-end network ......")
for epoch in range(epochs):
    epoch_loss = []
    # Walk the English / Pig Latin pairs in lockstep. `count` is the 1-based
    # number of samples seen this epoch. (Naming the loop index `id` would
    # clobber the builtin for the rest of the notebook session.)
    for count, (sentence, target_variable) in enumerate(
            zip(filtered_sentences, piglatin_filtered_sentences), start=1):
        loss = train(sentence, target_variable, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, teacher_forcing_ratio = 1, decoderType="LSTM")

        # Report the loss of every 500th sample as a progress indicator.
        if count % 500 == 0:
            print("Single sentence Loss (epoch %d) : %f" % (epoch, loss))
        epoch_loss.append(loss)

    # Mean per-sentence loss over the epoch.
    print("Loss (epoch %d) : %f" % (epoch, np.sum(epoch_loss)/len(filtered_sentences)))

## 1.4. Building Language Decoder MAP Inference

We now define a method to perform inference with our decoder. 

In [None]:
def inference(sentence, encoder, decoder, decoderType="LSTM", embeddings=one_hot_embeddings, max_length=maxSequenceLength):
    """
    Translate `sentence` by encoding it and then decoding one word at a time
    (the Section 1.4 MAP-inference skeleton).

    Skeleton: the loop and branch bodies below are placeholders, so this
    function does not parse until they are filled in.

    Arguments:
        sentence: raw sentence string. It is passed to preprocess_one_hot,
            which lower-cases and tokenizes it and adds <SOS>/<EOS> itself.
        encoder, decoder: trained models.
        decoderType: selects the decoding branch: "LSTM", "AttentionLSTM" or
            "Transformer".
        embeddings: embedding matrix, defaulting to the one-hot embeddings.
        max_length: maximum number of decoding steps.

    Returns:
        (word_list, decoder_attentions); both must be set by the code filled
        in below. NOTE(review): the test cells concatenate the first element
        to a str, so it is presumably expected to be a string -- confirm.
    """
    input_tensor = torch.Tensor(preprocess_one_hot(sentence))
    input_length = input_tensor.shape[0]

    # Initialize encoder & decoder 

    # Starts at 1, so row 0 of input_tensor (the <SOS> token added by the
    # preprocessing) is not fed to the encoder.
    for ei in range(1,input_length):
        # Iteratively run the encoder 
        # 1. Get the current word index
        # 2. Convert to a 1-hot encoding
        # 3. Run one step of the encoder
        # 4. Save the encoder hidden states for future processing

    # Set the initial hidden and cell state of the RNN decoder to the last 
    # hidden and cell state of the encoder

    # Start the decoding with <SOS> token

    # Iterate up to the max_length of output
    for i in range(max_length):
        if decoderType == "LSTM": 
            # Run the simple decoder 

        if decoderType == "AttentionLSTM":
            # Run the attention decoder (this will be done in Part 2)

        if decoderType == "Transformer":
            # Run the transformer decoder (this will be done in Part 3)

    return word_list, decoder_attentions

In [None]:
# Lets test it
sentence = "A very clean and well decorated empty bathroom"
# Tokenized form, kept for inspection only.
input_sentence = ["<SOS>"] + word_tokenize(sentence.lower()) + ["<EOS>"]
# inference() runs preprocess_one_hot on its argument, which lower-cases,
# tokenizes and adds <SOS>/<EOS> itself. It therefore needs the raw string:
# a token list has no .lower() and would get the markers twice.
output_sentence, _ = inference(sentence, encoder, decoder, decoderType="LSTM")

print("English: " + sentence)
print("Pig Latin: " + output_sentence)

## 1.5. Building Language Decoder Sampling Inference

We now modify the inference method to sample from the distribution outputted by the LSTM rather than taking the most probable word.

It might be useful to take a look at the output of your model and (depending on your implementation) modify it so that the outputs sum to 1. 

In [None]:
def sampling_inference(sentence, encoder, decoder, decoderType="LSTM", embeddings=one_hot_embeddings, max_length=maxSequenceLength):
    """
    Translate `sentence` like the MAP-inference routine, but (per Section 1.5)
    sample each output word from the decoder's distribution instead of taking
    the most probable word.

    Skeleton: the loop and branch bodies below are placeholders, so this
    function does not parse until they are filled in.

    Arguments:
        sentence: raw sentence string. It is passed to preprocess_one_hot,
            which lower-cases and tokenizes it and adds <SOS>/<EOS> itself.
        encoder, decoder: trained models.
        decoderType: selects the decoding branch: "LSTM", "AttentionLSTM" or
            "Transformer".
        embeddings: embedding matrix, defaulting to the one-hot embeddings.
        max_length: maximum number of decoding steps.

    Returns:
        (word_list, decoder_attentions); both must be set by the code filled
        in below. NOTE(review): the test cells concatenate the first element
        to a str, so it is presumably expected to be a string -- confirm.
    """
    input_tensor = torch.Tensor(preprocess_one_hot(sentence))
    input_length = input_tensor.shape[0]

    # Initialize encoder & decoder 

    # Starts at 1, so row 0 of input_tensor (the <SOS> token added by the
    # preprocessing) is not fed to the encoder.
    for ei in range(1,input_length):
        # Iteratively run the encoder 
        # 1. Get the current word index
        # 2. Convert to a 1-hot encoding
        # 3. Run one step of the encoder
        # 4. Save the encoder hidden states for future processing

    # Set the initial hidden and cell state of the RNN decoder to the last 
    # hidden and cell state of the encoder

    # Start the decoding with <SOS> token

    # Iterate up to the max_length of output
    for i in range(max_length):
        if decoderType == "LSTM": 
            # Run the simple decoder 

        if decoderType == "AttentionLSTM":
            # Run the attention decoder (this will be done in Part 2)

        if decoderType == "Transformer":
            # Run the transformer decoder (this will be done in Part 3)


    return word_list, decoder_attentions
    

In [None]:
# Lets test it
sentence = "A very clean and well decorated empty bathroom"
# Tokenized form, kept for inspection only.
input_sentence = ["<SOS>"] + word_tokenize(sentence.lower()) + ["<EOS>"]

print("English: " + sentence)

# Sampling is stochastic, so draw several translations of the same sentence.
for i in range(5):
    # sampling_inference() runs preprocess_one_hot on its argument, which
    # lower-cases, tokenizes and adds <SOS>/<EOS> itself. It therefore needs
    # the raw string: a token list has no .lower().
    output_sentence, _ = sampling_inference(sentence, encoder, decoder, decoderType="LSTM")
    print("Pig Latin: " + output_sentence)

## 1.6. Testing 

We must now define a method that allows us to do inference using the seq2seq architecture. We then run the 500 validation captions through this method, and ultimately compare the **reference** and **generated** sentences using our **BLEU** similarity score method defined above, to identify the average BLEU score.

In [None]:
# Perform inference for all validation sequences and report the average BLEU score
# Skeleton: `output_sentence`, `target_sentence` and `score` are only defined
# once the three commented placeholder lines inside the loop are filled in.
avg_score=[]

# iterate over the validation set 
for idx, input_sentence in enumerate(val_sentences): 
    # output_sentence, _ = inference(...)
    # target_sentence = ... 
    # score = compute_bleu(...)
    avg_score.append(score)
    # Show the first ten reference / prediction pairs for a qualitative check.
    if idx < 10 :
        print('BLEU score distance between \n  "' + target_sentence + '" \nand\n  "'+ output_sentence + '" \n is: ' + str(score) +'\n\n')

# Mean BLEU over the whole validation set.
final_score = np.sum(avg_score)/len(val_sentences)
print("Average BLUE score : %f" % (final_score)) 


# EXPECTED < Average BLUE score (ArgMAX inference): 0.464803 > 
# EXPECTED < Average BLUE score (sampling inference): 0.477803 > 

## 1.7. Experiment with Teacher Forcing

Redo steps 1.3 and 1.6 with teacher_forcing_ratio = 0.9 and 0.8. Comment on the results, the speed of convergence, and the quality of the results. Note that in most real scenarios the teacher forcing ratio is annealed, starting from teacher_forcing_ratio = 1.

In [None]:
# Your code goes here

## 1.8. Encoding as Generic Feature Representation

We now use the final hidden state of our encoder, to identify the nearest neighbor amongst the training sentences for each sentence in our validation data.

It would be effective to first define a method that would generate all of the hidden states and store these hidden states **on the CPU**, and then loop over the generated hidden states to identify/output the nearest neighbors.

In [None]:
def final_encoder_hidden(sentence):
    # Skeleton: should run the encoder over `sentence` and return its final
    # hidden state (Section 1.8 asks for these states to be stored on the CPU).
    # The body is still a placeholder, so this definition does not parse yet.
    # Your code goes here

# Now run all training data and validation data to store hidden states
    # Your code goes here

In [None]:
# Now get nearest neighbors and print

# Part 2: Attention LSTM Decoder

## 2.1. Implementing Additive Attention

In [None]:
class AdditiveAttention(nn.Module):
    """
    Additive attention: a small two-layer MLP scores inputs of width
    2 * hidden_size down to a single scalar, and a softmax over dim 1
    normalizes the scores. The forward pass is still a placeholder.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        # Scoring MLP: (2 * hidden_size) -> hidden_size -> 1
        scorer_layers = [
            nn.Linear(hidden_size * 2, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1),
        ]
        self.attention_network = nn.Sequential(*scorer_layers)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, queries, keys, values):
        """Forward pass of the additive attention mechanism.

        Arguments:
            queries: The current decoder hidden state. (batch_size x hidden_size)
            keys: The encoder hidden states for each step of the input sequence. (batch_size x seq_len x hidden_size)
            values: The encoder hidden states for each step of the input sequence. (batch_size x seq_len x hidden_size)

        Returns:
            context: weighted average of the values (batch_size x 1 x hidden_size)
            attention_weights: Normalized attention weights for each encoder hidden state. (batch_size x 1 x seq_len)

            The attention_weights must be a softmax weighting over the seq_len annotations.
        """

        # ------------
        # Your code goes here
        # ------------
        # batch_size = 1
        # expanded_queries = ...
        # concat_inputs = ...
        # unnormalized_attention = ...
        # attention_weights = ...
        # context = ...

        return context, attention_weights

## 2.2. Attention Decoder

In [None]:
class AttentionDecoder(nn.Module):
    """
    LSTM decoder with additive attention over the encoder annotations.

    The input vector is projected to `hidden_dim` and passed through dropout.
    The LSTM takes inputs of width 2 * hidden_dim and a final linear layer maps
    the hidden state to vocabulary scores. The part of the forward pass that
    uses the attention module and the LSTM is still a placeholder.
    """

    def __init__(self):
        super(AttentionDecoder, self).__init__()

        self.hidden_dim = 300
        # Width of the input vectors fed to `linear_input` (local to __init__).
        wordEncodingSize = 2000
        self.dropout_p = 0.1
        self.linear_input = nn.Linear(wordEncodingSize, self.hidden_dim)
        self.dropout = nn.Dropout(self.dropout_p)
        self.lstm = nn.LSTM(self.hidden_dim*2, self.hidden_dim)
        self.attention = AdditiveAttention(hidden_size=self.hidden_dim)
        self.linear = nn.Linear(self.hidden_dim, vocabularySize)
        self.hidden = self.init_hidden()
        self.cell = self.init_cell()

    @staticmethod
    def _state_device():
        """Return the CUDA device when one is available, otherwise the CPU."""
        return torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def init_hidden(self):
        """Return a random initial hidden state of shape (1, 1, hidden_dim)."""
        # An unconditional .cuda() would make the class unusable on CPU-only
        # hosts (it is called from __init__). Sampling on the CPU first and
        # then moving keeps the values on CUDA machines unchanged.
        return torch.randn(1, 1, self.hidden_dim).to(self._state_device())

    def init_cell(self):
        """Return a random initial cell state of shape (1, 1, hidden_dim)."""
        return torch.randn(1, 1, self.hidden_dim).to(self._state_device())

    def forward(self, input_sentence, hidden, cell, encoder_annotations):
        """
        Run one decoding step and return
        `(output, hidden, cell, attention_weights)`.

        `input_sentence` is flattened to a single row before the input
        projection. `output` and `attention_weights` must be produced by the
        code filled in below.
        """
        embed = self.dropout(self.linear_input(input_sentence.view(1,-1)))

        # ------------
        # Your code goes here
        # ------------
        # embed_current = ...
        # context, attention_weights = ...
        # embed_and_context = ...
        
        return output, hidden, cell, attention_weights


## 2.3. Training Attention Decoder

Note that you will need to modify the train() procedure from Part 1 to handle the AttentionLSTM.

In [None]:
# Train the model and monitor the loss. Remember to use Adam optimizer and CrossEntropyLoss
encoder = EncoderLSTM()
decoder = AttentionDecoder()
encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=0.0005)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=0.0005)
criterion = nn.CrossEntropyLoss()
epochs = 5

print("Start training end to end network ......")
for epoch in range(epochs):
    epoch_loss = []
    # Walk the English / Pig Latin pairs in lockstep. `count` is the 1-based
    # number of samples seen this epoch. (Naming the loop index `id` would
    # clobber the builtin for the rest of the notebook session.)
    for count, (sentence, target_variable) in enumerate(
            zip(filtered_sentences, piglatin_filtered_sentences), start=1):
        loss = train(sentence, target_variable, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, teacher_forcing_ratio = 1, decoderType="AttentionLSTM")

        # Report the loss of every 500th sample as a progress indicator.
        if count % 500 == 0:
            print("Single sentence Loss (epoch %d) : %f" % (epoch, loss))
        epoch_loss.append(loss)

    # Mean per-sentence loss over the epoch.
    print("Loss (epoch %d) : %f" % (epoch, np.sum(epoch_loss)/len(filtered_sentences)))

## 2.4. Testing Attention Decoder
Note that you will need to modify the inference() procedure for Part 1 to handle Attention LSTM

In [None]:
# Perform inference for all validation sequences and report the average BLEU score
# Skeleton: `output_sentence`, `target_sentence` and `score` are only defined
# once the three commented placeholder lines inside the loop are filled in.
avg_score=[]

# iterate over the validation set 
for idx, input_sentence in enumerate(val_sentences): 
    # output_sentence, _ = inference(...)
    # target_sentence = ... 
    # score = compute_bleu(...)
    avg_score.append(score)
    # Show the first ten reference / prediction pairs for a qualitative check.
    if idx < 10 :
        print('BLEU score distance between \n  "' + target_sentence + '" \nand\n  "'+ output_sentence + '" \n is: ' + str(score) +'\n\n')

# Mean BLEU over the whole validation set.
final_score = np.sum(avg_score)/len(val_sentences)
print("Average BLUE score : %f" % (final_score)) 

# EXPECTED < Average BLUE score (ArgMAX inference): 0.739589 >

## 2.5. Visualize Attention for Attention Decoder

In [None]:
# Your code goes here

# Part 3: Transformer Decoder

## 3.1 Implement Scaled Dot Attention

In [None]:
class ScaledDotAttention(nn.Module):
    """
    Scaled dot-product attention with learned query/key/value projections.

    Holds three hidden_size -> hidden_size linear maps (Q, K, V), a softmax
    over dim 1 and the scaling factor 1 / sqrt(hidden_size). The forward pass
    is still a placeholder.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        # Learned projections for queries, keys and values.
        self.Q = nn.Linear(hidden_size, hidden_size)
        self.K = nn.Linear(hidden_size, hidden_size)
        self.V = nn.Linear(hidden_size, hidden_size)

        self.softmax = nn.Softmax(dim=1)
        # 1 / sqrt(hidden_size), stored as a 0-dim float tensor.
        self.scaling_factor = torch.tensor(self.hidden_size, dtype=torch.float).rsqrt()

    def forward(self, queries, keys, values):
        """Forward pass of the scaled dot attention mechanism.

        Arguments:
            queries: The current decoder hidden state, 2D or 3D tensor. (batch_size x (k) x hidden_size)
            keys: The encoder hidden states for each step of the input sequence. (batch_size x seq_len x hidden_size)
            values: The encoder hidden states for each step of the input sequence. (batch_size x seq_len x hidden_size)

        Returns:
            context: weighted average of the values (batch_size x k x hidden_size)
            attention_weights: Normalized attention weights for each encoder hidden state. (batch_size x seq_len x 1)

            The output must be a softmax weighting over the seq_len annotations.
        """

        # ------------
        # Your code goes here
        # ------------
        # batch_size = 1
        # q = ...
        # k = ...
        # v = ...
        # unnormalized_attention = ...
        # attention_weights = ...
        # context = ...

        return context, attention_weights


## 3.2. Implement Causal Scaled Dot Attention

The implementation should be nearly identical to the one above, but with mask.

In [None]:
class CausalScaledDotAttention(nn.Module):
    """
    Scaled dot-product self-attention for the decoder, to be combined with a
    mask (see the `mask = ...` placeholder in `forward`).

    Holds three hidden_size -> hidden_size linear maps (Q, K, V), a softmax
    over dim 1, the scaling factor 1 / sqrt(hidden_size) and a large negative
    constant `neg_inf`. The forward pass is still a placeholder.
    """

    def __init__(self, hidden_size):
        super(CausalScaledDotAttention, self).__init__()

        self.hidden_size = hidden_size
        # Large negative constant, presumably the fill value for masked
        # positions before the softmax. It is placed on the GPU when one is
        # available and on the CPU otherwise: an unconditional .cuda() makes
        # the constructor raise on CPU-only hosts.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.neg_inf = torch.tensor(-1e7).to(device)

        self.Q = nn.Linear(hidden_size, hidden_size)
        self.K = nn.Linear(hidden_size, hidden_size)
        self.V = nn.Linear(hidden_size, hidden_size)
        self.softmax = nn.Softmax(dim=1)
        # 1 / sqrt(hidden_size), stored as a 0-dim float tensor.
        self.scaling_factor = torch.rsqrt(torch.tensor(self.hidden_size, dtype= torch.float))

    def forward(self, queries, keys, values):
        """The forward pass of the scaled dot attention mechanism.

        NOTES:
            batch_size = 1

        Arguments:
            queries: The current decoder hidden state, 2D or 3D tensor. (batch_size x (k) x hidden_size)
                In training k = maxSequenceLength or length of the GT output sequence
                In testing k = length of currently decoded sub-sequence
            keys: The decoder hidden states for each step of the input sequence. (batch_size x seq_len x hidden_size)
            values: The decoder hidden states for each step of the input sequence. (batch_size x seq_len x hidden_size)

        Returns:
            context: weighted average of the values (batch_size x k x hidden_size)
            attention_weights: Normalized attention weights for each encoder hidden state. (batch_size x seq_len x 1)

            The output must be a softmax weighting over the seq_len annotations.
        """

        # ------------
        # Your code goes here
        # ------------
        # batch_size = 1
        # q = ...
        # k = ...
        # v = ...
        # unnormalized_attention = ...
        # mask = ...
        # attention_weights = ...
        # context = ...

        return context, attention_weights

## 3.3. Implement Transformer Decoder

In [None]:
class TransformerDecoder(nn.Module):
    """
    Decoder built from `num_layers` stacked attention layers.

    Each layer owns a causal self-attention module, an encoder attention
    module and a small Linear + ReLU block. The per-layer computation inside
    `forward` is still a placeholder.
    """

    def __init__(self):
        super().__init__()

        self.hidden_dim = 300
        # Width of the input vectors fed to `linear_input` (local to __init__).
        wordEncodingSize = 2000
        self.dropout_p = 0.1
        self.num_layers = 3
        self.linear_input = nn.Linear(wordEncodingSize, self.hidden_dim)
        self.dropout = nn.Dropout(self.dropout_p)

        # One module of each kind per layer, created in a fixed order.
        self.self_attentions = nn.ModuleList(
            [CausalScaledDotAttention(hidden_size=self.hidden_dim)
             for _ in range(self.num_layers)]
        )
        self.encoder_attentions = nn.ModuleList(
            [ScaledDotAttention(hidden_size=self.hidden_dim)
             for _ in range(self.num_layers)]
        )
        self.attention_mlps = nn.ModuleList(
            [nn.Sequential(nn.Linear(self.hidden_dim, self.hidden_dim), nn.ReLU())
             for _ in range(self.num_layers)]
        )

        self.linear = nn.Linear(self.hidden_dim, vocabularySize)

    def forward(self, input_sentence, hidden, cell, annotations):
        """
        Project the input, run it through every layer and map the result to
        vocabulary scores.

        `hidden` and `cell` are not used by the code shown here.
        `encoder_attention_weights` and `self_attention_weights` must be set
        in every iteration by the code filled in below.

        Returns:
            (output, encoder_attention_weights, self_attention_weights), where
            the two attention tensors stack the per-layer weights.
        """
        projected = self.linear_input(input_sentence)
        # unsqueeze(0) adds a leading dimension in front of the projected input.
        embed = self.dropout(projected).unsqueeze(0)

        encoder_attention_weights_list, self_attention_weights_list = [], []
        contexts = embed
        # Unpacking into three names requires `contexts` to be 3-dimensional.
        batch_size, seq_len, hidden_size = contexts.size()

        for i in range(self.num_layers):
            # ------------
            # Your code goes here
            # ------------


            encoder_attention_weights_list.append(encoder_attention_weights)
            self_attention_weights_list.append(self_attention_weights)

        output = self.linear(contexts)
        stacked_encoder_weights = torch.stack(encoder_attention_weights_list)
        stacked_self_weights = torch.stack(self_attention_weights_list)

        return output, stacked_encoder_weights, stacked_self_weights

## 3.4. Training Transformer Decoder

Note that you will need to modify the train() procedure for Part 1 to handle the Transformer.

In [None]:
# Train the model and monitor the loss. Remember to use Adam optimizer and CrossEntropyLoss
encoder = EncoderLSTM()
decoder = TransformerDecoder()
encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=0.0005)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=0.0005)
criterion = nn.CrossEntropyLoss()
epochs = 5

print("Start training end to end network ......")
for epoch in range(epochs):
    epoch_loss = []
    # Walk the English / Pig Latin pairs in lockstep. `count` is the 1-based
    # number of samples seen this epoch. (Naming the loop index `id` would
    # clobber the builtin for the rest of the notebook session.)
    for count, (sentence, target_variable) in enumerate(
            zip(filtered_sentences, piglatin_filtered_sentences), start=1):
        loss = train(sentence, target_variable, encoder, decoder,encoder_optimizer,decoder_optimizer, criterion, teacher_forcing_ratio = 1, decoderType="Transformer")

        # Report the loss of every 500th sample as a progress indicator.
        if count % 500 == 0:
            print("Single sentence Loss (epoch %d) : %f" % (epoch, loss))
        epoch_loss.append(loss)

    # Mean per-sentence loss over the epoch.
    print("Loss (epoch %d) : %f" % (epoch, np.sum(epoch_loss)/len(filtered_sentences)))

## 3.5. Testing Transformer Decoder
Note that you will need to modify the inference() procedure for Part 1 to handle Transformer

In [None]:
# Perform inference for all validation sequences and report the average BLEU score
# Skeleton: `output_sentence`, `target_sentence` and `score` are only defined
# once the three commented placeholder lines inside the loop are filled in.
avg_score=[]

# iterate over the validation set 
for idx, input_sentence in enumerate(val_sentences): 
    # output_sentence, _ = inference(...)
    # target_sentence = ... 
    # score = compute_bleu(...)
    avg_score.append(score)
    # Show the first ten reference / prediction pairs for a qualitative check.
    if idx < 10 :
        print('BLEU score distance between \n  "' + target_sentence + '" \nand\n  "'+ output_sentence + '" \n is: ' + str(score) +'\n\n')

# Mean BLEU over the whole validation set.
final_score = np.sum(avg_score)/len(val_sentences)
print("Average BLUE score : %f" % (final_score)) 

## 3.6 Visualizing Attention for Transformer Decoder

Note that since we have multiple attention layers, there will be one attention to be visualized per layer. 

# 4. Effectiveness of word2vec

As an option, you may repeat one of the models above by modifying the code to use word2vec embedding for the input English sentences

In [None]:
# Your code goes here