In this notebook, machine translation is performed by using two **deep learning** approaches: a **Recurrent Neural Network (RNN)** and **Transformer**.

The anki data for **Chinese Mandarin to English translation** is trained using **sequence-to-sequence models**. Please refer to the following resources for more details:

1.   https://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural-networks.pdf
2.   https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
3. https://arxiv.org/pdf/1409.0473.pdf

**Performance Analysis:**\
From the model evaluation output obtained from the trainings below, we can see:
- **RNN model** \
Loss 1.5236\
BLEU 1-gram: 0.198950\
BLEU 2-gram: 0.062358\
BLEU 3-gram: 0.048089\
BLEU 4-gram: 0.045925

- **Transformer model**\
Loss 2.7410 \
BLEU 1-gram: 0.218020\
BLEU 2-gram: 0.062562\
BLEU 3-gram: 0.045175\
BLEU 4-gram: 0.041665

Base on this evaluation, the RNN model seems to have a slight edge in overall performance, especially when considering the balance between loss and BLEU scores. However, depending on the specific use case, the Transformer’s ability to capture individual word accuracy (BLEU 1-gram) might still be valuable.


## I. Load Packages

In [1]:
import pandas as pd
import unicodedata
import re
from torch.utils.data import Dataset
import torch
import math
import random
import os

import torch.nn as nn
import torch.nn.functional as F
import time
from tqdm.notebook import tqdm
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction, corpus_bleu

rnn_encoder, rnn_encoder, transformer_encoder, transformer_decoder = None, None, None, None
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if __name__=='__main__':
    print('Using device:', DEVICE)

Using device: cuda


## II. Download and Prepare the Data

### Helper Functions

In [2]:
# Converts the unicode file to ascii
def unicode_to_ascii(s):
    """Normalizes latin chars with accent to their canonical decomposition"""
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

def preprocess_sentence_chn(w):
  w = re.sub(r'([?.!,。？！，])', r' \1 ', w)
  w = ' '.join([c for c in w])
  w = re.sub(r'[" "]+', ' ', w)
  w = ' '.join( [t if t not in '1234567890`~@#$%^&*()_-+={}[];\':"/<>\\|' else '' for t in w.split(' ')] )
  w = unicode_to_ascii(w.lower().strip())
  w = '<start> ' + w + ' <end>'
  return w

def preprocess_sentence_general(w):
    '''
    Preprocess the sentence to add the start, end tokens and make them lower-case
    '''
    w = unicode_to_ascii(w.lower().strip())  # Normalize and lowercase
    w = re.sub(r'([?.!,])', r' \1 ', w)      # Add space around punctuation
    w = re.sub(r'[" "]+', ' ', w)            # Replace multiple spaces with a single space
    w = ' '.join([t if t not in '1234567890`~@#$%^&*()_-+={}[];\':"/<>\\|' else '' for t in w.split(' ')])  # Remove unwanted characters

    w = w.rstrip().strip()                   # Trim any remaining spaces
    w = '<start> ' + w + ' <end>'            # Add start and end tokens
    return w

def preprocess_sentence(w, lang):
    assert lang in ["chn", "general"]
    if lang == "chn":
      w = preprocess_sentence_chn(w)
    elif lang == "general":
      w = preprocess_sentence_general(w)

    w = w.rstrip().strip()
    return w

def pad_sequences(x, max_len):
    padded = np.zeros((max_len), dtype=np.int64)
    if len(x) > max_len:
        padded[:] = x[:max_len]
    else:
        padded[:len(x)] = x
    return padded


def preprocess_data_to_tensor(dataframe, src_vocab, trg_vocab, src_lang, trg_lang):
    # Vectorize the input and target languages
    src_tensor = [[src_vocab.word2idx[s if s in src_vocab.vocab else '<unk>'] for s in es.split(' ')] for es in dataframe[src_lang].values.tolist()]
    trg_tensor = [[trg_vocab.word2idx[s if s in trg_vocab.vocab else '<unk>'] for s in eng.split(' ')] for eng in dataframe[trg_lang].values.tolist()]

    # Calculate the max_length of input and output tensor for padding
    max_length_src, max_length_trg = max(len(t) for t in src_tensor), max(len(t) for t in trg_tensor)
    print('max_length_src: {}, max_length_trg: {}'.format(max_length_src, max_length_trg))

    # Pad all the sentences in the dataset with the max_length
    src_tensor = [pad_sequences(x, max_length_src) for x in src_tensor]
    trg_tensor = [pad_sequences(x, max_length_trg) for x in trg_tensor]

    return src_tensor, trg_tensor, max_length_src, max_length_trg


def train_test_split(src_tensor, trg_tensor):
    '''
    Create training and test sets.
    '''
    total_num_examples = len(src_tensor) - int(0.2*len(src_tensor))
    src_tensor_train, src_tensor_test = src_tensor[:int(0.75*total_num_examples)], src_tensor[int(0.75*total_num_examples):total_num_examples]
    trg_tensor_train, trg_tensor_test = trg_tensor[:int(0.75*total_num_examples)], trg_tensor[int(0.75*total_num_examples):total_num_examples]

    return src_tensor_train, src_tensor_test, trg_tensor_train, trg_tensor_test

In [3]:
# Sanity Check Function
count_parameters = lambda model: sum(p.numel() for p in model.parameters() if p.requires_grad)

def sanityCheckModel(all_test_params, NN, expected_outputs, init_or_forward):
    print('--- TEST: ' + ('Number of Model Parameters (tests __init__(...))' if init_or_forward=='init' else 'Output shape of forward(...)') + ' ---')
    if init_or_forward == "forward":
        # Creating random texts and lables batches
        texts_batch = torch.randint(low=0, high=len(all_test_params[0]['src_vocab']), size=(10,16))
        labels_batch = torch.randint(low=0, high=len(all_test_params[0]['src_vocab']), size=(10,12))

    for tp_idx, (test_params, expected_output) in enumerate(zip(all_test_params, expected_outputs)):
        if init_or_forward == "forward":
            batch_size = test_params['batch_size']
            texts = texts_batch[:batch_size]
            # if NN.__name__ == "RnnEncoder":
            texts = texts.transpose(0,1)

        # Construct the student model
        tps = {k:v for k, v in test_params.items() if k != 'batch_size'}
        stu_nn = NN(**tps)

        input_rep = str({k:v for k,v in tps.items()})

        if init_or_forward == "forward":
            with torch.no_grad():
                if NN.__name__ == "TransformerEncoder":
                    stu_out = stu_nn(texts)
                else:
                    stu_out, _ = stu_nn(texts)
                    expected_output = torch.rand(expected_output).size()
            ref_out_shape = expected_output

            has_passed = torch.is_tensor(stu_out)
            if not has_passed: msg = 'Output must be a torch.Tensor; received ' + str(type(stu_out))
            else:
                has_passed = stu_out.shape == ref_out_shape
                msg = 'Your Output Shape: ' + str(stu_out.shape)


            status = 'PASSED' if has_passed else 'FAILED'
            message = '\t' + status + "\t Init Input: " + input_rep + '\tForward Input Shape: ' + str(texts.shape) + '\tExpected Output Shape: ' + str(ref_out_shape) + '\t' + msg
            print(message)
        else:
            stu_num_params = count_parameters(stu_nn)
            ref_num_params = expected_output
            comparison_result = (stu_num_params == ref_num_params)

            status = 'PASSED' if comparison_result else 'FAILED'
            message = '\t' + status + "\tInput: " + input_rep + ('\tExpected Num. Params: ' + str(ref_num_params) + '\tYour Num. Params: '+ str(stu_num_params))
            print(message)

        del stu_nn

### Evaluation Functions
These functions will be used to evaluate both the RNN and Transformer Models.

In [4]:
def get_reference_candidate(target, pred, trg_vocab):
    def _to_token(sentence):
        lis = []
        for s in sentence[1:]:
            x = trg_vocab.idx2word[s]
            if x == "<end>": break
            lis.append(x)
        return lis
    reference = _to_token(list(target.numpy()))
    candidate = _to_token(list(pred.numpy()))
    return reference, candidate

def compute_bleu_scores(target_tensor_val, target_output, final_output, trg_vocab):
    bleu_1 = 0.0
    bleu_2 = 0.0
    bleu_3 = 0.0
    bleu_4 = 0.0

    smoother = SmoothingFunction()
    save_reference = []
    save_candidate = []
    for i in range(len(target_tensor_val)):
        reference, candidate = get_reference_candidate(target_output[i], final_output[i], trg_vocab)

        bleu_1 += sentence_bleu(reference, candidate, weights=(1,), smoothing_function=smoother.method1)
        bleu_2 += sentence_bleu(reference, candidate, weights=(1/2, 1/2), smoothing_function=smoother.method1)
        bleu_3 += sentence_bleu(reference, candidate, weights=(1/3, 1/3, 1/3), smoothing_function=smoother.method1)
        bleu_4 += sentence_bleu(reference, candidate, weights=(1/4, 1/4, 1/4, 1/4), smoothing_function=smoother.method1)

        save_reference.append(reference)
        save_candidate.append(candidate)

    bleu_1 = bleu_1/len(target_tensor_val)
    bleu_2 = bleu_2/len(target_tensor_val)
    bleu_3 = bleu_3/len(target_tensor_val)
    bleu_4 = bleu_4/len(target_tensor_val)

    scores = {"bleu_1": bleu_1, "bleu_2": bleu_2, "bleu_3": bleu_3, "bleu_4": bleu_4}
    print('BLEU 1-gram: %f' % (bleu_1))
    print('BLEU 2-gram: %f' % (bleu_2))
    print('BLEU 3-gram: %f' % (bleu_3))
    print('BLEU 4-gram: %f' % (bleu_4))

    return save_candidate, scores

### Download and Visualize the Data

In [5]:
lang_path = 'cmn-eng'

os.system(f"wget http://www.manythings.org/anki/{lang_path}.zip")
os.system(f"unzip -o {lang_path}.zip")
src_script, trg_script = "chn", "general"
src_lang, trg_lang = lang_path.split('-')[0], lang_path.split('-')[1]

In [6]:
total_num_examples = 50000
dat = pd.read_csv(f'{src_lang}.txt',
                sep="\t",
                header=None,
                usecols=[0,1],
                names=[f'{trg_lang}', f'{src_lang}'],
                nrows=total_num_examples,
                encoding="UTF-8"
).sample(frac=1).reset_index().drop(['index'], axis=1)

dat # Visualize the data

Unnamed: 0,eng,cmn
0,Better be the head of a cat than the tail of a...,寧為貓頭不為獅尾。
1,Do you have any friends?,你有朋友吗？
2,Any student can answer that question.,任何學生都可以回答這個問題。
3,I can't get rid of my cold.,我的感冒怎麼也不好。
4,My house is close to the school.,我的家离学校很近。
...,...,...
29904,A brass band is marching along the street.,一个管弦乐团沿着路前进
29905,We still have many other things to discuss.,我们还有许多别的事情要讨论。
29906,Tom should've finished it by now.,汤姆现在应该已经完成了。
29907,May I eat something?,我可以吃點什麼嗎？


### Preprocess the data

In [7]:
data = dat.copy()

data[trg_lang] = dat[trg_lang].apply(lambda w: preprocess_sentence(w, trg_script))
data[src_lang] = dat[src_lang].apply(lambda w: preprocess_sentence(w, src_script))
data # Visualizing the data

Unnamed: 0,eng,cmn
0,<start> better be the head of a cat than the t...,<start> 寧 為 貓 頭 不 為 獅 尾 。 <end>
1,<start> do you have any friends ? <end>,<start> 你 有 朋 友 吗 ？ <end>
2,<start> any student can answer that question ....,<start> 任 何 學 生 都 可 以 回 答 這 個 問 題 。 <end>
3,<start> i can't get rid of my cold . <end>,<start> 我 的 感 冒 怎 麼 也 不 好 。 <end>
4,<start> my house is close to the school . <end>,<start> 我 的 家 离 学 校 很 近 。 <end>
...,...,...
29904,<start> a brass band is marching along the str...,<start> 一 个 管 弦 乐 团 沿 着 路 前 进 <end>
29905,<start> we still have many other things to dis...,<start> 我 们 还 有 许 多 别 的 事 情 要 讨 论 。 <end>
29906,<start> tom should've finished it by now . <end>,<start> 汤 姆 现 在 应 该 已 经 完 成 了 。 <end>
29907,<start> may i eat something ? <end>,<start> 我 可 以 吃 點 什 麼 嗎 ？ <end>


### Vocabulary & Dataloader Classes

A class fro managing the vocabulary is created. There is a seperate class for the vocabulary because there are two dfiferent vacabularies - one for source language and one for target language.

Then the dataloader is prepared and return the source sentence and target sentence.

In [8]:
class Vocab_Lang():
    def __init__(self, vocab):
        self.word2idx = {'<pad>': 0, '<unk>': 1}
        self.idx2word = {0: '<pad>', 1: '<unk>'}
        self.vocab = vocab

        for index, word in enumerate(vocab):
            self.word2idx[word] = index + 2 # +2 because of <pad> and <unk> token
            self.idx2word[index + 2] = word

    def __len__(self):
        return len(self.word2idx)

    def __repr__(self):
        if len(self.vocab) <= 5:
            return str(self.vocab)
        else:
            return f'Vocab_Lang object with {len(self.vocab)} words'

class MyData(Dataset):
    def __init__(self, X, y):
        self.length = torch.LongTensor([np.sum(1 - np.equal(x, 0)) for x in X])
        self.data = torch.LongTensor(X)
        self.target = torch.LongTensor(y)

    def __getitem__(self, index):
        x = self.data[index]
        y = self.target[index]
        return x, y

    def __len__(self):
        return len(self.data)

In [9]:
import numpy as np
import random
from torch.utils.data import DataLoader

In [10]:
#adjust hyperparameters
BATCH_SIZE = 64
EMBEDDING_DIM = 256

### Build Vocabulary

In [11]:
def build_vocabulary(pd_dataframe):
    sentences = [sen.split() for sen in pd_dataframe]
    vocab = {}
    for sen in sentences:
        for word in sen:
            if word not in vocab:
                vocab[word] = 1
    return list(vocab.keys())

In [12]:
src_vocab_list = build_vocabulary(data[src_lang])
trg_vocab_list = build_vocabulary(data[trg_lang])

### Instantiate Datasets
The train and test datasets are now instantiated.

In [13]:
src_vocab = Vocab_Lang(src_vocab_list)
trg_vocab = Vocab_Lang(trg_vocab_list)

src_tensor, trg_tensor, max_length_src, max_length_trg = preprocess_data_to_tensor(data, src_vocab, trg_vocab, src_lang, trg_lang)
src_tensor_train, src_tensor_val, trg_tensor_train, trg_tensor_val = train_test_split(src_tensor, trg_tensor)

# create train and val datasets
train_dataset = MyData(src_tensor_train, trg_tensor_train)
train_dataset = DataLoader(train_dataset, batch_size=BATCH_SIZE, drop_last=True, shuffle=True)

test_dataset = MyData(src_tensor_val, trg_tensor_val)
test_dataset = DataLoader(test_dataset, batch_size=BATCH_SIZE, drop_last=True, shuffle=False)

max_length_src: 46, max_length_trg: 36


  self.data = torch.LongTensor(X)


In [14]:
idxes = random.choices(range(len(train_dataset.dataset)), k=5)
src, trg =  train_dataset.dataset[idxes]
print('Source:', src)
print('Source Dimensions: ', src.size())
print('Target:', trg)
print('Target Dimensions: ', trg.size())

Source: tensor([[   2,  103,  101,  151,   86,    1,  152,  690,   31,  265,  158,  490,
           10,   11,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
        [   2,   31,  222,  255,   12,  953,   71,  176,  178,   10,   11,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
        [   2,  147,  101,   90, 2029, 2030,   87,   90,   32, 1788, 1788,   10,
           11,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
        [   2,   12,   13,  750,  117,  290,  751,  1

## III. Train a Recurrent Neural Network (RNN)

Here a recurrent model for machine translation is implemented, and then trained and evaluated its results.

Here are some links related to the tasks:
1. Attention paper: https://arxiv.org/pdf/1409.0473.pdf
2. Explanation of LSTM's & GRU's: https://towardsdatascience.com/illustrated-guide-to-lstms-and-gru-s-a-step-by-step-explanation-44e9eb85bf21
3. Attention explanation: https://towardsdatascience.com/attention-in-neural-networks-e66920838742
4. Another attention explanation: https://towardsdatascience.com/attention-and-its-different-forms-7fc3674d14dc

### Encoder Model
A recurrent model is built here, and a sequence of output of GRU as well as the final hidden state will be returned. These will be used in the decoder.

In [15]:
class RnnEncoder(nn.Module):
    def __init__(self, src_vocab, embedding_dim, hidden_units):
        super(RnnEncoder, self).__init__()
        """
        Args:
            src_vocab: Vocab_Lang, the source vocabulary
            embedding_dim: the dimension of the embedding
            hidden_units: The number of features in the GRU hidden state
        """
        self.src_vocab = src_vocab
        vocab_size = len(src_vocab)

        self.embedding_dim = embedding_dim
        self.hidden_units = hidden_units

        # Initialize embedding layer
        self.embedding = nn.Embedding(vocab_size, self.embedding_dim)

        # Initialize a single directional GRU with 1 layer and batch_first=False
        self.gru = nn.GRU(self.embedding_dim, self.hidden_units, batch_first = False)

    def forward(self, x):
        """
        Args:
            x: source texts, [max_len, batch_size]

        Returns:
            output: [max_len, batch_size, hidden_units]
            hidden_state: [1, batch_size, hidden_units]

        - Pass x through an embedding layer and pass the results through the recurrent net
        - Return output and hidden states from the recurrent net
        """
        embeddings = self.embedding(x)
        output, hidden_state = self.gru(embeddings)

        return output, hidden_state

In [16]:
# Sanity check
if __name__ == '__main__':
    # Set random seed
    torch.manual_seed(42)
    # Create test inputs
    embedding_dim = [2, 5, 8]
    hidden_units = [50, 100, 200]
    sanity_vocab = Vocab_Lang(vocab=["a", "aa", "aaa"])
    params = []
    inputs = []
    for ed in embedding_dim:
        for hu in hidden_units:
            inp = {}
            inp['src_vocab'] = sanity_vocab
            inp['embedding_dim'] = ed
            inp['hidden_units'] = hu
            inputs.append(inp)
    # Test init
    expected_outputs = [8110, 31210, 122410, 8575, 32125, 124225, 9040, 33040, 126040]

    sanityCheckModel(inputs, RnnEncoder, expected_outputs, "init")
    print()

    # Test forward
    inputs = []
    batch_sizes = [1, 2]
    for hu in hidden_units:
        for b in batch_sizes:
            inp = {}
            inp['embedding_dim'] = EMBEDDING_DIM
            inp['src_vocab'] = sanity_vocab
            inp["batch_size"] = b
            inp['hidden_units'] = hu
            inputs.append(inp)
    expected_outputs = [torch.Size([16, 1, 50]), torch.Size([16, 2, 50]), torch.Size([16, 1, 100]), torch.Size([16, 2, 100]), torch.Size([16, 1, 200]), torch.Size([16, 2, 200])]

    sanityCheckModel(inputs, RnnEncoder, expected_outputs, "forward")

--- TEST: Number of Model Parameters (tests __init__(...)) ---
	PASSED	Input: {'src_vocab': ['a', 'aa', 'aaa'], 'embedding_dim': 2, 'hidden_units': 50}	Expected Num. Params: 8110	Your Num. Params: 8110
	PASSED	Input: {'src_vocab': ['a', 'aa', 'aaa'], 'embedding_dim': 2, 'hidden_units': 100}	Expected Num. Params: 31210	Your Num. Params: 31210
	PASSED	Input: {'src_vocab': ['a', 'aa', 'aaa'], 'embedding_dim': 2, 'hidden_units': 200}	Expected Num. Params: 122410	Your Num. Params: 122410
	PASSED	Input: {'src_vocab': ['a', 'aa', 'aaa'], 'embedding_dim': 5, 'hidden_units': 50}	Expected Num. Params: 8575	Your Num. Params: 8575
	PASSED	Input: {'src_vocab': ['a', 'aa', 'aaa'], 'embedding_dim': 5, 'hidden_units': 100}	Expected Num. Params: 32125	Your Num. Params: 32125
	PASSED	Input: {'src_vocab': ['a', 'aa', 'aaa'], 'embedding_dim': 5, 'hidden_units': 200}	Expected Num. Params: 124225	Your Num. Params: 124225
	PASSED	Input: {'src_vocab': ['a', 'aa', 'aaa'], 'embedding_dim': 8, 'hidden_units': 50

### Decoder Model
a Decoder model is implemented that uses an attention mechanism, as provided in https://arxiv.org/pdf/1409.0473.pdf. We have broken this up into three functions that you need to implement: `__init__(self, ...)`, `compute_attention(self, dec_hs, enc_output)`, and `forward(self, x, dec_hs, enc_output)`:

* <b>`__init__(self, ...)`: </b> Instantiate the parameters of your model, and store them in `self` variables.

* <b>`compute_attention(self, dec_hs, enc_output)`</b>: Compute the <b>context vector</b>, which is a weighted sum of the encoder output states. Suppose the decoder hidden state at time $t$ is $\mathbf{h}_t$, and the encoder hidden state at time $s$ is $\mathbf{\bar h}_s$. The pseudocode is as follows:

  1. <b>Attention scores:</b> Compute real-valued scores for the decoder hidden state $\mathbf{h}_t$ and each encoder hidden state $\mathbf{\bar h}_s$: $$\mathrm{score}(\mathbf{h}_t, \mathbf{\bar h}_s)=
      \mathbf{v}_a^T \tanh(\mathbf{W}_1 \mathbf{h}_t +\mathbf{W}_2 \mathbf{\bar h}_s)
$$
   Here the scoring function is implemented. A higher score indicates a stronger "affinity" between the decoder state and a specific encoder state.

   Note that in theory, $\mathbf{v_a}$ could have a different dimension than $\mathbf{h}_t$ and $\mathbf{\bar h}_s$, but you should use the same hidden size for this vector.

 2. <b>Attention weights:</b> Normalize the attention scores to obtain a valid probability distribution: $$\alpha_{ts} = \frac{\exp \big (\mathrm{score}(\mathbf{h}_t, \mathbf{\bar h}_s) \big)}{\sum_{s'=1}^S \exp \big (\mathrm{score}(\mathbf{h}_t, \mathbf{\bar h}_{s'}) \big)}$$ Notice that this is just the softmax function, and can be implemented with `F.softmax(...)` in Pytorch.

 3. <b>Context vector:</b> Compute a context vector $\mathbf{c}_t$ that is a weighted average of the encoder hidden states, where the weights are given by the attention weights you just computed: $$\mathbf{c}_t=\sum_{s=1}^S \alpha_{ts} \mathbf{\bar h}_s$$

  The context vector will be returned, along with the attention weights.


* <b>`forward(self, x, dec_hs, enc_output)`: </b> Run a <b>single</b> decoding step, resulting in a distribution over the vocabulary for the next token in the sequence. Pseudocode can be found in the docstrings below.

In [17]:
class RnnDecoder(nn.Module):
    def __init__(self, trg_vocab, embedding_dim, hidden_units):
        super(RnnDecoder, self).__init__()
        """
        Args:
            trg_vocab: Vocab_Lang, the target vocabulary
            embedding_dim: The dimension of the embedding
            hidden_units: The number of features in the GRU hidden state
        """
        self.trg_vocab = trg_vocab
        vocab_size = len(trg_vocab)

        # Initialize embedding layer
        self.embedding_dim = embedding_dim
        self.hidden_units = hidden_units
        self.embedding = nn.Embedding(vocab_size, self.embedding_dim)

        # Initialize layers to compute attention score
        self.w1 = nn.Linear(self.hidden_units, self.hidden_units)
        self.w2 = nn.Linear(self.hidden_units, self.hidden_units)
        self.v = nn.Linear(self.hidden_units, 1)

        # Initialize a single directional GRU with 1 layer and batch_first=True
        self.gru = nn.GRU(self.hidden_units + self.embedding_dim, self.hidden_units, batch_first=True)

        # Input to RNN will be the concatenation of your embedding vector and the context vector
        # Initialize fully connected layer
        self.fc = nn.Linear(self.hidden_units, vocab_size)

    def compute_attention(self, dec_hs, enc_output):
        '''
        This function computes the context vector and attention weights.

        Args:
            dec_hs: Decoder hidden state; [1, batch_size, hidden_units]
            enc_output: Encoder outputs; [max_len_src, batch_size, hidden_units]

        Returns:
            context_vector: Context vector, according to formula; [batch_size, hidden_units]
            attention_weights: The attention weights you have calculated; [batch_size, max_len_src, 1]

       steps
            (1) Compute the attention scores for dec_hs & enc_output
                    - permute the dimensions of the tensors in order to pass them through linear layers
                    - Output size: [batch_size, max_len_src, 1]
            (2) Compute attention_weights by taking a softmax over your scores to normalize the distribution (Make sure that after softmax the normalized scores add up to 1)
                    - Output size: [batch_size, max_len_src, 1]
            (3) Compute context_vector from attention_weights & enc_output
                    - find it helpful to use torch.sum & element-wise multiplication (* operator)
            (4) Return context_vector & attention_weights
        '''

        decoder_hidden_state = dec_hs.permute(1, 0, 2)
        encoder_outputs = enc_output.permute(1, 0, 2)

        attention_weights = torch.softmax(self.v(torch.tanh(self.w1(decoder_hidden_state) + self.w2(encoder_outputs))), dim=1)
        context_vector = torch.sum(attention_weights * encoder_outputs, dim=1)

        return context_vector, attention_weights

    def forward(self, x, dec_hs, enc_output):
        '''
        This function runs the decoder for a **single** time step.

        Args:
            x: Input token; [batch_size, 1]
            dec_hs: Decoder hidden state; [1, batch_size, hidden_units]
            enc_output: Encoder outputs; [max_len_src, batch_size, hidden_units]

        Returns:
            fc_out: (Unnormalized) output distribution [batch_size, vocab_size]
            dec_hs: Decoder hidden state; [1, batch_size, hidden_units]
            attention_weights: The attention weights you have learned; [batch_size, max_len_src, 1]

        Steps:
            (1) Compute the context vector & attention weights by calling self.compute_attention(...) on the appropriate input
            (2) Obtain embedding vectors for your input x
                    - Output size: [batch_size, 1, embedding_dim]
            (3) Concatenate the context vector & the embedding vectors along the appropriate dimension
            (4) Feed this result through your RNN (along with the current hidden state) to get output and new hidden state
                    - Output sizes: [batch_size, 1, hidden_units] & [1, batch_size, hidden_units]
            (5) Feed the output of your RNN through linear layer to get (unnormalized) output distribution (don't call softmax!)
            (6) Return this output, the new decoder hidden state, & the attention weights
        '''
        fc_out, attention_weights = None, None
        context_vector, attention_weights = self.compute_attention(dec_hs, enc_output)
        embeddings = self.embedding(x)
        embeddings_concat = torch.cat((context_vector.unsqueeze(1), embeddings), -1)

        output, _ = self.gru(embeddings_concat)
        fc_out = self.fc(output).squeeze(1)

        return fc_out, dec_hs, attention_weights

In [18]:
# sanity check
def sanityCheckDecoderModelForward(inputs, NN, expected_outputs):
    print('--- TEST: Output shape of forward(...) ---\n')
    expected_fc_outs = expected_outputs[0]
    expected_dec_hs = expected_outputs[1]
    expected_attention_weights = expected_outputs[2]
    msg = ''
    for i, inp in enumerate(inputs):
        input_rep = '{'
        for k,v in inp.items():
            if torch.is_tensor(v):
                input_rep += str(k) + ': ' + 'Tensor with shape ' + str(v.size()) + ', '
            else:
                input_rep += str(k) + ': ' + str(v) + ', '
        input_rep += '}'
        dec = RnnDecoder(trg_vocab=inp['trg_vocab'],embedding_dim=inp['embedding_dim'],hidden_units=inp['hidden_units'])
        dec_hs = torch.rand(1, inp["batch_size"], inp['hidden_units'])
        x = torch.randint(low=0,high=len(inp["trg_vocab"]),size=(inp["batch_size"], 1))
        with torch.no_grad():
            dec_out = dec(x=x, dec_hs=dec_hs,enc_output=inp['encoder_outputs'])
            if not isinstance(dec_out, tuple):
                msg = '\tFAILED\tYour RnnDecoder.forward() output must be a tuple; received ' + str(type(dec_out))
                print(msg)
                continue
            elif len(dec_out)!=3:
                msg = '\tFAILED\tYour RnnDecoder.forward() output must be a tuple of size 3; received tuple of size ' + str(len(dec_out))
                print(msg)
                continue
            stu_fc_out, stu_dec_hs, stu_attention_weights = dec_out
        del dec
        has_passed = True
        msg = ""
        if not torch.is_tensor(stu_fc_out):
            has_passed = False
            msg += '\tFAILED\tOutput must be a torch.Tensor; received ' + str(type(stu_fc_out)) + " "
        if not torch.is_tensor(stu_dec_hs):
            has_passed = False
            msg += '\tFAILED\tDecoder Hidden State must be a torch.Tensor; received ' + str(type(stu_dec_hs)) + " "
        if not torch.is_tensor(stu_attention_weights):
            has_passed = False
            msg += '\tFAILED\tAttention Weights must be a torch.Tensor; received ' + str(type(stu_attention_weights)) + " "

        status = 'PASSED' if has_passed else 'FAILED'
        if not has_passed:
            message = '\t' + status + "\t Init Input: " + input_rep + '\tForward Input Shape (x): ' + str(os.XATTR_REPLACE.shape) + '\tExpected Output Shape: ' + str(expected_fc_outs[i]) + '\t' + msg
            print(message)
            continue

        has_passed = stu_fc_out.size() == expected_fc_outs[i]
        msg = 'Your Output Shape: ' + str(stu_fc_out.size())
        status = 'PASSED' if has_passed else 'FAILED'
        message = '\t' + status + "\t Init Input: " + input_rep + '\tForward Input Shape (x): ' + str(x.shape) + '\tExpected Output Shape: ' + str(expected_fc_outs[i]) + '\t' + msg
        print(message)

        has_passed = stu_dec_hs.size() == expected_dec_hs[i]
        msg = 'Your Hidden State Shape: ' + str(stu_dec_hs.size())
        status = 'PASSED' if has_passed else 'FAILED'
        message = '\t' + status + "\t Init Input: " + input_rep + '\tForward Input Shape (x): ' + str(x.shape) + '\tExpected Hidden State Shape: ' + str(expected_dec_hs[i]) + '\t' + msg
        print(message)

        has_passed = stu_attention_weights.size() == expected_attention_weights[i]
        msg = 'Your Attention Weights Shape: ' + str(stu_attention_weights.size())
        status = 'PASSED' if has_passed else 'FAILED'
        message = '\t' + status + "\t Init Input: " + input_rep + '\tForward Input Shape (x): ' + str(x.shape) + '\tExpected Attention Weights Shape: ' + str(expected_attention_weights[i]) + '\t' + msg
        print(message)

        stu_sum = stu_attention_weights.sum(dim=1).squeeze()
        if torch.allclose(stu_sum, torch.ones_like(stu_sum), atol=1e-5):
            print('\tPASSED\t The sum of your attention_weights along dim 1 is 1.')
        else:
            print('\tFAILED\t The sum of your attention_weights along dim 1 is not 1.')
        print()

In [19]:
if __name__ == '__main__':
    # Set random seed
    torch.manual_seed(42)
    # Create test inputs
    embedding_dim = [2, 5, 8]
    hidden_units = [50, 100, 200]
    sanity_vocab = Vocab_Lang(vocab=["a", "aa", "aaa"])
    params = []
    inputs = []
    for ed in embedding_dim:
        for hu in hidden_units:
            inp = {}
            inp['trg_vocab'] = sanity_vocab
            inp['embedding_dim'] = ed
            inp['hidden_units'] = hu
            inputs.append(inp)
    # Test init
    expected_outputs = [21016, 82016, 324016, 21481, 82931, 325831, 21946, 83846, 327646]
    sanityCheckModel(inputs, RnnDecoder, expected_outputs, "init")
    print()

    # Test forward
    inputs = []
    hidden_units = [50, 100, 200]
    batch_sizes = [1, 2, 4]
    embedding_dims = iter([50,80,100,120,150,200,300,400,500])
    encoder_outputs = iter([torch.rand([16, 1, 50]), torch.rand([16, 2, 50]), torch.rand([16, 4, 50]), torch.rand([16, 1, 100]), torch.rand([16, 2, 100]), torch.rand([16, 4, 100]), torch.rand([16, 1, 200]), torch.rand([16, 2, 200]),torch.rand([16, 4, 200])])
    expected_fc_outs = [torch.Size([1, 5]),torch.Size([2, 5]),torch.Size([4, 5]),torch.Size([1, 5]),torch.Size([2, 5]),torch.Size([4, 5]),torch.Size([1, 5]),torch.Size([2, 5]),torch.Size([4, 5])]
    expected_dec_hs = [torch.Size([1, 1, 50]), torch.Size([1, 2, 50]), torch.Size([1, 4, 50]), torch.Size([1, 1, 100]), torch.Size([1, 2, 100]), torch.Size([1, 4, 100]), torch.Size([1, 1, 200]), torch.Size([1, 2, 200]), torch.Size([1, 4, 200])]
    expected_attention_weights = [torch.Size([1, 16, 1]), torch.Size([2, 16, 1]), torch.Size([4, 16, 1]), torch.Size([1, 16, 1]), torch.Size([2, 16, 1]), torch.Size([4, 16, 1]), torch.Size([1, 16, 1]), torch.Size([2, 16, 1]), torch.Size([4, 16, 1])]
    expected_outputs = (expected_fc_outs, expected_dec_hs, expected_attention_weights)

    for hu in hidden_units:
        for b in batch_sizes:
            inp = {}
            edim = next(embedding_dims)
            inp['embedding_dim'] = edim
            inp['trg_vocab'] = sanity_vocab
            inp["batch_size"] = b
            inp['hidden_units'] = hu
            inp['encoder_outputs'] = next(encoder_outputs)
            inputs.append(inp)

    sanityCheckDecoderModelForward(inputs, RnnDecoder, expected_outputs)

--- TEST: Number of Model Parameters (tests __init__(...)) ---
	PASSED	Input: {'trg_vocab': ['a', 'aa', 'aaa'], 'embedding_dim': 2, 'hidden_units': 50}	Expected Num. Params: 21016	Your Num. Params: 21016
	PASSED	Input: {'trg_vocab': ['a', 'aa', 'aaa'], 'embedding_dim': 2, 'hidden_units': 100}	Expected Num. Params: 82016	Your Num. Params: 82016
	PASSED	Input: {'trg_vocab': ['a', 'aa', 'aaa'], 'embedding_dim': 2, 'hidden_units': 200}	Expected Num. Params: 324016	Your Num. Params: 324016
	PASSED	Input: {'trg_vocab': ['a', 'aa', 'aaa'], 'embedding_dim': 5, 'hidden_units': 50}	Expected Num. Params: 21481	Your Num. Params: 21481
	PASSED	Input: {'trg_vocab': ['a', 'aa', 'aaa'], 'embedding_dim': 5, 'hidden_units': 100}	Expected Num. Params: 82931	Your Num. Params: 82931
	PASSED	Input: {'trg_vocab': ['a', 'aa', 'aaa'], 'embedding_dim': 5, 'hidden_units': 200}	Expected Num. Params: 325831	Your Num. Params: 325831
	PASSED	Input: {'trg_vocab': ['a', 'aa', 'aaa'], 'embedding_dim': 8, 'hidden_units'

### RNN Model Training
Encoder and decoder models are trained using cross-entropy loss.

In [20]:
def loss_function(real, pred):
    mask = real.ge(1).float() # Only consider non-zero inputs in the loss

    loss_ = F.cross_entropy(pred, real) * mask
    return torch.mean(loss_)

def train_rnn_model(encoder, decoder, dataset, optimizer, trg_vocab, device, n_epochs):
    batch_size = dataset.batch_size
    for epoch in range(n_epochs):
        start = time.time()
        n_batch = 0
        total_loss = 0

        encoder.train()
        decoder.train()

        for src, trg in tqdm(dataset):
            n_batch += 1
            loss = 0

            enc_output, enc_hidden = encoder(src.transpose(0,1).to(device))
            dec_hidden = enc_hidden

            # use teacher forcing - feeding the target as the next input (via dec_input)
            dec_input = torch.tensor([[trg_vocab.word2idx['<start>']]] * batch_size)

            # run code below for every timestep in the ys batch
            for t in range(1, trg.size(1)):
                predictions, dec_hidden, _ = decoder(dec_input.to(device), dec_hidden.to(device), enc_output.to(device))
                assert len(predictions.shape) == 2 and predictions.shape[0] == dec_input.shape[0] and predictions.shape[1] == len(trg_vocab.word2idx), "First output of decoder must have shape [batch_size, vocab_size], you returned shape " + str(predictions.shape)
                loss += loss_function(trg[:, t].to(device), predictions.to(device))
                dec_input = trg[:, t].unsqueeze(1)

            batch_loss = (loss / int(trg.size(1)))
            total_loss += batch_loss

            optimizer.zero_grad()

            batch_loss.backward()

            ### update model parameters
            optimizer.step()

        # Save checkpoint for model (optional)
        print('Epoch:{:2d}/{}\t Loss: {:.4f} \t({:.2f}s)'.format(epoch + 1, n_epochs, total_loss / n_batch, time.time() - start))

    print('Model trained!')

In [21]:
if __name__ == '__main__':
    # HYPERPARAMETERS - feel free to change
    LEARNING_RATE = 0.001
    HIDDEN_UNITS=256
    N_EPOCHS=10

    rnn_encoder = RnnEncoder(src_vocab, EMBEDDING_DIM, HIDDEN_UNITS).to(DEVICE)
    rnn_decoder = RnnDecoder(trg_vocab, EMBEDDING_DIM, HIDDEN_UNITS).to(DEVICE)

    rnn_model_params = list(rnn_encoder.parameters()) + list(rnn_decoder.parameters())
    optimizer = torch.optim.Adam(rnn_model_params, lr=LEARNING_RATE)

    print('Encoder and Decoder models initialized!')

Encoder and Decoder models initialized!


In [22]:
if __name__ == '__main__':
    train_rnn_model(rnn_encoder, rnn_decoder, train_dataset, optimizer, trg_vocab, DEVICE, N_EPOCHS)

  0%|          | 0/280 [00:00<?, ?it/s]

Epoch: 1/10	 Loss: 0.9722 	(27.42s)


  0%|          | 0/280 [00:00<?, ?it/s]

Epoch: 2/10	 Loss: 0.7381 	(26.11s)


  0%|          | 0/280 [00:00<?, ?it/s]

Epoch: 3/10	 Loss: 0.6223 	(26.19s)


  0%|          | 0/280 [00:00<?, ?it/s]

Epoch: 4/10	 Loss: 0.5297 	(26.18s)


  0%|          | 0/280 [00:00<?, ?it/s]

Epoch: 5/10	 Loss: 0.4500 	(26.19s)


  0%|          | 0/280 [00:00<?, ?it/s]

Epoch: 6/10	 Loss: 0.3808 	(26.38s)


  0%|          | 0/280 [00:00<?, ?it/s]

Epoch: 7/10	 Loss: 0.3198 	(26.54s)


  0%|          | 0/280 [00:00<?, ?it/s]

Epoch: 8/10	 Loss: 0.2679 	(26.51s)


  0%|          | 0/280 [00:00<?, ?it/s]

Epoch: 9/10	 Loss: 0.2239 	(26.15s)


  0%|          | 0/280 [00:00<?, ?it/s]

Epoch:10/10	 Loss: 0.1866 	(26.36s)
Model trained!


### Inference (Decoding) Function
After the model is trained, it can be used on test data. Here is a function that takes the trained model and a source sentence, and returns its translation to the target language. Instead of using teacher forcing, the input to the decoder at time $t_i$ will be the prediction of the decoder at time $t_{i-1}$.

In [40]:
def decode_rnn_model(encoder, decoder, src, max_decode_len, device):
    """
    Args:
        encoder: RnnEncoder object
        decoder: RnnDecoder object
        src: [max_src_length, batch_size] the source sentences to translate
        max_decode_len: The maximum desired length (int) of the target translated sentences
        device: the device the torch tensors are on (need to call x.to(device) for some of your tensors)

    Returns:
        curr_output: [batch_size, max_decode_len] containing your predicted translated sentences
        curr_predictions: [batch_size, max_decode_len, trg_vocab_size] containing the (unnormalized) probabilities of each
            token in the vocabulary at each time step
    """
    # Initialize variables
    trg_vocab = decoder.trg_vocab
    batch_size = src.size(1)

    # Initialize output tensors on the correct device
    curr_output = torch.zeros((batch_size, max_decode_len), device=device)
    curr_predictions = torch.zeros((batch_size, max_decode_len, len(trg_vocab.idx2word)), device=device)

    # We start the decoding with the start token for each example
    dec_input = torch.tensor([[trg_vocab.word2idx['<start>']]] * batch_size, device=device)
    curr_output[:, 0] = dec_input.squeeze(1)

    # Implement decoding algorithm
    enc_output, enc_hs = encoder(src.to(device))
    hidden = enc_hs

    for t in range(1, max_decode_len):
        predictions, hidden, weights = decoder(dec_input, hidden, enc_output)
        curr_predictions[:, t, :] = predictions
        dec_input = torch.argmax(predictions, 1).unsqueeze(1)
        curr_output[:, t] = dec_input.squeeze(1)

    return curr_output, curr_predictions


Here are some examples of sentences generated by the model compared with correct translations from the source language to target language.

In [44]:
if __name__ == '__main__':
    rnn_encoder.eval()
    rnn_decoder.eval()
    idxes = random.choices(range(len(test_dataset.dataset)), k=5)
    src, trg =  train_dataset.dataset[idxes]
    curr_output, _ = decode_rnn_model(rnn_encoder, rnn_decoder, src.transpose(0,1).to(DEVICE), trg.size(1), DEVICE)
    for i in range(len(src)):
        print("Source sentence:", ' '.join([x for x in [src_vocab.idx2word[j.item()] for j in src[i]] if x != '<pad>']))
        print("Target sentence:", ' '.join([x for x in [trg_vocab.idx2word[j.item()] for j in trg[i]] if x != '<pad>']))
        print("Predicted sentence:", ' '.join([x for x in [trg_vocab.idx2word[j.item()] for j in curr_output[i]] if x != '<pad>']))
        print("----------------")


Source sentence: <start> 汤 姆 被 玛 丽 说 的 话 给 难 倒 了 。 <end>
Target sentence: <start> tom was puzzled by what mary said . <end>
Predicted sentence: <start> tom was puzzled by what mary was puzzled by what mary was puzzled by what mary was puzzled by what mary was puzzled by what mary was puzzled by what mary was puzzled by what
----------------
Source sentence: <start> 这 个 位 子 空 着 吗 ？ <end>
Target sentence: <start> is this seat available ? <end>
Predicted sentence: <start> is this flower ? <end>
----------------
Source sentence: <start> 叫 救 护 车 。 <end>
Target sentence: <start> call an ambulance . <end>
Predicted sentence: <start> call a car . <end>
----------------
Source sentence: <start> 我 从 窗 口 爬 进 来 的 。 <end>
Target sentence: <start> i climbed in through the window . <end>
Predicted sentence: <start> i took it down from the window . <end>
----------------
Source sentence: <start> 会 议 结 束 了 。 <end>
Target sentence: <start> the meeting ended . <end>
Predicted sentence: <start> the meetin

### RNN Model Evaluation
Here is the function to run the test set through the model and calcualte BLEU scores. The BLEU scores are expected to satisfy the following conditions:
*   BLEU-1 > 0.290
*   BLEU-2 > 0.081
*   BLEU-3 > 0.059
*   BLEU-4 > 0.056

In [45]:
def evaluate_rnn_model(encoder, decoder, test_dataset, target_tensor_val, device):
    trg_vocab = decoder.trg_vocab
    batch_size = test_dataset.batch_size
    n_batch = 0
    total_loss = 0

    encoder.eval()
    decoder.eval()

    final_output, target_output = None, None

    with torch.no_grad():
        for batch, (src, trg) in enumerate(test_dataset):
            n_batch += 1
            loss = 0
            curr_output, curr_predictions = decode_rnn_model(encoder, decoder, src.transpose(0,1).to(device), trg.size(1), device)
            for t in range(1, trg.size(1)):
                loss += loss_function(trg[:, t].to(device), curr_predictions[:,t,:].to(device))

            if final_output is None:
                final_output = torch.zeros((len(target_tensor_val), trg.size(1)))
                target_output = torch.zeros((len(target_tensor_val), trg.size(1)))
            final_output[batch*batch_size:(batch+1)*batch_size] = curr_output
            target_output[batch*batch_size:(batch+1)*batch_size] = trg
            batch_loss = (loss / int(trg.size(1)))
            total_loss += batch_loss

        print('Loss {:.4f}'.format(total_loss / n_batch))

    # Compute BLEU scores
    return compute_bleu_scores(target_tensor_val, target_output, final_output, trg_vocab)

In [46]:
if __name__ == '__main__':
    rnn_save_candidate, rnn_scores = evaluate_rnn_model(rnn_encoder, rnn_decoder, test_dataset, trg_tensor_val, DEVICE)

Loss 1.5236
BLEU 1-gram: 0.198950
BLEU 2-gram: 0.062358
BLEU 3-gram: 0.048089
BLEU 4-gram: 0.045925


## IV. Train a Transformer
Here a transformer model for machine translation is implemented, and then trained and evaluated its results.
Here are some links related to the tasks:
<ul>
<li> Original transformer paper: https://arxiv.org/pdf/1706.03762.pdf
<li> Helpful tutorial: http://jalammar.github.io/illustrated-transformer/
<li> Another tutorial: http://peterbloem.nl/blog/transformers
</ul>

### Positional Embeddings
A key component of the transformer's encoder model is the Positional Embedding. Word embeddings encode words in a way that words with similar meaning have similar vectors. Because there are no recurrences in a Transformer, we need a way to tell the transformer the relative position of words in a sentence: so will add a positional embedding to the word embeddings. Now, two words with a similar embedding will both be close in meaning and occur near each other in the sentence.

Here a positional embedding matrix of size $(max\_len, embed\_dim)$ is created by using the following formulas:
<br>
$\begin{align*} pe[pos,2i] &= \sin \Big (\frac{pos}{10000^{2i/embed\_dim}}\Big )\\pe[pos,2i+1] &= \cos \Big (\frac{pos}{10000^{2i/embed\_dim}}\Big ) \end{align*}$

In [47]:
def create_positional_embedding(max_len, embed_dim):
    '''
    Args:
        max_len: The maximum length supported for positional embeddings
        embed_dim: The size of your embeddings
    Returns:
        pe: [max_len, 1, embed_dim] computed as in the formulae above
    '''
    pe = torch.zeros(max_len, embed_dim)
    pos = torch.arange(0, max_len).unsqueeze(1)
    div_term = torch.exp((torch.arange(0, embed_dim, 2, dtype=torch.float) * (-math.log(10000.0) / embed_dim)))
    pe[:, 0::2] = torch.sin(pos.float() * div_term)
    pe[:, 1::2] = torch.cos(pos.float() * div_term)
    pe = pe.unsqueeze(0).transpose(0,1)

    return pe

### Encoder Model

In [48]:
class TransformerEncoder(nn.Module):
    def __init__(self, src_vocab, embedding_dim, num_heads,
        num_layers, dim_feedforward, max_len_src, device, dropout=0.1):
        super(TransformerEncoder, self).__init__()
        self.device = device
        """
        Args:
            src_vocab: Vocab_Lang, the source vocabulary
            embedding_dim: the dimension of the embedding (also the number of expected features for the input of the Transformer)
            num_heads: The number of attention heads
            num_layers: the number of Transformer Encoder layers
            dim_feedforward: the dimension of the feedforward network models in the Transformer
            max_len_src: maximum length of the source sentences
            device: the working device (you may need to map your postional embedding to this device)
            dropout: the dropout to be applied. Default=0.1.
        """
        self.src_vocab = src_vocab
        src_vocab_size = len(src_vocab)

        # Create positional embedding matrix
        self.position_embedding = create_positional_embedding(max_len_src, embedding_dim).to(device)
        self.register_buffer('positional_embedding', self.position_embedding) # this informs the model that position_embedding is not a learnable parameter

        # Initialize embedding layer
        self.embedding = nn.Embedding(src_vocab_size, embedding_dim)

        # Dropout layer
        self.dropout = nn.Dropout()

        # Initialize a nn.TransformerEncoder model (use embedding_dim,
        # num_layers, num_heads, & dim_feedforward here)
        enc_model = nn.TransformerEncoderLayer(embedding_dim, num_heads, dim_feedforward)
        self.transformer_encoder = nn.TransformerEncoder(enc_model, num_layers).to(device)

    def make_src_mask(self, src):
        """
        Args:
            src: [max_len, batch_size]
        Returns:
            Boolean matrix of size [batch_size, max_len] indicating which indices are padding
        """
        assert len(src.shape) == 2, 'src must have exactly 2 dimensions'
        src_mask = src.transpose(0, 1) == 0 # padding idx
        return src_mask.to(self.device) # [batch_size, max_src_len]

    def forward(self, x):
        """
        Args:
            x: [max_len, batch_size]
        Returns:
            output: [max_len, batch_size, embed_dim]
        Steps (note: x refers to the original input to this function throughout the pseudo-code):
        - Pass x through the word embedding
        - Add positional embedding to the word embedding, then apply dropout
        - Call make_src_mask(x) to compute a mask: this tells us which indexes in x
          are padding, which we want to ignore for the self-attention
        - Call the encoder, with src_key_padding_mask = src_mask
        """

        embedding = self.embedding(x).to(self.device)
        embedding = self.dropout(embedding + self.position_embedding[:embedding.size(0)])
        embedding_mask = self.make_src_mask(x)

        output = self.transformer_encoder(embedding, src_key_padding_mask=embedding_mask)

        return output

### Decoder Model

In [49]:
class TransformerDecoder(nn.Module):
    def __init__(self, trg_vocab, embedding_dim, num_heads,
        num_layers, dim_feedforward, max_len_trg, device, dropout=0.1):
        super(TransformerDecoder, self).__init__()
        self.device = device
        """
        Args:
            trg_vocab: Vocab_Lang, the target vocabulary
            embedding_dim: the dimension of the embedding (also the number of expected features for the input of the Transformer)
            num_heads: The number of attention heads
            num_layers: the number of Transformer Decoder layers
            dim_feedforward: the dimension of the feedforward network models in the Transformer
            max_len_trg: maximum length of the target sentences
            device: the working device (you may need to map your postional embedding to this device)
            dropout: the dropout to be applied. Default=0.1.
        """
        self.trg_vocab = trg_vocab
        trg_vocab_size = len(trg_vocab)

        # Create positional embedding matrix
        self.position_embedding = create_positional_embedding(max_len_trg, embedding_dim).to(self.device)
        self.register_buffer('positional_embedding', self.position_embedding) # this informs the model that positional_embedding is not a learnable parameter

        # Initialize embedding layer
        self.embedding = nn.Embedding(trg_vocab_size, embedding_dim)

        # Dropout layer
        self.dropout = nn.Dropout()

        # Initialize a nn.TransformerDecoder model (you'll need to use embedding_dim,
        # num_layers, num_heads, & dim_feedforward here)
        decoder_model = nn.TransformerDecoderLayer(embedding_dim, num_heads, dim_feedforward).to(self.device)
        self.transfomer_decoder = nn.TransformerDecoder(decoder_model,num_layers)

        # Final fully connected layer
        self.fc = nn.Linear(embedding_dim,trg_vocab_size)

    def generate_square_subsequent_mask(self, sz):
        """Generate a square mask for the sequence. The masked positions are filled with float('-inf').
            Unmasked positions are filled with float(0.0).
        """
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)).to(self.device)
        return mask

    def forward(self, dec_in, enc_out):
        """
        Args:
            dec_in: [sequence length, batch_size]
            enc_out: [max_len, batch_size, embed_dim]
        Returns:
            output: [sequence length, batch_size, trg_vocab_size]
        Steps:
        - Compute input word and positional embeddings in similar manner to encoder
        - Call generate_square_subsequent_mask() to compute a mask: this time,
          the mask is to prevent the decoder from attending to tokens in the "future".
          In other words, at time step i, the decoder should only attend to tokens
          1 to i-1.
        - Call the decoder, with tgt_mask = trg_mask
        - Run the output through the fully-connected layer and return it
        """
        embedding = self.embedding(dec_in.to(self.device))
        embedding = self.dropout(embedding + self.position_embedding[:embedding.size(0)])

        trg_mask = self.generate_square_subsequent_mask(dec_in.size(0))
        output = self.transfomer_decoder(embedding, enc_out, tgt_mask=trg_mask)

        output = self.fc(output)

        return output

### Transformer Model Training

In [50]:
def train_transformer_model(encoder, decoder, optimizer, device, n_epochs):
    encoder.train()
    decoder.train()
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    for epoch in range(n_epochs):
        start = time.time()
        losses = []

        for src, trg in tqdm(train_dataset):

            src = src.to(device).transpose(0,1) # [max_src_length, batch_size]
            trg = trg.to(device).transpose(0,1) # [max_trg_length, batch_size]

            enc_out = encoder(src)
            output = decoder(trg[:-1, :], enc_out)

            output = output.reshape(-1, output.shape[2])
            trg = trg[1:].reshape(-1)

            optimizer.zero_grad()

            loss = criterion(output, trg)
            losses.append(loss.item())

            loss.backward()

            # Clip to avoid exploding grading issues
            torch.nn.utils.clip_grad_norm_(encoder.parameters(), max_norm=1)
            torch.nn.utils.clip_grad_norm_(decoder.parameters(), max_norm=1)

            optimizer.step()

        mean_loss = sum(losses) / len(losses)
        print('Epoch:{:2d}/{}\t Loss:{:.4f} ({:.2f}s)'.format(epoch + 1, n_epochs, mean_loss, time.time() - start))

In [51]:
if __name__ == '__main__':
    # HYPERPARAMETERS - feel free to change
    LEARNING_RATE = 0.001
    DIM_FEEDFORWARD=512
    N_EPOCHS=10
    N_HEADS=2
    N_LAYERS=2
    DROPOUT=0.1

    transformer_encoder = TransformerEncoder(src_vocab, EMBEDDING_DIM, N_HEADS,
                                 N_LAYERS,DIM_FEEDFORWARD,
                                 max_length_src, DEVICE, DROPOUT).to(DEVICE)
    transformer_decoder = TransformerDecoder(trg_vocab, EMBEDDING_DIM, N_HEADS,
                              N_LAYERS,DIM_FEEDFORWARD,
                              max_length_trg, DEVICE, DROPOUT).to(DEVICE)

    transformer_model_params = list(transformer_encoder.parameters()) + list(transformer_decoder.parameters())
    optimizer = torch.optim.Adam(transformer_model_params, lr=LEARNING_RATE)

    print('Encoder and Decoder models initialized!')

Encoder and Decoder models initialized!




In [52]:
if __name__ == '__main__':
    train_transformer_model(transformer_encoder, transformer_decoder, optimizer, DEVICE, N_EPOCHS)

  0%|          | 0/280 [00:00<?, ?it/s]

Epoch: 1/10	 Loss:4.5764 (9.73s)


  0%|          | 0/280 [00:00<?, ?it/s]

Epoch: 2/10	 Loss:3.7332 (8.80s)


  0%|          | 0/280 [00:00<?, ?it/s]

Epoch: 3/10	 Loss:3.3813 (8.91s)


  0%|          | 0/280 [00:00<?, ?it/s]

Epoch: 4/10	 Loss:3.1082 (9.03s)


  0%|          | 0/280 [00:00<?, ?it/s]

Epoch: 5/10	 Loss:2.8887 (9.01s)


  0%|          | 0/280 [00:00<?, ?it/s]

Epoch: 6/10	 Loss:2.7011 (9.07s)


  0%|          | 0/280 [00:00<?, ?it/s]

Epoch: 7/10	 Loss:2.5197 (9.14s)


  0%|          | 0/280 [00:00<?, ?it/s]

Epoch: 8/10	 Loss:2.3625 (9.24s)


  0%|          | 0/280 [00:00<?, ?it/s]

Epoch: 9/10	 Loss:2.2142 (9.51s)


  0%|          | 0/280 [00:00<?, ?it/s]

Epoch:10/10	 Loss:2.0799 (9.25s)


### Inference (Decoding) Function
After the model is trained, it can be used on test data. Here is a function that takes the trained transformer model and a source sentence, and returns its translation. Like the RNN, we use the prediction of the decoder as the input to the decoder for the sequence of outputs. For the RNN, at time step $t_i$ the decoder takes the hidden state $h_{i-1}$ and the previous prediction $w_{i-1}$ at each time step. However, because the transformer does not use recurrences, we do not pass a hidden state; instead, at time step $t_i$ we pass $w_1,w_2 \cdots w_{i-1}$, which is the entire sequence predicted so far.

In [53]:
def decode_transformer_model(encoder, decoder, src, max_decode_len, device):
    """
    Args:
        encoder: Your TransformerEncoder object
        decoder: Your TransformerDecoder object
        src: [max_src_length, batch_size] the source sentences you wish to translate
        max_decode_len: The maximum desired length (int) of your target translated sentences
        device: the device your torch tensors are on (you may need to call x.to(device) for some of your tensors)

    Returns:
        curr_output: [batch_size, max_decode_len] containing your predicted translated sentences
        curr_predictions: [batch_size, max_decode_len, trg_vocab_size] containing the (unnormalized) probabilities of each
            token in your vocabulary at each time step

    Steps:
    - Obtain encoder output by encoding src sentences
    - For 1 ≤ t ≤ max_decode_len:
        - Obtain dec_input as the best words so far for previous time steps (you can get this from curr_output)
        - Obtain your (unnormalized) prediction probabilities by feeding dec_input and encoder output to decoder
        - Save your (unnormalized) prediction probabilities in curr_predictions at index t
        - Calculate the most likely (highest probability) token and save in curr_output at timestep t
    """
    # Initialize variables
    trg_vocab = decoder.trg_vocab
    batch_size = src.size(1)
    curr_output = torch.zeros((batch_size, max_decode_len))
    curr_predictions = torch.zeros((batch_size, max_decode_len, len(trg_vocab.idx2word)))
    enc_output = None

    # We start the decoding with the start token for each example
    dec_input = torch.tensor([[trg_vocab.word2idx['<start>']]] * batch_size).transpose(0,1)
    curr_output[:, 0] = dec_input.squeeze(1)

    # Implement decoding algorithm
    enc_output = encoder(src)

    for t in range(0,max_decode_len - 1):
      dec_in = curr_output[:,:t+1].transpose(0,1)
      predictation = decoder(dec_in.to(torch.int32).to(device), enc_output)[-1]

      curr_predictions[:,t+1,:] = predictation
      curr_output[:,t+1] = torch.argmax(predictation, dim =1)

    return curr_output, curr_predictions, enc_output

In [54]:
if __name__ == '__main__':
    transformer_encoder.eval()
    transformer_decoder.eval()
    idxes = random.choices(range(len(test_dataset.dataset)), k=5)
    src, trg =  train_dataset.dataset[idxes]
    curr_output, _, _ = decode_transformer_model(transformer_encoder, transformer_decoder, src.transpose(0,1).to(DEVICE), trg.size(1), DEVICE)
    for i in range(len(src)):
        print("Source sentence:", ' '.join([x for x in [src_vocab.idx2word[j.item()] for j in src[i]] if x != '<pad>']))
        print("Target sentence:", ' '.join([x for x in [trg_vocab.idx2word[j.item()] for j in trg[i]] if x != '<pad>']))
        print("Predicted sentence:", ' '.join([x for x in [trg_vocab.idx2word[j.item()] for j in curr_output[i]] if x != '<pad>']))
        print("----------------")

Source sentence: <start> 你 们 共 住 一 个 房 间 吗 ？ <end>
Target sentence: <start> do you share a room ? <end>
Predicted sentence: <start> do you share a room ? <end> ? <end> ? <end> ? <end> ? <end> ? <end> ? <end> ? <end> ? <end> ? <end> ? <end> ? <end> ? <end> ? <end> ? <end>
----------------
Source sentence: <start> 汤 姆 不 知 道 玛 丽 说 的 是 真 话 还 是 假 话 。 <end>
Target sentence: <start> tom doesn't know whether mary is telling the truth or not . <end>
Predicted sentence: <start> tom didn't know mary was really handsome . <end> . <end> . <end> . <end> . <end> . <end> . <end> . <end> . <end> . <end> . <end> . <end> . <end> . <end>
----------------
Source sentence: <start> 他 們 在 小 學 時 就 是 朋 友 了 。 <end>
Target sentence: <start> they became friends in elementary school . <end>
Predicted sentence: <start> they studied english . <end> . <end> . <end> . <end> . <end> . <end> . <end> . <end> . <end> . <end> . <end> . <end> . <end> . <end> . <end> . <end>
----------------
Source sentence: <start> 我 們 沒 有 看

### Transformer Model Evaluation
Here is the function to run the test set through the model and calcualte BLEU scores. The BLEU scores are expected to satisfy the following conditions:
*   BLEU-1 > 0.290
*   BLEU-2 > 0.081
*   BLEU-3 > 0.059
*   BLEU-4 > 0.056

In [55]:
def evaluate_model(encoder, decoder, test_dataset, target_tensor_val, device):
    trg_vocab = decoder.trg_vocab
    batch_size = test_dataset.batch_size
    n_batch = 0
    total_loss = 0

    encoder.eval()
    decoder.eval()
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    losses=[]
    final_output, target_output = None, None

    with torch.no_grad():
        for batch, (src, trg) in enumerate(test_dataset):
            n_batch += 1
            loss = 0

            src, trg = src.transpose(0,1).to(device), trg.transpose(0,1).to(device)
            curr_output, curr_predictions, enc_out = decode_transformer_model(encoder, decoder, src, trg.size(0), device)

            for t in range(1, trg.size(0)):
                output = decoder(trg[:-1, :], enc_out)
                output = output.reshape(-1, output.shape[2])
                loss_trg = trg[1:].reshape(-1)
                loss += criterion(output, loss_trg)
                # loss += criterion(curr_predictions[:,t,:].to(device), trg[t,:].reshape(-1).to(device))

            if final_output is None:
                final_output = torch.zeros((len(target_tensor_val), trg.size(0)))
                target_output = torch.zeros((len(target_tensor_val), trg.size(0)))

            final_output[batch*batch_size:(batch+1)*batch_size] = curr_output
            target_output[batch*batch_size:(batch+1)*batch_size] = trg.transpose(0,1)
            losses.append(loss.item() / (trg.size(0)-1))

        mean_loss = sum(losses) / len(losses)
        print('Loss {:.4f}'.format(mean_loss))

    # Compute Bleu scores
    return compute_bleu_scores(target_tensor_val, target_output, final_output, trg_vocab)

In [56]:
if __name__ == '__main__':
    transformer_save_candidate, transformer_scores = evaluate_model(transformer_encoder, transformer_decoder, test_dataset, trg_tensor_val, DEVICE)

Loss 2.7410
BLEU 1-gram: 0.218020
BLEU 2-gram: 0.062562
BLEU 3-gram: 0.045175
BLEU 4-gram: 0.041665
