# Imports 

In [2]:
import os
from os.path import exists
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
import copy
import time
from torch.optim.lr_scheduler import LambdaLR
import pandas as pd
import altair as alt
from torch.utils.data import DataLoader

import warnings
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP

# Optional imports - only import if available
try:
    import spacy
except ImportError:
    spacy = None

try:
    import GPUtil
except ImportError:
    GPUtil = None

# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
# Set to False to skip notebook execution (e.g. for debugging)
RUN_EXAMPLES = True

# Useful Functions
Some convenience helper functions used throughout the notebook

In [3]:
def is_interactive_notebook():
    """Report whether this module is running as the top-level program.

    Returns:
        bool: True when the module's ``__name__`` is ``"__main__"`` and
        False otherwise (for example when the module is imported).
    """
    running_as_main = "__main__" == __name__
    return running_as_main


def show_example(fn, args=()):
    """Call ``fn(*args)`` and return its result, but only when examples are enabled.

    The call happens only if this module is the top-level program
    (``__name__ == "__main__"``) and the module-level ``RUN_EXAMPLES`` flag
    is truthy; otherwise ``fn`` is not called at all.

    Args:
        fn: Callable to invoke.
        args: Iterable of positional arguments unpacked into ``fn``.
            Defaults to an empty tuple, an immutable default that cannot be
            shared and mutated between calls.

    Returns:
        Whatever ``fn`` returns, or ``None`` when the example is skipped.
    """
    # The ``__name__`` test stays first so that it short-circuits before
    # RUN_EXAMPLES is looked up when the module is merely imported.
    if __name__ == "__main__" and RUN_EXAMPLES:
        return fn(*args)
    return None


def execute_example(fn, args=()):
    """Call ``fn(*args)`` for its side effects, but only when examples are enabled.

    The call happens only if this module is the top-level program
    (``__name__ == "__main__"``) and the module-level ``RUN_EXAMPLES`` flag
    is truthy. The result of ``fn`` is discarded.

    Args:
        fn: Callable to invoke.
        args: Iterable of positional arguments unpacked into ``fn``.
            Defaults to an empty tuple, an immutable default that cannot be
            shared and mutated between calls.

    Returns:
        None, always.
    """
    # The ``__name__`` test stays first so that it short-circuits before
    # RUN_EXAMPLES is looked up when the module is merely imported.
    if __name__ == "__main__" and RUN_EXAMPLES:
        fn(*args)


class DummyOptimizer(torch.optim.Optimizer):
    """No-op stand-in for an optimizer.

    Exposes a single parameter group whose learning rate is 0, and ``step`` /
    ``zero_grad`` methods that do nothing. The base-class constructor is not
    called, so no model parameters are needed to build one.
    """

    def __init__(self):
        # Only ``param_groups`` is populated; nothing else is initialised.
        self.param_groups = [dict(lr=0)]

    def step(self):
        """Do nothing and return ``None``."""
        return None

    def zero_grad(self, set_to_none=False):
        """Do nothing and return ``None``; ``set_to_none`` is accepted but ignored."""
        return None


class DummyScheduler:
    """No-op stand-in for a learning-rate scheduler."""

    def step(self):
        """Do nothing and return ``None``."""
        return None

# Encoder-Decoder Structure  
The encoder-decoder structure allows the input and output sequences to have different lengths, so the sequences do not have to be padded to match each other, which could otherwise cause information to be lost. Because the encoder and the decoder are separate components, each of them can be embedded as a tool in other algorithms, and the structure can be used in multimodal learning by first changing different types of information into sequences, for example by converting images or audio files into their binary form. In addition, training the encoder and decoder separately can significantly cut the cost of training, making the whole system more efficient.  
In the Transformer, each encoder and decoder layer has two main sections, Self-Attention and a Feed-Forward Neural Network, both of which are described below.

In [4]:
class EncoderDecoder(nn.Module):
    """
    Generic encoder-decoder wrapper.

    Holds five components supplied by the caller: an encoder, a decoder,
    a source embedding, a target embedding and a generator. The generator
    is only stored on the module; forward() does not call it.
    """

    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator

    def forward(self, src, tgt, src_mask, tgt_mask):
        """Encode src, then decode tgt against the encoder output."""
        memory = self.encode(src, src_mask)
        return self.decode(memory, src_mask, tgt, tgt_mask)

    def encode(self, src, src_mask):
        """Embed src and pass the embedding, with src_mask, to the encoder."""
        embedded_src = self.src_embed(src)
        return self.encoder(embedded_src, src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        """Embed tgt and pass it to the decoder with memory and both masks."""
        embedded_tgt = self.tgt_embed(tgt)
        return self.decoder(embedded_tgt, memory, src_mask, tgt_mask)


class Generator(nn.Module):
    """Final generation step: a linear projection followed by log-softmax."""

    def __init__(self, d_model, vocab):
        """
        Build the linear projection x * W^T + b (W is the weight, b the bias).

        d_model is the size of the last dimension of the input tensor and
        vocab is the size of the last dimension of the output tensor.
        """
        super().__init__()
        self.proj = nn.Linear(in_features=d_model, out_features=vocab)

    def forward(self, x):
        """
        Project x and return log-probabilities over the last dimension,
        i.e. log(exp(x_i) / sum_j exp(x_j)) of the projected values.
        """
        logits = self.proj(x)
        return F.log_softmax(logits, dim=-1)

## Encoder
The encoder is a tool used for changing input sequences (discrete numbers) into tensors (continuous variables that can be easily processed by GPUs). The encoder processes the sequence token by token and captures each token's relationship with the relevant content before and after it in order to produce a vector (tensor) for that token. This means the representation of each token can be deduced from the other tokens, and the sequences do not have to be stored at a fixed length. The mathematical expression is as follows: 

Changing from discrete quantities: $$(x_1, x_2, x_3, \dots)$$ to a continuous function:
$$f(x_1, x_2, x_3, \dots)$$
The encoder may lose some of the detail of the whole input: imagine a person reading a book, who remembers the general concept rather than every single word. The solutions that have since been developed for this problem will be explained later.

In [None]:
class Encoder(nn.Module):
    """Encoder core: a stack of N layers followed by one final LayerNorm."""

    def __init__(self, layer, N):
        super().__init__()
        # clones() is defined elsewhere in the notebook; presumably it
        # returns N independent deep copies of ``layer`` (verify there).
        stacked_layers = clones(layer, N)
        self.layers = stacked_layers
        # A single normalisation over layer.size features; forward() applies
        # it once to the output of the whole stack, not after every layer.
        final_norm = nn.LayerNorm(layer.size)
        self.norm = final_norm

    def forward(self, x, mask):
        """Feed x (with the same mask) through every layer, then normalise."""
        hidden = x
        for sublayer in self.layers:
            # each layer consumes the previous layer's output
            hidden = sublayer(hidden, mask)
        normalised = self.norm(hidden)
        return normalised

## Decoder  
The decoder follows the same rules but works in the opposite direction: it builds the output sequence from the tensor generated by the encoder. To be specific, the decoder reads that tensor to find the meaning of the input sequence and the corresponding knowledge, which is embedded as tensors, in order to produce the answer. While generating the answer, the decoder also reads the part it has already generated, which ensures that the generation never uses content that violates the causal order (a token cannot depend on tokens that have not been produced yet).

## Feed-Forward Neural Network  
The FFN is designed to give the model a deeper, non-linear transformation, and it processes the content token by token. The attention output is only a weighted sum of the value vectors, so on its own it adds little non-linearity to the features of each token.   This structure is based on the Multilayer Perceptron (MLP), expressed as:
$$
\mathrm{FFN}(x) = W_2 \cdot \max(0, W_1 \cdot x + b_1) + b_2
$$
In this equation, $x$ is the input token, $W_1$ and $b_1$ are the weight and bias of the first layer, which maps the input into a larger dimension, while $W_2$ and $b_2$ are those of the second layer, which maps it back to the original dimension. The ReLU, defined as $f(x) = \max(0, x)$, acts as a gate that lets the network learn deeper, non-linear features. Each token is processed independently, so the computation can run in parallel, which significantly reduces the training time.  
A more recent version is expressed as follows:
$$
y = \big( (x \cdot W_u) \odot \psi(x \cdot W_v) \big) \cdot W_o
$$
Here $\odot$ denotes element-wise multiplication and $\psi$ is the activation function; the bias terms can be removed as an engineering choice.  
For tensors with dimension $(B,T,d_{model})$, the amount of computation of the FFN has complexity $O(B\cdot T\cdot d_{model}\cdot d_{ff})$.

## Normalisation  
  
Layer Normalisation is used to reduce instability in the distribution of activations between batches and layers, making training more stable. The normalisation is performed over the feature dimension and is independent of the batch size, so it works well with small batches, long sequences and online inference. 
  
Layer Normalisation includes following content:
  
1. Calculating the mean and variance:  
$\mu = \frac{1}{D} \sum_{i=1}^{D}x_{i}$  
$\sigma ^2 = \frac{1}{D} \sum_{i=1}^{D}(x_{i}-\mu )^2$  
where $D$ is the size of the feature dimension, i.e. the second dimension of an input tensor of shape [S, D].  
  
2. Normalisation:  
$\hat{x} = \frac{x-\mu}{\sqrt{\sigma ^2+\epsilon } }$  
The calculation is applied element by element.  
  
3. Affine transformation:  
However, normalisation erases the scale and offset produced by the previous layer, so we need a learnable adjustment that allows those features to be either restored or re-learned during later training.  
$y = \gamma \odot \hat{x} + \beta$  
Where y is the output and $\gamma$ is the gain, $\beta$ is the bias.  
  
The structure in the original transformers is slightly different from what it is nowadays:  
  
In the original paper (Post-LN), LayerNorm is applied after the sub-layer and the residual addition:  
LN(x + SelfAttention(x))  
LN(x + FeedForward(x))  

In current models (Pre-LN), LayerNorm is applied before Attention or the other sub-layers:  
x = x + SelfAttention(LN(x))  
x = x + FeedForward(LN(x))  


In [None]:
class LayerNorm(nn.Module):
    """
    Layer normalisation module (see arXiv:1607.06450).

    Every sample is normalised with the mean and standard deviation taken
    over its own last (feature) dimension, then scaled and shifted by
    learnable parameters.
    """

    def __init__(self, features, eps=1e-6):
        super().__init__()
        # nn.Parameter registers a tensor on the module, so it takes part in
        # gradient calculation and is updated during training.
        gain = torch.ones(features)   # all ones, length "features"
        shift = torch.zeros(features)  # all zeros, length "features"
        self.a_2 = nn.Parameter(gain)
        self.b_2 = nn.Parameter(shift)
        self.eps = eps

    def forward(self, x):
        """Normalise x over its last dimension, then apply gain and shift."""
        mu = x.mean(-1, keepdim=True)
        # NOTE: x.std() is called with its defaults (torch's sample standard
        # deviation) and eps is added to the std itself, not inside a square
        # root, so values differ slightly from sqrt(var + eps) formulations.
        sigma = x.std(-1, keepdim=True)
        normalised = (x - mu) / (sigma + self.eps)
        # affine transformation: gain * normalised + shift
        return self.a_2 * normalised + self.b_2

## Transformers / Attention Mechanism In Decoder Structure  

The Transformer makes sequence encoding suitable for parallel computing by introducing the Self-Attention mechanism: the input sequence matrix ($X$) is multiplied by three weight matrices ($W_Q$, $W_K$, $W_V$) to calculate the Query ($Q$), Key ($K$) and Value ($V$):
$$
Q = X\cdot W_Q\ ,\  K = X\cdot W_K\ ,\  V = X\cdot W_V
$$

Each of the matrices has shape `[batch_size, seq_len, d_k]`.  
In the Attention Mechanism, the Query represents what the token being processed is looking for: it determines how much attention that token needs to pay to each of the other tokens in the pool in order to find the most relevant ones for generating the output.  
The Key identifies each token in the pool that can be selected; the more accurate the embedded tensor, the more accurate the Attention Score, which means a better response. The Value holds the complete content of the token with the same label as the Key, and the weighted combination of the Values produced by the matrix multiplication is the final output.   
The attention output is calculated by:
$$
\mathrm{Attention}(Q,K,V)=\mathrm{softmax}\left(\frac{Q\cdot K^{T}}{\sqrt{d_{k}}}\right) \cdot V
$$
Here the term inside the softmax function is the Attention Score, and applying the softmax to it gives the Attention Weights. The softmax acts like the Boltzmann distribution used in statistics: its denominator is the partition function, and each weight acts like the probability of a specific state. The Attention Score measures the overall relevance between Q and K, while the Attention Weight is the normalised quantity that is actually used to weight V in the calculation.  
The encoder uses Attention in the way described above. However, Attention is used differently in the decoder: the Causal Attention mechanism needs a mask matrix $M$ to achieve Masked Self-Attention in the decoder, calculated by:
$$
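\mathrm{MaskedAttention}(Q,K,V) = \mathrm{softmax}\left(\frac{Q\cdot K^{T}}{\sqrt{d_{k}}} + M\right)\cdot V
$$
where $M$ is a mask matrix whose entries are $0$ for the positions a token is allowed to attend to and $-\infty$ for future positions, so that the Query does not include anything that has not been provided yet. To be specific, in the decoder's masked self-attention the Query, Key and Value all come from the decoder's own output so far; in the encoder-decoder attention that follows, the Query still comes from the decoder, while the Key and Value come from the encoder's output for the input sequence.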

## Generator Step  
  
This is a standard linear + softmax generation step. The 