In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

# 1. Implement Scaled Dot Product Attention Mechanism

$$ \text{SoftMax} \left( \frac{Q}{T_e}K^T\right) V, \quad T_e = \sqrt{d_k}$$

In [9]:
class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention: softmax((Q / temperature) K^T) V.

    Args:
        temperature: Divisor applied to the queries before the dot product
            with the keys.
        attn_dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, temperature, attn_dropout = 0.1):
        super().__init__()
        self.temperature = temperature
        self.dropout = nn.Dropout(attn_dropout)

    def forward(self, q, k, v, mask = None):
        """Attend from the queries to the keys and mix the values accordingly.

        Args:
            q, k, v: 4-D tensors; the key tensor is transposed on axes 2 and 3,
                so the last two axes are (sequence, feature).
            mask: Optional tensor broadcastable to the score matrix. Positions
                where ``mask == 0`` receive a score of -1e9 before the softmax.

        Returns:
            Tuple ``(output, attn)``: the weighted sum of the values and the
            attention weights (after dropout).
        """
        # Scale the queries first, then compare every query with every key
        scaled_q = q / self.temperature
        scores = torch.matmul(scaled_q, k.transpose(2, 3))

        # A very negative score turns into a ~0 weight after the softmax
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        weights = F.softmax(scores, dim = -1)
        attn = self.dropout(weights)

        return torch.matmul(attn, v), attn

# 2. Implement Multi-Head Attention

<img src="https://production-media.paperswithcode.com/methods/multi-head-attention_l1A3G7a.png" alt="Encoder" width="300" height="auto">

### Run several scaled dot-product attention heads in parallel, each mapping to a different feature space, meaning each head has its own Q, K, and V matrices.

- $ d_{model} \rightarrow $ dimensionality of the representation of each token. In other words it is the number of values that are associated with a word. [Embedding Space]

- $ d_k \rightarrow $ feature space of reduced dimensionality. It represents an extraction of some sort of information from the complete dimensionality  $d_{model}$

- $ d_v \rightarrow $ reduced dimensionality of the value projection. $W_V$ is the matrix that, when multiplied by the embedding (of size $d_{model}$), returns the value vector. The value vector is what must be added to the original embedding to provide context.



$$ {W_Q}_{\text{Multi-head}} =
\left(
  \begin{array}{c|c|c|c}
      &     &        &             \\
  {W_Q}_1 & {W_Q}_2 & \cdots & {W_Q}_{n_{Heads}} \\
      &     &        &  
\end{array} \right), \quad

{W_K}_{\text{Multi-head}} =
\left(
  \begin{array}{c|c|c|c}
      &     &        &             \\
  {W_K}_1 & {W_K}_2 & \cdots & {W_K}_{n_{Heads}} \\
      &     &        &  
\end{array} \right), \quad

{W_V}_{\text{Multi-head}} =
\left(
  \begin{array}{c|c|c|c}
      &     &        &             \\
  {W_V}_1 & {W_V}_2 & \cdots & {W_V}_{n_{Heads}} \\
      &     &        &  
\end{array} \right)
$$

$${W_Q}_x, {W_K}_x \quad \text{have dimensions} \quad (d_k \times d_{model})$$
$${W_V}_x \quad \text{has dimensions} \quad (d_{model} \times d_{model})$$

This way $W_V$ has many more parameters than $W_Q$ and $W_K$, so it is often split into two matrices, ${W_V}_{\uparrow}$ and ${W_V}_{\downarrow}$.

Then, instead of having:
$$ {W_V}_1 \times E_{token}, \quad \text{we write} \quad {W_V}_{1\uparrow} \times {W_V}_{1\downarrow} \times E_{token}$$

Here ${W_V}_{\uparrow}$ has dimensions $(d_{model} \times d_v)$, and ${W_V}_{\downarrow}$ has dimensions $(d_v \times d_{model})$

<sub>*notation for matrix dimensions is (rows x columns). Here we are considering $W \times E_\uparrow$ but nn.Linear() is actually more similar to $E_\rightarrow \times W^T$</sub>

In [10]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention sub-layer with residual connection and layer norm.

    Args:
        n_head: Number of attention heads.
        d_model: Embedding size of every token (input and output width).
        d_k: Per-head feature size of the queries and keys.
        d_v: Per-head feature size of the values.
        dropout: Dropout probability applied to the output projection.
    """

    def __init__(self, n_head, d_model, d_k, d_v, dropout = 0.1):
        super().__init__()

        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v

        # Bias-free linear maps (y = x @ A^T, A trainable). A single layer
        # holds the projections of all heads side by side, hence the
        # n_head * d_k (or n_head * d_v) output width.
        self.w_qs = nn.Linear(d_model, n_head * d_k, bias = False)
        self.w_ks = nn.Linear(d_model, n_head * d_k, bias = False)
        self.w_vs = nn.Linear(d_model, n_head * d_v, bias = False)

        # Output projection: maps the concatenated heads back to d_model so
        # the result can be added to the residual.
        self.fc = nn.Linear(n_head * d_v, d_model, bias = False)

        # Attention core, scaled by sqrt(d_k)
        self.attention = ScaledDotProductAttention(temperature = d_k ** 0.5)

        # Dropout randomly zeroes activations during training (regularization)
        self.dropout = nn.Dropout(dropout)

        # Layer normalization over the d_model axis; eps is added to the
        # denominator for numerical stability
        self.layer_norm = nn.LayerNorm(d_model, eps = 1e-6)

    def forward(self, q, k, v, mask = None):
        """Run multi-head attention followed by add & norm.

        In an encoder layer q, k and v are all the same token encodings.
        Axis 0 of each input is the batch and axis 1 the sequence length
        (len_q, len_k and len_v respectively).

        Refer to Extra Material (a.) for the logic behind the reshapes.

        Args:
            q, k, v: Inputs to the query, key and value projections.
            mask: Optional attention mask; a head axis is inserted at
                position 1 so the same mask is broadcast over all heads.

        Returns:
            Tuple ``(output, attn)``: the normalized sub-layer output, with
            the same leading shape as ``q``, and the attention weights
            produced by the attention core.
        """
        batch_size = q.size(0)
        query_len, key_len, value_len = q.size(1), k.size(1), v.size(1)

        # Keep the untouched input for the residual connection
        residual = q

        def split_heads(projected, seq_len, head_dim):
            # (batch, seq, n_head * head_dim) -> (batch, n_head, seq, head_dim)
            per_head = projected.view(batch_size, seq_len, self.n_head, head_dim)
            return per_head.transpose(1, 2)

        # Query, key and value tensors for every sentence and every head
        q = split_heads(self.w_qs(q), query_len, self.d_k)
        k = split_heads(self.w_ks(k), key_len, self.d_k)
        v = split_heads(self.w_vs(v), value_len, self.d_v)

        if mask is not None:
            mask = mask.unsqueeze(1) # Head axis broadcasting

        context, attn = self.attention(q, k, v, mask = mask)

        # Move the head axis back and concatenate the heads:
        # (batch, n_head, len_q, d_v) -> (batch, len_q, n_head * d_v)
        context = context.transpose(1, 2).contiguous().view(batch_size, query_len, -1)

        # Project the concatenated heads back to d_model, then add & norm
        projected = self.dropout(self.fc(context))
        projected += residual

        return self.layer_norm(projected), attn

## 2.1 Multilayer Perceptron after the multi-head attention

In [11]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network (MLP) with residual connection.

    The input of width ``d_in`` is expanded to a hidden layer of width
    ``d_hid`` and projected back to ``d_in``; the result is added to the
    input and layer-normalized.

    Args:
        d_in: Width of the input and of the output.
        d_hid: Width of the hidden layer.
        dropout: Dropout probability applied before the residual addition.
    """

    def __init__(self, d_in, d_hid, dropout = 0.1):
        super().__init__()
        # Expansion and contraction layers, applied to the last axis only
        self.w_1 = nn.Linear(d_in, d_hid)
        self.w_2 = nn.Linear(d_hid, d_in)

        # Normalization over the d_in axis
        self.layer_norm = nn.LayerNorm(d_in, eps = 1e-6)

        # Regularization: randomly zeroes activations during training
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return ``layer_norm(x + dropout(w_2(relu(w_1(x)))))``."""
        hidden = F.relu(self.w_1(x))
        out = self.dropout(self.w_2(hidden))

        # Residual connection around the MLP
        out += x

        return self.layer_norm(out)

# 3. Encoder/Decoder Implementations

## 3.1 Encoder

<img src="https://www.researchgate.net/profile/Ehsan-Amjadian/publication/352239001/figure/fig1/AS:1033334390013952@1623377525434/Detailed-view-of-a-transformer-encoder-block-It-first-passes-the-input-through-an.jpg" alt="Encoder" width="300" height="auto">

In [12]:
class EncoderLayer(nn.Module):
    """One encoder block: multi-head self-attention followed by an MLP.

    The full encoder is a stack of these blocks. Both sub-layers apply
    their own residual connection and layer normalization.

    Args:
        d_model: Embedding size of every token.
        d_inner: Hidden width of the feed-forward sub-layer.
        n_head: Number of attention heads.
        d_k: Per-head feature size of the queries and keys.
        d_v: Per-head feature size of the values.
        dropout: Dropout probability used in both sub-layers.
    """

    def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout = 0.1):
        super().__init__()
        self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout)
        self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout)

    def forward(self, enc_input, slf_attn_mask = None):
        """Return ``(enc_output, enc_slf_attn)`` for one encoder block."""
        # Self-attention: queries, keys and values all come from enc_input
        attended, enc_slf_attn = self.slf_attn(
            enc_input, enc_input, enc_input, mask = slf_attn_mask)

        # Feed the attention output through the MLP
        return self.pos_ffn(attended), enc_slf_attn

## 3.2 Decoder

<img src="https://res.cloudinary.com/edlitera/image/upload/c_fill,f_auto/v1680629118/blog/gz5ccspg3yvq4eo6xhrr" alt="Decoder" width="300" height="auto">

In [33]:
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention, then an MLP.

    The cross-attention sub-layer takes its queries from the output of the
    self-attention sub-layer and its keys and values from the encoder output.

    Args:
        d_model: Embedding size of every token.
        d_inner: Hidden width of the feed-forward sub-layer.
        n_head: Number of attention heads.
        d_k: Per-head feature size of the queries and keys.
        d_v: Per-head feature size of the values.
        dropout: Dropout probability used in all sub-layers.
    """
    def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout = 0.1):
        super(DecoderLayer, self).__init__()
        self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout)
        # Named enc_attn because forward() looks the module up under that name
        self.enc_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout)
        self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout)

    def forward(self, dec_input, enc_output, slf_attn_mask = None, dec_enc_attn_mask = None):
        """Run one decoder block.

        Args:
            dec_input: Decoder-side token representations.
            enc_output: Output of the encoder, used as keys and values.
            slf_attn_mask: Optional mask for the self-attention.
            dec_enc_attn_mask: Optional mask for the cross-attention.

        Returns:
            Tuple ``(dec_output, dec_slf_attn, dec_enc_attn)``: the block
            output plus the self- and cross-attention weights.
        """
        # First attention is self-attention (between words of the output phrase)
        dec_output, dec_slf_attn = self.slf_attn(
            dec_input, dec_input, dec_input, mask = slf_attn_mask)

        # Second attention is cross-attention: queries come from the
        # self-attention output, keys and values from the encoder output
        dec_output, dec_enc_attn = self.enc_attn(
            dec_output, enc_output, enc_output, mask = dec_enc_attn_mask)

        # Forward pass through the MLP
        dec_output = self.pos_ffn(dec_output)

        return dec_output, dec_slf_attn, dec_enc_attn

# 4. Build the Entire Model

## 4.1 Positional Encoding

It is important for the transformer to encode the position of each word. For that, sinusoidal encodings are used, employing sine and cosine functions of different frequencies:

$$ PE_{(pos, 2i)} = \sin{(pos/10000^{2i/d_{model}})} $$
$$ PE_{(pos, 2i+1)} = \cos{(pos/10000^{2i/d_{model}})} $$

In [29]:
class PositionalEncodings(nn.Module):
    """Adds fixed sinusoidal positional encodings to token embeddings.

    Args:
        d_hid: Size of each positional encoding vector (must match the
            last axis of the tensors passed to ``forward``).
        n_positions: Number of positions stored in the lookup table, i.e.
            the longest sequence that can be encoded.
    """
    def __init__(self, d_hid, n_positions):
        super(PositionalEncodings, self).__init__()

        # Not a trainable parameter, just a look-up table for the sinusoid
        # encodings; register_buffer stores it on the module without
        # exposing it through parameters()
        self.register_buffer('pos_table', self._get_sinusoid_encoding_table(n_positions, d_hid))

    def _get_sinusoid_encoding_table(self, n_position, d_hid):
        """Build the sinusoid table as a float tensor of shape (1, n_position, d_hid).

        Column 2i holds sin(pos / 10000^(2i / d_hid)) and column 2i + 1 holds
        cos(pos / 10000^(2i / d_hid)).
        """
        positions = np.arange(n_position, dtype = np.float64)[:, None]

        # (i // 2) makes each pair of neighbouring columns share one frequency,
        # so the sin and cos of the same angle end up side by side
        exponents = 2 * (np.arange(d_hid) // 2) / d_hid
        sinusoid_table = positions / np.power(10000, exponents)

        sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
        sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i + 1

        return torch.FloatTensor(sinusoid_table).unsqueeze(0)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions."""
        # detach() must be *called*; adding the bound method itself to a
        # tensor raises a TypeError
        return x + self.pos_table[:, :x.size(1)].clone().detach()

## 4.2 Encoder Implementation

In [30]:
class Encoder(nn.Module):
    """Embeds the source tokens and passes them through a stack of EncoderLayers.

    Args:
        n_src_vocab: Number of rows of the source embedding table.
        d_word_vec: Size of the word embeddings and positional encodings.
        n_layers: Number of stacked EncoderLayers.
        n_head, d_k, d_v, d_model, d_inner: Forwarded to every EncoderLayer.
        pad_idx: Index of the padding token in the embedding table.
        dropout: Dropout probability.
        n_position: Number of positions in the positional encoding table.
        scale_emb: If True, multiply the embeddings by sqrt(d_model).
    """

    def __init__(self, n_src_vocab, d_word_vec, n_layers, n_head, d_k, d_v, 
                 d_model, d_inner, pad_idx, dropout = 0.1, n_position = 200, scale_emb = False):
        
        super(Encoder, self).__init__()

        # Word embedder (size of the source vocabulary, embedding size)
        self.src_word_emb = nn.Embedding(n_src_vocab, d_word_vec, padding_idx = pad_idx)

        # Positional encoding added to each word;
        # d_word_vec is the size of the positional encoding vector
        self.position_enc = PositionalEncodings(d_word_vec, n_position)
        self.dropout = nn.Dropout(dropout)

        # Stack n_layers encoder blocks
        self.layer_stack = nn.ModuleList([
            EncoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout) for _ in range(n_layers)])
        
        # Layer normalization applied to the embeddings before the stack
        self.layer_norm = nn.LayerNorm(d_model, eps = 1e-6)
        self.scale_emb = scale_emb
        self.d_model = d_model

    def forward(self, src_seq, src_mask, return_attns = False):
        """Encode a batch of source sequences.

        Args:
            src_seq: Token indices fed to the embedding table.
            src_mask: Self-attention mask handed to every layer.
            return_attns: If True, also return the attention weights of
                every layer.

        Returns:
            ``enc_output`` alone, or ``(enc_output, enc_slf_attn_list)`` when
            ``return_attns`` is True.
        """
        enc_slf_attn_list = []
        
        # Word embedding
        enc_output = self.src_word_emb(src_seq)
        if self.scale_emb:
            enc_output = enc_output * self.d_model ** 0.5
        
        # Add the positional encoding, then dropout and normalization
        enc_output = self.dropout(self.position_enc(enc_output))
        enc_output = self.layer_norm(enc_output)

        # Iterate over the layers (not over the activations) and feed the
        # output of each layer into the next one
        for enc_layer in self.layer_stack:
            enc_output, enc_slf_attn = enc_layer(enc_output, slf_attn_mask = src_mask)
            if return_attns:
                enc_slf_attn_list.append(enc_slf_attn)

        if return_attns:
            return enc_output, enc_slf_attn_list
        return enc_output

## 4.3 Decoder Implementation

In [35]:
class Decoder(nn.Module):
    """Embeds the target tokens and passes them through a stack of DecoderLayers.

    Args:
        n_trg_vocab: Number of rows of the target embedding table.
        d_word_vec: Size of the word embeddings and positional encodings.
        n_layers: Number of stacked DecoderLayers.
        n_head, d_k, d_v, d_model, d_inner: Forwarded to every DecoderLayer.
        pad_idx: Index of the padding token in the embedding table.
        n_positions: Number of positions in the positional encoding table.
        dropout: Dropout probability.
        scale_emb: If True, multiply the embeddings by sqrt(d_model).
    """
    def __init__(self, n_trg_vocab, d_word_vec, n_layers, n_head, d_k, d_v,
    d_model, d_inner, pad_idx, n_positions = 200, dropout = 0.1, scale_emb = False):

        super(Decoder, self).__init__()

        # Target vocabulary embedding; forward() reads it as trg_word_emb
        self.trg_word_emb = nn.Embedding(n_trg_vocab, d_word_vec, padding_idx = pad_idx)

        # Positional encoding
        self.position_enc = PositionalEncodings(d_word_vec, n_positions)
        self.dropout = nn.Dropout(dropout)

        # Stack n_layers decoder blocks; nn.ModuleList registers their parameters
        self.layer_stack = nn.ModuleList([
            DecoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout) for _ in range(n_layers)])
        
        self.layer_norm = nn.LayerNorm(d_model, eps = 1e-6)
        self.scale_emb = scale_emb
        self.d_model = d_model

    def forward(self, trg_seq, trg_mask, enc_output, src_mask, return_attns = False):
        """Decode a batch of target sequences against the encoder output.

        Args:
            trg_seq: Target token indices fed to the embedding table.
            trg_mask: Mask for the decoder self-attention.
            enc_output: Encoder output used as keys/values in cross-attention.
            src_mask: Mask for the cross-attention.
            return_attns: If True, also return the attention weights of
                every layer.

        Returns:
            ``dec_output`` alone, or ``(dec_output, dec_slf_attn_list,
            dec_enc_attn_list)`` when ``return_attns`` is True.
        """
        dec_slf_attn_list, dec_enc_attn_list = [], []
        
        dec_output = self.trg_word_emb(trg_seq)

        if self.scale_emb:
            dec_output = dec_output * self.d_model ** 0.5

        # Add the positional encoding, then dropout and normalization.
        # The normalization must act on the decoder embeddings, not on the
        # encoder output.
        dec_output = self.dropout(self.position_enc(dec_output))
        dec_output = self.layer_norm(dec_output)

        ## Embedding complete

        for dec_layer in self.layer_stack:
            dec_output, dec_slf_attn, dec_enc_attn = dec_layer(
                dec_output, enc_output,
                slf_attn_mask = trg_mask, dec_enc_attn_mask = src_mask)
            if return_attns:
                dec_slf_attn_list.append(dec_slf_attn)
                dec_enc_attn_list.append(dec_enc_attn)

        if return_attns:
            return dec_output, dec_slf_attn_list, dec_enc_attn_list
        return dec_output

# 5. Build the Transformer Model

<img src="https://production-media.paperswithcode.com/methods/new_ModalNet-21.jpg" alt="Transformer" width="300" height="auto">

In [34]:
def get_pad_mask(seq, pad_idx):
    """Return a mask that is True where ``seq`` differs from ``pad_idx``.

    A new axis is inserted at position -2, so a ``(batch, length)`` input
    yields a ``(batch, 1, length)`` mask.
    """
    is_real_token = seq.ne(pad_idx)
    return is_real_token.unsqueeze(-2)


def get_subsequent_mask(seq):
    """Build the mask that hides subsequent (future) positions.

    ``seq`` must be 2-D ``(batch, length)``. The result is a boolean tensor of
    shape ``(1, length, length)`` on the device of ``seq``; entry ``[0, i, j]``
    is True exactly when ``j <= i``.
    """
    _, seq_len = seq.size()
    # Lower triangle (diagonal included) = positions that may be attended to
    visible = torch.ones((1, seq_len, seq_len), device=seq.device).tril()
    return visible.bool()

In [39]:
class Transformer(nn.Module):
    """Sequence-to-sequence Transformer: Encoder, Decoder and output projection.

    Args:
        n_src_vocab: Size of the source vocabulary.
        n_trg_vocab: Size of the target vocabulary.
        src_pad_idx: Padding index of the source vocabulary.
        trg_pad_idx: Padding index of the target vocabulary.
        d_word_vec: Word embedding size; must equal ``d_model``.
        d_model: Width of every sub-layer output.
        d_inner: Hidden width of the feed-forward sub-layers.
        n_layers: Number of encoder layers and of decoder layers.
        n_head: Number of attention heads.
        d_k: Per-head feature size of the queries and keys.
        d_v: Per-head feature size of the values.
        dropout: Dropout probability.
        n_position: Number of positions in the positional encoding tables.
        trg_emb_proj_weight_sharing: Share the weight between the target
            embedding and the final projection.
        src_emb_proj_weight_sharing: Share the weight between the source and
            target embeddings.
        scaled_emb_or_prj: One of ``'emb'``, ``'prj'`` or ``'none'`` (see the
            comment in ``__init__``).
    """
    def __init__(
        self, n_src_vocab, n_trg_vocab, src_pad_idx, trg_pad_idx, 
        d_word_vec = 512, d_model = 512, d_inner = 2048, n_layers = 6, 
        n_head = 8, d_k = 64, d_v = 64, dropout = 0.1, n_position = 200,
        trg_emb_proj_weight_sharing = True, src_emb_proj_weight_sharing = True,
        scaled_emb_or_prj = 'prj'):

        super(Transformer, self).__init__()

        self.src_pad_idx, self.trg_pad_idx = src_pad_idx, trg_pad_idx

        # In section 3.4 of paper "Attention Is All You Need", there is such detail:
        # "In our model, we share the same weight matrix between the two
        #  embedding layers and the pre-softmax linear transformation...
        # In the embedding layers, we multiply those weights by \sqrt{d_model}".
        #
        # Options here:
        #   'emb': multiply \sqrt{d_model} to embedding output
        #   'prj': multiply (\sqrt{d_model} ^ -1) to linear projection output
        #   'none': no multiplication

        assert scaled_emb_or_prj in ['emb', 'prj', 'none']

        # To facilitate the residual connections, the dimensions of all
        # module outputs must be the same
        assert d_model == d_word_vec

        scale_emb = (scaled_emb_or_prj == 'emb') if trg_emb_proj_weight_sharing else False
        # Must be compared against 'prj' (the accepted option) and stored
        # under the name that forward() reads
        self.scale_prj = (scaled_emb_or_prj == 'prj') if trg_emb_proj_weight_sharing else False
        self.d_model = d_model

        # Keyword arguments: Encoder and Decoder order their parameters
        # differently, so positional calls silently mix up the sizes
        self.encoder = Encoder(
            n_src_vocab = n_src_vocab, d_word_vec = d_word_vec, n_layers = n_layers,
            n_head = n_head, d_k = d_k, d_v = d_v, d_model = d_model,
            d_inner = d_inner, pad_idx = src_pad_idx, dropout = dropout,
            n_position = n_position, scale_emb = scale_emb)

        self.decoder = Decoder(
            n_trg_vocab = n_trg_vocab, d_word_vec = d_word_vec, n_layers = n_layers,
            n_head = n_head, d_k = d_k, d_v = d_v, d_model = d_model,
            d_inner = d_inner, pad_idx = trg_pad_idx, n_positions = n_position,
            dropout = dropout, scale_emb = scale_emb)

        self.trg_word_prj = nn.Linear(d_model, n_trg_vocab, bias = False)

        # Xavier initialization for every weight matrix
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

        if trg_emb_proj_weight_sharing:
            # Share the weight between target word embedding & last dense layer
            self.trg_word_prj.weight = self.decoder.trg_word_emb.weight

        if src_emb_proj_weight_sharing:
            # Share the weight between source & target word embeddings.
            # NOTE: the encoder then indexes the decoder's table, so this
            # only makes sense when both sides use the same vocabulary.
            self.encoder.src_word_emb.weight = self.decoder.trg_word_emb.weight

    def forward(self, src_seq, trg_seq):
        """Compute the output logits for a batch of source/target pairs.

        Args:
            src_seq: Source token indices, 2-D ``(batch, src_len)``.
            trg_seq: Target token indices, 2-D ``(batch, trg_len)``.

        Returns:
            Logits flattened to two axes: all leading axes of the projection
            output are merged and the last axis has size ``n_trg_vocab``.
        """
        src_mask = get_pad_mask(src_seq, self.src_pad_idx)
        trg_mask = get_pad_mask(trg_seq, self.trg_pad_idx) & get_subsequent_mask(trg_seq)

        # With return_attns left at its default, encoder and decoder return a
        # bare tensor; unpacking it would split it along the batch axis
        enc_output = self.encoder(src_seq, src_mask)
        dec_output = self.decoder(trg_seq, trg_mask, enc_output, src_mask)
        seq_logit = self.trg_word_prj(dec_output)
        if self.scale_prj:
            seq_logit = seq_logit * self.d_model ** -0.5

        return seq_logit.view(-1, seq_logit.size(2))

# ! Extra Material

## a. understand Query, Key and Value matrices

In [158]:
# Toy sizes used to inspect how one nn.Linear holds the projections of all heads
d_model = 12000
n_head = 6 
d_k = 128
b_size = 3
max_phrase_size = 5
# A single bias-free linear layer maps d_model -> d_k * n_head
test_l = nn.Linear(d_model, d_k * n_head, bias = False)
# The trainable weight matrix of that layer
test = test_l.get_parameter('weight')
# Random stand-in for a batch of embedded phrases: (b_size, max_phrase_size, d_model)
embedd = torch.randn(b_size, max_phrase_size, d_model)

In [159]:
# The weight is stored as (d_k * n_head, d_model), i.e. (outputs, inputs)
test.size()

torch.Size([768, 12000])

In [160]:
# Project the batch, then split the last axis into (n_head, d_k)
test_l(embedd).view(b_size, max_phrase_size, n_head, d_k).size()

torch.Size([3, 5, 6, 128])

#### <span style="color:red">NOTE:</span> After this calculation we just transpose dimensions 1 and 2 and obtain the Q, K and V matrices for a batch of 3 phrases, each with 6 attention heads, for phrases of length 5 where each word has an embedding size of 128

## b. understand word Embedding

In [117]:
# Lookup table holding one d_word_vec-sized vector per vocabulary index
n_src_vocab = 5000
d_word_vec = 128
src_word_emb = nn.Embedding(n_src_vocab, d_word_vec)

In [146]:
# Toy vocabulary mapping each word to its row index in the embedding table
# (4999 is the last valid index for a table of n_src_vocab = 5000 rows)
vocab = {"cake": 0, "there": 1, "hey": 2, "is": 3, "good": 4999}
# Translate a phrase into its list of indices
word_indexes = [vocab[w] for w in ["cake", "is", "good"]]
word_indexes

[0, 3, 4999]

In [147]:
# Embedding lookup: one d_word_vec-sized row is returned per index
word_vectors = src_word_emb(torch.tensor(word_indexes))
word_vectors.size()

torch.Size([3, 128])