# Transformer model (section 1: input embeddings)

Convert original sentence into a vector of 512 dimensions

original sentence -> input IDs (numbers that correspond to the position of each word inside the vocab) -> embeddings (vec size 512)

In [4]:
import torch
import torch.nn as nn
import math

In [6]:
class InputEmbeddings(nn.Module):
    """Map token IDs to learned embedding vectors scaled by sqrt(d_model).

    Args:
        d_model: Size of each embedding vector.
        vocab_size: Number of distinct token IDs the lookup table holds.
    """

    def __init__(self,
                 d_model: int, # what is the dimension of the model
                 vocab_size: int # how many words in the vocabulary
                ) -> None:
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        # The lookup-table module is nn.Embedding (capital E); nn.embedding does not exist.
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        """Return the embeddings for the token IDs in ``x``, multiplied by sqrt(d_model).

        The output has the shape of ``x`` with a trailing dimension of size
        ``d_model`` appended.
        """
        return self.embedding(x) * math.sqrt(self.d_model)

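1. Why multiply by $\sqrt{d_{model}}$ in the forward method of the embedding layer?
- The original Transformer paper ("Attention Is All You Need", Section 3.4) states that in the embedding layers the weights are multiplied by $\sqrt{d_{model}}$. A common explanation is that this scales the embeddings up so they are not dominated by the positional encodings that are added to them next.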

# Section 2: positional encoding

In [8]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to the input, then apply dropout.

    The encoding table is precomputed once for ``seq_len`` positions:
    ``PE(pos, 2i) = sin(pos / 10000^(2i / d_model))`` and
    ``PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))``.

    Args:
        d_model: Size of each embedding vector.
        seq_len: Maximum sentence length (one positional vector per position).
        dropout: Dropout probability applied after adding the encoding.
    """

    def __init__(self,
                 d_model: int,
                 seq_len: int, # maximum length of the sentence (create 1 positional vector for each position)
                 dropout: float # makes the model less overfit
                ) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        # forward() calls self.dropout(x), so this must be a module, not the raw float.
        self.dropout = nn.Dropout(dropout)

        # Positional encoding is a matrix of shape (seq_len, d_model)
        pe = torch.zeros(seq_len, d_model)

        # Column vector of positions, shape (seq_len, 1)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        # 1 / 10000^(2i / d_model), computed in log space. The minus sign is
        # essential: without it the term grows instead of decaying.
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        angles = position * div_term  # (seq_len, ceil(d_model / 2))

        # Sine on even embedding dimensions, cosine on odd ones
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Add batch dimension -> (1, seq_len, d_model)
        pe = pe.unsqueeze(0)

        # Register as a buffer: saved with the model's state, but not a learnt parameter
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encodings for the first ``x.shape[1]`` positions to ``x`` and apply dropout."""
        # The encoding slice is fixed; it is never learnt during training
        x = x + (self.pe[:, :x.shape[1], :]).requires_grad_(False)
        return self.dropout(x)

For even embedding dimensions (index $2i$): $PE(pos, 2i) = \sin \left( \frac{pos}{10000^{\frac{2i}{d_{model}}}} \right)$

For odd embedding dimensions (index $2i+1$): $PE(pos, 2i+1) = \cos \left( \frac{pos}{10000^{\frac{2i}{d_{model}}}} \right)$

# Section 3: add and norm

layer normalization

In [None]:
class LayerNormalization(nn.Module):
    """Normalize over the last dimension, then apply a learnt scale and shift.

    Args:
        eps: Small constant added to the standard deviation so the division
            never hits zero.
    """

    def __init__(self, eps: float = 10**-6):
        super().__init__()
        self.eps = eps
        # Two learnt scalars: alpha multiplies the normalized value, bias is added to it
        self.alpha = nn.Parameter(torch.ones(1))
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        """Return ``alpha * (x - mean) / (std + eps) + bias`` computed along the last dimension."""
        # keepdim=True keeps the reduced axis so the statistics broadcast against x
        centered = x - x.mean(dim=-1, keepdim=True)
        spread = x.std(dim=-1, keepdim=True) + self.eps
        scaled = self.alpha * centered / spread
        return scaled + self.bias

# Section 4: Feed forward layer

A fully connected layer that our model uses in the encoder and decoder

$FFN(x) = max(0, xW_1 + b_1)W_2 + b_2$

Two weight matrices ($W_1$ and $W_2$), each with its own bias ($b_1$ and $b_2$), with a ReLU in between

In [10]:
class FeedForwardBlock(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden feature size between the two linear layers.
        dropout: Dropout probability applied after the ReLU.
    """

    def __init__(self,
                 d_model: int,
                 d_ff: int,
                 dropout: float):
        super().__init__()
        # Attribute names must match what forward() reads (lowercase linear_1 / linear_2).
        self.linear_1 = nn.Linear(d_model, d_ff) # W1 and B1
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model) # W2 and B2

    def forward(self, x):
        """Apply the two linear layers with ReLU and dropout in between.

        The last dimension goes d_model -> d_ff -> d_model; leading dimensions
        are unchanged.
        """
        return self.linear_2(self.dropout(torch.relu(self.linear_1(x))))

# Section 5: Multihead attention

In the encoder, multi-head attention takes the encoder's input and uses it three times: as the query, the key, and the value.

<img src="multihead_attention.png" width="900">

$d_k = \frac{d_{model}}{h}$

In [None]:
class MultiHeadAttentionBlock(nn.Module):
    """Multi-head attention block (work in progress).

    Holds the query/key/value/output projections and, in ``forward``, projects
    the inputs and splits them into ``h`` heads of size ``d_k = d_model // h``.

    Args:
        d_model: Feature size of the inputs and of every projection.
        h: Number of attention heads; must divide ``d_model``.
        dropout: Dropout probability for the block's dropout layer.
    """

    def __init__(self, d_model: int, h: int, dropout: float):
        super().__init__()
        self.d_model = d_model
        self.h = h
        assert d_model % h == 0, "d_model is not divisible by h"

        self.d_k = d_model // h
        # Projections applied to the query, key and value, plus the output matrix Wo.
        # Each one maps d_model -> d_model.
        self.w_q = nn.Linear(d_model, d_model) # Wq
        self.w_k = nn.Linear(d_model, d_model) # Wk
        self.w_v = nn.Linear(d_model, d_model) # Wv

        self.w_o = nn.Linear(d_model, d_model) # Wo
        self.dropout = nn.Dropout(dropout)

    @staticmethod # callable without an instance of MultiHeadAttentionBlock
    def attention(query):
        """Placeholder for the attention computation; not implemented yet."""
        # NOTE(review): the original stub had no colon and no body, which made
        # the whole class a syntax error. It fails loudly until it is written.
        raise NotImplementedError("attention is not implemented yet")

    def forward(self, q, k, v, mask):
        """Project ``q``, ``k``, ``v`` and split each into ``h`` heads.

        NOTE(review): within the lines reviewed here the method ends after the
        head split -- no attention call or return statement is visible, and
        ``mask`` is not used yet. Confirm against the rest of the file.
        """
        # mask: prevents some words from interacting with other words
        query = self.w_q(q) # q' in the slides, (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        key = self.w_k(k) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        value = self.w_v(v) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)

        # Split the embedding dimension (not the sentence) into h heads of size d_k,
        # keeping the batch dimension, then move h in front of seq_len:
        # (batch, seq_len, d_model) -> (batch, seq_len, h, d_k) -> (batch, h, seq_len, d_k)
        query = query.view(query.shape[0], query.shape[1], self.h, self.d_k).transpose(1, 2)
        # Each tensor is reshaped from itself, using its own sequence length.
        key = key.view(key.shape[0], key.shape[1], self.h, self.d_k).transpose(1, 2)
        value = value.view(value.shape[0], value.shape[1], self.h, self.d_k).transpose(1, 2)
        