In [38]:
import torch
from torchinfo import summary

### Residual Connection
![](https://i.imgur.com/m7KQiX8.png)
Residual connections enable gradients to flow more efficiently, which mitigates the vanishing gradient problem and promotes the learning of incremental features at each layer.

In [42]:
from torch import Tensor
from torch.nn import Module

class ResidualConnection(Module):
    """
    Skip connection: element-wise sum of a sub-layer's input and its output.

    Forward inputs:
        input (Tensor): tensor that was fed to the wrapped sub-layer.
        output (Tensor): tensor produced by the wrapped sub-layer
            (expected to have the same shape as `input`).

    Return :
        Tensor : `torch.add(input, output)`
    """

    # The layer holds no parameters or sub-modules, so the default
    # `Module` constructor is sufficient.

    def forward(self, input: Tensor, output: Tensor) -> Tensor:
        return torch.add(input, output)
        

In [45]:
embedding_size = 768

# Two random tensors standing in for a sub-layer's input and its output.
x = torch.randn(1, 5, embedding_size)
output = torch.randn(1, 5, embedding_size)

layer = ResidualConnection()
residual_output = layer(input=x, output=output)

# Last expression of the cell: display the shape of the summed tensor.
residual_output.shape

torch.Size([1, 5, 768])

### Multi Head Self Attention Block
![](https://i.imgur.com/xGGHbdX.png)
#### *Normalize input feature and apply multi-head self-attention followed by a residual connection.*

In [58]:
from torch import Tensor
from torch.nn import Module
from torch.nn import LayerNorm
from torch.nn import MultiheadAttention

class MultiHeadSelfAttentionBlock(Module):
    """
    Multi-head self-attention sub-layer with layer normalization on the input
    and a residual connection around the attention.

    The input is layer-normalized, used as query, key and value of a
    batch-first `MultiheadAttention`, and the attention output is added back
    to the original (un-normalized) input.

    Attributes:
        `layer_norm` (LayerNorm): Layer normalization applied to the input tensor.
        `multi_head_attention` (MultiheadAttention): Multi-head self-attention mechanism.
        `residual_connection` (ResidualConnection): Adds the block input to the attention output.

    Args:
        `embedding_size` (int, optional): Size of the input tensor's last dimension
            (number of features / embedding dimensions). Default is 768.
        `nbr_heads` (int, optional): Number of attention heads. Default is 12.
        `dropout_rate` (float, optional): Dropout rate passed to the attention module.
            Default is 0.1.

    Returns:
        Tensor: The input tensor plus the self-attention output.
    """
    def __init__(
        self,
        embedding_size: int = 768,
        nbr_heads: int = 12,
        dropout_rate: float = 0.1,
    ) -> None:
        super().__init__()

        self.layer_norm = LayerNorm(embedding_size)
        self.multi_head_attention = MultiheadAttention(
            embedding_size,
            nbr_heads,
            dropout=dropout_rate,
            batch_first=True,
        )
        self.residual_connection = ResidualConnection()

    def forward(self, x: Tensor) -> Tensor:
        normed = self.layer_norm(x)

        # Self-attention: the same normalized tensor is query, key and value.
        # Attention weights are not requested, so only the first element of
        # the returned tuple (the attention output) is kept.
        attended = self.multi_head_attention(
            normed, normed, normed, need_weights=False
        )[0]

        # The skip path uses the original `x`, not the normalized tensor.
        return self.residual_connection(input=x, output=attended)



In [59]:
embedding_size = 768

# Random input tensor of shape (1, 5, embedding_size).
x = torch.randn(1, 5, embedding_size)

# Attention dropout is set to 0.0 for this shape check.
attention_block = MultiHeadSelfAttentionBlock(
    embedding_size=embedding_size, nbr_heads=12, dropout_rate=0.0
)

out = attention_block(x)

# Last expression of the cell: display the output shape.
out.shape

torch.Size([1, 5, 768])

In [60]:
from torchinfo import summary

# Per-layer input/output shapes and parameter counts of the attention block.
summary(
    model=attention_block,
    input_size=(1, 5, embedding_size),
    col_names=["input_size", "output_size", "num_params"],
    col_width=20,
    row_settings=["var_names"],
)

Layer (type (var_name))                                      Input Shape          Output Shape         Param #
MultiHeadSelfAttentionBlock (MultiHeadSelfAttentionBlock)    [1, 5, 768]          [1, 5, 768]          --
├─LayerNorm (layer_norm)                                     [1, 5, 768]          [1, 5, 768]          1,536
├─MultiheadAttention (multi_head_attention)                  --                   [1, 5, 768]          2,362,368
├─ResidualConnection (residual_connection)                   --                   [1, 5, 768]          --
Total params: 2,363,904
Trainable params: 2,363,904
Non-trainable params: 0
Total mult-adds (M): 0.00
Input size (MB): 0.02
Forward/backward pass size (MB): 0.03
Params size (MB): 0.01
Estimated Total Size (MB): 0.05

#### Why `GELU` *in a nutshell* ?
![](https://i.imgur.com/n9C9j1M.png)
*$$\text{Gradient Everywhere}$$*
$$GELU=0.5 x\left(1+\frac{2}{\sqrt{\pi}} \int_0^{\frac{x}{\sqrt{2}}} e^{-t^2} d t\right)$$
- The GELU activation function is preferred over ReLU due to its smooth differentiability and better gradient flow, and because it helps the model learn complex patterns with long-term dependencies in sequence data.
- GELU is commonly used in Transformer architectures like BERT and tasks requiring complex nonlinear modeling. 
***
### Multi Layer Perceptron Block (MLP)
![](https://i.imgur.com/RNp7TIb.png)

The MLP block comprises two fully connected layers with a GELU activation in between:
- Layer normalization is applied to the input features before they enter the MLP block.
- The first fully connected layer processes the input tensor, generating an intermediate tensor that is then passed through a GELU activation function.
- The second fully connected layer processes the intermediate tensor, generating an output tensor.
- Dropout is applied after the GELU activation and after the second fully connected layer, helping to prevent overfitting.
- A residual connection adds the output tensor of the second layer to the input tensor.
****


In [61]:
from torch.nn import Module
from torch.nn import Linear
from torch.nn import GELU
from torch.nn import Dropout
from torch.nn import LayerNorm

class MultiLayerPerceptronBlock(Module):
    """
    Feed-forward (MLP) sub-layer of the Transformer Encoder, with a residual connection.

    Data flow: LayerNorm -> Linear (`fc1`) -> GELU -> Dropout -> Linear (`fc2`)
    -> Dropout, after which the result is added to the original input.
    A single `Dropout` module is reused at both dropout positions.

    Args:
        `embedding_size` (int, optional): Input and output feature size. Default is 768.
        `units` (int, optional): Number of hidden units between the two linear layers. Default is 3072.
        `dropout_rate` (float, optional): Dropout probability. Default is 0.1.

    Returns:
        Tensor: Output tensor with the same shape as the input tensor (batch_size, nbr_tokens, embedding_size).
    """
    def __init__(
        self,
        embedding_size: int = 768,
        units: int = 3072,
        dropout_rate: float = 0.1
    ):
        super().__init__()

        self.layer_norm = LayerNorm(embedding_size)
        self.gelu = GELU()
        self.dropout = Dropout(dropout_rate)

        # Expand to `units` features, then project back to `embedding_size`.
        self.fc1 = Linear(embedding_size, units)
        self.fc2 = Linear(units, embedding_size)

        self.residual_connection = ResidualConnection()

    def forward(self, x: Tensor) -> Tensor:
        # Normalize -> expand -> activate -> dropout.
        expanded = self.dropout(self.gelu(self.fc1(self.layer_norm(x))))

        # Project back to the embedding size -> dropout.
        projected = self.dropout(self.fc2(expanded))

        # The skip path uses the original `x`, not the normalized tensor.
        return self.residual_connection(input=x, output=projected)
        


In [62]:
embedding_size = 768

# Random input tensor of shape (1, 5, embedding_size).
x = torch.randn(1, 5, embedding_size)

mlp_block = MultiLayerPerceptronBlock(
    embedding_size=embedding_size, units=3072, dropout_rate=0.1
)

out = mlp_block(x)

# Last expression of the cell: display the output shape.
out.shape

torch.Size([1, 5, 768])

In [63]:
from torchinfo import summary

# Per-layer input/output shapes and parameter counts of the MLP block.
summary(
    model=mlp_block,
    input_size=(1, 5, embedding_size),
    col_names=["input_size", "output_size", "num_params"],
    col_width=20,
    row_settings=["var_names"],
)

Layer (type (var_name))                                 Input Shape          Output Shape         Param #
MultiLayerPerceptronBlock (MultiLayerPerceptronBlock)   [1, 5, 768]          [1, 5, 768]          --
├─LayerNorm (layer_norm)                                [1, 5, 768]          [1, 5, 768]          1,536
├─Linear (fc1)                                          [1, 5, 768]          [1, 5, 3072]         2,362,368
├─GELU (gelu)                                           [1, 5, 3072]         [1, 5, 3072]         --
├─Dropout (dropout)                                     [1, 5, 3072]         [1, 5, 3072]         --
├─Linear (fc2)                                          [1, 5, 3072]         [1, 5, 768]          2,360,064
├─Dropout (dropout)                                     [1, 5, 768]          [1, 5, 768]          --
├─ResidualConnection (residual_connection)              --                   [1, 5, 768]          --
Total params: 4,723,968
Trainable params: 4,723,968
Non-trainable par

### Transformer Encoder Block 
![](https://i.imgur.com/UNfcDsq.png)

In [64]:
from torch import Tensor
from torch.nn import Module

class TransformerEncoderBlock(Module):
    """
    One encoder layer: a `MultiHeadSelfAttentionBlock` followed by a
    `MultiLayerPerceptronBlock`. Each sub-block applies its own layer
    normalization and residual connection.

    Args:
        `embedding_size` (int, optional): The size of the input embeddings. Default is 768.
        `nbr_heads` (int, optional): The number of attention heads in the `MultiHeadSelfAttentionBlock`. Default is 12.
        `dropout_attention` (float, optional): The dropout rate for the `MultiHeadSelfAttentionBlock`. Default is 0.0.
        `mlp_units` (int, optional): The number of units in the `MultiLayerPerceptronBlock`. Default is 3072.
        `dropout_mlp` (float, optional): The dropout rate for the `MultiLayerPerceptronBlock`. Default is 0.1.

    Forward method input:
        x (Tensor): a sequence of token embeddings shaped [batch_size, nbr_tokens, embedding_size].

    Return
        Tensor: a tensor with the same size as the input, after passing through
        the `MultiHeadSelfAttentionBlock` and then the `MultiLayerPerceptronBlock`.
    """
    def __init__(
        self,
        embedding_size: int = 768,
        nbr_heads: int = 12,
        dropout_attention: float = 0.0,
        mlp_units: int = 3072,
        dropout_mlp: float = 0.1,
    ) -> None:
        super().__init__()

        attention_kwargs = dict(
            embedding_size=embedding_size,
            nbr_heads=nbr_heads,
            dropout_rate=dropout_attention,
        )
        mlp_kwargs = dict(
            embedding_size=embedding_size,
            units=mlp_units,
            dropout_rate=dropout_mlp,
        )

        # Attention is registered (and initialized) before the MLP.
        self.attention_block = MultiHeadSelfAttentionBlock(**attention_kwargs)
        self.mlp_block = MultiLayerPerceptronBlock(**mlp_kwargs)

    def forward(self, x: Tensor) -> Tensor:
        # Attention first, then the MLP on the attention block's output.
        return self.mlp_block(self.attention_block(x))

In [67]:
embedding_size = 768

# Random input tensor of shape (1, 5, embedding_size).
x = torch.randn(1, 5, embedding_size)

encoder_block = TransformerEncoderBlock(
    embedding_size=embedding_size,
    nbr_heads=12,
    dropout_attention=0.0,
    mlp_units=3072,
    dropout_mlp=0.1,
)

out = encoder_block(x)

# Last expression of the cell: display the output shape.
out.shape

torch.Size([1, 5, 768])

In [68]:
from torchinfo import summary

# Per-layer input/output shapes and parameter counts of a single encoder block.
summary(
    model=encoder_block,
    input_size=(1, 5, embedding_size),
    col_names=["input_size", "output_size", "num_params"],
    col_width=20,
    row_settings=["var_names"],
)

Layer (type (var_name))                                 Input Shape          Output Shape         Param #
TransformerEncoderBlock (TransformerEncoderBlock)       [1, 5, 768]          [1, 5, 768]          --
├─MultiHeadSelfAttentionBlock (attention_block)         [1, 5, 768]          [1, 5, 768]          --
│    └─LayerNorm (layer_norm)                           [1, 5, 768]          [1, 5, 768]          1,536
│    └─MultiheadAttention (multi_head_attention)        --                   [1, 5, 768]          2,362,368
│    └─ResidualConnection (residual_connection)         --                   [1, 5, 768]          --
├─MultiLayerPerceptronBlock (mlp_block)                 [1, 5, 768]          [1, 5, 768]          --
│    └─LayerNorm (layer_norm)                           [1, 5, 768]          [1, 5, 768]          1,536
│    └─Linear (fc1)                                     [1, 5, 768]          [1, 5, 3072]         2,362,368
│    └─GELU (gelu)                                      [1, 5, 307

### Transformer Encoder
![](https://i.imgur.com/HXvoxHr.png)

In [70]:
from torch import Tensor
from torch.nn import Module
from torch.nn import Sequential

class TransformerEncoder(Module):
    """
    Stack of identically configured `TransformerEncoderBlock`s applied in sequence.

    Attributes:
        `encoder` (Sequential): the encoder blocks, registered under the names "0", "1", ...

    Args:
        `nbr_encoder_blocks` (int, optional): Number of stacked `TransformerEncoderBlock`s. Default is 12.
        `embedding_size` (int, optional): The size of the input embeddings. Default is 768.
        `nbr_heads` (int, optional): The number of attention heads in each block. Default is 12.
        `dropout_attention` (float, optional): The attention dropout rate of each block. Default is 0.0.
        `mlp_units` (int, optional): The number of hidden MLP units in each block. Default is 3072.
        `dropout_mlp` (float, optional): The MLP dropout rate of each block. Default is 0.1.

    Returns:
        Tensor: the input after passing through every encoder block in order.
    """

    def __init__(
        self,
        nbr_encoder_blocks: int = 12,
        embedding_size: int = 768,
        nbr_heads: int = 12,
        dropout_attention: float = 0.0,
        mlp_units: int = 3072,
        dropout_mlp: float = 0.1,
    ) -> None:
        super().__init__()

        # Each block is a separate instance with its own parameters.
        blocks = [
            TransformerEncoderBlock(
                embedding_size=embedding_size,
                nbr_heads=nbr_heads,
                dropout_attention=dropout_attention,
                mlp_units=mlp_units,
                dropout_mlp=dropout_mlp,
            )
            for _ in range(nbr_encoder_blocks)
        ]
        self.encoder = Sequential(*blocks)

    def forward(self, x: Tensor) -> Tensor:
        return self.encoder(x)
        
        

In [75]:
embedding_size = 768

# Random input tensor of shape (1, 5, embedding_size).
x = torch.randn(1, 5, embedding_size)

# `nbr_encoder_blocks` is not passed, so the class default is used.
encoder = TransformerEncoder(
    embedding_size=embedding_size,
    nbr_heads=12,
    dropout_attention=0.0,
    mlp_units=3072,
    dropout_mlp=0.1,
)

out = encoder(x)

# Last expression of the cell: display the output shape.
out.shape

torch.Size([1, 5, 768])

In [76]:
from torchinfo import summary

# Summarize the full stacked encoder built in the previous cell.
# This previously passed `encoder_block` (a single TransformerEncoderBlock
# from the earlier section), which does not match this section or the
# TransformerEncoder summary shown below the cell.
summary(
    model=encoder,
    input_size=(1, 5, embedding_size),
    col_names=["input_size", "output_size", "num_params"],
    col_width=20,
    row_settings=["var_names"],
)

Layer (type (var_name))                                           Input Shape          Output Shape         Param #
TransformerEncoder (TransformerEncoder)                           [1, 5, 768]          [1, 5, 768]          --
├─Sequential (encoder)                                            [1, 5, 768]          [1, 5, 768]          --
│    └─TransformerEncoderBlock (0)                                [1, 5, 768]          [1, 5, 768]          --
│    │    └─MultiHeadSelfAttentionBlock (attention_block)         [1, 5, 768]          [1, 5, 768]          2,363,904
│    │    └─MultiLayerPerceptronBlock (mlp_block)                 [1, 5, 768]          [1, 5, 768]          4,723,968
│    └─TransformerEncoderBlock (1)                                [1, 5, 768]          [1, 5, 768]          --
│    │    └─MultiHeadSelfAttentionBlock (attention_block)         [1, 5, 768]          [1, 5, 768]          2,363,904
│    │    └─MultiLayerPerceptronBlock (mlp_block)                 [1, 5, 768]         