In [5]:
import torch

### ResidualLayerNorm

##### Example 1

In [4]:
# Toy dimensions for the LayerNorm example.
batch = 20
sentence_length = 5
embedding_dim = 10

In [18]:
# Random stand-in for a batch of embedded sentences, shaped
# (batch, sentence_length, embedding_dim). Unseeded, so values differ per run.
embedding = torch.randn(batch, sentence_length, embedding_dim)

In [36]:
from torch import nn

`embedding` is a text embedding of a batch of `20` sentences; each sentence contains `5` words, and each word is represented by a `10`-dimensional vector

In [37]:
embedding.shape

torch.Size([20, 5, 10])

Apply layer normalization to `embedding` using PyTorch's built-in `nn.LayerNorm` module

In [38]:
# Read the feature size back from the tensor's last dimension.
embedding_dim = embedding.shape[-1]

In [39]:
# Given a single int, nn.LayerNorm normalizes over the last dimension,
# which must have exactly that size.
layer_norm = nn.LayerNorm(embedding_dim)

In [40]:
# Layer normalization preserves the input shape.
output = layer_norm(embedding)

In [41]:
output.shape

torch.Size([20, 5, 10])

##### Example 2

In [43]:
# Random stand-in for the output of a multi-head attention block.
output_attention = torch.randn(10, 3, 5)

In [45]:
# Random stand-in for the text embeddings; must have the same shape as
# `output_attention` so the two can be added in the residual connection.
embeddings = torch.randn(10, 3, 5)

In [100]:
from torch import nn

In [101]:
class ResidualLayerNorm(nn.Module):
    """Residual connection followed by layer normalization ("Add & Norm").

    Computes ``LayerNorm(Dropout(x) + residual)``.

    Source: git/hyunwoongko/transformer/blob/master/models/blocks/encoder_layer.py

    Args:
        d_model: Size of the last (feature) dimension; passed to ``nn.LayerNorm``.
        dropout: Drop probability passed to ``nn.Dropout``.
    """

    def __init__(self, d_model, dropout):
        super().__init__()
        self.layer_norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, residual):
        """Apply dropout to ``x``, add ``residual``, then layer-normalize.

        Args:
            x: Output of the sub-layer being wrapped.
            residual: Input of that sub-layer (the skip connection); must be
                addable to ``x``.

        Returns:
            The normalized sum.
        """
        # Dropout is applied to the sub-layer output only, never to the skip path.
        regularized = self.dropout(x)
        summed = regularized + residual
        return self.layer_norm(summed)

`output_attention` is the output of the first multi-head attention

`embeddings` is the text embedding of training data

In [102]:
output_attention.shape, embeddings.shape

(torch.Size([10, 3, 5]), torch.Size([10, 3, 5]))

`ResidualLayerNorm` is the first layer norm in the Encoder Block. Write it from scratch.

**Hint**: You may use PyTorch's built-in `LayerNorm` module

In [103]:
# d_model=5 matches the last dimension of `output_attention` and `embeddings`.
layer = ResidualLayerNorm(d_model=5, dropout=0.3)

In [104]:
# The attention output is the sub-layer result; the embeddings are the skip connection.
output = layer(x=output_attention, residual=embeddings)

In [105]:
output.shape

torch.Size([10, 3, 5])

### Position-wise Feed Forward

##### Example 1

In [80]:
# Random stand-in for the feed-forward input; the last dimension (5) is the
# feature size the layer below is built for.
x = torch.randn(10, 3, 5)

In [87]:
from torch import nn

In [88]:
class PostionWiseFeedForward(nn.Module):
    """Feed-forward sub-layer: Linear -> ReLU -> Dropout -> Linear.

    The linear layers act on the last dimension only, so the same
    transformation is applied independently at every sequence position.

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden (inner) feature size.
        dropout: Drop probability applied after the ReLU.
    """

    def __init__(self, d_model, d_ff, dropout):
        super().__init__()
        # Built in the same order as they are applied; the positions inside
        # the Sequential (0..3) determine the state_dict keys.
        expand = nn.Linear(d_model, d_ff)
        activation = nn.ReLU()
        regularize = nn.Dropout(dropout)
        project = nn.Linear(d_ff, d_model)
        self.layers = nn.Sequential(expand, activation, regularize, project)

    def forward(self, x):
        """Map ``x`` of shape [batch_size x seq_len x d_model] to the same shape."""
        return self.layers(x)

`x` is the output of the `Add & Norm` layer in the Encoder Layer

In [89]:
x.shape

torch.Size([10, 3, 5])

Write a Position-wise Feed Forward layer from scratch

**Hint**: You may use PyTorch's `Dropout` module

In [90]:
# d_model=5 matches the last dimension of `x`; d_ff=16 is the hidden width.
layer = PostionWiseFeedForward(d_model=5, d_ff=16, dropout=0.3)

In [91]:
# The feed-forward layer maps d_model -> d_ff -> d_model, so the shape is unchanged.
output = layer(x)

In [92]:
output.shape

torch.Size([10, 3, 5])

### Encoder Layer

##### Example 1

In [39]:
# Random stand-in for embedded input; the last dimension (10) is the d_model
# the encoder layer below is built with.
embeddings = torch.randn(5, 3, 10)

Given
- `ResidualLayerNorm`
    + Takes `d_model` and `dropout` at initialization
    + Takes `x` and `residual` in the forward pass
- `PostionWiseFeedForward`
    + Takes `d_model`, `d_ff` and `dropout` at initialization
    + Takes `x` in the forward pass
- `MultiHeadAttention`
    + Takes `d_model` and `n_heads` at initialization
    + Takes `pre_q`, `pre_k`, and `pre_v` in the forward pass

In [80]:
from torch import nn
from foundation.transformer.encoder import ResidualLayerNorm, PostionWiseFeedForward
from foundation.transformer.efficient_attention import MultiHeadAttention

Write an Encoder Layer in Transformer from scratch **(no mask)**

In [81]:
class EncoderLayer(nn.Module):
    """Single Transformer encoder layer (no mask).

    Pipeline: self-attention -> Add & Norm -> position-wise feed-forward
    -> Add & Norm.

    Args:
        d_model: Feature size of the input and output.
        n_heads: Number of attention heads.
        d_ff: Hidden size of the position-wise feed-forward block.
        dropout: Drop probability used by both Add & Norm steps and by the
            feed-forward block.
    """

    def __init__(self, d_model, n_heads, d_ff, dropout):
        super().__init__()

        self.mha = MultiHeadAttention(d_model, n_heads)
        self.norm_1 = ResidualLayerNorm(d_model, dropout=dropout)
        # Fix: the hidden width must be d_ff (previously n_heads was passed
        # here by mistake and dropout was not forwarded at all).
        self.feed_forward = PostionWiseFeedForward(d_model, d_ff, dropout)
        self.norm_2 = ResidualLayerNorm(d_model, dropout=dropout)

    def forward(self, x):
        """Run one encoder layer.

        Args:
            x: Input of shape [batch_size x seq_len x d_model].

        Returns:
            A tuple ``(output, attention_weights)``: the layer output, with
            the same shape as ``x``, and the weights returned by the
            attention module.
        """
        # Self-attention: queries, keys and values all come from the same input.
        mha_output, mha_weights = self.mha(
            pre_q=x, pre_k=x, pre_v=x
        )
        norm_1 = self.norm_1(x=mha_output, residual=x)
        ff = self.feed_forward(norm_1)
        # The second residual connection skips over the feed-forward block.
        norm_2 = self.norm_2(x=ff, residual=norm_1)

        return norm_2, mha_weights

In [82]:
# d_model=10 matches the last dimension of `embeddings`.
encoder_layer = EncoderLayer(d_model=10, n_heads=2, d_ff=16, dropout=0.2)

In [83]:
embeddings.shape

torch.Size([5, 3, 10])

In [84]:
# The layer returns both its output and the self-attention weights.
output, attention_weights = encoder_layer(embeddings)

In [86]:
output.shape, attention_weights.shape

(torch.Size([5, 3, 10]), torch.Size([5, 2, 3, 3]))

### Encoder Block

In [9]:
# Token ids 0..29 arranged as a 6 x 5 batch; all ids are below the vocabulary
# size of 1000 used by `Encoder`. Note that id 0 is the padding index there.
tokens = torch.arange(0, 30).reshape(6, 5)

In [18]:
from torch import nn
from foundation.transformer.encoder import EncoderLayer
from foundation.transformer.embedding import TextEmbedding
from foundation.transformer.positional_encoding import PositionalEncoding

In [19]:
class Encoder(nn.Module):
    """Transformer encoder: token embedding + positional encoding + N encoder layers.

    Args:
        d_model: Feature size used throughout the encoder.
        n_heads: Number of attention heads in every layer.
        n_layers: Number of stacked ``EncoderLayer`` modules (may be 0).
        d_ff: Hidden size of each layer's feed-forward block.
        dropout: Drop probability forwarded to every layer.
        vocab_size: Vocabulary size forwarded to ``TextEmbedding``
            (defaults to the previously hard-coded 1000).
        padding_idx: Padding token id forwarded to ``TextEmbedding``
            (defaults to the previously hard-coded 0).
    """

    def __init__(self, d_model, n_heads, n_layers, d_ff, dropout,
                 vocab_size=1000, padding_idx=0):
        super().__init__()
        self.embedding = TextEmbedding(
            vocab_size=vocab_size,
            d_model=d_model,
            padding_idx=padding_idx
        )
        self.positional_encoding = PositionalEncoding(d_model)
        self.encoders = nn.ModuleList([
            EncoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(n_layers)
        ])

    def forward(self, x):
        """Encode a batch of token ids.

        Args:
            x: Token ids of shape [batch_size x src_seq_len].

        Returns:
            A tuple ``(encoding, encoder_attention_weights)``. ``encoding`` has
            shape [batch_size x src_seq_len x d_model]. The weights come from
            the LAST layer only, with shape
            [batch_size x num_heads x src_seq_len x src_seq_len], or ``None``
            when the encoder was built with ``n_layers=0``.
        """
        # shape(embeddings) = [batch_size x src_seq_len x d_model]
        embeddings = self.embedding(x)
        # shape(encoding) = [batch_size x src_seq_len x d_model]
        encoding = self.positional_encoding(embeddings)

        # Bound up front so an encoder with zero layers does not raise
        # UnboundLocalError at the return statement.
        encoder_attention_weights = None
        for encoder in self.encoders:
            # Each layer consumes the previous layer's output; the attention
            # weights are overwritten, so only the final layer's survive.
            encoding, encoder_attention_weights = encoder(encoding)

        return encoding, encoder_attention_weights

In [20]:
# A small three-layer encoder for the shape check below.
encoder = Encoder(d_model=10, n_heads=2, n_layers=3, d_ff=16, dropout=0.2)

`tokens` is a batch of `6` sentences; each sentence contains `5` words

In [21]:
tokens.shape

torch.Size([6, 5])

In [22]:
# Returns the final encoding and the attention weights of the last encoder layer.
encoding, encoder_attention_weights = encoder(tokens)

In [23]:
encoding.shape, encoder_attention_weights.shape

(torch.Size([6, 5, 10]), torch.Size([6, 2, 5, 5]))