# Introduction to the Transformer Model

In [1]:
import torch
import torch.nn as nn
import math

## Define the Attention Mechanism

In [20]:
class Attention(nn.Module):
    """Scaled dot-product attention.

    Computes ``softmax(Q @ K^T / sqrt(d_k)) @ V`` over the last two dimensions
    of the inputs. Any leading dimensions are handled by ``torch.matmul``
    broadcasting.
    """

    def forward(self, query, key, value, mask=None, dropout=None):
        """Apply attention and return the weighted values and the weights.

        Args:
            query: Tensor whose last two dimensions are ``(seq_q, d_k)``.
            key: Tensor whose last two dimensions are ``(seq_k, d_k)``.
            value: Tensor whose last two dimensions are ``(seq_k, d_v)``.
            mask: Optional tensor broadcastable to the score shape
                ``(..., seq_q, seq_k)``. Positions where ``mask == 0`` are
                excluded from attention.
            dropout: Optional callable (e.g. ``nn.Dropout``) applied to the
                attention weights.

        Returns:
            A tuple ``(output, p_attn)``: the attention-weighted values and
            the attention weights (after dropout, if given).
        """
        d_k = query.size(-1)  # Size of the last query dimension, used for scaling
        # Dot product of each query with every key, scaled by sqrt(d_k)
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            # Fill masked positions with the most negative finite value of the
            # score dtype. A hard-coded -1e9 is outside the float16 range, so
            # this keeps the module usable in reduced precision.
            fill_value = torch.finfo(scores.dtype).min
            scores = scores.masked_fill(mask == 0, fill_value)
        p_attn = torch.softmax(scores, dim=-1)  # Normalize over the key axis
        if dropout is not None:
            p_attn = dropout(p_attn)  # Apply dropout to the attention weights
        # Weighted sum of the values
        return torch.matmul(p_attn, value), p_attn

In [28]:
""" Example """
# Self-attention on a random tensor: batch_size=1, seq_len=5, model_dim=512
attention_module = Attention()
query = torch.rand(1, 5, 512)
key = value = query  # Q, K and V all refer to the same tensor
output, attn_weights = attention_module(query, key, value)
print("Output shape:", output.shape)  # Expected shape: (1, 5, 512)
print("Attention Weights shape:", attn_weights.shape)  # Expected shape: (1, 5, 5)

Output shape: torch.Size([1, 5, 512])
Attention Weights shape: torch.Size([1, 5, 5])


## Define Multi-Headed Attention

In [22]:
class MultiHeadedAttention(nn.Module):
    """Multi-head attention built on top of the ``Attention`` module.

    The inputs are linearly projected, split into ``h`` heads of size
    ``d_model // h``, attended independently, re-concatenated and passed
    through a final linear projection.
    """

    def __init__(self, h, d_model, dropout=0.1):
        """Create the projection layers.

        Args:
            h: Number of attention heads.
            d_model: Model (embedding) dimension; must be divisible by ``h``.
            dropout: Dropout probability applied to the attention weights.

        Raises:
            ValueError: If ``d_model`` is not divisible by ``h``.
        """
        super().__init__()
        # Fail early: with a non-divisible configuration the head reshape in
        # ``forward`` cannot reproduce ``d_model`` features and would only
        # surface later as an opaque shape error.
        if d_model % h != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by the number of heads ({h})"
            )
        self.d_k = d_model // h  # Dimension of each head
        self.h = h  # Number of heads
        # Linear layers for projecting Q, K, V (in that order)
        self.linear_layers = nn.ModuleList([nn.Linear(d_model, d_model) for _ in range(3)])
        self.output_linear = nn.Linear(d_model, d_model)  # Final projection
        self.attention = Attention()  # Scaled dot-product attention
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        """Run multi-head attention.

        Args:
            query: Tensor of shape ``(batch, seq_q, d_model)``.
            key: Tensor of shape ``(batch, seq_k, d_model)``.
            value: Tensor of shape ``(batch, seq_k, d_model)``.
            mask: Optional mask; a head axis is inserted at dimension 1 so
                that, e.g., a ``(batch, seq_q, seq_k)`` mask is shared by all
                heads. Positions equal to 0 are masked out.

        Returns:
            Tensor of shape ``(batch, seq_q, d_model)``.
        """
        batch_size = query.size(0)
        # Project, then reshape (batch, seq, d_model) -> (batch, h, seq, d_k)
        query, key, value = [
            proj(x).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
            for proj, x in zip(self.linear_layers, (query, key, value))
        ]
        if mask is not None:
            mask = mask.unsqueeze(1)  # Add a head axis so the mask broadcasts over heads
        # The attention weights are returned as well but are not needed here
        x, _ = self.attention(query, key, value, mask=mask, dropout=self.dropout)
        # Merge heads back: (batch, h, seq, d_k) -> (batch, seq, h * d_k)
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)
        return self.output_linear(x)  # Apply final linear projection

In [36]:
""" Example """
# Example dimensions: batch_size=1, seq_len=10, d_model=512
query = torch.rand(1, 10, 512)
key = value = query  # Q, K and V all refer to the same tensor
multi_head_attention = MultiHeadedAttention(8, 512)
output = multi_head_attention(query, key, value)
print("Multi-Headed Attention Output shape:", output.shape)  # Expected shape: (1, 10, 512)

Multi-Headed Attention Output shape: torch.Size([1, 10, 512])


## Define Position-wise Feed-forward Networks

In [23]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network applied along the last dimension.

    Computes ``l2(dropout(relu(l1(x))))``, expanding from ``d_model`` to
    ``d_ff`` features and projecting back to ``d_model``.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        """Build the two linear layers and the dropout between them.

        Args:
            d_model: Input and output feature size.
            d_ff: Hidden feature size.
            dropout: Dropout probability applied after the activation.
        """
        super().__init__()
        self.l1 = nn.Linear(d_model, d_ff)  # Expansion: d_model -> d_ff
        self.l2 = nn.Linear(d_ff, d_model)  # Projection: d_ff -> d_model
        self.dropout = nn.Dropout(p=dropout)  # Regularization on the hidden activations

    def forward(self, x):
        """Return the transformed tensor; the last dimension stays ``d_model``."""
        hidden = torch.relu(self.l1(x))  # Non-linearity between the linear layers
        hidden = self.dropout(hidden)
        return self.l2(hidden)

In [37]:
""" Example """
# Example dimensions: batch_size=1, seq_len=10, d_model=512
input_tensor = torch.rand(1, 10, 512)
ffn = PositionwiseFeedForward(512, 2048)  # d_model=512, d_ff=2048
ffn_output = ffn(input_tensor)
print("Feed-forward Network Output shape:", ffn_output.shape)  # Expected shape: (1, 10, 512)

Feed-forward Network Output shape: torch.Size([1, 10, 512])


## Define Sublayer Connection

In [24]:
class SublayerConnection(nn.Module):
    """Residual connection around a sublayer, with layer norm applied first.

    Computes ``x + dropout(sublayer(norm(x)))``.
    """

    def __init__(self, size, dropout):
        """Create the layer norm and dropout.

        Args:
            size: Feature size normalized by ``nn.LayerNorm``.
            dropout: Dropout probability applied to the sublayer output.
        """
        super().__init__()
        self.norm = nn.LayerNorm(size)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, sublayer):
        """Apply ``sublayer`` to the normalized input and add the residual.

        Args:
            x: Input tensor.
            sublayer: Callable mapping the normalized tensor to a tensor that
                can be added to ``x``.
        """
        normalized = self.norm(x)
        transformed = sublayer(normalized)
        regularized = self.dropout(transformed)
        return x + regularized  # Residual connection

In [38]:
""" Example """
# Example dimensions: batch_size=1, seq_len=10, d_model=512
input_tensor = torch.rand(1, 10, 512)
sublayer_connection = SublayerConnection(512, 0.1)  # size=512, dropout=0.1
# Stand-in sublayer for the demo: a lambda that adds a tensor of ones
sublayer_output = sublayer_connection(input_tensor, lambda t: t + torch.ones_like(t))
print("Sublayer Connection Output shape:", sublayer_output.shape)  # Expected shape: (1, 10, 512)

Sublayer Connection Output shape: torch.Size([1, 10, 512])


## Define the Transformer Block

In [25]:
class TransformerBlock(nn.Module):
    """One transformer layer: self-attention followed by a feed-forward network.

    Each of the two stages is wrapped in its own ``SublayerConnection``.
    """

    def __init__(self, hidden, attn_heads, dropout):
        """Build the attention, feed-forward and wrapper modules.

        Args:
            hidden: Feature size of the block's input and output.
            attn_heads: Number of attention heads.
            dropout: Dropout probability passed to every submodule.
        """
        super().__init__()
        self.attention = MultiHeadedAttention(h=attn_heads, d_model=hidden, dropout=dropout)
        # The feed-forward hidden size is fixed at four times the model size
        self.feed_forward = PositionwiseFeedForward(
            d_model=hidden, d_ff=4 * hidden, dropout=dropout
        )
        self.input_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        self.output_sublayer = SublayerConnection(size=hidden, dropout=dropout)

    def forward(self, x, mask=None):
        """Run the block on ``x``; ``mask`` is forwarded to the attention module."""

        def self_attend(normed):
            # Self-attention: the same tensor serves as query, key and value
            return self.attention(normed, normed, normed, mask=mask)

        attended = self.input_sublayer(x, self_attend)
        return self.output_sublayer(attended, self.feed_forward)

In [39]:
""" Example """
# Example dimensions: batch_size=1, seq_len=10, d_model=512
input_tensor = torch.rand(1, 10, 512)
transformer_block = TransformerBlock(512, 8, 0.1)  # hidden=512, attn_heads=8, dropout=0.1
transformer_output = transformer_block(input_tensor)
print("Transformer Block Output shape:", transformer_output.shape)  # Expected shape: (1, 10, 512)

Transformer Block Output shape: torch.Size([1, 10, 512])


## Example Usage

In [26]:
# Example input
batch_size, seq_length, hidden = 5, 10, 512
model = TransformerBlock(hidden=hidden, attn_heads=8, dropout=0.1)
input_tensor = torch.rand(batch_size, seq_length, hidden)

# Example mask (optional): random 0/1 flag per token, expanded to a pairwise
# (batch, seq, seq) mask whose entry [a, b, c] is the product of the flags of
# tokens b and c in sample a
mask = torch.randint(0, 2, (batch_size, seq_length))
mask = mask.unsqueeze(2) * mask.unsqueeze(1)

# Forward pass
print("input_tensor shape:", input_tensor.shape)
print("mask shape:", mask.shape)
output = model(input_tensor, mask=mask)
print("output shape:", output.shape)  # Expected shape: (batch_size, seq_length, hidden)

input_tensor shape: torch.Size([5, 10, 512])
mask shape: torch.Size([5, 10, 10])
output shape: torch.Size([5, 10, 512])
