In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiHeadSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention for batch-first input.

    The input is projected to queries, keys and values by one fused linear
    layer, split into ``num_heads`` heads of width ``d_model // num_heads``,
    attended per head, concatenated again and sent through an output
    projection.

    Args:
        d_model: Size of the last dimension of the input and of the output.
        num_heads: Number of attention heads; must divide ``d_model`` evenly.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.num_heads = num_heads
        self.d_head = d_model // num_heads

        # A single fused projection produces Q, K and V in one matmul.
        self.qkv_proj = nn.Linear(d_model, 3 * d_model)
        self.out_proj = nn.Linear(d_model, d_model)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape ``[B, T, D]`` with ``D == d_model``.
            mask: Optional tensor broadcastable to ``[B, num_heads, T, T]``.
                Positions where ``mask == 0`` are excluded from attention.

        Returns:
            Tensor of shape ``[B, T, D]``. A query position whose keys are
            all masked receives zero attention output (so only the output
            projection's bias) instead of NaN.
        """
        B, T, D = x.shape

        qkv = self.qkv_proj(x)  # Shape: [B, T, 3*D]
        q, k, v = qkv.chunk(3, dim=-1)

        # Split into heads: [B, T, D] -> [B, num_heads, T, d_head]
        q = q.view(B, T, self.num_heads, self.d_head).transpose(1, 2)
        k = k.view(B, T, self.num_heads, self.d_head).transpose(1, 2)
        v = v.view(B, T, self.num_heads, self.d_head).transpose(1, 2)

        # Scaled dot-product attention
        scores = (q @ k.transpose(-2, -1)) / self.d_head**0.5

        fully_blocked = None
        if mask is not None:
            blocked = mask == 0
            # Rows with every key masked would become all -inf and softmax
            # would return NaN. Leave those rows unmasked for the softmax and
            # zero their weights afterwards, which keeps both the forward
            # values and the gradients finite.
            fully_blocked = blocked.all(dim=-1, keepdim=True)
            scores = scores.masked_fill(blocked & ~fully_blocked, float('-inf'))

        attn_weights = F.softmax(scores, dim=-1)
        if fully_blocked is not None:
            attn_weights = attn_weights.masked_fill(fully_blocked, 0.0)

        attn_output = attn_weights @ v  # Shape: [B, num_heads, T, d_head]

        # Concatenate heads: [B, num_heads, T, d_head] -> [B, T, D]
        out = attn_output.transpose(1, 2).contiguous().view(B, T, D)
        return self.out_proj(out)

class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network applied along the last dimension.

    Computes ``linear2(dropout(relu(linear1(x))))``.

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden feature size between the two linear layers.
        dropout: Dropout probability applied after the ReLU.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Expand to ``d_ff``, apply ReLU and dropout, project back to ``d_model``."""
        hidden = self.linear1(x)
        activated = F.relu(hidden)
        regularized = self.dropout(activated)
        return self.linear2(regularized)

class TransformerEncoderBlock(nn.Module):
    """Encoder layer made of a self-attention and a feed-forward sub-layer.

    Each sub-layer's output goes through dropout, is added to the sub-layer's
    input, and the sum is layer-normalized (normalization after the residual).

    Args:
        d_model: Feature size of the input and output.
        num_heads: Number of attention heads.
        d_ff: Hidden size of the feed-forward sub-layer.
        dropout: Dropout probability used on both residual branches and
            inside the feed-forward sub-layer.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.attn = MultiHeadSelfAttention(d_model, num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.ff = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run both sub-layers on ``x``; ``mask`` is forwarded to the attention."""
        # Sub-layer 1: self-attention, then residual add and norm.
        attended = self.dropout(self.attn(x, mask))
        hidden = self.norm1(x + attended)

        # Sub-layer 2: feed-forward, then residual add and norm.
        transformed = self.dropout(self.ff(hidden))
        return self.norm2(hidden + transformed)


What’s Inside

| Module                    | Purpose                                      |
| ------------------------- | -------------------------------------------- |
| `MultiHeadSelfAttention`  | Implements multi-head self-attention         |
| `PositionwiseFeedForward` | Applies a two-layer MLP to every token       |
| `TransformerEncoderBlock` | Combines attention + FFN + residuals & norms |


In [None]:

# Settings
# These module-level names are reused by the later cells that build the
# stacked encoder and the positional encoding.
batch_size = 2
seq_len = 5
d_model = 32   # must be divisible by num_heads (asserted in MultiHeadSelfAttention)
num_heads = 4
d_ff = 64      # hidden width of the position-wise feed-forward layer

# Dummy input tensor (like output of embedding layer), shape (batch_size, seq_len, d_model)
dummy_input = torch.randn(batch_size, seq_len, d_model)

# Transformer Encoder Block (dropout is left at the constructor default of 0.1)
encoder = TransformerEncoderBlock(d_model=d_model, num_heads=num_heads, d_ff=d_ff)

# Forward pass (no attention mask is supplied)
# NOTE(review): the block is never switched to eval(), so dropout is presumably
# active and the printed values will differ between runs — confirm this is intended.
output = encoder(dummy_input)

print("Input Shape:", dummy_input.shape)
print("Output Shape:", output.shape)
print("\nOutput Tensor:")
print(output)


Output Interpretation
Shape remains the same: (batch_size, seq_len, d_model)
But the values are transformed via:
Self-attention to capture relationships across tokens.
Feedforward to add depth.
LayerNorm + Residuals for training stability.
This step lets you trace how input embeddings evolve through the encoder.

Step 3: Stack Multiple Transformer Encoder Blocks

In [None]:
import torch.nn as nn

class TransformerEncoder(nn.Module):
    """Stack of ``TransformerEncoderBlock`` layers followed by a final LayerNorm.

    Args:
        num_layers: Number of encoder blocks to stack.
        d_model: Feature size of the input and output.
        num_heads: Number of attention heads in every block.
        d_ff: Hidden size of every block's feed-forward sub-layer.
        dropout: Dropout probability handed to every block. Defaults to 0.1,
            the same default the block itself uses.
    """

    def __init__(self, num_layers, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.layers = nn.ModuleList([
            TransformerEncoderBlock(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ])
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        """Pass ``x`` through every block in order, then normalize.

        Args:
            x: Input tensor; each block receives the previous block's output.
            mask: Optional attention mask forwarded unchanged to every block
                (``None`` means no masking).

        Returns:
            The layer-normalized output of the last block.
        """
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)


 Explanation:
nn.ModuleList: allows stacking submodules (like encoder blocks).
Each layer passes output to the next → deep sequence modeling.
Final LayerNorm stabilizes the output.

In [None]:
# Config
# d_model, num_heads, d_ff and dummy_input are the module-level values
# defined in the earlier single-block demo cell.
num_layers = 4  # Stack 4 encoder layers
encoder_stack = TransformerEncoder(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    d_ff=d_ff
)

# Pass dummy input through stacked encoder (all blocks, then the final LayerNorm)
stacked_output = encoder_stack(dummy_input)

print("Final Stacked Output Shape:", stacked_output.shape)


Next: Add Positional Encoding

⚠️ Transformers have no recurrence or convolution. They need positional encodings to understand sequence order.
This step is crucial before using real data, especially for tasks like classification, translation, etc.

 Why Positional Encoding?
The attention mechanism treats inputs like a set, not a sequence. So:
“I love AI” → treated the same as → “AI love I” (same tokens, different order)
Without position, order is lost
Positional encodings inject sequence order information into the input.

 Sinusoidal Positional Encoding (Vaswani et al.)

In [None]:
import math
import torch

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch-first input.

    Even feature indices hold ``sin(pos * w_i)`` and odd indices hold
    ``cos(pos * w_i)`` with ``w_i = 10000 ** (-2i / d_model)``. The table is
    built once and stored as a non-trainable buffer named ``pe``.

    Args:
        d_model: Feature size of the input; may be even or odd.
        max_len: Number of positions precomputed. Inputs with more positions
            than this are not supported.
    """

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()

        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # Shape: (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)  # even
        # For odd d_model there is one fewer odd slot than even slots, so the
        # cosine half uses only the first d_model // 2 frequencies.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])  # odd

        pe = pe.unsqueeze(0)  # Shape: (1, max_len, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions."""
        x = x + self.pe[:, :x.size(1)]
        return x


Use it in the Encoder Pipeline

In [None]:
# Initialize positional encoding (max_len is left at its default of 5000)
pos_enc = PositionalEncoding(d_model)

# Add positional info to dummy input; dummy_input itself is not modified
x_with_pos = pos_enc(dummy_input)

# Pass through stacked encoder (the encoder_stack built in the earlier cell)
final_output = encoder_stack(x_with_pos)
print("Output shape with positional encoding:", final_output.shape)


Step 5: Decoder Block + Cross Attention

🔥 This step completes the full Transformer architecture, which is needed for sequence-to-sequence tasks like machine translation, text summarization, or code generation.
✅ Why This Now?
We’ve:
✅ Built the full encoder with attention + position
✅ Studied attention in isolation
🚫 Not yet seen how Decoder uses both self-attention and cross-attention with encoder output.
🎯 Learning Goal:
Understand and implement a Transformer Decoder Block with:
Masked self-attention (so it doesn't peek into the future)
Cross-attention (attends to encoder outputs)
Feedforward + residuals + layer norms
Once done, you’ll understand 99% of the architecture behind models like:
GPT (decoder-only)
BERT (encoder-only)
T5, BART, etc. (encoder-decoder)

Transformer Decoder Block: Core Concepts

Input → Masked Self-Attention
      → Add & Norm
      → Cross-Attention (Encoder Output)
      → Add & Norm
      → Feed Forward Network (FFN)
      → Add & Norm → Output


🔒 1. Masked Multi-Head Self-Attention
Prevents future tokens from being seen.
Like: in “I am happy”, the position for “am” can attend to “I” and “am”, but not to “happy”.
Uses a causal mask (triangular matrix) to block right-side tokens.
🔁 2. Cross-Attention
Allows decoder to attend to encoder’s outputs.
Decoder Query (Q) attends to encoder Key/Value (K, V).
Critical for seq2seq tasks like translation.
⚙️ 3. Feedforward + Add & Norm
Just like encoder: 2-layer FFN + residuals + layer norm.

+-------------------------+
|      Masked Self-Attn   | ← attends to past decoder tokens only
+-------------------------+
|      Add & LayerNorm    |
+-------------------------+
|      Cross-Attention    | ← attends to encoder output
+-------------------------+
|      Add & LayerNorm    |
+-------------------------+
| Feed Forward (2-layer)  |
+-------------------------+
|      Add & LayerNorm    |
+-------------------------+

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TransformerDecoderBlock(nn.Module):
    """Decoder layer: masked self-attention, cross-attention, feed-forward.

    Each of the three sub-layers is followed by dropout, a residual add and
    a LayerNorm.

    Args:
        embed_dim: Feature size of the target and memory tensors.
        num_heads: Number of attention heads in both attention modules.
        ff_dim: Hidden size of the feed-forward sub-layer.
        dropout: Dropout probability used inside both attention modules and
            on every residual branch.
        batch_first: Layout expected by the attention modules. ``False``
            (the default, unchanged behaviour) means ``(seq, batch, embed)``;
            pass ``True`` for ``(batch, seq, embed)`` tensors such as the
            ones the encoder classes in this file work with.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1, batch_first=False):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(
            embed_dim, num_heads, dropout=dropout, batch_first=batch_first
        )
        self.cross_attn = nn.MultiheadAttention(
            embed_dim, num_heads, dropout=dropout, batch_first=batch_first
        )

        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim)
        )

        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.norm3 = nn.LayerNorm(embed_dim)

        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None,
                tgt_key_padding_mask=None, memory_key_padding_mask=None):
        """Run the decoder layer.

        Args:
            tgt: Decoder input, laid out according to ``batch_first``.
            memory: Encoder output attended to by the cross-attention.
            tgt_mask: Optional ``attn_mask`` for the self-attention
                (e.g. a causal mask).
            memory_mask: Optional ``attn_mask`` for the cross-attention.
            tgt_key_padding_mask: Optional ``key_padding_mask`` for the
                self-attention.
            memory_key_padding_mask: Optional ``key_padding_mask`` for the
                cross-attention.

        Returns:
            Tensor with the same shape as ``tgt``.
        """
        # 1. Masked Self-Attention (causal). The attention weights are never
        #    used, so they are not requested.
        attended, _ = self.self_attn(
            tgt, tgt, tgt,
            attn_mask=tgt_mask,
            key_padding_mask=tgt_key_padding_mask,
            need_weights=False,
        )
        tgt = self.norm1(tgt + self.dropout1(attended))

        # 2. Cross-Attention: queries from the decoder, keys/values from memory
        attended, _ = self.cross_attn(
            tgt, memory, memory,
            attn_mask=memory_mask,
            key_padding_mask=memory_key_padding_mask,
            need_weights=False,
        )
        tgt = self.norm2(tgt + self.dropout2(attended))

        # 3. Feed Forward
        transformed = self.ffn(tgt)
        tgt = self.norm3(tgt + self.dropout3(transformed))

        return tgt
