# Phase 3, Lesson 1: Transformer Architecture

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/suraaj3poudel/Learn-To-Make-GPT-Model/blob/main/phase3_mini_transformer/01_transformer_architecture.ipynb)

Understanding "Attention is All You Need" 📄

## What You'll Learn

1. Complete Transformer architecture
2. Multi-head attention in detail
3. Position encodings
4. Feed-forward networks
5. Layer normalization

The architecture that changed everything!

In [None]:
# Setupimport numpy as npimport matplotlib.pyplot as pltimport mathprint('✅ Ready to build Transformers!')

## 1. Transformer Overview

The Transformer architecture has two main parts:

1. **Encoder**: Processes the input sequence
2. **Decoder**: Generates the output sequence

For GPT, we only use the **Decoder** (autoregressive generation).

**Key Components**:
- Multi-head self-attention
- Position encodings
- Feed-forward networks
- Layer normalization
- Residual connections

Let's build each part!

## 2. Positional Encoding

**Problem**: Attention has no sense of word order!
- "I love you" vs "You love I" → Same attention patterns!

**Solution**: Add positional information to embeddings

**Formula** (sinusoidal):
```
PE(pos, 2i)   = sin(pos / 10000^(2i/d))
PE(pos, 2i+1) = cos(pos / 10000^(2i/d))
```

Where `pos` = position in the sequence, `i` = index of the sin/cos dimension pair (so `2i` and `2i+1` are the even and odd columns), and `d` = model dimension

In [None]:
def positional_encoding(max_len, d_model):
    """
    Create the sinusoidal positional encoding matrix.

    Implements

        PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
        PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))

    Args:
        max_len: Maximum sequence length
        d_model: Model dimension (odd values are supported; the last
            column then only receives the sine term)

    Returns:
        pe: (max_len, d_model) positional encodings
    """
    pe = np.zeros((max_len, d_model))

    positions = np.arange(max_len)[:, np.newaxis]  # (max_len, 1)
    even_dims = np.arange(0, d_model, 2)           # column indices 0, 2, 4, ...

    # `even_dims` already holds the value "2i" from the formula, so the
    # exponent is even_dims / d_model (NOT 2 * even_dims / d_model).
    angles = positions / np.power(10000.0, even_dims / d_model)

    # Sin for even indices, cos for odd indices
    pe[:, 0::2] = np.sin(angles)
    # For odd d_model there is one fewer odd column than even columns.
    pe[:, 1::2] = np.cos(angles[:, : d_model // 2])

    return pe


# Demo. Guarded so that importing this code as a module does not require
# matplotlib or open a plot window; inside a notebook kernel __name__ is
# "__main__", so the cell still runs as before.
if __name__ == "__main__":
    # Create positional encodings
    max_len = 50
    d_model = 64
    pe = positional_encoding(max_len, d_model)

    print(f"Positional encoding shape: {pe.shape}")

    # Visualize
    plt.figure(figsize=(12, 6))
    plt.imshow(pe, cmap='RdBu', aspect='auto')
    plt.colorbar(label='Encoding value')
    plt.xlabel('Dimension')
    plt.ylabel('Position')
    plt.title('Positional Encoding Visualization')
    plt.tight_layout()
    plt.show()

    print("Each position gets a unique pattern!")
    print("Similar positions have similar encodings")

## 3. Multi-Head Attention

Instead of one attention mechanism, use multiple "heads"!

Each head:
- Has its own Q, K, V projections
- Attends to different aspects of the input
- Produces a partial output

Then concatenate all heads and project.

In [None]:
class MultiHeadAttention:
    """
    Multi-head self-attention (forward pass only, NumPy).

    The input is projected to queries, keys and values, split into
    `num_heads` slices of width `d_k = d_model // num_heads`, attended
    independently per head, then concatenated and projected with `W_o`.
    """

    def __init__(self, d_model, num_heads):
        """
        Args:
            d_model: Model dimension
            num_heads: Number of attention heads (must divide d_model)
        """
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads  # Dimension per head

        # Weight matrices for Q, K, V projections
        self.W_q = np.random.randn(d_model, d_model) * 0.01
        self.W_k = np.random.randn(d_model, d_model) * 0.01
        self.W_v = np.random.randn(d_model, d_model) * 0.01
        self.W_o = np.random.randn(d_model, d_model) * 0.01  # Output projection

    def split_heads(self, x):
        """
        Split the feature axis into multiple heads.

        Args:
            x: (..., seq_len, d_model)

        Returns:
            (..., num_heads, seq_len, d_k); head h holds feature columns
            h * d_k : (h + 1) * d_k of the input.
        """
        *lead, seq_len, _ = x.shape
        # Reshape: (..., seq_len, d_model) -> (..., seq_len, num_heads, d_k)
        x = x.reshape(*lead, seq_len, self.num_heads, self.d_k)
        # Move the head axis in front of the sequence axis
        return np.swapaxes(x, -3, -2)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """
        Compute softmax(Q K^T / sqrt(d_k)) V over the last two axes.

        Args:
            Q, K, V: (..., seq_len, d_k); leading axes broadcast
            mask: Optional array broadcastable to (..., seq_len, seq_len);
                nonzero entries mark positions that must NOT be attended to

        Returns:
            output: (..., seq_len, d_k)
            attention_weights: (..., seq_len, seq_len), rows sum to 1
        """
        d_k = Q.shape[-1]

        # Attention scores. np.swapaxes is used because ndarray.transpose
        # needs a full axis permutation and cannot swap just two axes.
        scores = np.matmul(Q, np.swapaxes(K, -2, -1)) / math.sqrt(d_k)

        # Apply mask if provided (for decoder)
        if mask is not None:
            scores = scores + (mask * -1e9)

        # Softmax (shifted by the row max for numerical stability)
        exp_scores = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
        attention_weights = exp_scores / exp_scores.sum(axis=-1, keepdims=True)

        # Apply to values
        output = np.matmul(attention_weights, V)

        return output, attention_weights

    def forward(self, x, mask=None):
        """
        Multi-head attention forward pass

        Args:
            x: (seq_len, d_model) input; leading batch axes are also accepted
            mask: Optional mask for decoder, broadcastable to
                (seq_len, seq_len); nonzero = blocked

        Returns:
            output: same shape as x
        """
        # Linear projections, then split into heads:
        # (..., num_heads, seq_len, d_k)
        Q = self.split_heads(np.dot(x, self.W_q))
        K = self.split_heads(np.dot(x, self.W_k))
        V = self.split_heads(np.dot(x, self.W_v))

        # All heads are attended at once; matmul broadcasts over the head axis
        head_outputs, _ = self.scaled_dot_product_attention(Q, K, V, mask)

        # Concatenate heads: (..., num_heads, seq_len, d_k) -> (..., seq_len, d_model)
        concat = np.swapaxes(head_outputs, -3, -2).reshape(*x.shape[:-1], self.d_model)

        # Final linear projection
        output = np.dot(concat, self.W_o)

        return output


# Test multi-head attention
d_model = 64
num_heads = 8
seq_len = 10

mha = MultiHeadAttention(d_model, num_heads)

# Random input
x = np.random.randn(seq_len, d_model)

# Forward pass
output = mha.forward(x)

print(f"Input shape: {x.shape}")
print(f"Output shape: {output.shape}")
print(f"\nNumber of heads: {num_heads}")
print(f"Dimension per head: {d_model // num_heads}")
print("\n✅ Multi-head attention working!")

## 4. Feed-Forward Network

After attention, each position goes through a feed-forward network:

```
FFN(x) = ReLU(xW1 + b1)W2 + b2
```

The same network is applied to each position independently!

In [None]:
class FeedForward:
    """Position-wise feed-forward network: ReLU(x W1 + b1) W2 + b2."""

    def __init__(self, d_model, d_ff):
        """
        Build the two affine layers.

        Args:
            d_model: Model dimension (input and output width)
            d_ff: Hidden dimension (usually 4 * d_model)
        """
        init_scale = 0.01

        # Expansion layer: d_model -> d_ff
        self.W1 = init_scale * np.random.randn(d_model, d_ff)
        self.b1 = np.zeros(d_ff)

        # Contraction layer: d_ff -> d_model
        self.W2 = init_scale * np.random.randn(d_ff, d_model)
        self.b2 = np.zeros(d_model)

    def forward(self, x):
        """
        Apply the network along the last axis of the input.

        Args:
            x: (seq_len, d_model)

        Returns:
            output: (seq_len, d_model)
        """
        pre_activation = np.dot(x, self.W1) + self.b1
        activated = np.maximum(0, pre_activation)  # ReLU
        return np.dot(activated, self.W2) + self.b2


# Test
d_model = 64
d_ff = 256  # Usually 4x d_model

ffn = FeedForward(d_model, d_ff)
x = np.random.randn(10, d_model)
output = ffn.forward(x)

print(f"Input shape: {x.shape}")
print(f"Hidden dim: {d_ff}")
print(f"Output shape: {output.shape}")

## 5. Layer Normalization

**Layer normalization** = Normalize across features for each example

Helps training stability and speed!

**Formula**:
```
LayerNorm(x) = γ * (x - μ) / σ + β
```

Where μ, σ are the mean and std across features, and γ, β are a learnable scale and shift

In [None]:
class LayerNorm:
    """Normalize each feature vector to zero mean and unit std, then scale and shift."""

    def __init__(self, d_model, eps=1e-6):
        """
        Args:
            d_model: Size of the feature (last) axis
            eps: Small constant added to the std to avoid division by zero
        """
        self.eps = eps
        self.gamma = np.ones(d_model)   # scale
        self.beta = np.zeros(d_model)   # shift

    def forward(self, x):
        """
        Args:
            x: (seq_len, d_model)

        Returns:
            Array of the same shape, normalized over the last axis.
        """
        feature_axis = -1

        # Per-row statistics; keepdims lets them broadcast back over x
        mu = x.mean(axis=feature_axis, keepdims=True)
        sigma = x.std(axis=feature_axis, keepdims=True)

        standardized = (x - mu) / (sigma + self.eps)
        return self.gamma * standardized + self.beta


# Test
ln = LayerNorm(d_model=64)
x = np.random.randn(10, 64)
output = ln.forward(x)

print(f"Input mean: {x.mean():.4f}, std: {x.std():.4f}")
print(f"Output mean: {output.mean():.4f}, std: {output.std():.4f}")
print("\nLayer norm centers and scales the features!")

## 6. Transformer Block

Now combine everything into one Transformer block:

```
1. Multi-head attention + residual + layer norm
2. Feed-forward + residual + layer norm
```

This is the core building block!

In [None]:
class TransformerBlock:
    """
    One Transformer layer: self-attention and a feed-forward network,
    each wrapped in a residual connection followed by layer normalization.
    """

    def __init__(self, d_model, num_heads, d_ff):
        """
        Args:
            d_model: Model dimension
            num_heads: Number of attention heads
            d_ff: Hidden width of the feed-forward sub-layer
        """
        # Sub-layers (attention is created first, then the FFN)
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.ffn = FeedForward(d_model, d_ff)

        # One normalization per sub-layer
        self.ln1 = LayerNorm(d_model)
        self.ln2 = LayerNorm(d_model)

    def forward(self, x, mask=None):
        """
        Run the block on a sequence of feature vectors.

        Args:
            x: (seq_len, d_model)
            mask: Optional attention mask, forwarded to the attention layer

        Returns:
            output: (seq_len, d_model)
        """
        # Sub-layer 1: attention, added back onto its input, then normalized
        after_attention = self.ln1.forward(x + self.attention.forward(x, mask))

        # Sub-layer 2: feed-forward, added back onto its input, then normalized
        return self.ln2.forward(after_attention + self.ffn.forward(after_attention))


# Test transformer block
block = TransformerBlock(d_model=64, num_heads=8, d_ff=256)
x = np.random.randn(10, 64)
output = block.forward(x)

print(f"Input shape: {x.shape}")
print(f"Output shape: {output.shape}")
print("\n✅ Full Transformer block working!")

## 7. Complete Transformer Model

Stack multiple transformer blocks!

In [None]:
class Transformer:
    """
    Stack of Transformer blocks with token embeddings, sinusoidal
    positional encodings, a final layer norm and a vocabulary projection.
    """

    def __init__(self, vocab_size, d_model, num_heads, d_ff, num_layers, max_len):
        """
        Args:
            vocab_size: Number of distinct tokens
            d_model: Model dimension
            num_heads: Attention heads per block
            d_ff: Hidden width of each feed-forward sub-layer
            num_layers: Number of stacked blocks
            max_len: Longest sequence the positional table covers
        """
        self.embedding = np.random.randn(vocab_size, d_model) * 0.01
        self.pos_encoding = positional_encoding(max_len, d_model)

        # Stack of transformer blocks
        self.blocks = [
            TransformerBlock(d_model, num_heads, d_ff)
            for _ in range(num_layers)
        ]

        self.ln_final = LayerNorm(d_model)
        self.output_proj = np.random.randn(d_model, vocab_size) * 0.01

    def forward(self, token_ids, mask=None):
        """
        Args:
            token_ids: (seq_len,) token indices
            mask: Optional attention mask handed to every block. For
                causal (GPT-style) attention pass
                np.triu(np.ones((seq_len, seq_len)), k=1). The default
                (None) lets every position attend to every other one.

        Returns:
            logits: (seq_len, vocab_size) predictions

        Raises:
            ValueError: if the sequence is longer than the positional
                encoding table built in __init__.
        """
        seq_len = len(token_ids)

        # Fail with a clear message instead of an opaque broadcasting error
        # when the sequence does not fit the positional table.
        available = self.pos_encoding.shape[0]
        if seq_len > available:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {available}"
            )

        # 1. Embedding
        x = self.embedding[token_ids]

        # 2. Add positional encoding
        x = x + self.pos_encoding[:seq_len]

        # 3. Pass through transformer blocks
        for block in self.blocks:
            x = block.forward(x, mask)

        # 4. Final layer norm
        x = self.ln_final.forward(x)

        # 5. Project to vocabulary
        logits = np.dot(x, self.output_proj)

        return logits


# Create model
model = Transformer(
    vocab_size=1000,
    d_model=64,
    num_heads=8,
    d_ff=256,
    num_layers=4,
    max_len=100
)

# Test
token_ids = np.array([1, 42, 17, 99, 5])
logits = model.forward(token_ids)

print(f"Input tokens: {token_ids}")
print(f"Output logits shape: {logits.shape}")
# NOTE: logits are unnormalized scores; a softmax over the last axis is
# needed to turn them into an actual probability distribution.
print(f"\nFor each position, we get a probability distribution over {logits.shape[1]} words!")
print("\n✅ Complete Transformer implemented!")

## Summary

### What We Built:

1. **Positional encodings** - Give the model a sense of position
2. **Multi-head attention** - Multiple attention mechanisms
3. **Feed-forward networks** - Process each position
4. **Layer normalization** - Stabilize training
5. **Transformer blocks** - Combine everything
6. **Complete Transformer** - Stack multiple blocks

### Key Insights:

- Transformers are built from simple components
- Residual connections + layer norm = stable training
- Multi-head attention = flexible modeling
- No recurrence needed!

### Next Steps:

👉 **Lesson 2**: Train this transformer on real text data!

You now understand Transformer architecture! 🎉