<img src="https://theaiengineer.dev/tae_logo_gw_flatter.png" width=35% align=right>

# Building a Large Language Model from Scratch — A Step-by-Step Guide Using Python and PyTorch
## Chapter 8 — The Transformer Architecture
**© Dr. Yves J. Hilpisch**<br>AI-Powered by GPT-5.

## How to Use This Notebook

- Assemble encoder and decoder blocks from reusable attention and feed-forward components.
- Trace tensor shapes through the model to prevent broadcasting surprises.
- Benchmark a forward pass to sanity-check performance before training.

### Roadmap

You will wire positional encodings, stack attention blocks, and integrate residual connections to form a minimal transformer.

### Study Tips

Keep a diagram of the architecture nearby. Annotating where tensors enter and exit each sublayer makes debugging much easier.

In [None]:
# Ensure torch (Colab friendly)
# Try to import PyTorch; if that fails for any reason, install it through the
# IPython `pip` line magic and import it again.
try:
    import torch  # noqa
    print('torch:', torch.__version__)
except Exception:
    # Fallback path: pick a wheel index, install, then retry the import.
    import os
    # An exit status of 0 from `nvidia-smi` is taken as "a GPU is available".
    gpu = os.system('nvidia-smi > /dev/null 2>&1') == 0
    # cu121 wheel index when a GPU was detected, CPU wheel index otherwise.
    index = (
        'https://download.pytorch.org/whl/cu121'
        if gpu else 'https://download.pytorch.org/whl/cpu'
    )
    # get_ipython() is not imported here; it is expected to be provided by
    # the IPython/Jupyter session this cell runs in.
    get_ipython().run_line_magic('pip', f'install -q torch --index-url {index}')
    import torch
    print('torch:', torch.__version__)


In [None]:
import matplotlib.pyplot as plt
# Apply the 'seaborn-v0_8' style sheet to figures created after this point.
plt.style.use('seaborn-v0_8')
# IPython magic (notebook-only syntax): set the inline figure format to SVG.
%config InlineBackend.figure_format = 'svg'


In [None]:
# Positional encoding
def sinusoidal_positions(T: int, d_model: int, device=None) -> "torch.Tensor":
    """Return the fixed sinusoidal positional-encoding table.

    Columns ``2k`` and ``2k + 1`` share the divisor
    ``10000 ** (2k / d_model)``. The even column stores the sine and the odd
    column the cosine of ``position / divisor``.

    Args:
        T: Number of positions (rows of the table).
        d_model: Width of the encoding (columns of the table). Odd widths
            work; the last column is then a sine column.
        device: Optional torch device on which all tensors are created.

    Returns:
        A tensor of shape ``(T, d_model)``.
    """
    # Function-scope import so the helper also works when run on its own.
    import torch

    pos = torch.arange(T, device=device).float()[:, None]      # (T, 1)
    i = torch.arange(d_model, device=device).float()[None, :]  # (1, d_model)
    # i // 2 gives columns 2k and 2k + 1 the same exponent, so each sin/cos
    # pair uses one shared frequency.
    angle = pos / (10000 ** (2 * (i // 2) / d_model))          # (T, d_model)
    enc = torch.zeros(T, d_model, device=device)
    enc[:, 0::2] = torch.sin(angle[:, 0::2])  # even columns: sine
    enc[:, 1::2] = torch.cos(angle[:, 1::2])  # odd columns: cosine
    return enc


In [None]:
# Multi-head attention
import torch.nn as nn
import torch.nn.functional as F
class MultiHeadAttention(nn.Module):
    """Multi-head self-attention with one fused query/key/value projection.

    Args:
        d_model: Width of the input and output features; must be divisible
            by ``num_heads``.
        num_heads: Number of attention heads.
        dropout: Dropout probability applied to the attention output (the
            result of the per-head attention, before the output projection).

    Attributes:
        h: Number of heads.
        d: Per-head width, ``d_model // num_heads``.
        qkv: Bias-free linear layer producing queries, keys and values at once.
        out: Bias-free output projection.
        drop: Dropout module.
    """

    def __init__(self, d_model: int, num_heads: int, dropout: float = 0.0):
        super().__init__()
        assert d_model % num_heads == 0
        self.h = num_heads
        self.d = d_model // num_heads
        self.qkv = nn.Linear(d_model, 3 * d_model, bias=False)
        self.out = nn.Linear(d_model, d_model, bias=False)
        self.drop = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x`` of shape ``(B, T, d_model)``.

        ``mask`` is handed unchanged to
        ``F.scaled_dot_product_attention`` as ``attn_mask``. Under PyTorch's
        semantics a boolean mask marks with ``True`` the positions that may
        be attended to, while a floating-point mask is *added* to the
        attention scores.

        Returns a tensor with the same shape as ``x``.
        """
        batch, steps, width = x.shape

        # Fused projection, then one (B, h, T, d) tensor each for q, k and v.
        per_head = []
        for part in self.qkv(x).chunk(3, dim=-1):
            per_head.append(
                part.view(batch, steps, self.h, self.d).transpose(1, 2)
            )
        queries, keys, values = per_head

        context = F.scaled_dot_product_attention(
            queries, keys, values, attn_mask=mask
        )
        context = self.drop(context)

        # Merge the heads back into a single (B, T, d_model) tensor.
        merged = context.transpose(1, 2).contiguous().view(batch, steps, width)
        return self.out(merged)


In [None]:
# Check head shapes in isolation
B, T, D, H = 2, 5, 12, 3
x_chk = torch.randn(B, T, D)
mha_chk = MultiHeadAttention(D, H)

# Fused projection, then three equal slices along the last axis.
qkv = mha_chk.qkv(x_chk)
q, k, v = qkv.chunk(3, dim=-1)


def split(t):
    """View a (B, T, D) tensor as (B, H, T, D // H)."""
    return t.view(B, T, H, D // H).transpose(1, 2)


qh, kh, vh = split(q), split(k), split(v)

# Cell output: shape before the split, shape after it, and a check that the
# heads tile the model width exactly.
q.shape, qh.shape, (H * (D // H)) == D


In [None]:
# Residual + LayerNorm (pre-norm)
class Residual(nn.Module):
    """Pre-norm residual wrapper computing ``x + sublayer(LayerNorm(x), ...)``.

    Args:
        d_model: Size of the last dimension normalized by the LayerNorm.
    """

    def __init__(self, d_model):
        super().__init__()
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, sublayer, *args, **kwargs):
        """Normalize ``x``, run ``sublayer`` on it, and add the result to ``x``.

        Extra positional and keyword arguments are passed through to
        ``sublayer`` after the normalized input.
        """
        normed = self.norm(x)
        update = sublayer(normed, *args, **kwargs)
        return x + update


In [None]:
# Feed-forward
class FeedForward(nn.Module):
    """Feed-forward sub-layer: Linear -> GELU -> Dropout -> Linear -> Dropout.

    Args:
        d_model: Input and output width.
        d_ff: Hidden width between the two linear layers.
        dropout: Probability used by both dropout layers.
    """

    def __init__(self, d_model, d_ff, dropout=0.0):
        super().__init__()
        layers = [
            nn.Linear(d_model, d_ff),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return the result."""
        return self.net(x)


In [None]:
# Transformer block
class TransformerBlock(nn.Module):
    """One transformer block: self-attention, then feed-forward.

    Each sub-layer is applied through its own ``Residual`` wrapper.

    Args:
        d_model: Feature width handed to every sub-module.
        num_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward sub-layer.
        dropout: Dropout probability passed to both sub-layers.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.0):
        super().__init__()
        self.mha = MultiHeadAttention(d_model, num_heads, dropout)
        self.ffn = FeedForward(d_model, d_ff, dropout)
        self.res1 = Residual(d_model)
        self.res2 = Residual(d_model)

    def forward(self, x, mask=None):
        """Return the block output for ``x``; ``mask`` goes to the attention."""
        attended = self.res1(x, self.mha, mask)
        return self.res2(attended, self.ffn)


In [None]:
# Create toy input
# x is drawn from a standard normal with shape (B, T, D). Later cells pass T
# as the number of positions and D as the model width.
B, T, D = 2, 6, 16
x = torch.randn(B, T, D)
# Bare expression so the notebook displays the tensor.
x


In [None]:
# Add sinusoidal positions
# Build the (T, D) table here; it is added to x in the next cell.
pe = sinusoidal_positions(T, D)
# Bare expression so the notebook displays the shape.
pe.shape


In [None]:
# pe[None, :, :] adds a leading axis so the table broadcasts over the first
# axis of x. NOTE: x is rebound, so re-running this cell adds pe once more.
x = x + pe[None, :, :]
x.shape


In [None]:
# Causal mask
# Boolean lower-triangular mask of shape (1, T, T): True means the query (row)
# may attend to the key (column). It must be boolean, not float.
# F.scaled_dot_product_attention ADDS a floating-point attn_mask to the scores,
# so a float 0/1 mask would not block any future position.
mask = torch.tril(torch.ones(T, T, dtype=torch.bool))[None, :, :]
mask.shape


In [None]:
# Block and forward
# One block of width D with 4 heads, a 64-wide feed-forward layer and
# dropout probability 0.1.
block = TransformerBlock(D, num_heads=4, d_ff=64, dropout=0.1)
# Bare expression so the notebook prints the module tree.
block


In [None]:
# Run the block on the position-augmented input; mask is forwarded to the
# attention sub-layer.
y = block(x, mask)
# Bare expression so the notebook displays the output shape.
y.shape


In [None]:
# Visualize attention weights of a single head from the block
with torch.no_grad():
    B, T, Dm = x.shape
    # The block is pre-norm: its attention sub-layer receives
    # block.res1.norm(x), not x. Normalize first so the plotted weights are
    # the ones the block actually computes.
    x_norm = block.res1.norm(x)
    # re-compute q,k for visualization (values are not needed for the weights)
    qkv = block.mha.qkv(x_norm)
    q, k = qkv.chunk(3, dim=-1)[:2]
    H = block.mha.h
    Dh = block.mha.d

    def split(t):
        """View a (B, T, H * Dh) tensor as (B, H, T, Dh)."""
        return t.view(B, T, H, Dh).transpose(1, 2)

    qh, kh = split(q), split(k)
    scores = (qh @ kh.transpose(-2, -1)) / (Dh ** 0.5)  # [B,H,T,T]
    # `mask == 0` selects the blocked positions for 0/1 and boolean masks alike.
    scores = scores.masked_fill(mask == 0, float('-inf'))
    w = torch.softmax(scores, dim=-1)[0, 0]  # head 0 weights [T,T]
plt.figure(figsize=(4,3))
plt.imshow(w, cmap='magma', aspect='auto')
plt.colorbar(label='weight')
plt.xlabel('key\npositions')
plt.ylabel('query positions')
plt.title('Head 0 weights (toy)')
plt.tight_layout()


In [None]:
# Second block for a quick stability check
# Same hyperparameters as the first block, but a separate module instance.
block2 = TransformerBlock(D, num_heads=4, d_ff=64, dropout=0.1)
# Bare expression so the notebook prints the module tree.
block2


In [None]:
# Measure mean/std before and after each block
with torch.no_grad():
    def stats(t):
        """Return (mean, std) of a tensor as plain Python floats."""
        return float(t.mean()), float(t.std())

    # Chain the two blocks; the same mask is used for both.
    y1 = block(x, mask)
    y2 = block2(y1, mask)

    m0, s0 = stats(x)
    m1, s1 = stats(y1)
    m2, s2 = stats(y2)

    # One line per stage, values rounded to four decimals.
    report = (
        ('(mean,std) before:', m0, s0),
        ('after block 1   :', m1, s1),
        ('after block 2   :', m2, s2),
    )
    for label, mean_val, std_val in report:
        print(label, (round(mean_val, 4), round(std_val, 4)))


## Exercises

- Swap the sinusoidal positional encoding for a learned variant and observe the impact.
- Instrument the model with hooks to capture intermediate activations for later analysis.
- Profile the forward pass with different sequence lengths to understand scaling behavior.

<img src="https://theaiengineer.dev/tae_logo_gw_flatter.png" width=35% align=right>