In [1]:
import numpy as np

def softmax(x, axis=-1):
    """Compute a numerically stable softmax along ``axis``.

    Args:
        x: numpy array (or array-like) of scores.
        axis: axis to normalize over. Defaults to the last axis, which is
            row-wise for a 2-D input.

    Returns:
        numpy array with the same shape as ``x``. For finite inputs the
        entries along ``axis`` are non-negative and sum to 1.
    """
    # Subtracting the per-slice maximum leaves the result mathematically
    # unchanged but keeps np.exp from overflowing on large scores.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exp_x = np.exp(shifted)
    return exp_x / np.sum(exp_x, axis=axis, keepdims=True)

def scaled_dot_product_attention(Q, K, V):
    """
    Computes scaled dot-product attention: softmax(Q Kᵀ / sqrt(d_k)) · V.

    The last two axes of each input are treated as (sequence, feature).
    Any leading axes (e.g. batch or head) are broadcast by ``np.matmul``,
    so both plain 2-D inputs and batched inputs are supported.

    Args:
        Q : numpy array of shape (..., len_q, d_k)
        K : numpy array of shape (..., len_k, d_k)
        V : numpy array of shape (..., len_k, d_v)

    Returns:
        attention_weights: (..., len_q, len_k), normalized over the last axis
        context_vector: (..., len_q, d_v)

    Raises:
        ValueError: raised by numpy when the trailing dimensions of the
            inputs are incompatible for the two matrix products.
    """
    d_k = Q.shape[-1]

    # Step 1: Compute raw attention scores (QKᵀ).
    # Only the last two axes of K are swapped; K.T would reverse every axis
    # and give wrong shapes for inputs with leading batch/head axes.
    scores = np.matmul(Q, np.swapaxes(K, -1, -2))

    # Step 2: Scale by sqrt(d_k) so score magnitude does not grow with d_k
    scaled_scores = scores / np.sqrt(d_k)

    # Step 3: Softmax over keys dimension (the last axis)
    attention_weights = softmax(scaled_scores)

    # Step 4: Compute context = Attention · V
    context = np.matmul(attention_weights, V)

    return attention_weights, context


# ----------- Example Test -------------
if __name__ == "__main__":
    # Seeded generator so Restart & Run All reproduces the same numbers.
    rng = np.random.default_rng(42)
    seq_len, d_k = 4, 8

    Q = rng.random((seq_len, d_k))
    K = rng.random((seq_len, d_k))
    V = rng.random((seq_len, d_k))

    attn_w, ctx = scaled_dot_product_attention(Q, K, V)

    # Check the invariants explicitly instead of relying on eyeballing the
    # printout: expected shapes, and every attention row sums to 1.
    assert attn_w.shape == (seq_len, seq_len)
    assert ctx.shape == (seq_len, d_k)
    np.testing.assert_allclose(attn_w.sum(axis=-1), 1.0)

    print("Attention Weights:\n", attn_w)
    print("Context Vector:\n", ctx)


Attention Weights:
 [[0.26318647 0.25382501 0.23709717 0.24589135]
 [0.250044   0.27144639 0.22143687 0.25707274]
 [0.22109294 0.25979311 0.25117361 0.26794034]
 [0.20913057 0.26392171 0.23908665 0.28786108]]
Context Vector:
 [[0.28188361 0.61726093 0.25250816 0.45623056 0.67824005 0.5678025
  0.57651392 0.30752946]
 [0.27593331 0.60669449 0.25490398 0.46679666 0.68146233 0.57265
  0.58056001 0.30896919]
 [0.26857128 0.62096175 0.24827066 0.45544712 0.70528679 0.58933699
  0.59862346 0.29402693]
 [0.26334988 0.62219977 0.24621293 0.45936542 0.70997394 0.59105818
  0.59678245 0.29253171]]


## 1. Scaled Dot-Product Attention Output

### Attention Weights:

Each row shows how one query token distributes its attention across every key token in the sequence, including itself.  
Softmax ensures each row sums to 1.  
Higher values indicate stronger attention toward specific tokens.

### Context Vector:

Computed as the attention-weighted sum of the rows of the value matrix $V$.  
Represents the updated embedding for each token after attention is applied.  
Encodes information gathered from every token in the sequence, in proportion to its attention weight.

### Interpretation

Attention weights determine which tokens influence each other.  
Context vectors contain the resulting combined information, forming the final token representations after attention.


In [4]:
# Install CPU-only PyTorch into the running kernel's environment (%pip targets the kernel, not the shell).
# torchvision/torchaudio pins are the versions this notebook last resolved; torch is pinned to the matching release.
%pip install -q torch==2.9.1 torchvision==0.24.1 torchaudio==2.9.1 --index-url https://download.pytorch.org/whl/cpu


Looking in indexes: https://download.pytorch.org/whl/cpu
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cpu/torchvision-0.24.1%2Bcpu-cp311-cp311-win_amd64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cpu/torchaudio-2.9.1%2Bcpu-cp311-cp311-win_amd64.whl.metadata (7.0 kB)
Downloading https://download.pytorch.org/whl/cpu/torchvision-0.24.1%2Bcpu-cp311-cp311-win_amd64.whl (4.0 MB)
   ---------------------------------------- 0.0/4.0 MB ? eta -:--:--
   ---------------------------------------- 4.0/4.0 MB 22.0 MB/s eta 0:00:00
Downloading https://download.pytorch.org/whl/cpu/torchaudio-2.9.1%2Bcpu-cp311-cp311-win_amd64.whl (662 kB)
   ---------------------------------------- 0.0/662.4 kB ? eta -:--:--
   --------------------------------------- 662.4/662.4 kB 12.9 MB/s eta 0:00:00
Installing collected packages: torchvision, torchaudio

   ---------------------------------------- 0/2 [torchvision]
   --------------------------

In [5]:
import torch
import torch.nn as nn

class SimpleEncoderBlock(nn.Module):
    """Single Transformer encoder block (post-norm layout).

    Applies multi-head self-attention followed by a position-wise
    feed-forward network. Each sub-layer is wrapped in a residual
    connection and then layer-normalized, so the output has the same shape
    as the input.

    Args:
        embed_dim: size of each token embedding (last input dimension).
        num_heads: number of attention heads; nn.MultiheadAttention requires
            embed_dim to be divisible by num_heads.
        ff_dim: hidden width of the feed-forward network.
    """

    def __init__(self, embed_dim=64, num_heads=4, ff_dim=256):
        super().__init__()

        # Multi-head self-attention; batch_first=True means inputs are
        # laid out as (batch, seq_len, embed_dim).
        self.self_attn = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)

        # Feed-forward network: expand to ff_dim, then project back
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim)
        )

        # Layer normalization, one per sub-layer
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)

    def forward(self, x, key_padding_mask=None):
        """Run the block on a batch of token embeddings.

        Args:
            x: tensor of shape (batch, seq_len, embed_dim).
            key_padding_mask: optional bool tensor of shape (batch, seq_len);
                True marks padded positions that are ignored as attention
                keys. Defaults to None (attend to every position).

        Returns:
            Tensor with the same shape as ``x``.
        """
        # ---- Multi-head attention + Add & Norm ----
        # The attention weights are never used here, so skip computing them.
        attn_output, _ = self.self_attn(
            x, x, x,
            key_padding_mask=key_padding_mask,
            need_weights=False,
        )
        x = self.norm1(x + attn_output)

        # ---- Feed-forward + Add & Norm ----
        ff_output = self.ff(x)
        x = self.norm2(x + ff_output)
        return x


# ---------- Shape Verification ----------
if __name__ == "__main__":
    # Fix the seed so the weights and the sample input are reproducible.
    torch.manual_seed(0)

    batch_size = 32
    seq_len = 10
    embed_dim = 64

    encoder = SimpleEncoderBlock(embed_dim=embed_dim)
    sample_input = torch.randn(batch_size, seq_len, embed_dim)

    # Forward pass only; no autograd graph is needed for a shape check.
    with torch.no_grad():
        output = encoder(sample_input)

    # Fail loudly if the block ever stops preserving the input shape.
    assert output.shape == sample_input.shape, (
        f"expected {tuple(sample_input.shape)}, got {tuple(output.shape)}"
    )

    print("Input shape :", sample_input.shape)   # (32, 10, 64)
    print("Output shape:", output.shape)         # (32, 10, 64)


Input shape : torch.Size([32, 10, 64])
Output shape: torch.Size([32, 10, 64])


## 2. Transformer Encoder Block Output

### Input Shape:

`(32, 10, 64)` → 32 samples, 10 tokens each, embedding size 64.

### Output Shape:

`(32, 10, 64)` → Same shape because:

- Multi-head attention preserves embedding size  
- Feed-forward layer expands → compresses back to 64  
- Residual connections require matching dimensions  
- LayerNorm does not change shape  

### Interpretation

The block changes the representation, not the shape, which confirms it is dimensionally consistent and can be stacked; a shape check alone does not verify numerical correctness.
