In [1]:
import numpy as np

def scaled_dot_product_attention(Q, K, V):
    """
    Computes scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.

    The last two axes of every input are treated as (sequence, feature).
    Any leading axes are batch-like and are handled by ``np.matmul``
    broadcasting, so unbatched 2-D inputs and multi-head 4-D inputs work
    the same way as the usual 3-D (batch, seq_len, d) case.

    Args:
        Q: Query array of shape (..., seq_q, d_k)
        K: Key array   of shape (..., seq_k, d_k)
        V: Value array of shape (..., seq_k, d_v)

    Returns:
        attention_weights: (..., seq_q, seq_k); each row sums to 1
        context: (..., seq_q, d_v)

    Raises:
        ValueError: if any input has fewer than 2 dimensions.
    """
    Q, K, V = np.asarray(Q), np.asarray(K), np.asarray(V)
    if min(Q.ndim, K.ndim, V.ndim) < 2:
        raise ValueError(
            "Q, K and V must each have at least 2 dimensions (seq_len, d)"
        )

    d_k = Q.shape[-1]

    # Step 1: Compute scores = QK^T / sqrt(d_k).
    # Swapping the last two axes (instead of a fixed 3-D permutation) keeps
    # this correct for any number of leading batch/head axes.
    scores = np.matmul(Q, np.swapaxes(K, -1, -2)) / np.sqrt(d_k)

    # Step 2: Softmax along last dimension.
    # Subtracting the row max first avoids overflow in exp() without
    # changing the result.
    exp_scores = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
    attention_weights = exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)

    # Step 3: Compute context = attention_weights * V
    context = np.matmul(attention_weights, V)

    return attention_weights, context


# Example test
# Use a seeded generator so the example inputs are identical on every
# Restart & Run All.
rng = np.random.default_rng(0)
Q = rng.random((1, 5, 64))
K = rng.random((1, 5, 64))
V = rng.random((1, 5, 64))

attn_wt, context = scaled_dot_product_attention(Q, K, V)

# Sanity check: each row of the softmax output is a probability distribution.
np.testing.assert_allclose(attn_wt.sum(axis=-1), 1.0)

print("Attention weights shape:", attn_wt.shape)
print("Context shape:", context.shape)



Attention weights shape: (1, 5, 5)
Context shape: (1, 5, 64)


In [2]:
import torch
import torch.nn as nn

class SimpleTransformerEncoder(nn.Module):
    """A single post-norm Transformer encoder layer.

    Applies multi-head self-attention followed by a position-wise
    feed-forward network. Each sub-layer is wrapped in a residual
    connection and followed by LayerNorm (norm applied after the add).

    Args:
        d_model: Feature size of the input and output embeddings.
        num_heads: Number of attention heads passed to
            ``nn.MultiheadAttention``.
        dim_ff: Hidden width of the feed-forward network.
    """

    def __init__(self, d_model=128, num_heads=8, dim_ff=512):
        super().__init__()

        # Multi-head self-attention; batch_first=True means inputs are
        # laid out as (batch, seq_len, d_model).
        self.self_attn = nn.MultiheadAttention(
            embed_dim=d_model,
            num_heads=num_heads,
            batch_first=True
        )

        # Feed-forward network: d_model -> dim_ff -> d_model
        self.ff = nn.Sequential(
            nn.Linear(d_model, dim_ff),
            nn.ReLU(),
            nn.Linear(dim_ff, d_model)
        )

        # Layer norms, one per sub-layer
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x, attn_mask=None, key_padding_mask=None):
        """Run the encoder layer.

        Args:
            x: Input tensor of shape (batch, seq_len, d_model).
            attn_mask: Optional mask forwarded unchanged to
                ``nn.MultiheadAttention`` (e.g. a causal mask).
            key_padding_mask: Optional (batch, seq_len) mask forwarded
                unchanged to ``nn.MultiheadAttention``; for a boolean
                mask, ``True`` marks key positions to ignore.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # --- Self-attention ---
        # The attention weights are never used here, so skip computing them.
        attn_output, _ = self.self_attn(
            x, x, x,
            attn_mask=attn_mask,
            key_padding_mask=key_padding_mask,
            need_weights=False,
        )
        x = self.norm1(x + attn_output)

        # --- Feed-forward ---
        ff_output = self.ff(x)
        x = self.norm2(x + ff_output)

        return x


# ----- Test output shape -----
if __name__ == "__main__":
    batch_size = 32
    seq_len = 10
    d_model = 128

    # Build the model from the same d_model that sizes the input, so
    # changing the constant above cannot desynchronise the two.
    model = SimpleTransformerEncoder(d_model=d_model, num_heads=8)

    x = torch.randn(batch_size, seq_len, d_model)
    out = model(x)

    # The encoder layer must preserve the input shape.
    assert out.shape == x.shape, f"expected {tuple(x.shape)}, got {tuple(out.shape)}"

    print("Input shape :", x.shape)
    print("Output shape:", out.shape)



Input shape : torch.Size([32, 10, 128])
Output shape: torch.Size([32, 10, 128])
