<a href="https://colab.research.google.com/github/vasudeva4040/Homework5/blob/main/Homework05.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Q1: Scaled dot-product attention (NumPy)**

In [1]:
import numpy as np

def softmax(x, axis=-1):
    """
    Softmax of ``x`` along ``axis``.

    The maximum along ``axis`` is subtracted before exponentiating so the
    largest exponent is exp(0) = 1; this leaves the result mathematically
    unchanged while avoiding overflow for large inputs.

    Args:
        x: array-like of scores.
        axis: axis along which the outputs sum to 1 (default: last axis).

    Returns:
        Array with the same shape as ``x`` whose entries along ``axis``
        are non-negative and sum to 1.
    """
    shifted = x - np.max(x, axis=axis, keepdims=True)
    numerators = np.exp(shifted)
    normalizer = numerators.sum(axis=axis, keepdims=True)
    return numerators / normalizer


def scaled_dot_product_attention(Q, K, V):
    """
    Scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.

    Args:
        Q: queries, shape (batch_size, seq_len_q, d_k)
        K: keys,    shape (batch_size, seq_len_k, d_k)
        V: values,  shape (batch_size, seq_len_k, d_v)

    Returns:
        A tuple ``(attention_weights, context)`` where
        attention_weights has shape (batch_size, seq_len_q, seq_len_k) and
        context has shape (batch_size, seq_len_q, d_v).
    """
    # The scaling factor is the square root of the query/key feature size.
    scale = np.sqrt(Q.shape[-1])

    # Query-key similarity for every (query, key) pair: (batch, seq_q, seq_k).
    keys_transposed = np.swapaxes(K, -2, -1)
    logits = (Q @ keys_transposed) / scale

    # Normalise over the key axis so each query's weights sum to 1.
    attention_weights = softmax(logits, axis=-1)

    # Each context row is the attention-weighted average of the value rows.
    context = attention_weights @ V

    return attention_weights, context


# Example usage: run attention on random inputs and sanity-check the result.
if __name__ == "__main__":
    batch_size = 2
    seq_len = 4
    d_k = d_v = 8

    # Seeded generator so a fresh kernel run reproduces the same inputs.
    rng = np.random.default_rng(0)
    Q = rng.standard_normal((batch_size, seq_len, d_k))
    K = rng.standard_normal((batch_size, seq_len, d_k))
    V = rng.standard_normal((batch_size, seq_len, d_v))

    attn_w, ctx = scaled_dot_product_attention(Q, K, V)

    # Check the invariants instead of relying on reading the printed shapes.
    assert attn_w.shape == (batch_size, seq_len, seq_len)
    assert ctx.shape == (batch_size, seq_len, d_v)
    assert np.allclose(attn_w.sum(axis=-1), 1.0), "attention rows must sum to 1"

    print("Attention weights shape:", attn_w.shape)  # (2, 4, 4)
    print("Context shape:", ctx.shape)              # (2, 4, 8)


Attention weights shape: (2, 4, 4)
Context shape: (2, 4, 8)


**Q2: Transformer encoder block (PyTorch)**

In [2]:
import torch
import torch.nn as nn


class TransformerEncoderBlock(nn.Module):
    """
    Single Transformer encoder block.

    Two sub-layers are applied in sequence: multi-head self-attention and a
    position-wise feed-forward network. Each sub-layer output goes through
    dropout, is added back to its input (residual connection), and the sum is
    layer-normalised, i.e. ``LayerNorm(x + Dropout(sublayer(x)))``.

    Args:
        d_model: feature size of every token vector (input and output).
        n_heads: number of attention heads.
        d_ff: hidden size of the feed-forward network.
        dropout: dropout probability used inside attention, inside the
            feed-forward network, and on both residual branches.
    """

    def __init__(self, d_model=512, n_heads=8, d_ff=2048, dropout=0.1):
        super().__init__()

        # Multi-head self-attention (batch_first=True expects (batch, seq, d_model))
        self.self_attn = nn.MultiheadAttention(
            embed_dim=d_model,
            num_heads=n_heads,
            dropout=dropout,
            batch_first=True
        )

        # Position-wise feed-forward network: d_model -> d_ff -> d_model
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)

        # Layer normalization and dropout
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout_attn = nn.Dropout(dropout)
        self.dropout_ff = nn.Dropout(dropout)
        self.activation = nn.ReLU()

    def forward(self, x, attn_mask=None, key_padding_mask=None):
        """
        Run the block on a batch of token sequences.

        Args:
            x: input tensor of shape (batch_size, seq_len, d_model)
            attn_mask: optional attention mask (seq_len, seq_len) or broadcastable
            key_padding_mask: optional padding mask (batch_size, seq_len)

        Returns:
            Tensor with same shape as x: (batch_size, seq_len, d_model)
        """

        # ---- Multi-head self-attention sub-layer ----
        # Self-attention: Q = K = V = x.
        # The attention weights are discarded below, so ask the module not to
        # produce them: need_weights=False skips computing/averaging the
        # per-head weight matrices and allows the optimized attention path.
        attn_output, _ = self.self_attn(
            x, x, x,
            attn_mask=attn_mask,
            key_padding_mask=key_padding_mask,
            need_weights=False
        )

        # Residual connection + LayerNorm
        x = x + self.dropout_attn(attn_output)
        x = self.norm1(x)

        # ---- Feed-forward sub-layer ----
        # linear1 -> ReLU -> dropout -> linear2
        hidden = self.dropout_ff(self.activation(self.linear1(x)))
        ff_output = self.linear2(hidden)

        # Residual connection + LayerNorm
        x = x + self.dropout_ff(ff_output)
        x = self.norm2(x)

        return x


if __name__ == "__main__":
    # (c) Verify output shape for batch of 32 sentences, each with 10 tokens
    batch_size = 32
    seq_len = 10
    d_model = 512

    # Seed so weight init, the random input and dropout are reproducible on a
    # fresh kernel run.
    torch.manual_seed(0)

    encoder_block = TransformerEncoderBlock(d_model=d_model, n_heads=8, d_ff=2048)
    dummy_input = torch.randn(batch_size, seq_len, d_model)  # (32, 10, 512)

    output = encoder_block(dummy_input)

    # Enforce the expected shape rather than only printing it.
    assert output.shape == dummy_input.shape, (
        f"expected {tuple(dummy_input.shape)}, got {tuple(output.shape)}"
    )
    print("Output shape:", output.shape)  # Expected: torch.Size([32, 10, 512])


Output shape: torch.Size([32, 10, 512])
