In [1]:
import numpy as np

def scaled_dot_product_attention(Q, K, V):
    """
    Computes scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.

    Inputs may be plain 2-D matrices or carry leading batch dimensions
    (e.g. [batch, heads, ...]); the leading dimensions of Q, K and V must
    match or be broadcastable. Array-likes such as nested lists are accepted.

    Args:
        Q: Query array  (shape: [..., num_queries, d_k])
        K: Key array    (shape: [..., num_keys, d_k])
        V: Value array  (shape: [..., num_keys, d_v])

    Returns:
        attention_weights: Softmax-normalized attention scores
            (shape: [..., num_queries, num_keys]); normalized over the keys axis.
        context_vector: Weighted sum of V using attention weights
            (shape: [..., num_queries, d_v])
    """

    # Accept array-likes (lists, tuples) as well as ndarrays.
    Q = np.asarray(Q)
    K = np.asarray(K)
    V = np.asarray(V)

    # Step 1: Compute QK^T
    # Only the last two axes of K are swapped so that leading batch
    # dimensions stay in place (K.T would reverse *every* axis). A 1-D K has
    # no second axis to swap and is passed through unchanged.
    K_transposed = np.swapaxes(K, -1, -2) if K.ndim >= 2 else K
    # matmul broadcasts over leading dimensions, unlike np.dot.
    scores = np.matmul(Q, K_transposed)

    # Step 2: Scale by sqrt(d_k)
    d_k = K.shape[-1]
    scaled_scores = scores / np.sqrt(d_k)

    # Step 3: Apply softmax over the keys axis
    # Subtracting the row maximum first keeps np.exp from overflowing and
    # does not change the softmax result.
    row_max = np.max(scaled_scores, axis=-1, keepdims=True)
    exp_scores = np.exp(scaled_scores - row_max)
    attention_weights = exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)

    # Step 4: Multiply by V to get context vector
    context_vector = np.matmul(attention_weights, V)

    return attention_weights, context_vector


# Example test
# One query that matches the first of two orthonormal keys, so the first
# value row should receive the larger attention weight.
Q = np.array([[1, 0]])
K = np.array([[1, 0], [0, 1]])
V = np.array([[1, 2], [3, 4]])

attn, context = scaled_dot_product_attention(Q, K, V)

# Inline checks so a regression fails loudly on Restart & Run All instead of
# silently printing different numbers.
assert attn.shape == (Q.shape[0], K.shape[0])
assert context.shape == (Q.shape[0], V.shape[1])
np.testing.assert_allclose(attn.sum(axis=-1), 1.0)
np.testing.assert_allclose(attn, [[0.66976155, 0.33023845]], rtol=1e-6)
np.testing.assert_allclose(context, [[1.6604769, 2.6604769]], rtol=1e-6)

print("Attention Weights:\n", attn)
print("Context Vector:\n", context)


Attention Weights:
 [[0.66976155 0.33023845]]
Context Vector:
 [[1.6604769 2.6604769]]
