In [2]:
import numpy as np

def scaled_dot_product_attention(Q, K, V):
    """
    Compute scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.

    The last two axes of each input are treated as (sequence, feature).
    Any leading axes are batch axes and follow np.matmul broadcasting, so
    plain 2-D matrices give the same result as a single-matrix computation
    while batched / multi-head arrays are handled as well. Inputs must be
    at least 2-D; array-likes (e.g. nested lists) are converted.

    Args:
        Q: Query array (..., seq_len_q, d_k)
        K: Key array (..., seq_len_k, d_k)
        V: Value array (..., seq_len_k, d_v) -- one row per key

    Returns:
        attention_weights: softmax of the scaled scores over the key axis,
            shape (..., seq_len_q, seq_len_k); every row sums to 1
        context_vector: attention-weighted sum of the rows of V,
            shape (..., seq_len_q, d_v)

    Raises:
        ValueError: from np.matmul when the feature dimensions of Q and K,
            or the key dimensions of K and V, do not line up.
    """
    Q = np.asarray(Q)
    K = np.asarray(K)
    V = np.asarray(V)

    # The feature size lives on the last axis regardless of batch axes.
    d_k = Q.shape[-1]

    # Raw attention scores. Only the last two axes of K are transposed so
    # that leading batch axes stay aligned with those of Q.
    scores = np.matmul(Q, np.swapaxes(K, -1, -2)) / np.sqrt(d_k)

    # Softmax over the key axis. Subtracting the row maximum first keeps
    # np.exp from overflowing and does not change the normalised result.
    attention_weights = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
    attention_weights /= np.sum(attention_weights, axis=-1, keepdims=True)

    # Weighted sum of values
    context_vector = np.matmul(attention_weights, V)

    return attention_weights, context_vector

# Demo: two queries attend over three key/value pairs
# (d_k = 4 features per query/key, d_v = 5 features per value).
# The arrays are drawn in Q, K, V order from NumPy's global random state.
Q, K, V = (np.random.rand(*shape) for shape in ((2, 4), (3, 4), (3, 5)))
weights, context = scaled_dot_product_attention(Q, K, V)
print("Attention weights:\n", weights)
print("Context vector:\n", context)
 



Attention weights:
 [[0.33705711 0.34317447 0.31976842]
 [0.34479381 0.34656346 0.30864273]]
Context vector:
 [[0.48982646 0.39372753 0.45396315 0.39871791 0.33339153]
 [0.48720248 0.38842963 0.44774976 0.40415351 0.32578721]]
