Reference

- [Understanding the Attention Mechanism — A Simple Implementation Using Python and NumPy](https://medium.com/@christoschr97/understanding-the-attention-mechanism-a-simple-implementation-using-python-and-numpy-3f1feae13fb7)

# Step 1: Initial Word Embeddings


In [1]:
import numpy as np

In [2]:
# Hard-coded 4-dimensional embedding vector for each token.
# Insertion order (she, likes, coffee) is preserved by the dict.
word_embeddings = {
    token: np.array(vector)
    for token, vector in (
        ('she',    [0.2, 0.9, 0.1, 0.5]),
        ('likes',  [0.8, 0.3, 0.7, 0.2]),
        ('coffee', [0.4, 0.6, 0.3, 0.9]),
    )
}

# Step 2: Create Input Matrix X


In [11]:
# Stack one embedding per row, in sentence order, to form the input matrix:
# one row per token, one column per embedding dimension.
X = np.vstack([word_embeddings[token] for token in ('she', 'likes', 'coffee')])
X.shape, X

((3, 4),
 array([[0.2, 0.9, 0.1, 0.5],
        [0.8, 0.3, 0.7, 0.2],
        [0.4, 0.6, 0.3, 0.9]]))

# Step 3: Define Weight Matrices W_q, W_k, and W_v



In [13]:
# Hard-coded 4x4 projection matrices (fixed values, nothing is trained here).

# Query projection: 0.9 on the diagonal, 0.1 everywhere else.
W_q = np.array([
    [0.9, 0.1, 0.1, 0.1],
    [0.1, 0.9, 0.1, 0.1],
    [0.1, 0.1, 0.9, 0.1],
    [0.1, 0.1, 0.1, 0.9],
])

# Key projection: same values as W_q, held in its own independent array.
W_k = W_q.copy()

# Value projection: dominant diagonal with slightly different off-diagonal mixing.
W_v = np.array([
    [0.8, 0.2, 0.1, 0.1],
    [0.2, 0.8, 0.2, 0.1],
    [0.1, 0.2, 0.8, 0.1],
    [0.1, 0.1, 0.1, 0.9],
])

In [22]:
# Sanity check: first column of W_q dotted with the first row of X.
# This is the value that lands at Q[0, 0] when Q = X @ W_q.
W_q[:, 0] @ X[0]


np.float64(0.33000000000000007)

# Step 4: Compute Q, K, and V Matrices


In [None]:
# Hand-compute a single entry of Q: row 0 of X dotted with column 0 of W_q
# gives Q[0, 0].
X[0] @ W_q[:, 0]

np.float64(0.33000000000000007)

In [28]:

# Project the input matrix into query, key and value spaces.
Q = np.matmul(X, W_q)  # queries
K = np.matmul(X, W_k)  # keys
V = np.matmul(X, W_v)  # values

Q.shape, K.shape, V.shape

((3, 4), (3, 4), (3, 4))

# Step 5: Calculate Raw Attention Scores


In [31]:
# Q @ K.T requires Q's column count to equal K.T's row count; show both shapes.
Q.shape, K.transpose().shape

((3, 4), (4, 3))

In [33]:
# Raw attention scores: scores[i, j] is the dot product of query i with key j.
scores = Q @ K.T
scores


array([[1.2884, 1.064 , 1.452 ],
       [1.064 , 1.6064, 1.4496],
       [1.452 , 1.4496, 1.8768]])

# Step 6: Scale the Scores


In [41]:
# d_k is the width of each key vector (number of columns of K).
d_k = K.shape[-1]

# Divide the raw scores by sqrt(d_k).
scaled_scores = np.divide(scores, np.sqrt(d_k))

scaled_scores.shape, scaled_scores

((3, 3),
 array([[0.6442, 0.532 , 0.726 ],
        [0.532 , 0.8032, 0.7248],
        [0.726 , 0.7248, 0.9384]]))

# Step 7: Apply Softmax to Obtain Attention Weights


In [52]:
# Softmax numerator: element-wise exponential of the scaled scores.
exp_scores = np.exp(scaled_scores)

# Show the exponentials next to their per-row sums (the softmax denominators);
# keepdims=True keeps the sums as a column so they line up with the rows.
exp_scores, np.sum(exp_scores, axis=1, keepdims=True)




(array([[1.90446285, 1.70233357, 2.06679686],
        [1.70233357, 2.23267407, 2.0643182 ],
        [2.06679686, 2.0643182 , 2.55588872]]),
 array([[5.67359329],
        [5.99932583],
        [6.68700378]]))

In [54]:
# Row-wise softmax of the scaled scores, computed in a numerically stable way.
# Subtracting each row's maximum before exponentiating does not change the
# result (the common factor exp(-max) cancels between numerator and
# denominator), but it keeps every exponent <= 0 so np.exp cannot overflow to
# inf (which would turn the weights into nan) when scores are large.
shifted_scores = scaled_scores - scaled_scores.max(axis=1, keepdims=True)
stable_exp = np.exp(shifted_scores)

# Normalise each row so that the weights for one query sum to 1.
attention_weights = stable_exp / stable_exp.sum(axis=1, keepdims=True)

attention_weights.shape, attention_weights

((3, 3),
 array([[0.33567137, 0.30004505, 0.36428358],
        [0.28375415, 0.37215416, 0.34409169],
        [0.30907667, 0.308706  , 0.38221733]]))

# Step 8: Calculate the Final Output


In [59]:
# Final attention output: each row is the attention-weighted combination of
# the rows of V for the corresponding token.
output = np.matmul(attention_weights, V)

output.shape, output


((3, 4),
 array([[0.57530294, 0.70527381, 0.50530294, 0.64177546],
        [0.60019479, 0.68822737, 0.53019479, 0.61916155],
        [0.58155011, 0.7007833 , 0.51155011, 0.64659215]]))