For background, refer to this [article](https://medium.com/@christoschr97/understanding-the-attention-mechanism-a-simple-implementation-using-python-and-numpy-3f1feae13fb7) on implementing the attention mechanism with Python and NumPy.

# Step 1. Initial Word Embedding

In [6]:
import numpy as np

# Toy 4-dimensional embedding vector for each token, keyed by the token text.
word_embeddings = {
    token: np.array(components)
    for token, components in (
        ('she',    [0.2, 0.9, 0.1, 0.5]),
        ('likes',  [0.8, 0.3, 0.7, 0.2]),
        ('coffee', [0.4, 0.6, 0.3, 0.9]),
    )
}


# Step 2. Create Input Matrix X

In [12]:
# Stack the token vectors as rows, in the order she, likes, coffee.
X = np.vstack([word_embeddings[token] for token in ('she', 'likes', 'coffee')])



# Step 3. Define Weight Matrices W_q, W_k and W_v

In [19]:
# W_q and W_k hold the same values: 0.9 on the diagonal, 0.1 everywhere else.
W_q = np.array([[0.9 if row == col else 0.1 for col in range(4)]
                for row in range(4)])
W_k = np.array(W_q)  # np.array copies by default, so W_k is an independent array
# W_v has no uniform pattern, so it is written out row by row.
W_v = np.array([
    [0.8, 0.2, 0.1, 0.1],
    [0.2, 0.8, 0.2, 0.1],
    [0.1, 0.2, 0.8, 0.1],
    [0.1, 0.1, 0.1, 0.9],
])

# Step 4. Compute Q, K, V Matrices

In [20]:
# Project the input matrix with each weight matrix to get queries, keys, values.
Q, K, V = (X.dot(weights) for weights in (W_q, W_k, W_v))

# Step 5. Calculate Raw Attention Scores

In [29]:
# Dot product of every query row with every key row.
scores = Q.dot(K.T)

# Show both operands of the product, one after the other.
print(Q, K.T, sep='\n')

[[0.33 0.89 0.25 0.57]
 [0.84 0.44 0.76 0.36]
 [0.54 0.7  0.46 0.94]]
[[0.33 0.84 0.54]
 [0.89 0.44 0.7 ]
 [0.25 0.76 0.46]
 [0.57 0.36 0.94]]


In [31]:
# Bare expression so the notebook echoes the Q·Kᵀ score matrix.
scores

array([[1.2884, 1.064 , 1.452 ],
       [1.064 , 1.6064, 1.4496],
       [1.452 , 1.4496, 1.8768]])

# Step 6. Scale the Scores

In [39]:
# Width of each key vector, i.e. the number of columns of K.
d_k = K.shape[-1]

# Divide every raw score by sqrt(d_k).
scaled_scores = np.divide(scores, np.sqrt(d_k))

scaled_scores

array([[0.6442, 0.532 , 0.726 ],
       [0.532 , 0.8032, 0.7248],
       [0.726 , 0.7248, 0.9384]])

# Step 7. Apply Softmax to Obtain Attention Weights

In [51]:
# Row-wise softmax over the scaled scores: each row of attention_weights sums to 1.
# Each row's maximum is subtracted before exponentiating. The common factor
# cancels in the ratio, so the weights are mathematically unchanged, but
# np.exp can no longer overflow to inf (which would turn the weights into nan)
# when the scores are large.
# NOTE: exp_scores therefore holds the shifted exponentials,
# exp(scaled_scores - row_max), not exp(scaled_scores) itself.
exp_scores = np.exp(scaled_scores - scaled_scores.max(axis=1, keepdims=True))
attention_weights = exp_scores / exp_scores.sum(axis=1, keepdims=True)

attention_weights

array([[0.33567137, 0.30004505, 0.36428358],
       [0.28375415, 0.37215416, 0.34409169],
       [0.30907667, 0.308706  , 0.38221733]])

# Step 8. Calculate the Final Output

In [53]:
# Each output row is the attention-weighted combination of the rows of V.
output = attention_weights.dot(V)
output

array([[0.57530294, 0.70527381, 0.50530294, 0.64177546],
       [0.60019479, 0.68822737, 0.53019479, 0.61916155],
       [0.58155011, 0.7007833 , 0.51155011, 0.64659215]])