### Attention Mechanism

[Machine Learning Mastery](https://machinelearningmastery.com/the-attention-mechanism-from-scratch/)

[Wikipedia](https://en.wikipedia.org/wiki/Attention_(machine_learning))

In [7]:
# 1. Encode sequence of words (with encoder, or manually)
import random
import numpy as np
from scipy.special import softmax

# Toy 3-dimensional encoder outputs, one vector per word in the sequence.
word_1, word_2, word_3, word_4 = (
    np.array(vec) for vec in ([1, 0, 0], [0, 1, 0], [1, 1, 0], [0, 0, 1])
)

# 2. Initialise the projection weights
# Each matrix maps the embedding dimension (rows) to the attention
# dimension (columns).  The seed is fixed so the printed attention vector
# is reproducible; the draw order (query, key, value) must not change,
# otherwise the matrices -- and the final result -- would differ.
np.random.seed(42)
W_Q, W_K, W_V = (np.random.randint(3, size=(3, 3)) for _ in range(3))

# 3. Project every word into its query / key / value vector
query_1, key_1, value_1 = (word_1 @ W for W in (W_Q, W_K, W_V))
query_2, key_2, value_2 = (word_2 @ W for W in (W_Q, W_K, W_V))
query_3, key_3, value_3 = (word_3 @ W for W in (W_Q, W_K, W_V))
query_4, key_4, value_4 = (word_4 @ W for W in (W_Q, W_K, W_V))

# 4. Similarity of the first query with each of the four keys
# One dot product per key; a larger score means word 1 attends more
# strongly to that word.
scores = np.array([query_1 @ key for key in (key_1, key_2, key_3, key_4)])

# Divide by sqrt(key dimension) before the softmax so the scores are turned
# into a probability distribution over the four words.
weights = softmax(scores / len(key_1) ** 0.5)

# 5. Attention output for word 1
# Weighted sum of the value vectors, accumulated left to right.
attention = sum(
    w * v for w, v in zip(weights, (value_1, value_2, value_3, value_4))
)

print(attention)

[0.98522025 1.74174051 0.75652026]
