# Multi-Head Attention Sub-layer


In [8]:
#@title Step 1. Input: 3 inputs, d_model = 4

import numpy as np
from scipy.special import softmax

# Toy input sequence: one row per input, one column per model dimension
# (3 inputs, d_model = 4). dtype=float keeps the values as floating point.
x = np.array(
    [[1, 0, 1, 0],   # Input 1
     [0, 2, 0, 2],   # Input 2
     [1, 1, 1, 1]],  # Input 3
    dtype=float,
)
print(x)

[[1. 0. 1. 0.]
 [0. 2. 0. 2.]
 [1. 1. 1. 1.]]


In [4]:
#@title Step 2. Initialize the Key, Query, Value Matrices

# Hand-picked 4x3 projection weights (d_model = 4 rows, 3 output columns).
# In this toy example the key, query and value weights share the same numbers;
# each name still gets its own independent array.
_toy_weights = [
    [1, 0, 1],
    [1, 0, 0],
    [0, 0, 1],
    [0, 1, 1],
]
key = np.array(_toy_weights)
query = np.array(_toy_weights)
values = np.array(_toy_weights)

In [6]:
#@title Step 3. Matrix multiplication to obtain Q, K, V

# Project the inputs with each weight matrix; `@` is matrix multiplication.
Q = x @ query    # queries
K = x @ key      # keys
V = x @ values   # values

In [10]:
#@title Step 4. Scale attention scores

# `@` is Python's matrix-multiplication operator (same result as np.matmul here).
#
# Scaled dot-product scores, based on the original Transformer equation:
#     Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k)) V
# The divisor is the key dimension d_k (number of columns of K), not the
# number of inputs. The previous `len(x)` counted the rows of x and only gave
# the same numbers because this example has 3 inputs and d_k = 3.
d_k = K.shape[-1]
attention_scores = (Q @ K.T) / np.sqrt(d_k)

print(attention_scores)

[[2.88675135 3.46410162 4.61880215]
 [3.46410162 6.92820323 6.92820323]
 [4.61880215 6.92820323 8.08290377]]


In [12]:
#@title Step 5. Apply Softmax to the Scaled Attention Scores

# Normalise every row of scores into a probability distribution (each row sums to 1).
# The result is written back into the same array, so `attention_scores` holds
# probabilities from here on; re-running this cell would apply softmax again.
attention_scores[:] = softmax(attention_scores, axis=1)

print(attention_scores)

[[0.11857409 0.21121746 0.67020845]
 [0.01540939 0.4922953  0.4922953 ]
 [0.02324709 0.23406082 0.74269209]]


In [16]:
#@title Step 6. Final Attention Representations

# Weight every value vector by every input's attention probability in one
# broadcasted product:
#     weighted_values[i, j] == attention_scores[i][j] * V[j]
# where i indexes the attending input and j the value vector being weighted.
weighted_values = attention_scores[:, :, None] * V[None, :, :]

# Unpack into the per-pair names used by the following cells
# (attention_<j>_input_<i + 1>).
attention_0_input_1, attention_1_input_1, attention_2_input_1 = weighted_values[0]
attention_0_input_2, attention_1_input_2, attention_2_input_2 = weighted_values[1]
attention_0_input_3, attention_1_input_3, attention_2_input_3 = weighted_values[2]

In [19]:
#@title Step 7. Sum all attention representations for the input

# The representation of each input is the element-wise sum of its three
# weighted value vectors.
attention_input_1 = np.sum(
    (attention_0_input_1, attention_1_input_1, attention_2_input_1), axis=0
)
attention_input_2 = np.sum(
    (attention_0_input_2, attention_1_input_2, attention_2_input_2), axis=0
)
attention_input_3 = np.sum(
    (attention_0_input_3, attention_1_input_3, attention_2_input_3), axis=0
)

# The labels say "Attention Score", but the printed vectors are the summed,
# weighted value vectors computed above.
print(f'Attention Score for Input 1: {attention_input_1}')
print(f'Attention Score for Input 2: {attention_input_2}')
print(f'Attention Score for Input 3: {attention_input_3}')

Attention Score for Input 1: [1.88142591 1.09264338 2.67020845]
Attention Score for Input 2: [1.98459061 1.47688591 2.4922953 ]
Attention Score for Input 3: [1.97675291 1.21081373 2.74269209]


In [20]:
#@title Step 8. Concatenation of the output heads

# Narrative-only cell: nothing is computed here, the concatenation of the
# heads is only described in text.
print(
    "Let's assume for a moment that steps 2-7 happened 8 times, for all 8 'heads' of the transformer",
    "This means that we have effectively 8x each of the vectors above",
    "We just concatenate them to obtain the output dimensions we require",
    sep="\n",
)

Let's assume for a moment that steps 2-7 happened 8 times, for all 8 'heads' of the transformer
This means that we have effectively 8x each of the vectors above
We just concatenate them to obtain the output dimensions we require


In [None]:
#@title Step 9. Post-Layer Normalization Step

# Narrative-only cell: the normalization step is described, not computed.
print("This is just a normalization layer, nothing special here")