## **The Attention Mechanism**

In [None]:
import numpy as np
from scipy.special import softmax

In [None]:
# Three toy input vectors, one per row, each of dimension d_model = 4.
print("Step 1: Input : 3 inputs, d_model=4")
input_rows = [
    [1.0, 0.0, 1.0, 0.0],  # Input 1
    [0.0, 2.0, 0.0, 2.0],  # Input 2
    [1.0, 1.0, 1.0, 1.0],  # Input 3
]
x = np.array(input_rows)

print(x)

Step 1: Input : 3 inputs, d_model=4
[[1. 0. 1. 0.]
 [0. 2. 0. 2.]
 [1. 1. 1. 1.]]


In [None]:
# Query projection matrix: 4 rows (d_model) by 3 columns (query dimensions).
for banner in ("Step 2: weights 3 dimensions x d_model=4", "w_query"):
    print(banner)

w_query = np.array([
    [1, 0, 1],
    [1, 0, 0],
    [0, 0, 1],
    [0, 1, 1],
])

print(w_query)

Step 2: weights 3 dimensions x d_model=4
w_query
[[1 0 1]
 [1 0 0]
 [0 0 1]
 [0 1 1]]


In [None]:
# Key projection matrix: 4 rows (d_model) by 3 columns (key dimensions).
print("w_key")
key_weight_rows = [
    [0, 0, 1],
    [1, 1, 0],
    [0, 1, 0],
    [1, 1, 0],
]
w_key = np.array(key_weight_rows)

print(w_key)

w_key
[[0 0 1]
 [1 1 0]
 [0 1 0]
 [1 1 0]]


In [None]:
# Value projection matrix: 4 rows (d_model) by 3 columns (value dimensions).
print("w_value")
value_weight_rows = [
    [0, 2, 0],
    [0, 3, 0],
    [1, 0, 3],
    [1, 1, 0],
]
w_value = np.array(value_weight_rows)

print(w_value)

w_value
[[0 2 0]
 [0 3 0]
 [1 0 3]
 [1 1 0]]


In [None]:
print("Step 3: Matrix multiplication to obtain Q, K, V")

# Project every input row onto the query weights (matrix product).
print("Queries: x * w_query")
Q = x @ w_query
print(Q)

Step 3: Matrix multiplication to obtain Q, K, V
Queries: x * w_query
[[1. 0. 2.]
 [2. 2. 2.]
 [2. 1. 3.]]


In [None]:
print("Step 3 Matrix multiplication to obtain Q, K , V")

# Project every input row onto the key weights (matrix product).
print("Keys: x * w_key")

K = x @ w_key
print(K)

Step 3 Matrix multiplication to obtain Q, K , V
Keys: x * w_key
[[0. 1. 1.]
 [4. 4. 0.]
 [2. 3. 1.]]


In [None]:
# Project every input row onto the value weights (matrix product).
print("Values: x * w_value")

V = x @ w_value
print(V)

Values: x * w_value
[[1. 2. 3.]
 [2. 8. 0.]
 [2. 6. 3.]]


In [None]:
print("Step 4 Scalled Attention Score")
# Scaling divisor: the square root of k_d=3, rounded down to 1 for this example.
k_d = 1
# Raw score of every query against every key, divided by the scaling divisor.
attention_scores = np.matmul(Q, K.T) / k_d
print(attention_scores)

Step 4 Scalled Attention Score
[[ 2.  4.  4.]
 [ 4. 16. 12.]
 [ 4. 12. 10.]]


In [None]:
print("Step 5 Scaled softmax attention_scores for each vector")
# Rebuild the raw scores from Q, K and k_d and normalise every row in one call.
# Overwriting attention_scores row by row in place made this cell
# non-idempotent: running it a second time applied softmax to rows that were
# already normalised. axis=1 applies softmax independently to each row,
# which is what the three per-row calls did.
attention_scores = softmax((Q @ K.transpose()) / k_d, axis=1)
print(attention_scores[0])
print(attention_scores[1])
print(attention_scores[2])

Step 5 Scaled softmax attention_scores for each vector
[0.06337894 0.46831053 0.46831053]
[6.03366485e-06 9.82007865e-01 1.79861014e-02]
[2.95387223e-04 8.80536902e-01 1.19167711e-01]


In [None]:
print("Step 6: attention value obtained by score 1/k_d * V")
for row_index in range(3):
    print(V[row_index])

# Attention weights of the first input over the three keys.
first_row_weights = attention_scores[0]

print("Attention 1")
# Column view of the weights, printed for illustration; replaced just below.
attention1 = first_row_weights.reshape(-1, 1)
print(attention1)
# Weight each value row by the matching attention weight of input 1.
attention1 = first_row_weights[0] * V[0]
print(attention1)

print("Attention 2")
attention2 = first_row_weights[1] * V[1]
print(attention2)

print("Attention 3")
attention3 = first_row_weights[2] * V[2]
print(attention3)

Step 6: attention value obtained by score 1/k_d * V
[1. 2. 3.]
[2. 8. 0.]
[2. 6. 3.]
Attention 1
[[0.06337894]
 [0.46831053]
 [0.46831053]]
[0.06337894 0.12675788 0.19013681]
Attention 2
[0.93662106 3.74648425 0.        ]
Attention 3
[0.93662106 2.80986319 1.40493159]


In [None]:
print("Step 7: summed the results to create the first line of the output matrix")
# Element-wise sum of the three weighted value rows, added left to right.
attention_input1 = np.add(np.add(attention1, attention2), attention3)
print(attention_input1)

Step 7: summed the results to create the first line of the output matrix
[1.93662106 6.68310531 1.59506841]


In [None]:
print("Step 8: Step 1 to 7 for inputs 1 to 3")
# Stand-in for the result of steps 1-7 applied to all three inputs: the weights
# were not trained in this example, so random values are used. The 64 columns
# follow the per-head dimension assumed from the original Transformer paper.
head_shape = (3, 64)
attention_head1 = np.random.random(size=head_shape)
print(attention_head1)

Step 8: Step 1 to 7 for inputs 1 to 3
[[1.25375360e-01 6.06975490e-01 4.49749886e-01 2.51785159e-01
  1.01632837e-01 3.41516035e-01 9.18368031e-01 3.98122360e-01
  4.99689358e-01 9.72080480e-02 8.65875165e-01 1.13310285e-01
  3.40447086e-01 3.67311911e-05 2.25938973e-01 9.80795251e-02
  6.17591743e-01 5.10631442e-01 6.27112175e-01 8.95516589e-01
  2.98942697e-02 5.73468790e-01 9.24361382e-01 2.17516690e-01
  6.86533405e-01 2.39276758e-01 7.51942518e-01 8.64038064e-01
  5.03306858e-01 9.42705553e-01 7.08888673e-01 3.99020733e-01
  9.84769062e-01 3.59816646e-01 3.94587224e-01 5.37475178e-01
  6.79690590e-01 8.74918288e-01 9.69953409e-01 3.50617092e-01
  4.12672451e-01 4.67935157e-01 8.30734483e-01 7.55077347e-01
  1.05808625e-01 7.44503944e-01 8.70523863e-01 9.20324229e-01
  4.82333271e-01 9.51804765e-02 9.58110923e-01 3.96611438e-01
  3.15356740e-01 5.37776819e-01 3.01296838e-01 3.78607640e-01
  4.40322323e-01 6.44670028e-01 1.93612622e-01 8.17858708e-01
  8.72871679e-01 8.28499488e-01 

In [None]:
print("Step 9: We assumed that we have trained the 8 heads of the attention-sublayer")
# Eight random (3, 64) stand-ins, one per attention head. They are drawn in
# order, so z0h1 receives the first draw and z0h8 the last.
z0h1, z0h2, z0h3, z0h4, z0h5, z0h6, z0h7, z0h8 = (
    np.random.random((3, 64)) for _ in range(8)
)

print("Shape of the head", z0h1.shape, "dimension of 8 heads", 64*8)

Step 9: We assumed that we have trained the 8 heads of the attention-sublayer
Shape of the head (3, 64) dimension of 8 heads 512


In [None]:
print("Step 10: Concatenation of heads 1 to 8 to obtain the original 8x64=512 output dimension of the model")
# Join the eight heads side by side along the column axis.
all_heads = [z0h1, z0h2, z0h3, z0h4, z0h5, z0h6, z0h7, z0h8]
output_attention = np.concatenate(all_heads, axis=1)
print(output_attention)

Step 10: Concatenation of heads 1 to 8 to obtain the original 8x64=512 output dimension of the model
[[0.50046502 0.96693932 0.45553428 ... 0.56353731 0.45969431 0.23722083]
 [0.63241634 0.5657176  0.87726363 ... 0.10003414 0.34150873 0.36276206]
 [0.40562743 0.65486719 0.60506598 ... 0.88302911 0.80112783 0.88063849]]


# **Hugging Face Online**

In [None]:
!pip -qq install transformers

[K     |████████████████████████████████| 2.1MB 8.4MB/s 
[K     |████████████████████████████████| 870kB 39.5MB/s 
[K     |████████████████████████████████| 3.3MB 44.8MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [None]:
from transformers import pipeline

# Build the English-to-French translation pipeline and translate one sentence.
translator = pipeline("translation_en_to_fr")
english_sentence = "It is easy to translate languages with transformers"
french_result = translator(english_sentence, max_length=40)
print(french_result)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1199.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=891691430.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1389353.0, style=ProgressStyle(descript…


[{'translation_text': "Il est facile de traduire des langues à l'aide de transformateurs"}]


# **Positional Encoding**

In [None]:
!pip install --upgrade gensim

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/44/52/f1417772965652d4ca6f901515debcd9d6c5430969e8c02ee7737e6de61c/gensim-4.0.1-cp37-cp37m-manylinux1_x86_64.whl (23.9MB)
[K     |████████████████████████████████| 23.9MB 75.2MB/s 
Installing collected packages: gensim
  Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.0.1


In [None]:
import torch
import nltk
# Download NLTK's 'punkt' tokenizer data (presumably what the sent_tokenize /
# word_tokenize calls later in this notebook rely on; confirm for your NLTK version).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import math
import numpy as np 
from nltk.tokenize import sent_tokenize, word_tokenize 
import gensim 
from gensim.models import Word2Vec 
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity 
import matplotlib.pyplot as plt
import warnings



In [None]:
warnings.filterwarnings(action="ignore")

In [None]:
dprint = 0 # print if set to 1, default=0

# Read the whole corpus. The with-block closes the file handle as soon as the
# text has been read (the handle was previously left open).
with open("text.txt", "r") as sample:
  s = sample.read()

# processing escape characters: replace newlines with spaces
f = s.replace("\n", " ")

# sentence parsing: one list of lower-cased word tokens per detected sentence
data = [
  [token.lower() for token in word_tokenize(sentence)]
  for sentence in sent_tokenize(f)
]

# creating a skip gram model (sg = 1) with 512-dimensional vectors;
# min_count = 1 keeps tokens that occur only once
model2 = gensim.models.Word2Vec(data, min_count = 1, vector_size = 512,window = 5, sg = 1)

# The two words compared below and their 1-based positions in the sentence:
# 1-The 2-black 3-cat 4-sat 5-on 6-the 7-couch 8-and 9-the 10-brown 11-dog 12-slept 13-on 14-the 15-rug.
word1 = 'black'
word2 = 'brown'
pos1 = 2
pos2 = 10




In [None]:
# Display the tokenized corpus: a list of sentences, each a list of lower-cased tokens.
data

[['the',
  'black',
  'cat',
  'sat',
  'on',
  'the',
  'couch',
  'and',
  'the',
  'brown',
  'dog',
  'slept',
  'on',
  'the',
  'rug.the',
  'cat',
  'did',
  'not',
  'cross',
  'the',
  'street',
  'because',
  'it',
  'was',
  'too',
  'wet.the',
  'dog',
  'sat',
  'on',
  'the',
  'couch',
  'near',
  'the',
  'rug.the',
  'black',
  'cat',
  'sat',
  'on',
  'the',
  'couch',
  'and',
  'the',
  'brown',
  'dog',
  'slept',
  'on',
  'the',
  'rug.the',
  'cat',
  'did',
  'not',
  'cross',
  'the',
  'street',
  'because',
  'it',
  'was',
  'too',
  'wet.the',
  'dog',
  'sat',
  'on',
  'the',
  'couch',
  'near',
  'the',
  'rug',
  '.'],
 ['the',
  'black',
  'cat',
  'sat',
  'on',
  'the',
  'couch',
  'and',
  'the',
  'brown',
  'dog',
  'slept',
  'on',
  'the',
  'rug.the',
  'cat',
  'did',
  'not',
  'cross',
  'the',
  'street',
  'because',
  'it',
  'was',
  'too',
  'wet.the',
  'dog',
  'sat',
  'on',
  'the',
  'couch',
  'near',
  'the',
  'rug',
  '.'],

In [None]:
# Display the embedding vector the Word2Vec model holds for the token 'black'.
model2.wv['black']

array([-0.06562676, -0.05351067, -0.00367223,  0.06062045, -0.00825024,
        0.03129761,  0.04094658, -0.04247817,  0.02367185, -0.00713361,
        0.02598261, -0.00719516, -0.00289508, -0.00929288, -0.02900261,
       -0.02728165,  0.03671558,  0.07769306, -0.07263709, -0.04485692,
        0.04485429, -0.0331312 ,  0.10418659,  0.02434324, -0.02192772,
        0.00515193,  0.03559644,  0.03915192, -0.00977636, -0.1074532 ,
       -0.01346473,  0.03907426,  0.02671206, -0.01067896, -0.00663918,
        0.00800819,  0.04495849, -0.05629059, -0.0007354 , -0.01153078,
       -0.0825    ,  0.08180337, -0.07897275, -0.04355665,  0.05569196,
        0.01140372, -0.0011262 ,  0.05680713,  0.09018846,  0.09711311,
       -0.05737248,  0.04957562,  0.05798696,  0.1025627 , -0.06556484,
       -0.00416583,  0.05219064, -0.04471238,  0.00999579,  0.05621514,
        0.05462889, -0.03182608, -0.00359469,  0.02774829,  0.01147719,
       -0.00405973, -0.03139512,  0.03170056, -0.02370843, -0.02

In [None]:
# Embedding vectors of the two words being compared.
a, b = model2.wv[word1], model2.wv[word2]

if dprint == 1:
  print(a)

# Cosine similarity by hand: dot product divided by the product of the norms.
dot = a.dot(b)
norma, normb = np.linalg.norm(a), np.linalg.norm(b)
cos = dot / (norma * normb)

# Same measure through scikit-learn on (1, 512) row vectors; this result
# replaces the hand-computed `cos` above.
aa = np.reshape(a, (1, 512))
ba = np.reshape(b, (1, 512))
cos = cosine_similarity(aa, ba)

A positional encoding example written in a few lines of basic Python using the sine and cosine functions. I added a PyTorch method inspired by pytorch.org to explore these methods. The main idea to keep in mind is that we are looking to add small values to the word embedding output so that the positions are taken into account. This means that as long as a measure such as the cosine similarity displayed at the end of the notebook shows that the positions have been taken into account, the method can be applied. Depending on the Transformer model, this method can be fine-tuned, and other methods can be used as well.

In [None]:
# Embedding width, the loop bound used by the positional-encoding loops, and
# the number of positions used for the torch encoding table.
d_model = 512
max_print = d_model
max_length = 20

# Working buffers. pe1/pe2 are overwritten with positional values by the loops
# below; pe3 is allocated the same way. All three start as copies of word1's
# embedding row.
pe1, pe2, pe3 = (aa.copy() for _ in range(3))
# Copies of the two embeddings that receive the positional values.
paa, pba = aa.copy(), ba.copy()

In [None]:
# Positional encoding for word1 at position pos1: even slots get the sine,
# odd slots the cosine of the same angle, and each embedding value is scaled
# by sqrt(d_model) before the positional value is added.
# NOTE(review): i already steps over the even indices, so the exponent here is
# 2*i/d_model, whereas the torch table built in this notebook uses i/d_model
# for the same index. Confirm which is intended.
for i in range(0, max_print, 2):
  pe1[0][i] = math.sin(pos1 / (10000 ** ((2 * i)/d_model)))
  paa[0][i] = (paa[0][i] * math.sqrt(d_model)) + pe1[0][i]
  # Fix: the exponent must follow the loop index i (it was the constant 2*1).
  pe1[0][i+1] = math.cos(pos1 / (10000 ** ((2 * i)/d_model)))
  # Fix: scale the odd slot's own embedding value, not the even slot that was
  # already rescaled on the line above.
  paa[0][i+1] = (paa[0][i+1] * math.sqrt(d_model)) + pe1[0][i+1]
  if dprint == 1:
    print(i, pe1[0][i], i+1, pe1[0][i+1])
    print(i, paa[0][i], i+1, paa[0][i+1])
    print("\n")

In [None]:
# Zero-filled table with one row per position and one column per model dimension.
max_len = max_length
pe = torch.zeros((max_len, d_model))

In [None]:
# Column vector of positions 0 .. max_len-1.
position = torch.arange(0, max_len, dtype=torch.float)[:, None]

# One scaling factor per even dimension index i: exp(-i * ln(10000) / d_model).
even_dims = torch.arange(0, d_model, 2).float()
log_scale = -math.log(10000.0) / d_model
div_term = torch.exp(even_dims * log_scale)

# Sines fill the even columns, cosines the odd columns.
angles = position * div_term
pe[:, 0::2] = torch.sin(angles)
pe[:, 1::2] = torch.cos(angles)


In [None]:
# Positional encoding for word2 at position pos2: even slots get the sine,
# odd slots the cosine of the same angle, and each embedding value is scaled
# by sqrt(d_model) before the positional value is added.
# NOTE(review): i already steps over the even indices, so the exponent here is
# 2*i/d_model, whereas the torch table built in this notebook uses i/d_model
# for the same index. Confirm which is intended.
for i in range(0, max_print, 2):
  # Fix: the exponent must follow the loop index i (it was the constant 2*8).
  pe2[0][i] = math.sin(pos2/(10000 ** ((2 * i)/d_model)))
  pba[0][i] = (pba[0][i] * math.sqrt(d_model)) + pe2[0][i]

  pe2[0][i+1] = math.cos(pos2/ (10000 ** ((2 * i)/d_model)))
  pba[0][i+1] = (pba[0][i+1] * math.sqrt(d_model)) + pe2[0][i+1]

  if dprint == 1:
    print(i, pe2[0][i], i+1, pe2[0][i+1])
    # Fix: show the values just written for word2 (pba), not word1's paa.
    print(i, pba[0][i], i+1, pba[0][i+1])
    print("\n")


print(word1, word2)

# Cosine similarity of the raw embeddings, of the positional vectors alone,
# and of the embeddings with positional values added. cos_lib ends up holding
# the last of the three.
comparisons = (
  (aa, ba, "word_similarity"),
  (pe1, pe2, "positional similarity"),
  (paa, pba, "positional encoding similarity"),
)
for first_vec, second_vec, label in comparisons:
  cos_lib = cosine_similarity(first_vec, second_vec)
  print(cos_lib, label)

# Optional dump of every intermediate vector for both words.
if dprint == 1:
  for word, embedding, positional, combined in (
      (word1, aa, pe1, paa),
      (word2, ba, pe2, pba),
  ):
    print(word)
    print("embedding")
    print(embedding)
    print("positional encoding")
    print(positional)
    print("encoding embedding")
    print(combined)

black brown
[[0.9995165]] word_similarity
[[-0.3699011]] positional similarity
[[0.05395487]] positional encoding similarity
