# **Implementing a simplified attention mechanism. (without trainable weights)**

In [2]:
import torch

# Toy example: six tokens, each embedded as a 3-dimensional vector to keep the math easy to follow.
inputs = torch.tensor([
    [0.43, 0.15, 0.89],  # Your    (x^1)
    [0.55, 0.87, 0.66],  # journey (x^2)
    [0.57, 0.85, 0.64],  # starts  (x^3)
    [0.22, 0.58, 0.33],  # with    (x^4)
    [0.77, 0.25, 0.10],  # one     (x^5)
    [0.05, 0.80, 0.55],  # step    (x^6)
])

In [3]:
# Compute the attention scores for 'journey', which is the second token (index 1) in the inputs tensor.
query = inputs[1]
attention_scores_2 = torch.empty(inputs.shape[0])

# The score of each token is the dot product of its embedding with the query embedding.
for i in range(inputs.shape[0]):
  attention_scores_2[i] = torch.dot(inputs[i], query)

print(attention_scores_2)

tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])


Now we normalize these attention scores to obtain attention weights that sum to 1.

In [4]:
# Simple normalization: divide every score by the total so that the weights sum to 1.
attn_weights_2_tmp = attention_scores_2 / attention_scores_2.sum()
print("Attention weights:", attn_weights_2_tmp)
print("Sum:", torch.sum(attn_weights_2_tmp))

Attention weights: tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656])
Sum: tensor(1.0000)


It is better to use softmax for normalization: $\text{softmax}(x_i) = \dfrac{e^{x_i}}{\sum_j e^{x_j}}$

`torch.softmax` uses $e^{x_i - \max(x)}$ in place of $e^{x_i}$: subtracting the maximum from every exponent leaves the result unchanged, but it avoids overflow errors and gives better numerical precision.

In addition, the softmax function ensures that all weights are positive, so there is no problem when summing them.

In [5]:
def softmax_naive(x):
  """Textbook softmax: exponentiate every entry, then divide by the sum taken over dim 0.

  For a 1-D tensor, dim 0 is the only dimension, so the denominator is the sum
  of all exponentiated entries. No max-subtraction is applied, so very large
  inputs can overflow in torch.exp.
  """
  exp_x = torch.exp(x)
  return exp_x / torch.sum(exp_x, dim=0)

# Apply the hand-written softmax to the 'journey' scores and check that the weights sum to 1.
attn_weights_2_naive = softmax_naive(attention_scores_2)
print("Attention weights:", attn_weights_2_naive)
print("Sum:", torch.sum(attn_weights_2_naive))

Attention weights: tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
Sum: tensor(1.)


In [6]:
# PyTorch's built-in softmax, applied to the same 'journey' scores
attn_weights_2 = attention_scores_2.softmax(dim=0)
print("Attention weights:", attn_weights_2)
print("Sum:", torch.sum(attn_weights_2))

Attention weights: tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
Sum: tensor(1.)


In [7]:
# Now compute the context vector for 'journey':
# each input vector is scaled by its attention weight, and the scaled vectors are added together.
query = inputs[1]

context_vec_2 = torch.zeros(query.shape)
for i in range(len(inputs)):
  context_vec_2 += attn_weights_2[i] * inputs[i]

print(context_vec_2)

tensor([0.4419, 0.6515, 0.5683])


We did this for 'journey', but we have to compute the same three things for every query:

1. Attention scores
2. Attention weights
3. Context vector

In [8]:
# Pairwise attention scores: entry [i, j] is the dot product of token i with token j.
# The matrix is sized from `inputs` instead of a hard-coded 6, so this cell keeps
# working if tokens are added to or removed from `inputs`.
attn_scores = torch.empty(inputs.shape[0], inputs.shape[0])

for i,x_i in enumerate(inputs):
  for j,x_j in enumerate(inputs):
    attn_scores[i,j] = torch.dot(x_i, x_j)

print(attn_scores)

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])


In [9]:
# Python-level for loops are slow, so compute all pairwise dot products in one step:
# multiplying inputs by its own transpose yields the same score matrix.
attn_scores = torch.matmul(inputs, inputs.T)
print(attn_scores)

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])


In [10]:
# For a 2-D tensor indexed [row, column], dim=-1 (the same as dim=1 here) applies softmax
# across the columns, so each row of attention weights sums to 1.
attn_weights = attn_scores.softmax(dim=-1)
print(attn_weights)

tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],
        [0.1435, 0.2074, 0.2046, 0.1462, 0.1263, 0.1720],
        [0.1526, 0.1958, 0.1975, 0.1367, 0.1879, 0.1295],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])


In [11]:
# Context vectors for all tokens at once: row i is the attention-weighted sum of the input vectors for query i.
all_context_vectors = torch.matmul(attn_weights, inputs)
print(all_context_vectors)

tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])


Why do we need trainable weights now?
- In the example above, 'one' and 'journey' are not strongly related to each other, but in another sentence the two might have a strong connection. Trainable weights let the model learn such relationships.
- Apart from the meaning of individual words, we have to capture the context of the sentence. This is why we need trainable weights.