## 3.3

### 3.3.1

In [3]:
import torch
from torch import nn

In [4]:
inputs = torch.tensor(
    [[0.43, 0.15, 0.89], # Your (x^1)
    [0.55, 0.87, 0.66], # journey (x^2)
    [0.57, 0.85, 0.64], # starts (x^3)
    [0.22, 0.58, 0.33], # with (x^4)
    [0.77, 0.25, 0.10], # one (x^5)
    [0.05, 0.80, 0.55]] # step (x^6)
)

In [5]:
# calculating attention scores $omega$

query = inputs[1]
attn_scores2 = torch.empty(inputs.shape[0])
for i, x_i in enumerate(inputs):
    attn_scores2[i] = torch.dot(x_i, query)
print(attn_scores2)

#normalizing attentions scores
# In practice, it’s more common and advisable to use the softmax function for normalization.
attn_weights2_tmp = attn_scores2 / attn_scores2.sum()
print(attn_weights2_tmp, f"\n{attn_weights2_tmp.sum()}")

tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])
tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656]) 
1.0000001192092896


In [6]:
def softmax_naive(x: torch.Tensor):
    return torch.exp(x)/torch.exp(x).sum(dim=0)

attn_weights2_naive = softmax_naive(attn_scores2)
print(attn_weights2_naive, f"\n{attn_weights2_naive.sum()}")

# since torch's implementation is more stable and is optimized for performance
attn_weights2 = torch.softmax(attn_scores2, dim=0)
print(attn_weights2, f"\n{attn_weights2.sum()}")

tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581]) 
1.0
tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581]) 
1.0


In [7]:
# calculating context vector z^(2)
query = inputs[1]
context_vect2 = torch.zeros(query.shape[0])
for i, x_i in enumerate(inputs):
    context_vect2 += attn_weights2[i] * x_i
print(context_vect2)

tensor([0.4419, 0.6515, 0.5683])


### 3.3.2

In [8]:
attn_weights = torch.empty(inputs.shape[0], inputs.shape[0])
context_vects = torch.zeros(inputs.shape[0], inputs.shape[1])

for i, q_i in enumerate(inputs):
    for j, x_i in enumerate(inputs):
        attn_weights[i][j] = torch.dot(q_i, x_i)
attn_weights[i] = torch.softmax(attn_weights[i], dim=-1)
# By setting dim=-1, we are instructing the softmax function to apply the normalization along the last dimension of the attn_scores tensor

context_vects = attn_weights @ inputs # torch.matmul()
print(attn_weights)
print(context_vects)

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])
tensor([[1.9802, 2.6760, 2.6721],
        [2.8579, 4.2330, 3.7270],
        [2.8335, 4.1718, 3.6734],
        [1.5501, 2.4086, 2.0693],
        [1.5969, 1.8945, 1.6743],
        [0.4177, 0.6503, 0.5645]])


## 3.4

### 3.4.1

In [9]:
x_2 = inputs[1]
d_in = inputs.shape[1]
d_out = 2

torch.manual_seed(123)
W_q = nn.Parameter(torch.rand(d_in, d_out), requires_grad=False) # req grad is false for illustration, in actual implementation, this must be true
W_k = nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_v = nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)

q_2 = x_2 @ W_q 
k_2 = x_2 @ W_k
v_2 = x_2 @ W_v

print(q_2)

tensor([0.4306, 1.4551])


In [None]:
queries = inputs @ W_q # the matrix mul is done on the last dim, the 6 elements here will be broadcast. since for an ele, size is 1x3  and the W_q size is 3x2
keys = inputs @ W_k
values = inputs @ W_q

print(inputs.shape, W_q.shape)
print(queries)

torch.Size([6, 3]) torch.Size([3, 2])
tensor([[0.2309, 1.0966],
        [0.4306, 1.4551],
        [0.4300, 1.4343],
        [0.2355, 0.7990],
        [0.2983, 0.6565],
        [0.2568, 1.0533]])


In [12]:
# computing omega_22
k_2 = keys[1]
attn_score_22 = q_2.dot(k_2)
print(attn_score_22)

tensor(1.8524)


In [16]:
# computing omega_2i ie[0, T]
attn_scores_2 = q_2 @ keys.T
print(q_2)
print(keys.T)
print(attn_scores_2)

tensor([0.4306, 1.4551])
tensor([[0.3669, 0.4433, 0.4361, 0.2408, 0.1827, 0.3275],
        [0.7646, 1.1419, 1.1156, 0.6706, 0.3292, 0.9642]])
tensor([1.2705, 1.8524, 1.8111, 1.0795, 0.5577, 1.5440])


In [18]:
d_k = keys.shape[-1]
attn_weights_2 = torch.softmax(attn_scores_2/d_k**0.5, dim=-1)
print(attn_weights_2)

tensor([0.1500, 0.2264, 0.2199, 0.1311, 0.0906, 0.1820])


In [20]:
# calculating the context vector z_2
context_vect_2 = attn_weights_2 @ values
print(context_vect_2)

tensor([0.3313, 1.1652])
