# Self-Attention Mechanism

The self-attention mechanism allows the model to weigh the importance of different words in a sequence. It helps the model focus on relevant words while encoding a particular word. This is a critical part of the Transformer architecture.

In the prior module, we left off with the positional-encoded embeddings. To continue, we now pass those embeddings into the next component of the Transformer: the attention mechanism.

In [47]:
import torch
import torch.nn as nn
import math

# Positional-encoded embeddings carried over from the previous notebook.
# Written as one row per token (10 tokens x 10 features); the leading batch
# axis of size 1 is added afterwards with unsqueeze(0).
pos_encoded_embeddings = torch.tensor(
    [
        [0.0171, 1.0654, 0.4616, 0.9196, 0.7193, -0.7430, 0.7120, 0.4198, 2.7427, 0.2844],
        [0.9161, -0.4999, 1.8302, 1.2182, 0.0651, 2.0396, 0.3780, 0.4085, 1.3560, -1.3176],
        [-0.3461, 0.2526, 0.4120, 0.0398, 1.6146, 2.6475, -0.1887, -0.6907, -0.4066, 1.2899],
        [-0.8345, -0.4657, -0.5635, 1.1514, 0.1762, 1.8148, -1.4084, 0.9153, -1.1734, 1.1989],
        [-0.3981, -0.7277, -1.2657, 1.9887, -0.2399, 0.0412, 1.9375, 0.6083, 0.2095, 1.5739],
        [-1.5240, 1.2359, 0.5596, -0.1529, -0.4064, -0.0906, -1.6746, 0.2480, -0.2364, 0.8417],
        [0.6992, -1.1193, 0.5642, 1.3861, -0.5185, 0.6701, -0.5353, 1.8013, 0.7900, -0.9430],
        [-0.3550, 2.1032, 1.8690, 0.7174, -1.7692, 0.8200, -0.9620, 1.8325, 0.3009, 1.0083],
        [1.5902, -0.8516, 3.2954, -0.5147, 0.1798, 1.8522, -1.0186, 0.6484, 0.9932, 0.4030],
        [-0.5990, -0.4916, 0.5419, -0.0293, -0.3465, 1.1548, -1.1130, -0.8118, 0.3455, 0.9474],
    ],
    dtype=torch.float32,
).unsqueeze(0)

# Expected shape: torch.Size([1, 10, 10]) -> a batch of 1, 10 tokens, 10 features per token
print("Positional Encoded Embeddings Shape:", pos_encoded_embeddings.shape)
print(pos_encoded_embeddings)

Positional Encoded Embeddings Shape: torch.Size([1, 10, 10])
tensor([[[ 0.0171,  1.0654,  0.4616,  0.9196,  0.7193, -0.7430,  0.7120,
           0.4198,  2.7427,  0.2844],
         [ 0.9161, -0.4999,  1.8302,  1.2182,  0.0651,  2.0396,  0.3780,
           0.4085,  1.3560, -1.3176],
         [-0.3461,  0.2526,  0.4120,  0.0398,  1.6146,  2.6475, -0.1887,
          -0.6907, -0.4066,  1.2899],
         [-0.8345, -0.4657, -0.5635,  1.1514,  0.1762,  1.8148, -1.4084,
           0.9153, -1.1734,  1.1989],
         [-0.3981, -0.7277, -1.2657,  1.9887, -0.2399,  0.0412,  1.9375,
           0.6083,  0.2095,  1.5739],
         [-1.5240,  1.2359,  0.5596, -0.1529, -0.4064, -0.0906, -1.6746,
           0.2480, -0.2364,  0.8417],
         [ 0.6992, -1.1193,  0.5642,  1.3861, -0.5185,  0.6701, -0.5353,
           1.8013,  0.7900, -0.9430],
         [-0.3550,  2.1032,  1.8690,  0.7174, -1.7692,  0.8200, -0.9620,
           1.8325,  0.3009,  1.0083],
         [ 1.5902, -0.8516,  3.2954, -0.5147,  0.17

## Linear Transformation to Generate Queries, Keys, and Values

The positional encoded embeddings are passed through three different linear layers to generate the Query (Q), Key (K), and Value (V) matrices. These matrices are used to compute the attention scores.

In [48]:
# Dimensions
d_model = pos_encoded_embeddings.size(-1)  # embedding width = size of the last axis
num_heads = 2

# Each head works in a d_k-dimensional subspace. d_k is derived from num_heads
# (rather than hard-coded) so that concatenating all heads gives back a width
# of exactly d_model, which the final d_model -> d_model projection relies on.
if d_model % num_heads != 0:
    raise ValueError(
        f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
    )
d_k = d_model // num_heads

# One independent nn.Linear(d_model -> d_k) per head for each of Q, K and V.
# The creation order (all Q layers, then K, then V) is kept so that a fixed
# random seed yields the same initial weights as before.
W_q = nn.ModuleList([nn.Linear(d_model, d_k) for _ in range(num_heads)])
W_k = nn.ModuleList([nn.Linear(d_model, d_k) for _ in range(num_heads)])
W_v = nn.ModuleList([nn.Linear(d_model, d_k) for _ in range(num_heads)])

# Project the same input through every head's layers to get per-head Q, K, V
queries = [q_proj(pos_encoded_embeddings) for q_proj in W_q]
keys = [k_proj(pos_encoded_embeddings) for k_proj in W_k]
values = [v_proj(pos_encoded_embeddings) for v_proj in W_v]

# Print the shapes and matrices of each head before concatenation
for head_no, (head_q, head_k, head_v) in enumerate(zip(queries, keys, values), start=1):
    print(f"Head {head_no} Queries Shape:", head_q.shape)
    print(f"Head {head_no} Queries Matrix:\n", head_q)
    print(f"Head {head_no} Keys Shape:", head_k.shape)
    print(f"Head {head_no} Keys Matrix:\n", head_k)
    print(f"Head {head_no} Values Shape:", head_v.shape)
    print(f"Head {head_no} Values Matrix:\n", head_v)

Head 1 Queries Shape: torch.Size([1, 10, 5])
Head 1 Queries Matrix:
 tensor([[[-0.3537, -0.2582, -1.0804,  1.2453,  0.3636],
         [ 1.4551,  0.7938,  0.6904,  1.0099, -0.1060],
         [-0.0323,  0.4988,  0.1480, -0.2169,  0.3064],
         [ 0.1413, -0.1496, -0.1585, -0.6643, -0.1013],
         [-0.6394,  0.0277, -0.0857,  0.1829,  0.9370],
         [-0.1441, -0.4683, -0.6981, -0.2103, -0.1838],
         [ 1.2703, -0.0261, -0.1065,  0.5500, -0.2545],
         [ 1.2149, -0.0288, -0.5788,  1.1972,  0.0184],
         [ 1.8365,  0.6732, -0.2994,  0.8017,  0.0809],
         [ 0.1365,  0.2335, -0.1007, -0.1777,  0.2385]]],
       grad_fn=<ViewBackward0>)
Head 1 Keys Shape: torch.Size([1, 10, 5])
Head 1 Keys Matrix:
 tensor([[[-0.1314, -0.9953,  0.4568, -0.1121, -0.4200],
         [ 0.4794, -1.5111,  0.5368,  0.4985, -0.7619],
         [ 0.8266, -0.8718,  0.7948, -0.7616, -0.0178],
         [ 0.3400,  0.2035,  0.1653,  0.4253, -0.2353],
         [-0.3605,  0.5337,  0.9110,  0.4017, -0.0

## Scaled Dot-Product Attention

To compute the attention scores, we take the dot product of the Query and Key matrices. The result is then scaled by the square root of the dimension of the keys ($d_k$) to prevent the values from becoming too large.

In [49]:
# Scaling factor: square root of the key dimension d_k
scale = math.sqrt(d_k)
print(f"squre root of the dim of keys: {scale}")

# Per head: (Q @ K^T) / sqrt(d_k). transpose(-2, -1) swaps the last two axes
# of K so the product pairs every query row with every key row.
scores = [
    torch.matmul(head_q, head_k.transpose(-2, -1)) / scale
    for head_q, head_k in zip(queries, keys)
]

# Show the scaled scores for every head
for head_no, head_scores in enumerate(scores, start=1):
    print(f"Head {head_no} Scores Shape:", head_scores.shape)
    print(f"Head {head_no} Scores Matrix:\n", head_scores)

squre root of the dim of keys: 2.23606797749979
Head 1 Scores Shape: torch.Size([1, 10, 10])
Head 1 Scores Matrix:
 tensor([[[-0.2157, -0.0069, -0.8411,  0.0414, -0.2311,  0.1743,  0.6738,
           0.1430, -0.2474, -0.2564],
         [-0.3285,  0.2026,  0.1307,  0.5478,  0.4205,  0.2989,  0.2462,
           0.2357,  0.1726,  0.7161],
         [-0.2366, -0.4612, -0.0824, -0.0221,  0.1372, -0.0486, -0.3659,
          -0.2282, -0.4712, -0.0607],
         [ 0.0782, -0.0202,  0.2813, -0.1195, -0.2396, -0.0553, -0.2630,
          -0.1533,  0.0995,  0.0021],
         [-0.1774, -0.4549, -0.3473, -0.1649,  0.0818, -0.1586, -0.1829,
          -0.2788, -0.5340, -0.3211],
         [ 0.1193,  0.1337, -0.0457, -0.1368, -0.4056,  0.0171,  0.1282,
          -0.0014,  0.1571, -0.1483],
         [-0.0645,  0.4738,  0.2566,  0.3143, -0.1486,  0.2553,  0.3476,
           0.1931,  0.5347,  0.5432],
         [-0.2403,  0.4017, -0.1533,  0.3651, -0.2240,  0.3499,  0.6244,
           0.2233,  0.3458,  0.461

## Softmax to Get Attention Weights

The scaled dot-product scores are then passed through a softmax function to convert them into probabilities. These probabilities represent the attention weights.

In [50]:
# Normalize each head's scores with softmax along the last axis, so every
# row of the result is a set of non-negative weights that sums to 1
attention = [torch.softmax(head_scores, dim=-1) for head_scores in scores]

# Show the attention weights for every head
for head_no, head_weights in enumerate(attention, start=1):
    print(f"Head {head_no} Attention Weights Shape:", head_weights.shape)
    print(f"Head {head_no} Attention Weights Matrix:\n", head_weights)

Head 1 Attention Weights Shape: torch.Size([1, 10, 10])
Head 1 Attention Weights Matrix:
 tensor([[[0.0812, 0.1000, 0.0434, 0.1050, 0.0800, 0.1199, 0.1976, 0.1162,
          0.0787, 0.0780],
         [0.0535, 0.0909, 0.0846, 0.1284, 0.1131, 0.1001, 0.0950, 0.0940,
          0.0883, 0.1520],
         [0.0932, 0.0744, 0.1087, 0.1154, 0.1354, 0.1124, 0.0819, 0.0939,
          0.0737, 0.1111],
         [0.1110, 0.1006, 0.1360, 0.0911, 0.0808, 0.0971, 0.0789, 0.0881,
          0.1134, 0.1029],
         [0.1065, 0.0807, 0.0898, 0.1078, 0.1380, 0.1085, 0.1059, 0.0962,
          0.0745, 0.0922],
         [0.1133, 0.1149, 0.0960, 0.0877, 0.0670, 0.1023, 0.1143, 0.1004,
          0.1176, 0.0867],
         [0.0699, 0.1197, 0.0963, 0.1021, 0.0642, 0.0962, 0.1055, 0.0904,
          0.1272, 0.1283],
         [0.0609, 0.1157, 0.0664, 0.1116, 0.0619, 0.1099, 0.1446, 0.0968,
          0.1094, 0.1228],
         [0.0504, 0.0866, 0.0882, 0.1351, 0.0758, 0.1249, 0.0996, 0.0846,
          0.0879, 0.1668],
 

## Compute the Output as a Weighted Sum of the Values

The final step in the attention mechanism is to compute a weighted sum of the Value matrix using the attention weights. This gives us the output of the self-attention mechanism.

In [51]:
# Per head: multiply the attention weights by that head's values, so each
# output row is a weighted sum of the value rows
outputs = [
    torch.matmul(head_weights, head_v)
    for head_weights, head_v in zip(attention, values)
]

# Show each head's output before the heads are concatenated
for head_no, head_out in enumerate(outputs, start=1):
    print(f"Head {head_no} Output Shape:", head_out.shape)
    print(f"Head {head_no} Output:\n", head_out)

Head 1 Output Shape: torch.Size([1, 10, 5])
Head 1 Output:
 tensor([[[-0.2357, -0.0381,  0.3160, -0.3249,  0.4623],
         [-0.1032, -0.0118,  0.2269, -0.3979,  0.3255],
         [-0.1322, -0.0478,  0.2636, -0.4224,  0.3654],
         [-0.1233, -0.1030,  0.2125, -0.4679,  0.3577],
         [-0.1682, -0.0543,  0.2926, -0.4078,  0.3994],
         [-0.1783, -0.1068,  0.2472, -0.4232,  0.4060],
         [-0.1188, -0.0624,  0.1967, -0.4226,  0.3410],
         [-0.1526, -0.0350,  0.2334, -0.3677,  0.3742],
         [-0.0868, -0.0076,  0.2132, -0.3678,  0.3120],
         [-0.1334, -0.0620,  0.2473, -0.4233,  0.3659]]],
       grad_fn=<UnsafeViewBackward0>)
Head 2 Output Shape: torch.Size([1, 10, 5])
Head 2 Output:
 tensor([[[ 0.1877,  0.5866,  0.1517, -0.0174,  0.2697],
         [ 0.2370,  0.5898,  0.1818, -0.0085,  0.3803],
         [ 0.3225,  0.6646, -0.0476, -0.0384,  0.4374],
         [ 0.3504,  0.6697, -0.1382,  0.0012,  0.4411],
         [ 0.2682,  0.5036,  0.1064,  0.1244,  0.3768],



### 6. **Weighted Sum of Values to Get the Output**

#### Explanation

After computing the attention weights, the next step is to calculate the final output by taking a weighted sum of the value vectors. This is done through matrix multiplication of the attention weights matrix with the value matrix.

Given:
- **Attention Weights** matrix $\mathbf{A}$ of shape $(\text{number of tokens}, \text{number of tokens})$.
- **Values** matrix $\mathbf{V}$ of shape $(\text{number of tokens}, d_{\text{model}})$.

The output is computed as:
$$
\mathbf{O} = \mathbf{A} \times \mathbf{V}
$$
Where:
- $\mathbf{O}$ is the output matrix of shape $(\text{number of tokens}, d_{\text{model}})$.

#### Example Code with Matrix Multiplication

```python
# Toy example: 3 tokens, each with a 4-dimensional value vector.

# Row i holds the weights token i assigns to every token; each row sums to 1.
attention_weights = torch.tensor(
    [
        [0.2, 0.3, 0.5],
        [0.1, 0.8, 0.1],
        [0.4, 0.1, 0.5],
    ],
    dtype=torch.float32,
)

# Row j is the value vector of token j.
values = torch.tensor(
    [
        [0.1, 0.2, 0.3, 0.4],
        [0.5, 0.6, 0.7, 0.8],
        [0.9, 1.0, 1.1, 1.2],
    ],
    dtype=torch.float32,
)

# O = A V: a (3 x 3) by (3 x 4) matrix product giving a (3 x 4) output
output = attention_weights @ values

print("Attention Weights Matrix (A):\n", attention_weights)
print("Values Matrix (V):\n", values)
print("Output Matrix (O = A * V):\n", output)
```

#### Expected Output

Given the example matrices:

- **Attention Weights (A):**
  $$
  \mathbf{A} =
  \begin{bmatrix}
  0.2 & 0.3 & 0.5 \\
  0.1 & 0.8 & 0.1 \\
  0.4 & 0.1 & 0.5
  \end{bmatrix}
  $$

- **Values (V):**
  $$
  \mathbf{V} =
  \begin{bmatrix}
  0.1 & 0.2 & 0.3 & 0.4 \\
  0.5 & 0.6 & 0.7 & 0.8 \\
  0.9 & 1.0 & 1.1 & 1.2
  \end{bmatrix}
  $$

The output matrix $ \mathbf{O} $ will be:

- **Output (O):**
  $$
  \mathbf{O} =
  \begin{bmatrix}
  (0.2 \times 0.1 + 0.3 \times 0.5 + 0.5 \times 0.9) & (0.2 \times 0.2 + 0.3 \times 0.6 + 0.5 \times 1.0) & (0.2 \times 0.3 + 0.3 \times 0.7 + 0.5 \times 1.1) & (0.2 \times 0.4 + 0.3 \times 0.8 + 0.5 \times 1.2) \\
  (0.1 \times 0.1 + 0.8 \times 0.5 + 0.1 \times 0.9) & (0.1 \times 0.2 + 0.8 \times 0.6 + 0.1 \times 1.0) & (0.1 \times 0.3 + 0.8 \times 0.7 + 0.1 \times 1.1) & (0.1 \times 0.4 + 0.8 \times 0.8 + 0.1 \times 1.2) \\
  (0.4 \times 0.1 + 0.1 \times 0.5 + 0.5 \times 0.9) & (0.4 \times 0.2 + 0.1 \times 0.6 + 0.5 \times 1.0) & (0.4 \times 0.3 + 0.1 \times 0.7 + 0.5 \times 1.1) & (0.4 \times 0.4 + 0.1 \times 0.8 + 0.5 \times 1.2)
  \end{bmatrix}
  $$

Evaluating the above gives:

$$
\mathbf{O} =
\begin{bmatrix}
0.62 & 0.72 & 0.82 & 0.92 \\
0.50 & 0.60 & 0.70 & 0.80 \\
0.54 & 0.64 & 0.74 & 0.84
\end{bmatrix}
$$

So, the output matrix $ \mathbf{O} $ will be:

```
tensor([[0.6200, 0.7200, 0.8200, 0.9200],
        [0.5000, 0.6000, 0.7000, 0.8000],
        [0.5400, 0.6400, 0.7400, 0.8400]])
```

This demonstrates how each row of the output matrix is a weighted sum of the rows of the value matrix, with the weights given by the corresponding row of the attention weights matrix.


## Combining Heads and Final Linear Transformation

In a multi-head attention mechanism, multiple sets of Q, K, V matrices are computed and processed in parallel, one set per head. The per-head outputs are then concatenated along the feature dimension and passed through a final linear transformation to project them back to the original embedding size.

In [52]:
# Join the per-head outputs side by side along the last (feature) axis
concatenated_output = torch.cat(tensors=outputs, dim=-1)

print("Concatenated Output Shape:", concatenated_output.shape)
print("Concatenated Output:\n", concatenated_output)

Concatenated Output Shape: torch.Size([1, 10, 10])
Concatenated Output:
 tensor([[[-0.2357, -0.0381,  0.3160, -0.3249,  0.4623,  0.1877,  0.5866,
           0.1517, -0.0174,  0.2697],
         [-0.1032, -0.0118,  0.2269, -0.3979,  0.3255,  0.2370,  0.5898,
           0.1818, -0.0085,  0.3803],
         [-0.1322, -0.0478,  0.2636, -0.4224,  0.3654,  0.3225,  0.6646,
          -0.0476, -0.0384,  0.4374],
         [-0.1233, -0.1030,  0.2125, -0.4679,  0.3577,  0.3504,  0.6697,
          -0.1382,  0.0012,  0.4411],
         [-0.1682, -0.0543,  0.2926, -0.4078,  0.3994,  0.2682,  0.5036,
           0.1064,  0.1244,  0.3768],
         [-0.1783, -0.1068,  0.2472, -0.4232,  0.4060,  0.2898,  0.6990,
          -0.1169, -0.0805,  0.3478],
         [-0.1188, -0.0624,  0.1967, -0.4226,  0.3410,  0.2775,  0.6165,
           0.0567, -0.0061,  0.3933],
         [-0.1526, -0.0350,  0.2334, -0.3677,  0.3742,  0.2716,  0.6148,
           0.0179, -0.0164,  0.3350],
         [-0.0868, -0.0076,  0.2132, -0

## Final Linear Transformation

Project the concatenated output back to the original embedding size and print the final output.

In [53]:
# Output projection: a d_model -> d_model linear layer applied to the
# concatenated heads, mapping them back to the original embedding size
fc_out = nn.Linear(in_features=d_model, out_features=d_model)
final_output = fc_out(concatenated_output)

print("Final Output Shape:", final_output.shape)
print("Final Output:\n", final_output)

Final Output Shape: torch.Size([1, 10, 10])
Final Output:
 tensor([[[ 0.2094, -0.0633,  0.1740, -0.0485,  0.0294, -0.0105,  0.1539,
          -0.1829,  0.3235, -0.3173],
         [ 0.2478, -0.1173,  0.2454, -0.1086,  0.0644, -0.0207,  0.0795,
          -0.1813,  0.3423, -0.4046],
         [ 0.2756, -0.1733,  0.2607, -0.1177,  0.1134, -0.0120,  0.0767,
          -0.2021,  0.3376, -0.3698],
         [ 0.2832, -0.2060,  0.2393, -0.1575,  0.1465, -0.0202,  0.0634,
          -0.2309,  0.2945, -0.3711],
         [ 0.2116, -0.1373,  0.2108, -0.1043,  0.0134, -0.0571,  0.0871,
          -0.1843,  0.2657, -0.3838],
         [ 0.2753, -0.1495,  0.2082, -0.1235,  0.1485,  0.0069,  0.1184,
          -0.2319,  0.3103, -0.3211],
         [ 0.2634, -0.1516,  0.2280, -0.1352,  0.1038, -0.0176,  0.0745,
          -0.2084,  0.3178, -0.3861],
         [ 0.2509, -0.1427,  0.2084, -0.1089,  0.0855, -0.0055,  0.1038,
          -0.2008,  0.3129, -0.3503],
         [ 0.2705, -0.1674,  0.2583, -0.1134,  0.0936