In [31]:
import torch

# Toy 3-dimensional token embeddings, one row per token of the sentence
# "Your journey starts with one step" (row i holds x^(i+1)).
inputs = torch.tensor([
    [0.43, 0.15, 0.89],  # x^1  "Your"
    [0.55, 0.87, 0.66],  # x^2  "journey"
    [0.57, 0.85, 0.64],  # x^3  "starts"
    [0.22, 0.58, 0.33],  # x^4  "with"
    [0.77, 0.25, 0.10],  # x^5  "one"
    [0.05, 0.80, 0.55],  # x^6  "step"
])


In [32]:
# Use the second token ("journey") as the worked example.
x_2 = inputs[1]
# d_in follows the embedding width of `inputs`; d_out is the projection size.
d_in, d_out = inputs.shape[1], 2

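`torch.nn.Parameter` is a `Tensor` subclass intended to hold the learnable weights of a neural network. When a `Parameter` is assigned as an attribute of an `nn.Module`, it is automatically registered and returned by `module.parameters()`. By default it is created with `requires_grad=True`, which tells PyTorch to track gradients for it so that an optimizer can update it during training. In the next cell we pass `requires_grad=False` because we only want to walk through the computation, not train the weights.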

In [33]:
# Fix the seed so the three projection matrices are reproducible.
torch.manual_seed(42)
# The generator is consumed left to right, so the random draws are made in the
# order query -> key -> value. Gradients are disabled for these matrices.
W_query, W_key, W_value = (
    torch.nn.Parameter(torch.randn(d_in, d_out), requires_grad=False)
    for _ in range(3)
)


In [34]:
# Project the single example token x_2 into query, key and value space.
query_2 = torch.matmul(x_2, W_query)
key_2 = torch.matmul(x_2, W_key)
value_2 = torch.matmul(x_2, W_value)

In [35]:
# Display the three projections of x_2 as a tuple.
(query_2, key_2, value_2)

(tensor([-0.3519,  0.1483]),
 tensor([1.9692, 0.4159]),
 tensor([0.6229, 0.4434]))

In [36]:
# Project every input token at once (these are projections of the inputs,
# not weights): each result has one row per token.
queries = torch.matmul(inputs, W_query)
keys = torch.matmul(inputs, W_key)
values = torch.matmul(inputs, W_value)

keys.shape, queries.shape, values.shape

(torch.Size([6, 2]), torch.Size([6, 2]), torch.Size([6, 2]))

In [37]:
# Unnormalised attention scores: dot product of the query for x_2 with the
# key of every token.
attn_scores_2 = torch.matmul(query_2, keys.t())
attn_scores_2

tensor([-0.4540, -0.6313, -0.6450, -0.2855, -0.7087, -0.1794])

In [38]:
# Divide the scores by sqrt(d_k), where d_k is the key dimension, then
# normalise them with a softmax over the last axis.
d_k = keys.size(-1)
attn_weights_2 = (attn_scores_2 / (d_k ** 0.5)).softmax(dim=-1)
attn_weights_2

tensor([0.1686, 0.1487, 0.1473, 0.1899, 0.1408, 0.2047])

In [39]:
# Context vector for x_2: the value vectors combined using the attention
# weights of x_2.
context_2 = torch.matmul(attn_weights_2, values)
context_2

tensor([0.5633, 0.3251])

## Self Attention Class

In [40]:
import torch.nn as nn
import torch

class SelfAttentionV1(nn.Module):
    """Single-head scaled dot-product self-attention with explicit weight matrices.

    The query, key and value projections are stored as ``nn.Parameter``
    matrices of shape ``(d_in, d_out)`` initialised with ``torch.randn``.

    Args:
        d_in: Size of the last dimension of the input embeddings.
        d_out: Size of the projected query/key/value vectors, and therefore
            of each returned context vector.
    """

    def __init__(self, d_in: int, d_out: int):
        super().__init__()
        # Creation order (query, key, value) decides which random draws each
        # matrix receives under a fixed seed, so keep it stable.
        self.W_query = nn.Parameter(torch.randn(d_in, d_out))
        self.W_key = nn.Parameter(torch.randn(d_in, d_out))
        self.W_value = nn.Parameter(torch.randn(d_in, d_out))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Compute one context vector per input token.

        Args:
            x: Tensor of shape ``(num_tokens, d_in)``, optionally with
                leading batch dimensions, i.e. ``(..., num_tokens, d_in)``.

        Returns:
            Tensor of shape ``(..., num_tokens, d_out)``.
        """
        queries = x @ self.W_query
        keys = x @ self.W_key
        values = x @ self.W_value

        # Swap only the last two axes. `.T` reverses *all* axes, which breaks
        # batched (3-D+) inputs and is deprecated for tensors with ndim != 2.
        attn_scores = queries @ keys.transpose(-2, -1)
        # Scale by sqrt(d_k) before the softmax over the key axis.
        attn_weights = torch.softmax(attn_scores / (keys.shape[-1] ** 0.5), dim=-1)
        context_vectors = attn_weights @ values

        return context_vectors
    

In [41]:
# Re-seed before constructing the module so its randomly initialised
# projection matrices are reproducible.
torch.manual_seed(42)

sa = SelfAttentionV1(d_in=d_in, d_out=d_out)
# One context vector per row of `inputs`; with this seed the second row
# should agree with the hand-computed context_2 above.
context_vec = sa(inputs)
context_vec

tensor([[0.5141, 0.3639],
        [0.5633, 0.3251],
        [0.5659, 0.3221],
        [0.5839, 0.2941],
        [0.6180, 0.2539],
        [0.5575, 0.3262]], grad_fn=<MmBackward0>)

In [42]:
# self attention using linear layers

import torch.nn as nn
import torch

class SelfAttentionV2(nn.Module):
    """Single-head scaled dot-product self-attention built from ``nn.Linear`` layers.

    The query, key and value projections are bias-free ``nn.Linear`` layers
    mapping ``d_in`` features to ``d_out`` features.

    Args:
        d_in: Size of the last dimension of the input embeddings.
        d_out: Size of the projected query/key/value vectors, and therefore
            of each returned context vector.
    """

    def __init__(self, d_in: int, d_out: int):
        super().__init__()
        self.W_query = nn.Linear(d_in, d_out, bias=False)
        self.W_key = nn.Linear(d_in, d_out, bias=False)
        self.W_value = nn.Linear(d_in, d_out, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Compute one context vector per input token.

        Args:
            x: Tensor of shape ``(num_tokens, d_in)``, optionally with
                leading batch dimensions, i.e. ``(..., num_tokens, d_in)``.

        Returns:
            Tensor of shape ``(..., num_tokens, d_out)``.
        """
        queries = self.W_query(x)
        keys = self.W_key(x)
        values = self.W_value(x)

        # Swap only the last two axes. `.T` reverses *all* axes, which breaks
        # batched (3-D+) inputs and is deprecated for tensors with ndim != 2.
        attn_scores = queries @ keys.transpose(-2, -1)
        # Scale by sqrt(d_k) before the softmax over the key axis.
        attn_weights = torch.softmax(attn_scores / (keys.shape[-1] ** 0.5), dim=-1)
        context_vectors = attn_weights @ values

        return context_vectors
    

In [43]:
# Seed before construction so the nn.Linear weights are reproducible.
torch.manual_seed(789)

sa_v2 = SelfAttentionV2(d_in=d_in, d_out=d_out)
# One context vector per row of `inputs`.
context_vec = sa_v2(inputs)
context_vec

tensor([[-0.0739,  0.0713],
        [-0.0748,  0.0703],
        [-0.0749,  0.0702],
        [-0.0760,  0.0685],
        [-0.0763,  0.0679],
        [-0.0754,  0.0693]], grad_fn=<MmBackward0>)