In [1]:
import torch

# Toy embeddings for the six tokens of "Your journey starts with one step".
# One row per token, three features per row.
inputs = torch.tensor([
    [0.43, 0.15, 0.89],  # x^1 -> "Your"
    [0.55, 0.87, 0.66],  # x^2 -> "journey"
    [0.57, 0.85, 0.64],  # x^3 -> "starts"
    [0.22, 0.58, 0.33],  # x^4 -> "with"
    [0.77, 0.25, 0.10],  # x^5 -> "one"
    [0.05, 0.80, 0.55],  # x^6 -> "step"
])


In [2]:
# Projection sizes: d_in is the embedding width read off the input matrix;
# d_out is the size of the projected query/key/value vectors.
d_in = inputs.shape[-1]
d_out = 2

# The second token ("journey") serves as the worked example.
x_2 = inputs[1]

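`torch.nn.Parameter`: a `torch.Tensor` subclass intended for the learnable weights of a neural network. When a `Parameter` is assigned as an attribute of a `torch.nn.Module`, it is automatically registered in the module's `parameters()`, and by default it has `requires_grad=True`, which tells PyTorch to track gradients for it so that an optimizer can update it during training. In the cells below we pass `requires_grad=False` because we only want to walk through the attention computation; for actual training this flag would be left at its default of `True`.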

In [5]:
# Fix the RNG so the three projection matrices are reproducible.
torch.manual_seed(42)

# Draw the (d_in, d_out) matrices in the order query -> key -> value; the
# generator is consumed left to right, so the random stream is used in that
# exact order. Gradients are switched off for this walkthrough.
W_query, W_key, W_value = (
    torch.nn.Parameter(torch.randn(d_in, d_out), requires_grad=False)
    for _ in range(3)
)


In [7]:
# Project the single embedding x_2 into query, key and value space
# (a d_in-vector times a (d_in, d_out) matrix gives a d_out-vector).
query_2 = torch.matmul(x_2, W_query)
key_2 = torch.matmul(x_2, W_key)
value_2 = torch.matmul(x_2, W_value)

In [8]:
# Show the three projections of x_2 side by side; each has d_out = 2 entries.
query_2, key_2, value_2

(tensor([-0.3519,  0.1483]),
 tensor([1.9692, 0.4159]),
 tensor([0.6229, 0.4434]))

In [10]:
# Project every token embedding at once: row i of each result is the
# query / key / value vector of token i.
queries = torch.matmul(inputs, W_query)
keys = torch.matmul(inputs, W_key)
values = torch.matmul(inputs, W_value)

# Each matrix has one row per token and d_out columns.
keys.shape, queries.shape, values.shape

(torch.Size([6, 2]), torch.Size([6, 2]), torch.Size([6, 2]))

In [14]:
# Unnormalized attention scores for token 2: the dot product of query_2
# with the key of every token (one score per token).
attn_scores_2 = torch.matmul(query_2, keys.transpose(0, 1))
attn_scores_2

tensor([-0.4540, -0.6313, -0.6450, -0.2855, -0.7087, -0.1794])

In [16]:
# Scaled dot-product attention: divide the scores by sqrt(d_k), where d_k is
# the key dimension, then softmax so the weights over all tokens sum to 1.
d_k = keys.shape[-1]
attn_weights_2 = attn_scores_2.div(d_k ** 0.5).softmax(dim=-1)
attn_weights_2

tensor([0.1686, 0.1487, 0.1473, 0.1899, 0.1408, 0.2047])

In [17]:
# Context vector for token 2: the attention-weighted sum of all value vectors.
context_2 = torch.matmul(attn_weights_2, values)
context_2

tensor([0.5633, 0.3251])