# 编码自注意力机制

## 1. 无可训练权重的简单自注意力机制

本节实现自注意力机制的一个简化变体：它不包含任何可训练权重。

目标是为输入序列中的每个元素 $x^{(i)}$ 计算对应的上下文向量 $z^{(i)}$（一个融合了序列中所有元素信息的嵌入向量）。

In [1]:
import torch

In [2]:
# Toy embeddings, one 3-value row per token of the sentence
# "Your journey starts with one step". The second element, x^(2), is used
# as the query to demonstrate how the context vector z^(2) is computed.
inputs = torch.tensor([
    [0.43, 0.15, 0.89],  # x^(1): Your
    [0.55, 0.87, 0.66],  # x^(2): journey
    [0.57, 0.85, 0.64],  # x^(3): starts
    [0.22, 0.58, 0.33],  # x^(4): with
    [0.77, 0.25, 0.10],  # x^(5): one
    [0.05, 0.80, 0.55],  # x^(6): step
])

In [3]:
# Attention scores for the query token x^(2): the dot product of the query
# with every input token (the query itself included).
query = inputs[1]
atten_scores_2 = torch.empty(inputs.shape[0])
for idx in range(inputs.shape[0]):
    atten_scores_2[idx] = torch.dot(query, inputs[idx])
print("Attention Scores:\n", atten_scores_2)

Attention Scores:
 tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])


In [4]:
# Naive normalization: divide every score by the sum of all scores so that
# the resulting attention weights add up to 1.
# (The variable was previously misspelled "atten_weithts_2_tmp".)
atten_weights_2_tmp = atten_scores_2 / atten_scores_2.sum()
print("Attention weights:", atten_weights_2_tmp)
print("Sum:", atten_weights_2_tmp.sum())

Attention weights: tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656])
Sum: tensor(1.0000)


In [5]:
# Normalize the scores of the query x^(2) with softmax along dim 0 (the
# only dimension of the score vector); the weights again sum to 1.
atten_weights_2 = atten_scores_2.softmax(dim=0)
print("Attention weights:", atten_weights_2)
print("Sum:", atten_weights_2.sum())

Attention weights: tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
Sum: tensor(1.)


In [6]:
# Context vector for x^(2): accumulate every input token scaled by its
# attention weight with respect to the query.
query = inputs[1]
context_vec_2 = torch.zeros(query.shape)
for weight, token in zip(atten_weights_2, inputs):
    context_vec_2 += weight * token
print("Context Vector:\n", context_vec_2)

Context Vector:
 tensor([0.4419, 0.6515, 0.5683])


In [7]:
# Attention scores for all tokens at once: entry [i, j] is the dot product
# of input row i with input row j.
atten_scores = torch.matmul(inputs, inputs.T)
print("Attention Scores:\n", atten_scores)

# Softmax over the last dimension (dim=-1), i.e. across the columns of the
# [row, column] matrix, so that the values in every row sum to 1.
atten_weights = atten_scores.softmax(dim=-1)
print("Attention Weights:\n", atten_weights)

# Context vectors: each row is the attention-weighted sum of all inputs.
all_context_vecs = torch.matmul(atten_weights, inputs)
print("All Context Vectors:\n", all_context_vecs)

Attention Scores:
 tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])
Attention Weights:
 tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],
        [0.1435, 0.2074, 0.2046, 0.1462, 0.1263, 0.1720],
        [0.1526, 0.1958, 0.1975, 0.1367, 0.1879, 0.1295],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])
All Context Vectors:
 tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])


## 2. 带可训练权重的自注意力机制

In [8]:
from SelfAttention import SelfAttention
import torch
# Seed PyTorch's global random number generator.
# NOTE(review): the cell that builds sa_v2 calls torch.manual_seed(123)
# right before constructing the layer, so this seed presumably has no
# effect on the printed results — confirm before relying on it.
torch.manual_seed(789)

<torch._C.Generator at 0x1be48330650>

In [9]:
# Same toy embeddings as before: six tokens, three values per token.
inputs = torch.tensor([
    [0.43, 0.15, 0.89],  # x^(1): Your
    [0.55, 0.87, 0.66],  # x^(2): journey
    [0.57, 0.85, 0.64],  # x^(3): starts
    [0.22, 0.58, 0.33],  # x^(4): with
    [0.77, 0.25, 0.10],  # x^(5): one
    [0.05, 0.80, 0.55],  # x^(6): step
])
d_in = inputs.shape[-1]  # width of each input row (3)
d_out = 2  # output size handed to the attention modules

In [10]:
# Re-seed the global RNG immediately before building the layer so that any
# random initialization inside SelfAttention is repeatable
# (presumably its weight matrices — verify in the SelfAttention module).
torch.manual_seed(123)
sa_v2 = SelfAttention(d_in, d_out)

# Calling the module on the 6 x 3 inputs prints one d_out-sized (2-value)
# row per token.
print(sa_v2(inputs))

tensor([[-0.5337, -0.1051],
        [-0.5323, -0.1080],
        [-0.5323, -0.1079],
        [-0.5297, -0.1076],
        [-0.5311, -0.1066],
        [-0.5299, -0.1081]], grad_fn=<MmBackward0>)


## 3. 因果自注意力掩码

In [11]:
# Same toy embeddings as before: six tokens, three values per token.
inputs = torch.tensor([
    [0.43, 0.15, 0.89],  # x^(1): Your
    [0.55, 0.87, 0.66],  # x^(2): journey
    [0.57, 0.85, 0.64],  # x^(3): starts
    [0.22, 0.58, 0.33],  # x^(4): with
    [0.77, 0.25, 0.10],  # x^(5): one
    [0.05, 0.80, 0.55],  # x^(6): step
])

# Feed the inputs through the query and key attributes of the existing
# sa_v2 module, then score every query row against every key row.
queries = sa_v2.query(inputs)
keys = sa_v2.key(inputs)
atten_scores = torch.matmul(queries, keys.T)
print(atten_scores)



tensor([[0.3111, 0.3479, 0.3471, 0.1714, 0.2350, 0.1928],
        [0.1655, 0.2602, 0.2576, 0.1445, 0.1384, 0.1790],
        [0.1667, 0.2602, 0.2577, 0.1443, 0.1391, 0.1784],
        [0.0510, 0.1080, 0.1064, 0.0643, 0.0476, 0.0835],
        [0.1415, 0.1875, 0.1863, 0.0987, 0.1121, 0.1174],
        [0.0476, 0.1192, 0.1171, 0.0731, 0.0477, 0.0966]],
       grad_fn=<MmBackward0>)


In [12]:
# Approach 1: apply softmax first, then zero out the positions above the
# diagonal and renormalize every row.
# The demo has only 6 tokens, so the context length is taken to be 6.
context_length = atten_scores.shape[0]
atten_weights = torch.softmax(atten_scores / keys.shape[-1] ** 0.5, dim=-1)

# Lower-triangular matrix of ones: row i keeps columns 0..i.
mask_simple = torch.ones(context_length, context_length).tril()
print(mask_simple)

# NOTE: from here on mask_simple holds the masked attention weights rather
# than the 0/1 mask itself (the name is reused).
mask_simple = atten_weights * mask_simple
print(mask_simple)

# Divide each row by its own sum so the remaining weights sum to 1 again.
row_sums = mask_simple.sum(dim=-1, keepdim=True)
mask_simple_norm = mask_simple / row_sums
print(mask_simple_norm)

tensor([[1., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1.]])
tensor([[0.1717, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1636, 0.1749, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1637, 0.1749, 0.1746, 0.0000, 0.0000, 0.0000],
        [0.1636, 0.1704, 0.1702, 0.1652, 0.0000, 0.0000],
        [0.1667, 0.1722, 0.1721, 0.1618, 0.1633, 0.0000],
        [0.1624, 0.1709, 0.1706, 0.1654, 0.1625, 0.1682]],
       grad_fn=<MulBackward0>)
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4833, 0.5167, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3190, 0.3408, 0.3402, 0.0000, 0.0000, 0.0000],
        [0.2445, 0.2545, 0.2542, 0.2468, 0.0000, 0.0000],
        [0.1994, 0.2060, 0.2058, 0.1935, 0.1953, 0.0000],
        [0.1624, 0.1709, 0.1706, 0.1654, 0.1625, 0.1682]],
       grad_fn=<DivBackward0>)


In [13]:
# Approach 2: set the scores above the diagonal to -inf before softmax, so
# those positions receive a weight of exactly 0 and no renormalization
# step is needed.
mask = torch.ones(context_length, context_length).triu(diagonal=1)
masked = atten_scores.masked_fill(mask.bool(), float("-inf"))
print(masked)
atten_weights = torch.softmax(masked / keys.shape[-1] ** 0.5, dim=-1)
print(atten_weights)

# Context vectors; left as a bare expression so the notebook displays it.
torch.matmul(atten_weights, sa_v2.value(inputs))

tensor([[0.3111,   -inf,   -inf,   -inf,   -inf,   -inf],
        [0.1655, 0.2602,   -inf,   -inf,   -inf,   -inf],
        [0.1667, 0.2602, 0.2577,   -inf,   -inf,   -inf],
        [0.0510, 0.1080, 0.1064, 0.0643,   -inf,   -inf],
        [0.1415, 0.1875, 0.1863, 0.0987, 0.1121,   -inf],
        [0.0476, 0.1192, 0.1171, 0.0731, 0.0477, 0.0966]],
       grad_fn=<MaskedFillBackward0>)
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4833, 0.5167, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3190, 0.3408, 0.3402, 0.0000, 0.0000, 0.0000],
        [0.2445, 0.2545, 0.2542, 0.2468, 0.0000, 0.0000],
        [0.1994, 0.2060, 0.2058, 0.1935, 0.1953, 0.0000],
        [0.1624, 0.1709, 0.1706, 0.1654, 0.1625, 0.1682]],
       grad_fn=<SoftmaxBackward0>)


tensor([[-0.4519,  0.2216],
        [-0.5874,  0.0058],
        [-0.6300, -0.0632],
        [-0.5675, -0.0843],
        [-0.5526, -0.0981],
        [-0.5299, -0.1081]], grad_fn=<MmBackward0>)

In [14]:
# 使用因果注意力类
import torch
from SelfAttention import CausalSelfAttention

# A batch that holds a single sequence: shape (1, 6, 3).
batch = torch.tensor([[
    [0.43, 0.15, 0.89],  # x^(1): Your
    [0.55, 0.87, 0.66],  # x^(2): journey
    [0.57, 0.85, 0.64],  # x^(3): starts
    [0.22, 0.58, 0.33],  # x^(4): with
    [0.77, 0.25, 0.10],  # x^(5): one
    [0.05, 0.80, 0.55],  # x^(6): step
]])

d_in = batch.shape[-1]  # width of each token row (3)
d_out = 2
context_length = batch.shape[1]  # number of tokens in the sequence (6)

In [15]:
# Seed right before construction so that any random initialization inside
# CausalSelfAttention is repeatable (presumably its weights — verify in
# the SelfAttention module).
torch.manual_seed(123)
ca = CausalSelfAttention(d_in=d_in, d_out=d_out, context_length=context_length, dropout=0)
# Output shape: batch x context_length x d_out (1 x 6 x 2 for this batch).
ca(batch)

tensor([[[-0.4519,  0.2216],
         [-0.5874,  0.0058],
         [-0.6300, -0.0632],
         [-0.5675, -0.0843],
         [-0.5526, -0.0981],
         [-0.5299, -0.1081]]], grad_fn=<UnsafeViewBackward0>)

## 4. 多头注意力

In [3]:
import torch
from SelfAttention import MultiHeadAttentionWrapper, MultiHeadAttention

# Fix the global RNG seed before any attention module is constructed.
torch.manual_seed(123)

# A batch that holds a single sequence: shape (1, 6, 3).
batch = torch.tensor([[
    [0.43, 0.15, 0.89],  # x^(1): Your
    [0.55, 0.87, 0.66],  # x^(2): journey
    [0.57, 0.85, 0.64],  # x^(3): starts
    [0.22, 0.58, 0.33],  # x^(4): with
    [0.77, 0.25, 0.10],  # x^(5): one
    [0.05, 0.80, 0.55],  # x^(6): step
]])

d_in = batch.shape[-1]  # width of each token row (3)
d_out = 2
context_length = batch.shape[1]  # number of tokens in the sequence (6)
# NOTE(review): the fourth positional argument (0) is presumably the
# dropout rate, matching the dropout=0 keyword used with the other
# attention classes in this notebook — confirm against the wrapper's
# signature.
mha = MultiHeadAttentionWrapper(d_in, d_out, context_length, 0, num_heads=2)
context_vecs = mha(batch)

# With d_out=2 and num_heads=2 the recorded output has shape [1, 6, 4].
print(context_vecs)
print("context_vecs shape:", context_vecs.shape)

tensor([[[-0.4519,  0.2216,  0.4772,  0.1063],
         [-0.5874,  0.0058,  0.5891,  0.3257],
         [-0.6300, -0.0632,  0.6202,  0.3860],
         [-0.5675, -0.0843,  0.5478,  0.3589],
         [-0.5526, -0.0981,  0.5321,  0.3428],
         [-0.5299, -0.1081,  0.5077,  0.3493]]], grad_fn=<CatBackward0>)
context_vecs shape: torch.Size([1, 6, 4])


In [None]:
torch.manual_seed(123)
# In large language model architectures the multi-head context vectors
# are, by default, passed through an additional linear projection layer.
# Each attention head's output has dimension 1
# (presumably d_out / num_heads = 2 / 2 — confirm in MultiHeadAttention).
mha = MultiHeadAttention(d_in, d_out, context_length, dropout=0, num_heads=2)

context_vecs = mha(batch)

# The recorded output has shape [1, 6, 2], i.e. the last dimension is d_out.
print(context_vecs)
print("context_vecs shape:", context_vecs.shape)

tensor([[[0.3190, 0.4858],
         [0.2943, 0.3897],
         [0.2856, 0.3593],
         [0.2693, 0.3873],
         [0.2639, 0.3928],
         [0.2575, 0.4028]]], grad_fn=<ViewBackward0>)
context_vecs shape: torch.Size([1, 6, 2])
