In [7]:
import torch
inputs = torch.tensor(
  [[0.43, 0.15, 0.89], # Your     (x^1)
   [0.55, 0.87, 0.66], # journey  (x^2)
   [0.57, 0.85, 0.64], # starts   (x^3)
   [0.22, 0.58, 0.33], # with     (x^4)
   [0.77, 0.25, 0.10], # one      (x^5)
   [0.05, 0.80, 0.55]]) # step     (x^6)


In [8]:
query = inputs[1]                                               #A
attn_scores = torch.empty(inputs.shape[0])

In [12]:
# 获得score
score=torch.matmul(inputs,inputs.T)
# score 归一化
weight=torch.softmax(score,dim=-1)
weight
# 得到上下文向量
all_context_vecs=torch.matmul(weight,inputs)
all_context_vecs

tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])

In [18]:
# Listing 3.1 A compact self-attention class
import torch.nn as nn
class SelfAttention_v1(nn.Module):
    def __init__(self,din,dout):
        super().__init__()
        self.w_q=nn.Parameter(torch.rand(din,dout))
        self.w_k=nn.Parameter(torch.rand(din,dout))
        self.w_v=nn.Parameter(torch.rand(din,dout))
    def forward(self,x):
        query=torch.matmul(x,self.w_q)
        key=torch.matmul(x,self.w_k)
        value=torch.matmul(x,self.w_v)
        # 等价于矩阵相乘
        score=query@key.T
        # 将结果除以嵌入维度的平方根（即 key.shape[-1]），其中 key.shape[-1] 是嵌入向量的维度=2
        # 这样可以将点积结果缩放到适当的范围，避免Softmax函数进入梯度平缓区，从而保持梯度的有效性，促进模型的正常训练。
        weight=torch.softmax(score/key.shape[-1]**0.5,dim=-1)
        context=weight@value
        return context        

In [19]:
torch.manual_seed(123)
model=SelfAttention_v1(3,2)
model(inputs)

tensor([[0.2996, 0.8053],
        [0.3061, 0.8210],
        [0.3058, 0.8203],
        [0.2948, 0.7939],
        [0.2927, 0.7891],
        [0.2990, 0.8040]], grad_fn=<MmBackward0>)

In [24]:
# Listing 3.1 A compact self-attention class
#当禁用偏置单元时，`nn.Linear` 层可以有效地执行矩阵乘法。
#此外，使用 `nn.Linear` 替代手动实现的 `nn.Parameter(torch.rand(...))` 的一个显著优势在于，
# `nn.Linear` 具有优化的权重初始化方案，从而有助于实现更稳定和更高效的模型训练。
class SelfAttention_v1(nn.Module):
    def __init__(self,din,dout,bias=False):
        super().__init__()
        self.w_q=nn.Linear(din,dout,bias=bias)
        self.w_k=nn.Linear(din,dout,bias=bias)
        self.w_v=nn.Linear(din,dout,bias=bias)
    def forward(self,x):
        query=self.w_q(x)
        key=self.w_k(x)
        value=self.w_v(x)
        # 等价于矩阵相乘
        score=query@key.T
        weight=torch.softmax(score/key.shape[-1]**0.5,dim=-1)
        context=weight@value
        return context        

In [27]:
torch.manual_seed(789)
sa_v2=SelfAttention_v1(3,2)
model(inputs)

tensor([[-0.0739,  0.0713],
        [-0.0748,  0.0703],
        [-0.0749,  0.0702],
        [-0.0760,  0.0685],
        [-0.0763,  0.0679],
        [-0.0754,  0.0693]], grad_fn=<MmBackward0>)

In [29]:
query=sa_v2(inputs)
key=sa_v2(inputs)
value=sa_v2(inputs)
score=query@key.T

In [38]:
lens=score.shape[0]
# triu上对角线 diagonal=1表示对角线不填充1
mask=torch.triu(torch.ones(lens,lens),diagonal=1)
mask
# 掩码必须是bool类型
score=score.masked_fill(mask.bool(),-torch.inf)
attn_weights=torch.softmax(score/key.shape[-1]**0.5,dim=-1)
attn_weights

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667]],
       grad_fn=<SoftmaxBackward0>)

In [48]:
inputs.shape
# 模拟batch=2的输入
batch=torch.stack((inputs,inputs),0)
batch.shape


torch.Size([2, 6, 3])

In [49]:
# Listing 3.3 A compact causal attention class
class CausalAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, qkv_bias=False):
        super().__init__()
        self.w_q=nn.Linear(d_in,d_out,bias=qkv_bias)
        self.w_k=nn.Linear(d_in,d_out,bias=qkv_bias)
        self.w_v=nn.Linear(d_in,d_out,bias=qkv_bias)
        self.dropout=dropout
        
        # register_buffer
        # 当我们在大语言模型（LLM）中使用 `CausalAttention` 类时，buffer 会自动随模型迁移到合适的设备（CPU 或 GPU）。
        # 这意味着我们无需手动确保这些张量与模型参数在同一设备上，从而避免设备不匹配错误。
        self.register_buffer('mask',torch.triu(torch.ones(context_length,context_length),diagonal=1))
        
    def forward(self,x):
        b,token_size,dim=x.shape
        query=self.w_q(x)
        key=self.w_k(x)
        value=self.w_v(x)
        
        score=query@key.transpose(-1,-2)
        # masked_fill_的_表示原地操作
        score.masked_fill_(self.mask.bool()[:token_size,:token_size],-torch.inf)
        weight=torch.softmax(score/key.shape[-1]**0.5,dim=-1)
        weight=self.dropout(weight)
        context=weight@value
        return context

In [50]:
torch.manual_seed(123)
#  Dropout 通常应用于两个特定区域：计算注意力得分之后，或将注意力权重应用于 value 向量之后。
# 在这里，我们会在计算完注意力权重之后应用 dropout 掩码
# dropout会自动缩放：增大未遮盖值的相对差异；注意力分布会更集中（即更尖锐），让模型更关注特定的 token
dropout=nn.Dropout(0.5)
context_length=batch.shape[1]
ca=CausalAttention(3,2,context_length,dropout)
ca(batch).shape

torch.Size([2, 6, 2])

In [59]:
# Listing 3.4 A wrapper class to implement multi-head attention
class MultiHeadAttentionWrapper(nn.Module):
    def __init__(self, d_in, d_out, context_length,
                 dropout, num_heads, qkv_bias=False):
        super().__init__()
        self.head=nn.ModuleList(
            [CausalAttention(d_in,d_out,context_length,dropout,qkv_bias) for _ in range(num_heads)]
        )
    def forward(self,x):
        return torch.cat([head(x) for head in self.head],dim=-1)

In [60]:
torch.manual_seed(123)
dropout=nn.Dropout(0.5)
mha=MultiHeadAttentionWrapper(3,2,batch.shape[1],dropout,2)
end=mha(batch)
end.shape

torch.Size([2, 6, 4])

In [74]:
# Listing 3.5 An efficient multi-head attention class
# 多头注意力；关键理解view和transpose
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out,
                 context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out%num_heads==0
        self.num_heads=num_heads
        self.d_in=d_in
        self.d_out=d_out
        self.head_dim=d_out//num_heads
        
        self.dropout=nn.Dropout(dropout)
        self.w_q=nn.Linear(d_in,d_out,bias=qkv_bias)
        self.w_k=nn.Linear(d_in,d_out,bias=qkv_bias)
        self.w_v=nn.Linear(d_in,d_out,bias=qkv_bias)
        
        self.register_buffer('mask',torch.triu(torch.ones(context_length,context_length),diagonal=1))
        self.proj=nn.Linear(d_out,d_out)
    def forward(self,x):
        b,token_size,d_in=x.shape
        query=self.w_q(x)
        key=self.w_k(x)
        # b,token_size,d_out
        value=self.w_v(x)
        
        # 多头  
        # b,token_size,heads,head_dim
        query=query.view(b,token_size,self.num_heads,self.head_dim)
        key=key.view(b,token_size,self.num_heads,self.head_dim)
        value=value.view(b,token_size,self.num_heads,self.head_dim)
        
        # b,heads,token_size,head_dim
        query=query.transpose(1,2)
        key=key.transpose(1,2)
        value=value.transpose(1,2)
        
        # b,heads,token_size,token_size
        score=query@key.transpose(-1,-2)
        score.masked_fill_(self.mask.bool()[:token_size,:token_size],-torch.inf)
        
        weight=torch.softmax(score/key.shape[-1]**0.5,dim=-1)
        weight=self.dropout(weight)
        
        # b,heads,token_size,head_dim
        context=weight@value
        
        return self.proj(context.contiguous().transpose(1,2).view(b,token_size,self.d_out))

In [75]:
torch.manual_seed(123)
batch_size, context_length, d_in = batch.shape
d_out = 2
mha = MultiHeadAttention(d_in, d_out, context_length, 0.0, num_heads=2)
context_vecs = mha(batch)
print(context_vecs)
print("context_vecs.shape:", context_vecs.shape)

tensor([[[0.3190, 0.4858],
         [0.2943, 0.3897],
         [0.2856, 0.3593],
         [0.2693, 0.3873],
         [0.2639, 0.3928],
         [0.2575, 0.4028]],

        [[0.3190, 0.4858],
         [0.2943, 0.3897],
         [0.2856, 0.3593],
         [0.2693, 0.3873],
         [0.2639, 0.3928],
         [0.2575, 0.4028]]], grad_fn=<AddBackward0>)
context_vecs.shape: torch.Size([2, 6, 2])


In [70]:
a = torch.tensor([[[[0.2745, 0.6584, 0.2775, 0.8573],             #A
                    [0.8993, 0.0390, 0.9268, 0.7388],
                    [0.7179, 0.7058, 0.9156, 0.4340]],
                   [[0.0772, 0.3565, 0.1479, 0.5331],
                    [0.4066, 0.2318, 0.4545, 0.9737],
                    [0.4606, 0.5159, 0.4220, 0.5786]]]])
a.shape

torch.Size([1, 2, 3, 4])

In [73]:
first_head = a[0, 0, :, :]
print(first_head.shape)
first_res = first_head @ first_head.T
print("First head:\n", first_res.shape)
second_head = a[0, 1, :, :]
second_res = second_head @ second_head.T
print("\nSecond head:\n", second_res.shape)

torch.Size([3, 4])
First head:
 torch.Size([3, 3])

Second head:
 torch.Size([3, 3])
