In [19]:
import torch
from torch.nn.functional import softmax
import matplotlib.pyplot as plt  
import torch  
import torch.nn as nn  
import torch.nn.functional as F  
import numpy as np

# 自注意力分步骤实现

## 输入数据

In [20]:
# batch_size = 2, seq_len = 3, d_k = 4
x = [
    [[1, 0, 1, 0], 
     [0, 2, 0, 2], 
     [1, 1, 1, 1]],
    [[2, 0, 2, 0], 
     [0, 1, 0, 1], 
     [1, 2, 1, 2]]
]
x = torch.tensor(x, dtype=torch.float32)

## 权重矩阵（随机初始化）

In [21]:
d_k = 4
w_k = torch.randn(d_k, d_k, dtype=torch.float32)
w_q = torch.randn(d_k, d_k, dtype=torch.float32)
w_v = torch.randn(d_k, d_k, dtype=torch.float32)
print(d_k ** 0.5)

2.0


## 计算Q, K和V

In [22]:
q = x @ w_q
k = x @ w_k
v = x @ w_v
print(q)
print(k)
print(v)

tensor([[[ 1.6102,  0.7005, -0.5583, -1.4088],
         [-2.6734,  3.2910, -1.7739,  4.6357],
         [ 0.2735,  2.3459, -1.4452,  0.9091]],

        [[ 3.2203,  1.4009, -1.1165, -2.8175],
         [-1.3367,  1.6455, -0.8870,  2.3179],
         [-1.0632,  3.9914, -2.3322,  3.2270]]])
tensor([[[-0.3881, -0.8802, -1.7169, -0.9112],
         [ 0.9074, -2.3962, -2.9904, -2.2324],
         [ 0.0656, -2.0783, -3.2121, -2.0274]],

        [[-0.7762, -1.7603, -3.4338, -1.8224],
         [ 0.4537, -1.1981, -1.4952, -1.1162],
         [ 0.5193, -3.2763, -4.7073, -3.1437]]])
tensor([[[ 5.9471e-01, -5.0952e-01, -3.0954e-01, -6.5931e-03],
         [-7.9302e-01, -1.1259e-01, -7.2944e-04,  1.0061e+00],
         [ 1.9820e-01, -5.6581e-01, -3.0991e-01,  4.9644e-01]],

        [[ 1.1894e+00, -1.0190e+00, -6.1908e-01, -1.3186e-02],
         [-3.9651e-01, -5.6296e-02, -3.6472e-04,  5.0303e-01],
         [-1.9831e-01, -6.2211e-01, -3.1027e-01,  9.9947e-01]]])


## 计算注意力分数

In [23]:
attn_scores = torch.bmm(q, k.transpose(1, 2)) / (d_k ** 0.5)
attn_scores

tensor([[[ 0.5004,  2.2985,  1.6496],
         [-1.5188, -7.6779, -5.3577],
         [-0.2590, -1.5404, -1.0292]],

        [[ 2.0015,  2.2985,  5.5977],
         [-1.5188, -1.9195, -4.5983],
         [-2.0368, -2.6897, -6.3978]]])

## 计算注意力权重

In [24]:
attn_weights = softmax(attn_scores, dim=-1)
attn_weights

tensor([[[0.0981, 0.5923, 0.3096],
         [0.9769, 0.0021, 0.0210],
         [0.5745, 0.1595, 0.2660]],

        [[0.0258, 0.0347, 0.9396],
         [0.5828, 0.3904, 0.0268],
         [0.6522, 0.3395, 0.0083]]])

## 计算注意力输出

In [25]:
output = attn_weights @ v
output

tensor([[[-0.3500, -0.2918, -0.1267,  0.7490],
         [ 0.5835, -0.5099, -0.3089,  0.0061],
         [ 0.2679, -0.4612, -0.2604,  0.2887]],

        [[-0.1694, -0.6127, -0.3075,  0.9562],
         [ 0.5331, -0.6325, -0.3693,  0.2155],
         [ 0.6395, -0.6889, -0.4065,  0.1705]]])

# 自注意力类

In [26]:
class SelfAttention(nn.Module):  
    def __init__(self, d_k):  
        super().__init__()  
        self.d_k = d_k
        self.w_q = nn.Linear(self.d_k, self.d_k)  
        self.w_k = nn.Linear(self.d_k, self.d_k)  
        self.w_v = nn.Linear(self.d_k, self.d_k)  
 
    def forward(self, x):  
        # x: (batch_size, seq_len, d_k) 
        q = self.w_q(x)  # (batch_size, seq_len, d_k)  
        k = self.w_k(x) # (batch_size, seq_len, d_k)  
        v = self.w_v(x) # (batch_size, seq_len, d_k)  
        attn_scores = torch.bmm(q, k.transpose(1, 2))  # (batch_size, seq_len, seq_len)  
        attn_scores = attn_scores / k.size(-1) ** 0.5  # 缩放  # (batch_size, seq_len, seq_len) 
        attn_weights = F.softmax(attn_scores, dim=-1)  # (batch_size, seq_len, seq_len) 
        output = attn_weights @ v  # (batch_size, seq_len, dim)  
        return output, attn_weights  

In [27]:
self_attention = SelfAttention(4)
self_attention(x)

(tensor([[[ 0.5798, -0.0193, -0.4038,  0.2496],
          [ 0.5773,  0.0346, -0.3360,  0.4236],
          [ 0.5802,  0.0058, -0.3736,  0.3312]],
 
         [[ 0.9531, -0.1236, -0.8406,  0.0476],
          [ 0.7357, -0.0441, -0.5633,  0.2262],
          [ 0.6592, -0.0076, -0.4553,  0.3167]]], grad_fn=<UnsafeViewBackward0>),
 tensor([[[0.3002, 0.3288, 0.3710],
          [0.2624, 0.4786, 0.2590],
          [0.2799, 0.3976, 0.3225]],
 
         [[0.3833, 0.1928, 0.4239],
          [0.2455, 0.4091, 0.3454],
          [0.1792, 0.4908, 0.3300]]], grad_fn=<SoftmaxBackward0>))

# 注意力类（带掩码）

In [28]:
class Attention(nn.Module):
    def __init__(self):
        super(Attention, self).__init__()
    def forward(self, Q, K, V, attn_mask=None):
        '''
        Q: [batch_size, n_heads, len_q, d_q]  d_q = d_k
        K: [batch_size, n_heads, len_k, d_k]
        V: [batch_size, n_heads, len_v, d_v]  # len_v = len_k
        attn_mask: [batch_size, n_heads, len_q, len_k]
        '''
        d_k = K.size(-1)
        # scores: [batch_size, n_heads, len_q, len_k]
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k) 
        if attn_mask is not None:  
            # scores: [batch_size, n_heads, len_q, len_k]
            scores.masked_fill_(attn_mask, -1e9) # Fills elements of self tensor with value where mask is True.
        # attn: [batch_size, n_heads, len_q, len_k]
        attn = nn.Softmax(dim=-1)(scores)
        # context: [batch_size, n_heads, len_q, d_v]
        context = torch.matmul(attn, V) 
        # context: [batch_size, n_heads, len_q, d_v], attn: [batch_size, n_heads, len_q, len_k]
        return context, attn

In [29]:
attention = Attention()
attention(q, k, v)

(tensor([[[-0.3500, -0.2918, -0.1267,  0.7490],
          [ 0.5835, -0.5099, -0.3089,  0.0061],
          [ 0.2679, -0.4612, -0.2604,  0.2887]],
 
         [[-0.1694, -0.6127, -0.3075,  0.9562],
          [ 0.5331, -0.6325, -0.3693,  0.2155],
          [ 0.6395, -0.6889, -0.4065,  0.1705]]]),
 tensor([[[0.0981, 0.5923, 0.3096],
          [0.9769, 0.0021, 0.0210],
          [0.5745, 0.1595, 0.2660]],
 
         [[0.0258, 0.0347, 0.9396],
          [0.5828, 0.3904, 0.0268],
          [0.6522, 0.3395, 0.0083]]]))

# 多头注意力类（带掩码）

In [30]:
class MultiHeadAttention(nn.Module):  
    def __init__(self, d_model, num_heads):  
        super().__init__()  
        self.d_model = d_model  
        self.num_heads = num_heads  
        self.d_k = d_model // num_heads  
 
        self.w_q = nn.Linear(d_model, d_model)  
        self.w_k = nn.Linear(d_model, d_model)  
        self.w_v = nn.Linear(d_model, d_model)  
        self.w_o = nn.Linear(d_model, d_model)  
 
    def split_heads(self, x):  
        # x: (batch_size, seq_len, d_model)  
        # (batch_size, self.num_heads, seq_len, self.d_k)
        return x.view(x.size(0), -1, self.num_heads, self.d_k).transpose(1, 2)    
 
    def forward(self, input_Q, input_K, input_V, attn_mask=None):  
        # (batch_size, self.num_heads, seq_len_q, self.d_k)  
        q = self.split_heads(self.w_q(input_Q))  
        # (batch_size, self.num_heads, seq_len_k, self.d_k)  
        k = self.split_heads(self.w_k(input_K))  
        # (batch_size, self.num_heads, seq_len_v, self.d_k)  
        v = self.split_heads(self.w_v(input_V))  
        output, attn_weights = Attention()(q, k, v, attn_mask)
        # (batch_size, seq_len, self.d_model), (batch_size, self.num_heads, seq_len_q, seq_len_k)   
        return output, attn_weights   

In [31]:
multi_head_attention = MultiHeadAttention(4, 2)
multi_head_attention(q, k, v)

(tensor([[[[ 0.6855,  0.5687],
           [ 0.5009,  0.3546],
           [ 0.6155,  0.4866]],
 
          [[ 0.0450,  0.0735],
           [-0.0086,  0.0953],
           [ 0.0298,  0.0803]]],
 
 
         [[[ 0.6620,  0.4569],
           [ 0.5721,  0.4157],
           [ 0.5411,  0.3756]],
 
          [[ 0.0091,  0.1798],
           [ 0.0060, -0.0081],
           [ 0.0176,  0.0106]]]], grad_fn=<UnsafeViewBackward0>),
 tensor([[[[0.2473, 0.3629, 0.3898],
           [0.6268, 0.1640, 0.2092],
           [0.3813, 0.2811, 0.3377]],
 
          [[0.5394, 0.2354, 0.2251],
           [0.2666, 0.3013, 0.4321],
           [0.4651, 0.2571, 0.2778]]],
 
 
         [[[0.2934, 0.0974, 0.6092],
           [0.3550, 0.4601, 0.1850],
           [0.4028, 0.4481, 0.1492]],
 
          [[0.1232, 0.8435, 0.0333],
           [0.4097, 0.2247, 0.3656],
           [0.4025, 0.3157, 0.2818]]]], grad_fn=<SoftmaxBackward0>))