In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
from torchviz import make_dot
# Seed every random number generator used in this notebook from one constant
# so that a fresh-kernel "Run All" reproduces the recorded outputs.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)


In [None]:
def self_attention_simplified(x, verbose=True):
    """
    Simplified scaled dot-product self-attention without learned projections.

    The input is used directly as query, key and value, so the function
    computes softmax(x @ x^T / sqrt(d_model)) @ x.

    Args:
        x: Floating-point tensor of shape [..., seq_len, d_model]. The usual
           case is [batch_size, seq_len, d_model]; extra leading dimensions
           (for example a heads axis) are handled as additional batch axes.
        verbose: When True (the default) the raw scores and the attention
           weights, together with their shapes, are printed. Pass False to
           use the function silently.

    Returns:
        A tuple (attention_output, attention_weights):
            attention_output: tensor with the same shape as x.
            attention_weights: tensor of shape [..., seq_len, seq_len]; the
                softmax is taken over the last axis.

    Raises:
        ValueError: if x has fewer than 2 dimensions.
    """
    if x.dim() < 2:
        raise ValueError(
            "x must have shape [..., seq_len, d_model], got shape "
            f"{tuple(x.shape)}"
        )

    # Step 1: query / key / value.
    # A real Transformer obtains these through learned linear projections;
    # to keep the demo simple the input itself plays all three roles.
    query = x
    key = x
    value = x

    # Step 2: raw attention scores, scores = Q x K^T.
    # Transposing the last two axes (rather than axes 1 and 2) and using
    # matmul (rather than bmm) lets any number of leading batch axes through;
    # for a 3-D input the result is the same as bmm.
    scores = torch.matmul(query, key.transpose(-2, -1))

    if verbose:
        print("scores.shape:", scores.shape)
        print("scores:", scores)

    # Step 3: scaling, scaled_scores = scores / sqrt(d_model).
    # A Python float is used for the divisor so it is not rounded to float32
    # first, which would lose precision for float64 inputs.
    d_model = query.size(-1)
    scaled_scores = scores / (d_model ** 0.5)

    # Step 4: softmax over the last axis (the key positions) turns every row
    # of scores into a probability distribution.
    attention_weights = torch.softmax(scaled_scores, dim=-1)

    if verbose:
        print("attention_weights.shape:", attention_weights.shape)
        print("attention_weights:", attention_weights)

    # Step 5: weighted sum of the values, output = attention_weights x V.
    attention_output = torch.matmul(attention_weights, value)

    return attention_output, attention_weights

In [3]:
# 2. Build a small example to demonstrate self-attention:
#    a single sequence of 3 tokens, each represented by a 4-dimensional vector.
#    batch_size = 1 -> one sequence is processed
#    seq_len    = 3 -> three tokens (e.g. three words)
#    d_model    = 4 -> four features per token
batch_size, seq_len, d_model = 1, 3, 4

# Random input tensor of shape [batch_size, seq_len, d_model]
x = torch.randn((batch_size, seq_len, d_model))

print("输入序列 x 的形状:", x.size())
print("输入序列的值:\n", x)

# 3. Run the self-attention function on the example input
output, attention_weights = self_attention_simplified(x=x)

print("\nSelf-Attention输出的形状:", output.size())
print("Self-Attention输出的值:\n", output)


输入序列 x 的形状: torch.Size([1, 3, 4])
输入序列的值:
 tensor([[[ 0.3367,  0.1288,  0.2345,  0.2303],
         [-1.1229, -0.1863,  2.2082, -0.6380],
         [ 0.4617,  0.2674,  0.5349,  0.8094]]])
scores.shape: torch.Size([1, 3, 3])
scores: tensor([[[ 0.2380, -0.0313,  0.5017],
         [-0.0313,  6.5787,  0.0966],
         [ 0.5017,  0.0966,  1.2258]]])
attention_weights.shape: torch.Size([1, 3, 3])
attention_weights: tensor([[[0.3317, 0.2899, 0.3784],
         [0.0341, 0.9295, 0.0364],
         [0.3074, 0.2511, 0.4415]]])

Self-Attention输出的形状: torch.Size([1, 3, 4])
Self-Attention输出的值:
 tensor([[[-0.0391,  0.0899,  0.9203,  0.1977],
         [-1.0154, -0.1591,  2.0800, -0.5557],
         [ 0.0254,  0.1109,  0.8626,  0.2680]]])
