In [10]:
import torch
import torch.nn as nn
import math

In [11]:
# Multi-head self-attention
class SelfAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The embedding is split into ``heads`` chunks of ``head_dim`` features each.
    The query/key/value projections are ``head_dim -> head_dim`` linear layers
    applied *after* the split, so one set of projection weights is shared by
    every head.

    Args:
        embedding_size: Size of the last dimension of the inputs and output.
        heads: Number of attention heads; must divide ``embedding_size``.
    """

    def __init__(self, embedding_size: int, heads: int):
        super().__init__()
        assert embedding_size % heads == 0, "Embedding size must be divisible by number of heads"
        self.embedding_size = embedding_size
        self.heads = heads
        self.head_dim = embedding_size // heads

        # Creation order is kept stable so parameter initialisation and
        # state_dict ordering do not change.
        self.query_linear = nn.Linear(self.head_dim, self.head_dim)
        self.key_linear = nn.Linear(self.head_dim, self.head_dim)
        self.value_linear = nn.Linear(self.head_dim, self.head_dim)
        self.fc_out = nn.Linear(self.heads * self.head_dim, self.embedding_size)

    def forward(self, values, keys, queries, mask=None):
        """Attend from ``queries`` to ``keys`` and aggregate ``values``.

        Args:
            values: Tensor of shape (N, value_len, embedding_size).
            keys: Tensor of shape (N, key_len, embedding_size); the value
                aggregation below requires key_len == value_len.
            queries: Tensor of shape (N, query_len, embedding_size).
            mask: Optional tensor broadcastable to
                (N, heads, query_len, key_len). Positions where it equals 0
                are excluded from attention.

        Returns:
            Tensor of shape (N, query_len, embedding_size).
        """
        N = queries.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], queries.shape[1]
        # Shape trace emitted on every call (runtime text intentionally unchanged).
        print("SelfAttention输入values形状：", values.shape)

        # Split the embedding dimension into (heads, head_dim).
        values = values.reshape(N, value_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        queries = queries.reshape(N, query_len, self.heads, self.head_dim)

        # Per-head linear projections (weights shared across heads).
        values = self.value_linear(values)
        keys = self.key_linear(keys)
        queries = self.query_linear(queries)

        # Scaled dot-product scores: (N, heads, query_len, key_len).
        energy = torch.einsum("nqhd,nkhd->nhqk", queries, keys) / (self.head_dim ** 0.5)

        if mask is not None:
            # Use the most negative finite value of the score dtype. A fixed
            # -1e20 is not representable in float16 and makes masked_fill
            # raise an overflow error under half precision; in float32 the
            # softmax result is the same either way.
            fill_value = torch.finfo(energy.dtype).min
            energy = energy.masked_fill(mask == 0, fill_value)

        attention = torch.softmax(energy, dim=-1)  # (N, heads, query_len, key_len)

        # Weighted sum of values, then merge heads back: (N, query_len, heads * head_dim).
        out = torch.einsum("nhql,nlhd->nqhd", attention, values).reshape(
            N, query_len, self.heads * self.head_dim
        )

        return self.fc_out(out)

In [12]:
# Feed-forward network
class FeedForward(nn.Module):
    """Two-layer MLP applied to the last dimension of its input.

    Computes ``fc2(dropout(relu(fc1(x))))``: the input is expanded from
    ``embedding_size`` to ``hidden_dim`` and projected back again.

    Args:
        embedding_size: Input and output feature size.
        hidden_dim: Feature size of the inner layer.
        dropout: Dropout probability applied after the activation.
    """

    def __init__(self, embedding_size: int, hidden_dim: int, dropout: float = 0.1):
        super().__init__()
        self.fc1 = nn.Linear(embedding_size, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, embedding_size)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the transformed tensor; the last dimension stays ``embedding_size``."""
        hidden = self.dropout(self.relu(self.fc1(x)))
        return self.fc2(hidden)

In [13]:
# Transformer layer (the unit that gets stacked)
class TransformerBlock(nn.Module):
    """Self-attention followed by a feed-forward network, each with a residual.

    Every sub-layer output is added to its input and passed through LayerNorm.
    Dropout is applied once, to the final normalised output of the block.

    Args:
        embedding_size: Feature size of the input and output.
        heads: Number of attention heads.
        hidden_dim: Inner size of the feed-forward network.
        dropout: Dropout probability (feed-forward and block output).
    """

    def __init__(self, embedding_size: int, heads: int, hidden_dim: int, dropout: float = 0.1):
        super().__init__()
        self.attention = SelfAttention(embedding_size, heads)
        self.norm1 = nn.LayerNorm(embedding_size)
        self.norm2 = nn.LayerNorm(embedding_size)
        self.feed_forward = FeedForward(embedding_size, hidden_dim, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run the block on ``x``; ``mask`` is forwarded unchanged to the attention."""
        # Self-attention: x serves as values, keys and queries.
        attended = self.norm1(x + self.attention(x, x, x, mask))
        # Feed-forward sub-layer with its own residual connection.
        transformed = self.norm2(attended + self.feed_forward(attended))
        return self.dropout(transformed)

In [14]:
# Layer-stack part of the GPT-2 model
class GPT2LayerStack(nn.Module):
    """A sequence of ``num_layers`` TransformerBlock modules applied in order.

    Args:
        embedding_size: Feature size of the input and output.
        heads: Number of attention heads in every block.
        hidden_dim: Inner size of each block's feed-forward network.
        num_layers: How many blocks to stack.
        dropout: Dropout probability used inside the blocks and on the
            final output of the stack.
    """

    def __init__(self, embedding_size: int, heads: int, hidden_dim: int, num_layers: int, dropout: float = 0.1):
        super().__init__()
        blocks = [
            TransformerBlock(embedding_size, heads, hidden_dim, dropout)
            for _ in range(num_layers)
        ]
        self.layers = nn.ModuleList(blocks)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Feed ``x`` through every block with the same ``mask``, then apply dropout."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.dropout(hidden)

In [15]:
# Hyperparameters

# -- model architecture --
embedding_size = 128  # feature size of each position's vector
heads = 8             # attention heads; must divide embedding_size
hidden_dim = 512      # inner size of the feed-forward network
num_layers = 4        # number of stacked Transformer blocks
dropout = 0.1         # dropout probability

# -- shape of the demo input --
batch_size = 2
seq_length = 10

In [16]:
# Instantiate the GPT-2 layer stack from the hyperparameters above.
# Positional order: embedding_size, heads, hidden_dim, num_layers, dropout.
gpt2_layers = GPT2LayerStack(embedding_size, heads, hidden_dim, num_layers, dropout)

In [None]:
# Generate a random input batch and the attention mask
input_data = torch.randn(batch_size, seq_length, embedding_size)
head_dim = embedding_size // heads  # 16
# The mask implements unidirectional attention: a lower-triangular matrix of
# ones lets position i attend only to positions <= i. The 2-D pattern is
# expanded to one mask per batch element and head.
mask = (
    torch.ones(seq_length, seq_length)
    .tril()
    .expand(batch_size, heads, seq_length, seq_length)
)

# Forward pass
output = gpt2_layers(input_data, mask)
print("层堆叠模型输出形状：", output.shape)
print("层堆叠模型输出", output)

SelfAttention输入values形状： torch.Size([2, 10, 128])
SelfAttention输入values形状： torch.Size([2, 10, 128])
SelfAttention输入values形状： torch.Size([2, 10, 128])
SelfAttention输入values形状： torch.Size([2, 10, 128])
层堆叠模型输出形状： torch.Size([2, 10, 128])
层堆叠模型输出 tensor([[[ 0.9455, -0.0000,  2.2023,  ...,  2.9020, -0.4766, -0.0000],
         [ 0.8986,  1.4101,  3.0224,  ...,  0.0000, -1.4877, -0.6093],
         [-2.9227,  0.4423,  1.6822,  ..., -0.0659,  0.5900, -3.2350],
         ...,
         [ 0.8116,  0.1872,  0.5965,  ..., -1.6258, -0.2729, -1.0999],
         [-0.0000, -0.1287,  0.0000,  ...,  0.0000, -0.6247, -1.1164],
         [ 0.7957, -0.0000,  0.0000,  ...,  0.8903, -0.1610, -0.0000]],

        [[ 0.0039,  1.3446,  0.2288,  ..., -0.0998, -0.0000, -0.3030],
         [ 0.0000,  1.3229,  0.0000,  ...,  0.4040, -1.5794, -0.8535],
         [-1.4718,  0.5498,  1.2732,  ...,  0.0000, -1.8443, -0.6322],
         ...,
         [ 0.0227,  1.4154,  0.9191,  ...,  2.1432, -0.0000, -0.2910],
         [ 0.644