In [53]:
import torch
import torch.nn as nn
import math
from transformers import AutoTokenizer


In [51]:
# tips：加入 tokenizer 
# tokenizer 目录路径
local_tokenizer_dir = "./tokenizer_files/"  # 替换为你的实际路径
# 加载分词器
tokenizer = AutoTokenizer.from_pretrained(
    local_tokenizer_dir,
    trust_remote_code=True,  # 如果使用的分词器有自定义代码，需要启用此选项
    truncation_side='right', # 设置分词器的截断侧
    padding_side='right'     # 设置分词器的填充侧
)

In [52]:
# tips：初始函数设置
# 参数设置
vocab_size = tokenizer.vocab_size
d_model = 1536
num_heads = 8
num_decoder_layers = 2
dim_feedforward = 256
max_seq_length = 5
dropout = 0.1
batch_size = 2


在多头注意力中，每个注意力头的计算遵循**Scaled Dot-Product Attention**公式：

#### **2.1. 单头注意力公式**

对于每个头，计算方式为：

$$
\text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V
$$

- $Q$: 查询向量 (query)。
- $K$: 键向量 (key)。
- $V$: 值向量 (value)。
- $d_k$: 每个头的键/查询向量维度。
#### **2.2. 多头注意力公式**

多头注意力通过多个头并行计算注意力，然后将结果合并：

$$
\text{MultiHead}(Q, K, V) = \text{Concat}(\text{head}_1, \text{head}_2, \dots, \text{head}_h)W^O
$$

- 每个注意力头的计算：

$$
\text{head}_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)
$$
$W_i^Q, W_i^K, W_i^V$ 是每个头的权重矩阵。
$$
\text{FFN}(x)=max(0, xW_1+b_1)W_2+b_2
$$

![formula](../decoder_img/formula_multi-head%20attention.png)

![encoder-decoder](../decoder_img/encoder-decoder.png)
![sdqa](../decoder_img/sdpa.png)
![multi-head-attention](../decoder_img/multi-head%20attention.png)

In [40]:
# tips：多头自注意力机制
class MultiHeadAttention(nn.Module):
    def __init__(self, hidden_size, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert hidden_size % num_heads == 0, "hidden_size must be divisible by num_heads"
        
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads
        
        # 定义线性变换层
        self.q_linear = nn.Linear(hidden_size, hidden_size)
        self.k_linear = nn.Linear(hidden_size, hidden_size)
        self.v_linear = nn.Linear(hidden_size, hidden_size)
        self.o_linear = nn.Linear(hidden_size, hidden_size)

        self.scale = math.sqrt(self.head_dim)

    def split_head(self, x, batch_size):
        """Split the last dimension into (num_heads, head_dim).
        Transpose the result such that the shape is (batch_size, num_heads, seq_len, head_dim)
        """
        x = x.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        return x

    def forward(self, q, k, v, mask=None):
        batch_size = q.size(0)

        # 线性变换
        q = self.q_linear(q)  # (batch_size, seq_len_q, hidden_size)  对 emb 的值或者维度做了改变
        k = self.k_linear(k)  # (batch_size, seq_len_k, hidden_size)
        v = self.v_linear(v)  # (batch_size, seq_len_v, hidden_size)
        print(f"q.size: {q.size()}")
        print(f"k.size: {k.size()}")
        print(f"v.size: {v.size()}")
        
        # 分割头
        q = self.split_head(q, batch_size)  # (batch_size, num_heads, seq_len_q, head_dim)
        k = self.split_head(k, batch_size)  # (batch_size, num_heads, seq_len_k, head_dim)
        v = self.split_head(v, batch_size)  # (batch_size, num_heads, seq_len_v, head_dim)
        print(f"new_q.size: {q.size()}")
        print(f"new_k.size: {k.size()}")
        print(f"new_v.size: {v.size()}")
    
        # 每头独立计算
        ## 多头计算注意力分数 QK^T
        scores = torch.matmul(q, k.transpose(-2, -1)) / self.scale  # (batch_size, num_heads, seq_len_q, seq_len_k)
        if mask is not None:
            # scores 张量中对应于 mask 中为0的位置（即不应该被关注的位置），将其值设置为 -1e9
            # 这样-1e9在经过softmax以后得到的概率接近于零
            # 实现了 mask 中掩码为0的位置是没有score分数的
            scores = scores.masked_fill(mask == 0, -1e9)
        ## 减少计算使用过设置为负无穷大，然后负无穷大的对应函数值为0，0不参与计算实现的
        ## softmax 作用与最后一维，负无穷大对应的是 0，对应到 v 就不会有最后向量
        attention_weights = torch.softmax(scores, dim=-1)  # (batch_size, num_heads, seq_len_q, seq_len_k)
        ## 应用注意力权重到值向量
        output = torch.matmul(attention_weights, v)  # (batch_size, num_heads, seq_len_q, head_dim)

        # 合并头
        ######## 多头是如何实现的？？？？
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * self.head_dim)  # (batch_size, seq_len_q, hidden_size)
        # (batch_size, num_heads, seq_len, head_dim) 变化 transpose(1, 2)
        # (batch_size, seq_len, num_heads, head_dim)
        # view 之前用 contiguous() 是为了确保张量在内存中的布局是连续的
        # view 进行重塑，将 num_heads 和 head_dim 合并在一起
        
        # 最后一个线性变换
        output = self.o_linear(output)  # (batch_size, seq_len_q, hidden_size)

        return output, attention_weights

![FFN](../decoder_img/FFN.png)

In [41]:
# tips：FNN
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)  # 从 d_model 映射到 d_ff
        self.fc2 = nn.Linear(d_ff, d_model) # 再从 d_ff 回到 d_model
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))


![pe](../decoder_img/position%20embedding.png)

In [42]:
# tips：位置编码
# 继承了PyTorch的 nn.Module 类，这意味着它可以像其他PyTorch模块一样被使用
class PositionalEncoding(nn.Module):
    # 构造函数接收两个参数 d_model：嵌入向量大小，max_len：最长序列长度
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model) # (max_len, d_model) 存储位置编码
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) # ①
        pe[:, 0::2] = torch.sin(position * div_term) # 偶数维赋值  # ②
        pe[:, 1::2] = torch.cos(position * div_term) # 奇数维赋值  # ②
        pe = pe.unsqueeze(0).transpose(0, 1) # [max_len, 1, d_model]
        self.register_buffer('pe', pe) # 注册为一个缓冲区，被保存在模型的字典中，不会参与梯度更新

    def forward(self, x):
        x = x + self.pe[:x.size(0), :] # 调整位置编码大小：根据输入 x 的实际序列长度切片位置编码矩阵，确保只添加必要的部分。
        return x

class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout, vocab_size):
        super(DecoderLayer, self).__init__()
        
        # 自注意力层（Causal-Attention）
        self.self_attn = MultiHeadAttention(d_model, num_heads)     
        # 编码器-解码器注意力层（Cross-Attention）
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        # 前馈网络（Feed-Forward）
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        
        # LayerNorm 层（Layer Normalization）
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        
        # Dropout 层
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        # 将输入索引转换为嵌入向量，并调整尺度
       
        # 自注意力：输入为解码器自身的输入（x），q,k,v全部都是x 
        attn_output, block1 = self.self_attn(x, x, x, tgt_mask)  # [bs, seq_len, emb] [bs, 1, seq_len, seq_len]
        x = self.norm1(x + self.dropout(attn_output))  # 残差连接和 LayerNorm
        
        # 交叉注意力：输入为编码器输出（enc_output）和解码器输入（x）
        # q：decoder's output. k + v: encoder's output 
        attn_output, block2 = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_output))  # 残差连接和 LayerNorm
        
        # 前馈网络（Feed-Forward）
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))  # 残差连接和 LayerNorm
        
        return x, block1, block2

In [43]:
# tips：Decoder 层
class Decoder(nn.Module):
    def __init__(self, num_layers, d_model, num_heads, d_ff, dropout, vocab_size, max_len=5000):
        super(Decoder, self).__init__()
        
        # 建立模型的 emb 层  创建一个形状为 (vocab_size, d_model) 的可训练参数矩阵
        # 初始化：嵌入矩阵的权重通常会在初始化时随机分配，或者从预训练模型中加载。
        # 训练：在训练过程中，嵌入矩阵的权重会通过反向传播进行更新，以优化模型性能。
        self.embedding = nn.Embedding(vocab_size, d_model)
        
        # 位置编码
        self.pos_encoding = PositionalEncoding(d_model, max_len=max_len)
        
        # 解码器层列表
        self.dec_layers = nn.ModuleList([
            DecoderLayer(d_model, num_heads, d_ff, dropout, vocab_size)
            for _ in range(num_layers)
        ])
        
        self.dropout = nn.Dropout(dropout)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        attention_weights = {}
        seq_len = x.size(1)
        
        # 添加嵌入和位置编码 
        ## 嵌入的扩大：乘以嵌入维度的平方根 嵌入向量的初始值通常是从一个较小的标准差分布中随机抽取的，比如标准正态分布或均匀分布。如果直接将这些小数值输入到后续的线性变换、激活函数等操作中，可能会导致信号逐渐衰减，特别是当网络层数较多时。通过乘以嵌入维度的平方根，可以放大这些初始值，使得它们在整个网络中的传播更为稳定。
        # 在 Transformer 模型中，位置编码（positional encoding）会直接加到词嵌入（token embeddings）上。位置编码的设计通常是基于正弦和余弦函数，其幅度大致为 1。如果不调整词嵌入的规模，那么位置编码的影响可能会过大或过小，从而破坏了两者之间的相对比例。乘以 sqrt(d_model) 可以使词嵌入的均方根（RMS, Root Mean Square）与位置编码相近，维持两者在一个相似的数量级上，避免一方压倒另一方。
        # 这种做法源自 Vaswani 等人在他们的论文《Attention Is All You Need》中提出的建议。他们指出，这样的缩放有助于保持模型各部分的输出具有相似的方差，进而促进更有效的学习。
        x = self.embedding(x) * math.sqrt(self.embedding.embedding_dim)
        x = self.pos_encoding(x)
        x = self.dropout(x)
        
        ## final output 用来进行输出， 输出和attention weight 是不一样的
        ## block1 和 block2 是自注意力和交叉注意力 的参数，所以是不是模型内部的参数指的就是 block1 和 block2 的参数
        ## 遍历 self.dec_layers 列表中的每一层
        for i, dec_layer in enumerate(self.dec_layers):
            x, block1, block2 = dec_layer(x, enc_output, src_mask, tgt_mask)  # [bs, seq_len, emb] [bs, seq_len, emb], [bs, heads, 1, seq_len], [bs, 1, seq_len, seq_len]
            attention_weights[f'decoder_layer{i+1}_block1'] = block1
            attention_weights[f'decoder_layer{i+1}_block2'] = block2
        
        # 最后通过全连接层映射到词汇表大小
        final_output = self.fc_out(x)  # (batch_size, target_seq_len, vocab_size)
        
        return final_output, attention_weights

In [44]:
# tips：trans 整合
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model, num_heads, num_decoder_layers, dim_feedforward, max_seq_length, dropout=0.1):
        super(TransformerModel, self).__init__()

        # 加载分词器
        self.tokenizer = AutoTokenizer.from_pretrained(
            "./tokenizer_files/",  # 替换为你的实际路径
            trust_remote_code=True,
            truncation_side='right',
            padding_side='right'
        )
        # emb
        self.embedding = nn.Embedding(vocab_size, d_model)

        # 编码器
        # self.encoder = EncoderLayer(d_model, num_heads, dim_feedforward, dropout)
        # 解码器层
        self.decoder = Decoder(num_decoder_layers, d_model, num_heads, dim_feedforward, dropout, vocab_size, max_seq_length)

    def forward(self, src_input_ids, encoder_outputs, src_mask, tgt_mask):

        # 模拟编码器输出（随机初始化） !!!!!!!!!!!!!!!!!!!!
        encoder_outputs = torch.randn(src_input_ids.size(0), src_input_ids.size(1), 1536)  # [batch_size, seq_len_src, d_model]
        # 编码器的输入2：调用编码器，调用的编码器的输入是 [batch_size, seq_len, d_model]
        # x = self.embedding(src_input_ids) * math.sqrt(self.embedding.embedding_dim)

        # encoder_outputs = self.encoder(x)

        output, attn_weights = self.decoder(src_input_ids, encoder_outputs, src_mask, tgt_mask)

        return output, attn_weights

    def generate(self, start_token, max_len, src_input_ids, src_mask, tgt_mask):
        with torch.no_grad():
            # 确保 start_token 是整数类型，并初始化生成序列
            if not isinstance(start_token, int):
                raise ValueError("start_token must be an integer representing a valid token ID.")
            generated_sequence = [start_token]

            for i in range(max_len - 1):  # 减一因为已经包含了一个 start_token
                # 构造当前的目标序列张量，确保所有元素都是整数类型
                tgt_tensor = torch.tensor(generated_sequence, dtype=torch.long).unsqueeze(0).to(
                    next(self.parameters()).device)

                # 检查生成的 tgt_tensor 是否包含有效的 token 索引
                if tgt_tensor.max() >= self.embedding.num_embeddings:
                    raise ValueError(
                        f"Generated token index {tgt_tensor.max().item()} exceeds embedding size {self.embedding.num_embeddings}.")

                # 通过 forward 函数获取解码器输出
                output = self(src_input_ids, tgt_tensor, src_mask, tgt_mask)  # 使用 self(...) 而不是 self.forward(...)
                print(output.size())
                # 从输出中选择概率最大的 token ID，并确保它是整数类型
                next_token = int(output.argmax(dim=-1)[:, -1].item())

                # 将下一个 token 添加到生成序列中
                generated_sequence.append(next_token)

                # 如果遇到了结束标记，则停止生成
                if next_token == self.tokenizer.eos_token_id:
                    break

            return generated_sequence

    @classmethod
    def generate_square_subsequent_mask(cls, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

In [45]:
# tips： 测试 Transformer
def simple_test_transformer_model():
# 初始化模型
    model = TransformerModel(
        vocab_size=vocab_size,
        d_model=d_model,
        num_heads=num_heads,
        num_decoder_layers=num_decoder_layers,
        dim_feedforward=dim_feedforward,
        max_seq_length=max_seq_length,
        dropout=dropout
    )
    # batch_size = 2
    source_texts = ["Translate this sentence.", "Another example sentence, final example sentence."]

    # 关注 src_input_ids, src_mask 和 tgt_mask
    # 分词
    tokenized_source = tokenizer(source_texts, return_tensors="pt", padding=True, truncation=True)
    src_input_ids = tokenized_source["input_ids"]  # [batch, seq_len]
    src_attention_mask = tokenized_source["attention_mask"]

    ## 目标序列掩码（decoder self-attention） causal mask 不看当前位置之后
    seq_len = src_input_ids.size(1)
    # 下三角包括对角线全为1
    tgt_mask = torch.torch.tril(torch.ones(seq_len, seq_len))
    # tgt_mask = torch.triu(torch.ones(seq_len, seq_len) * float('-inf'), diagonal=1)  # [seq_len, seq_len]
    tgt_mask = tgt_mask.unsqueeze(0).unsqueeze(1).to(src_input_ids.device)  # [1, 1, seq_len, seq_len]
    tgt_mask = tgt_mask.expand(batch_size, -1, -1, -1)  # [batch_size, 1, seq_len, seq_len]

    ## padding mask 不看padding
    src_mask = src_attention_mask.unsqueeze(1).unsqueeze(2)  # [batch_size, 1, 1, seq_len]
    src_mask = src_mask.expand(-1, num_heads, -1, -1)  # [batch_size, num_heads, 1, seq_len]
    #
    # print(f"src_input_ids:{src_input_ids}")
    # print(f"src_input_ids_size: {src_input_ids.size()}")
    # print(f"tgt_mask: {tgt_mask}")
    # print(f"tgt_mask_size: {tgt_mask.size()}")
    # print(f"src_mask: {src_mask}")
    # print(f"src_mask_size:{src_mask.size()}")
    # 前向传播测试 测试 Transformer 的 forward 函数成功
    # 第一个参数是decoder的input。第二个参数不重要。第三个参数是防止padding参与运算，第四个参数是自回归掩码
    output, attn_weights = model(src_input_ids, src_input_ids, src_mask, tgt_mask)
    print("Forward pass output shape:", output.size())  # 应为 (batch_size, seq_length, vocab_size)

# 运行测试函数
simple_test_transformer_model()


q.size: torch.Size([2, 10, 1536])
k.size: torch.Size([2, 10, 1536])
v.size: torch.Size([2, 10, 1536])
new_q.size: torch.Size([2, 8, 10, 192])
new_k.size: torch.Size([2, 8, 10, 192])
new_v.size: torch.Size([2, 8, 10, 192])
q.size: torch.Size([2, 10, 1536])
k.size: torch.Size([2, 10, 1536])
v.size: torch.Size([2, 10, 1536])
new_q.size: torch.Size([2, 8, 10, 192])
new_k.size: torch.Size([2, 8, 10, 192])
new_v.size: torch.Size([2, 8, 10, 192])
q.size: torch.Size([2, 10, 1536])
k.size: torch.Size([2, 10, 1536])
v.size: torch.Size([2, 10, 1536])
new_q.size: torch.Size([2, 8, 10, 192])
new_k.size: torch.Size([2, 8, 10, 192])
new_v.size: torch.Size([2, 8, 10, 192])
q.size: torch.Size([2, 10, 1536])
k.size: torch.Size([2, 10, 1536])
v.size: torch.Size([2, 10, 1536])
new_q.size: torch.Size([2, 8, 10, 192])
new_k.size: torch.Size([2, 8, 10, 192])
new_v.size: torch.Size([2, 8, 10, 192])
Forward pass output shape: torch.Size([2, 10, 30522])


In [46]:
# tips： 下面不是必要要求

In [47]:
# 推理的时候进行单条单条的推理
def simple_test_inference_transformer_model():
    import torch

    # 参数设置
    vocab_size = tokenizer.vocab_size
    d_model = 1536
    nhead = 4
    num_decoder_layers = 2
    dim_feedforward = 256
    max_seq_length = 5
    dropout = 0.1

    # 初始化模型
    model = TransformerModel(
        vocab_size=vocab_size,
        d_model=d_model,
        nhead=nhead,
        num_decoder_layers=num_decoder_layers,
        dim_feedforward=dim_feedforward,
        max_seq_length=max_seq_length,
        dropout=dropout
    )
    source_texts = ["Translate this sentence."]
    target_texts = ["Translate this sentence."]
    
    # 准备两个tokenizer 和 两个掩码
    # 分词
    tokenized_source = tokenizer(source_texts, return_tensors="pt", padding=True, truncation=True)
    tokenized_target = tokenizer(target_texts, return_tensors="pt", padding=True, truncation=True)
    
    src_input_ids = tokenized_source["input_ids"]  # [batch, seq_len]
    tgt_input_ids = tokenized_target["input_ids"]  # [batch, seq_len]
    
    src_attention_mask = tokenized_source["attention_mask"]
    tgt_attention_mask = tokenized_target["attention_mask"]
    
    ## 生成掩码 padding mask 不看padding
    src_mask = src_attention_mask.unsqueeze(1).unsqueeze(2)  # [batch_size, 1, 1, seq_len]
    src_mask = src_mask.expand(-1, num_heads, -1, -1)  # [batch_size, num_heads, 1, seq_len]
    
    ## 目标序列掩码（decoder self-attention） causal mask 不看当前位置之后
    seq_len = tgt_input_ids.size(1)
    # 上三角对角矩阵 mask 矩阵来说不是 0 就是 1
    tgt_mask = torch.triu(torch.ones(seq_len, seq_len) * float('-inf'), diagonal=1)  # [seq_len, seq_len]
    tgt_mask = tgt_mask.unsqueeze(0).unsqueeze(1).to(tgt_input_ids.device)  # [1, 1, seq_len, seq_len]
    # tgt_mask = tgt_mask.expand(batch_size, -1, -1, -1)
    
    print(src_input_ids.size())
    print(tgt_input_ids.size())
    print(src_mask.size())
    print(tgt_mask.size())
    # 前向传播测试 测试 Transformer 的 forward 函数成功
    output = model(src_input_ids, tgt_input_ids, src_mask, tgt_mask)
    print("Forward pass output shape:", output.shape)  # 应为 (batch_size, seq_length, vocab_size)

    
    # 设置起始 token
    start_token = tokenizer.cls_token_id if tokenizer.cls_token_id is not None else 1  # 假设 cls_token 作为起始 token

    # 调用生成函数
    print("Generation test:")
    generated_seq = model.generate(start_token=start_token, max_len=max_seq_length, 
                                   src_input_ids=src_input_ids, src_mask=src_mask, tgt_mask=tgt_mask)

    # 检查生成的序列
    print("Generated token sequence:", generated_seq)

    # 反编码为文本
    generated_text = tokenizer.decode(generated_seq, skip_special_tokens=True)
    print("Generated text:", generated_text)
    
    
    # # 模拟输入数据
    # tgt = torch.randint(0, vocab_size, (2, max_seq_length))  # 批大小2，序列长度为max_seq_length
    # enc_output = torch.rand(2, max_seq_length, d_model)  # 假设编码器的输出
    # src_mask = None  # 示例中不使用具体的mask
    # tgt_mask = TransformerModel.generate_square_subsequent_mask(max_seq_length)
    # 

    # 生成测试
    ## 设置起始 token
    start_token = 1

    # 假设编码器的输出
    enc_output = torch.rand(2, max_seq_length, d_model)
    print(enc_output.size())

    # 调用生成函数
    print(1)
    ## 从 start_token 开始生成内容 start_token: max_seq_length: enc_output: 
    # 有没有必要设置 src_mask, tgt_mask？？？？？
    generated_seq = model.generate(max_seq_length, enc_output, src_mask, tgt_mask)

    # 检查生成的序列
    print("Generated token sequence:", generated_seq)
    
    print(4)
    # 反编码为文本
    generated_text = model.tokenizer.decode(generated_seq, skip_special_tokens=True)
    print("Generated text:", generated_text)

# 运行测试函数
simple_test_transformer_model()


q.size: torch.Size([2, 10, 1536])
k.size: torch.Size([2, 10, 1536])
v.size: torch.Size([2, 10, 1536])
new_q.size: torch.Size([2, 8, 10, 192])
new_k.size: torch.Size([2, 8, 10, 192])
new_v.size: torch.Size([2, 8, 10, 192])
q.size: torch.Size([2, 10, 1536])
k.size: torch.Size([2, 10, 1536])
v.size: torch.Size([2, 10, 1536])
new_q.size: torch.Size([2, 8, 10, 192])
new_k.size: torch.Size([2, 8, 10, 192])
new_v.size: torch.Size([2, 8, 10, 192])
q.size: torch.Size([2, 10, 1536])
k.size: torch.Size([2, 10, 1536])
v.size: torch.Size([2, 10, 1536])
new_q.size: torch.Size([2, 8, 10, 192])
new_k.size: torch.Size([2, 8, 10, 192])
new_v.size: torch.Size([2, 8, 10, 192])
q.size: torch.Size([2, 10, 1536])
k.size: torch.Size([2, 10, 1536])
v.size: torch.Size([2, 10, 1536])
new_q.size: torch.Size([2, 8, 10, 192])
new_k.size: torch.Size([2, 8, 10, 192])
new_v.size: torch.Size([2, 8, 10, 192])
Forward pass output shape: torch.Size([2, 10, 30522])


In [48]:
def train(model, dataloader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    
    for batch in dataloader:
        tgt_input = batch['input'].to(device)
        tgt_output = batch['output'].to(device)
        enc_output = batch.get('enc_output', None)  # 如果有编码器输出
        src_mask = batch.get('src_mask', None)
        tgt_mask = model.generate_square_subsequent_mask(tgt_input.size(1)).to(device)
        
        optimizer.zero_grad()
        output = model(tgt_input, enc_output, src_mask, tgt_mask)
        loss = criterion(output.view(-1, output.size(-1)), tgt_output.view(-1))
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    return total_loss / len(dataloader)

In [219]:
def create_mock_data(batch_size, seq_length, vocab_size):
    # 创建随机输入序列，保证 token ID 在 [0, vocab_size - 1] 范围内
    input_sequences = torch.randint(0, vocab_size, (batch_size, seq_length))
    output_sequences = torch.randint(0, vocab_size, (batch_size, seq_length))
    
    # 其他部分保持不变...
    enc_output = torch.randn(batch_size, seq_length, d_model)  # 示例编码器输出
    src_mask = torch.ones(batch_size, 1, seq_length)           # 示例源序列掩码
    
    return {
        'input': input_sequences,
        'output': output_sequences,
        'enc_output': enc_output,
        'src_mask': src_mask
    }

In [49]:
# tips：三种掩码方式：padding mask, causal mask, combined mask
import torch

# 假设输入数据
batch_size = 2
seq_len = 5
num_heads = 8

# 假设填充 token ID 是 0
input_ids = torch.tensor([
    [1, 2, 3, 0, 0],  # 第一个序列，最后两个是填充
    [4, 5, 6, 7, 0]   # 第二个序列，最后一个是填充
])
# 1. 构造 Padding Mask # padding_mask: 1 表示有效位置，0 表示填充位置
padding_mask = (input_ids != 0).unsqueeze(1).unsqueeze(2)  # (batch_size, 1, 1, seq_len)

# 2. 构造 Causal Mask
causal_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool()  # (seq_len, seq_len)
causal_mask = causal_mask.unsqueeze(0).expand(batch_size, -1, -1)  # (batch_size, seq_len, seq_len)
print(causal_mask)

print(padding_mask)
# 3. Combine Masks
# 使用或 运算符 
combined_mask = causal_mask | ~padding_mask.squeeze(1).expand(-1, seq_len, -1)  # (batch_size, seq_len, seq_len)
print(combined_mask)

tensor([[[False,  True,  True,  True,  True],
         [False, False,  True,  True,  True],
         [False, False, False,  True,  True],
         [False, False, False, False,  True],
         [False, False, False, False, False]],

        [[False,  True,  True,  True,  True],
         [False, False,  True,  True,  True],
         [False, False, False,  True,  True],
         [False, False, False, False,  True],
         [False, False, False, False, False]]])
tensor([[[[ True,  True,  True, False, False]]],


        [[[ True,  True,  True,  True, False]]]])
tensor([[[False,  True,  True,  True,  True],
         [False, False,  True,  True,  True],
         [False, False, False,  True,  True],
         [False, False, False,  True,  True],
         [False, False, False,  True,  True]],

        [[False,  True,  True,  True,  True],
         [False, False,  True,  True,  True],
         [False, False, False,  True,  True],
         [False, False, False, False,  True],
         [False, Fa