![encoder-decoder](./decoder_img/encoder-decoder.png)


In [60]:
import torch
import torch.nn as nn
import math
from transformers import AutoTokenizer


In [61]:
# tokenizer 目录路径
local_tokenizer_dir = "./tokenizer_files/"  # 替换为你的实际路径
# 加载分词器
tokenizer = AutoTokenizer.from_pretrained(
    local_tokenizer_dir,
    trust_remote_code=True,  # 如果使用的分词器有自定义代码，需要启用此选项
    truncation_side='right', # 设置分词器的截断侧
    padding_side='right'     # 设置分词器的填充侧
)

In [62]:
# parameter setting
d_model = 512   # 输入的维度 Qwen/Qwen2.5-1.5B-Instruct 中是 hidden_size
d_ff = 2048     # 前向传播隐藏层维度
d_k = d_v = 64  # K(=Q), V的维度 
n_layers = 6    # N of encoder and decoder
n_heads = 8     # the number of Multi-Head Attention

batch_size = 2 # 假设每个样本的长度为 5
seq_len = 512 # 假设每个样本的长度为 5
hidden_size = 1024
num_heads = 8   # 多头注意力中的头数

# 假设这些是我们所需的模型参数
dropout = 0.1         # Dropout 比例
vocab_size = 151936   # Qwen/Qwen2.5-1.5B-Instruct 的词汇表的大小
num_layers = 6        # Decoder 中的层数

# 定义嵌入层，词表大小和分词器一致
embedding_dim = 512  # 嵌入维度，可根据需求调整
max_seq_len = 512  # 最大序列长度


在多头注意力中，每个注意力头的计算遵循**Scaled Dot-Product Attention**公式：

#### **2.1. 单头注意力公式**

对于每个头，计算方式为：

$$
\text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V
$$

- $Q$: 查询向量 (query)。
- $K$: 键向量 (key)。
- $V$: 值向量 (value)。
- $d_k$: 每个头的键/查询向量维度。
#### **2.2. 多头注意力公式**

多头注意力通过多个头并行计算注意力，然后将结果合并：

$$
\text{MultiHead}(Q, K, V) = \text{Concat}(\text{head}_1, \text{head}_2, \dots, \text{head}_h)W^O
$$

- 每个注意力头的计算：

$$
\text{head}_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)
$$
$W_i^Q, W_i^K, W_i^V$ 是每个头的权重矩阵。

![formula](./decoder_img/formula_multi-head%20attention.png)

![sdqa](./decoder_img/sdpa.png)
![multi-head-attention](./decoder_img/multi-head%20attention.png)

In [78]:
class MultiHeadAttention(nn.Module):
    def __init__(self, hidden_size, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert hidden_size % num_heads == 0, "hidden_size must be divisible by num_heads"
        
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads
        
        # 定义线性变换层
        self.q_linear = nn.Linear(hidden_size, hidden_size)
        self.k_linear = nn.Linear(hidden_size, hidden_size)
        self.v_linear = nn.Linear(hidden_size, hidden_size)
        self.o_linear = nn.Linear(hidden_size, hidden_size)

        self.scale = math.sqrt(self.head_dim)

    def split_head(self, x, batch_size):
        """Split the last dimension into (num_heads, head_dim).
        Transpose the result such that the shape is (batch_size, num_heads, seq_len, head_dim)
        """
        x = x.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        return x

    def forward(self, q, k, v, mask=None):
        batch_size = q.size(0)

        # 线性变换
        q = self.q_linear(q)  # (batch_size, seq_len_q, hidden_size)  对 emb 的值或者维度做了改变
        k = self.k_linear(k)  # (batch_size, seq_len_k, hidden_size)
        v = self.v_linear(v)  # (batch_size, seq_len_v, hidden_size)
        print(f"q.size: {q.size()}")
        print(f"k.size: {k.size()}")
        print(f"v.size: {v.size()}")
        
        # 分割头
        q = self.split_head(q, batch_size)  # (batch_size, num_heads, seq_len_q, head_dim)
        k = self.split_head(k, batch_size)  # (batch_size, num_heads, seq_len_k, head_dim)
        v = self.split_head(v, batch_size)  # (batch_size, num_heads, seq_len_v, head_dim)
        print(f"new_q.size: {q.size()}")
        print(f"new_k.size: {k.size()}")
        print(f"new_v.size: {v.size()}")
    
        # 每头独立计算
        ## 多头计算注意力分数 QK^T
        scores = torch.matmul(q, k.transpose(-2, -1)) / self.scale  # (batch_size, num_heads, seq_len_q, seq_len_k)
        if mask is not None:
            # scores 张量中对应于 mask 中为0的位置（即不应该被关注的位置），将其值设置为 -1e9
            # 这样-1e9在经过softmax以后得到的概率接近于零
            # 实现了 mask 中掩码为0的位置是没有score分数的
            scores = scores.masked_fill(mask == 0, -1e9)
        ## 减少计算使用过设置为负无穷大，然后负无穷大的对应函数值为0，0不参与计算实现的
        ## softmax 作用与最后一维，负无穷大对应的是 0，对应到 v 就不会有最后向量
        attention_weights = torch.softmax(scores, dim=-1)  # (batch_size, num_heads, seq_len_q, seq_len_k)
        ## 应用注意力权重到值向量
        output = torch.matmul(attention_weights, v)  # (batch_size, num_heads, seq_len_q, head_dim)

        # 合并头
        ######## 多头是如何实现的？？？？
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * self.head_dim)  # (batch_size, seq_len_q, hidden_size)
        # (batch_size, num_heads, seq_len, head_dim) 变化 transpose(1, 2)
        # (batch_size, seq_len, num_heads, head_dim)
        # view 之前用 contiguous() 是为了确保张量在内存中的布局是连续的
        # view 进行重塑，将 num_heads 和 head_dim 合并在一起
        
        # 最后一个线性变换
        output = self.o_linear(output)  # (batch_size, seq_len_q, hidden_size)

        return output, attention_weights

![FFN](./decoder_img/FFN.png)

In [77]:
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))


![pe](./decoder_img/position%20embedding.png)

In [65]:
# 继承了PyTorch的 nn.Module 类，这意味着它可以像其他PyTorch模块一样被使用
class PositionalEncoding(nn.Module):
    # 构造函数接收两个参数 d_model：嵌入向量大小，max_len：最长序列长度
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model) # (max_len, d_model) 存储位置编码
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) # ①
        pe[:, 0::2] = torch.sin(position * div_term) # 偶数维赋值  # ②
        pe[:, 1::2] = torch.cos(position * div_term) # 奇数维赋值  # ②
        pe = pe.unsqueeze(0).transpose(0, 1) # [max_len, 1, d_model]
        self.register_buffer('pe', pe) # 注册为一个缓冲区，被保存在模型的字典中，不会参与梯度更新

    def forward(self, x):
        x = x + self.pe[:x.size(0), :] # 调整位置编码大小：根据输入 x 的实际序列长度切片位置编码矩阵，确保只添加必要的部分。
        return x

class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout, vocab_size):
        super(DecoderLayer, self).__init__()
        
        # 自注意力层（Causal-Attention）
        self.self_attn = MultiHeadAttention(d_model, num_heads)     
        # 编码器-解码器注意力层（Cross-Attention）
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        # 前馈网络（Feed-Forward）
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        
        # LayerNorm 层（Layer Normalization）
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        
        # Dropout 层
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        # 将输入索引转换为嵌入向量，并调整尺度
       
        # 自注意力：输入为解码器自身的输入（x），q,k,v全部都是x 
        attn_output, block1 = self.self_attn(x, x, x, tgt_mask)  # [bs, seq_len, emb] [bs, 1, seq_len, seq_len]
        x = self.norm1(x + self.dropout(attn_output))  # 残差连接和 LayerNorm
        
        # 交叉注意力：输入为编码器输出（enc_output）和解码器输入（x）
        # q：decoder's output. k + v: encoder's output 
        attn_output, block2 = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_output))  # 残差连接和 LayerNorm
        
        # 前馈网络（Feed-Forward）
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))  # 残差连接和 LayerNorm
        
        return x, block1, block2

In [66]:
class Decoder(nn.Module):
    def __init__(self, num_layers, d_model, num_heads, d_ff, dropout, vocab_size, max_len=5000):
        super(Decoder, self).__init__()
        
        # 建立模型的 emb 层  创建一个形状为 (vocab_size, d_model) 的可训练参数矩阵
        # 初始化：嵌入矩阵的权重通常会在初始化时随机分配，或者从预训练模型中加载。
        # 训练：在训练过程中，嵌入矩阵的权重会通过反向传播进行更新，以优化模型性能。
        self.embedding = nn.Embedding(vocab_size, d_model)
        
        # 位置编码
        self.pos_encoding = PositionalEncoding(d_model, max_len=max_len)
        
        # 解码器层列表
        self.dec_layers = nn.ModuleList([
            DecoderLayer(d_model, num_heads, d_ff, dropout, vocab_size)
            for _ in range(num_layers)
        ])
        
        self.dropout = nn.Dropout(dropout)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        attention_weights = {}
        seq_len = x.size(1)
        
        # 添加嵌入和位置编码 
        ## 嵌入的扩大：乘以嵌入维度的平方根 嵌入向量的初始值通常是从一个较小的标准差分布中随机抽取的，比如标准正态分布或均匀分布。如果直接将这些小数值输入到后续的线性变换、激活函数等操作中，可能会导致信号逐渐衰减，特别是当网络层数较多时。通过乘以嵌入维度的平方根，可以放大这些初始值，使得它们在整个网络中的传播更为稳定。
        # 在 Transformer 模型中，位置编码（positional encoding）会直接加到词嵌入（token embeddings）上。位置编码的设计通常是基于正弦和余弦函数，其幅度大致为 1。如果不调整词嵌入的规模，那么位置编码的影响可能会过大或过小，从而破坏了两者之间的相对比例。乘以 sqrt(d_model) 可以使词嵌入的均方根（RMS, Root Mean Square）与位置编码相近，维持两者在一个相似的数量级上，避免一方压倒另一方。
        # 这种做法源自 Vaswani 等人在他们的论文《Attention Is All You Need》中提出的建议。他们指出，这样的缩放有助于保持模型各部分的输出具有相似的方差，进而促进更有效的学习。
        x = self.embedding(x) * math.sqrt(self.embedding.embedding_dim)
        x = self.pos_encoding(x)
        x = self.dropout(x)
        
        ## final output 用来进行输出， 输出和attention weight 是不一样的
        ## block1 和 block2 是自注意力和交叉注意力 的参数，所以是不是模型内部的参数指的就是 block1 和 block2 的参数
        ## 遍历 self.dec_layers 列表中的每一层
        for i, dec_layer in enumerate(self.dec_layers):
            x, block1, block2 = dec_layer(x, enc_output, src_mask, tgt_mask)  # [bs, seq_len, emb] [bs, seq_len, emb], [bs, heads, 1, seq_len], [bs, 1, seq_len, seq_len]
            attention_weights[f'decoder_layer{i+1}_block1'] = block1
            attention_weights[f'decoder_layer{i+1}_block2'] = block2
        
        # 最后通过全连接层映射到词汇表大小
        final_output = self.fc_out(x)  # (batch_size, target_seq_len, vocab_size)
        
        return final_output, attention_weights

In [73]:
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_decoder_layers, dim_feedforward, max_seq_length, dropout=0.1):
        super(TransformerModel, self).__init__()
        
        # 加载分词器
        self.tokenizer = AutoTokenizer.from_pretrained(
            "./tokenizer_files/",  # 替换为你的实际路径
            trust_remote_code=True,
            truncation_side='right',
            padding_side='right'
        )   
        # 解码器层
        self.decoder = Decoder(num_decoder_layers, d_model, num_heads, dim_feedforward, dropout, max_seq_length)
        
        # 输出层
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, src_input_ids, encoder_outputs, src_mask, tgt_mask):
        
        # 模拟编码器输出（随机初始化） !!!!!!!!!!!!!!!!!!!! 
        encoder_outputs = torch.randn(src_input_ids.size(0), src_input_ids.size(1), d_model)  # [batch_size, seq_len_src, d_model]
        
        # 初始化模型
        # encoder = Encoder(num_layers, d_model, num_heads, d_ff, dropout)  # 假设有 Encoder 类
        decoder = Decoder(num_layers, d_model, num_heads, d_ff, dropout, tokenizer.vocab_size)
        
        # 编码器输出
        # encoder_outputs = encoder(src_input_ids, src_mask)  # 假设返回形状为 [batch_size, seq_len, d_model]
        # 解码器输出 attn_weights 没有做处理
        output, attn_weights = decoder(src_input_ids, encoder_outputs, src_mask, tgt_mask)
        
        return output


    def generate(self, start_token, max_len, src_input_ids, src_mask, tgt_mask):
        with torch.no_grad():
            # 确保 start_token 是整数类型，并初始化生成序列
            if not isinstance(start_token, int):
                raise ValueError("start_token must be an integer representing a valid token ID.")
            generated_sequence = [start_token]
    
            for i in range(max_len - 1):  # 减一因为已经包含了一个 start_token
                # 构造当前的目标序列张量，确保所有元素都是整数类型
                tgt_tensor = torch.tensor(generated_sequence, dtype=torch.long).unsqueeze(0).to(next(self.parameters()).device)
                
                # 检查生成的 tgt_tensor 是否包含有效的 token 索引
                if tgt_tensor.max() >= self.embedding.num_embeddings:
                    raise ValueError(f"Generated token index {tgt_tensor.max().item()} exceeds embedding size {self.embedding.num_embeddings}.")
                
             
                # 通过 forward 函数获取解码器输出
                output = self(src_input_ids, tgt_tensor, src_mask, tgt_mask)  # 使用 self(...) 而不是 self.forward(...)
                print(output.size())
                # 从输出中选择概率最大的 token ID，并确保它是整数类型
                next_token = int(output.argmax(dim=-1)[:, -1].item())
                
                # 将下一个 token 添加到生成序列中
                generated_sequence.append(next_token)
    
                # 如果遇到了结束标记，则停止生成
                if next_token == self.tokenizer.eos_token_id:
                    break
            
            return generated_sequence

    @classmethod
    def generate_square_subsequent_mask(cls, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

In [79]:
# test Transformer's forward function
def simple_test_transformer_model():
    import torch

    # 参数设置
    vocab_size = tokenizer.vocab_size
    d_model = 1536
    nhead = 4
    num_decoder_layers = 2
    dim_feedforward = 256
    max_seq_length = 5
    dropout = 0.1

    # 初始化模型
    model = TransformerModel(
        vocab_size=vocab_size,
        d_model=d_model,
        nhead=nhead,
        num_decoder_layers=num_decoder_layers,
        dim_feedforward=dim_feedforward,
        max_seq_length=max_seq_length,
        dropout=dropout
    )
    # batch_size = 2
    source_texts = ["Translate this sentence.", "Another example sentence, final example sentence."]
    
    # 关注 src_input_ids, src_mask 和 tgt_mask
    # 分词
    tokenized_source = tokenizer(source_texts, return_tensors="pt", padding=True, truncation=True)
    src_input_ids = tokenized_source["input_ids"]  # [batch, seq_len]
    src_attention_mask = tokenized_source["attention_mask"]
    
    ## 目标序列掩码（decoder self-attention） causal mask 不看当前位置之后
    seq_len = src_input_ids.size(1)
    # 下三角包括对角线全为1
    tgt_mask = torch.torch.tril(torch.ones(seq_len, seq_len))
    # tgt_mask = torch.triu(torch.ones(seq_len, seq_len) * float('-inf'), diagonal=1)  # [seq_len, seq_len]
    tgt_mask = tgt_mask.unsqueeze(0).unsqueeze(1).to(src_input_ids.device)  # [1, 1, seq_len, seq_len]
    tgt_mask = tgt_mask.expand(batch_size, -1, -1, -1) # [batch_size, 1, seq_len, seq_len]
    
    ## padding mask 不看padding
    src_mask = src_attention_mask.unsqueeze(1).unsqueeze(2)  # [batch_size, 1, 1, seq_len]
    src_mask = src_mask.expand(-1, num_heads, -1, -1)  # [batch_size, num_heads, 1, seq_len]
    # 
    # print(f"src_input_ids:{src_input_ids}")
    # print(f"src_input_ids_size: {src_input_ids.size()}")
    # print(f"tgt_mask: {tgt_mask}")
    # print(f"tgt_mask_size: {tgt_mask.size()}")
    # print(f"src_mask: {src_mask}")
    # print(f"src_mask_size:{src_mask.size()}")
    # 前向传播测试 测试 Transformer 的 forward 函数成功 
    # 第一个参数是decoder的input。第二个参数不重要。第三个参数是防止padding参与运算，第四个参数是自回归掩码
    output = model(src_input_ids, src_input_ids, src_mask, tgt_mask)
    print("Forward pass output shape:", output.shape)  # 应为 (batch_size, seq_length, vocab_size)

# 运行测试函数
simple_test_transformer_model()


q.size: torch.Size([2, 10, 512])
k.size: torch.Size([2, 10, 512])
v.size: torch.Size([2, 10, 512])
new_q.size: torch.Size([2, 8, 10, 64])
new_k.size: torch.Size([2, 8, 10, 64])
new_v.size: torch.Size([2, 8, 10, 64])
q.size: torch.Size([2, 10, 512])
k.size: torch.Size([2, 10, 512])
v.size: torch.Size([2, 10, 512])
new_q.size: torch.Size([2, 8, 10, 64])
new_k.size: torch.Size([2, 8, 10, 64])
new_v.size: torch.Size([2, 8, 10, 64])
q.size: torch.Size([2, 10, 512])
k.size: torch.Size([2, 10, 512])
v.size: torch.Size([2, 10, 512])
new_q.size: torch.Size([2, 8, 10, 64])
new_k.size: torch.Size([2, 8, 10, 64])
new_v.size: torch.Size([2, 8, 10, 64])
q.size: torch.Size([2, 10, 512])
k.size: torch.Size([2, 10, 512])
v.size: torch.Size([2, 10, 512])
new_q.size: torch.Size([2, 8, 10, 64])
new_k.size: torch.Size([2, 8, 10, 64])
new_v.size: torch.Size([2, 8, 10, 64])
q.size: torch.Size([2, 10, 512])
k.size: torch.Size([2, 10, 512])
v.size: torch.Size([2, 10, 512])
new_q.size: torch.Size([2, 8, 10, 64]

In [50]:
# 推理的时候进行单条单条的推理
def simple_test_inference_transformer_model():
    import torch

    # 参数设置
    vocab_size = tokenizer.vocab_size
    d_model = 1536
    nhead = 4
    num_decoder_layers = 2
    dim_feedforward = 256
    max_seq_length = 5
    dropout = 0.1

    # 初始化模型
    model = TransformerModel(
        vocab_size=vocab_size,
        d_model=d_model,
        nhead=nhead,
        num_decoder_layers=num_decoder_layers,
        dim_feedforward=dim_feedforward,
        max_seq_length=max_seq_length,
        dropout=dropout
    )
    source_texts = ["Translate this sentence."]
    target_texts = ["Translate this sentence."]
    
    # 准备两个tokenizer 和 两个掩码
    # 分词
    tokenized_source = tokenizer(source_texts, return_tensors="pt", padding=True, truncation=True)
    tokenized_target = tokenizer(target_texts, return_tensors="pt", padding=True, truncation=True)
    
    src_input_ids = tokenized_source["input_ids"]  # [batch, seq_len]
    tgt_input_ids = tokenized_target["input_ids"]  # [batch, seq_len]
    
    src_attention_mask = tokenized_source["attention_mask"]
    tgt_attention_mask = tokenized_target["attention_mask"]
    
    ## 生成掩码 padding mask 不看padding
    src_mask = src_attention_mask.unsqueeze(1).unsqueeze(2)  # [batch_size, 1, 1, seq_len]
    src_mask = src_mask.expand(-1, num_heads, -1, -1)  # [batch_size, num_heads, 1, seq_len]
    
    ## 目标序列掩码（decoder self-attention） causal mask 不看当前位置之后
    seq_len = tgt_input_ids.size(1)
    # 上三角对角矩阵 mask 矩阵来说不是 0 就是 1
    tgt_mask = torch.triu(torch.ones(seq_len, seq_len) * float('-inf'), diagonal=1)  # [seq_len, seq_len]
    tgt_mask = tgt_mask.unsqueeze(0).unsqueeze(1).to(tgt_input_ids.device)  # [1, 1, seq_len, seq_len]
    # tgt_mask = tgt_mask.expand(batch_size, -1, -1, -1)
    
    print(src_input_ids.size())
    print(tgt_input_ids.size())
    print(src_mask.size())
    print(tgt_mask.size())
    # 前向传播测试 测试 Transformer 的 forward 函数成功
    output = model(src_input_ids, tgt_input_ids, src_mask, tgt_mask)
    print("Forward pass output shape:", output.shape)  # 应为 (batch_size, seq_length, vocab_size)

    
    # 设置起始 token
    start_token = tokenizer.cls_token_id if tokenizer.cls_token_id is not None else 1  # 假设 cls_token 作为起始 token

    # 调用生成函数
    print("Generation test:")
    generated_seq = model.generate(start_token=start_token, max_len=max_seq_length, 
                                   src_input_ids=src_input_ids, src_mask=src_mask, tgt_mask=tgt_mask)

    # 检查生成的序列
    print("Generated token sequence:", generated_seq)

    # 反编码为文本
    generated_text = tokenizer.decode(generated_seq, skip_special_tokens=True)
    print("Generated text:", generated_text)
    
    
    # # 模拟输入数据
    # tgt = torch.randint(0, vocab_size, (2, max_seq_length))  # 批大小2，序列长度为max_seq_length
    # enc_output = torch.rand(2, max_seq_length, d_model)  # 假设编码器的输出
    # src_mask = None  # 示例中不使用具体的mask
    # tgt_mask = TransformerModel.generate_square_subsequent_mask(max_seq_length)
    # 

    # 生成测试
    ## 设置起始 token
    start_token = 1

    # 假设编码器的输出
    enc_output = torch.rand(2, max_seq_length, d_model)
    print(encoder_outputs.size())

    # 调用生成函数
    print(1)
    ## 从 start_token 开始生成内容 start_token: max_seq_length: enc_output: 
    # 有没有必要设置 src_mask, tgt_mask？？？？？
    generated_seq = model.generate(max_seq_length, enc_output, src_mask, tgt_mask)

    # 检查生成的序列
    print("Generated token sequence:", generated_seq)
    
    print(4)
    # 反编码为文本
    generated_text = model.tokenizer.decode(generated_seq, skip_special_tokens=True)
    print("Generated text:", generated_text)

# 运行测试函数
simple_test_transformer_model()


torch.Size([2, 6])
torch.Size([2, 6])
torch.Size([2, 8, 1, 6])
torch.Size([2, 1, 6, 6])
Forward pass output shape: torch.Size([2, 6, 30522])


In [264]:
print(tokenizer.additional_special_tokens[0])

<|im_start|>


In [214]:
def train(model, dataloader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    
    for batch in dataloader:
        tgt_input = batch['input'].to(device)
        tgt_output = batch['output'].to(device)
        enc_output = batch.get('enc_output', None)  # 如果有编码器输出
        src_mask = batch.get('src_mask', None)
        tgt_mask = model.generate_square_subsequent_mask(tgt_input.size(1)).to(device)
        
        optimizer.zero_grad()
        output = model(tgt_input, enc_output, src_mask, tgt_mask)
        loss = criterion(output.view(-1, output.size(-1)), tgt_output.view(-1))
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    return total_loss / len(dataloader)

In [219]:
def create_mock_data(batch_size, seq_length, vocab_size):
    # 创建随机输入序列，保证 token ID 在 [0, vocab_size - 1] 范围内
    input_sequences = torch.randint(0, vocab_size, (batch_size, seq_length))
    output_sequences = torch.randint(0, vocab_size, (batch_size, seq_length))
    
    # 其他部分保持不变...
    enc_output = torch.randn(batch_size, seq_length, d_model)  # 示例编码器输出
    src_mask = torch.ones(batch_size, 1, seq_length)           # 示例源序列掩码
    
    return {
        'input': input_sequences,
        'output': output_sequences,
        'enc_output': enc_output,
        'src_mask': src_mask
    }

In [224]:
def test_transformer_model():

    seq_length = 2
    # 初始化模型、损失函数和优化器
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = TransformerModel(
        vocab_size=vocab_size,
        d_model=d_model,
        nhead=n_heads,
        num_decoder_layers=num_layers,
        dim_feedforward=d_ff,
        max_seq_length=seq_len
    ).to(device)
    criterion = nn.CrossEntropyLoss(ignore_index=model.tokenizer.pad_token_id)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

    # 创建模拟数据加载器
    dataloader = [create_mock_data(batch_size, seq_length, vocab_size) for _ in range(2)]  # 创建两个批次的数据

    # 训练模型
    print("Starting training...")
    for epoch in range(1):  # 只进行一轮训练作为示例
        train_loss = train(model, dataloader, criterion, optimizer, device)
        print(f"Epoch {epoch + 1}, Loss: {train_loss}")

    # 进行推理
    print("Starting inference...")
    start_token = model.tokenizer.bos_token_id  # 使用开始标记的ID
    max_len = 10
    enc_output = torch.randn(1, seq_length, d_model).to(device)  # 示例编码器输出，注意d_model应与模型一致
    src_mask = torch.ones(1, 1, seq_length).to(device)  # 示例源序列掩码
    
    # 确保传入的start_token是整数类型
    generated_sequence_ids = infer(model, start_token, max_len, enc_output, src_mask, device)
    
    # 将生成的token ID转换为文本（假设有一个对应的分词器方法）
    generated_text = model.tokenizer.decode(generated_sequence_ids, skip_special_tokens=True)
    print("Generated text:", generated_text)

if __name__ == "__main__":
    test_transformer_model()

Starting training...


IndexError: index out of range in self

In [177]:
# 三种掩码方式：padding mask, causal mask, combined mask
import torch

# 假设输入数据
batch_size = 2
seq_len = 5
num_heads = 8

# 假设填充 token ID 是 0
input_ids = torch.tensor([
    [1, 2, 3, 0, 0],  # 第一个序列，最后两个是填充
    [4, 5, 6, 7, 0]   # 第二个序列，最后一个是填充
])
# 1. 构造 Padding Mask # padding_mask: 1 表示有效位置，0 表示填充位置
padding_mask = (input_ids != 0).unsqueeze(1).unsqueeze(2)  # (batch_size, 1, 1, seq_len)

# 2. 构造 Causal Mask
causal_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool()  # (seq_len, seq_len)
causal_mask = causal_mask.unsqueeze(0).expand(batch_size, -1, -1)  # (batch_size, seq_len, seq_len)
print(causal_mask)

print(padding_mask)
# 3. Combine Masks
# 使用或 运算符 
combined_mask = causal_mask | ~padding_mask.squeeze(1).expand(-1, seq_len, -1)  # (batch_size, seq_len, seq_len)
print(combined_mask)

tensor([[[False,  True,  True,  True,  True],
         [False, False,  True,  True,  True],
         [False, False, False,  True,  True],
         [False, False, False, False,  True],
         [False, False, False, False, False]],

        [[False,  True,  True,  True,  True],
         [False, False,  True,  True,  True],
         [False, False, False,  True,  True],
         [False, False, False, False,  True],
         [False, False, False, False, False]]])
tensor([[[[ True,  True,  True, False, False]]],


        [[[ True,  True,  True,  True, False]]]])
tensor([[[False,  True,  True,  True,  True],
         [False, False,  True,  True,  True],
         [False, False, False,  True,  True],
         [False, False, False,  True,  True],
         [False, False, False,  True,  True]],

        [[False,  True,  True,  True,  True],
         [False, False,  True,  True,  True],
         [False, False, False,  True,  True],
         [False, False, False, False,  True],
         [False, Fa

![pe](./decoder_img/position%20embedding.png)

https://blog.csdn.net/Q52099999/article/details/136180399

In [42]:
import math
import torch
from torch import nn

In [43]:
class PositionalEncoding(nn.Module):
    "Implement the PE function."

    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)].detach()
        return self.dropout(x)

In [44]:
# test PositionalEncoding
input=torch.ones(4,10,512)
positional_encoding=PositionalEncoding(512,0.1)
output=positional_encoding(input)
print(output.shape)     #torch.Size([4, 10, 512])
print(output)


torch.Size([4, 10, 512])
tensor([[[1.1111, 2.2222, 1.1111,  ..., 2.2222, 1.1111, 2.2222],
         [2.0461, 1.7114, 2.0243,  ..., 2.2222, 1.1112, 2.2222],
         [2.1214, 0.6487, 2.1516,  ..., 2.2222, 1.1113, 2.2222],
         ...,
         [1.8411, 0.0000, 1.6138,  ..., 2.2222, 1.1119, 2.2222],
         [2.2104, 0.9494, 2.2119,  ..., 0.0000, 1.1120, 2.2222],
         [1.5690, 0.0987, 1.8626,  ..., 2.2222, 1.1121, 2.2222]],

        [[1.1111, 2.2222, 1.1111,  ..., 2.2222, 1.1111, 0.0000],
         [2.0461, 1.7114, 2.0243,  ..., 2.2222, 1.1112, 2.2222],
         [2.1214, 0.6487, 2.1516,  ..., 0.0000, 1.1113, 2.2222],
         ...,
         [1.8411, 1.9488, 0.0000,  ..., 2.2222, 1.1119, 0.0000],
         [2.2104, 0.9494, 2.2119,  ..., 2.2222, 1.1120, 2.2222],
         [1.5690, 0.0987, 1.8626,  ..., 2.2222, 1.1121, 2.2222]],

        [[1.1111, 2.2222, 1.1111,  ..., 2.2222, 1.1111, 2.2222],
         [2.0461, 1.7114, 2.0243,  ..., 2.2222, 0.0000, 2.2222],
         [2.1214, 0.6487, 2.1516,

![sdqa](./decoder_img/sdpa.png)
![multi-head-attention](./decoder_img/multi-head%20attention.png)
![formula](./decoder_img/formula_multi-head%20attention.png)

https://hwcoder.top/Manual-Coding-1

In [45]:
## SDPA

class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, query, key, value, attention_mask=None):
        # query, key, value 形状: (batch_size, seq_len, hidden_size)
        
        # 计算注意力分数
        # key.transpose(-1, -2) 将最后两个维度进行转置，以进行点积
        # attention_scores 形状: (batch_size, seq_len, seq_len)
        d_k = query.size(-1)  # 获取 hidden_size
        attention_scores = torch.matmul(query, key.transpose(-1, -2)) / torch.sqrt(torch.tensor(d_k, dtype=torch.float32))
        
        # 添加注意力掩码（seq_len, seq_len），掩码位置（1）的值为负无穷
        if attention_mask is not None:
            attention_scores += attention_mask * -1e9
                
        # 对注意力分数进行归一化，得到注意力概率
        attention_probs = torch.softmax(attention_scores, dim=-1)  # (batch_size, num_heads, seq_len, seq_len)
        
        # 计算注意力输出，通过注意力概率加权值
        attention_output = torch.matmul(attention_probs, value)  # (batch_size, num_heads, seq_len, hidden_size)
        
        return attention_output

In [46]:
def test_attn():
    
    query = torch.randn(batch_size, seq_len, hidden_size)  # (batch_size, seq_len, hidden_size)
    key = torch.randn(batch_size, seq_len, hidden_size)    # (batch_size, seq_len, hidden_size)
    value = torch.randn(batch_size, seq_len, hidden_size)  # (batch_size, seq_len, hidden_size)

    sdpa = ScaledDotProductAttention()
    output = sdpa(query, key, value)
    
    print("Query shape:", query.shape)
    print("Key shape:", key.shape)
    print("Value shape:", value.shape)
    print("Output shape:", output.shape)
    
if __name__ == "__main__":
	test_attn()

Query shape: torch.Size([128, 512, 1024])
Key shape: torch.Size([128, 512, 1024])
Value shape: torch.Size([128, 512, 1024])
Output shape: torch.Size([128, 512, 1024])


In [47]:
## 编码 MHA
class MultiHeadAttention(torch.nn.Module):
    def __init__(self, hidden_size, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads  # 每个头的维度，二者必须整除
        self.hidden_size = hidden_size
        
        # 初始化 Q、K、V 的投影矩阵，将输入词向量线性变换为 Q、K、V，维度保持一致
        self.q_linear = nn.Linear(hidden_size, hidden_size) 
        self.k_linear = nn.Linear(hidden_size, hidden_size)
        self.v_linear = nn.Linear(hidden_size, hidden_size)
        
        # 输出线性层，将拼接后的多头注意力输出变换为所需的输出维度，这里维度保持一致
        self.o_linear = nn.Linear(hidden_size, hidden_size)
        
        # Dropout 层
        self.dropout = nn.Dropout(0.1)
        
    def forward(self, hidden_state, attention_mask=None):
        # hidden_state 形状: (batch_size, seq_len, hidden_size)
        batch_size = hidden_state.size(0)  # 获取批量大小

        # 计算 Q、K、V，线性变换
        query = self.q_linear(hidden_state)  # (batch_size, seq_len, hidden_size)
        key = self.k_linear(hidden_state)    # (batch_size, seq_len, hidden_size)
        value = self.v_linear(hidden_state)  # (batch_size, seq_len, hidden_size)

        # 分割多头，将每个头的维度拆分出来
        query = self.split_head(query)  # (batch_size, num_heads, seq_len, head_dim)
        key = self.split_head(key)      # (batch_size, num_heads, seq_len, head_dim)
        value = self.split_head(value)  # (batch_size, num_heads, seq_len, head_dim)

        # 计算注意力分数，使用缩放点积注意力机制
        # attention_scores 形状: (batch_size, num_heads, seq_len, seq_len)
        attention_scores = torch.matmul(query, key.transpose(-1, -2)) / torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32))
        
        # 添加注意力掩码（seq_len, seq_len），掩码位置（1）的值为负无穷
        if attention_mask is not None:
            attention_scores += attention_mask * -1e9
        
        # 对注意力分数进行归一化，得到注意力概率
        attention_probs = torch.softmax(attention_scores, dim=-1)  # (batch_size, num_heads, seq_len, seq_len)

        # 计算注意力输出，通过注意力概率加权值
        output = torch.matmul(attention_probs, value)  # (batch_size, num_heads, seq_len, head_dim)
        
        # 对多头注意力输出进行拼接
        # output.transpose(1, 2) 将 num_heads 和 seq_len 维度转置
        # 将形状调整为 (batch_size, seq_len, hidden_size)
        output = output.transpose(1, 2).reshape(batch_size, -1, self.head_dim * self.num_heads)
        
        # 通过线性层将拼接后的输出变换为所需的输出维度
        output = self.o_linear(output)  # (batch_size, seq_len, hidden_size)
        
        return output

    def split_head(self, x):
        batch_size = x.size(0)  # 获取批量大小
        # x 形状: (batch_size, seq_len, hidden_size)
        # 将 hidden_size 分割为 num_heads 和 head_dim
        return x.reshape(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        # 返回形状: (batch_size, num_heads, seq_len, head_dim)

In [48]:
def test_MHA():
    
    # 随机生成输入数据
    hidden_state = torch.randn(batch_size, seq_len, hidden_size)  # (batch_size, seq_len, hidden_size)
    
    # 创建多头注意力模块
    mha = MultiHeadAttention(hidden_size, num_heads)
    
    # 计算多头注意力输出
    output = mha(hidden_state)
    
    print("Input shape:", hidden_state.shape)
    print("Output shape:", output.shape)
    print(hidden_state - output) # 说明输入和输出是有差距的
if __name__ == "__main__":
	test_MHA()

Input shape: torch.Size([128, 512, 1024])
Output shape: torch.Size([128, 512, 1024])
tensor([[[-1.6193, -0.0891, -0.4542,  ...,  0.4297, -0.8153,  0.3470],
         [-2.7809, -0.7091, -0.4975,  ...,  0.7200,  0.1808,  0.2124],
         [ 1.3735, -0.8517,  0.3719,  ..., -1.7809,  1.1514,  0.3892],
         ...,
         [-1.3494,  0.2579, -1.1982,  ...,  0.5016,  2.3327,  0.4530],
         [-1.3341,  0.5193,  1.6964,  ..., -1.6268,  0.1739,  0.8901],
         [ 0.3217,  0.3426,  0.6627,  ..., -0.6213,  0.6771,  0.6266]],

        [[ 1.1545, -1.5229, -0.1207,  ..., -0.8561,  0.7075,  0.1770],
         [ 1.0253,  1.1284,  0.5222,  ...,  0.0522, -0.6267,  0.6013],
         [-0.5846, -0.4513, -0.8590,  ..., -0.3691, -0.7737, -1.9918],
         ...,
         [ 0.8158, -2.1588,  0.0743,  ..., -1.1199,  0.2521, -0.0692],
         [-1.1471, -1.4210,  0.8020,  ..., -1.3011,  0.8843,  0.4221],
         [-0.6515,  2.2974, -0.8135,  ...,  0.4630,  0.1544, -1.3763]],

        [[ 0.2843,  1.0236, -0.

![FFN](./decoder_img/FFN.png)

In [12]:
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))


In [50]:
# 测试代码
def test_positionwise_feedforward():
    # 创建一个随机输入张量，形状为 [batch_size, seq_len, d_model]
    x = torch.randn(batch_size, seq_len, d_model)

    # 实例化 PositionWiseFeedForward 模块
    ff = PositionWiseFeedForward(d_model=d_model, d_ff=d_ff)

    # 打印输入张量的形状
    print("Input shape:", x.shape)

    # 前向传播
    output = ff(x)

    # 打印输出张量的形状
    print("Output shape:", output.shape)

# 运行测试
test_positionwise_feedforward()

Input shape: torch.Size([128, 512, 512])
Output shape: torch.Size([128, 512, 512])


In [None]:
### my purpose

# load tokenizer and decoder/encoder/encoder-decoder model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# tokenizer
inputs = tokenizer("Hello, my dog is cute.", return_tensors="pt")
print(inputs)

# use decoder to generate
generated_ids = model.generate(**inputs)
# decoder to text
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print("Generated Text:", generated_text)


# use encoder to embedding
outputs = model(**inputs)
# get the last layer's hidden state
last_hidden_states = outputs.last_hidden_state
print("Embeddings shape:", last_hidden_states.shape)

next question:
1. how to convert input text to tokenizer ID using tokenizer model?
2. how to transfer the tokenizer ID to model's input embedding?
3. how to use embedding to generate the final answer?

In [81]:
import torch
import torch.nn as nn
import math

class MultiHeadAttention(nn.Module):
    def __init__(self, hidden_size, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert hidden_size % num_heads == 0, "hidden_size must be divisible by num_heads"
        
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads
        
        # 定义线性变换层
        self.q_linear = nn.Linear(hidden_size, hidden_size)
        self.k_linear = nn.Linear(hidden_size, hidden_size)
        self.v_linear = nn.Linear(hidden_size, hidden_size)
        self.o_linear = nn.Linear(hidden_size, hidden_size)

        self.scale = math.sqrt(self.head_dim)

    def split_head(self, x, batch_size):
        """Split the last dimension into (num_heads, head_dim).
        Transpose the result such that the shape is (batch_size, num_heads, seq_len, head_dim)
        """
        x = x.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        return x

    def forward(self, q, k, v, mask=None):
        batch_size = q.size(0)

        # 线性变换
        q = self.q_linear(q)  # (batch_size, seq_len_q, hidden_size)
        k = self.k_linear(k)  # (batch_size, seq_len_k, hidden_size)
        v = self.v_linear(v)  # (batch_size, seq_len_v, hidden_size)

        # 分割头部
        q = self.split_head(q, batch_size)  # (batch_size, num_heads, seq_len_q, head_dim)
        k = self.split_head(k, batch_size)  # (batch_size, num_heads, seq_len_k, head_dim)
        v = self.split_head(v, batch_size)  # (batch_size, num_heads, seq_len_v, head_dim)

        # 计算注意力分数
        scores = torch.matmul(q, k.transpose(-2, -1)) / self.scale  # (batch_size, num_heads, seq_len_q, seq_len_k)

        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        attention_weights = torch.softmax(scores, dim=-1)  # (batch_size, num_heads, seq_len_q, seq_len_k)

        # 应用注意力权重到值向量
        output = torch.matmul(attention_weights, v)  # (batch_size, num_heads, seq_len_q, head_dim)

        # 合并头部
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * self.head_dim)  # (batch_size, seq_len_q, hidden_size)

        # 最后一个线性变换
        output = self.o_linear(output)  # (batch_size, seq_len_q, hidden_size)

        return output, attention_weights

![decoder](./decoder_img/decoder_code.png)

In [87]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return x

class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout, vocab_size):
        super(DecoderLayer, self).__init__()
        
        # 自注意力层（Self-Attention）
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        
        # 编码器-解码器注意力层（Cross-Attention）
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        
        # 前馈网络（Feed-Forward）
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        
        # LayerNorm 层（Layer Normalization）
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        
        # Dropout 层
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        # 将输入索引转换为嵌入向量，并调整尺度
       
        # 自注意力：输入为解码器自身的输入（x）
        attn_output, block1 = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))  # 残差连接和 LayerNorm
        
        # 交叉注意力：输入为编码器输出（enc_output）和解码器输入（x）
        attn_output, block2 = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_output))  # 残差连接和 LayerNorm
        
        # 前馈网络（Feed-Forward）
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))  # 残差连接和 LayerNorm
        
        return x, block1, block2

In [88]:
class Decoder(nn.Module):
    def __init__(self, num_layers, d_model, num_heads, d_ff, dropout, vocab_size, max_len=5000):
        super(Decoder, self).__init__()
        
        # 嵌入层
        self.embedding = nn.Embedding(vocab_size, d_model)
        
        # 位置编码
        self.pos_encoding = PositionalEncoding(d_model, max_len=max_len)
        
        # 解码器层列表
        self.dec_layers = nn.ModuleList([
            DecoderLayer(d_model, num_heads, d_ff, dropout, vocab_size)
            for _ in range(num_layers)
        ])
        
        self.dropout = nn.Dropout(dropout)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        attention_weights = {}
        seq_len = x.size(1)
        
        # 添加嵌入和位置编码
        x = self.embedding(x) * math.sqrt(self.embedding.embedding_dim)
        x = self.pos_encoding(x)
        x = self.dropout(x)
        
        for i, dec_layer in enumerate(self.dec_layers):
            x, block1, block2 = dec_layer(x, enc_output, src_mask, tgt_mask)
            attention_weights[f'decoder_layer{i+1}_block1'] = block1
            attention_weights[f'decoder_layer{i+1}_block2'] = block2
        
        # 最后通过全连接层映射到词汇表大小
        final_output = self.fc_out(x)  # (batch_size, target_seq_len, vocab_size)
        
        return final_output, attention_weights

In [99]:
# 示例参数设置
batch_size = 32
seq_len = 10
d_model = 512
num_heads = 8
d_ff = 2048
dropout = 0.1
vocab_size = 10000
num_layers = 6

# 创建随机输入
x = torch.randint(0, vocab_size, (batch_size, seq_len))  # 解码器输入，类型为 Long
enc_output = torch.randn(batch_size, seq_len, d_model)   # 编码器输出，类型为 Float
src_mask = torch.ones(batch_size, 1, 1, seq_len)         # 源序列的 mask，类型为 Float
tgt_mask = torch.triu(torch.ones(seq_len, seq_len) * float('-inf'), diagonal=1)  # 目标序列的 mask
tgt_mask = tgt_mask.expand(batch_size, num_heads, seq_len, seq_len)

print(enc_output.size())
print(src_mask.size())
print(tgt_mask.size())

# 初始化 Decoder
decoder = Decoder(num_layers, d_model, num_heads, d_ff, dropout, vocab_size)

# 获取解码器的输出
output, attn_weights = decoder(x, enc_output, src_mask, tgt_mask)

# 输出的形状应该是 [batch_size, seq_len, vocab_size]
print(output.shape)  # 应该是 (32, 10, 10000)

torch.Size([32, 10, 512])
torch.Size([32, 1, 1, 10])
torch.Size([32, 8, 10, 10])
torch.Size([32, 10, 10000])


In [111]:
## 输入数据
# 示例文本
source_texts = ["Translate this sentence.", "Another example sentence."]
target_texts = ["Translate this sentence.", "Another example sentence."]

In [112]:
## step 1: tokenizer
from transformers import AutoTokenizer

# 本地分词器目录路径
local_tokenizer_dir = "./tokenizer_files/"  # 替换为你的实际路径

# 加载分词器
tokenizer = AutoTokenizer.from_pretrained(
    local_tokenizer_dir,
    trust_remote_code=True,  # 如果使用的分词器有自定义代码，需要启用此选项
    truncation_side='right', # 设置分词器的截断侧
    padding_side='right'     # 设置分词器的填充侧
)

# 分词
tokenized_output = tokenizer(
    source_texts, 
    return_tensors="pt",  # 返回 PyTorch 张量
    padding=True,         # 启用填充
    truncation=True       # 启用截断
)

src_input_ids = tokenized_output["input_ids"] # [batch, tokenizer]
src_attention_mask = tokenized_output["attention_mask"]

print(src_input_ids.shape)
print(src_attention_mask.shape)

# # step2 E & PE
# embedding_layer = nn.Embedding(
#     num_embeddings=tokenizer.vocab_size,  # 词汇表大小
#     embedding_dim=embedding_dim          # 嵌入维度
# )
# position_embedding = nn.Embedding(max_seq_len, embedding_dim)

torch.Size([2, 4])
torch.Size([2, 4])


In [113]:
# 编码目标序列
encoded_target = tokenizer(target_texts, padding=True, truncation=True, return_tensors='pt')
tgt_input_ids = encoded_target['input_ids']
tgt_attention_mask = encoded_target['attention_mask']

# 准备交叉注意力的 src_mask
src_mask = src_attention_mask.unsqueeze(1).unsqueeze(2)

# 准备自注意力的 tgt_mask
seq_len = tgt_input_ids.size(1)
tgt_mask = torch.triu(torch.ones(seq_len, seq_len) * float('-inf'), diagonal=1)
tgt_mask = tgt_mask.to(tgt_input_ids.device).expand(tgt_input_ids.size(0), num_heads, seq_len, seq_len)

# 初始化 Decoder 并获取输出
decoder = Decoder(num_layers, d_model, num_heads, d_ff, dropout, tokenizer.vocab_size)
output, attn_weights = decoder(tgt_input_ids, enc_output, src_mask, tgt_mask)

# 输出的形状应该是 [batch_size, seq_len, vocab_size]
print(output.shape)  # 应该是 (32, 10, 10000)

RuntimeError: The size of tensor a (4) must match the size of tensor b (160) at non-singleton dimension 3

In [67]:
# 生成嵌入
token_embeddings = embedding_layer(input_ids) # [batch_size, seq_len, emb]
print(token_embeddings.shape)
# Position Embedding
batch_size = token_embeddings.shape[0]
seq_len = token_embeddings.shape[1]

positions = torch.arange(seq_len, device=input_ids.device).unsqueeze(0).expand(batch_size, seq_len) # [batch_size, seq_len]
print(positions.shape)
position_embeddings = position_embedding(positions)  # [batch_size, seq_len, embed_dim] 
print(position_embeddings.shape)
# Combine Token and Position Embedding
# this is the final embedding inputting to decoder

embeddings = token_embeddings + position_embeddings  # [batch_size, seq_len, embed_dim]

# Prepare Causal Mask for Decoder 对于生成任务，还会添加 causal mask（自回归掩码），确保当前 token 只关注自身及之前的 token。
# this is the final mask inputting to decoder
# final_mask 的维度应该是多大的？？？？？？？？？？？？？？？？？？？？？？ 生成的掩码的维度和embedding后的维度的关系
causal_mask = torch.triu(torch.ones(seq_len, seq_len, device=input_ids.device), diagonal=1).bool()
final_mask = causal_mask | ~attention_mask.bool().unsqueeze(1).expand(-1, seq_len, -1)  # Combine causal and attention masks
print(causal_mask.shape)
print(final_mask.shape)
# step2 final answer:embedding & mask

# 查看分词结果
# print("Input Text:", input_text)
# print("Tokenized IDs:", tokenized_output["input_ids"])
# print("Attention Mask:", tokenized_output["attention_mask"])
# print(tokenized_output)
# print(len(tokenized_output["input_ids"][0]))
# print("Embeddings Shape:", embeddings.shape)
# print("tokenizer.vocab_size:", tokenizer.vocab_size)

torch.Size([1, 8, 512])
torch.Size([1, 8])
torch.Size([1, 8, 512])
torch.Size([8, 8])
torch.Size([1, 8, 8])


![连接](./decoder_img/emb_to_mha.png)

In [None]:
# 3. how to use embedding to generate the final answer?
## embedding + final_mask

### 构造 Q K V

In [None]:
### my purpose

# load tokenizer and decoder/encoder/encoder-decoder model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# tokenizer
inputs = tokenizer("Hello, my dog is cute.", return_tensors="pt")
print(inputs)

# use decoder to generate
generated_ids = model.generate(**inputs)
# decoder to text
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print("Generated Text:", generated_text)


# use encoder to embedding
outputs = model(**inputs)
# get the last layer's hidden state
last_hidden_states = outputs.last_hidden_state
print("Embeddings shape:", last_hidden_states.shape)