In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import numpy as np

# Special token ids: start-of-sequence and end-of-sequence.
SOS_TOKEN, EOS_TOKEN = 0, 1
# Maximum sequence length; also the default number of attention slots
# used by the attention decoder defined further down.
MAX_LENGTH = 10
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = (
    torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
)

# 编码器层


编码器层结构如下:

![](./images/image-20241103212541411.png)

它的组成部分如下:
1. embedding层
2. GRU 层

每次输入一个词的索引, 先通过 Embedding 层将其转化为向量, 接着送入 GRU 神经网络, 最后返回当前时间步 GRU 的输出和隐藏状态。



In [2]:
# Encoder
class Encoder(nn.Module):
    """Single-layer GRU encoder that consumes one token index per call.

    Args:
        input_size: Number of rows in the embedding table (source vocabulary
            size).
        hidden_size: Embedding width; also used as the GRU input size and
            hidden size.
    """

    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        # Token index -> dense vector of width hidden_size.
        self.embedding = nn.Embedding(input_size, hidden_size, device=device)
        self.gru = nn.GRU(hidden_size, hidden_size, device=device)

    def forward(self, input_tensor, hidden):
        """Run one GRU step for a single token.

        Args:
            input_tensor: Index tensor expected to hold a single token id;
                its embedding is reshaped to (1, 1, hidden_size).
            hidden: Previous hidden state, shape (1, 1, hidden_size).

        Returns:
            Tuple ``(output, hidden)``, each of shape (1, 1, hidden_size).
        """
        embedded = self.embedding(input_tensor).view(1, 1, -1)
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def init_hidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size).

        The state is created on the same device and with the same dtype as
        the embedding weights, so it stays usable after the module has been
        moved or cast (e.g. ``.to(...)`` or ``.double()``).
        """
        weight = self.embedding.weight
        return torch.zeros(
            1, 1, self.hidden_size, device=weight.device, dtype=weight.dtype
        )

# 带注意力机制的解码器

公式如下:

$$
\text{Attention}(Q, K, V) = \text{Softmax}\left(\frac{Q K^T}{\sqrt{d_k}}\right) V
$$

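上面的公式是注意力机制的一般形式(缩放点积注意力): 传入对应的 Q、K、V, 就会得到一个经过注意力加权后的向量。下面的 AttentionDecoder 采用的是一种简化的实现: 将当前输入的词嵌入与上一时间步的隐藏状态拼接, 经过线性层和 Softmax 得到注意力权重, 再用该权重对编码器各时间步的输出加权求和, 得到上下文向量。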


In [3]:
# Decoder with attention
class AttentionDecoder(nn.Module):
    """GRU decoder that attends over a fixed-length buffer of encoder outputs.

    Attention weights come from a linear layer applied to the concatenation
    of the current token embedding and the previous hidden state, so the
    number of attention slots is fixed to ``max_length``.

    Args:
        hidden_size: Embedding width and GRU hidden size.
        output_size: Number of rows in the embedding table and width of the
            output layer (target vocabulary size).
        dropout: Dropout probability applied to the token embedding.
        max_length: Number of encoder positions scored by the attention layer.
    """

    def __init__(self, hidden_size, output_size, dropout=0.1, max_length=MAX_LENGTH):
        super(AttentionDecoder, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.max_length = max_length

        # Token index -> dense vector of width hidden_size.
        self.embedding = nn.Embedding(output_size, hidden_size, device=device)
        # Scores every encoder slot from [embedding ; hidden].
        self.attn = nn.Linear(2 * hidden_size, max_length).to(device)
        # Projects [embedding ; context] back down to hidden_size.
        self.attn_combine = nn.Linear(2 * hidden_size, hidden_size).to(device)
        self.gru = nn.GRU(hidden_size, hidden_size).to(device)
        # Dropout has no parameters or buffers, so it needs no device placement.
        self.dropout = nn.Dropout(dropout)
        # Maps the GRU output to vocabulary scores.
        self.linear = nn.Linear(hidden_size, output_size).to(device)

    def forward(self, input, hidden, encoder_outputs):
        """Decode one step.

        Args:
            input: Index tensor expected to hold a single token id; its
                embedding is reshaped to (1, 1, hidden_size).
            hidden: Previous hidden state, shape (1, 1, hidden_size).
            encoder_outputs: Encoder output buffer of shape
                (max_length, hidden_size); the first dimension must equal
                ``max_length`` for the batched matmul below to be valid.

        Returns:
            Tuple ``(output, hidden, attn_weights)`` where ``output`` holds
            log-probabilities of shape (1, output_size), ``hidden`` has shape
            (1, 1, hidden_size) and ``attn_weights`` has shape (1, max_length).
        """
        embedded = self.dropout(
            self.embedding(input).view(1, 1, -1)
        )  # (1, 1, hidden_size)

        # Attention weights over the encoder slots.
        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), dim=1)), dim=1
        )  # (1, max_length)

        # Weighted sum of encoder outputs (the context vector).
        attn_applied = torch.bmm(
            attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0)
        )  # (1, 1, hidden_size)

        # Merge the embedding with the context vector.
        output = torch.cat((embedded[0], attn_applied[0]), 1)  # (1, 2 * hidden_size)
        output = self.attn_combine(output).unsqueeze(0)  # (1, 1, hidden_size)
        output = F.relu(output)  # (1, 1, hidden_size)

        output, hidden = self.gru(output, hidden)  # both (1, 1, hidden_size)
        output = self.linear(output[0])  # (1, output_size)
        output = F.log_softmax(output, dim=1)  # (1, output_size)

        return output, hidden, attn_weights

    def init_hidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size).

        The state is created on the same device and with the same dtype as
        the embedding weights, so it stays usable after the module has been
        moved or cast.
        """
        weight = self.embedding.weight
        return torch.zeros(
            1, 1, self.hidden_size, device=weight.device, dtype=weight.dtype
        )

# Seq2Seq网络

结构如下:

![](./images/Seq2SeqStruction.png)

它的组成是:
1. encode层
2. decode层

decode终止输出的情况有两种:
1. 到达最大预测的长度
2. 遇到结束符


教师强制(Teacher Forcing)机制:

在训练过程中，模型在初期阶段往往表现不佳，其输出结果可能频繁与正确答案不匹配，导致收敛速度较慢。为加速模型收敛，我们会以一定的概率(teacher_forcing_ratio)将真实标签而不是模型自己的预测结果作为解码器下一个时间步的输入，以此对输出进行合理的纠正，使模型逐步接近正确答案，从而提高训练效果。


In [4]:
# Seq2Seq model
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that processes a single sequence token by token.

    Args:
        encoder: Module exposing ``hidden_size``, ``init_hidden()`` and a
            ``forward(token, hidden)`` returning ``(output, hidden)``.
        decoder: Module exposing ``output_size`` and a
            ``forward(token, hidden, encoder_outputs)`` returning
            ``(log_probs, hidden, attn_weights)``. If it has a ``max_length``
            attribute, that value sizes the encoder-output buffer.
    """

    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_tensor, target_tensor, teacher_forcing_ratio=0.5):
        """Encode ``input_tensor`` and decode ``target_tensor.size(0)`` steps.

        Args:
            input_tensor: Source token ids, indexed along dim 0 (one token
                per encoder step).
            target_tensor: Target token ids, indexed along dim 0; its length
                fixes the number of decoding steps.
            teacher_forcing_ratio: Probability that this whole call feeds the
                ground-truth token (instead of the model's own prediction)
                as the next decoder input.

        Returns:
            Tensor of shape (target_length, output_size) with the decoder
            output of every step. Decoding always runs for ``target_length``
            steps; there is no early stop on an end-of-sequence token.

        Raises:
            IndexError: If the input has more tokens than the encoder-output
                buffer has slots.
        """
        input_length = input_tensor.size(0)
        target_length = target_tensor.size(0)

        # Allocate on the caller's device instead of a module-level global.
        run_device = input_tensor.device
        # The decoder's attention layer scores a fixed number of slots, so the
        # buffer must match the decoder's own max_length when it has one.
        max_length = getattr(self.decoder, "max_length", MAX_LENGTH)
        if input_length > max_length:
            raise IndexError(
                f"input length {input_length} exceeds max_length {max_length}"
            )

        encoder_hidden = self.encoder.init_hidden()
        encoder_outputs = torch.zeros(
            max_length, self.encoder.hidden_size, device=run_device
        )

        # Encoding: one token per step; unused trailing slots stay zero.
        for ei in range(input_length):
            encoder_output, encoder_hidden = self.encoder(
                input_tensor[ei], encoder_hidden
            )
            encoder_outputs[ei] = encoder_output[0, 0]

        # Decoding starts from the start-of-sequence token and the final
        # encoder hidden state.
        decoder_input = torch.tensor([[SOS_TOKEN]], device=run_device)  # (1, 1)
        decoder_hidden = encoder_hidden
        all_decoder_outputs = torch.zeros(
            target_length, self.decoder.output_size, device=run_device
        )

        # Decided once per call, not once per step.
        use_teacher_force = random.random() < teacher_forcing_ratio

        for di in range(target_length):
            decoder_output, decoder_hidden, _ = self.decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            all_decoder_outputs[di] = decoder_output
            if use_teacher_force:
                # Feed the ground-truth token as the next input.
                decoder_input = target_tensor[di]
            else:
                # Feed the most likely token; detach so no gradient flows
                # through the sampled index.
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze().detach()

        return all_decoder_outputs

我们自己模拟一些数据测试一下,看看整个网络能不能跑通

In [7]:
input_size = 10  # source vocabulary size
output_size = 10  # target vocabulary size
hidden_size = 256  # GRU hidden size
batch_size = 1  # a single sequence is used for this smoke test
max_length = MAX_LENGTH  # length of the random input/target sequences
teacher_forcing_ratio = 0.5  # probability of feeding ground-truth tokens to the decoder

# Seed every RNG involved (weight init, random data, teacher-forcing draw)
# so that re-running this cell reproduces the same output.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Input sequence, shape (input_length, batch_size)
input_tensor = torch.randint(0, input_size, (max_length, 1), device=device)
# Target sequence, shape (target_length, batch_size)
target_tensor = torch.randint(0, output_size, (max_length, 1), device=device)

# Build the encoder, the decoder and the Seq2Seq wrapper
encoder = Encoder(input_size, hidden_size).to(device)
decoder = AttentionDecoder(hidden_size, output_size).to(device)
seq2seq_model = Seq2Seq(encoder, decoder).to(device)

# Forward pass through the whole network
output = seq2seq_model(input_tensor, target_tensor, teacher_forcing_ratio)

# Sanity checks: one row per target step, and every row is a log-probability
# distribution (the decoder ends with log_softmax).
assert output.shape == (max_length, output_size)
assert torch.allclose(
    output.exp().sum(dim=1),
    torch.ones(max_length, device=output.device),
    atol=1e-4,
)

print("输出的形状:", output.shape)  
print("输出结果:", output)

输出的形状: torch.Size([10, 10])
输出结果: tensor([[-2.2461, -2.2819, -2.4620, -2.2147, -2.4411, -2.2934, -2.2494, -2.1325,
         -2.3702, -2.3839],
        [-2.2118, -2.3072, -2.4071, -2.2543, -2.3731, -2.3399, -2.3419, -2.2153,
         -2.2430, -2.3539],
        [-2.2386, -2.2638, -2.3979, -2.2506, -2.3947, -2.3371, -2.3556, -2.1821,
         -2.2624, -2.3682],
        [-2.2188, -2.2772, -2.3897, -2.3419, -2.3797, -2.2719, -2.3886, -2.2583,
         -2.1884, -2.3350],
        [-2.2116, -2.1980, -2.3550, -2.3035, -2.3246, -2.2725, -2.4459, -2.2368,
         -2.3017, -2.4052],
        [-2.1951, -2.2575, -2.3521, -2.3529, -2.3478, -2.3122, -2.4296, -2.2681,
         -2.2390, -2.2926],
        [-2.2118, -2.2135, -2.3671, -2.3687, -2.3352, -2.2857, -2.3720, -2.2856,
         -2.2745, -2.3280],
        [-2.2952, -2.2617, -2.2988, -2.3434, -2.3602, -2.2751, -2.4099, -2.2849,
         -2.2504, -2.2581],
        [-2.2218, -2.2963, -2.3636, -2.3404, -2.3504, -2.2878, -2.4835, -2.2837,
         -2.2