In [48]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
import unicodedata
import re
import random
from io import open
import time
import math
import matplotlib.pyplot as plt
from tqdm import tqdm

# Device used for every tensor and module below: CUDA when available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Vocabulary indices reserved for the start-of-sequence and end-of-sequence symbols
SOS_TOKEN = 0
EOS_TOKEN = 1
MAX_LENGTH = 10  # maximum sentence length (used to filter pairs and to size attention buffers)

# Language class: holds the vocabulary of one language
class Lang:
    """Word <-> index vocabulary for a single language.

    Indices 0 and 1 are pre-assigned to 'SOS' and 'EOS' in ``index2word``
    (they are not entered into ``word2index``), so the first real word
    receives index 2.
    """

    def __init__(self, language):
        self.language = language  # name of the language
        self.word2index = {}  # word -> index
        self.index2word = {0: 'SOS', 1: 'EOS'}  # index -> word, seeded with SOS/EOS
        self.size = 2  # number of entries in index2word; also the next free index

    # Register a single word
    def AddWord(self, word):
        """Assign the next free index to ``word`` unless it is already known."""
        if word in self.word2index:
            return
        next_index = self.size
        self.word2index[word] = next_index
        self.index2word[next_index] = word
        self.size = next_index + 1

    # Register every word of a sentence
    def AddSentence(self, sentence):
        """Split ``sentence`` on single spaces and register each piece via AddWord."""
        for token in sentence.split(" "):
            self.AddWord(token)

In [None]:
# Character normalisation
def unicodeToAscii(text):
    """Remove combining marks from ``text`` (for example 'é' becomes 'e').

    The text is NFD-decomposed and every character whose Unicode category is
    'Mn' (nonspacing mark) is dropped. All other characters are kept as they
    are after decomposition.
    """
    decomposed = unicodedata.normalize('NFD', text)
    kept_chars = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            continue  # skip accents and other nonspacing marks
        kept_chars.append(ch)
    return ''.join(kept_chars)

def normalizeString(text):
    """Lower-case, de-accent and simplify ``text`` into space-separated tokens.

    Steps:
      1. lower-case, strip surrounding whitespace and remove accents
         (via ``unicodeToAscii``);
      2. put a space in front of each '.', '!' or '?' so punctuation becomes
         its own token;
      3. replace every run of characters other than letters and '.', '!', '?'
         with a single space.

    The result is stripped again: step 3 can introduce a leading or trailing
    space (e.g. when the text starts or ends with a quote or a digit), which
    would otherwise yield empty-string tokens after ``split(' ')`` and defeat
    prefix checks based on ``str.startswith``.

    Returns:
        The normalised string.
    """
    s = unicodeToAscii(text.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s.strip()

In [None]:
# Read the data file and do the initial processing
def readLangs(path, input_lang, output_lang):
    """Load tab-separated sentence pairs and create empty vocabularies.

    Args:
        path: path of a UTF-8 text file with one tab-separated entry per line.
        input_lang: name given to the input-language ``Lang`` object.
        output_lang: name given to the output-language ``Lang`` object.

    Returns:
        ``(input_lang, output_lang, pairs)`` where the first two are freshly
        created ``Lang`` objects (no sentences added yet) and ``pairs`` is a
        list with one entry per non-blank line; each entry is the list of
        normalised tab-separated fields of that line.
    """
    # The context manager guarantees the file handle is closed after reading.
    with open(path, encoding='utf-8') as data_file:
        raw_lines = data_file.read().split('\n')

    # Blank lines (typically the one produced by a trailing newline) carry no
    # sentence pair, so they are skipped instead of becoming one-element entries.
    pairs = [
        [normalizeString(text) for text in line.split('\t')]
        for line in raw_lines
        if line.strip()
    ]
    input_lang = Lang(input_lang)  # vocabulary object for the input language
    output_lang = Lang(output_lang)  # vocabulary object for the output language
    return input_lang, output_lang, pairs

In [None]:
# Read the English and French data
path = 'data/eng-fra.txt'  # path of the data file
# NOTE(review): the result of this unfiltered read is rebound by the later
# `preprocess(path, 'eng', 'fra')` call, which reads the file again.
input_lang, output_lang, pairs = readLangs(path, "eng", "fra")

# The dataset is fairly large, so for this demo only sentences starting with one of these prefixes are kept
eng_prefixes = ("i am ", "i m ", "he is", "he s ", "she is", "she s ")

In [None]:
# Data filter: keep only short pairs whose English side starts with an allowed prefix
def filterPair(p):
    """Return True when the pair ``p`` should be kept for training.

    A pair is kept when both ``p[0]`` and ``p[1]`` contain fewer than
    ``MAX_LENGTH`` whitespace-separated words and ``p[0]`` starts with one of
    ``eng_prefixes``.

    Entries with fewer than two fields (for example a line without a tab) are
    rejected instead of raising ``IndexError`` on ``p[1]``.
    """
    if len(p) < 2:
        return False
    source, target = p[0], p[1]
    return (
        len(source.split()) < MAX_LENGTH
        and len(target.split()) < MAX_LENGTH
        and any(source.startswith(prefix) for prefix in eng_prefixes)
    )

def filterPairs(pairs):
    """Return a new list holding the entries of ``pairs`` accepted by ``filterPair``."""
    return list(filter(filterPair, pairs))

# Preprocessing: read, filter, and build both vocabularies
def preprocess(path, input_lang, output_lang):
    """Read the data file, filter the pairs and populate the vocabularies.

    Args:
        path: path of the data file, forwarded to ``readLangs``.
        input_lang: name of the input language, forwarded to ``readLangs``.
        output_lang: name of the output language, forwarded to ``readLangs``.

    Returns:
        ``(input_lang, output_lang, pairs)``: the two ``Lang`` objects, filled
        with the words of the kept sentences, and the filtered list of pairs.
    """
    source_vocab, target_vocab, all_pairs = readLangs(path, input_lang, output_lang)
    kept_pairs = filterPairs(all_pairs)  # drop pairs rejected by filterPair
    for entry in kept_pairs:
        source_sentence = entry[0]
        target_sentence = entry[1]
        source_vocab.AddSentence(source_sentence)  # words of the input sentence
        target_vocab.AddSentence(target_sentence)  # words of the output sentence
    return source_vocab, target_vocab, kept_pairs


# Build the filtered sentence pairs and both vocabularies used by the rest of the notebook.
input_lang, output_lang, pairs = preprocess(path, 'eng', 'fra')  # preprocess the data

In [None]:
# Convert a sentence to its tensor representation
def TensorFormSentence(Lang, sentence):
    """Encode ``sentence`` as a column tensor of word indices ending with EOS.

    Args:
        Lang: vocabulary object whose ``word2index`` maps each word to its index.
        sentence: string of words separated by single spaces; every word must
            be present in ``Lang.word2index`` (a missing word raises KeyError).

    Returns:
        A ``torch.long`` tensor of shape (number of words + 1, 1) on ``device``.
    """
    vocab = Lang.word2index
    token_ids = []
    for token in sentence.split(' '):
        token_ids.append(vocab[token])  # look up the index of each word
    token_ids.append(EOS_TOKEN)  # terminate with the end-of-sequence index
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    return flat.view(-1, 1)

# Convert a sentence pair to its tensor representation
def TensorFormPairs(input_lang, output_lang, pair):
    """Encode ``pair[0]`` with ``input_lang`` and ``pair[1]`` with ``output_lang``.

    Returns:
        A tuple ``(input_tensor, output_tensor)`` as produced by
        ``TensorFormSentence``.
    """
    return (
        TensorFormSentence(input_lang, pair[0]),
        TensorFormSentence(output_lang, pair[1]),
    )

In [72]:
# Encoder
class Encoder(nn.Module):
    """GRU encoder that consumes one token index per call.

    Args:
        input_size: number of rows of the embedding table (vocabulary size).
        hidden_size: size of both the embedding vectors and the GRU state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        # Word-embedding layer
        self.embedding = nn.Embedding(input_size, hidden_size, device=device)
        # Recurrent layer
        self.gru = nn.GRU(hidden_size, hidden_size, device=device)

    def forward(self, input_tensor, hidden):
        """Run one encoder step.

        Returns:
            ``(output, hidden)`` from the GRU; the embedded input is reshaped
            to (1, 1, -1) before being fed to it.
        """
        token_vec = self.embedding(input_tensor)
        token_vec = token_vec.view(1, 1, -1)  # (1, 1, hidden_size)
        gru_out, next_hidden = self.gru(token_vec, hidden)
        return gru_out, next_hidden

    def init_hidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        state_shape = (1, 1, self.hidden_size)
        return torch.zeros(*state_shape, device=device)

In [73]:
# Decoder with an attention mechanism
class AttentionDecoder(nn.Module):
    """GRU decoder that attends over a fixed-length buffer of encoder outputs.

    Args:
        hidden_size: size of the embeddings and of the GRU state.
        output_size: number of rows of the embedding table and width of the
            output layer (target vocabulary size).
        dropout: dropout probability applied to the embedded input token.
        max_length: number of attention slots; ``encoder_outputs`` passed to
            ``forward`` must have this many rows for the ``bmm`` to succeed.
    """

    def __init__(self, hidden_size, output_size, dropout=0.1, max_length=MAX_LENGTH):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.max_length = max_length

        concat_size = 2 * hidden_size  # width of [embedding ; hidden] and [embedding ; context]

        # Word-embedding layer
        self.embedding = nn.Embedding(output_size, hidden_size, device=device)
        # Produces one attention score per encoder slot
        self.attn = nn.Linear(concat_size, max_length).to(device)
        # Merges the embedding with the attention-weighted context
        self.attn_combine = nn.Linear(concat_size, hidden_size).to(device)
        # Recurrent layer
        self.gru = nn.GRU(hidden_size, hidden_size).to(device)
        # Dropout on the embedded token
        self.dropout = nn.Dropout(dropout).to(device)
        # Output projection onto the vocabulary
        self.linear = nn.Linear(hidden_size, output_size).to(device)

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoding step.

        Returns:
            ``(log_probs, hidden, attn_weights)``: log-softmax scores of shape
            (1, output_size), the new GRU hidden state, and the attention
            weights of shape (1, max_length).
        """
        token_vec = self.embedding(input).view(1, 1, -1)
        token_vec = self.dropout(token_vec)  # (1, 1, hidden_size)
        step_embedding = token_vec[0]  # (1, hidden_size)

        # Attention weights from the current embedding and the previous hidden state
        attn_scores = self.attn(torch.cat((step_embedding, hidden[0]), dim=1))
        attn_weights = F.softmax(attn_scores, dim=1)  # (1, max_length)

        # Attention-weighted sum of the encoder outputs
        context = torch.bmm(
            attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0)
        )  # (1, 1, hidden_size)

        # Concatenate embedding and context, then project back to hidden_size
        merged = torch.cat((step_embedding, context[0]), 1)  # (1, 2 * hidden_size)
        gru_input = F.relu(self.attn_combine(merged).unsqueeze(0))  # (1, 1, hidden_size)

        gru_output, hidden = self.gru(gru_input, hidden)
        logits = self.linear(gru_output[0])  # (1, output_size)
        log_probs = F.log_softmax(logits, dim=1)

        return log_probs, hidden, attn_weights

    def init_hidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size)."""
        state_shape = (1, 1, self.hidden_size)
        return torch.zeros(*state_shape, device=device)

In [74]:
# Seq2Seq model
class Seq2Seq(nn.Module):
    """Wraps an encoder and an attention decoder into one trainable module.

    The encoder must provide ``init_hidden()`` and ``hidden_size``; the decoder
    must provide ``output_size`` and be callable as
    ``decoder(decoder_input, decoder_hidden, encoder_outputs)``.
    """

    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_tensor, target_tensor, teacher_forcing_ratio=0.5):
        """Encode ``input_tensor`` and decode ``target_tensor.size(0)`` steps.

        Args:
            input_tensor: token indices of the source sentence, indexed along
                dimension 0 (one token per step).
            target_tensor: token indices of the target sentence, indexed along
                dimension 0.
            teacher_forcing_ratio: probability that this whole sequence is
                decoded with the ground-truth token fed as the next input.

        Returns:
            Tensor of shape (target_length, output_size) with the decoder
            output of every step.
        """
        input_length = input_tensor.size(0)  # length of the input sequence
        target_length = target_tensor.size(0)  # length of the target sequence

        # Size the attention buffer from the decoder itself, so a decoder
        # built with a non-default ``max_length`` still receives a matching
        # number of rows; fall back to MAX_LENGTH for decoders without it.
        attn_length = getattr(self.decoder, "max_length", MAX_LENGTH)

        encoder_hidden = self.encoder.init_hidden()
        # Rows beyond input_length stay zero. An input longer than
        # attn_length raises IndexError in the loop below.
        encoder_outputs = torch.zeros(
            attn_length, self.encoder.hidden_size, device=device
        )

        # Encoding phase
        for ei in range(input_length):
            encoder_output, encoder_hidden = self.encoder(
                input_tensor[ei], encoder_hidden
            )
            encoder_outputs[ei] = encoder_output[0, 0]  # output of this time step

        # Decoder starts from the SOS token and the final encoder hidden state
        decoder_input = torch.tensor([[SOS_TOKEN]], device=device)  # (1, 1)
        decoder_hidden = encoder_hidden
        all_decoder_outputs = torch.zeros(
            target_length, self.decoder.output_size, device=device
        )  # (target_length, output_size)

        # Teacher forcing is decided once per sequence
        use_teacher_force = random.random() < teacher_forcing_ratio

        # Decoding phase
        for di in range(target_length):
            decoder_output, decoder_hidden, _ = self.decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            all_decoder_outputs[di] = decoder_output  # store this step's output

            if use_teacher_force:
                # Feed the ground-truth token as the next input
                decoder_input = target_tensor[di]
            else:
                # Feed the most probable predicted token, detached from the graph
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze().detach()

        return all_decoder_outputs

In [None]:

# Training function
def train(input_tensor, output_tensor, model, encoder_optimizer, decoder_optimizer, criterion, teacher_forcing_ratio=0.5):
    """Run one optimisation step on a single sentence pair.

    Args:
        input_tensor: encoded source sentence.
        output_tensor: encoded target sentence; its size along dimension 0 is
            the number of decoding steps.
        model: callable as ``model(input_tensor, output_tensor, teacher_forcing_ratio)``
            and returning one row of scores per target step.
        encoder_optimizer, decoder_optimizer: optimisers that are zeroed
            before and stepped after the backward pass.
        criterion: loss applied per step to a (1, output_size) row and the
            matching target entry.
        teacher_forcing_ratio: forwarded to ``model``.

    Returns:
        The summed loss divided by the number of target steps, as a float.
    """
    # Clear gradients left over from the previous step
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    n_steps = output_tensor.size(0)

    # Forward pass through the Seq2Seq model
    step_scores = model(input_tensor, output_tensor, teacher_forcing_ratio)

    # Sum the per-step losses; each row is reshaped to (1, output_size)
    loss = sum(
        criterion(step_scores[step].unsqueeze(0), output_tensor[step])
        for step in range(n_steps)
    )

    loss.backward()  # back-propagation

    # Parameter update
    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / n_steps


In [None]:
# Training loop
def trainIters(model, n_iters, learning_rate=0.01, plot_every=100):
    """Train ``model`` for ``n_iters`` steps on randomly drawn sentence pairs.

    Each step draws one pair from the module-level ``pairs``, encodes it with
    the module-level ``input_lang`` / ``output_lang`` vocabularies and calls
    ``train`` on it. The encoder and decoder each get their own SGD optimiser
    and the loss is ``nn.NLLLoss``.

    Args:
        model: object exposing ``encoder`` and ``decoder`` sub-modules.
        n_iters: number of training steps.
        learning_rate: learning rate of both SGD optimisers.
        plot_every: number of steps between progress-bar updates.
    """
    window_loss = 0  # loss accumulated since the last report
    total_loss = 0  # loss accumulated since the start
    encoder_optimizer = optim.SGD(model.encoder.parameters(), lr=learning_rate)  # plain SGD
    decoder_optimizer = optim.SGD(model.decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()  # negative log-likelihood loss

    tbar = tqdm(range(n_iters), desc='epoch', leave=False)

    for step in tbar:
        pair = random.choice(pairs)  # pick one sentence pair at random
        input_tensor, output_tensor = TensorFormPairs(input_lang, output_lang, pair)

        loss = train(input_tensor, output_tensor, model, encoder_optimizer, decoder_optimizer, criterion)

        window_loss += loss
        total_loss += loss

        # Report after every full window of ``plot_every`` steps. Testing
        # ``step + 1`` guarantees the window really holds ``plot_every``
        # losses when it is averaged (testing ``step`` would fire at step 0
        # with a single loss in the window).
        if (step + 1) % plot_every == 0:
            window_avg = window_loss / plot_every
            window_loss = 0
            tbar.set_postfix(
                loss=f"{total_loss / (step + 1):.4f}",
                recent=f"{window_avg:.4f}",
            )

In [None]:
# Build the Seq2Seq model
hidden_size = 256
encoder1 = Encoder(input_lang.size, hidden_size).to(device)  # create the encoder
# NOTE(review): max_length is hard-coded to 10 here; it equals MAX_LENGTH as defined above — keep the two in sync.
attn_decoder1 = AttentionDecoder(hidden_size, output_lang.size, max_length=10, dropout=0.1).to(device)  # create the attention decoder
seq2seq_model = Seq2Seq(encoder1, attn_decoder1).to(device)  # combine both into the Seq2Seq model

# Train the model
trainIters(seq2seq_model, n_iters=5000)

                                                                       

In [None]:
import random
import torch


# Evaluation function
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily translate ``sentence`` with the given encoder and decoder.

    Both modules are switched to evaluation mode for the duration of the call,
    so that layers such as dropout are inactive during inference. Their
    previous training/eval mode is restored on exit, even if an exception is
    raised, which lets training continue unchanged afterwards.

    Args:
        encoder: module called as ``encoder(token, hidden)`` and exposing
            ``init_hidden()`` and ``hidden_size``.
        decoder: module called as ``decoder(input, hidden, encoder_outputs)``
            and returning ``(output, hidden, attention)``.
        sentence: normalised source sentence; it is encoded with the
            module-level ``input_lang`` vocabulary.
        max_length: number of rows of the encoder-output buffer and maximum
            number of decoding steps.

    Returns:
        ``(decoder_words, decoder_attentions)``: the generated words (ending
        with '<EOS>' when the end token was produced) and the attention
        weights of the executed decoding steps, one row per step.
    """
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        # No gradients are needed for evaluation
        with torch.no_grad():
            # Encode the input sentence as a tensor
            input_tensor = TensorFormSentence(input_lang, sentence)
            input_length = input_tensor.size(0)

            # Initial encoder hidden state and zero-filled output buffer
            encoder_hidden = encoder.init_hidden()
            encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

            # Encoder pass
            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
                encoder_outputs[ei] = encoder_output[0, 0]

            # Decoder starts from SOS and the final encoder hidden state
            decoder_input = torch.tensor([[SOS_TOKEN]], device=device)
            decoder_hidden = encoder_hidden
            decoder_words = []  # generated words
            decoder_attentions = torch.zeros(max_length, max_length, device=device)  # attention per step

            # Decoder pass
            for di in range(max_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs
                )
                decoder_attentions[di] = decoder_attention.data  # keep the attention weights

                topv, topi = decoder_output.topk(1)  # index of the most probable word
                if topi.item() == EOS_TOKEN:  # stop at the end-of-sequence token
                    decoder_words.append('<EOS>')
                    break
                else:
                    decoder_words.append(output_lang.index2word[topi.item()])

                # Feed the predicted word back in as the next input
                decoder_input = topi.squeeze().detach()

            return decoder_words, decoder_attentions[:di+1]
    finally:
        # Put both modules back into the mode they were in before the call
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

In [None]:
# Pick random sentence pairs and show the model's prediction for each
def evaluateRandomly(encoder, decoder, n=6):
    """Print source, reference and predicted sentence for ``n`` random pairs.

    Pairs are drawn from the module-level ``pairs`` list and translated with
    ``evaluate``; nothing is returned.
    """
    separator = '-' * 50
    for _ in range(n):
        pair = random.choice(pairs)  # draw one sentence pair
        print(f'输入> {pair[0]}')
        print(f'输出> {pair[1]}')
        # Only the generated words are needed here; the attention matrix is ignored
        output_words, _attn = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)  # join the words into a sentence
        print(f'预测结果> {output_sentence}')
        print(separator)

# Call the evaluation helper
evaluateRandomly(encoder1, attn_decoder1)  # print predictions for a few random pairs


输入> i m looking forward to seeing you .
输出> j ai hate de vous voir .
预测结果> je me de de te . . <EOS>
--------------------------------------------------
输入> he is what we call a scholar .
输出> il est ce qu on appelle un lettre .
预测结果> c est un homme un homme . . <EOS>
--------------------------------------------------
输入> he is enrolled at that university .
输出> il est entre a cette universite .
预测结果> il est en train de . . <EOS>
--------------------------------------------------
输入> i m certain .
输出> je suis sur .
预测结果> je suis en . . <EOS>
--------------------------------------------------
输入> she is always fishing for compliments .
输出> elle est toujours en quete de compliments .
预测结果> elle a la le . . . <EOS>
--------------------------------------------------
输入> he is popular with everybody .
输出> il est populaire aupres de tout le monde .
预测结果> il est le de de . . . <EOS>
--------------------------------------------------
