In [11]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import spacy
import datasets
import torchtext
import tqdm
import evaluate
from datasets import load_dataset

In [12]:
# Seed every random number generator the notebook relies on so reruns are repeatable.
seed = 1234

random.seed(seed)  # stdlib RNG; Seq2Seq.forward draws from it for teacher forcing
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
# Request deterministic cuDNN kernels.
torch.backends.cudnn.deterministic = True

# 数据集Dataset

#### 下面被注释掉的第一行代码的作用是：修改 Hugging Face 的默认 API 访问地址，使其指向镜像站 hf-mirror.com 而非官方源 (https://huggingface.co)。
#### 被注释掉的第二行代码的作用是：从 Hugging Face Hub 加载名为 bentrevett/multi30k 的数据集；当前实际执行的是第三行 `load_dataset("multi30k")`。

In [13]:
# Disabled alternative: point the Hugging Face client at the hf-mirror.com mirror
# and load "bentrevett/multi30k" from the Hub.
# os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
# dataset = datasets.load_dataset("bentrevett/multi30k")
# NOTE(review): "multi30k" presumably resolves to a local copy of the same
# dataset (the splits below expose 'en'/'de' columns) — confirm.
dataset = load_dataset("multi30k")


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [14]:
dataset

DatasetDict({
    train: Dataset({
        features: ['en', 'de'],
        num_rows: 29000
    })
    validation: Dataset({
        features: ['en', 'de'],
        num_rows: 1014
    })
    test: Dataset({
        features: ['en', 'de'],
        num_rows: 1000
    })
})

In [15]:
# Pull the three splits out of the DatasetDict.
train_data, valid_data, test_data = (
    dataset["train"],
    dataset["validation"],
    dataset["test"],
)

In [16]:
train_data[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.'}

In [17]:
# spaCy pipelines for English and German; this notebook only calls their `.tokenizer`.
en_nlp = spacy.load("en_core_web_sm")
de_nlp = spacy.load("de_core_news_sm")

In [18]:
# Quick check of the English tokenizer on a sample sentence.
string = "What a lovely day it is today!"

[token.text for token in en_nlp.tokenizer(string)]

['What', 'a', 'lovely', 'day', 'it', 'is', 'today', '!']

### 这个tokenize_example函数用于 对双语（英语和德语）句子进行分词和预处理，通常用于机器翻译任务（如 multi30k 数据集）

1.标准化文本处理：统一分词、大小写和长度，便于后续转换为词嵌入或模型输入。

2.适配序列模型：添加 sos 和 eos 标记，帮助 Transformer 等模型识别句子边界。

3.控制输入维度：通过 max_length 避免过长的序列影响训练效率（如 GPU 内存不足）。

In [19]:
def tokenize_example(example, en_nlp, de_nlp, max_length, lower, sos_token, eos_token):
    """Tokenize the English and German sides of one example.

    Args:
        example: Mapping with the raw sentences under "en" and "de".
        en_nlp: Object whose `.tokenizer(text)` yields items with a `.text` attribute.
        de_nlp: Same as `en_nlp`, used for the German sentence.
        max_length: Maximum number of tokens kept per sentence. The cut happens
            before the boundary markers are added, so each returned list holds
            at most `max_length + 2` items.
        lower: When truthy, every kept token is lower-cased.
        sos_token: Marker placed at the front of each token list.
        eos_token: Marker placed at the end of each token list.

    Returns:
        Dict with the keys "en_tokens" and "de_tokens".
    """

    def _prepare(text, nlp):
        # Tokenize, truncate, optionally lower-case, then wrap in the markers.
        words = [piece.text for piece in nlp.tokenizer(text)][:max_length]
        if lower:
            words = [word.lower() for word in words]
        return [sos_token, *words, eos_token]

    return {
        "en_tokens": _prepare(example["en"], en_nlp),
        "de_tokens": _prepare(example["de"], de_nlp),
    }

1. `<sos>` 是 Start of Sentence 的缩写，表示句子的开始，帮助模型知道何时开始生成输出。

2. `<eos>` 是 End of Sentence 的缩写，表示句子的结束，告诉模型何时停止生成。

3.dataset.map() 是 Hugging Face datasets 库的方法，用于对数据集中的每个样本应用一个处理函数，此处为tokenize_example

In [20]:
# Tokenization settings shared by all three splits.
max_length = 1_000  # per-sentence token cap, applied before <sos>/<eos> are added
lower = True
sos_token = "<sos>"
eos_token = "<eos>"

# Keyword arguments handed to tokenize_example for every example.
fn_kwargs = {
    "en_nlp": en_nlp,
    "de_nlp": de_nlp,
    "max_length": max_length,
    "lower": lower,
    "sos_token": sos_token,
    "eos_token": eos_token,
}

# Each split gains the "en_tokens" and "de_tokens" columns returned by tokenize_example.
train_data = train_data.map(tokenize_example, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(tokenize_example, fn_kwargs=fn_kwargs)
test_data = test_data.map(tokenize_example, fn_kwargs=fn_kwargs)

Map:   0%|          | 0/29000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1014 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [21]:
train_data[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'en_tokens': ['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>'],
 'de_tokens': ['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>']}

In [22]:
# Vocabulary settings: minimum token frequency and the two extra special tokens.
min_freq = 2
unk_token = "<unk>"
pad_token = "<pad>"

# The same specials, in the same order, are given to both vocabularies.
special_tokens = [
    unk_token,
    pad_token,
    sos_token,
    eos_token,
]

# Both vocabularies are built from the training split only.
en_vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["en_tokens"],
    min_freq=min_freq,
    specials=special_tokens,
)

de_vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["de_tokens"],
    min_freq=min_freq,
    specials=special_tokens,
)

In [23]:
en_vocab.get_itos()[:10]

['<unk>', '<pad>', '<sos>', '<eos>', 'a', '.', 'in', 'the', 'on', 'man']

In [24]:
de_vocab.get_itos()[:10]

['<unk>', '<pad>', '<sos>', '<eos>', '.', 'ein', 'einem', 'in', 'eine', ',']

In [25]:
en_vocab["the"]

7

In [26]:
# A single unk_index / pad_index is used for both languages further down,
# so the two vocabularies must agree on them.
assert en_vocab[unk_token] == de_vocab[unk_token]
assert en_vocab[pad_token] == de_vocab[pad_token]

unk_index = en_vocab[unk_token]
pad_index = en_vocab[pad_token]

In [27]:
# Tokens missing from a vocabulary are looked up as unk_index.
en_vocab.set_default_index(unk_index)
de_vocab.set_default_index(unk_index)

In [28]:
# Sample lookup; the recorded output shows "crime" mapping to index 0 (<unk>).
tokens = ["i", "love", "watching", "crime", "shows"]
en_vocab.lookup_indices(tokens)

[956, 2169, 173, 0, 821]

In [29]:
en_vocab.lookup_tokens(en_vocab.lookup_indices(tokens))

['i', 'love', 'watching', '<unk>', 'shows']

### 这两个单元格的代码共同完成了将分词后的文本转换为数值ID序列的过程。

1.第一段代码的作用是将分词后的英文和德文单词列表分别转换为对应的数值ID序列：通过查询词汇表（en_vocab和de_vocab）的lookup_indices方法，将example中的"en_tokens"和"de_tokens"每个单词替换为词汇表中的整数编号（如"cat"→42），最终返回包含"en_ids"和"de_ids"这两个数值序列的新字典。

2.这段代码使用fn_kwargs固定传递词汇表参数，通过dataset.map()将numericalize_example函数批量应用到数据集的所有样本上，为每条数据新增数值ID列（"en_ids"和"de_ids"），同时保留原始的分词列。例如，将["the","cat"]转换为[5,12]这样的数字序列，而原始文本仍可查看。

In [30]:
def numericalize_example(example, en_vocab, de_vocab):
    """Convert the token lists of one example into vocabulary indices.

    Args:
        example: Mapping holding the token lists under "en_tokens" and "de_tokens".
        en_vocab: Object whose `lookup_indices(tokens)` returns the English ids.
        de_vocab: Object whose `lookup_indices(tokens)` returns the German ids.

    Returns:
        Dict with the keys "en_ids" and "de_ids".
    """
    return {
        "en_ids": en_vocab.lookup_indices(example["en_tokens"]),
        "de_ids": de_vocab.lookup_indices(example["de_tokens"]),
    }

In [31]:
# Vocabularies forwarded to numericalize_example for every example.
fn_kwargs = {"en_vocab": en_vocab, "de_vocab": de_vocab}

# Each split gains the "en_ids" and "de_ids" columns; the existing columns stay.
train_data = train_data.map(numericalize_example, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(numericalize_example, fn_kwargs=fn_kwargs)
test_data = test_data.map(numericalize_example, fn_kwargs=fn_kwargs)

Map:   0%|          | 0/29000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1014 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [32]:
train_data[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'en_tokens': ['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>'],
 'de_tokens': ['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>'],
 'en_ids': [2, 16, 24, 15, 25, 778, 17, 57, 80, 202, 1312, 5, 3],
 'de_ids': [2, 18, 26, 253, 30, 84, 20, 88, 7, 15, 110, 7647, 3171, 4, 3]}

In [33]:
# Serve the id columns as torch tensors. output_all_columns=True keeps the
# remaining columns (raw text and token lists) available in their original form.
data_type = "torch"
format_columns = ["en_ids", "de_ids"]

train_data = train_data.with_format(
    type=data_type, columns=format_columns, output_all_columns=True
)

valid_data = valid_data.with_format(
    type=data_type,
    columns=format_columns,
    output_all_columns=True,
)

test_data = test_data.with_format(
    type=data_type,
    columns=format_columns,
    output_all_columns=True,
)

In [34]:
train_data[0]

{'en_ids': tensor([   2,   16,   24,   15,   25,  778,   17,   57,   80,  202, 1312,    5,
            3]),
 'de_ids': tensor([   2,   18,   26,  253,   30,   84,   20,   88,    7,   15,  110, 7647,
         3171,    4,    3]),
 'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'en_tokens': ['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>'],
 'de_tokens': ['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>']}

### 这两个函数共同完成了数据批处理和数据加载器创建的工作，是准备训练数据的关键步骤。

#### 1、get_collate_fn函数（数据批处理函数工厂），生成一个专门处理序列数据的collate函数。

核心功能：接收一个batch的样本(包含不等长的en_ids和de_ids)，使用pad_sequence进行填充对齐(padding_value指定填充值)，返回填充后的整齐batch数据。

#### 2、get_data_loader函数（数据加载器创建函数），创建可直接用于训练的数据加载器

核心功能：调用get_collate_fn获取定制的批处理函数，创建DataLoader实例并配置关键参数如下：

dataset：要加载的数据集

batch_size：批大小

collate_fn：使用上面创建的定制批处理函数

shuffle：是否打乱数据

In [35]:
def get_collate_fn(pad_index):
    """Create a collate function that pads id sequences with `pad_index`.

    Args:
        pad_index: Value written into the padded positions.

    Returns:
        A callable that takes a list of examples (each with "en_ids" and
        "de_ids" tensors) and returns a dict with the same two keys. Each value
        is a tensor of shape [longest sequence, batch size], because
        `pad_sequence` is called with its default time-major layout.
    """

    def collate_fn(batch):
        padded = {}
        for column in ("en_ids", "de_ids"):
            sequences = [example[column] for example in batch]
            padded[column] = nn.utils.rnn.pad_sequence(
                sequences, padding_value=pad_index
            )
        return padded

    return collate_fn

In [36]:
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    """Wrap `dataset` in a DataLoader that pads each batch.

    Args:
        dataset: Dataset whose examples provide "en_ids" and "de_ids".
        batch_size: Number of examples per batch.
        pad_index: Padding value handed to `get_collate_fn`.
        shuffle: Whether the loader shuffles the examples (default False).

    Returns:
        The configured `torch.utils.data.DataLoader`.
    """
    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=get_collate_fn(pad_index),
        shuffle=shuffle,
    )


In [37]:
batch_size = 128

# Only the training loader shuffles; validation and test keep the default shuffle=False.
train_data_loader = get_data_loader(train_data, batch_size, pad_index, shuffle=True)
valid_data_loader = get_data_loader(valid_data, batch_size, pad_index)
test_data_loader = get_data_loader(test_data, batch_size, pad_index)

### Encoder类实现了一个基于LSTM的序列编码器，主要用于将输入的符号序列编码为隐藏状态表示。

1. **输入参数：**

+ input_dim：源语言词汇表大小

+ embedding_dim：词嵌入维度

+ hidden_dim：LSTM隐藏层维度

+ n_layers：LSTM层数

+ dropout：dropout概率

2. **核心组件：**

+ nn.Embedding：将离散的单词索引转换为连续的词向量

+ nn.LSTM：处理序列的多层单向LSTM（本实现未设置`bidirectional=True`）

+ nn.Dropout：防止过拟合

3. **forward处理流程：**

+ 输入形状：`# src = [src length, batch size]`

+ 词嵌入+dropout：`embedded = self.dropout(self.embedding(src))`

+ LSTM编码：`outputs, (hidden, cell) = self.rnn(embedded)`

其中：

outputs：所有时间步的顶层隐藏状态

hidden：最后时间步的所有层隐藏状态

cell：最后时间步的所有层细胞状态

4. **输出说明：**

返回元组`(hidden, cell)`：

hidden形状：`[n layers, batch size, hidden dim]`

cell形状：`[n layers, batch size, hidden dim]`

PS：如果是双向LSTM，`n layers`需要乘以2

In [38]:
class Encoder(nn.Module):
    """LSTM encoder that reduces a source sequence to its final LSTM states.

    Args:
        input_dim: Number of rows in the embedding table (source vocabulary size).
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability, used on the embeddings and between LSTM layers.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        # Exposed so Seq2Seq can check that encoder and decoder are compatible.
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode `src` and return the final `(hidden, cell)` states.

        Args:
            src: Token ids, shape [src length, batch size].

        Returns:
            Tuple `(hidden, cell)`, each of shape [n layers, batch size, hidden dim].
            The per-step outputs of the top layer are discarded.
        """
        token_vectors = self.embedding(src)
        # [src length, batch size, embedding dim]
        rnn_input = self.dropout(token_vectors)
        _, (hidden, cell) = self.rnn(rnn_input)
        return hidden, cell

###  Decoder类的工作流程详解

1. **初始化阶段：**

- **参数说明：**

    + output_dim：目标语言词汇表大小

    + embedding_dim：词嵌入维度（可以与Encoder不同，Seq2Seq只检查hidden_dim和n_layers是否一致）

    + hidden_dim：LSTM隐藏层维度（需与Encoder一致）

    + n_layers：LSTM层数（需与Encoder一致）

    + dropout：dropout概率

2. **核心组件：**

- nn.Embedding：将目标语言的单词索引转换为词向量

- nn.LSTM：单步解码的LSTM单元（与Encoder结构对称）

- nn.Linear：将隐藏状态映射到词汇表空间（fc_out）

- nn.Dropout：正则化层

3. **forward流程：**

a) 输入预处理：`input = input.unsqueeze(0)  # [batch_size] -> [1, batch_size]`，将当前时间步的输入（单个单词索引）扩展为序列形式。

b)词嵌入+dropout：`embedded = self.dropout(self.embedding(input))  # [1, batch_size, embedding_dim]`

c) LSTM解码：`output, (hidden, cell) = self.rnn(embedded, (hidden, cell))`，接收来自Encoder的hidden/cell作为初始状态，输出当前时间步的隐藏状态output和更新后的LSTM状态。

d) 词汇预测：`prediction = self.fc_out(output.squeeze(0))  # [batch_size, output_dim]`，通过全连接层计算每个单词的未归一化得分（logits）；softmax 由损失函数 `nn.CrossEntropyLoss` 在内部完成。

**4.输出说明：**

- prediction：当前时间步对目标词汇表的未归一化得分（logits，用于计算loss）

- hidden/cell：更新后的LSTM状态（传递给下一个时间步）

In [39]:
class Decoder(nn.Module):
    """LSTM decoder that predicts one target token per call.

    Args:
        output_dim: Target vocabulary size (embedding rows and output scores).
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability, used on the embeddings and between LSTM layers.
    """

    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        # Exposed so Seq2Seq can size its output buffer and check compatibility.
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        """Run a single decoding step.

        Args:
            input: Current token ids, shape [batch size].
            hidden: Previous hidden state, shape [n layers, batch size, hidden dim].
            cell: Previous cell state, shape [n layers, batch size, hidden dim].

        Returns:
            Tuple `(prediction, hidden, cell)`. `prediction` holds the raw
            (un-normalized) scores over the target vocabulary, shape
            [batch size, output dim]; `hidden` and `cell` are the updated states.
        """
        # Add a length-1 time axis: [batch size] -> [1, batch size].
        step_tokens = input.unsqueeze(0)
        step_vectors = self.dropout(self.embedding(step_tokens))
        # step_vectors = [1, batch size, embedding dim]
        output, (hidden, cell) = self.rnn(step_vectors, (hidden, cell))
        # output = [1, batch size, hidden dim]; drop the time axis before projecting.
        top_layer = output.squeeze(0)
        prediction = self.fc_out(top_layer)
        return prediction, hidden, cell

### `Seq2Seq`类的详细解释

1. **类结构概述**

`Seq2Seq`类是一个标准的编码器-解码器架构，包含：

- 编码器（encoder）：将源语言序列编码为上下文向量

- 解码器（decoder）：基于上下文向量逐步生成目标语言序列

- 设备（device）：指定模型运行设备（CPU/GPU）

- 一致性检查：确保编码器和解码器的隐藏层维度和层数匹配

2. **forward函数流程**

**输入：**

- `src`：源语言序列，形状为[src长度, batch大小]

- `trg`：目标语言序列，形状为[trg长度, batch大小]

- `teacher_forcing_ratio`：teacher forcing概率（0.0~1.0）

**处理步骤：**

a)初始化输出张量：`outputs = torch.zeros(trg_length, batch_size, trg_vocab_size).to(self.device)`，创建一个全零张量用于存储所有时间步的解码器输出。

b)编码阶段：`hidden, cell = self.encoder(src)`，编码器处理源语言序列，返回最终的隐藏状态和细胞状态，作为解码器的初始状态。

c)解码阶段：

+ 初始输入：使用目标序列的第一个token（通常是`<sos>`），`input = trg[0, :]`
+ 循环解码：
``` python
for t in range(1, trg_length):
    output, hidden, cell = self.decoder(input, hidden, cell)
    outputs[t] = output
    # teacher forcing逻辑
    teacher_force = random.random() < teacher_forcing_ratio
    top1 = output.argmax(1)  # 获取预测最可能的token
    input = trg[t] if teacher_force else top1
 ```
- 每个时间步：

    + 解码器接收当前输入和LSTM状态，生成输出和新状态

    + 存储当前时间步的输出

    + 根据teacher_forcing_ratio决定下一个输入是真实token还是预测token

**输出：**

- outputs：形状为`[trg长度, batch大小, trg词汇表大小]`的张量，包含每个时间步对目标词汇的未归一化得分（logits）；其中第0个时间步保持全零，计算loss时会被跳过。

3. **Teacher Forcing机制**

Teacher Forcing是一种训练技巧，其核心思想是：

- 概率性选择输入：

    + 以teacher_forcing_ratio的概率使用真实目标序列中的token作为下一个解码器输入。

    + 以1 - teacher_forcing_ratio的概率使用解码器自己的预测作为下一个输入。

- 作用：

    + 使用真实token（teacher forcing）可以加速模型收敛，防止早期训练时错误累积。

    + 使用预测token（非teacher forcing）可以让模型学习处理自己的错误，提高鲁棒性。
 
- 典型设置：

    + 训练初期：高`teacher_forcing_ratio`（如0.75）

    + 训练后期：逐步降低该比例

    + 推理阶段：始终使用预测token（`teacher_forcing_ratio=0`）
 
4. 设计要点

- 状态传递：编码器的最终状态完整传递给解码器，包含源序列的全部上下文信息。

- 序列生成：解码是自回归过程，每个时间步依赖前一步的输出。

- 灵活性：通过teacher_forcing_ratio可以灵活控制训练行为。

In [40]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one step at a time.

    Args:
        encoder: Module returning `(hidden, cell)` for a source batch; must
            expose `hidden_dim` and `n_layers`.
        decoder: Module mapping `(input, hidden, cell)` to
            `(prediction, hidden, cell)`; must expose `hidden_dim`,
            `n_layers` and `output_dim`.
        device: Device on which the output buffer is allocated.

    Raises:
        AssertionError: If encoder and decoder disagree on hidden size or layer count.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        # The encoder's final states seed the decoder, so the shapes must line up.
        assert (
            encoder.hidden_dim == decoder.hidden_dim
        ), "Hidden dimensions of encoder and decoder must be equal!"
        assert (
            encoder.n_layers == decoder.n_layers
        ), "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio):
        """Produce decoder scores for every target position after the first.

        Args:
            src: Source token ids, shape [src length, batch size].
            trg: Target token ids, shape [trg length, batch size]; row 0 is fed
                to the decoder as its first input.
            teacher_forcing_ratio: Probability, drawn once per step with
                `random.random()`, of feeding the ground-truth token instead
                of the decoder's own best guess as the next input.

        Returns:
            Tensor of shape [trg length, batch size, output dim]. Row 0 is
            never written and stays zero.
        """
        trg_length, batch_size = trg.shape[0], trg.shape[1]
        # One slot of scores per target position.
        outputs = torch.zeros(
            trg_length, batch_size, self.decoder.output_dim, device=self.device
        )
        # The encoder's final states initialise the decoder.
        hidden, cell = self.encoder(src)
        decoder_input = trg[0, :]
        for t in range(1, trg_length):
            step_scores, hidden, cell = self.decoder(decoder_input, hidden, cell)
            # step_scores = [batch size, output dim]
            outputs[t] = step_scores
            # One draw per step decides the whole batch's next input.
            use_ground_truth = random.random() < teacher_forcing_ratio
            best_guess = step_scores.argmax(1)
            if use_ground_truth:
                decoder_input = trg[t]
            else:
                decoder_input = best_guess
        return outputs

In [41]:
# German is the source language (encoder vocabulary); English is the target (decoder vocabulary).
input_dim = len(de_vocab)
output_dim = len(en_vocab)
encoder_embedding_dim = 256
decoder_embedding_dim = 256
hidden_dim = 512
n_layers = 2
encoder_dropout = 0.5
decoder_dropout = 0.5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the encoder
encoder = Encoder(
    input_dim,
    encoder_embedding_dim,
    hidden_dim,
    n_layers,
    encoder_dropout,
)

# Build the decoder
decoder = Decoder(
    output_dim,
    decoder_embedding_dim,
    hidden_dim,
    n_layers,
    decoder_dropout,
)

# Combine both into the Seq2Seq model and move it to the selected device
model = Seq2Seq(encoder, decoder, device).to(device)

In [42]:
def init_weights(m):
    """Fill every parameter reachable from `m` with samples from U(-0.08, 0.08).

    The parameters are overwritten in place. The walk is recursive, so when
    this is passed to `Module.apply` the parameters of nested modules are
    re-drawn once for each enclosing module as well.

    Args:
        m: The module whose parameters are re-initialised.
    """
    for weight in m.parameters():
        nn.init.uniform_(weight.data, a=-0.08, b=0.08)


# Module.apply calls init_weights on every submodule and then on the model itself.
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7853, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(5893, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [43]:
def count_parameters(model):
    """Return the total number of elements in the trainable parameters of `model`.

    Parameters with `requires_grad` set to False are not counted.
    """
    total = 0
    for parameter in model.parameters():
        if parameter.requires_grad:
            total += parameter.numel()
    return total


print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 13,898,501 trainable parameters


In [44]:
# Adam with its default hyper-parameters over all model parameters.
optimizer = optim.Adam(model.parameters())

In [45]:
# Target positions equal to pad_index are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

In [46]:
# 定义训练函数
def train_fn(
    model,
    data_loader,
    optimizer,
    criterion,
    clip,
    teacher_forcing_ratio,
    device,
):
    """Train `model` for one pass over `data_loader`.

    Args:
        model: Callable as `model(src, trg, teacher_forcing_ratio)`, returning
            scores of shape [trg length, batch size, trg vocab size].
        data_loader: Iterable of batches with "de_ids" (source) and "en_ids"
            (target) tensors; must support `len()`.
        optimizer: Optimizer that updates the model parameters.
        criterion: Loss applied to the flattened scores and targets.
        clip: Maximum gradient norm passed to `clip_grad_norm_`.
        teacher_forcing_ratio: Forwarded unchanged to the model.
        device: Device the batch tensors are moved to.

    Returns:
        The mean of the per-batch losses, i.e. their sum divided by `len(data_loader)`.
    """
    # Training mode enables dropout.
    model.train()
    running_loss = 0.0
    for batch in data_loader:
        src = batch["de_ids"].to(device)
        trg = batch["en_ids"].to(device)
        # src = [src length, batch size], trg = [trg length, batch size]

        # Clear gradients left over from the previous batch.
        optimizer.zero_grad()
        scores = model(src, trg, teacher_forcing_ratio)
        vocab_size = scores.shape[-1]

        # Position 0 holds <sos> in the target and is never predicted, so it is
        # dropped from both tensors before they are flattened for the loss.
        flat_scores = scores[1:].view(-1, vocab_size)
        flat_targets = trg[1:].view(-1)
        loss = criterion(flat_scores, flat_targets)

        loss.backward()
        # Cap the gradient norm to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()
    return running_loss / len(data_loader)

In [47]:
def evaluate_fn(model, data_loader, criterion, device):
    """Compute the mean per-batch loss of `model` over `data_loader` without training.

    Args:
        model: Callable as `model(src, trg, teacher_forcing_ratio)`, returning
            scores of shape [trg length, batch size, trg vocab size].
        data_loader: Iterable of batches with "de_ids" (source) and "en_ids"
            (target) tensors; must support `len()`.
        criterion: Loss applied to the flattened scores and targets.
        device: Device the batch tensors are moved to.

    Returns:
        The sum of the per-batch losses divided by `len(data_loader)`.
    """
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for batch in data_loader:
            src = batch["de_ids"].to(device)
            trg = batch["en_ids"].to(device)
            # Ratio 0 turns teacher forcing off: the decoder feeds on its own predictions.
            scores = model(src, trg, 0)
            vocab_size = scores.shape[-1]
            # Drop position 0 (<sos>) and flatten for the loss.
            flat_scores = scores[1:].view(-1, vocab_size)
            flat_targets = trg[1:].view(-1)
            total_loss += criterion(flat_scores, flat_targets).item()
    return total_loss / len(data_loader)

In [59]:
n_epochs = 1 # Only one epoch, because training is computationally expensive.
clip = 1.0
teacher_forcing_ratio = 0.5

# Lowest validation loss seen so far; decides when a checkpoint is written.
best_valid_loss = float("inf")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)  # move the model to the selected device (GPU when available, else CPU)

for epoch in tqdm.tqdm(range(n_epochs)):
    train_loss = train_fn(
        model,
        train_data_loader,
        optimizer,
        criterion,
        clip,
        teacher_forcing_ratio,
        device,
    )
    valid_loss = evaluate_fn(
        model,
        valid_data_loader,
        criterion,
        device,
    )
    # Save the weights only when the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "tut1-model.pt")
    # Perplexity is reported as exp(loss).
    print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
    print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")


100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [10:27<00:00, 627.03s/it]

	Train Loss:   3.625 | Train PPL:  37.507
	Valid Loss:   4.155 | Valid PPL:  63.727





In [60]:
# Restore the weights saved at the lowest validation loss. map_location remaps
# the stored tensors onto the current device, so a checkpoint written on a GPU
# still loads on a CPU-only machine.
model.load_state_dict(torch.load("tut1-model.pt", map_location=device))

<All keys matched successfully>

In [61]:
def translate_sentence(
    sentence,
    model,
    en_nlp,
    de_nlp,
    en_vocab,
    de_vocab,
    lower,
    sos_token,
    eos_token,
    device,
    max_output_length=25,
):
    """Greedily translate one German sentence into English tokens.

    Args:
        sentence: Either a raw string (tokenized with `de_nlp.tokenizer`) or an
            iterable of ready-made tokens.
        model: Object exposing `eval()`, `encoder(src)` and
            `decoder(input, hidden, cell)`.
        en_nlp: Accepted for signature symmetry; not used by this function.
        de_nlp: Object whose `.tokenizer(text)` yields items with a `.text` attribute.
        en_vocab: Target vocabulary (`lookup_indices`, `lookup_tokens`, item access).
        de_vocab: Source vocabulary (`lookup_indices`).
        lower: When truthy, source tokens are lower-cased before lookup.
        sos_token: Start marker added to the source and used as first decoder input.
        eos_token: End marker added to the source; decoding stops once it is predicted.
        device: Device on which the tensors are created.
        max_output_length: Maximum number of decoding steps (default 25).

    Returns:
        List of target tokens, starting with `sos_token` and ending with
        `eos_token` when the model emitted it within the step limit.
    """
    model.eval()
    with torch.no_grad():
        if isinstance(sentence, str):
            words = [piece.text for piece in de_nlp.tokenizer(sentence)]
        else:
            words = list(sentence)
        if lower:
            words = [word.lower() for word in words]
        source_ids = de_vocab.lookup_indices([sos_token, *words, eos_token])
        # Column vector: [src length, 1], i.e. a batch of one sentence.
        source = torch.tensor(source_ids, dtype=torch.long, device=device).unsqueeze(-1)
        hidden, cell = model.encoder(source)
        decoded = en_vocab.lookup_indices([sos_token])
        for _ in range(max_output_length):
            # Feed the most recent token back in as the next input.
            step_input = torch.tensor([decoded[-1]], dtype=torch.long, device=device)
            scores, hidden, cell = model.decoder(step_input, hidden, cell)
            next_id = scores.argmax(-1).item()
            decoded.append(next_id)
            if next_id == en_vocab[eos_token]:
                break
        result = en_vocab.lookup_tokens(decoded)
    return result

In [62]:
# First test pair: the German source sentence and its reference English translation.
sentence = test_data[0]["de"]
expected_translation = test_data[0]["en"]

sentence, expected_translation

('Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt.',
 'A man in an orange hat starring at something.')

In [63]:
# Translate the sample sentence with the restored model; max_output_length keeps its default of 25.
translation = translate_sentence(
    sentence,
    model,
    en_nlp,
    de_nlp,
    en_vocab,
    de_vocab,
    lower,
    sos_token,
    eos_token,
    device,
)

In [64]:
translation

['<sos>',
 'a',
 'man',
 'in',
 'a',
 'blue',
 'shirt',
 'is',
 'a',
 'a',
 '.',
 '.',
 '<eos>']