In [1]:
import json
import random
import hashlib
import torch
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader, Subset
from transformers import (
    AutoTokenizer,
    T5ForConditionalGeneration,
    get_cosine_schedule_with_warmup
)
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import random

  torch.utils._pytree._register_pytree_node(


In [2]:
class Config:
    train_path = "./data/DuReaderQG/train.json"
    valid_path = "./data/DuReaderQG/dev.json"
    # model_path = "./model"
    model_checkpoint = 'langboat/mengzi-t5-base'
    save_dir = "./best_models"
    max_source_length = 1024
    max_target_length = 128
    batch_size = 4
    accum_steps = 4
    epochs = 30
    val_samples_per_epoch = 1
    seed = 42
    valid_shuffle = True  

In [3]:
# 保证了所有 PyTorch 随机数（如 dropout、权重初始化）都能被固定，确保实验可复现。
torch.manual_seed(Config.seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# ================== 数据加载 ==================
def load_train_data(path):
    """加载训练数据（保持原始格式）"""
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]
    
def load_valid_data(path):
    """加载验证数据（合并相同context+question）"""
    grouped = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                sample = json.loads(line)
                key = hashlib.md5(
                    (sample["context"] + sample["question"]).encode()
                ).hexdigest()
                if key not in grouped:
                    grouped[key] = {
                        "context": sample["context"],
                        "question": sample["question"],
                        "answers": [],
                        "ids": []
                    }
                grouped[key]["answers"].append(sample["answer"])
                grouped[key]["ids"].append(sample["id"])
    return list(grouped.values()) 


Using device: cuda


In [4]:
# 数据检查点
print("\n=== 数据加载检查 ===")
train_data = load_train_data(Config.train_path)
valid_data = load_valid_data(Config.valid_path)
print(f"训练集样本数: {len(train_data)}")
print(f"验证集样本数: {len(valid_data)} (合并后)")


=== 数据加载检查 ===
训练集样本数: 14520
验证集样本数: 700 (合并后)


In [5]:
train_data[0], valid_data[0]

({'context': '第35集雪见缓缓张开眼睛，景天又惊又喜之际，长卿和紫萱的仙船驶至，见众人无恙，也十分高兴。众人登船，用尽合力把自身的真气和水分输给她。雪见终于醒过来了，但却一脸木然，全无反应。众人向常胤求助，却发现人世界竟没有雪见的身世纪录。长卿询问清微的身世，清微语带双关说一切上了天界便有答案。长卿驾驶仙船，众人决定立马动身，往天界而去。众人来到一荒山，长卿指出，魔界和天界相连。由魔界进入通过神魔之井，便可登天。众人至魔界入口，仿若一黑色的蝙蝠洞，但始终无法进入。后来花楹发现只要有翅膀便能飞入。于是景天等人打下许多乌鸦，模仿重楼的翅膀，制作数对翅膀状巨物。刚佩戴在身，便被吸入洞口。众人摔落在地，抬头发现魔界守卫。景天和众魔套交情，自称和魔尊重楼相熟，众魔不理，打了起来。',
  'answer': '第35集',
  'question': '仙剑奇侠传3第几集上天界',
  'id': 0},
 {'context': '年基准利率4.35%。 从实际看,贷款的基本条件是: 一是中国大陆居民,年龄在60岁以下; 二是有稳定的住址和工作或经营地点; 三是有稳定的收入来源; 四是无不良信用记录,贷款用途不能作为炒股,赌博等行为; 五是具有完全民事行为能力。',
  'question': '2017年银行贷款基准利率',
  'answers': ['年基准利率4.35%', '4.35%'],
  'ids': [0, 1]})

In [6]:
class QADataset(Dataset):
    def __init__(self, data, is_train=True):
        self.data = data
        self.is_train = is_train

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        if self.is_train:
            return {
                "context": item["context"],
                "question": item["question"],
                "answer": item["answer"],
                "id": item["id"]
            }
        else:
            return {
                "context": item["context"],
                "question": item["question"],
                "answers": item["answers"],
                "id": item["ids"][0]
            }

train_dataset = QADataset(train_data, is_train=True)
valid_dataset = QADataset(valid_data, is_train=False)

In [7]:
train_dataset[0], valid_dataset[0]

({'context': '第35集雪见缓缓张开眼睛，景天又惊又喜之际，长卿和紫萱的仙船驶至，见众人无恙，也十分高兴。众人登船，用尽合力把自身的真气和水分输给她。雪见终于醒过来了，但却一脸木然，全无反应。众人向常胤求助，却发现人世界竟没有雪见的身世纪录。长卿询问清微的身世，清微语带双关说一切上了天界便有答案。长卿驾驶仙船，众人决定立马动身，往天界而去。众人来到一荒山，长卿指出，魔界和天界相连。由魔界进入通过神魔之井，便可登天。众人至魔界入口，仿若一黑色的蝙蝠洞，但始终无法进入。后来花楹发现只要有翅膀便能飞入。于是景天等人打下许多乌鸦，模仿重楼的翅膀，制作数对翅膀状巨物。刚佩戴在身，便被吸入洞口。众人摔落在地，抬头发现魔界守卫。景天和众魔套交情，自称和魔尊重楼相熟，众魔不理，打了起来。',
  'question': '仙剑奇侠传3第几集上天界',
  'answer': '第35集',
  'id': 0},
 {'context': '年基准利率4.35%。 从实际看,贷款的基本条件是: 一是中国大陆居民,年龄在60岁以下; 二是有稳定的住址和工作或经营地点; 三是有稳定的收入来源; 四是无不良信用记录,贷款用途不能作为炒股,赌博等行为; 五是具有完全民事行为能力。',
  'question': '2017年银行贷款基准利率',
  'answers': ['年基准利率4.35%', '4.35%'],
  'id': 0})

In [8]:
""" 把验证集按轮次（epoch）均匀地拆分成多份，以便在每个 epoch 只评估其中一部分样本，从而减少一次评估的开销或实现跨 epoch 的完整覆盖。"""
class ValidChunker:
    def __init__(self, dataset, total_epochs):
        self.dataset = dataset
        self.total_samples = len(dataset)
        self.total_epochs = total_epochs
        self.samples_per_epoch = self.total_samples // self.total_epochs
        
        self.indices = torch.arange(self.total_samples).tolist()
        if Config.valid_shuffle:
            random.shuffle(self.indices)
        
        # 处理不能整除的情况
        if self.total_samples % self.total_epochs != 0:
            pad = self.total_epochs - (self.total_samples % self.total_epochs)
            self.indices += self.indices[:pad]
            self.total_samples += pad

    def get_chunk_indices(self, epoch):
        start = epoch * self.samples_per_epoch
        end = start + self.samples_per_epoch
        return self.indices[start:end]

In [9]:
valid_chunker = ValidChunker(valid_dataset, Config.epochs)
tokenizer = AutoTokenizer.from_pretrained(Config.model_checkpoint)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [10]:
def train_collate(batch_data):
    batch_inputs = [f"question: {x['question']} context: {x['context']}" for x in batch_data]
    batch_targets = [x["answer"] for x in batch_data]
    
    sources = tokenizer(
        batch_inputs,
        max_length=Config.max_source_length,
        truncation=True,
        padding=True,
        return_tensors="pt"
    )
    targets = tokenizer(
        batch_targets,
        max_length=Config.max_target_length,
        truncation=True,
        padding=True,
        return_tensors="pt"
    ).input_ids
    
    targets[targets == tokenizer.pad_token_id] = -100
    return {
        "input_ids": sources.input_ids,
        "attention_mask": sources.attention_mask,
        "labels": targets
    }
    

In [11]:
def valid_collate(batch):
    batch_inputs = [f"question: {x['question']} context: {x['context']}" for x in batch]
    processed = tokenizer(
        batch_inputs,
        max_length=Config.max_source_length,
        truncation=True,
        padding=True,
        return_tensors="pt"
    )
    return {
        "input_ids": processed.input_ids,
        "attention_mask": processed.attention_mask,
        "contexts": [x["context"] for x in batch],
        "questions": [x["question"] for x in batch],
        "answers": [x["answers"] for x in batch],
        "ids": [x["id"] for x in batch]
    }

In [12]:
def valid_collate(batch):
    """专注于前向推理和评估所需的所有信息"""
    inputs = [f"question: {x['question']} context: {x['context']}" for x in batch]
    processed = tokenizer(
        inputs,
        max_length=Config.max_source_length,
        truncation=True,
        padding=True,
        return_tensors="pt"
    )
    return {
        "input_ids": processed.input_ids,
        "attention_mask": processed.attention_mask,
        "contexts": [x["context"] for x in batch],
        "questions": [x["question"] for x in batch],
        "answers": [x["answers"] for x in batch],
        "ids": [x["id"] for x in batch]
    }

In [13]:
train_loader = DataLoader(train_dataset, batch_size=Config.batch_size, shuffle=True, collate_fn=train_collate)
valid_loader = DataLoader(valid_dataset, batch_size=Config.batch_size,collate_fn=valid_collate)

In [14]:
# 假设你已经定义好 train_dataloader
batch = next(iter(train_loader))  # train_loader 就是你的 DataLoader 对象

# 将张量移动到 CPU，便于查看
for key, value in batch.items():
    print(f"{key}:")
    print(value if isinstance(value, list) else value.shape)
    print("-" * 50)

input_ids:
torch.Size([4, 422])
--------------------------------------------------
attention_mask:
torch.Size([4, 422])
--------------------------------------------------
labels:
torch.Size([4, 7])
--------------------------------------------------


In [15]:
print("input_ids 示例:")
print(batch["input_ids"][0])  # 第一个样本的 token id 序列
print("解码为文本:")
print(tokenizer.decode(batch["input_ids"][0], skip_special_tokens=True))

print("\nlabels 示例:")
print(batch["labels"][0])
print("解码为文本:")
label_ids = batch["labels"][0]
# 替换掉 -100（忽略值）为 pad_token_id，再解码
label_ids = [id if id != -100 else tokenizer.pad_token_id for id in label_ids]
print(tokenizer.decode(label_ids, skip_special_tokens=True))

input_ids 示例:
tensor([    7,  3454,  2055, 21293, 15807,    13,     7, 22316,   271,   930,
          217,   373,     7, 25395,  7368,  1733,  1550,    13,     7, 22316,
          271,  2273,    13, 19227,   864,  1709,  5334, 24841,  1290, 15312,
           25,   226,   373,    13, 20829,   373,    22, 17518, 14711, 27436,
        11641,  9459,  4988, 20737, 12252,  1334, 25018,  8692, 18732, 24948,
           25,   271,  2855,    13, 22316,   864,  2137,  2055, 20560,     7,
        20814, 12303, 11979,   908,  8909, 16903,  5334, 24841,  1290, 15312,
           25,   480,  1365,   617,    13, 18607,  1365,  2357,   193,     6,
          281,     6,  1057,    86, 22316, 12123, 14886,   276,     3,  6212,
         6425,    86, 16149,  1641, 22316,   190,   841,     6,  2881,   387,
         4446, 20829,  2273,  2218,  2114,  1038,     4,   480,  1365,   239,
           13, 18607,   289,    23,    45,   191, 19227,     6,  4976,    86,
        22316,   387,  3821, 14654, 23527,     3, 

In [16]:
class BleuEvaluator:
    def __init__(self):
        # BLEU 在遇到 n-gram 零计数时会变为 0，smoothing 可以避免完全为 0 的极端情况，使得短句或稀疏 n-gram 的得分更平滑、更有区分度。
        self.smooth = SmoothingFunction().method1
        self.weights = {
            1: (1, 0, 0, 0),
            2: (0.5, 0.5, 0, 0),
            3: (1/3, 1/3, 1/3, 0),
            4: (0.25, 0.25, 0.25, 0.25)
        }

    # 这里把整个字符串按 字符（而不是按词）切分成 token 列表，常见于中文处理。
    def calc_bleu(self, pred, refs):
        pred_tokens = list(pred.strip())
        ref_tokens = [list(r.strip()) for r in refs]
        return {
            f"BLEU-{n}": sentence_bleu(
                ref_tokens, pred_tokens,
                weights=self.weights[n],
                smoothing_function=self.smooth
            ) for n in range(1, 5)
        }

    @staticmethod
    # 动态综合得分
    def dynamic_score(scores, pred_len):
        """短回答（如 1–2 字）完全依赖较低阶 n-gram；长回答则更需要高阶 n-gram 来衡量连贯性和词序。"""
        if pred_len <= 2:
            return scores["BLEU-1"] * 0.6 + scores["BLEU-2"] * 0.4
        elif 3 <= pred_len <= 5:
            return scores["BLEU-2"] * 0.5 + scores["BLEU-3"] * 0.3 + scores["BLEU-4"] * 0.2
        else:
            return scores["BLEU-4"] * 0.7 + scores["BLEU-3"] * 0.3

In [17]:
model = T5ForConditionalGeneration.from_pretrained(Config.model_checkpoint).to(device)
print(model)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [18]:
loss_history = []
lr_history = []
bleu1_history = []
bleu2_history = []
bleu3_history = []
bleu4_history = []
dynamic_history = []
evaluator = BleuEvaluator()
all_bleu = []
all_dynamic = []

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

num_training_steps = len(train_loader) * Config.epochs // Config.accum_steps
num_warmup_steps = int(0.1 * num_training_steps)  # 10% warmup，可调

scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)


In [19]:
def train_loop(dataloader, model, optimizer, lr_scheduler, epoch, total_loss):
    progress_bar = tqdm(range(len(dataloader)))
    progress_bar.set_description(f'loss: {0:>7f}')
    finish_batch_num = (epoch-1) * len(dataloader)
    
    model.train()
    for step, batch_data in enumerate(dataloader, start=1):
        batch_data = {k: v.to(device) for k, v in batch_data.items()}
        outputs = model(**batch_data)
        loss = outputs.loss / Config.accum_steps
        loss.backward()

        if (step+1) % Config.accum_steps == 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        total_loss += loss.item() *  Config.accum_steps
        progress_bar.set_description(f'loss: {total_loss/(finish_batch_num + step):>7f}')
        progress_bar.update(1)
        
    return total_loss

In [20]:
def validate(loader, model):
    
    model.eval()
    for batch in loader:
        inputs = batch["input_ids"].to(device)
        with torch.no_grad():
            generated = model.generate(
                inputs,
                max_length=Config.max_target_length,
                num_beams=5,
                early_stopping=True
            )
        preds = tokenizer.batch_decode(generated, skip_special_tokens=True)
        
        for i, (pred, refs) in enumerate(zip(preds, batch["answers"])):
            bleu = evaluator.calc_bleu(pred, refs)
            all_bleu.append(bleu)
            
            pred_len = len(list(pred.strip()))
            dynamic = BleuEvaluator.dynamic_score(bleu, pred_len)
            all_dynamic.append(dynamic)
    
    avg_bleu = {
        f"BLEU-{n}": np.mean([b[f"BLEU-{n}"] for b in all_bleu]) * 100
        for n in range(1, 5)
    }
    avg_dynamic = np.mean(all_dynamic) * 100
    return avg_bleu, avg_dynamic

def save_model(epoch, dynamic_score):
    save_path = Path(Config.save_dir) / f"epoch_{epoch}_dynamic_{dynamic_score:.2f}"
    save_path.mkdir(parents=True, exist_ok=True)
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    print(f"\n模型保存至: {save_path}")


In [21]:
total_loss = 0.
best_dynamic = 0.

for epoch in range(Config.epochs):
    print(f"Epoch {epoch+1}/{Config.epochs}\n-------------------------------")
    total_loss = train_loop(train_loader, model, optimizer, scheduler, epoch+1, total_loss)
    chunk_bleu, dynamic_score = validate(valid_loader, model)

# 拆分各 BLEU 分数
    bleu1_history.append(chunk_bleu["BLEU-1"])
    bleu2_history.append(chunk_bleu["BLEU-2"])
    bleu3_history.append(chunk_bleu["BLEU-3"])
    bleu4_history.append(chunk_bleu["BLEU-4"])
    dynamic_history.append(dynamic_score)

    print(f"Epoch {epoch+1} 验证结果：BLEU-1: {chunk_bleu['BLEU-1']:.2f}%, "
      f"BLEU-2: {chunk_bleu['BLEU-2']:.2f}%, "
      f"BLEU-3: {chunk_bleu['BLEU-3']:.2f}%, "
      f"BLEU-4: {chunk_bleu['BLEU-4']:.2f}%, "
      f"动态得分: {dynamic_score:.2f}%")

    if dynamic_score > best_dynamic:
        best_dynamic = dynamic_score
        save_model(epoch+1, dynamic_score)
    
    if (epoch+1) % 5 == 0 or epoch == Config.epochs-1:
        full_loader = DataLoader(
            valid_dataset,
            batch_size=Config.batch_size,
            collate_fn=valid_collate
        )

        full_bleu , full_avg_dynamic = validate(full_loader, model)
        print(f"\n[完整验证] BLEU-4: {full_bleu['BLEU-4']:.2f}% | 动态得分: {full_avg_dynamic:.2f}%")

Epoch 1/30
-------------------------------


loss: 1.762119: 100%|██████████| 3630/3630 [08:48<00:00,  6.86it/s]


Epoch 1 验证结果：BLEU-1: 77.47%, BLEU-2: 75.32%, BLEU-3: 68.31%, BLEU-4: 62.08%, 动态得分: 73.72%

模型保存至: best_models/epoch_1_dynamic_73.72
Epoch 2/30
-------------------------------


loss: 1.126257: 100%|██████████| 3630/3630 [08:49<00:00,  6.85it/s]


Epoch 2 验证结果：BLEU-1: 78.59%, BLEU-2: 76.59%, BLEU-3: 69.46%, BLEU-4: 63.22%, 动态得分: 74.95%

模型保存至: best_models/epoch_2_dynamic_74.95
Epoch 3/30
-------------------------------


loss: 0.864148: 100%|██████████| 3630/3630 [08:43<00:00,  6.93it/s]


Epoch 3 验证结果：BLEU-1: 78.58%, BLEU-2: 76.57%, BLEU-3: 69.24%, BLEU-4: 62.91%, 动态得分: 74.84%
Epoch 4/30
-------------------------------


loss: 0.712567: 100%|██████████| 3630/3630 [08:44<00:00,  6.92it/s]


Epoch 4 验证结果：BLEU-1: 78.63%, BLEU-2: 76.63%, BLEU-3: 69.40%, BLEU-4: 63.14%, 动态得分: 74.91%
Epoch 5/30
-------------------------------


loss: 0.607310: 100%|██████████| 3630/3630 [08:52<00:00,  6.81it/s]


Epoch 5 验证结果：BLEU-1: 78.33%, BLEU-2: 76.32%, BLEU-3: 69.02%, BLEU-4: 62.68%, 动态得分: 74.55%

[完整验证] BLEU-4: 62.38% | 动态得分: 74.31%
Epoch 6/30
-------------------------------


loss: 0.528563: 100%|██████████| 3630/3630 [08:52<00:00,  6.82it/s]


Epoch 6 验证结果：BLEU-1: 78.05%, BLEU-2: 76.01%, BLEU-3: 68.69%, BLEU-4: 62.29%, 动态得分: 74.20%
Epoch 7/30
-------------------------------


loss: 0.470695: 100%|██████████| 3630/3630 [09:04<00:00,  6.66it/s]


Epoch 7 验证结果：BLEU-1: 77.90%, BLEU-2: 75.85%, BLEU-3: 68.54%, BLEU-4: 62.18%, 动态得分: 74.07%
Epoch 8/30
-------------------------------


loss: 0.425744: 100%|██████████| 3630/3630 [08:57<00:00,  6.76it/s]


Epoch 8 验证结果：BLEU-1: 77.80%, BLEU-2: 75.73%, BLEU-3: 68.45%, BLEU-4: 62.09%, 动态得分: 73.95%
Epoch 9/30
-------------------------------


loss: 0.389001: 100%|██████████| 3630/3630 [09:03<00:00,  6.68it/s]


Epoch 9 验证结果：BLEU-1: 77.85%, BLEU-2: 75.77%, BLEU-3: 68.50%, BLEU-4: 62.13%, 动态得分: 73.98%
Epoch 10/30
-------------------------------


loss: 0.357560: 100%|██████████| 3630/3630 [09:07<00:00,  6.64it/s]


Epoch 10 验证结果：BLEU-1: 77.68%, BLEU-2: 75.57%, BLEU-3: 68.29%, BLEU-4: 61.93%, 动态得分: 73.78%

[完整验证] BLEU-4: 61.76% | 动态得分: 73.60%
Epoch 11/30
-------------------------------


loss: 0.330642: 100%|██████████| 3630/3630 [09:04<00:00,  6.66it/s]


Epoch 11 验证结果：BLEU-1: 77.44%, BLEU-2: 75.29%, BLEU-3: 68.00%, BLEU-4: 61.64%, 动态得分: 73.50%
Epoch 12/30
-------------------------------


loss: 0.307831: 100%|██████████| 3630/3630 [08:59<00:00,  6.72it/s]


Epoch 12 验证结果：BLEU-1: 77.42%, BLEU-2: 75.26%, BLEU-3: 68.03%, BLEU-4: 61.67%, 动态得分: 73.47%
Epoch 13/30
-------------------------------


loss: 0.291099:  85%|████████▍ | 3071/3630 [07:38<01:23,  6.73it/s]

KeyboardInterrupt: 