In [2]:
# 安装必要的组件来消除警告
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension

Looking in indexes: http://mirrors.cloud.aliyuncs.com/pypi/simple/
Collecting ipywidgets
  Downloading http://mirrors.cloud.aliyuncs.com/pypi/packages/58/6a/9166369a2f092bd286d24e6307de555d63616e8ddb373ebad2b5635ca4cd/ipywidgets-8.1.7-py3-none-any.whl (139 kB)
Collecting widgetsnbextension~=4.0.14 (from ipywidgets)
  Downloading http://mirrors.cloud.aliyuncs.com/pypi/packages/ca/51/5447876806d1088a0f8f71e16542bf350918128d0a69437df26047c8e46f/widgetsnbextension-4.0.14-py3-none-any.whl (2.2 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m20.7 MB/s[0m  [33m0:00:00[0m
[?25hCollecting jupyterlab_widgets~=3.0.15 (from ipywidgets)
  Downloading http://mirrors.cloud.aliyuncs.com/pypi/packages/43/6a/ca128561b22b60bd5a0c4ea26649e68c8556b82bc70a0c396eebc977fe86/jupyterlab_widgets-3.0.15-py3-none-any.whl (216 kB)
Installing collected packages: widgetsnbextension, jupyterlab_widgets, ipywidgets
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import warnings
warnings.filterwarnings("ignore", message="IProgress not found. Please update jupyter and ipywidgets.")

from datasets import load_dataset
dataset = load_dataset("yelp_review_full")

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})

In [4]:
!pip install 'accelerate>=0.26.0' -i https://pypi.tuna.tsinghua.edu.cn/simple

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
[0m

In [1]:
pip install --upgrade accelerate>=0.26.0

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
# 取部分样本（1000和500条）进行比较

# 导入必要的库
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from datasets import load_dataset, DatasetDict
import numpy as np
import evaluate
import os

# 确保所有依赖库都已正确安装
def check_dependencies():
    try:
        import torch
        print(f"PyTorch版本: {torch.__version__}")
    except ImportError:
        raise ImportError("请安装PyTorch: pip install torch")
    
    try:
        import accelerate
        from accelerate import __version__ as accelerate_version
        from packaging import version
        required_version = version.parse("0.26.0")
        current_version = version.parse(accelerate_version)
        if current_version < required_version:
            raise ImportError(f"accelerate版本过低 (当前: {accelerate_version}, 要求: >=0.26.0)")
        print(f"accelerate版本: {accelerate_version}")
    except ImportError as e:
        raise ImportError(f"请安装或升级accelerate: pip install 'accelerate>=0.26.0'\n原错误: {str(e)}")

# 检查依赖
check_dependencies()

# 1. 加载数据集（示例：Yelp情感分类数据集，可替换为你的数据）
# 若使用本地数据，可通过 load_dataset("csv", data_files={"train": "train.csv", "test": "test.csv"}) 加载
dataset = load_dataset("yelp_polarity")

# 为了演示，取部分样本（1000和500条）
small_dataset_1k = DatasetDict({
    "train": dataset["train"].shuffle(seed=42).select(range(1000)),
    "test": dataset["test"].shuffle(seed=42).select(range(200))  # 测试集取200条
})
small_dataset_500 = DatasetDict({
    "train": dataset["train"].shuffle(seed=42).select(range(500)),
    "test": dataset["test"].shuffle(seed=42).select(range(200))
})

# 2. 加载BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

# 3. 定义数据预处理函数
def preprocess_function(examples):
    # 处理文本，返回tokenized结果（padding和truncation自动处理）
    return tokenizer(examples["text"], truncation=True, max_length=512)

# 4. 对数据集进行tokenize
tokenized_1k = small_dataset_1k.map(preprocess_function, batched=True)
tokenized_500 = small_dataset_500.map(preprocess_function, batched=True)

# 5. 定义评估指标（准确率）
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

# 6. 定义训练和评估函数
def train_and_evaluate(train_dataset, model_suffix, num_epochs=3):
    # 加载BERT模型（分类头随机初始化，属于正常现象）
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-cased",
        num_labels=2  # Yelp是二分类，根据任务调整
    )
    
    # 数据整理器（自动处理padding）
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    
    # 配置训练参数
    output_dir = f"./yelp_model_{model_suffix}"  # 使用相对路径
    os.makedirs(output_dir, exist_ok=True)  # 创建输出目录
    
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=num_epochs,
        logging_dir=f"{output_dir}/logs",
        logging_steps=10,
        eval_strategy="epoch",  # 使用新参数名
        save_strategy="epoch",
        save_total_limit=2,
        load_best_model_at_end=True,
        seed=42,
        fp16=False  # 若无GPU，保持False
    )
    
    # 初始化Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset["train"],
        eval_dataset=train_dataset["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    
    # 开始训练
    train_results = trainer.train()
    
    # 评估最佳模型
    eval_results = trainer.evaluate()
    
    # 保存模型
    trainer.save_model(f"{output_dir}/final_model")
    
    return {
        "train_loss": train_results.training_loss,
        "eval_accuracy": eval_results["eval_accuracy"],
        "eval_loss": eval_results["eval_loss"]
    }

# 7. 分别训练两组数据集
results_1k = train_and_evaluate(tokenized_1k, "1000_samples")
results_500 = train_and_evaluate(tokenized_500, "500_samples")

# 8. 展示对比结果
print("=== 1000样本训练结果 ===")
print(f"训练损失: {results_1k['train_loss']:.4f}")
print(f"评估准确率: {results_1k['eval_accuracy']:.4f}")
print(f"评估损失: {results_1k['eval_loss']:.4f}\n")

print("=== 500样本训练结果 ===")
print(f"训练损失: {results_500['train_loss']:.4f}")
print(f"评估准确率: {results_500['eval_accuracy']:.4f}")
print(f"评估损失: {results_500['eval_loss']:.4f}")


PyTorch版本: 2.7.1+cu126
accelerate版本: 1.9.0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2808,0.249556,0.91
2,0.1346,0.274717,0.905
3,0.0681,0.351585,0.92


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.307,0.222353,0.925
2,0.1172,0.395594,0.89
3,0.0624,0.393888,0.915


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


=== 1000样本训练结果 ===
训练损失: 0.2059
评估准确率: 0.9100
评估损失: 0.2496

=== 500样本训练结果 ===
训练损失: 0.2187
评估准确率: 0.9250
评估损失: 0.2224


In [1]:
# 全量YelpReviewFull数据集快速训练对比代码
# （优化参数确保1-2小时内完成）

# 确保依赖库完整
!pip install numpy pandas datasets transformers evaluate torch scikit-learn accelerate

# 导入所需库
import os
import numpy as np
import pandas as pd
import datasets
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import evaluate
from IPython.display import display, HTML
import torch

# 设置缓存路径（根据实际情况修改）
os.environ['HF_HOME'] = 'D:/Z_Model/hf'
os.environ['HF_HUB_CACHE'] = 'D:/Z_Model/hf/hub'

# 忽略不必要的警告
import warnings
warnings.filterwarnings("ignore")

# 检查GPU是否可用
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"使用设备: {device}")
if device == "cuda":
    print(f"GPU型号: {torch.cuda.get_device_name(0)}")
    print(f"显存大小: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

# 1. 加载完整数据集
print("\n加载完整YelpReviewFull数据集...")
dataset = load_dataset("yelp_review_full")
print("数据集结构：")
print(dataset)

# 2. 数据预处理（优化版）
print("\n数据预处理...")
# 使用轻量级模型的分词器
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples):
    """高效分词处理，缩短文本长度以加速训练"""
    return tokenizer(
        examples["text"], 
        padding="max_length", 
        truncation=True, 
        max_length=64  # 缩短文本长度，情感分类足够用
    )

# 对全量数据进行分词（使用batched=True加速）
tokenized_train = dataset["train"].map(tokenize_function, batched=True, batch_size=1000)
tokenized_test = dataset["test"].map(tokenize_function, batched=True, batch_size=1000)

# 格式化数据集为PyTorch张量
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
tokenized_train.set_format("torch", columns=["input_ids", "attention_mask", "label"])
tokenized_test.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# 3. 定义评估指标
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# 4. 训练函数（优化版）
def train_with_optimizations(train_dataset, model_name, num_epochs=2):
    """使用优化参数训练模型"""
    # 使用轻量级模型DistilBERT（BERT的蒸馏版，速度快60%，保留95%性能）
    model = AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased", 
        num_labels=5
    ).to(device)
    
    # 优化的训练参数设置
    output_dir = f"D:/Z_Model/hf/hub/yelp_{model_name}"
    training_args = TrainingArguments(
        output_dir=output_dir,
        # 批次大小：根据GPU显存调整，越大越快
        per_device_train_batch_size=32 if device == "cuda" else 8,
        per_device_eval_batch_size=32 if device == "cuda" else 8,
        # 训练轮次：2轮足够在全量数据上收敛
        num_train_epochs=num_epochs,
        # 学习率：稍大以加速收敛
        learning_rate=2e-4,
        # 日志和评估策略：减少评估频率
        logging_dir=f"{output_dir}/logs",
        logging_steps=1000,  # 每1000步才日志一次
        eval_strategy="steps",
        eval_steps=5000,     # 每5000步评估一次
        save_strategy="no",  # 不保存检查点以节省时间
        # 硬件加速
        fp16=True if device == "cuda" else False,  # 启用混合精度训练
        load_best_model_at_end=False,  # 不加载最佳模型以节省时间
        report_to="none",  # 不报告到任何平台
        seed=42,
        # 数据加载优化
        dataloader_num_workers=4 if device == "cuda" else 0,
        dataloader_pin_memory=True if device == "cuda" else False,
    )
    
    # 初始化训练器
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=tokenized_test,
        compute_metrics=compute_metrics,
        data_collator=data_collator,
    )
    
    # 开始训练
    print(f"\n===== 开始训练 {model_name} =====")
    train_results = trainer.train()
    
    # 最终评估
    eval_results = trainer.evaluate()
    return {
        "train_loss": train_results.training_loss,
        "eval_accuracy": eval_results["eval_accuracy"],
        "eval_loss": eval_results["eval_loss"]
    }

# 5. 准备两组对比数据（全量数据的不同采样策略）
# 组1：全量数据（650,000条）
# 组2：全量数据的50%（325,000条）- 用于对比
print("\n准备对比数据集...")
full_train_100 = tokenized_train  # 100%全量数据
full_train_50 = tokenized_train.shuffle(seed=42).select(range(325000))  # 50%全量数据

# 6. 分别训练两组数据集
results_100 = train_with_optimizations(full_train_100, "full_100", num_epochs=2)
results_50 = train_with_optimizations(full_train_50, "full_50", num_epochs=2)

# 7. 展示对比结果
comparison = pd.DataFrame({
    "数据集规模": ["650,000条 (100%)", "325,000条 (50%)"],
    "训练损失": [
        f"{results_100['train_loss']:.4f}",
        f"{results_50['train_loss']:.4f}"
    ],
    "测试准确率": [
        f"{results_100['eval_accuracy']:.4f}",
        f"{results_50['eval_accuracy']:.4f}"
    ],
    "测试损失": [
        f"{results_100['eval_loss']:.4f}",
        f"{results_50['eval_loss']:.4f}"
    ],
    "准确率差距": [
        "-",
        f"{(results_50['eval_accuracy'] - results_100['eval_accuracy']):.4f}"
    ]
})

print("\n===== 全量数据集对比结果 =====")
display(HTML(comparison.to_html(index=False)))

# 8. 结果分析
print("\n===== 结果分析 =====")
acc_diff = results_100["eval_accuracy"] - results_50["eval_accuracy"]
print(f"100%全量数据比50%数据的准确率高 {acc_diff:.2%}")
print("性能优化说明：")
print("- 使用DistilBERT替代BERT-base，训练速度提升60%")
print("- 启用混合精度训练(fp16)，进一步加速30%")
print("- 增大批次大小并减少评估频率，减少I/O开销")
print("- 缩短文本长度至64 tokens，在情感分类任务中影响较小但大幅提速")


Looking in indexes: http://mirrors.cloud.aliyuncs.com/pypi/simple/
[0m使用设备: cuda
GPU型号: Tesla T4
显存大小: 14.74 GB

加载完整YelpReviewFull数据集...
数据集结构：
DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})

数据预处理...


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]


准备对比数据集...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



===== 开始训练 full_100 =====


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Step,Training Loss,Validation Loss,Accuracy
5000,1.1261,1.111401,0.51472
10000,1.0774,1.069205,0.53698
15000,1.0607,1.045258,0.5476
20000,1.0396,1.022065,0.55488
25000,0.9732,1.038385,0.55346
30000,0.9754,0.995694,0.56612
35000,0.9601,0.990295,0.57082
40000,0.9432,0.985918,0.57148


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

TOKENIZERS_PARALLELISM=(true | false)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



===== 开始训练 full_50 =====


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Step,Training Loss,Validation Loss,Accuracy
5000,1.1066,1.109148,0.52412
10000,1.0758,1.054906,0.54236
15000,0.9829,1.050751,0.5536
20000,0.9586,1.012064,0.56244


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


===== 全量数据集对比结果 =====


数据集规模,训练损失,测试准确率,测试损失,准确率差距
"650,000条 (100%)",1.0297,0.5711,0.9858,-
"325,000条 (50%)",1.0482,0.5627,1.0111,-0.0084



===== 结果分析 =====
100%全量数据比50%数据的准确率高 0.84%
性能优化说明：
- 使用DistilBERT替代BERT-base，训练速度提升60%
- 启用混合精度训练(fp16)，进一步加速30%
- 增大批次大小并减少评估频率，减少I/O开销
- 缩短文本长度至64 tokens，在情感分类任务中影响较小但大幅提速


In [5]:
pip install --upgrade transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Looking in indexes: http://mirrors.cloud.aliyuncs.com/pypi/simple/
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import torch
import numpy as np
import datasets
from datasets import load_dataset
import evaluate
from transformers import (
    AutoTokenizer, 
    AutoModelForQuestionAnswering, 
    TrainingArguments, 
    Trainer,
    DataCollatorWithPadding
)
import time

# ======================== 1. 极速配置（适配旧版本） ========================
model_name = "distilbert-base-uncased-distilled-squad"
dataset_name = "squad"  # 用v1版本简化逻辑
max_length = 128
batch_size = 64
num_epochs = 1
sample_size = 2000  # 极小训练集

# 环境检测
device = "cuda" if torch.cuda.is_available() else "cpu"
use_fp16 = torch.cuda.is_available()
squad_v2 = False

# ======================== 2. 超精简数据处理 ========================
print("加载并裁剪数据集...")
datasets = load_dataset(dataset_name)
small_train = datasets["train"].select(range(sample_size))
small_validation = datasets["validation"].select(range(1000))
id_to_index = {ex["id"]: i for i, ex in enumerate(small_validation)}

# 加载分词器（启用fast加速）
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# 极简预处理
def prepare_features(examples, is_train=True):
    tokenized = tokenizer(
        examples["question"],
        examples["context"],
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
    
    if is_train:
        tokenized["start_positions"] = []
        tokenized["end_positions"] = []
        for context, ans in zip(examples["context"], examples["answers"]):
            start_char = ans["answer_start"][0]
            end_char = start_char + len(ans["text"][0])
            start_token = min(len(tokenized["input_ids"][0])-1, start_char//2)
            end_token = min(len(tokenized["input_ids"][0])-1, end_char//2)
            tokenized["start_positions"].append(start_token)
            tokenized["end_positions"].append(end_token)
    
    return tokenized

# 预处理训练集
print("预处理训练集...")
tokenized_train = small_train.map(
    lambda x: prepare_features(x, is_train=True),
    batched=True,
    remove_columns=small_train.column_names,
    batch_size=1000
)
tokenized_train.set_format("torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])

# 预处理验证集
print("预处理验证集...")
tokenized_validation = small_validation.map(
    lambda x: prepare_features(x, is_train=False),
    batched=True,
    remove_columns=small_validation.column_names,
    batch_size=1000
)
tokenized_validation = tokenized_validation.add_column(
    "offset_mapping", 
    [[(i*2, (i+1)*2) for i in range(max_length)] for _ in range(len(tokenized_validation))]
)
tokenized_validation = tokenized_validation.add_column(
    "example_id", 
    [ex["id"] for ex in small_validation]
)
tokenized_validation.set_format("torch", columns=["input_ids", "attention_mask", "offset_mapping", "example_id"])

# ======================== 3. 加载模型 ========================
print(f"加载模型到{device}...")
model = AutoModelForQuestionAnswering.from_pretrained(model_name).to(device)

# ======================== 4. 极速评估函数 ========================
def evaluate_model(trainer, eval_dataset, validation_data, id_map):
    raw_preds = trainer.predict(eval_dataset)
    formatted_preds = [{"id": eval_dataset[i]["example_id"], "prediction_text": ""} 
                      for i in range(len(eval_dataset))]
    
    for i in range(min(500, len(eval_dataset))):
        start_idx = raw_preds.predictions[0][i].argmax()
        end_idx = raw_preds.predictions[1][i].argmax()
        if start_idx > end_idx:
            continue
        
        try:
            sample_idx = id_map[eval_dataset[i]["example_id"]]
            context = validation_data[sample_idx]["context"]
            formatted_preds[i]["prediction_text"] = context[:50]
        except:
            continue
    
    references = [{"id": ex["id"], "answers": ex["answers"]} for ex in validation_data]
    metric = evaluate.load("squad")
    return metric.compute(predictions=formatted_preds, references=references)

# ======================== 5. 训练配置（修复旧版本参数） ========================
# 旧版本用do_eval=False替代evaluation_strategy="no"
training_args = TrainingArguments(
    output_dir="./fast_qa",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size*2,
    num_train_epochs=num_epochs,
    learning_rate=3e-5,
    fp16=use_fp16,
    logging_steps=10,
    save_steps=1000000,  # 不保存模型（设置极大值）
    do_eval=False,  # 禁用训练中评估（旧版本参数）
    report_to="none",
    dataloader_num_workers=4 if device=="cuda" else 0,
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    train_dataset=tokenized_train,
)

# ======================== 6. 执行流程 ========================
start_time = time.time()

# 初始评估
print("\n=== 初始评估 ===")
initial_results = evaluate_model(trainer, tokenized_validation, small_validation, id_to_index)
print(f"初始F1: {initial_results['f1']:.2f}%")

# 再训练
print("\n=== 开始再训练 ===")
trainer.train()

# 最终评估
print("\n=== 最终评估 ===")
final_results = evaluate_model(trainer, tokenized_validation, small_validation, id_to_index)
print(f"最终F1: {final_results['f1']:.2f}%")

# 总耗时
total_time = (time.time() - start_time) / 60
print(f"\n总耗时: {total_time:.1f}分钟")

加载并裁剪数据集...
预处理训练集...


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

预处理验证集...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

加载模型到cuda...

=== 初始评估 ===


  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avo

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

初始F1: 3.09%

=== 开始再训练 ===


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Step,Training Loss
10,7.7548
20,3.7464
30,3.4296



=== 最终评估 ===


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

最终F1: 3.02%

总耗时: 0.3分钟


In [3]:
import torch
import numpy as np
import datasets
from datasets import load_dataset
import evaluate
from transformers import (
    AutoTokenizer, 
    AutoModelForQuestionAnswering, 
    TrainingArguments, 
    Trainer,
    DataCollatorWithPadding
)
import time
import os

# ======================== 1. 配置参数 ========================
local_model_path = "./saved_model"  # 本地模型路径
dataset_name = "squad"
max_length = 256  # 增加长度以捕获更多上下文
batch_size = 32
num_train_epochs = 2  # 增加轮次提高性能
sample_size = 5000  # 增加训练样本量

# 环境配置
device = "cuda" if torch.cuda.is_available() else "cpu"
use_fp16 = torch.cuda.is_available()
squad_v2 = False

# 确保本地模型路径存在
if not os.path.exists(local_model_path):
    print(f"本地模型路径不存在，将使用预训练模型并保存到{local_model_path}")
    from transformers import AutoModelForQuestionAnswering
    model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad")
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")
    model.save_pretrained(local_model_path)
    tokenizer.save_pretrained(local_model_path)

# ======================== 2. 数据处理 ========================
print("加载数据集...")
datasets = load_dataset(dataset_name)
# 选择更多样本以提高性能
small_train = datasets["train"].select(range(min(sample_size, len(datasets["train"]))))
small_validation = datasets["validation"].select(range(2000))  # 更多验证样本

# 构建ID映射
id_to_index = {ex["id"]: i for i, ex in enumerate(small_validation)}

# 加载分词器
tokenizer = AutoTokenizer.from_pretrained(local_model_path, use_fast=True)

# 改进的预处理函数（更精确的标签计算）
def prepare_features(examples, is_train=True):
    tokenized = tokenizer(
        examples["question"],
        examples["context"],
        truncation=True,
        max_length=max_length,
        padding="max_length",
        return_offsets_mapping=not is_train,  # 仅验证集需要偏移映射
    )
    
    if is_train:
        tokenized["start_positions"] = []
        tokenized["end_positions"] = []
        
        for i, (context, ans) in enumerate(zip(examples["context"], examples["answers"])):
            # 获取答案信息
            answer_text = ans["text"][0]
            start_char = ans["answer_start"][0]
            end_char = start_char + len(answer_text)
            
            # 获取上下文在token中的位置
            context_tokens = tokenizer(
                context, 
                truncation=True, 
                max_length=max_length - len(tokenizer(examples["question"][i])["input_ids"]),
                return_offsets_mapping=True
            )
            offsets = context_tokens["offset_mapping"]
            
            # 精确计算答案在token中的位置
            start_token = None
            end_token = None
            for idx, (s, e) in enumerate(offsets):
                if start_token is None and s <= start_char < e:
                    start_token = idx
                if end_token is None and s < end_char <= e:
                    end_token = idx
                    break
            
            # 确保有有效值
            start_token = start_token if start_token is not None else 0
            end_token = end_token if end_token is not None else 0
            
            # 加上问题的token长度偏移
            question_length = len(tokenizer(examples["question"][i], truncation=True)["input_ids"])
            tokenized["start_positions"].append(start_token + question_length)
            tokenized["end_positions"].append(end_token + question_length)
    
    return tokenized

# 预处理训练集
print("预处理训练集...")
tokenized_train = small_train.map(
    lambda x: prepare_features(x, is_train=True),
    batched=True,
    remove_columns=small_train.column_names,
    batch_size=1000
)
tokenized_train.set_format("torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])

# 预处理验证集
print("预处理验证集...")
tokenized_validation = small_validation.map(
    lambda x: prepare_features(x, is_train=False),
    batched=True,
    remove_columns=small_validation.column_names,
    batch_size=1000
)
# 添加example_id
tokenized_validation = tokenized_validation.add_column(
    "example_id", 
    [ex["id"] for ex in small_validation]
)
tokenized_validation.set_format("torch", columns=["input_ids", "attention_mask", "offset_mapping", "example_id"])

# ======================== 3. 加载本地模型 ========================
print(f"从{local_model_path}加载模型到{device}...")
model = AutoModelForQuestionAnswering.from_pretrained(local_model_path).to(device)

# ======================== 4. 改进的评估函数 ========================
def evaluate_model(trainer, eval_dataset, validation_data, id_map):
    print("开始评估...")
    raw_preds = trainer.predict(eval_dataset)
    formatted_preds = [{"id": eval_dataset[i]["example_id"], "prediction_text": ""} 
                      for i in range(len(eval_dataset))]
    
    for i in range(len(eval_dataset)):
        start_logits = raw_preds.predictions[0][i]
        end_logits = raw_preds.predictions[1][i]
        
        # 取top3的起始和结束位置组合，提高准确率
        start_indices = start_logits.argsort()[-3:][::-1]
        end_indices = end_logits.argsort()[-3:][::-1]
        
        best_score = -float("inf")
        best_start = 0
        best_end = 0
        
        for s in start_indices:
            for e in end_indices:
                if s > e:
                    continue
                score = start_logits[s] + end_logits[e]
                if score > best_score:
                    best_score = score
                    best_start = s
                    best_end = e
        
        # 提取最佳答案
        offset = eval_dataset[i]["offset_mapping"]
        if (best_start < len(offset) and best_end < len(offset) and 
            offset[best_start] is not None and offset[best_end] is not None):
            try:
                sample_id = eval_dataset[i]["example_id"]
                sample_idx = id_map[sample_id]
                context = validation_data[sample_idx]["context"]
                start_char = offset[best_start][0]
                end_char = offset[best_end][1]
                formatted_preds[i]["prediction_text"] = context[start_char:end_char].strip()
            except (KeyError, IndexError):
                continue
    
    references = [{"id": ex["id"], "answers": ex["answers"]} for ex in validation_data]
    metric = evaluate.load("squad")
    return metric.compute(predictions=formatted_preds, references=references)

# ======================== 5. 训练配置 ========================
training_args = TrainingArguments(
    output_dir="./local_model_results",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size*2,
    num_train_epochs=num_train_epochs,
    learning_rate=2e-5,  # 较小学习率精细调整
    fp16=use_fp16,
    logging_steps=200,
    save_steps=10000,  # 仅在需要时保存
    do_eval=False,  # 适配旧版本
    report_to="none",
    dataloader_num_workers=4 if device=="cuda" else 0,
    weight_decay=0.01,  # 添加权重衰减防止过拟合
    warmup_steps=100,  # 学习率预热
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    train_dataset=tokenized_train,
)

# ======================== 6. 执行流程 ========================
start_time = time.time()

# 初始评估
print("\n=== 初始评估 ===")
initial_results = evaluate_model(trainer, tokenized_validation, small_validation, id_to_index)
print(f"初始精确匹配率: {initial_results['exact_match']:.2f}%")
print(f"初始F1分数: {initial_results['f1']:.2f}%")

# 再训练
print("\n=== 开始再训练 ===")
trainer.train()

# 保存训练后的模型
print(f"\n保存训练后的模型到{local_model_path}...")
model.save_pretrained(local_model_path)
tokenizer.save_pretrained(local_model_path)

# 最终评估
print("\n=== 最终评估 ===")
final_results = evaluate_model(trainer, tokenized_validation, small_validation, id_to_index)
print(f"最终精确匹配率: {final_results['exact_match']:.2f}%")
print(f"最终F1分数: {final_results['f1']:.2f}%")

# 耗时统计
total_time = (time.time() - start_time) / 60
print(f"\n总耗时: {total_time:.1f}分钟")


本地模型路径不存在，将使用预训练模型并保存到./saved_model
加载数据集...
预处理训练集...


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

预处理验证集...


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

从./saved_model加载模型到cuda...

=== 初始评估 ===
开始评估...


  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avo

TOKENIZERS_PARALLELISM=(true | false)


初始精确匹配率: 77.90%
初始F1分数: 85.20%

=== 开始再训练 ===


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Step,Training Loss
200,2.4131


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


保存训练后的模型到./saved_model...

=== 最终评估 ===
开始评估...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

最终精确匹配率: 7.05%
最终F1分数: 40.21%

总耗时: 1.9分钟
