In [1]:
import os
os.environ["http_proxy"] = "http://127.0.0.1:7897"
os.environ["https_proxy"] = "http://127.0.0.1:7897"

from datasets import load_dataset

# 使用SQuAD数据集来微调
datasets = load_dataset("squad_v2")

In [2]:
from transformers import AutoTokenizer,BertForQuestionAnswering

# 加载分词器和模型
tokenizer =  AutoTokenizer.from_pretrained("bert-base-uncased",use_fast=True)
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


- 下面开始对数据开始预处理

In [3]:
def preprocess_data(examples):
# 1. 正常分词，使用滑动窗口
    inputs = tokenizer(
        examples['question'], 
        examples['context'], 
        truncation="only_second", 
        padding='max_length', 
        max_length=384,
        stride=128,
        return_offsets_mapping=True,
        return_overflowing_tokens=True,
    )

    sample_mapping = inputs.pop("overflow_to_sample_mapping")
    offset_mapping = inputs.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        sample_index = sample_mapping[i]
        input_ids = inputs['input_ids'][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        
        # 找到序列中 Context 的区间（区分 Question 和 Context）
        sequence_ids = inputs.sequence_ids(i)
        
        # 答案信息
        answer = examples['answers'][sample_index]
        
        # 如果没有答案，直接指向 [CLS]
        if len(answer['answer_start']) == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
        else:
            start_char = answer['answer_start'][0]
            end_char = start_char + len(answer['text'][0])
            
            # 找到当前分片在原始 Context 中的起始和结束 Token 索引
            token_start_index = 0
            token_end_index = 0
            
            # 找到 Context 在 input_ids 中的起始和结束位置
            token_search_start = 0
            while sequence_ids[token_search_start] != 1:
                token_search_start += 1
                
            token_search_end = len(input_ids) - 1
            while sequence_ids[token_search_end] != 1:
                token_search_end -= 1
            
            # 判断答案是否完全在当前分片的 Context 范围内
            if not (offsets[token_search_start][0] <= start_char and offsets[token_search_end][1] >= end_char):
                # 答案不在这个分片里，设为 [CLS]
                start_positions.append(cls_index)
                end_positions.append(cls_index)
            else:
                # 答案在分片里，开始精确定位
                curr = token_search_start
                while curr <= token_search_end and offsets[curr][0] <= start_char:
                    curr += 1
                start_positions.append(curr - 1)
                
                curr = token_search_end
                while curr >= token_search_start and offsets[curr][1] >= end_char:
                    curr -= 1
                end_positions.append(curr + 1)

    inputs['start_positions'] = start_positions
    inputs['end_positions'] = end_positions 
    return inputs

# 应用预处理函数
tokenized_dataset = datasets.map(
    preprocess_data, 
    batched=True, 
    remove_columns=datasets["train"].column_names
)

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [4]:
# 对数据集进行划分
train_dataset = tokenized_dataset["train"]
eval_dataset = tokenized_dataset["validation"]

In [7]:
import torch
import sys

print(f"Python 版本: {sys.version}")
print(f"PyTorch 版本: {torch.__version__}")
print(f"CUDA 是否可用: {torch.cuda.is_available()}")
if not torch.cuda.is_available():
    import ctypes
    # 尝试加载 nvcuda.dll (Windows) 看看驱动库是否存在
    try:
        ctypes.WinDLL('nvcuda.dll')
        print("NVIDIA 驱动库 (nvcuda.dll) 检测正常")
    except Exception as e:
        print(f"无法加载 NVIDIA 驱动库: {e}")

Python 版本: 3.12.12 | packaged by Anaconda, Inc. | (main, Oct 21 2025, 20:16:04) [GCC 11.2.0]
PyTorch 版本: 2.6.0+cu124
CUDA 是否可用: False
无法加载 NVIDIA 驱动库: module 'ctypes' has no attribute 'WinDLL'


In [5]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./bert-qa",
    eval_strategy="epoch",
    learning_rate=3e-5,         # 学习率
    per_device_train_batch_size=8, # 训练批次大小
    per_device_eval_batch_size=8,  # 评估批次大小
    num_train_epochs=2,
    weight_decay=0.01,  
    )
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
# 开始训练
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.0188,1.052049
2,0.6572,1.194216


TrainOutput(global_step=32940, training_loss=0.951408569669463, metrics={'train_runtime': 47886.8206, 'train_samples_per_second': 5.503, 'train_steps_per_second': 0.688, 'total_flos': 5.164033933049242e+16, 'train_loss': 0.951408569669463, 'epoch': 2.0})