# 基于滑动窗口策略结合层次化建模的机器阅读理解任务实现


## Step1 导入相关包

In [None]:
%pip install datasets transformers torch
import torch
import numpy as np
import collections
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer, DefaultDataCollator, EvalPrediction
from cmrc_eval import evaluate_cmrc

## Step2 数据集加载

In [None]:
# Load the previously saved dataset splits from the local "mrc_data" directory.
datasets = DatasetDict.load_from_disk("mrc_data")
# Bare expression: shows the loaded object as the notebook cell output.
datasets

## Step3 数据预处理

In [None]:
# Tokenizer of the "hfl/chinese-macbert-base" checkpoint; process_func and the
# Trainer / pipeline cells below all use this same object.
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")

In [None]:
def split_document(context, max_length=512, overlap=128):
    """Split a long document into overlapping character windows.

    Every fragment holds at most ``max_length`` characters, and each fragment
    starts ``max_length - overlap`` characters after the previous one, so
    neighbouring fragments share ``overlap`` characters.

    Args:
        context: The full document text.
        max_length: Maximum number of characters per fragment (must be > 0).
        overlap: Number of characters shared by consecutive fragments
            (must be smaller than ``max_length``).

    Returns:
        The list of fragments in document order; empty for an empty document.

    Raises:
        ValueError: If ``max_length`` is not positive or ``overlap`` is not
            smaller than ``max_length``. Either case would keep the window
            from advancing and loop forever.
    """
    if max_length <= 0:
        raise ValueError(f"max_length must be positive, got {max_length}")
    if overlap >= max_length:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than max_length ({max_length})"
        )

    stride = max_length - overlap
    total = len(context)
    fragments = []
    for start in range(0, total, stride):
        fragments.append(context[start:start + max_length])
        # Stop as soon as a window reaches the end of the document so the
        # tail is not emitted again as a shorter, fully redundant fragment.
        if start + max_length >= total:
            break
    return fragments

def _rank_fragments_by_hits(terms, fragments):
    """Return the fragments containing at least one term, most hits first."""
    scored = []
    for position, fragment in enumerate(fragments):
        hits = sum(1 for term in terms if term in fragment)
        if hits:
            # ``position`` breaks ties, so equally relevant fragments keep
            # their document order.
            scored.append((-hits, position, fragment))
    scored.sort()
    return [fragment for _, _, fragment in scored]


def filter_relevant_fragments(question, fragments, top_k=3):
    """Pick the ``top_k`` fragments most relevant to the question.

    Relevance is the number of distinct question terms found in a fragment.
    Terms are the whitespace-separated words of the question. Text written
    without spaces (e.g. Chinese) yields a single "word" that almost never
    occurs verbatim in a fragment, so when no fragment matches any word the
    individual alphanumeric characters of the question are used as terms.

    Args:
        question: The question text.
        fragments: Candidate document fragments.
        top_k: Maximum number of fragments to return.

    Returns:
        Up to ``top_k`` fragments with at least one hit, ordered from most to
        least relevant (ties keep document order). Empty when nothing matches.
    """
    ranked = _rank_fragments_by_hits(set(question.split()), fragments)
    if not ranked:
        # Character-level fallback; punctuation is ignored because it carries
        # no information about relevance.
        characters = {char for char in question if char.isalnum()}
        ranked = _rank_fragments_by_hits(characters, fragments)
    return ranked[:top_k]

def _locate_fragments(context, fragments):
    """Map each fragment's text to the character offset where it starts in ``context``."""
    starts = {}
    cursor = 0
    for fragment in fragments:
        position = context.find(fragment, cursor)
        if position < 0:
            continue
        starts.setdefault(fragment, position)
        # Fragments come in document order, so the next one starts further right.
        cursor = position + 1
    return starts


def process_func(examples):
    """Turn a batch of QA examples into per-fragment model features.

    Each context is split with ``split_document``, the fragments are filtered
    with ``filter_relevant_fragments`` and every kept fragment is tokenized
    together with the question. One example can therefore yield zero, one or
    several features.

    Args:
        examples: A batch with the columns "id", "question", "context" and
            "answers" (a dict with "text" and "answer_start" lists).

    Returns:
        A dict of equally long lists:
        - "input_ids", "attention_mask": tokenizer output, padded to 384.
        - "offset_mapping": per token, the (start, end) character span in the
          FULL context, or None for tokens outside the context segment.
        - "start_positions", "end_positions": token indices of the first
          answer; both 0 when the answer is missing or not fully contained
          in the tokenized fragment.
        - "example_ids": id of the example each feature came from.
    """
    tokenized_examples = {
        "input_ids": [],
        "attention_mask": [],
        "offset_mapping": [],
        "start_positions": [],
        "end_positions": [],
        "example_ids": []
    }

    columns = zip(examples["id"], examples["question"], examples["context"], examples["answers"])
    for example_id, question, context, answers in columns:
        fragments = split_document(context)
        fragment_starts = _locate_fragments(context, fragments)

        # Only the first answer is used as the training target.
        has_answer = bool(answers["text"]) and bool(answers["answer_start"])
        if has_answer:
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

        for fragment in filter_relevant_fragments(question, fragments):
            # NOTE(review): a fragment longer than the 384-token budget is
            # truncated here, so its tail may not be covered by any feature;
            # confirm the split sizes against max_length.
            tokenized_fragment = tokenizer(
                question,
                fragment,
                return_offsets_mapping=True,
                max_length=384,
                truncation="only_second",
                padding="max_length"
            )
            sequence_ids = tokenized_fragment.sequence_ids()

            # The tokenizer reports offsets relative to the fragment, while
            # answer_start is relative to the full context. Shift the offsets
            # so both share one coordinate system, and blank out question,
            # special and padding tokens so they can never be chosen as an
            # answer span downstream.
            shift = fragment_starts.get(fragment, 0)
            offset = [
                (begin + shift, end + shift) if sequence_id == 1 else None
                for sequence_id, (begin, end) in zip(sequence_ids, tokenized_fragment["offset_mapping"])
            ]

            context_start = sequence_ids.index(1)
            context_end = sequence_ids.index(None, context_start) - 1

            start_token_pos = 0
            end_token_pos = 0
            # Label the span only when the whole answer lies inside this
            # (possibly truncated) fragment; otherwise point at [CLS].
            if (
                has_answer
                and offset[context_start][0] <= start_char
                and offset[context_end][1] >= end_char
            ):
                token_id = context_start
                while token_id <= context_end and offset[token_id][0] < start_char:
                    token_id += 1
                first_token = token_id
                token_id = context_end
                while token_id >= context_start and offset[token_id][1] > end_char:
                    token_id -= 1
                last_token = token_id
                # An answer that sits strictly inside a single token cannot be
                # aligned to token boundaries; keep the "no answer" label.
                if first_token <= last_token:
                    start_token_pos = first_token
                    end_token_pos = last_token

            tokenized_examples["input_ids"].append(tokenized_fragment["input_ids"])
            tokenized_examples["attention_mask"].append(tokenized_fragment["attention_mask"])
            tokenized_examples["offset_mapping"].append(offset)
            tokenized_examples["start_positions"].append(start_token_pos)
            tokenized_examples["end_positions"].append(end_token_pos)
            tokenized_examples["example_ids"].append(example_id)

    return tokenized_examples



In [None]:
# Apply process_func to every split in batches. The original columns (named
# after the train split's columns) are dropped, so only the columns returned
# by process_func remain.
# NOTE: the variable name `tokenied_datasets` (sic) is referenced by later cells.
tokenied_datasets = datasets.map(process_func, batched=True, remove_columns=datasets["train"].column_names)


## Step4 获取模型输出

In [None]:
def get_result(start_logits, end_logits, examples, features):
    """Decode the best answer text for every example from the span logits.

    Args:
        start_logits: Per-feature sequence of start scores, indexed like ``features``.
        end_logits: Per-feature sequence of end scores, indexed like ``features``.
        examples: Iterable of raw examples with "id", "context" and "answers".
        features: Tokenized features; ``features["example_ids"]`` lists the
            owning example of each row and ``features[i]["offset_mapping"]``
            holds that row's per-token character spans (None = not answerable).

    Returns:
        ``(predictions, references)``: two dicts keyed by example id, holding
        the predicted text ("" if no valid span exists) and the gold answer texts.
    """
    n_best = 20  # number of top start / end candidates considered per feature
    max_answer_length = 30  # maximum answer length, in tokens

    # Group the feature rows by the example they were cut from.
    rows_by_example = collections.defaultdict(list)
    for row, owner_id in enumerate(features["example_ids"]):
        rows_by_example[owner_id].append(row)

    predictions = {}
    references = {}
    for example in examples:
        example_id = example["id"]
        context = example["context"]

        candidates = []  # (score, text) pairs collected over all rows of this example
        for row in rows_by_example[example_id]:
            row_start_scores = start_logits[row]
            row_end_scores = end_logits[row]
            spans = features[row]["offset_mapping"]
            top_starts = np.argsort(row_start_scores)[::-1][:n_best].tolist()
            top_ends = np.argsort(row_end_scores)[::-1][:n_best].tolist()
            for first in top_starts:
                for last in top_ends:
                    if spans[first] is None or spans[last] is None:
                        continue
                    token_count = last - first + 1
                    # Reject reversed spans and spans above the length limit.
                    if token_count < 1 or token_count > max_answer_length:
                        continue
                    score = row_start_scores[first] + row_end_scores[last]
                    text = context[spans[first][0]: spans[last][1]]
                    candidates.append((score, text))

        predictions[example_id] = max(candidates, key=lambda pair: pair[0])[1] if candidates else ""
        references[example_id] = example["answers"]["text"]

    return predictions, references


## Step5 评估函数

In [None]:
def metirc(pred):
    """Compute the CMRC metrics for a batch of predictions.

    ``pred[0]`` is unpacked into the start and end logits. The split being
    scored is inferred from the number of rows: if it equals the number of
    tokenized validation features the validation split is used, otherwise
    the test split.

    Returns:
        Whatever ``evaluate_cmrc`` returns for the decoded predictions and
        references.
    """
    start_logits, end_logits = pred[0]
    is_validation = start_logits.shape[0] == len(tokenied_datasets["validation"])
    split = "validation" if is_validation else "test"
    predictions, references = get_result(
        start_logits, end_logits, datasets[split], tokenied_datasets[split]
    )
    return evaluate_cmrc(predictions, references)

## Step6 加载模型

In [None]:
# Load the "hfl/chinese-macbert-base" checkpoint as a question-answering model.
model = AutoModelForQuestionAnswering.from_pretrained("hfl/chinese-macbert-base")


## Step7 配置TrainingArguments

In [None]:
# Training configuration passed to the Trainer below.
args = TrainingArguments(
    output_dir="models_for_qa",  # output directory for the run
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    eval_strategy="steps",  # evaluate on a step schedule ...
    eval_steps=200,  # ... every 200 steps
    save_strategy="epoch",  # save once per epoch
    logging_steps=50,  # log every 50 steps
    num_train_epochs=100
)


## Step8 配置Trainer

In [None]:
# Wire the model, the tokenized train / validation splits and the metric
# function (`metirc`) into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    train_dataset=tokenied_datasets["train"],
    eval_dataset=tokenied_datasets["validation"],
    data_collator=DefaultDataCollator(),
    compute_metrics=metirc
)

## Step9 模型训练

In [None]:
# Start training with the configuration defined in `args`.
trainer.train()

## Step10 模型预测

In [None]:
from transformers import pipeline

# Build a question-answering pipeline around the fine-tuned model. Use the
# first GPU when CUDA is available and fall back to the CPU (-1) otherwise,
# so this cell also runs on machines without a GPU.
pipe = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)
# Bare expression: shows the pipeline object as the notebook cell output.
pipe

In [None]:
# Smoke test: ask one question against a one-sentence context.
pipe(question="小明在哪里上班？", context="小明在北京上班")