In [None]:
import os
import pandas as pd
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# ========== Environment setup ==========

# Local directory holding the HuatuoGPT2 weights. Set the HUATUO_WEIGHT_DIR
# environment variable to run the notebook on a machine where the model lives
# somewhere else; when it is unset or empty the original path is used.
weight_dir = os.environ.get("HUATUO_WEIGHT_DIR") or "D:\\model\\HuatuoGPT2-7B_offline"

device = "cuda" if torch.cuda.is_available() else "cpu"
print("使用设备:", device)

# NOTE: trust_remote_code=True lets the checkpoint's own Python files run
# during loading, so weight_dir must point at a checkpoint you trust.
tokenizer = AutoTokenizer.from_pretrained(weight_dir, trust_remote_code=True)
model = (
    AutoModelForCausalLM.from_pretrained(
        weight_dir,
        # float16 on GPU, float32 on CPU.
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        trust_remote_code=True,
    )
    .to(device)
    .eval()
)

# ========== 单条推理函数 ==========
def _normalize_choices(raw: str) -> str:
    """Reduce decoded model text to option letters joined by "、".

    Only upper-case A-Z and the separators "、", "，", "," and space survive.
    Every separator is mapped to "、", then empty pieces are dropped so that
    repeated, leading, or trailing separators never reach the caller.
    """
    kept = "".join(ch for ch in raw if ch in "ABCDEFGHIJKLMNOPQRSTUVWXYZ、，, ")
    for sep in "，, ":
        kept = kept.replace(sep, "、")
    return "、".join(piece for piece in kept.split("、") if piece)


def huatuo_predict(
    prompt: str,
    question: str,
    candidate_answers: str,
    max_new_tokens: int = 8,
) -> str:
    """Ask the model one multiple-choice question and return the chosen letters.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        prompt: Instruction text placed before the question.
        question: Question stem.
        candidate_answers: The answer options, already formatted as text.
        max_new_tokens: Generation budget. The default of 8 matches the
            previous hard-coded value; raise it if long multi-select answers
            are being cut off.

    Returns:
        The option letters found in the generated text, joined by "、"
        (for example "A" or "A、C"); an empty string if none were found.
    """
    full_text = f"{prompt}\n题目：{question}\n选项：{candidate_answers}"
    inputs = tokenizer(full_text, return_tensors="pt").to(device)
    with torch.no_grad():
        gen = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # greedy decoding
            pad_token_id=tokenizer.eos_token_id,
        )
    # generate() returns prompt + continuation; decode only the continuation.
    prompt_len = inputs.input_ids.shape[1]
    decoded = tokenizer.decode(gen[0][prompt_len:], skip_special_tokens=True)
    return _normalize_choices(decoded)

# ========== Configuration ==========
# INPUT_FILE is the workbook the inference loop reads; OUTPUT_FILE is the
# workbook its results are written to.
INPUT_FILE  = "合理用药数据集v4.1-测试用.xlsx"
OUTPUT_FILE = "华佗模型结果_new.xlsx"

# Instruction prompt for each worksheet, keyed by worksheet name
PROMPTS = {
    "单选": "以下是关于中药处方审核的单选题，请根据规则选择正确的选项。仅输出选项即可。",
    "多选": "以下是关于中药处方审核的多选题，请根据规则选择所有正确的选项。仅输出选项即可。"
}

# ========== Main flow ==========
# For every worksheet that has a prompt, run the model on each row and write
# the sheet, with an added "模型答案" column, to OUTPUT_FILE.
# Both workbooks are opened as context managers so the input file handle is
# released too (it was previously left open), even if inference fails.
with pd.ExcelFile(INPUT_FILE) as xls, pd.ExcelWriter(OUTPUT_FILE) as writer:
    for sheet, prompt in PROMPTS.items():
        if sheet not in xls.sheet_names:
            print(f"⚠️ 找不到工作表：{sheet}，跳过")
            continue

        df = pd.read_excel(xls, sheet_name=sheet)

        model_answers = []
        # iterrows() is a generator with no length, so tqdm needs total= to
        # show a percentage and an ETA.
        for _, row in tqdm(df.iterrows(), total=len(df), desc=f"{sheet} 处理中"):
            question = row["Question"]
            candidate_answers = row["Candidate answers"]
            model_answers.append(huatuo_predict(prompt, question, candidate_answers))

        df["模型答案"] = model_answers
        df.to_excel(writer, sheet_name=sheet, index=False)

print("✅ 处理完成，结果已保存到：", OUTPUT_FILE)

In [3]:
import pandas as pd
import re

# Workbook of raw model answers to clean, and the path the cleaned copy is
# written to.
# NOTE(review): the inference cell in this notebook writes
# "华佗模型结果_new.xlsx", not the file named here — confirm which workbook is
# meant to be cleaned. These assignments also rebind the INPUT_FILE /
# OUTPUT_FILE names set by that cell.
INPUT_FILE = "Huatuo_结果.xlsx"
OUTPUT_FILE = "Huatuo_结果_清洗后.xlsx"

def clean_answer(ans):
    """Normalise a raw model answer to space-separated option letters.

    Missing values (as judged by ``pd.isna``) become an empty string.
    Anything else is stringified and upper-cased; every A-Z character is then
    collected in order of appearance and the letters are joined with single
    spaces, so a lone letter comes back on its own.
    """
    if not pd.isna(ans):
        text = str(ans).upper()
        return " ".join(re.findall(r"[A-Z]", text))
    return ""

# Read every sheet, clean the "模型答案" column where it exists, and write all
# sheets to OUTPUT_FILE. The input workbook is opened as a context manager so
# its file handle is closed afterwards (it was previously left open).
with pd.ExcelFile(INPUT_FILE) as xls, pd.ExcelWriter(OUTPUT_FILE) as writer:
    for sheet in xls.sheet_names:
        df = pd.read_excel(xls, sheet_name=sheet)
        if "模型答案" in df.columns:
            df["模型答案"] = df["模型答案"].apply(clean_answer)
        df.to_excel(writer, sheet_name=sheet, index=False)

print("清洗完成 →", OUTPUT_FILE)

清洗完成 → Huatuo_结果_清洗后.xlsx
