In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import json
import time
from datasets import load_dataset
from tqdm import tqdm
import re

# 设备设置
device = "cuda" if torch.cuda.is_available() else "cpu"

# 加载模型
def load_model(model_path):
    print(f"Loading model from: {model_path}")  # 调试信息
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    print("Tokenizer loaded successfully.")  # 调试信息
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        trust_remote_code=True
    ).to(device).eval()
    print("Model loaded successfully.")  # 调试信息
    return model, tokenizer

# 加载 MMLU-Pro 数据集
def load_mmlu_pro():
    """ 加载 MMLU-Pro 数据集 """
    dataset = load_dataset("/root/autodl-tmp/MMLU-PRO")
    test_df, val_df = dataset["test"], dataset["validation"]
    return test_df, val_df

# 预处理数据
def preprocess(test_df):
    res_df = []
    for each in test_df:
        options = [opt for opt in each["options"] if opt != "N/A"]
        each["options"] = options
        res_df.append(each)
    return res_df

# 筛选 Health 类别的数据
def filter_health_category(test_df):
    return [item for item in test_df if item["category"] == "health"]

# 改进输入格式
def format_query(question, options):
    """ 格式化问题输入，确保模型能清晰理解 """
    option_str = "\n".join([f"{chr(65 + i)}. {opt}" for i, opt in enumerate(options)])
    query = (
        f"Question: {question}\n"
        f"Options:\n{option_str}\n"
        f"Please choose the correct answer from the options (A, B, C, etc.). Answer in this format: Correct answer: "
    )
    return query

# 生成文本描述（适配 LLaMA）
def generate_description(model, tokenizer, query):
    """ 生成文本描述 """
    if not query.strip():
        return "错误：请输入问题。"

    inputs = tokenizer(query, return_tensors="pt").to(device)

    gen_kwargs = {
        "max_new_tokens": 200,  # ✅ 生成的新 token 数量，而不是限制总 token 长度
        "do_sample": False,
        "top_k": 1
    }

    with torch.no_grad():
        start_time = time.time()
        outputs = model.generate(**inputs, **gen_kwargs)
        inference_time = time.time() - start_time
        description = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return description, inference_time


# 提取模型答案（改进版）
def extract_answer(text):
    """ 从模型输出中提取答案，支持更多格式 """
    patterns = [
        r"answer is \(?([A-Z])\)?",  # answer is A
        r"Correct answer: ([A-Z])",  # Correct answer: A
        r"([A-Z]) is correct",       # A is correct
        r"\bOption ([A-Z])\b"        # Option A
    ]

    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(1)

    return None  # 未能匹配到答案

# 测试模型
def test_model(model, tokenizer, test_df):
    results = []
    total_time = 0.0
    correct = 0

    for item in tqdm(test_df, desc="Testing"):
        question = item["question"]
        options = item["options"]
        answer = item["answer"]

        # 格式化输入
        query = format_query(question, options)

        # 生成描述
        description, inference_time = generate_description(model, tokenizer, query)
        total_time += inference_time

        # 提取模型答案
        model_answer = extract_answer(description)

        # 检查是否正确
        is_correct = model_answer == answer
        if is_correct:
            correct += 1

        # 保存结果
        results.append({
            "question": question,
            "options": options,
            "answer": answer,
            "model_answer": model_answer,
            "is_correct": is_correct,
            "inference_time": inference_time
        })

    # 计算准确率和平均推理时间
    accuracy = correct / len(test_df)
    avg_inference_time = total_time / len(test_df)

    return results, accuracy, avg_inference_time

# 保存测试结果
def save_results(results, output_path):
    with open(output_path, "w") as f:
        json.dump(results, f, indent=4)

# 主函数
def main_jupyter(model_path="/root/autodl-tmp/trained_model", output_path="/root/autodl-tmp/health_test_results.json"):
    # 加载模型
    model, tokenizer = load_model(model_path)

    # 加载 MMLU-Pro 数据集
    test_df, val_df = load_mmlu_pro()
    test_df = preprocess(test_df)

    # 筛选 Health 类别的数据
    health_test_df = filter_health_category(test_df)
    print(f"Health 类别数据量: {len(health_test_df)}")

    # 测试模型
    results, accuracy, avg_inference_time = test_model(model, tokenizer, health_test_df)

    # 保存结果
    save_results(results, output_path)

    # 打印结果
    print(f"测试完成！\nHealth 类别准确率: {accuracy * 100:.2f}%\n平均推理时间: {avg_inference_time:.4f} 秒")

# 在 Jupyter Notebook 中直接调用
main_jupyter()


Loading model from: /root/autodl-tmp/trained_model
Tokenizer loaded successfully.


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Model loaded successfully.
Health 类别数据量: 818


Testing: 100%|██████████| 818/818 [2:45:13<00:00, 12.12s/it]  

测试完成！
Health 类别准确率: 50.24%
平均推理时间: 12.1154 秒



