# CMMLU测评

In [1]:
# Reference only: this bare string is never executed. It records a
# `swift deploy` command that serves Qwen2.5-7B-Instruct with the vLLM backend
# (presumably how the endpoint queried below was launched; confirm).
'''
CUDA_VISIBLE_DEVICES=0 swift deploy \
    --model Qwen/Qwen2.5-7B-Instruct \
    --infer_backend vllm \
    --max_new_tokens 2048 \
    --served_model_name Qwen2.5-7B-Instruct
'''

import pandas as pd
import json
import io
import sys
import threading
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests  
import time



def call_llm_api(prompt, api_url, model="Qwen2.5-7B-Instruct", max_tokens=50,
                 temperature=0.1, timeout=120):
    """Send one single-turn chat-completion request and return the reply text.

    Args:
        prompt: Text sent as the only ``user`` message.
        api_url: URL the JSON payload is POSTed to.
        model: Value of the ``model`` field in the payload.
        max_tokens: Value of the ``max_tokens`` field in the payload.
        temperature: Value of the ``temperature`` field in the payload.
        timeout: Seconds passed to ``requests.post`` as ``timeout``.

    Returns:
        The stripped ``choices[0].message.content`` string, or ``""`` when the
        response has no choices, no message or no content.

    Raises:
        RuntimeError: If sending the request, the HTTP status check or parsing
            the response fails. The original exception is attached as
            ``__cause__``.
    """
    headers = {"Content-Type": "application/json"}
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": temperature,
    }

    try:
        response = requests.post(
            api_url,
            headers=headers,
            json=payload,
            timeout=timeout,
        )
        response.raise_for_status()
        body = response.json()
        # Chat completions carry the text in `message.content`, not `text`.
        # `or` is used instead of a `.get` default so that `"choices": []` and
        # `"content": null` fall back to "" exactly like a missing key does.
        choices = body.get("choices") or [{}]
        message = choices[0].get("message") or {}
        reply = message.get("content") or ""
        return reply.strip()
    except Exception as e:
        print(f"API调用错误: {str(e)}")
        # Raise instead of returning an error string so the caller can mark
        # the request as failed; chain the cause to keep the real traceback.
        raise RuntimeError(f"API调用错误: {str(e)}") from e

def process_in_batches(data, api_url, output_file="./all_model_output/Qwen2.5-7B-Instruct/Qwen2.5-7B-Instruct-cmmlu_result.json"):
    """Query the model for every record in batches and checkpoint the results.

    Reads two module-level globals: ``MAX_WORKERS`` (thread-pool size and also
    the number of requests per batch) and ``content`` (the prompt prefix put in
    front of each record's ``question_with_options``). Each request is sent
    through ``call_llm_api``.

    Args:
        data: Indexable sequence of dicts with the keys
            ``question_with_options``, ``answer`` and ``source_file``.
        api_url: URL forwarded to ``call_llm_api``.
        output_file: JSON file that is rewritten with all results collected so
            far after every batch.

    Returns:
        List of result dicts, in completion order within each batch. Each has
        ``status`` set to ``"success"`` or ``"failed"``; failed entries also
        carry ``error`` and an empty ``model_response``.
    """
    from pathlib import Path  # local import: only needed for the directory check

    total = len(data)
    print(f"总请求数: {total}, 线程池大小: {MAX_WORKERS}")

    # Create the target folder up front; otherwise a missing directory is only
    # discovered after the first batch of API calls has already been made.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)

    # Only this (calling) thread ever touches `results`, so no lock is needed.
    results = []

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        for batch_start in range(0, total, MAX_WORKERS):
            batch_end = min(batch_start + MAX_WORKERS, total)
            print(f"\n处理批次: {batch_start+1} - {batch_end}/{total}")

            # Submit the whole batch; remember which record each future serves.
            futures = {}
            for i in range(batch_start, batch_end):
                record = data[i]
                prompt = content + record['question_with_options']
                future = executor.submit(call_llm_api, prompt, api_url)
                futures[future] = (i, record)

            # Collect the batch as the requests finish.
            batch_results = []
            for future in as_completed(futures):
                i, item = futures[future]
                try:
                    model_response = future.result()
                except Exception as e:
                    print(f"请求 #{i+1} 处理失败: {str(e)}")
                    batch_results.append({
                        "error": str(e),
                        "question_with_options": item['question_with_options'],
                        "answer": item['answer'],
                        "source_file": item['source_file'],
                        # No answer exists for a failed request. Store an empty
                        # string rather than reading `model_response`, which
                        # would be unbound or left over from another question.
                        "model_response": "",
                        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                        "status": "failed"
                    })
                else:
                    batch_results.append({
                        "question_with_options": item['question_with_options'],
                        "answer": item['answer'],
                        "source_file": item['source_file'],
                        "model_response": model_response,
                        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                        "status": "success"
                    })

            results.extend(batch_results)

            # Checkpoint: rewrite the file with everything collected so far.
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(results, f, ensure_ascii=False, indent=2)
            print(f"批次 {batch_start+1}-{batch_end} 结果已保存")

    print(f"\n所有请求处理完成，最终结果已保存到 {output_file}")
    return results


def process_json_file(input_file, output_file='./all_model_output/Qwen2.5-7B-Instruct/Qwen2.5-7B-Instruct-eval_results.xlsx'):
    """Compute exact-match accuracy per ``source_file`` and export it to Excel.

    Args:
        input_file: Path to a JSON file holding a list of records, each with
            the keys ``source_file``, ``answer`` and ``model_response``.
        output_file: Path of the ``.xlsx`` workbook to write (openpyxl engine).

    Returns:
        A DataFrame with one row per distinct ``source_file`` (in order of
        first appearance): the number of records whose ``answer`` equals
        ``model_response``, the number of records, and their ratio rounded to
        four decimals.
    """
    with open(input_file, 'r', encoding='utf-8') as handle:
        records = json.load(handle)

    # tally[source] -> [exact matches, records seen]
    tally = {}
    for record in records:
        counts = tally.setdefault(record['source_file'], [0, 0])
        counts[1] += 1
        if record['answer'] == record['model_response']:
            counts[0] += 1

    # One summary row per source file.
    summary_rows = [
        {
            '源文件': source,
            '正确数': hits,
            '总数': seen,
            '准确率': round(hits / seen if seen > 0 else 0, 4),
        }
        for source, (hits, seen) in tally.items()
    ]

    # Write the summary to a single worksheet.
    summary = pd.DataFrame(summary_rows)
    with pd.ExcelWriter(output_file, engine='openpyxl') as workbook:
        summary.to_excel(workbook, sheet_name='cmmlu各个子集acc结果', index=False)

    print(f"处理完成！共处理 {len(records)} 条数据，生成 {len(summary_rows)} 个统计结果")
    return summary

# Script entry point: load the CMMLU questions and the prompt template, query
# the model for every question, then write the per-source accuracy workbook.
if __name__ == "__main__":
    # Round-trip the CSV through JSON to get a list of plain dicts, one per row.
    cmmlu_eval_df = pd.read_csv('cmmlu_concat.csv')
    cmmlu_eval_json = cmmlu_eval_df.to_json(orient='records', force_ascii=False, indent=2)
    cmmlu_eval_json = json.loads(cmmlu_eval_json)

    # `content` is read as a global by process_in_batches, which prepends it
    # to every question.
    with open('../../prompt/CMMLU评估', 'r', encoding='utf-8') as file:
        content = file.read()



    # Configuration
    MAX_WORKERS = 200 # Thread-pool size; process_in_batches also uses it as the batch size
    API_URL = "http://0.0.0.0:8000/v1/chat/completions"
    print({"开始timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")})
    process_in_batches(cmmlu_eval_json, API_URL)
    print({"结束timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")})
    # Score the file written by process_in_batches (same path as its default output_file).
    process_json_file('./all_model_output/Qwen2.5-7B-Instruct/Qwen2.5-7B-Instruct-cmmlu_result.json')
    print({"统计结果已保存": datetime.now().strftime("%Y-%m-%d %H:%M:%S")})

{'开始timestamp': '2025-09-29 12:55:35'}
总请求数: 11582, 线程池大小: 200

处理批次: 1 - 200/11582
批次 1-200 结果已保存

处理批次: 201 - 400/11582
批次 201-400 结果已保存

处理批次: 401 - 600/11582
批次 401-600 结果已保存

处理批次: 601 - 800/11582
批次 601-800 结果已保存

处理批次: 801 - 1000/11582
批次 801-1000 结果已保存

处理批次: 1001 - 1200/11582
批次 1001-1200 结果已保存

处理批次: 1201 - 1400/11582
批次 1201-1400 结果已保存

处理批次: 1401 - 1600/11582
批次 1401-1600 结果已保存

处理批次: 1601 - 1800/11582
批次 1601-1800 结果已保存

处理批次: 1801 - 2000/11582
批次 1801-2000 结果已保存

处理批次: 2001 - 2200/11582
批次 2001-2200 结果已保存

处理批次: 2201 - 2400/11582
批次 2201-2400 结果已保存

处理批次: 2401 - 2600/11582
批次 2401-2600 结果已保存

处理批次: 2601 - 2800/11582
批次 2601-2800 结果已保存

处理批次: 2801 - 3000/11582
批次 2801-3000 结果已保存

处理批次: 3001 - 3200/11582
批次 3001-3200 结果已保存

处理批次: 3201 - 3400/11582
批次 3201-3400 结果已保存

处理批次: 3401 - 3600/11582
批次 3401-3600 结果已保存

处理批次: 3601 - 3800/11582
批次 3601-3800 结果已保存

处理批次: 3801 - 4000/11582
批次 3801-4000 结果已保存

处理批次: 4001 - 4200/11582
批次 4001-4200 结果已保存

处理批次: 4201 - 4400/11582
批次 4201-44

# 测试集测评

In [3]:
# Reference only: this bare string is never executed. It records a
# `swift deploy` command that serves Qwen2.5-7B-Instruct with the vLLM backend
# (presumably how the endpoint queried below was launched; confirm).
'''
CUDA_VISIBLE_DEVICES=0 swift deploy \
    --model Qwen/Qwen2.5-7B-Instruct \
    --infer_backend vllm \
    --max_new_tokens 2048 \
    --served_model_name Qwen2.5-7B-Instruct
'''
# Prompt prefix; process_in_batches reads this global and prepends it to every
# record's `input` text.
content = '现在你是一个肿瘤学科医生，请根据患者的问题给出实际的医疗建议：'
import pandas as pd
import io
import sys
import json
import threading
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests  
import time



def call_llm_api(prompt, api_url, model="Qwen2.5-7B-Instruct", max_tokens=512,
                 temperature=0.1, timeout=120):
    """Send one single-turn chat-completion request and return the reply text.

    Args:
        prompt: Text sent as the only ``user`` message.
        api_url: URL the JSON payload is POSTed to.
        model: Value of the ``model`` field in the payload.
        max_tokens: Value of the ``max_tokens`` field in the payload.
        temperature: Value of the ``temperature`` field in the payload.
        timeout: Seconds passed to ``requests.post`` as ``timeout``.

    Returns:
        The stripped ``choices[0].message.content`` string, or ``""`` when the
        response has no choices, no message or no content.

    Raises:
        RuntimeError: If sending the request, the HTTP status check or parsing
            the response fails. The original exception is attached as
            ``__cause__``.
    """
    headers = {"Content-Type": "application/json"}
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": temperature,
    }

    try:
        response = requests.post(
            api_url,
            headers=headers,
            json=payload,
            timeout=timeout,
        )
        response.raise_for_status()
        body = response.json()
        # Chat completions carry the text in `message.content`, not `text`.
        # `or` is used instead of a `.get` default so that `"choices": []` and
        # `"content": null` fall back to "" exactly like a missing key does.
        choices = body.get("choices") or [{}]
        message = choices[0].get("message") or {}
        reply = message.get("content") or ""
        return reply.strip()
    except Exception as e:
        print(f"API调用错误: {str(e)}")
        # Raise instead of returning an error string so the caller can mark
        # the request as failed; chain the cause to keep the real traceback.
        raise RuntimeError(f"API调用错误: {str(e)}") from e

def process_in_batches(data, api_url, output_file="./all_model_output/Qwen2.5-7B-Instruct/Qwen2.5-7B-Instruct-test_result.json"):
    """Query the model for every record in batches and checkpoint the results.

    Reads two module-level globals: ``MAX_WORKERS`` (thread-pool size and also
    the number of requests per batch) and ``content`` (the prompt prefix put in
    front of each record's ``input``). Each request is sent through
    ``call_llm_api``.

    Args:
        data: Indexable sequence of dicts with the keys ``input`` and
            ``output``.
        api_url: URL forwarded to ``call_llm_api``.
        output_file: JSON file that is rewritten with all results collected so
            far after every batch.

    Returns:
        List of result dicts, in completion order within each batch. Each has
        ``status`` set to ``"success"`` or ``"failed"``; failed entries also
        carry ``error`` and an empty ``model_response``.
    """
    from pathlib import Path  # local import: only needed for the directory check

    total = len(data)
    print(f"总请求数: {total}, 线程池大小: {MAX_WORKERS}")

    # Create the target folder up front; otherwise a missing directory is only
    # discovered after the first batch of API calls has already been made.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)

    # Only this (calling) thread ever touches `results`, so no lock is needed.
    results = []

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        for batch_start in range(0, total, MAX_WORKERS):
            batch_end = min(batch_start + MAX_WORKERS, total)
            print(f"\n处理批次: {batch_start+1} - {batch_end}/{total}")

            # Submit the whole batch; remember which record each future serves.
            futures = {}
            for i in range(batch_start, batch_end):
                record = data[i]
                prompt = content + record['input']
                future = executor.submit(call_llm_api, prompt, api_url)
                futures[future] = (i, record)

            # Collect the batch as the requests finish.
            batch_results = []
            for future in as_completed(futures):
                i, item = futures[future]
                try:
                    model_response = future.result()
                except Exception as e:
                    print(f"请求 #{i+1} 处理失败: {str(e)}")
                    batch_results.append({
                        "error": str(e),
                        "input": item['input'],
                        "output": item['output'],
                        # No answer exists for a failed request. Store an empty
                        # string rather than reading `model_response`, which
                        # would be unbound or left over from another record.
                        "model_response": "",
                        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                        "status": "failed"
                    })
                else:
                    batch_results.append({
                        "input": item['input'],
                        "output": item['output'],
                        "model_response": model_response,
                        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                        "status": "success"
                    })

            results.extend(batch_results)

            # Checkpoint: rewrite the file with everything collected so far.
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(results, f, ensure_ascii=False, indent=2)
            print(f"批次 {batch_start+1}-{batch_end} 结果已保存")

    print(f"\n所有请求处理完成，最终结果已保存到 {output_file}")
    return results


# Script entry point: load the test records and query the model for each one.
if __name__ == "__main__":
    with open('../../data_process/final_data/test.json', 'r', encoding='utf-8') as file:
        test_json = json.load(file)



    # Configuration
    MAX_WORKERS = 100 # Thread-pool size; process_in_batches also uses it as the batch size
    API_URL = "http://0.0.0.0:8000/v1/chat/completions"
    print({"开始timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")})
    process_in_batches(test_json, API_URL)
    print({"结束timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")})


{'开始timestamp': '2025-09-29 15:46:41'}
总请求数: 4671, 线程池大小: 100

处理批次: 1 - 100/4671
批次 1-100 结果已保存

处理批次: 101 - 200/4671
批次 101-200 结果已保存

处理批次: 201 - 300/4671
批次 201-300 结果已保存

处理批次: 301 - 400/4671
批次 301-400 结果已保存

处理批次: 401 - 500/4671
批次 401-500 结果已保存

处理批次: 501 - 600/4671
批次 501-600 结果已保存

处理批次: 601 - 700/4671
批次 601-700 结果已保存

处理批次: 701 - 800/4671
批次 701-800 结果已保存

处理批次: 801 - 900/4671
批次 801-900 结果已保存

处理批次: 901 - 1000/4671
批次 901-1000 结果已保存

处理批次: 1001 - 1100/4671
批次 1001-1100 结果已保存

处理批次: 1101 - 1200/4671
批次 1101-1200 结果已保存

处理批次: 1201 - 1300/4671
批次 1201-1300 结果已保存

处理批次: 1301 - 1400/4671
批次 1301-1400 结果已保存

处理批次: 1401 - 1500/4671
批次 1401-1500 结果已保存

处理批次: 1501 - 1600/4671
批次 1501-1600 结果已保存

处理批次: 1601 - 1700/4671
批次 1601-1700 结果已保存

处理批次: 1701 - 1800/4671
批次 1701-1800 结果已保存

处理批次: 1801 - 1900/4671
批次 1801-1900 结果已保存

处理批次: 1901 - 2000/4671
批次 1901-2000 结果已保存

处理批次: 2001 - 2100/4671
批次 2001-2100 结果已保存

处理批次: 2101 - 2200/4671
批次 2101-2200 结果已保存

处理批次: 2201 - 2300/4671
批次 2201-23