In [6]:
import json
import re
import pandas as pd

# Result files to aggregate, one per random seed.
# To include another run, add its seed number to the tuple below.
result_files = [
    f"mf_results_seed{seed}.json"
    for seed in (42, 999, 2024, 2025, 3407)
]

def load_results(file_path: str):
    """Read the JSON document stored at *file_path* and return the parsed value.

    The file is decoded explicitly as UTF-8 (the encoding JSON is exchanged
    in) rather than with the platform's locale default, so files containing
    non-ASCII text load the same way on every machine.

    Raises whatever ``open`` or ``json.load`` raise, e.g. ``FileNotFoundError``
    for a missing file or ``json.JSONDecodeError`` for malformed content.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)

def get_seed(file_path: str) -> str:
    """Return the digits that follow the first "seed" in *file_path*.

    The whole string is searched, so the first ``seed<digits>`` occurrence
    wins wherever it appears. If there is no such occurrence, *file_path*
    itself is returned so the caller still gets a usable label.
    """
    match = re.search(r"seed(\d+)", file_path)
    if match is None:
        return file_path
    return match.group(1)

# Names of datasets to list after the in-domain ones. Every dataset in a
# result file that is not named here counts as in-domain; the list is empty,
# so all datasets are treated as in-domain.
ood_datasets = []

# One entry per seed; each becomes one DataFrame row further down.
rows_score = {}  # {seed: {column: value}}
rows_cost = {}   # {seed: {column: value}}

for fp in result_files:
    data = load_results(fp)
    seed = get_seed(fp)

    per_dataset = data["datasets"]
    datasets_name = list(per_dataset)
    # In-domain datasets are reported in alphabetical order.
    indomain_datasets = sorted(set(datasets_name) - set(ood_datasets))

    row_score = {}
    row_cost = {}

    # Per-dataset score and cost
    for d in indomain_datasets + ood_datasets:
        stats = per_dataset[d]
        score = stats["selection_accuracy"] * 100  # convert to a percentage
        cost = stats["total_cost"]
        row_score[d] = score
        row_cost[d] = cost
        print(f"[seed {seed}] {d}: Selection Accuracy = {score:.4f}, Total Cost = ${cost:.2f}")

    # Averages / total cost
    # row_score["indomain_avg"] = data["indomain_avg"] * 100
    # row_score["ood_avg"] = data["ood_avg"] * 100
    # row_score["indomain_sample_avg"] = data["indomain_sample_avg"] * 100
    # row_score["ood_sample_avg"] = data["ood_sample_avg"] * 100
    row_score.update({
        "all_dataset_avg": data["all_dataset_avg"] * 100,
        "sample_avg": data["sample_avg"] * 100,
    })

    # row_cost["indomain_total_cost"] = data["indomain_total_cost"]
    # row_cost["ood_total_cost"] = data["ood_total_cost"]
    row_cost.update({"total_cost": data["total_cost"]})

    rows_score[seed] = row_score
    rows_cost[seed] = row_cost

# One row per seed, with the index labelled "seed"
df_score = pd.DataFrame.from_dict(rows_score, orient="index").rename_axis("seed")
df_cost = pd.DataFrame.from_dict(rows_cost, orient="index").rename_axis("seed")

df_score.to_csv("mf_selection_accuracy_by_seed.csv")
df_cost.to_csv("mf_total_cost_by_seed.csv")

[seed 42] aime: Selection Accuracy = 83.3333, Total Cost = $1.57
[seed 42] arenahard: Selection Accuracy = 78.2222, Total Cost = $7.97
[seed 42] gpqa: Selection Accuracy = 88.3333, Total Cost = $2.22
[seed 42] hle: Selection Accuracy = 25.7716, Total Cost = $39.29
[seed 42] livecodebench: Selection Accuracy = 84.5426, Total Cost = $18.17
[seed 42] livemathbench: Selection Accuracy = 78.3784, Total Cost = $1.43
[seed 42] mmlupro: Selection Accuracy = 87.2222, Total Cost = $10.25
[seed 42] simpleqa: Selection Accuracy = 54.3914, Total Cost = $2.99
[seed 42] swe-bench: Selection Accuracy = 16.0000, Total Cost = $7.49
[seed 42] tau2: Selection Accuracy = 69.0476, Total Cost = $19.00
[seed 999] aime: Selection Accuracy = 83.3333, Total Cost = $1.45
[seed 999] arenahard: Selection Accuracy = 79.1111, Total Cost = $8.26
[seed 999] gpqa: Selection Accuracy = 83.3333, Total Cost = $2.60
[seed 999] hle: Selection Accuracy = 27.0062, Total Cost = $41.82
[seed 999] livecodebench: Selection Accurac