In [53]:
import json
import pandas as pd

In [54]:
with open('llama3_8b_whole_results.json', 'r') as f:
    data = json.load(f)

In [55]:
data.keys()

dict_keys(['llama3_8b', 'llama3_8b_o_low', 'llama3_8b_a_low', 'llama3_8b_e_high', 'llama3_8b_a_high', 'llama3_8b_n_low', 'llama3_8b_c_low', 'llama3_8b_c_high', 'llama3_8b_o_high', 'llama3_8b_n_high', 'llama3_8b_e_low'])

In [56]:
def extract_metrics(data, metrics):
    results = {}
    for model in data:
        results[model] = {}
        for metric, specific_metric in metrics.items():
            for test_suite in data[model]:
                if metric in test_suite:
                    if metric == "truthfulqa":
                        value = data[model][test_suite]["truthfulqa_mc2"]["acc,none"]
                    elif metric == "gsm8k":
                        value = data[model][test_suite]["gsm8k"]["exact_match,flexible-extract"]
                    else:
                        value = data[model][test_suite][metric][specific_metric]
                    results[model][metric] = value
                    break
    return results

metrics = {
    "truthfulqa": "truthfulqa_mc2", 
    "gpqa_main_zeroshot": "acc,none", 
    "gpqa_main_n_shot": "acc,none", 
    "social_iqa": "acc,none", 
    "commonsense_qa": "acc,none", 
    "gsm8k": "exact_match,flexible-extract", 
    "mathqa": "acc,none",
    "mmlu": "acc,none",
    "piqa": "acc,none",
}

results = extract_metrics(data, metrics)

# 打印结果
for model, model_results in results.items():
    print(f'"{model}": {json.dumps(model_results)}')

"llama3_8b": {"truthfulqa": 0.534938756833991, "gpqa_main_zeroshot": 0.28125, "gpqa_main_n_shot": 0.29910714285714285, "social_iqa": 0.4969293756397134, "commonsense_qa": 0.5176085176085176, "gsm8k": 0.6467020470053071, "mathqa": 0.27872696817420434, "mmlu": 0.5122489673835636, "piqa": 0.7818280739934712}
"llama3_8b_o_low": {"truthfulqa": 0.4911052063941109, "gpqa_main_zeroshot": 0.25, "gpqa_main_n_shot": 0.26339285714285715, "social_iqa": 0.43807574206755373, "commonsense_qa": 0.24815724815724816, "gsm8k": 0.31766489764973466, "mathqa": 0.27839195979899495, "mmlu": 0.2977496083179034, "piqa": 0.7230685527747551}
"llama3_8b_a_low": {"truthfulqa": 0.45523405400796635, "gpqa_main_zeroshot": 0.21428571428571427, "gpqa_main_n_shot": 0.24553571428571427, "social_iqa": 0.38382804503582396, "commonsense_qa": 0.2628992628992629, "gsm8k": 0.6482183472327521, "mathqa": 0.27738693467336684, "mmlu": 0.26335279874661727, "piqa": 0.6855277475516867}
"llama3_8b_e_high": {"truthfulqa": 0.3500611947663

In [57]:
import json

def extract_metrics_with_stderr(data, metrics):
    results = {}
    for model in data:
        results[model] = {}
        for metric, specific_metric in metrics.items():
            for test_suite in data[model]:
                if metric in test_suite:
                    if metric == "truthfulqa":
                        value = data[model][test_suite]["truthfulqa_mc2"]["acc,none"]
                        stderr = data[model][test_suite]["truthfulqa_mc2"].get("acc_stderr,none", None)
                    elif metric == "gsm8k":
                        value = data[model][test_suite]["gsm8k"]["exact_match,flexible-extract"]
                        stderr = data[model][test_suite]["gsm8k"].get("exact_match_stderr,flexible-extract", None)
                    else:
                        value = data[model][test_suite][metric][specific_metric]
                        stderr = data[model][test_suite][metric].get(specific_metric.replace("acc,", "acc_stderr,"), None)
                    results[model][metric] = (value, stderr)
                    break
    return results

metrics = {
    "truthfulqa": "truthfulqa_mc2", 
    "gpqa_main_zeroshot": "acc,none", 
    "gpqa_main_n_shot": "acc,none", 
    "social_iqa": "acc,none", 
    "commonsense_qa": "acc,none", 
    "gsm8k": "exact_match,flexible-extract", 
    "mathqa": "acc,none",
    "mmlu": "acc,none"
}

results = extract_metrics_with_stderr(data, metrics)

# 打印结果
for model, model_results in results.items():
    formatted_results = {k: {"value": v[0], "stderr": v[1]} for k, v in model_results.items()}
    print(f'"{model}": {json.dumps(formatted_results)}')

"llama3_8b": {"truthfulqa": {"value": 0.534938756833991, "stderr": 0.01592298890607095}, "gpqa_main_zeroshot": {"value": 0.28125, "stderr": 0.021265785688273954}, "gpqa_main_n_shot": {"value": 0.29910714285714285, "stderr": 0.021656359273376974}, "social_iqa": {"value": 0.4969293756397134, "stderr": 0.011313857198301221}, "commonsense_qa": {"value": 0.5176085176085176, "stderr": 0.01430607861484495}, "gsm8k": {"value": 0.6467020470053071, "stderr": 0.013166337192115686}, "mathqa": {"value": 0.27872696817420434, "stderr": 0.008208048863665954}, "mmlu": {"value": 0.5122489673835636, "stderr": 0.003966375641672073}}
"llama3_8b_o_low": {"truthfulqa": {"value": 0.4911052063941109, "stderr": 0.01605473397594555}, "gpqa_main_zeroshot": {"value": 0.25, "stderr": 0.02048079801297601}, "gpqa_main_n_shot": {"value": 0.26339285714285715, "stderr": 0.0208336900165786}, "social_iqa": {"value": 0.43807574206755373, "stderr": 0.011226965068029933}, "commonsense_qa": {"value": 0.24815724815724816, "std

In [58]:
results

{'llama3_8b': {'truthfulqa': (0.534938756833991, 0.01592298890607095),
  'gpqa_main_zeroshot': (0.28125, 0.021265785688273954),
  'gpqa_main_n_shot': (0.29910714285714285, 0.021656359273376974),
  'social_iqa': (0.4969293756397134, 0.011313857198301221),
  'commonsense_qa': (0.5176085176085176, 0.01430607861484495),
  'gsm8k': (0.6467020470053071, 0.013166337192115686),
  'mathqa': (0.27872696817420434, 0.008208048863665954),
  'mmlu': (0.5122489673835636, 0.003966375641672073)},
 'llama3_8b_o_low': {'truthfulqa': (0.4911052063941109, 0.01605473397594555),
  'gpqa_main_zeroshot': (0.25, 0.02048079801297601),
  'gpqa_main_n_shot': (0.26339285714285715, 0.0208336900165786),
  'social_iqa': (0.43807574206755373, 0.011226965068029933),
  'commonsense_qa': (0.24815724815724816, 0.012366507794696467),
  'gsm8k': (0.31766489764973466, 0.012824066621488849),
  'mathqa': (0.27839195979899495, 0.008205019480641219),
  'mmlu': (0.2977496083179034, 0.0038116075267708156)},
 'llama3_8b_a_low': {'tr

In [59]:
data = results  # 假设results是您的原始数据

# 定义模型顺序
model_order = [
    'llama3_8b',
    'llama3_8b_o_low', 'llama3_8b_o_high',
    'llama3_8b_c_low', 'llama3_8b_c_high',
    'llama3_8b_e_low', 'llama3_8b_e_high',
    'llama3_8b_a_low', 'llama3_8b_a_high',
    'llama3_8b_n_low', 'llama3_8b_n_high',
]

# 创建一个空的列表来存储处理后的数据
processed_data = []

# 处理每个模型
for model in model_order:
    if model in data:
        row = {'model': model}
        
        for metric in data[model]:
            row[f"{metric}_mean"] = data[model][metric][0]  # acc
            row[f"{metric}_std"] = data[model][metric][1]  # stderr
        
        processed_data.append(row)

# 创建DataFrame
df = pd.DataFrame(processed_data)

# 重新排序列，使得value和stderr相邻
columns = ['model'] + sum([[f"{m}_mean", f"{m}_std"] for m in data[list(data.keys())[0]].keys()], [])
df = df[columns]
df.head(10)

# 如果您想保存为CSV，可以取消下面这行的注释
# df.to_csv('output.csv', index=False) 

Unnamed: 0,model,truthfulqa_mean,truthfulqa_std,gpqa_main_zeroshot_mean,gpqa_main_zeroshot_std,gpqa_main_n_shot_mean,gpqa_main_n_shot_std,social_iqa_mean,social_iqa_std,commonsense_qa_mean,commonsense_qa_std,gsm8k_mean,gsm8k_std,mathqa_mean,mathqa_std,mmlu_mean,mmlu_std
0,llama3_8b,0.534939,0.015923,0.28125,0.021266,0.299107,0.021656,0.496929,0.011314,0.517609,0.014306,0.646702,0.013166,0.278727,0.008208,0.512249,0.003966
1,llama3_8b_o_low,0.491105,0.016055,0.25,0.020481,0.263393,0.020834,0.438076,0.011227,0.248157,0.012367,0.317665,0.012824,0.278392,0.008205,0.29775,0.003812
2,llama3_8b_o_high,0.52426,0.016185,0.279018,0.021214,0.279018,0.021214,0.423234,0.01118,0.228501,0.012021,0.683851,0.012808,0.268677,0.008115,0.229882,0.003545
3,llama3_8b_c_low,0.389631,0.015777,0.209821,0.019259,0.229911,0.019902,0.378199,0.010973,0.216216,0.011786,0.318423,0.012832,0.250586,0.007933,0.268836,0.003701
4,llama3_8b_c_high,0.549885,0.016381,0.296875,0.02161,0.283482,0.021317,0.42477,0.011185,0.481572,0.014305,0.721759,0.012344,0.282747,0.008244,0.297109,0.00382
5,llama3_8b_e_low,0.59155,0.016083,0.267857,0.020946,0.28125,0.021266,0.409417,0.011127,0.56593,0.01419,0.630023,0.013299,0.276382,0.008187,0.413688,0.00403
6,llama3_8b_e_high,0.350061,0.015509,0.272321,0.021055,0.267857,0.020946,0.418117,0.011161,0.290745,0.013001,0.69674,0.012662,0.257956,0.008009,0.248255,0.003631
7,llama3_8b_a_low,0.455234,0.016208,0.214286,0.019408,0.245536,0.020357,0.383828,0.011004,0.262899,0.012603,0.648218,0.013153,0.277387,0.008196,0.263353,0.003713
8,llama3_8b_a_high,0.528329,0.016486,0.287946,0.021417,0.274554,0.021109,0.428352,0.011197,0.284193,0.012913,0.707354,0.012532,0.248576,0.007912,0.306723,0.003837
9,llama3_8b_n_low,0.582462,0.01626,0.294643,0.021562,0.287946,0.021417,0.428352,0.011197,0.477477,0.0143,0.718726,0.012385,0.297487,0.008369,0.308432,0.00387


In [60]:
import numpy as np

for metric in df.columns:
    if metric != "model" and not metric.endswith("_std"):
        mean_col = metric
        std_col = f"{metric[:-5]}_std" if metric.endswith("_mean") else f"{metric}_std"
        
        if std_col in df.columns:
            # Convert to float, round to 1 decimal place, and format as percentage
            mean_rounded = (df[mean_col].astype(float) * 100).round(1)
            std_rounded = (df[std_col].astype(float) * 100).round(1)
            
            df[metric] = mean_rounded.apply(lambda x: f"{x:.1f}%") + r" $\pm$ " + std_rounded.apply(lambda x: f"{x:.1f}%")
            df = df.drop(columns=[std_col])

# Optional: Rename columns to remove "_mean" suffix
df.columns = [col[:-5] if col.endswith("_mean") else col for col in df.columns]

In [61]:
df

Unnamed: 0,model,truthfulqa,gpqa_main_zeroshot,gpqa_main_n_shot,social_iqa,commonsense_qa,gsm8k,mathqa,mmlu
0,llama3_8b,53.5% $\pm$ 1.6%,28.1% $\pm$ 2.1%,29.9% $\pm$ 2.2%,49.7% $\pm$ 1.1%,51.8% $\pm$ 1.4%,64.7% $\pm$ 1.3%,27.9% $\pm$ 0.8%,51.2% $\pm$ 0.4%
1,llama3_8b_o_low,49.1% $\pm$ 1.6%,25.0% $\pm$ 2.0%,26.3% $\pm$ 2.1%,43.8% $\pm$ 1.1%,24.8% $\pm$ 1.2%,31.8% $\pm$ 1.3%,27.8% $\pm$ 0.8%,29.8% $\pm$ 0.4%
2,llama3_8b_o_high,52.4% $\pm$ 1.6%,27.9% $\pm$ 2.1%,27.9% $\pm$ 2.1%,42.3% $\pm$ 1.1%,22.9% $\pm$ 1.2%,68.4% $\pm$ 1.3%,26.9% $\pm$ 0.8%,23.0% $\pm$ 0.4%
3,llama3_8b_c_low,39.0% $\pm$ 1.6%,21.0% $\pm$ 1.9%,23.0% $\pm$ 2.0%,37.8% $\pm$ 1.1%,21.6% $\pm$ 1.2%,31.8% $\pm$ 1.3%,25.1% $\pm$ 0.8%,26.9% $\pm$ 0.4%
4,llama3_8b_c_high,55.0% $\pm$ 1.6%,29.7% $\pm$ 2.2%,28.3% $\pm$ 2.1%,42.5% $\pm$ 1.1%,48.2% $\pm$ 1.4%,72.2% $\pm$ 1.2%,28.3% $\pm$ 0.8%,29.7% $\pm$ 0.4%
5,llama3_8b_e_low,59.2% $\pm$ 1.6%,26.8% $\pm$ 2.1%,28.1% $\pm$ 2.1%,40.9% $\pm$ 1.1%,56.6% $\pm$ 1.4%,63.0% $\pm$ 1.3%,27.6% $\pm$ 0.8%,41.4% $\pm$ 0.4%
6,llama3_8b_e_high,35.0% $\pm$ 1.6%,27.2% $\pm$ 2.1%,26.8% $\pm$ 2.1%,41.8% $\pm$ 1.1%,29.1% $\pm$ 1.3%,69.7% $\pm$ 1.3%,25.8% $\pm$ 0.8%,24.8% $\pm$ 0.4%
7,llama3_8b_a_low,45.5% $\pm$ 1.6%,21.4% $\pm$ 1.9%,24.6% $\pm$ 2.0%,38.4% $\pm$ 1.1%,26.3% $\pm$ 1.3%,64.8% $\pm$ 1.3%,27.7% $\pm$ 0.8%,26.3% $\pm$ 0.4%
8,llama3_8b_a_high,52.8% $\pm$ 1.6%,28.8% $\pm$ 2.1%,27.5% $\pm$ 2.1%,42.8% $\pm$ 1.1%,28.4% $\pm$ 1.3%,70.7% $\pm$ 1.3%,24.9% $\pm$ 0.8%,30.7% $\pm$ 0.4%
9,llama3_8b_n_low,58.2% $\pm$ 1.6%,29.5% $\pm$ 2.2%,28.8% $\pm$ 2.1%,42.8% $\pm$ 1.1%,47.7% $\pm$ 1.4%,71.9% $\pm$ 1.2%,29.7% $\pm$ 0.8%,30.8% $\pm$ 0.4%


In [65]:
def escape_latex(s):
    """Escape special characters in LaTeX."""
    return (s.replace('&', r'\&')
             .replace('%', r'\%')
             .replace('$', r'\$')
             .replace('#', r'\#')
             .replace('_', r'\_')
             .replace('{', r'\{')
             .replace('}', r'\}')
             .replace('~', r'\textasciitilde{}')
             .replace('^', r'\textasciicircum{}')
             .replace('\\', r'\textbackslash{}'))

def generate_latex_table(df):
    # 定义指标的顺序
    metric_order = ['truthfulqa', 'gpqa_main_zeroshot', 'gpqa_main_n_shot', 'social_iqa', 'commonsense_qa', 'gsm8k', 'mathqa', 'mmlu']

    # 确保所有需要的列都在数据框中
    available_metrics = [col for col in metric_order if col in df.columns]

    # 开始生成LaTeX代码
    latex_code = []
    latex_code.append(r'\begin{table}[htbp]')
    latex_code.append(r'\centering')
    latex_code.append(r'\caption{Model Performance Comparison}')
    latex_code.append(r'\label{tab:model-performance}')
    latex_code.append(r'\resizebox{\textwidth}{!}{')
    latex_code.append(r'\begin{tabular}{l' + 'c' * len(available_metrics) + '}')
    latex_code.append(r'\toprule')

    # 添加表头
    header = ['Model'] + [escape_latex(metric) for metric in available_metrics]
    latex_code.append(' & '.join(header) + r' \\')
    latex_code.append(r'\midrule')

    # 添加数据行
    for _, row in df.iterrows():
        model = escape_latex(row['model'])
        data_row = [model] + [escape_latex(str(row[metric])) for metric in available_metrics]
        latex_code.append(' & '.join(data_row) + r' \\')

    # 结束表格
    latex_code.append(r'\bottomrule')
    latex_code.append(r'\end{tabular}')
    latex_code.append(r'}')
    latex_code.append(r'\end{table}')

    return '\n'.join(latex_code)

# 生成LaTeX代码
latex_table = generate_latex_table(df)

# 打印LaTeX代码
print(latex_table)

# 可选：将LaTeX代码保存到文件
with open('model_performance_table.tex', 'w') as f:
    f.write(latex_table)

\begin{table}[htbp]
\centering
\caption{Model Performance Comparison}
\label{tab:model-performance}
\resizebox{\textwidth}{!}{
\begin{tabular}{lcccccccc}
\toprule
Model & truthfulqa & gpqa\textbackslash{}_main\textbackslash{}_zeroshot & gpqa\textbackslash{}_main\textbackslash{}_n\textbackslash{}_shot & social\textbackslash{}_iqa & commonsense\textbackslash{}_qa & gsm8k & mathqa & mmlu \\
\midrule
llama3\textbackslash{}_8b & 53.5\textbackslash{}% \textbackslash{}$\textbackslash{}pm\textbackslash{}$ 1.6\textbackslash{}% & 28.1\textbackslash{}% \textbackslash{}$\textbackslash{}pm\textbackslash{}$ 2.1\textbackslash{}% & 29.9\textbackslash{}% \textbackslash{}$\textbackslash{}pm\textbackslash{}$ 2.2\textbackslash{}% & 49.7\textbackslash{}% \textbackslash{}$\textbackslash{}pm\textbackslash{}$ 1.1\textbackslash{}% & 51.8\textbackslash{}% \textbackslash{}$\textbackslash{}pm\textbackslash{}$ 1.4\textbackslash{}% & 64.7\textbackslash{}% \textbackslash{}$\textbackslash{}pm\textbackslash{}$ 1.3\tex

In [63]:
print(df.index)
print(df.columns)

RangeIndex(start=0, stop=11, step=1)
Index(['model', 'truthfulqa', 'gpqa_main_zeroshot', 'gpqa_main_n_shot',
       'social_iqa', 'commonsense_qa', 'gsm8k', 'mathqa', 'mmlu'],
      dtype='object')
