In [1]:
import glob
import pandas as pd
import json

In [2]:
# glob returns matches in arbitrary (filesystem-dependent) order; sort them so
# the notebook produces the same row order on every machine and every re-run.
eval_result_files = sorted(
    glob.glob("./open_llm_leaderboard/**/*.json", recursive=True)
)

In [3]:
def read_eval_result(filename: str) -> dict:
    """Flatten one evaluation-result JSON file into a single flat record.

    The file must contain ``config_general.model_name`` and a ``results``
    mapping of ``task name -> {metric name -> value}``.

    Task names are normalised as follows:
      * every ``"leaderboard|"`` occurrence is removed;
      * if a ``"|"`` remains, the second-to-last ``|``-separated part is kept;
      * tasks whose name contains ``"mmlu"`` but not ``"average"`` are skipped;
      * if a ``":"`` remains, the second-to-last ``:``-separated part is kept.

    Args:
        filename: Path to the JSON result file.

    Returns:
        A dict with ``"model_name"`` plus one ``"<task>_<metric>"`` entry per
        retained metric. If two tasks normalise to the same name, later
        metrics overwrite earlier ones.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If ``config_general``, ``model_name`` or ``results`` is missing.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's locale
    # encoding (which is not UTF-8 on e.g. many Windows setups).
    with open(filename, "r", encoding="utf-8") as f:
        data = json.load(f)
    result = {}
    model_name = data["config_general"]["model_name"]
    result["model_name"] = model_name
    eval_results = data["results"]

    for task_name, task_results in eval_results.items():
        task_name = task_name.replace("leaderboard|", "")
        if "|" in task_name:
            # e.g. "arc:challenge|25" -> "arc:challenge"
            task_name = task_name.split("|")[-2]
        if "mmlu" in task_name and "average" not in task_name:
            # Keep only the MMLU average entry, not the individual MMLU tasks.
            continue
        if ":" in task_name:
            # e.g. "arc:challenge" -> "arc"
            task_name = task_name.split(":")[-2]
        for k, v in task_results.items():
            result[f"{task_name}_{k}"] = v
    return result


In [4]:
# Parse every discovered result file into one flat record per file.
eval_results = [read_eval_result(path) for path in eval_result_files]


In [5]:
# One row per result file, ranked by the aggregate accuracy column, best first.
df = pd.DataFrame(eval_results).sort_values("all_acc", ascending=False)
df

Unnamed: 0,model_name,arc_acc,arc_acc_stderr,arc_acc_norm,arc_acc_norm_stderr,hellaswag_acc,hellaswag_acc_stderr,hellaswag_acc_norm,hellaswag_acc_norm_stderr,truthfulqa_truthfulqa_mc1,...,all_acc,all_acc_stderr,all_acc_norm,all_acc_norm_stderr,all_truthfulqa_mc1,all_truthfulqa_mc1_stderr,all_truthfulqa_mc2,all_truthfulqa_mc2_stderr,all_qem,all_qem_stderr
0,microsoft/Phi-3-mini-4k-instruct,0.620307,0.014182,0.656997,0.013872,0.610635,0.004866,0.787791,0.00408,0.440636,...,0.68488,0.031563,0.722394,0.008976,0.440636,0.01738,0.626144,0.015784,0.636088,0.013253
1,vonjack/Phi-3-mini-4k-instruct-LLaMAfied,0.616894,0.014206,0.645904,0.013975,0.609241,0.004869,0.789385,0.004069,0.462668,...,0.668972,0.031766,0.717645,0.009022,0.462668,0.017455,0.63262,0.015854,0.63533,0.013258


In [6]:
# Render the table as LaTeX source (row index omitted) for pasting into a paper.
latex_table = df.to_latex(index=False)
print(latex_table)

\begin{tabular}{lrrrrrrrrrrrrrrrrrrrrrrrrrrrr}
\toprule
model_name & arc_acc & arc_acc_stderr & arc_acc_norm & arc_acc_norm_stderr & hellaswag_acc & hellaswag_acc_stderr & hellaswag_acc_norm & hellaswag_acc_norm_stderr & truthfulqa_truthfulqa_mc1 & truthfulqa_truthfulqa_mc1_stderr & truthfulqa_truthfulqa_mc2 & truthfulqa_truthfulqa_mc2_stderr & winogrande_acc & winogrande_acc_stderr & gsm8k_qem & gsm8k_qem_stderr & mmlu_acc & mmlu_acc_stderr & all_acc & all_acc_stderr & all_acc_norm & all_acc_norm_stderr & all_truthfulqa_mc1 & all_truthfulqa_mc1_stderr & all_truthfulqa_mc2 & all_truthfulqa_mc2_stderr & all_qem & all_qem_stderr \\
\midrule
microsoft/Phi-3-mini-4k-instruct & 0.620307 & 0.014182 & 0.656997 & 0.013872 & 0.610635 & 0.004866 & 0.787791 & 0.004080 & 0.440636 & 0.017380 & 0.626144 & 0.015784 & 0.716654 & 0.012665 & 0.636088 & 0.013253 & 0.686758 & 0.032667 & 0.684880 & 0.031563 & 0.722394 & 0.008976 & 0.440636 & 0.017380 & 0.626144 & 0.015784 & 0.636088 & 0.013253 \\
vonjack/P

In [7]:
# Swap underscores for spaces in the column headers before the Markdown export.
df.columns = df.columns.map(lambda name: name.replace("_", " "))

In [8]:
# Render the table as a Markdown pipe table (row index omitted).
markdown_table = df.to_markdown(index=False)
print(markdown_table)

| model name                               |   arc acc |   arc acc stderr |   arc acc norm |   arc acc norm stderr |   hellaswag acc |   hellaswag acc stderr |   hellaswag acc norm |   hellaswag acc norm stderr |   truthfulqa truthfulqa mc1 |   truthfulqa truthfulqa mc1 stderr |   truthfulqa truthfulqa mc2 |   truthfulqa truthfulqa mc2 stderr |   winogrande acc |   winogrande acc stderr |   gsm8k qem |   gsm8k qem stderr |   mmlu acc |   mmlu acc stderr |   all acc |   all acc stderr |   all acc norm |   all acc norm stderr |   all truthfulqa mc1 |   all truthfulqa mc1 stderr |   all truthfulqa mc2 |   all truthfulqa mc2 stderr |   all qem |   all qem stderr |
|:-----------------------------------------|----------:|-----------------:|---------------:|----------------------:|----------------:|-----------------------:|---------------------:|----------------------------:|----------------------------:|-----------------------------------:|----------------------------:|----------------------