In [1]:
import glob
import pandas as pd
import json

In [2]:
# Collect every result JSON below ./german_rag_eval, at any depth.
# glob returns matches in an arbitrary, filesystem-dependent order, so the
# paths are sorted to keep the row order reproducible across machines and re-runs.
eval_result_files = sorted(glob.glob("./german_rag_eval/**/*.json", recursive=True))

In [3]:
# Display-name overrides: each key is a raw `model_name` string as it appears
# in a result file, each value is the name to show in the tables instead.
# Names that are not listed here are used unchanged by the reader function.
model_name_map = {
    "_mnt_azureml_cr_j_c10d21340e3c47a8a6a3fbf1b3c2b84c_cap_data-capability_wd_datadir_base_models_Mixtral-8x22B-Instruct-v0.1":
        "mistralai/Mixtral-8x22B-Instruct-v0.1",
}

In [4]:
def read_eval_result(filename: str):
    """Flatten one evaluation result JSON file into a single table row.

    Args:
        filename: Path of a result JSON file. The file must contain
            ``config_general.model_name`` and a ``results`` mapping whose
            entries each provide ``acc`` and ``acc_stderr``.

    Returns:
        A dict holding ``model_name`` followed by ``<task>_acc`` and
        ``<task>_acc_stderr`` for every entry of ``results`` whose key does
        not contain ``_average``.

    Raises:
        KeyError: If one of the expected keys is missing from the file.
    """
    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(filename, "r", encoding="utf-8") as f:
        data = json.load(f)

    model_name = data["config_general"]["model_name"]
    # Raw names listed in model_name_map are replaced; all others pass through.
    result = {"model_name": model_name_map.get(model_name, model_name)}

    for task, metrics in data["results"].items():
        # Entries whose key contains "_average" are not reported.
        if "_average" in task:
            continue
        task = task.replace("community|german_rag_eval:", "")
        # Strip the trailing "|<suffix>" segment regardless of its length.
        # Slicing off a fixed two characters would leave a dangling "|" for
        # suffixes longer than one character (e.g. "|10").
        task = task.rsplit("|", 1)[0]
        result[f"{task}_acc"] = metrics["acc"]
        result[f"{task}_acc_stderr"] = metrics["acc_stderr"]
    return result


In [5]:
# Parse every discovered result file into one flat record (a dict per file).
eval_results = [read_eval_result(path) for path in eval_result_files]


In [6]:
# One row per result file, ordered by the `all_acc` column, highest first.
df = pd.DataFrame(eval_results).sort_values("all_acc", ascending=False)
df

Unnamed: 0,model_name,choose_context_by_question_acc,choose_context_by_question_acc_stderr,choose_question_by_context_acc,choose_question_by_context_acc_stderr,context_question_match_acc,context_question_match_acc_stderr,question_answer_match_acc,question_answer_match_acc_stderr,all_acc,all_acc_stderr
4,mistralai/Mixtral-8x22B-Instruct-v0.1,0.998,0.001414,1.0,0.0,0.967,0.005652,0.986,0.003717,0.98775,0.002696
1,VAGOsolutions/Llama-3-SauerkrautLM-70b-Instruct,0.998,0.001414,1.0,0.0,0.973,0.005128,0.949,0.00696,0.98,0.003376
0,VAGOsolutions/SauerkrautLM-Mixtral-8x7B-Instruct,0.953,0.006696,0.998,0.001414,0.975,0.00494,0.974,0.005035,0.975,0.004521
2,mistralai/Mixtral-8x7B-Instruct-v0.1,0.94,0.007514,0.998,0.001414,0.973,0.005128,0.973,0.005128,0.971,0.004796
5,meta-llama/Meta-Llama-3-70B-Instruct,0.94,0.007514,1.0,0.0,0.974,0.005035,0.946,0.007151,0.965,0.004925
8,microsoft/Phi-3-mini-4k-instruct,0.847,0.01139,0.998,0.001414,0.965,0.005815,0.964,0.005894,0.9435,0.006128
12,VAGOsolutions/Llama-3-SauerkrautLM-8b-Instruct,0.928,0.008178,0.824,0.012049,0.982,0.004206,0.906,0.009233,0.91,0.008417
3,meta-llama/Meta-Llama-3-8B-Instruct,0.725,0.014127,0.855,0.01114,0.977,0.004743,0.943,0.007335,0.875,0.009336
7,DiscoResearch/DiscoLM_German_7b_v1,0.625,0.015317,0.991,0.002988,0.914,0.00887,0.927,0.00823,0.86425,0.008851
14,occiglot/occiglot-7b-de-en-instruct,0.343,0.015019,0.994,0.002443,0.863,0.010879,0.969,0.005484,0.79225,0.008456


In [7]:
# Print the table as LaTeX source, leaving out the index column.
latex_table = df.to_latex(index=False)
print(latex_table)

\begin{tabular}{lrrrrrrrrrr}
\toprule
model_name & choose_context_by_question_acc & choose_context_by_question_acc_stderr & choose_question_by_context_acc & choose_question_by_context_acc_stderr & context_question_match_acc & context_question_match_acc_stderr & question_answer_match_acc & question_answer_match_acc_stderr & all_acc & all_acc_stderr \\
\midrule
mistralai/Mixtral-8x22B-Instruct-v0.1 & 0.998000 & 0.001414 & 1.000000 & 0.000000 & 0.967000 & 0.005652 & 0.986000 & 0.003717 & 0.987750 & 0.002696 \\
VAGOsolutions/Llama-3-SauerkrautLM-70b-Instruct & 0.998000 & 0.001414 & 1.000000 & 0.000000 & 0.973000 & 0.005128 & 0.949000 & 0.006960 & 0.980000 & 0.003376 \\
VAGOsolutions/SauerkrautLM-Mixtral-8x7B-Instruct & 0.953000 & 0.006696 & 0.998000 & 0.001414 & 0.975000 & 0.004940 & 0.974000 & 0.005035 & 0.975000 & 0.004521 \\
mistralai/Mixtral-8x7B-Instruct-v0.1 & 0.940000 & 0.007514 & 0.998000 & 0.001414 & 0.973000 & 0.005128 & 0.973000 & 0.005128 & 0.971000 & 0.004796 \\
meta-llama/Met

In [8]:
# Drop every "*_stderr" column except "all_acc_stderr".
# The columns are collected first and removed with a single non-mutating
# drop, so the frame is never modified while its columns are being iterated.
per_task_stderr_columns = [
    column
    for column in df.columns
    if column.endswith("_stderr") and column != "all_acc_stderr"
]
df = df.drop(columns=per_task_stderr_columns)
# Replace underscores with spaces in the remaining headers.
df.columns = [column.replace("_", " ") for column in df.columns]


In [9]:
# Print the table as markdown source, leaving out the index column.
markdown_table = df.to_markdown(index=False)
print(markdown_table)

| model name                                               |   choose context by question acc |   choose question by context acc |   context question match acc |   question answer match acc |   all acc |   all acc stderr |
|:---------------------------------------------------------|---------------------------------:|---------------------------------:|-----------------------------:|----------------------------:|----------:|-----------------:|
| mistralai/Mixtral-8x22B-Instruct-v0.1                    |                            0.998 |                            1     |                        0.967 |                       0.986 |   0.98775 |       0.00269564 |
| VAGOsolutions/Llama-3-SauerkrautLM-70b-Instruct          |                            0.998 |                            1     |                        0.973 |                       0.949 |   0.98    |       0.0033755  |
| VAGOsolutions/SauerkrautLM-Mixtral-8x7B-Instruct         |                            0.953 |             