In [1]:
import os
import pandas as pd
import json
from IPython.display import display, Markdown, Latex

from common.utils import RESULTS_DIR, EVAL_SIZE, filename_to_obj, remove_index

In [2]:
def results_as_pandas(filename):
    """Load one JSON-lines results file into a flat DataFrame.

    The file is read from ``RESULTS_DIR``; every non-blank line is parsed as
    one JSON document. The parameters returned by ``filename_to_obj(filename)``
    are added as constant columns. The ``"evaluations"`` list of each record is
    exploded into one row per evaluation, and every evaluation group is
    expanded into columns named ``"<group>/<key>"``.

    Args:
        filename: Name of a file inside ``RESULTS_DIR``.

    Returns:
        A ``pandas.DataFrame`` with a ``question_idx`` column (position of the
        record in the file), the record's own fields, the filename-derived
        parameters and the flattened evaluation columns. The raw
        ``"evaluations"`` column is not included.
    """
    path = os.path.join(RESULTS_DIR, filename)
    # Explicit encoding: the default depends on the platform locale.
    with open(path, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
        records = [json.loads(line) for line in f if line.strip()]
    data = pd.DataFrame(records)

    params = filename_to_obj(filename)
    for k, v in params.items():
        data[k] = v

    # One row per evaluation; the pre-explode row position is kept as
    # "question_idx" and the index becomes unique again.
    data = data.explode("evaluations")
    data = data.rename_axis("question_idx").reset_index()

    # Expand the evaluation dicts once (the row-wise apply is slow), then
    # expand each group into "<group>/<key>" columns.
    evaluations = data["evaluations"].apply(pd.Series)
    flattened = [
        evaluations[group].apply(pd.Series).add_prefix(f"{group}/")
        for group in evaluations.columns
    ]
    data = pd.concat([data.drop(columns=["evaluations"]), *flattened], axis=1)

    return data


# os.listdir returns entries in arbitrary order; sort so that the row order and
# the file used to derive the parameter names are the same on every run.
files = sorted(os.listdir(RESULTS_DIR))
if not files:
    raise FileNotFoundError(f"No result files found in {RESULTS_DIR}")

# Parameter names are taken from the first file name.
# NOTE(review): assumes every file name encodes the same parameter keys - confirm.
params_names = list(filename_to_obj(files[0]).keys())

# ignore_index gives the combined frame unique row labels; each per-file frame
# starts again at 0.
all_results = pd.concat([results_as_pandas(f) for f in files], ignore_index=True)
all_results.head()

Unnamed: 0,question_idx,question_id,llm,prompt_id,temperature,nli,nlg,citations/ais_recall,citations/ais_precision,citations/n_sentences,...,citations/n_overcitations,citations/sentences,citations/supported,citations/citations,citations/correct_citations,citations/out_of_range,correctness/answer_overlap,correctness/answer_entail,correctness/citations_recall,quality/answer_relevance
0,0,5abab42e55429955dce3eed2,qwen1_5-14b-chat-q8_0,1,0.1,t5_xxl_true_nli_mixture,mistral-7b-instruct-v0.2.Q8_0,0.0,0.5,3,...,0,[Kevin Crowley is the professional lacrosse pl...,"[0, 0, 0]","[[], [], [3, 4]]","[[], [], [True]]","[0, 0, 0]",1.0,0.0,1.0,0.546523
1,0,5abab42e55429955dce3eed2,qwen1_5-14b-chat-q8_0,1,0.1,t5_xxl_true_nli_mixture,mistral-7b-instruct-v0.2.Q8_0,0.0,1.0,2,...,0,[Kevin Crowley is the professional lacrosse pl...,"[0, 0]","[[], [4]]","[[], [True]]","[0, 0]",1.0,0.0,0.5,0.650574
2,0,5abab42e55429955dce3eed2,qwen1_5-14b-chat-q8_0,1,0.1,t5_xxl_true_nli_mixture,mistral-7b-instruct-v0.2.Q8_0,0.333333,1.0,3,...,0,[Kevin Crowley is the professional lacrosse pl...,"[0, 0, 1]","[[], [], [2]]","[[], [], [True]]","[0, 0, 0]",1.0,0.0,0.0,0.480813
3,1,5a761900554299109176e648,qwen1_5-14b-chat-q8_0,1,0.1,t5_xxl_true_nli_mixture,mistral-7b-instruct-v0.2.Q8_0,1.0,1.0,1,...,0,"[Flynn Intel Group, a lobbying group House Ide...",[1],"[[0, 1]]","[[True, True]]",[0],1.0,1.0,1.0,0.378449
4,1,5a761900554299109176e648,qwen1_5-14b-chat-q8_0,1,0.1,t5_xxl_true_nli_mixture,mistral-7b-instruct-v0.2.Q8_0,1.0,1.0,1,...,0,"[Flynn Intel Group, a lobbying group House Ide...",[1],"[[0, 1]]","[[True, True]]",[0],1.0,1.0,1.0,0.378826


In [3]:
# Object-typed columns (the list-valued evaluation details) are dropped before
# aggregation; the run parameters and "question_id" are kept because they are
# the grouping keys used later.
keep_cols = {*params_names, "question_id"}
all_obj_cols = all_results.select_dtypes(include=["object"]).columns
# Filter in column order (not via a set difference) so the printed list is
# deterministic, and so nothing fails if "question_id" is not object-typed.
drop_obj_cols = [col for col in all_obj_cols if col not in keep_cols]
print(f"Dropping columns: {drop_obj_cols}")
all_num_results = all_results.drop(columns=drop_obj_cols)

Dropping columns: ['citations/citations', 'citations/correct_citations', 'citations/supported', 'citations/out_of_range', 'citations/sentences']


In [4]:
# Split on the per-file record position: records below EVAL_SIZE form the
# evaluation split, all remaining records form the training split.
eval_split = all_num_results.loc[all_num_results["question_idx"].lt(EVAL_SIZE)]
train_split = all_num_results.loc[all_num_results["question_idx"].ge(EVAL_SIZE)]

In [5]:
def aggregate(split):
    """Summarise a split per parameter combination.

    First the mean and standard deviation of every metric are computed over
    the rows sharing the same parameters and ``question_id``. Those
    per-question statistics are then averaged for each parameter combination
    (the notebook-level ``params_names``).

    Args:
        split: DataFrame containing ``question_idx``, ``question_id``, the
            parameter columns and the metric columns.

    Returns:
        A DataFrame indexed by the parameter columns with ``(metric, "mean")``
        and ``(metric, "std")`` columns, plus ``n_questions``: the number of
        per-question groups that went into each row.
    """
    metrics = split.drop(columns=["question_idx"])
    question_keys = list(params_names) + ["question_id"]

    # Stage 1: statistics within each (parameters, question) group.
    per_question = metrics.groupby(question_keys).agg(["mean", "std"])

    # Stage 2: average those statistics over the questions of each configuration.
    per_config = per_question.groupby(params_names)
    summary = per_config.mean()
    summary["n_questions"] = per_config.size()
    return summary

# Aggregate both splits with the same procedure.
eval_results = aggregate(split=eval_split)
train_results = aggregate(split=train_split)

# Sanity check: every configuration should be averaged over the same number
# of evaluation questions.
n_distinct_question_counts = eval_results["n_questions"].nunique()
if not n_distinct_question_counts == 1:
    print("Warning: not all rows in evaluation have the same number of examples")

In [6]:
# Restrict the evaluation summary to a single LLM so the remaining rows can be
# compared across prompts.
display(Markdown("### Prompts comparison"))
is_mistral_7b = eval_results.index.get_level_values("llm") == "mistral-7b-instruct-v0.2.Q8_0"
eval_results[is_mistral_7b]

### Prompts comparison

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,citations/ais_recall,citations/ais_recall,citations/ais_precision,citations/ais_precision,citations/n_sentences,citations/n_sentences,citations/n_total_citations,citations/n_total_citations,citations/n_correct_citations,citations/n_correct_citations,...,citations/n_overcitations,correctness/answer_overlap,correctness/answer_overlap,correctness/answer_entail,correctness/answer_entail,correctness/citations_recall,correctness/citations_recall,quality/answer_relevance,quality/answer_relevance,n_questions
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,...,std,mean,std,mean,std,mean,std,mean,std,Unnamed: 25_level_1
llm,prompt_id,temperature,nli,nlg,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2
mistral-7b-instruct-v0.2.Q8_0,1,0.1,t5_xxl_true_nli_mixture,mistral-7b-instruct-v0.2.Q8_0,0.549818,0.056448,0.876011,0.038954,2.853333,0.452654,3.203333,0.641732,2.73,0.566563,...,0.0,0.873581,0.017802,0.656667,0.023094,0.768333,0.040415,0.537185,0.046166,100
mistral-7b-instruct-v0.2.Q8_0,2,0.1,t5_xxl_true_nli_mixture,mistral-7b-instruct-v0.2.Q8_0,0.488163,0.075649,0.843323,0.054662,3.19,0.484905,4.113333,0.767421,3.496667,0.727615,...,0.005774,0.856701,0.013472,0.68,0.023094,0.758333,0.048301,0.51464,0.054069,100
mistral-7b-instruct-v0.2.Q8_0,3,0.1,t5_xxl_true_nli_mixture,mistral-7b-instruct-v0.2.Q8_0,0.512314,0.055778,0.877921,0.038959,2.42,0.414779,3.133333,0.549383,2.463333,0.370637,...,0.017321,0.847581,0.030792,0.566667,0.080829,0.735,0.037528,0.526535,0.046135,100


In [7]:
# Compare LLMs on the evaluation split under one prompt: keep only the rows
# with prompt_id "1" (the level values are compared as strings), then pass the
# now-constant "prompt_id" level to remove_index.
display(Markdown("### Evaluation results"))
eval_uses_prompt_1 = eval_results.index.get_level_values("prompt_id") == "1"
eval_display = remove_index(eval_results[eval_uses_prompt_1], "prompt_id")
eval_display

### Evaluation results

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,prompt_id,citations/ais_recall,citations/ais_recall,citations/ais_precision,citations/ais_precision,citations/n_sentences,citations/n_sentences,citations/n_total_citations,citations/n_total_citations,citations/n_correct_citations,...,citations/n_overcitations,correctness/answer_overlap,correctness/answer_overlap,correctness/answer_entail,correctness/answer_entail,correctness/citations_recall,correctness/citations_recall,quality/answer_relevance,quality/answer_relevance,n_questions
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,mean,std,mean,std,mean,std,mean,std,mean,...,std,mean,std,mean,std,mean,std,mean,std,Unnamed: 24_level_1
llm,temperature,nli,nlg,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
gpt-3.5-turbo-0125,0.1,t5_xxl_true_nli_mixture,mistral-7b-instruct-v0.2.Q8_0,1,0.729222,0.07549,0.991389,0.003849,2.02,0.214022,2.143333,0.207846,2.12,...,0.0,0.884244,0.01299,0.75,0.040415,0.853333,0.023094,0.560276,0.049467,100
mistral-7b-instruct-v0.2.Q8_0,0.1,t5_xxl_true_nli_mixture,mistral-7b-instruct-v0.2.Q8_0,1,0.549818,0.056448,0.876011,0.038954,2.853333,0.452654,3.203333,0.641732,2.73,...,0.0,0.873581,0.017802,0.656667,0.023094,0.768333,0.040415,0.537185,0.046166,100
mixtral-8x7b-instruct-v0.1.Q8_0,0.1,t5_xxl_true_nli_mixture,mistral-7b-instruct-v0.2.Q8_0,1,0.603619,0.082348,0.922921,0.050573,2.746667,0.294585,2.896667,0.442236,2.703333,...,0.0,0.882598,0.007698,0.693333,0.057735,0.871667,0.046188,0.529603,0.052443,100
qwen1_5-14b-chat-q8_0,0.1,t5_xxl_true_nli_mixture,mistral-7b-instruct-v0.2.Q8_0,1,0.628333,0.042339,0.978333,0.011547,1.393333,0.075056,1.593333,0.096603,1.533333,...,0.0,0.876344,0.006598,0.583333,0.040415,0.66,0.030981,0.540885,0.039755,100
qwen1_5-7b-chat-q8_0,0.1,t5_xxl_true_nli_mixture,mistral-7b-instruct-v0.2.Q8_0,1,0.231778,0.069444,0.863667,0.060044,2.42,0.227846,1.303333,0.217846,1.266667,...,0.0,0.864821,0.024118,0.686667,0.057735,0.548333,0.074282,0.545086,0.046009,100


In [8]:
# Same view as the evaluation table, for the training split: keep only the
# rows with prompt_id "1" (compared as a string), then pass the now-constant
# "prompt_id" level to remove_index.
display(Markdown("### Training results"))
train_uses_prompt_1 = train_results.index.get_level_values("prompt_id") == "1"
train_display = remove_index(train_results[train_uses_prompt_1], "prompt_id")
train_display

### Training results

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,prompt_id,citations/ais_recall,citations/ais_recall,citations/ais_precision,citations/ais_precision,citations/n_sentences,citations/n_sentences,citations/n_total_citations,citations/n_total_citations,citations/n_correct_citations,...,citations/n_overcitations,correctness/answer_overlap,correctness/answer_overlap,correctness/answer_entail,correctness/answer_entail,correctness/citations_recall,correctness/citations_recall,quality/answer_relevance,quality/answer_relevance,n_questions
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,mean,std,mean,std,mean,std,mean,std,mean,...,std,mean,std,mean,std,mean,std,mean,std,Unnamed: 24_level_1
llm,temperature,nli,nlg,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
mixtral-8x7b-instruct-v0.1.Q8_0,0.1,t5_xxl_true_nli_mixture,mistral-7b-instruct-v0.2.Q8_0,1,0.624332,0.085718,0.901555,0.04351,2.661476,0.336632,3.073878,0.483275,2.804042,...,0.000161,0.87697,0.022419,0.736374,0.056194,0.845245,0.045212,0.565741,0.045036,3596
