In [1]:
import os
import pandas as pd
from IPython.display import display, Markdown
from tqdm.auto import tqdm

from common.consts import RESULTS_DIR, EVAL_SIZE
from common.utils import filename_to_obj, remove_index

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def results_as_pandas(filename):
    """Load one JSON-lines results file into a flat DataFrame.

    The file is read from ``RESULTS_DIR``. Every key/value pair returned by
    ``filename_to_obj(filename)`` is added as a constant column. The
    ``evaluations`` column is exploded so that each element gets its own row,
    and the original row position is kept in a ``question_idx`` column.
    Each evaluation is then expanded in two steps: first into one column per
    top-level key, then each of those into ``"<key>/<subkey>"`` columns.

    Args:
        filename: Name of a file inside ``RESULTS_DIR``.

    Returns:
        The flattened DataFrame. If the file contains no rows, a message is
        printed and the empty frame is returned as read (without the
        parameter or metric columns).
    """
    path = os.path.join(RESULTS_DIR, filename)
    data = pd.read_json(path, lines=True)

    if len(data) == 0:
        print(f"empty file: {filename}")
        return data

    # Broadcast the run parameters parsed from the filename onto every row.
    params = filename_to_obj(filename)
    for k, v in params.items():
        data[k] = v

    # One row per evaluation; the pre-explode row position becomes question_idx.
    data = data.explode("evaluations")
    data = data.rename_axis("question_idx").reset_index()

    # Expand the evaluations exactly once: ``apply(pd.Series)`` is a slow
    # row-wise operation, so its result is reused rather than recomputed.
    # NOTE(review): assumes each evaluation is a mapping of mappings
    # (e.g. {"citations": {...}, ...}) - confirm against the writer.
    evaluations = data["evaluations"].apply(pd.Series)
    flattened = [
        evaluations[col].apply(pd.Series).add_prefix(f"{col}/")
        for col in evaluations.columns
    ]

    # Single concat: original columns (minus the raw evaluations) followed by
    # the prefixed metric columns, in evaluation-key order.
    return pd.concat([data.drop(columns=["evaluations"]), *flattened], axis=1)


# os.listdir returns entries in arbitrary order; sort so that the file used to
# derive the parameter names and the row order of the combined frame are
# reproducible across machines and runs.
files = sorted(os.listdir(RESULTS_DIR))
# NOTE(review): assumes every results file encodes the same parameter names,
# so the first file is representative - confirm against filename_to_obj.
params_names = list(filename_to_obj(files[0]).keys())
all_results = pd.concat([results_as_pandas(f) for f in tqdm(files)])
all_results.head()

100%|██████████| 41/41 [02:00<00:00,  2.94s/it]


Unnamed: 0,question_idx,question_id,llm,prompt_id,temperature,nli,ellm,sim,citations/ais_recall,citations/ais_precision,...,citations/supported,citations/citations,citations/correct_citations,citations/out_of_range,correctness/answer_overlap,correctness/answer_entail,correctness/citations_recall,correctness/citations_precision,quality/answer_relevance,quality/new_question
0,0,5abab42e55429955dce3eed2,Mistral-7B-Instruct-v0.2,1,0.8,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,0.25,1.0,...,"[0, 0, 0, 1]","[[], [], [3], [4]]","[[], [], [True], [True]]","[0, 0, 0, 0]",1.0,1.0,1.0,1.0,0.649022,Which of P.K. Subban's brothers was drafted fi...
1,0,5abab42e55429955dce3eed2,Mistral-7B-Instruct-v0.2,1,0.8,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,0.083333,0.875,...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[[], [], [3], [4], [6], [], [], [], [], [4, 5]...","[[], [], [True], [True], [True], [], [], [], [...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1.0,0.0,1.0,0.5,0.75747,"Who are the two professional hockey players, b..."
2,0,5abab42e55429955dce3eed2,Mistral-7B-Instruct-v0.2,1,0.8,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,0.0,1.0,...,"[0, 0, 0, 0, 0, 0]","[[], [], [4], [6], [], []]","[[], [], [True], [True], [], []]","[0, 0, 0, 0, 0, 0]",0.5,0.0,0.5,0.5,0.770844,"Who are the two professional hockey players, b..."
3,1,5a761900554299109176e648,Mistral-7B-Instruct-v0.2,1,0.8,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1.0,1.0,...,[1],[[1]],[[True]],[0],1.0,1.0,0.5,1.0,0.450042,What is the name of the intelligence firm that...
4,1,5a761900554299109176e648,Mistral-7B-Instruct-v0.2,1,0.8,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1.0,1.0,...,"[1, 1]","[[0, 1], [0, 1]]","[[True, True], [True, True]]","[0, 0]",1.0,1.0,1.0,1.0,0.772082,What is the name of the lobbying group founded...


In [3]:
# Drop every object-typed column except the run parameters and question_id,
# which are needed as grouping keys later on.
all_obj_cols = all_results.select_dtypes(include=["object"]).columns
keep_obj_cols = {*params_names, "question_id"}
# An ordered filter (instead of a set difference followed by ``.remove``) keeps
# the printed list in column order and does not raise when question_id is not
# an object-typed column.
drop_obj_cols = [col for col in all_obj_cols if col not in keep_obj_cols]
print(f"Dropping columns: {drop_obj_cols}")
all_num_results = all_results.drop(columns=drop_obj_cols)

Dropping columns: ['citations/sentences', 'citations/citations', 'quality/new_question', 'citations/correct_citations', 'citations/out_of_range', 'citations/supported']


In [4]:
# Rows whose question_idx is below EVAL_SIZE form the evaluation split;
# all remaining rows form the training split.
question_idx = all_num_results["question_idx"]
in_eval = question_idx < EVAL_SIZE
in_train = question_idx >= EVAL_SIZE
eval_split = all_num_results[in_eval]
train_split = all_num_results[in_train]

In [5]:
# Per-question aggregation applied across the rows of one question
# (computed alongside "std").
AGG_FUNC = "max"


def aggregate(split):
    """Summarise a split per parameter combination.

    First, rows are grouped by all run parameters plus ``question_id`` and
    every remaining column is reduced with ``AGG_FUNC`` and ``"std"``. Those
    per-question values are then grouped by the run parameters alone and
    averaged. An ``n_questions`` column records how many per-question rows
    went into each average.

    Args:
        split: Frame containing ``question_idx``, ``question_id``, the columns
            named in ``params_names`` and the metric columns.

    Returns:
        DataFrame indexed by ``params_names`` with two-level
        ``(metric, aggregation)`` columns plus ``n_questions``.
    """
    question_keys = [*params_names, "question_id"]
    per_question = (
        split.drop(columns=["question_idx"])
        .groupby(question_keys)
        .agg([AGG_FUNC, "std"])
    )
    per_model = per_question.groupby(params_names)
    summary = per_model.mean()
    summary["n_questions"] = per_model.size()
    return summary


# Aggregate both splits with the same procedure.
eval_results, train_results = (aggregate(part) for part in (eval_split, train_split))

# Sanity check: every evaluated configuration should cover the same number of
# questions, otherwise the averaged metrics are not directly comparable.
distinct_question_counts = eval_results["n_questions"].nunique()
if distinct_question_counts != 1:
    print("Warning: not all rows in evaluation have the same number of examples")

In [6]:
display(Markdown("### Prompts comparison"))
# Restrict the evaluation results to the base Mistral model; the prompt and
# temperature comparisons below are built from this subset.
llm_level = eval_results.index.get_level_values("llm")
parameter_results = eval_results[llm_level == "Mistral-7B-Instruct-v0.2"]


def show_cleaned_results(short_eval_display, keep_index_name=None):
    """Return a compact copy of a results frame with only the key metrics.

    Each of the index names ``temperature``, ``nli``, ``ellm``, ``sim`` and
    ``prompt_id`` is passed through ``remove_index`` unless it equals
    ``keep_index_name``; afterwards only the seven headline metric columns
    are selected. The input frame is copied first and is not modified.

    Args:
        short_eval_display: Aggregated results frame.
        keep_index_name: Optional index name to leave untouched.

    Returns:
        The reduced DataFrame.
    """
    removable_index_names = ("temperature", "nli", "ellm", "sim", "prompt_id")
    metric_columns = [
        "citations/ais_recall",
        "citations/ais_precision",
        "correctness/answer_overlap",
        "correctness/answer_entail",
        "correctness/citations_recall",
        "correctness/citations_precision",
        "quality/answer_relevance",
    ]

    cleaned = short_eval_display.copy()
    for name in removable_index_names:
        if name != keep_index_name:
            # remove_index is a project helper - presumably it takes the named
            # level out of the index; verify in common.utils.
            cleaned = remove_index(cleaned, name)
    return cleaned[metric_columns]


# Compare prompts at a fixed temperature; index values are compared as strings.
temperature_level = parameter_results.index.get_level_values("temperature")
show_cleaned_results(parameter_results[temperature_level == "0.1"], keep_index_name="prompt_id")

### Prompts comparison

Unnamed: 0_level_0,Unnamed: 1_level_0,citations/ais_recall,citations/ais_recall,citations/ais_precision,citations/ais_precision,correctness/answer_overlap,correctness/answer_overlap,correctness/answer_entail,correctness/answer_entail,correctness/citations_recall,correctness/citations_recall,correctness/citations_precision,correctness/citations_precision,quality/answer_relevance,quality/answer_relevance
Unnamed: 0_level_1,Unnamed: 1_level_1,max,std,max,std,max,std,max,std,max,std,max,std,max,std
llm,prompt_id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
Mistral-7B-Instruct-v0.2,1,0.606238,0.03656,0.901424,0.03835,0.883923,0.014915,0.9,0.017321,0.775,0.025981,0.762833,0.050125,0.74368,0.02834
Mistral-7B-Instruct-v0.2,2,0.559952,0.066146,0.8875,0.034978,0.877269,0.027905,0.88,0.023094,0.8,0.043301,0.7055,0.046655,0.729335,0.030174
Mistral-7B-Instruct-v0.2,3,0.6215,0.045033,0.9135,0.024787,0.869269,0.017321,0.87,0.011547,0.69,0.034641,0.690167,0.04237,0.681964,0.027177
Mistral-7B-Instruct-v0.2,4,0.55,0.050999,0.864286,0.044519,0.862603,0.024249,0.85,0.011547,0.625,0.031754,0.719024,0.036335,0.701345,0.02687


In [7]:
display(Markdown("### Temperature comparison"))
# Compare temperatures for a fixed prompt; index values are compared as strings.
prompt_level = parameter_results.index.get_level_values("prompt_id")
show_cleaned_results(parameter_results[prompt_level == "1"], keep_index_name="temperature")

### Temperature comparison

Unnamed: 0_level_0,Unnamed: 1_level_0,citations/ais_recall,citations/ais_recall,citations/ais_precision,citations/ais_precision,correctness/answer_overlap,correctness/answer_overlap,correctness/answer_entail,correctness/answer_entail,correctness/citations_recall,correctness/citations_recall,correctness/citations_precision,correctness/citations_precision,quality/answer_relevance,quality/answer_relevance
Unnamed: 0_level_1,Unnamed: 1_level_1,max,std,max,std,max,std,max,std,max,std,max,std,max,std
llm,temperature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
Mistral-7B-Instruct-v0.2,0.01,0.597643,0.027143,0.901955,0.016922,0.87059,0.01299,0.9,0.017321,0.77,0.014434,0.761024,0.023419,0.749056,0.015812
Mistral-7B-Instruct-v0.2,0.1,0.606238,0.03656,0.901424,0.03835,0.883923,0.014915,0.9,0.017321,0.775,0.025981,0.762833,0.050125,0.74368,0.02834
Mistral-7B-Instruct-v0.2,0.2,0.614048,0.081991,0.918591,0.050804,0.887192,0.029793,0.89,0.005774,0.815,0.051962,0.78619,0.050897,0.754883,0.03312
Mistral-7B-Instruct-v0.2,0.3,0.638214,0.084729,0.914333,0.066155,0.908923,0.045707,0.9,0.023094,0.835,0.059848,0.808857,0.067481,0.76533,0.047856
Mistral-7B-Instruct-v0.2,0.4,0.717048,0.150051,0.963996,0.108934,0.908923,0.044607,0.91,0.028868,0.85,0.096603,0.827857,0.099659,0.767508,0.04985
Mistral-7B-Instruct-v0.2,0.5,0.69375,0.144597,0.942603,0.085049,0.910923,0.073768,0.91,0.034641,0.82,0.082169,0.830952,0.114113,0.760321,0.059824
Mistral-7B-Instruct-v0.2,0.6,0.667405,0.168079,0.941042,0.110804,0.886359,0.049038,0.9,0.040415,0.855,0.12181,0.841357,0.125848,0.7736,0.064925
Mistral-7B-Instruct-v0.2,0.7,0.712095,0.182757,0.941048,0.115458,0.90559,0.047054,0.9,0.028868,0.865,0.118923,0.84369,0.132095,0.780154,0.05884
Mistral-7B-Instruct-v0.2,0.8,0.693667,0.195393,0.959667,0.131938,0.914192,0.062564,0.93,0.086603,0.86,0.119697,0.82969,0.123992,0.792152,0.070362
Mistral-7B-Instruct-v0.2,0.9,0.750833,0.204617,0.9685,0.12727,0.892859,0.074434,0.9,0.057735,0.875,0.14913,0.853524,0.152158,0.765121,0.073964


In [8]:
display(Markdown("### Evaluation results"))

# Pull the index levels once instead of repeating get_level_values per filter.
llm_level = eval_results.index.get_level_values("llm")
prompt_level = eval_results.index.get_level_values("prompt_id")
temperature_level = eval_results.index.get_level_values("temperature")

# Every non-Mistral configuration is kept; for the base Mistral model only the
# prompt "1" / temperature "0.1" configuration is kept.
eval_not_mistral = eval_results[llm_level != "Mistral-7B-Instruct-v0.2"]
eval_mistral = eval_results[
    (llm_level == "Mistral-7B-Instruct-v0.2")
    & (prompt_level == "1")
    & (temperature_level == "0.1")
]
eval_display = pd.concat([eval_not_mistral, eval_mistral])

# use this model only for training data, because we already have same model without quantization
not_quantized_mixtral = eval_display.index.get_level_values("llm") != "Mixtral-8x7B-Instruct-v0.1.Q8_0"
eval_display = eval_display[not_quantized_mixtral]

eval_display = remove_index(eval_display, "prompt_id").sort_values(
    by=("correctness/citations_recall", AGG_FUNC),
    ascending=False,
)
eval_display

### Evaluation results

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,prompt_id,citations/ais_recall,citations/ais_recall,citations/ais_precision,citations/ais_precision,citations/n_sentences,citations/n_sentences,citations/n_total_citations,citations/n_total_citations,citations/n_correct_citations,...,correctness/answer_overlap,correctness/answer_entail,correctness/answer_entail,correctness/citations_recall,correctness/citations_recall,correctness/citations_precision,correctness/citations_precision,quality/answer_relevance,quality/answer_relevance,n_questions
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,max,std,max,std,max,std,max,std,max,...,std,max,std,max,std,max,std,max,std,Unnamed: 25_level_1
llm,temperature,nli,ellm,sim,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2
rag-tge_Mistral_v2-4480,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.85557,0.134646,0.979417,0.031338,5.94,1.208135,6.64,1.148101,6.35,...,0.017918,0.91,0.023094,0.995,0.025981,0.933333,0.070338,0.780416,0.055383,100
rag-tge_Llama-3-8B_v1,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.734354,0.100349,0.97856,0.03002,9.28,1.170721,9.81,1.338763,9.17,...,0.013723,0.94,0.023094,0.995,0.00866,0.858667,0.09881,0.798084,0.071202,100
rag-tge_Mistral_v2-3360,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.813959,0.146211,0.984115,0.032981,6.97,1.424317,7.54,1.360671,7.17,...,0.030777,0.92,0.011547,0.99,0.025981,0.918333,0.084337,0.782177,0.06322,100
rag-tge_Llama-3-8B_v2,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,0,0.76995,0.120419,0.984613,0.04962,8.44,1.022228,9.59,1.261849,8.93,...,0.015031,0.97,0.040415,0.985,0.005774,0.877024,0.135618,0.808191,0.06608,100
rag-tge_Llama-3-8B_v3,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,4,0.875667,0.0751,0.975167,0.029156,1.69,0.175386,2.61,0.232623,2.48,...,0.045041,0.92,0.023094,0.955,0.025207,0.955333,0.00866,0.79045,0.033242,100
rag-tge_Mistral_v6,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,4,0.854167,0.08233,0.97,0.023575,1.7,0.129697,2.33,0.124972,2.21,...,0.031858,0.95,0.028868,0.955,0.023094,0.976667,0.018283,0.796144,0.040207,100
rag-tge_Mistral.Q8,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.864167,0.06052,0.974167,0.021027,2.47,0.261624,3.25,0.306084,3.05,...,0.012892,0.87,0.005774,0.93,0.023094,0.93619,0.040843,0.779305,0.040067,100
gpt-3.5-turbo-0125,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.801167,0.072604,0.9975,0.002887,2.21,0.214022,2.32,0.207846,2.3,...,0.01299,0.92,0.011547,0.875,0.023094,0.962333,0.023671,0.787486,0.041098,100
zephyr-orpo-141b-A35b-v0.1,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.537167,0.134222,0.873897,0.101052,2.35,0.366975,6.15,1.293896,4.52,...,0.060333,0.89,0.040415,0.855,0.092376,0.643667,0.108477,0.746003,0.059428,100
Mixtral-8x7B-Instruct-v0.1,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.7075,0.061199,0.9575,0.029692,2.22,0.148564,2.47,0.205801,2.28,...,0.001443,0.91,0.011547,0.855,0.034641,0.908333,0.037528,0.751501,0.030875,100


In [9]:
clean_results = show_cleaned_results(eval_display)

# Metrics used, one after another, as the sort key of a displayed table.
sort_metrics = [
    "citations/ais_recall",
    "citations/ais_precision",
    "correctness/answer_overlap",
    "correctness/answer_entail",
    "correctness/citations_recall",
    "correctness/citations_precision",
    "quality/answer_relevance",
]

for agg in [AGG_FUNC, "std"]:
    display(Markdown(f"# {agg}"))
    # Only the columns of the current aggregation are shown in each table.
    agg_columns = [col for col in clean_results.columns if col[1] == agg]
    for metric in sort_metrics:
        display(Markdown(f"### sorted by: {metric}"))
        results = clean_results.sort_values(by=(metric, agg), ascending=False)
        results = results[agg_columns]
        display(results)

# max

### sorted by: citations/ais_recall

Unnamed: 0_level_0,citations/ais_recall,citations/ais_precision,correctness/answer_overlap,correctness/answer_entail,correctness/citations_recall,correctness/citations_precision,quality/answer_relevance
Unnamed: 0_level_1,max,max,max,max,max,max,max
llm,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
rag-tge_Llama-3-8B_v3,0.875667,0.975167,0.895744,0.92,0.955,0.955333,0.79045
rag-tge_Mistral.Q8,0.864167,0.974167,0.870923,0.87,0.93,0.93619,0.779305
rag-tge_Mistral_v2-4480,0.85557,0.979417,0.93459,0.91,0.995,0.933333,0.780416
rag-tge_Mistral_v6,0.854167,0.97,0.931256,0.95,0.955,0.976667,0.796144
rag-tge_Mistral_v2-3360,0.813959,0.984115,0.961667,0.92,0.99,0.918333,0.782177
gpt-3.5-turbo-0125,0.801167,0.9975,0.89591,0.92,0.875,0.962333,0.787486
gpt-4-turbo,0.775,0.99,0.885892,0.91,0.82,0.949333,0.777481
rag-tge_Llama-3-8B_v2,0.76995,0.984613,0.949359,0.97,0.985,0.877024,0.808191
Meta-Llama-3-70B-Instruct,0.736667,0.97,0.886179,0.89,0.8,0.915333,0.776388
rag-tge_Llama-3-8B_v1,0.734354,0.97856,0.959359,0.94,0.995,0.858667,0.798084


### sorted by: citations/ais_precision

Unnamed: 0_level_0,citations/ais_recall,citations/ais_precision,correctness/answer_overlap,correctness/answer_entail,correctness/citations_recall,correctness/citations_precision,quality/answer_relevance
Unnamed: 0_level_1,max,max,max,max,max,max,max
llm,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
gpt-3.5-turbo-0125,0.801167,0.9975,0.89591,0.92,0.875,0.962333,0.787486
qwen1_5-32b-chat-q8_0,0.534167,0.996667,0.880392,0.9,0.66,0.961667,0.731615
qwen1_5-110b-chat,0.6075,0.993333,0.888982,0.91,0.78,0.947333,0.787043
gpt-4-turbo,0.775,0.99,0.885892,0.91,0.82,0.949333,0.777481
rag-tge_Llama-3-8B_v2,0.76995,0.984613,0.949359,0.97,0.985,0.877024,0.808191
rag-tge_Mistral_v2-3360,0.813959,0.984115,0.961667,0.92,0.99,0.918333,0.782177
qwen1_5-14b-chat-q8_0,0.676667,0.983333,0.880154,0.87,0.69,0.931667,0.754467
rag-tge_Mistral_v2-4480,0.85557,0.979417,0.93459,0.91,0.995,0.933333,0.780416
rag-tge_Llama-3-8B_v1,0.734354,0.97856,0.959359,0.94,0.995,0.858667,0.798084
zephyr-7b-beta,0.48531,0.977167,0.874256,0.91,0.705,0.815278,0.751837


### sorted by: correctness/answer_overlap

Unnamed: 0_level_0,citations/ais_recall,citations/ais_precision,correctness/answer_overlap,correctness/answer_entail,correctness/citations_recall,correctness/citations_precision,quality/answer_relevance
Unnamed: 0_level_1,max,max,max,max,max,max,max
llm,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
rag-tge_Mistral_v2-3360,0.813959,0.984115,0.961667,0.92,0.99,0.918333,0.782177
rag-tge_Llama-3-8B_v1,0.734354,0.97856,0.959359,0.94,0.995,0.858667,0.798084
rag-tge_Llama-3-8B_v2,0.76995,0.984613,0.949359,0.97,0.985,0.877024,0.808191
rag-tge_Mistral_v2-4480,0.85557,0.979417,0.93459,0.91,0.995,0.933333,0.780416
rag-tge_Mistral_v6,0.854167,0.97,0.931256,0.95,0.955,0.976667,0.796144
Meta-Llama-3-8B-Instruct,0.549357,0.972143,0.917846,0.91,0.775,0.922,0.748064
Mixtral-8x7B-Instruct-v0.1,0.7075,0.9575,0.91159,0.91,0.855,0.908333,0.751501
gpt-3.5-turbo-0125,0.801167,0.9975,0.89591,0.92,0.875,0.962333,0.787486
rag-tge_Llama-3-8B_v3,0.875667,0.975167,0.895744,0.92,0.955,0.955333,0.79045
qwen1_5-110b-chat,0.6075,0.993333,0.888982,0.91,0.78,0.947333,0.787043


### sorted by: correctness/answer_entail

Unnamed: 0_level_0,citations/ais_recall,citations/ais_precision,correctness/answer_overlap,correctness/answer_entail,correctness/citations_recall,correctness/citations_precision,quality/answer_relevance
Unnamed: 0_level_1,max,max,max,max,max,max,max
llm,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
rag-tge_Llama-3-8B_v2,0.76995,0.984613,0.949359,0.97,0.985,0.877024,0.808191
rag-tge_Mistral_v6,0.854167,0.97,0.931256,0.95,0.955,0.976667,0.796144
rag-tge_Llama-3-8B_v1,0.734354,0.97856,0.959359,0.94,0.995,0.858667,0.798084
rag-tge_Mistral_v2-3360,0.813959,0.984115,0.961667,0.92,0.99,0.918333,0.782177
gpt-3.5-turbo-0125,0.801167,0.9975,0.89591,0.92,0.875,0.962333,0.787486
rag-tge_Llama-3-8B_v3,0.875667,0.975167,0.895744,0.92,0.955,0.955333,0.79045
rag-tge_Mistral_v2-4480,0.85557,0.979417,0.93459,0.91,0.995,0.933333,0.780416
Mixtral-8x7B-Instruct-v0.1,0.7075,0.9575,0.91159,0.91,0.855,0.908333,0.751501
zephyr-7b-beta,0.48531,0.977167,0.874256,0.91,0.705,0.815278,0.751837
gpt-4-turbo,0.775,0.99,0.885892,0.91,0.82,0.949333,0.777481


### sorted by: correctness/citations_recall

Unnamed: 0_level_0,citations/ais_recall,citations/ais_precision,correctness/answer_overlap,correctness/answer_entail,correctness/citations_recall,correctness/citations_precision,quality/answer_relevance
Unnamed: 0_level_1,max,max,max,max,max,max,max
llm,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
rag-tge_Mistral_v2-4480,0.85557,0.979417,0.93459,0.91,0.995,0.933333,0.780416
rag-tge_Llama-3-8B_v1,0.734354,0.97856,0.959359,0.94,0.995,0.858667,0.798084
rag-tge_Mistral_v2-3360,0.813959,0.984115,0.961667,0.92,0.99,0.918333,0.782177
rag-tge_Llama-3-8B_v2,0.76995,0.984613,0.949359,0.97,0.985,0.877024,0.808191
rag-tge_Llama-3-8B_v3,0.875667,0.975167,0.895744,0.92,0.955,0.955333,0.79045
rag-tge_Mistral_v6,0.854167,0.97,0.931256,0.95,0.955,0.976667,0.796144
rag-tge_Mistral.Q8,0.864167,0.974167,0.870923,0.87,0.93,0.93619,0.779305
gpt-3.5-turbo-0125,0.801167,0.9975,0.89591,0.92,0.875,0.962333,0.787486
zephyr-orpo-141b-A35b-v0.1,0.537167,0.873897,0.852077,0.89,0.855,0.643667,0.746003
Mixtral-8x7B-Instruct-v0.1,0.7075,0.9575,0.91159,0.91,0.855,0.908333,0.751501


### sorted by: correctness/citations_precision

Unnamed: 0_level_0,citations/ais_recall,citations/ais_precision,correctness/answer_overlap,correctness/answer_entail,correctness/citations_recall,correctness/citations_precision,quality/answer_relevance
Unnamed: 0_level_1,max,max,max,max,max,max,max
llm,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
rag-tge_Mistral_v6,0.854167,0.97,0.931256,0.95,0.955,0.976667,0.796144
gpt-3.5-turbo-0125,0.801167,0.9975,0.89591,0.92,0.875,0.962333,0.787486
qwen1_5-32b-chat-q8_0,0.534167,0.996667,0.880392,0.9,0.66,0.961667,0.731615
rag-tge_Llama-3-8B_v3,0.875667,0.975167,0.895744,0.92,0.955,0.955333,0.79045
gpt-4-turbo,0.775,0.99,0.885892,0.91,0.82,0.949333,0.777481
qwen1_5-110b-chat,0.6075,0.993333,0.888982,0.91,0.78,0.947333,0.787043
rag-tge_Mistral.Q8,0.864167,0.974167,0.870923,0.87,0.93,0.93619,0.779305
rag-tge_Mistral_v2-4480,0.85557,0.979417,0.93459,0.91,0.995,0.933333,0.780416
qwen1_5-14b-chat-q8_0,0.676667,0.983333,0.880154,0.87,0.69,0.931667,0.754467
Meta-Llama-3-8B-Instruct,0.549357,0.972143,0.917846,0.91,0.775,0.922,0.748064


### sorted by: quality/answer_relevance

Unnamed: 0_level_0,citations/ais_recall,citations/ais_precision,correctness/answer_overlap,correctness/answer_entail,correctness/citations_recall,correctness/citations_precision,quality/answer_relevance
Unnamed: 0_level_1,max,max,max,max,max,max,max
llm,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
rag-tge_Llama-3-8B_v2,0.76995,0.984613,0.949359,0.97,0.985,0.877024,0.808191
rag-tge_Llama-3-8B_v1,0.734354,0.97856,0.959359,0.94,0.995,0.858667,0.798084
qwen1_5-7b-chat-q8_0,0.291167,0.91,0.886821,0.86,0.605,0.864,0.797259
rag-tge_Mistral_v6,0.854167,0.97,0.931256,0.95,0.955,0.976667,0.796144
rag-tge_Llama-3-8B_v3,0.875667,0.975167,0.895744,0.92,0.955,0.955333,0.79045
gpt-3.5-turbo-0125,0.801167,0.9975,0.89591,0.92,0.875,0.962333,0.787486
qwen1_5-110b-chat,0.6075,0.993333,0.888982,0.91,0.78,0.947333,0.787043
rag-tge_Mistral_v2-3360,0.813959,0.984115,0.961667,0.92,0.99,0.918333,0.782177
rag-tge_Mistral_v2-4480,0.85557,0.979417,0.93459,0.91,0.995,0.933333,0.780416
rag-tge_Mistral.Q8,0.864167,0.974167,0.870923,0.87,0.93,0.93619,0.779305


# std

### sorted by: citations/ais_recall

Unnamed: 0_level_0,citations/ais_recall,citations/ais_precision,correctness/answer_overlap,correctness/answer_entail,correctness/citations_recall,correctness/citations_precision,quality/answer_relevance
Unnamed: 0_level_1,std,std,std,std,std,std,std
llm,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
rag-tge_Mistral_v2-3360,0.146211,0.032981,0.030777,0.011547,0.025981,0.084337,0.06322
rag-tge_Mistral_v2-4480,0.134646,0.031338,0.017918,0.023094,0.025981,0.070338,0.055383
zephyr-orpo-141b-A35b-v0.1,0.134222,0.101052,0.060333,0.040415,0.092376,0.108477,0.059428
gemma-1.1-7b-it,0.122999,0.082995,0.075822,0.075056,0.080829,0.101471,0.043209
rag-tge_Llama-3-8B_v2,0.120419,0.04962,0.015031,0.040415,0.005774,0.135618,0.06608
rag-tge_Llama-3-8B_v1,0.100349,0.03002,0.013723,0.023094,0.00866,0.09881,0.071202
qwen1_5-110b-chat,0.095486,0.026943,0.020207,0.017321,0.051962,0.023286,0.055224
zephyr-7b-beta,0.090558,0.068665,0.028674,0.046188,0.080056,0.088494,0.041852
rag-tge_Mistral_v6,0.08233,0.023575,0.031858,0.028868,0.023094,0.018283,0.040207
rag-tge_Llama-3-8B_v3,0.0751,0.029156,0.045041,0.023094,0.025207,0.00866,0.033242


### sorted by: citations/ais_precision

Unnamed: 0_level_0,citations/ais_recall,citations/ais_precision,correctness/answer_overlap,correctness/answer_entail,correctness/citations_recall,correctness/citations_precision,quality/answer_relevance
Unnamed: 0_level_1,std,std,std,std,std,std,std
llm,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
zephyr-orpo-141b-A35b-v0.1,0.134222,0.101052,0.060333,0.040415,0.092376,0.108477,0.059428
gemma-1.1-7b-it,0.122999,0.082995,0.075822,0.075056,0.080829,0.101471,0.043209
Phi-3-mini-4k-instruct,0.030503,0.06958,0.012317,0.023094,0.051188,0.064275,0.03725
zephyr-7b-beta,0.090558,0.068665,0.028674,0.046188,0.080056,0.088494,0.041852
c4ai-command-r-plus,0.060382,0.061399,0.023094,0.0,0.043868,0.054568,0.046987
qwen1_5-7b-chat-q8_0,0.063671,0.057158,0.024118,0.023094,0.074282,0.078327,0.034364
rag-tge_Llama-3-8B_v2,0.120419,0.04962,0.015031,0.040415,0.005774,0.135618,0.06608
Mistral-7B-Instruct-v0.2,0.03656,0.03835,0.014915,0.017321,0.025981,0.050125,0.02834
rag-tge_Mistral_v2-3360,0.146211,0.032981,0.030777,0.011547,0.025981,0.084337,0.06322
rag-tge_Mistral_v2-4480,0.134646,0.031338,0.017918,0.023094,0.025981,0.070338,0.055383


### sorted by: correctness/answer_overlap

Unnamed: 0_level_0,citations/ais_recall,citations/ais_precision,correctness/answer_overlap,correctness/answer_entail,correctness/citations_recall,correctness/citations_precision,quality/answer_relevance
Unnamed: 0_level_1,std,std,std,std,std,std,std
llm,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
gemma-1.1-7b-it,0.122999,0.082995,0.075822,0.075056,0.080829,0.101471,0.043209
zephyr-orpo-141b-A35b-v0.1,0.134222,0.101052,0.060333,0.040415,0.092376,0.108477,0.059428
rag-tge_Llama-3-8B_v3,0.0751,0.029156,0.045041,0.023094,0.025207,0.00866,0.033242
gemma-1.1-2b-it,0.005774,0.009623,0.038105,0.040415,0.005774,0.006736,0.019533
rag-tge_Mistral_v6,0.08233,0.023575,0.031858,0.028868,0.023094,0.018283,0.040207
rag-tge_Mistral_v2-3360,0.146211,0.032981,0.030777,0.011547,0.025981,0.084337,0.06322
zephyr-7b-beta,0.090558,0.068665,0.028674,0.046188,0.080056,0.088494,0.041852
qwen1_5-7b-chat-q8_0,0.063671,0.057158,0.024118,0.023094,0.074282,0.078327,0.034364
c4ai-command-r-plus,0.060382,0.061399,0.023094,0.0,0.043868,0.054568,0.046987
Meta-Llama-3-70B-Instruct,0.049556,0.017802,0.020651,0.017321,0.017321,0.009623,0.027829


### sorted by: correctness/answer_entail

Unnamed: 0_level_0,citations/ais_recall,citations/ais_precision,correctness/answer_overlap,correctness/answer_entail,correctness/citations_recall,correctness/citations_precision,quality/answer_relevance
Unnamed: 0_level_1,std,std,std,std,std,std,std
llm,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
gemma-1.1-7b-it,0.122999,0.082995,0.075822,0.075056,0.080829,0.101471,0.043209
zephyr-7b-beta,0.090558,0.068665,0.028674,0.046188,0.080056,0.088494,0.041852
rag-tge_Llama-3-8B_v2,0.120419,0.04962,0.015031,0.040415,0.005774,0.135618,0.06608
gemma-1.1-2b-it,0.005774,0.009623,0.038105,0.040415,0.005774,0.006736,0.019533
zephyr-orpo-141b-A35b-v0.1,0.134222,0.101052,0.060333,0.040415,0.092376,0.108477,0.059428
rag-tge_Mistral_v6,0.08233,0.023575,0.031858,0.028868,0.023094,0.018283,0.040207
rag-tge_Llama-3-8B_v1,0.100349,0.03002,0.013723,0.023094,0.00866,0.09881,0.071202
rag-tge_Mistral_v2-4480,0.134646,0.031338,0.017918,0.023094,0.025981,0.070338,0.055383
Phi-3-mini-4k-instruct,0.030503,0.06958,0.012317,0.023094,0.051188,0.064275,0.03725
qwen1_5-7b-chat-q8_0,0.063671,0.057158,0.024118,0.023094,0.074282,0.078327,0.034364


### sorted by: correctness/citations_recall

Unnamed: 0_level_0,citations/ais_recall,citations/ais_precision,correctness/answer_overlap,correctness/answer_entail,correctness/citations_recall,correctness/citations_precision,quality/answer_relevance
Unnamed: 0_level_1,std,std,std,std,std,std,std
llm,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
zephyr-orpo-141b-A35b-v0.1,0.134222,0.101052,0.060333,0.040415,0.092376,0.108477,0.059428
gemma-1.1-7b-it,0.122999,0.082995,0.075822,0.075056,0.080829,0.101471,0.043209
zephyr-7b-beta,0.090558,0.068665,0.028674,0.046188,0.080056,0.088494,0.041852
qwen1_5-7b-chat-q8_0,0.063671,0.057158,0.024118,0.023094,0.074282,0.078327,0.034364
qwen1_5-110b-chat,0.095486,0.026943,0.020207,0.017321,0.051962,0.023286,0.055224
Phi-3-mini-4k-instruct,0.030503,0.06958,0.012317,0.023094,0.051188,0.064275,0.03725
c4ai-command-r-plus,0.060382,0.061399,0.023094,0.0,0.043868,0.054568,0.046987
Mixtral-8x7B-Instruct-v0.1,0.061199,0.029692,0.001443,0.011547,0.034641,0.037528,0.030875
qwen1_5-14b-chat-q8_0,0.042339,0.00866,0.006598,0.005774,0.030981,0.016358,0.038641
gpt-4-turbo,0.045401,0.005774,0.007661,0.011547,0.028868,0.010585,0.033935


### sorted by: correctness/citations_precision

Unnamed: 0_level_0,citations/ais_recall,citations/ais_precision,correctness/answer_overlap,correctness/answer_entail,correctness/citations_recall,correctness/citations_precision,quality/answer_relevance
Unnamed: 0_level_1,std,std,std,std,std,std,std
llm,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
rag-tge_Llama-3-8B_v2,0.120419,0.04962,0.015031,0.040415,0.005774,0.135618,0.06608
zephyr-orpo-141b-A35b-v0.1,0.134222,0.101052,0.060333,0.040415,0.092376,0.108477,0.059428
gemma-1.1-7b-it,0.122999,0.082995,0.075822,0.075056,0.080829,0.101471,0.043209
rag-tge_Llama-3-8B_v1,0.100349,0.03002,0.013723,0.023094,0.00866,0.09881,0.071202
zephyr-7b-beta,0.090558,0.068665,0.028674,0.046188,0.080056,0.088494,0.041852
rag-tge_Mistral_v2-3360,0.146211,0.032981,0.030777,0.011547,0.025981,0.084337,0.06322
qwen1_5-7b-chat-q8_0,0.063671,0.057158,0.024118,0.023094,0.074282,0.078327,0.034364
rag-tge_Mistral_v2-4480,0.134646,0.031338,0.017918,0.023094,0.025981,0.070338,0.055383
Phi-3-mini-4k-instruct,0.030503,0.06958,0.012317,0.023094,0.051188,0.064275,0.03725
c4ai-command-r-plus,0.060382,0.061399,0.023094,0.0,0.043868,0.054568,0.046987


### sorted by: quality/answer_relevance

Unnamed: 0_level_0,citations/ais_recall,citations/ais_precision,correctness/answer_overlap,correctness/answer_entail,correctness/citations_recall,correctness/citations_precision,quality/answer_relevance
Unnamed: 0_level_1,std,std,std,std,std,std,std
llm,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
rag-tge_Llama-3-8B_v1,0.100349,0.03002,0.013723,0.023094,0.00866,0.09881,0.071202
rag-tge_Llama-3-8B_v2,0.120419,0.04962,0.015031,0.040415,0.005774,0.135618,0.06608
rag-tge_Mistral_v2-3360,0.146211,0.032981,0.030777,0.011547,0.025981,0.084337,0.06322
zephyr-orpo-141b-A35b-v0.1,0.134222,0.101052,0.060333,0.040415,0.092376,0.108477,0.059428
rag-tge_Mistral_v2-4480,0.134646,0.031338,0.017918,0.023094,0.025981,0.070338,0.055383
qwen1_5-110b-chat,0.095486,0.026943,0.020207,0.017321,0.051962,0.023286,0.055224
c4ai-command-r-plus,0.060382,0.061399,0.023094,0.0,0.043868,0.054568,0.046987
gemma-1.1-7b-it,0.122999,0.082995,0.075822,0.075056,0.080829,0.101471,0.043209
zephyr-7b-beta,0.090558,0.068665,0.028674,0.046188,0.080056,0.088494,0.041852
gpt-3.5-turbo-0125,0.072604,0.002887,0.01299,0.011547,0.023094,0.023671,0.041098


In [10]:
# Same presentation as the evaluation table: pass prompt_id through
# remove_index, then show the full aggregated training frame.
train_display = remove_index(train_results, "prompt_id")
display(Markdown("### Training results"))
train_display

### Training results

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,prompt_id,citations/ais_recall,citations/ais_recall,citations/ais_precision,citations/ais_precision,citations/n_sentences,citations/n_sentences,citations/n_total_citations,citations/n_total_citations,citations/n_correct_citations,...,correctness/answer_overlap,correctness/answer_entail,correctness/answer_entail,correctness/citations_recall,correctness/citations_recall,correctness/citations_precision,correctness/citations_precision,quality/answer_relevance,quality/answer_relevance,n_questions
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,max,std,max,std,max,std,max,std,max,...,std,max,std,max,std,max,std,max,std,Unnamed: 25_level_1
llm,temperature,nli,ellm,sim,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2
Meta-Llama-3-70B-Instruct,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.76895,0.031242,0.967017,0.010844,1.510799,0.063348,2.05489,0.085451,1.953894,...,0.0101,0.900795,0.010757,0.823467,0.015654,0.920578,0.011854,0.764136,0.024321,17872
Mixtral-8x7B-Instruct-v0.1,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.734613,0.063516,0.956884,0.027845,2.307743,0.209854,2.775428,0.274398,2.557775,...,0.017245,0.899073,0.020221,0.861862,0.037924,0.881502,0.03792,0.765638,0.034703,7659
Mixtral-8x7B-Instruct-v0.1.Q8_0,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.684853,0.070232,0.930524,0.034851,2.912667,0.27358,3.3925,0.383894,3.078,...,0.018198,0.894,0.019919,0.878833,0.036453,0.83568,0.046054,0.757222,0.036725,6000
qwen1_5-110b-chat,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.669,0.11713,0.987744,0.030177,2.174,0.256917,1.989,0.213576,1.902,...,0.022317,0.898,0.025403,0.7825,0.053828,0.939481,0.036772,0.764311,0.043379,1000
qwen1_5-32b-chat-q8_0,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.582407,0.031801,0.994444,0.006415,1.666667,0.083395,1.4,0.044905,1.388889,...,0.0102,0.844444,0.0,0.65,0.022453,0.955556,0.003208,0.73822,0.017761,90
zephyr-orpo-141b-A35b-v0.1,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.563212,0.127016,0.882383,0.101713,2.376147,0.372804,5.887287,1.25203,4.269987,...,0.036494,0.878113,0.034536,0.821756,0.09721,0.663595,0.105757,0.743845,0.063733,763
