In [1]:
import os
import pandas as pd
import json
from IPython.display import display, Markdown
from tqdm.auto import tqdm

from common.consts import RESULTS_DIR, EVAL_SIZE
from common.utils import filename_to_obj, remove_index

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def results_as_pandas(filename: str) -> pd.DataFrame:
    """Load one JSON-lines results file as a flat, one-row-per-evaluation frame.

    The file is read from ``RESULTS_DIR``; every non-blank line is parsed as a
    JSON document. The run parameters decoded from the file name by
    ``filename_to_obj`` are added as constant columns. The ``evaluations``
    column is exploded so that each element gets its own row, and the original
    record position is kept in a ``question_idx`` column.

    Each evaluation is then expanded two levels deep into columns named
    ``"<group>/<metric>"`` (assumes every evaluation is a dict of dicts --
    TODO confirm against the writer of these files).

    Args:
        filename: Name of a file inside ``RESULTS_DIR``.

    Returns:
        The flattened DataFrame, or an empty DataFrame (after printing a
        notice) when the file contains no records.
    """
    path = os.path.join(RESULTS_DIR, filename)
    with open(path, "r", encoding="utf-8") as f:
        # Blank lines (e.g. a stray trailing newline) are skipped instead of
        # crashing json.loads.
        records = [json.loads(line) for line in f if line.strip()]
    data = pd.DataFrame(records)

    if len(data) == 0:
        print(f"empty file: {filename}")
        return data

    params = filename_to_obj(filename)
    for k, v in params.items():
        data[k] = v

    data = data.explode("evaluations")
    # explode() repeats the original row label; keep it as the question index
    # and switch to a unique positional index so the concat below aligns 1:1.
    data = data.rename_axis("question_idx").reset_index()

    # Expand the evaluation dicts once (apply(pd.Series) is expensive) and read
    # the nested groups from that frame rather than from the merged one, so a
    # record column that happens to share a group name cannot interfere.
    evaluations = data["evaluations"].apply(pd.Series)
    flattened = [
        evaluations[group].apply(pd.Series).add_prefix(f"{group}/")
        for group in evaluations.columns
    ]
    return pd.concat([data.drop(columns=["evaluations"]), *flattened], axis=1)


# os.listdir returns entries in arbitrary order; sorting makes the reference
# file below and the head() preview reproducible across machines and runs.
files = sorted(os.listdir(RESULTS_DIR))
# The parameter names are taken from the first file name (assumes every file
# name encodes the same set of parameters -- TODO confirm).
params_names = list(filename_to_obj(files[0]).keys())
all_results = pd.concat([results_as_pandas(f) for f in tqdm(files)])
all_results.head()

100%|██████████| 39/39 [01:26<00:00,  2.22s/it]


Unnamed: 0,question_idx,question_id,llm,prompt_id,temperature,nli,ellm,sim,citations/ais_recall,citations/ais_precision,...,citations/supported,citations/citations,citations/correct_citations,citations/out_of_range,correctness/answer_overlap,correctness/answer_entail,correctness/citations_recall,correctness/citations_precision,quality/answer_relevance,quality/new_question
0,0,5abab42e55429955dce3eed2,Mistral-7B-Instruct-v0.2,1,0.8,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,0.25,1.0,...,"[0, 0, 0, 1]","[[], [], [3], [4]]","[[], [], [True], [True]]","[0, 0, 0, 0]",1.0,1.0,1.0,1.0,0.649022,Which of P.K. Subban's brothers was drafted fi...
1,0,5abab42e55429955dce3eed2,Mistral-7B-Instruct-v0.2,1,0.8,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,0.083333,0.875,...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[[], [], [3], [4], [6], [], [], [], [], [4, 5]...","[[], [], [True], [True], [True], [], [], [], [...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1.0,0.0,1.0,0.5,0.75747,"Who are the two professional hockey players, b..."
2,0,5abab42e55429955dce3eed2,Mistral-7B-Instruct-v0.2,1,0.8,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,0.0,1.0,...,"[0, 0, 0, 0, 0, 0]","[[], [], [4], [6], [], []]","[[], [], [True], [True], [], []]","[0, 0, 0, 0, 0, 0]",0.5,0.0,0.5,0.5,0.770844,"Who are the two professional hockey players, b..."
3,1,5a761900554299109176e648,Mistral-7B-Instruct-v0.2,1,0.8,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1.0,1.0,...,[1],[[1]],[[True]],[0],1.0,1.0,0.5,1.0,0.450042,What is the name of the intelligence firm that...
4,1,5a761900554299109176e648,Mistral-7B-Instruct-v0.2,1,0.8,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1.0,1.0,...,"[1, 1]","[[0, 1], [0, 1]]","[[True, True], [True, True]]","[0, 0]",1.0,1.0,1.0,1.0,0.772082,What is the name of the lobbying group founded...


In [3]:
# Drop every object-dtype column except the run parameters and the question id,
# both of which are needed as grouping keys further down.
all_obj_cols = all_results.select_dtypes(include=["object"]).columns
keep_obj_cols = {*params_names, "question_id"}
# A list comprehension keeps the frame's column order (a set difference prints
# in arbitrary order) and does not fail if "question_id" is not object-typed.
drop_obj_cols = [col for col in all_obj_cols if col not in keep_obj_cols]
print(f"Dropping columns: {drop_obj_cols}")
all_num_results = all_results.drop(columns=drop_obj_cols)

Dropping columns: ['citations/supported', 'citations/sentences', 'citations/out_of_range', 'quality/new_question', 'citations/citations', 'citations/correct_citations']


In [4]:
# Rows whose question index is below EVAL_SIZE form the evaluation split;
# the remaining rows form the training split.
question_positions = all_num_results["question_idx"]
eval_split = all_num_results.loc[question_positions.lt(EVAL_SIZE)]
train_split = all_num_results.loc[question_positions.ge(EVAL_SIZE)]

In [5]:
def aggregate(split):
    """Summarise a split per parameter combination.

    The ``question_idx`` column is dropped, then every remaining column is
    reduced to its mean and standard deviation within each
    (``params_names``..., ``question_id``) group. Those per-question figures
    are averaged again over the groups defined by ``params_names`` alone.

    Returns:
        A frame indexed by ``params_names`` with (column, "mean"/"std")
        columns plus an ``n_questions`` column holding the number of
        per-question rows that went into each group.
    """
    per_question_keys = [*params_names, "question_id"]
    per_question = (
        split.drop(columns=["question_idx"])
        .groupby(per_question_keys)
        .agg(["mean", "std"])
    )

    per_model = per_question.groupby(params_names)
    summary = per_model.mean()
    summary["n_questions"] = per_model.size()
    return summary


eval_results, train_results = aggregate(eval_split), aggregate(train_split)

# Every evaluated configuration should have been aggregated over the same
# number of questions; flag it when that is not the case.
distinct_question_counts = eval_results["n_questions"].nunique()
if distinct_question_counts != 1:
    print("Warning: not all rows in evaluation have the same number of examples")

In [6]:
display(Markdown("### Prompts comparison"))
# Keep only the rows of the single model used for the prompt/temperature comparisons.
llm_level = eval_results.index.get_level_values("llm")
parameter_results = eval_results.loc[llm_level == "Mistral-7B-Instruct-v0.2"]


def show_cleaned_results(short_eval_display, keep_index_name=None):
    """Return a compact view of an aggregated results frame.

    Works on a copy of the input. ``remove_index`` is applied for each of the
    "temperature", "nli", "ellm", "sim" and "prompt_id" index names, except the
    one given as ``keep_index_name``; the result is then restricted to the
    headline metric columns.

    Args:
        short_eval_display: Aggregated results frame to trim.
        keep_index_name: Optional index name to leave untouched.

    Returns:
        The trimmed frame containing only the headline metric columns.
    """
    droppable_levels = ("temperature", "nli", "ellm", "sim", "prompt_id")
    important_columns = [
        "citations/ais_recall",
        "citations/ais_precision",
        "correctness/answer_overlap",
        "correctness/answer_entail",
        "correctness/citations_recall",
        "correctness/citations_precision",
        "quality/answer_relevance",
    ]

    trimmed = short_eval_display.copy()
    for level in droppable_levels:
        if level != keep_index_name:
            trimmed = remove_index(trimmed, level)
    return trimmed[important_columns]


# Fix the temperature at "0.1" so the rows differ only by prompt.
fixed_temperature_mask = parameter_results.index.get_level_values("temperature") == "0.1"
show_cleaned_results(parameter_results[fixed_temperature_mask], keep_index_name="prompt_id")

### Prompts comparison

Unnamed: 0_level_0,Unnamed: 1_level_0,citations/ais_recall,citations/ais_recall,citations/ais_precision,citations/ais_precision,correctness/answer_overlap,correctness/answer_overlap,correctness/answer_entail,correctness/answer_entail,correctness/citations_recall,correctness/citations_recall,correctness/citations_precision,correctness/citations_precision,quality/answer_relevance,quality/answer_relevance
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
llm,prompt_id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
Mistral-7B-Instruct-v0.2,1,0.575644,0.03656,0.871517,0.03835,0.872812,0.014915,0.886667,0.017321,0.751667,0.025981,0.721619,0.050125,0.720431,0.02834
Mistral-7B-Instruct-v0.2,2,0.50154,0.066146,0.862167,0.034978,0.85538,0.027905,0.86,0.023094,0.77,0.043301,0.670627,0.046655,0.704898,0.030174
Mistral-7B-Instruct-v0.2,3,0.580222,0.045033,0.890076,0.024787,0.855936,0.017321,0.863333,0.011547,0.665,0.034641,0.658333,0.04237,0.658884,0.027177


In [7]:
display(Markdown("### Temperature comparison"))
# Fix the prompt at "1" so the rows differ only by temperature.
fixed_prompt_mask = parameter_results.index.get_level_values("prompt_id") == "1"
show_cleaned_results(parameter_results[fixed_prompt_mask], keep_index_name="temperature")

### Temperature comparison

Unnamed: 0_level_0,Unnamed: 1_level_0,citations/ais_recall,citations/ais_recall,citations/ais_precision,citations/ais_precision,correctness/answer_overlap,correctness/answer_overlap,correctness/answer_entail,correctness/answer_entail,correctness/citations_recall,correctness/citations_recall,correctness/citations_precision,correctness/citations_precision,quality/answer_relevance,quality/answer_relevance
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
llm,temperature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
Mistral-7B-Instruct-v0.2,0.01,0.574825,0.027143,0.889365,0.016922,0.858923,0.01299,0.883333,0.017321,0.758333,0.014434,0.744357,0.023419,0.734959,0.015812
Mistral-7B-Instruct-v0.2,0.1,0.575644,0.03656,0.871517,0.03835,0.872812,0.014915,0.886667,0.017321,0.751667,0.025981,0.721619,0.050125,0.720431,0.02834
Mistral-7B-Instruct-v0.2,0.2,0.544493,0.081991,0.877352,0.050804,0.864735,0.029793,0.886667,0.005774,0.77,0.051962,0.741825,0.050897,0.725103,0.03312
Mistral-7B-Instruct-v0.2,0.3,0.566712,0.084729,0.863762,0.066155,0.872534,0.045707,0.883333,0.023094,0.776667,0.059848,0.749746,0.067481,0.723474,0.047856
Mistral-7B-Instruct-v0.2,0.4,0.580572,0.150051,0.877947,0.108934,0.872336,0.044607,0.886667,0.028868,0.775,0.096603,0.738476,0.099659,0.721409,0.04985
Mistral-7B-Instruct-v0.2,0.5,0.569651,0.144597,0.88101,0.085049,0.852962,0.073768,0.88,0.034641,0.761667,0.082169,0.737151,0.114113,0.707704,0.059824
Mistral-7B-Instruct-v0.2,0.6,0.519373,0.168079,0.856773,0.110804,0.845013,0.049038,0.87,0.040415,0.758333,0.12181,0.738574,0.125848,0.715341,0.064925
Mistral-7B-Instruct-v0.2,0.7,0.555853,0.182757,0.858316,0.115458,0.871201,0.047054,0.873333,0.028868,0.77,0.118923,0.73623,0.132095,0.727172,0.05884
Mistral-7B-Instruct-v0.2,0.8,0.505895,0.195393,0.842402,0.131938,0.867148,0.062564,0.86,0.086603,0.751667,0.119697,0.71973,0.123992,0.725166,0.070362
Mistral-7B-Instruct-v0.2,0.9,0.572745,0.204617,0.876745,0.12727,0.82941,0.074434,0.856667,0.057735,0.751667,0.14913,0.731516,0.152158,0.695873,0.073964


In [8]:
display(Markdown("### Evaluation results"))
eval_index = eval_results.index
selected_rows = (
    (eval_index.get_level_values("prompt_id") == "1")
    & (eval_index.get_level_values("temperature") == "0.1")
    # use this model only for training data, because we already have same model without quantization
    & (eval_index.get_level_values("llm") != "Mixtral-8x7B-Instruct-v0.1.Q8_0")
)
# Rank the remaining models by mean citation recall, best first.
eval_display = remove_index(eval_results[selected_rows], "prompt_id").sort_values(
    by=("correctness/citations_recall", "mean"),
    ascending=False,
)
eval_display

### Evaluation results

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,prompt_id,citations/ais_recall,citations/ais_recall,citations/ais_precision,citations/ais_precision,citations/n_sentences,citations/n_sentences,citations/n_total_citations,citations/n_total_citations,citations/n_correct_citations,...,correctness/answer_overlap,correctness/answer_entail,correctness/answer_entail,correctness/citations_recall,correctness/citations_recall,correctness/citations_precision,correctness/citations_precision,quality/answer_relevance,quality/answer_relevance,n_questions
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,mean,std,mean,std,mean,std,mean,std,mean,...,std,mean,std,mean,std,mean,std,mean,std,Unnamed: 25_level_1
llm,temperature,nli,ellm,sim,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2
rag-tge_Llama-3-8B_v1,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.640377,0.100349,0.953748,0.03002,8.153333,1.170721,8.49,1.338763,8.02,...,0.013723,0.923333,0.023094,0.988333,0.00866,0.775833,0.09881,0.735427,0.071202,100
rag-tge_Mistral_v2-4480,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.73317,0.134646,0.954283,0.031338,4.746667,1.208135,5.483333,1.148101,5.22,...,0.017918,0.896667,0.023094,0.971667,0.025981,0.876556,0.070338,0.72928,0.055383,100
rag-tge_Mistral_v2-3360,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.682667,0.146211,0.957922,0.032981,5.566667,1.424317,6.223333,1.360671,5.96,...,0.030777,0.91,0.011547,0.968333,0.025981,0.842111,0.084337,0.725706,0.06322,100
rag-tge_Mistral.Q8,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.812135,0.06052,0.957667,0.021027,2.246667,0.261624,2.95,0.306084,2.796667,...,0.012892,0.866667,0.005774,0.915,0.023094,0.907413,0.040843,0.742347,0.040067,100
gpt-3.5-turbo-0125,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.737556,0.072604,0.995833,0.002887,2.02,0.214022,2.143333,0.207846,2.13,...,0.01299,0.91,0.011547,0.853333,0.023094,0.941667,0.023671,0.751901,0.041098,100
Mixtral-8x7B-Instruct-v0.1,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.6535,0.061199,0.928603,0.029692,2.1,0.148564,2.29,0.205801,2.12,...,0.001443,0.896667,0.011547,0.825,0.034641,0.875556,0.037528,0.726039,0.030875,100
gpt-4-turbo,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.733013,0.045401,0.985,0.005774,1.593333,0.170111,1.79,0.090331,1.76,...,0.007661,0.9,0.011547,0.795,0.028868,0.938778,0.010585,0.747313,0.033935,100
Meta-Llama-3-70B-Instruct,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.691944,0.049556,0.955556,0.017802,1.463333,0.063509,1.88,0.069282,1.796667,...,0.020651,0.873333,0.017321,0.785,0.017321,0.907556,0.009623,0.752517,0.027829,100
zephyr-orpo-141b-A35b-v0.1,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.417516,0.134222,0.796321,0.101052,1.99,0.366975,4.93,1.293896,3.57,...,0.060333,0.86,0.040415,0.775,0.092376,0.54567,0.108477,0.696486,0.059428,100
Meta-Llama-3-8B-Instruct,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.492702,0.060547,0.95121,0.022033,2.186667,0.194752,2.04,0.18071,1.91,...,0.0,0.903333,0.005774,0.753333,0.023094,0.907556,0.018283,0.720451,0.03169,100


In [9]:
clean_results = show_cleaned_results(eval_display)

# Show one table per headline metric, each ranked by that metric's mean (best first).
ranking_metrics = [
    "citations/ais_recall",
    "citations/ais_precision",
    "correctness/answer_overlap",
    "correctness/answer_entail",
    "correctness/citations_recall",
    "correctness/citations_precision",
    "quality/answer_relevance",
]
for metric in ranking_metrics:
    display(Markdown(f"### sorted by: {metric}"))
    display(clean_results.sort_values(by=(metric, "mean"), ascending=False))

### sorted by: citations/ais_recall

Unnamed: 0_level_0,citations/ais_recall,citations/ais_recall,citations/ais_precision,citations/ais_precision,correctness/answer_overlap,correctness/answer_overlap,correctness/answer_entail,correctness/answer_entail,correctness/citations_recall,correctness/citations_recall,correctness/citations_precision,correctness/citations_precision,quality/answer_relevance,quality/answer_relevance
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
llm,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
rag-tge_Mistral.Q8,0.812135,0.06052,0.957667,0.021027,0.860838,0.012892,0.866667,0.005774,0.915,0.023094,0.907413,0.040843,0.742347,0.040067
gpt-3.5-turbo-0125,0.737556,0.072604,0.995833,0.002887,0.884244,0.01299,0.91,0.011547,0.853333,0.023094,0.941667,0.023671,0.751901,0.041098
rag-tge_Mistral_v2-4480,0.73317,0.134646,0.954283,0.031338,0.915308,0.017918,0.896667,0.023094,0.971667,0.025981,0.876556,0.070338,0.72928,0.055383
gpt-4-turbo,0.733013,0.045401,0.985,0.005774,0.877302,0.007661,0.9,0.011547,0.795,0.028868,0.938778,0.010585,0.747313,0.033935
Meta-Llama-3-70B-Instruct,0.691944,0.049556,0.955556,0.017802,0.865923,0.020651,0.873333,0.017321,0.785,0.017321,0.907556,0.009623,0.752517,0.027829
rag-tge_Mistral_v2-3360,0.682667,0.146211,0.957922,0.032981,0.936128,0.030777,0.91,0.011547,0.968333,0.025981,0.842111,0.084337,0.725706,0.06322
Mixtral-8x7B-Instruct-v0.1,0.6535,0.061199,0.928603,0.029692,0.909923,0.001443,0.896667,0.011547,0.825,0.034641,0.875556,0.037528,0.726039,0.030875
rag-tge_Llama-3-8B_v1,0.640377,0.100349,0.953748,0.03002,0.944436,0.013723,0.923333,0.023094,0.988333,0.00866,0.775833,0.09881,0.735427,0.071202
qwen1_5-14b-chat-q8_0,0.633333,0.042339,0.975,0.00866,0.876344,0.006598,0.863333,0.005774,0.66,0.030981,0.92,0.016358,0.719488,0.038641
Mistral-7B-Instruct-v0.2,0.575644,0.03656,0.871517,0.03835,0.872812,0.014915,0.886667,0.017321,0.751667,0.025981,0.721619,0.050125,0.720431,0.02834


### sorted by: citations/ais_precision

Unnamed: 0_level_0,citations/ais_recall,citations/ais_recall,citations/ais_precision,citations/ais_precision,correctness/answer_overlap,correctness/answer_overlap,correctness/answer_entail,correctness/answer_entail,correctness/citations_recall,correctness/citations_recall,correctness/citations_precision,correctness/citations_precision,quality/answer_relevance,quality/answer_relevance
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
llm,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
gpt-3.5-turbo-0125,0.737556,0.072604,0.995833,0.002887,0.884244,0.01299,0.91,0.011547,0.853333,0.023094,0.941667,0.023671,0.751901,0.041098
qwen1_5-32b-chat-q8_0,0.512222,0.031856,0.993333,0.002887,0.873725,0.005774,0.896667,0.005774,0.655,0.00866,0.961667,0.0,0.710307,0.026912
gpt-4-turbo,0.733013,0.045401,0.985,0.005774,0.877302,0.007661,0.9,0.011547,0.795,0.028868,0.938778,0.010585,0.747313,0.033935
qwen1_5-110b-chat,0.528167,0.095486,0.976111,0.026943,0.867315,0.020207,0.893333,0.017321,0.735,0.051962,0.929889,0.023286,0.737396,0.055224
qwen1_5-14b-chat-q8_0,0.633333,0.042339,0.975,0.00866,0.876344,0.006598,0.863333,0.005774,0.66,0.030981,0.92,0.016358,0.719488,0.038641
rag-tge_Mistral_v2-3360,0.682667,0.146211,0.957922,0.032981,0.936128,0.030777,0.91,0.011547,0.968333,0.025981,0.842111,0.084337,0.725706,0.06322
rag-tge_Mistral.Q8,0.812135,0.06052,0.957667,0.021027,0.860838,0.012892,0.866667,0.005774,0.915,0.023094,0.907413,0.040843,0.742347,0.040067
Meta-Llama-3-70B-Instruct,0.691944,0.049556,0.955556,0.017802,0.865923,0.020651,0.873333,0.017321,0.785,0.017321,0.907556,0.009623,0.752517,0.027829
rag-tge_Mistral_v2-4480,0.73317,0.134646,0.954283,0.031338,0.915308,0.017918,0.896667,0.023094,0.971667,0.025981,0.876556,0.070338,0.72928,0.055383
rag-tge_Llama-3-8B_v1,0.640377,0.100349,0.953748,0.03002,0.944436,0.013723,0.923333,0.023094,0.988333,0.00866,0.775833,0.09881,0.735427,0.071202


### sorted by: correctness/answer_overlap

Unnamed: 0_level_0,citations/ais_recall,citations/ais_recall,citations/ais_precision,citations/ais_precision,correctness/answer_overlap,correctness/answer_overlap,correctness/answer_entail,correctness/answer_entail,correctness/citations_recall,correctness/citations_recall,correctness/citations_precision,correctness/citations_precision,quality/answer_relevance,quality/answer_relevance
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
llm,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
rag-tge_Llama-3-8B_v1,0.640377,0.100349,0.953748,0.03002,0.944436,0.013723,0.923333,0.023094,0.988333,0.00866,0.775833,0.09881,0.735427,0.071202
rag-tge_Mistral_v2-3360,0.682667,0.146211,0.957922,0.032981,0.936128,0.030777,0.91,0.011547,0.968333,0.025981,0.842111,0.084337,0.725706,0.06322
Meta-Llama-3-8B-Instruct,0.492702,0.060547,0.95121,0.022033,0.917846,0.0,0.903333,0.005774,0.753333,0.023094,0.907556,0.018283,0.720451,0.03169
rag-tge_Mistral_v2-4480,0.73317,0.134646,0.954283,0.031338,0.915308,0.017918,0.896667,0.023094,0.971667,0.025981,0.876556,0.070338,0.72928,0.055383
Mixtral-8x7B-Instruct-v0.1,0.6535,0.061199,0.928603,0.029692,0.909923,0.001443,0.896667,0.011547,0.825,0.034641,0.875556,0.037528,0.726039,0.030875
gpt-3.5-turbo-0125,0.737556,0.072604,0.995833,0.002887,0.884244,0.01299,0.91,0.011547,0.853333,0.023094,0.941667,0.023671,0.751901,0.041098
gpt-4-turbo,0.733013,0.045401,0.985,0.005774,0.877302,0.007661,0.9,0.011547,0.795,0.028868,0.938778,0.010585,0.747313,0.033935
qwen1_5-14b-chat-q8_0,0.633333,0.042339,0.975,0.00866,0.876344,0.006598,0.863333,0.005774,0.66,0.030981,0.92,0.016358,0.719488,0.038641
qwen1_5-32b-chat-q8_0,0.512222,0.031856,0.993333,0.002887,0.873725,0.005774,0.896667,0.005774,0.655,0.00866,0.961667,0.0,0.710307,0.026912
Mistral-7B-Instruct-v0.2,0.575644,0.03656,0.871517,0.03835,0.872812,0.014915,0.886667,0.017321,0.751667,0.025981,0.721619,0.050125,0.720431,0.02834


### sorted by: correctness/answer_entail

Unnamed: 0_level_0,citations/ais_recall,citations/ais_recall,citations/ais_precision,citations/ais_precision,correctness/answer_overlap,correctness/answer_overlap,correctness/answer_entail,correctness/answer_entail,correctness/citations_recall,correctness/citations_recall,correctness/citations_precision,correctness/citations_precision,quality/answer_relevance,quality/answer_relevance
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
llm,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
rag-tge_Llama-3-8B_v1,0.640377,0.100349,0.953748,0.03002,0.944436,0.013723,0.923333,0.023094,0.988333,0.00866,0.775833,0.09881,0.735427,0.071202
rag-tge_Mistral_v2-3360,0.682667,0.146211,0.957922,0.032981,0.936128,0.030777,0.91,0.011547,0.968333,0.025981,0.842111,0.084337,0.725706,0.06322
gpt-3.5-turbo-0125,0.737556,0.072604,0.995833,0.002887,0.884244,0.01299,0.91,0.011547,0.853333,0.023094,0.941667,0.023671,0.751901,0.041098
Meta-Llama-3-8B-Instruct,0.492702,0.060547,0.95121,0.022033,0.917846,0.0,0.903333,0.005774,0.753333,0.023094,0.907556,0.018283,0.720451,0.03169
gpt-4-turbo,0.733013,0.045401,0.985,0.005774,0.877302,0.007661,0.9,0.011547,0.795,0.028868,0.938778,0.010585,0.747313,0.033935
Mixtral-8x7B-Instruct-v0.1,0.6535,0.061199,0.928603,0.029692,0.909923,0.001443,0.896667,0.011547,0.825,0.034641,0.875556,0.037528,0.726039,0.030875
rag-tge_Mistral_v2-4480,0.73317,0.134646,0.954283,0.031338,0.915308,0.017918,0.896667,0.023094,0.971667,0.025981,0.876556,0.070338,0.72928,0.055383
qwen1_5-32b-chat-q8_0,0.512222,0.031856,0.993333,0.002887,0.873725,0.005774,0.896667,0.005774,0.655,0.00866,0.961667,0.0,0.710307,0.026912
qwen1_5-110b-chat,0.528167,0.095486,0.976111,0.026943,0.867315,0.020207,0.893333,0.017321,0.735,0.051962,0.929889,0.023286,0.737396,0.055224
Mistral-7B-Instruct-v0.2,0.575644,0.03656,0.871517,0.03835,0.872812,0.014915,0.886667,0.017321,0.751667,0.025981,0.721619,0.050125,0.720431,0.02834


### sorted by: correctness/citations_recall

Unnamed: 0_level_0,citations/ais_recall,citations/ais_recall,citations/ais_precision,citations/ais_precision,correctness/answer_overlap,correctness/answer_overlap,correctness/answer_entail,correctness/answer_entail,correctness/citations_recall,correctness/citations_recall,correctness/citations_precision,correctness/citations_precision,quality/answer_relevance,quality/answer_relevance
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
llm,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
rag-tge_Llama-3-8B_v1,0.640377,0.100349,0.953748,0.03002,0.944436,0.013723,0.923333,0.023094,0.988333,0.00866,0.775833,0.09881,0.735427,0.071202
rag-tge_Mistral_v2-4480,0.73317,0.134646,0.954283,0.031338,0.915308,0.017918,0.896667,0.023094,0.971667,0.025981,0.876556,0.070338,0.72928,0.055383
rag-tge_Mistral_v2-3360,0.682667,0.146211,0.957922,0.032981,0.936128,0.030777,0.91,0.011547,0.968333,0.025981,0.842111,0.084337,0.725706,0.06322
rag-tge_Mistral.Q8,0.812135,0.06052,0.957667,0.021027,0.860838,0.012892,0.866667,0.005774,0.915,0.023094,0.907413,0.040843,0.742347,0.040067
gpt-3.5-turbo-0125,0.737556,0.072604,0.995833,0.002887,0.884244,0.01299,0.91,0.011547,0.853333,0.023094,0.941667,0.023671,0.751901,0.041098
Mixtral-8x7B-Instruct-v0.1,0.6535,0.061199,0.928603,0.029692,0.909923,0.001443,0.896667,0.011547,0.825,0.034641,0.875556,0.037528,0.726039,0.030875
gpt-4-turbo,0.733013,0.045401,0.985,0.005774,0.877302,0.007661,0.9,0.011547,0.795,0.028868,0.938778,0.010585,0.747313,0.033935
Meta-Llama-3-70B-Instruct,0.691944,0.049556,0.955556,0.017802,0.865923,0.020651,0.873333,0.017321,0.785,0.017321,0.907556,0.009623,0.752517,0.027829
zephyr-orpo-141b-A35b-v0.1,0.417516,0.134222,0.796321,0.101052,0.807244,0.060333,0.86,0.040415,0.775,0.092376,0.54567,0.108477,0.696486,0.059428
Meta-Llama-3-8B-Instruct,0.492702,0.060547,0.95121,0.022033,0.917846,0.0,0.903333,0.005774,0.753333,0.023094,0.907556,0.018283,0.720451,0.03169


### sorted by: correctness/citations_precision

Unnamed: 0_level_0,citations/ais_recall,citations/ais_recall,citations/ais_precision,citations/ais_precision,correctness/answer_overlap,correctness/answer_overlap,correctness/answer_entail,correctness/answer_entail,correctness/citations_recall,correctness/citations_recall,correctness/citations_precision,correctness/citations_precision,quality/answer_relevance,quality/answer_relevance
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
llm,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
qwen1_5-32b-chat-q8_0,0.512222,0.031856,0.993333,0.002887,0.873725,0.005774,0.896667,0.005774,0.655,0.00866,0.961667,0.0,0.710307,0.026912
gpt-3.5-turbo-0125,0.737556,0.072604,0.995833,0.002887,0.884244,0.01299,0.91,0.011547,0.853333,0.023094,0.941667,0.023671,0.751901,0.041098
gpt-4-turbo,0.733013,0.045401,0.985,0.005774,0.877302,0.007661,0.9,0.011547,0.795,0.028868,0.938778,0.010585,0.747313,0.033935
qwen1_5-110b-chat,0.528167,0.095486,0.976111,0.026943,0.867315,0.020207,0.893333,0.017321,0.735,0.051962,0.929889,0.023286,0.737396,0.055224
qwen1_5-14b-chat-q8_0,0.633333,0.042339,0.975,0.00866,0.876344,0.006598,0.863333,0.005774,0.66,0.030981,0.92,0.016358,0.719488,0.038641
Meta-Llama-3-70B-Instruct,0.691944,0.049556,0.955556,0.017802,0.865923,0.020651,0.873333,0.017321,0.785,0.017321,0.907556,0.009623,0.752517,0.027829
Meta-Llama-3-8B-Instruct,0.492702,0.060547,0.95121,0.022033,0.917846,0.0,0.903333,0.005774,0.753333,0.023094,0.907556,0.018283,0.720451,0.03169
rag-tge_Mistral.Q8,0.812135,0.06052,0.957667,0.021027,0.860838,0.012892,0.866667,0.005774,0.915,0.023094,0.907413,0.040843,0.742347,0.040067
rag-tge_Mistral_v2-4480,0.73317,0.134646,0.954283,0.031338,0.915308,0.017918,0.896667,0.023094,0.971667,0.025981,0.876556,0.070338,0.72928,0.055383
Mixtral-8x7B-Instruct-v0.1,0.6535,0.061199,0.928603,0.029692,0.909923,0.001443,0.896667,0.011547,0.825,0.034641,0.875556,0.037528,0.726039,0.030875


### sorted by: quality/answer_relevance

Unnamed: 0_level_0,citations/ais_recall,citations/ais_recall,citations/ais_precision,citations/ais_precision,correctness/answer_overlap,correctness/answer_overlap,correctness/answer_entail,correctness/answer_entail,correctness/citations_recall,correctness/citations_recall,correctness/citations_precision,correctness/citations_precision,quality/answer_relevance,quality/answer_relevance
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
llm,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
qwen1_5-7b-chat-q8_0,0.233444,0.063671,0.862,0.057158,0.864821,0.024118,0.84,0.023094,0.548333,0.074282,0.800444,0.078327,0.767839,0.034364
Meta-Llama-3-70B-Instruct,0.691944,0.049556,0.955556,0.017802,0.865923,0.020651,0.873333,0.017321,0.785,0.017321,0.907556,0.009623,0.752517,0.027829
gpt-3.5-turbo-0125,0.737556,0.072604,0.995833,0.002887,0.884244,0.01299,0.91,0.011547,0.853333,0.023094,0.941667,0.023671,0.751901,0.041098
gpt-4-turbo,0.733013,0.045401,0.985,0.005774,0.877302,0.007661,0.9,0.011547,0.795,0.028868,0.938778,0.010585,0.747313,0.033935
rag-tge_Mistral.Q8,0.812135,0.06052,0.957667,0.021027,0.860838,0.012892,0.866667,0.005774,0.915,0.023094,0.907413,0.040843,0.742347,0.040067
gemma-1.1-7b-it,0.367889,0.122999,0.766667,0.082995,0.722513,0.075822,0.726667,0.075056,0.471667,0.080829,0.660222,0.101471,0.738827,0.043209
Mistral-7B-Instruct-v0.1,0.005,0.0,0.01,0.0,0.79041,0.014434,0.826667,0.017321,0.01,0.0,0.006667,0.0,0.73785,0.02713
qwen1_5-110b-chat,0.528167,0.095486,0.976111,0.026943,0.867315,0.020207,0.893333,0.017321,0.735,0.051962,0.929889,0.023286,0.737396,0.055224
rag-tge_Llama-3-8B_v1,0.640377,0.100349,0.953748,0.03002,0.944436,0.013723,0.923333,0.023094,0.988333,0.00866,0.775833,0.09881,0.735427,0.071202
rag-tge_Mistral_v2-4480,0.73317,0.134646,0.954283,0.031338,0.915308,0.017918,0.896667,0.023094,0.971667,0.025981,0.876556,0.070338,0.72928,0.055383


In [10]:
training_heading = Markdown("### Training results")
display(training_heading)
# Same aggregated view as for the evaluation split, computed on the training split.
train_display = remove_index(train_results, "prompt_id")
train_display

### Training results

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,prompt_id,citations/ais_recall,citations/ais_recall,citations/ais_precision,citations/ais_precision,citations/n_sentences,citations/n_sentences,citations/n_total_citations,citations/n_total_citations,citations/n_correct_citations,...,correctness/answer_overlap,correctness/answer_entail,correctness/answer_entail,correctness/citations_recall,correctness/citations_recall,correctness/citations_precision,correctness/citations_precision,quality/answer_relevance,quality/answer_relevance,n_questions
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,mean,std,mean,std,mean,std,mean,std,mean,...,std,mean,std,mean,std,mean,std,mean,std,Unnamed: 25_level_1
llm,temperature,nli,ellm,sim,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2
Meta-Llama-3-70B-Instruct,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.742273,0.031242,0.958344,0.010844,1.454174,0.063348,1.978738,0.085451,1.87957,...,0.0101,0.891749,0.010757,0.810243,0.015654,0.910769,0.011854,0.742832,0.024321,17872
Mixtral-8x7B-Instruct-v0.1,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.67031,0.063516,0.931789,0.027845,2.077639,0.209854,2.475505,0.274398,2.285696,...,0.017245,0.879884,0.020221,0.824611,0.037924,0.845152,0.03792,0.72991,0.034703,7659
Mixtral-8x7B-Instruct-v0.1.Q8_0,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.623737,0.070232,0.901718,0.034851,2.656944,0.27358,3.031389,0.383894,2.768167,...,0.018198,0.877611,0.019919,0.847833,0.036453,0.797328,0.046054,0.724574,0.036725,6000
qwen1_5-110b-chat,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.56492,0.11713,0.96412,0.030177,1.936333,0.256917,1.789667,0.213576,1.687333,...,0.022317,0.874333,0.025403,0.737167,0.053828,0.909575,0.036772,0.725493,0.043379,1000
qwen1_5-32b-chat-q8_0,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.549691,0.031801,0.988889,0.006415,1.6,0.083395,1.362963,0.044905,1.340741,...,0.0102,0.844444,0.0,0.633333,0.022453,0.953704,0.003208,0.72265,0.017761,90
zephyr-orpo-141b-A35b-v0.1,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.443428,0.127016,0.797227,0.101713,1.991263,0.372804,4.609655,1.25203,3.25557,...,0.036494,0.845347,0.034536,0.738204,0.09721,0.562458,0.105757,0.682368,0.063733,763
