In [1]:
import os
import pandas as pd
import json
from IPython.display import display, Markdown, Latex
from tqdm.auto import tqdm

from common.consts import RESULTS_DIR, EVAL_SIZE
from common.utils import filename_to_obj, remove_index

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def results_as_pandas(filename):
    """Load one JSON-lines results file into a flat DataFrame, one row per evaluation.

    Each non-blank line of ``RESULTS_DIR/filename`` is parsed as a JSON record.
    Every key/value pair returned by ``filename_to_obj(filename)`` is added as a
    constant column. The ``evaluations`` list of each record is exploded so every
    evaluation gets its own row; the position of the record in the file is kept
    in the ``question_idx`` column. Each evaluation is then expanded into
    ``"<group>/<metric>"`` columns and the raw ``evaluations`` column is dropped.

    Args:
        filename: Name of a results file located inside ``RESULTS_DIR``.

    Returns:
        A ``pandas.DataFrame``. If the file contains no records, an empty frame
        is returned (after printing a notice) without any parameter columns.
    """
    path = os.path.join(RESULTS_DIR, filename)
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(path, "r", encoding="utf-8") as f:
        # Skip blank lines so a stray empty line does not crash json.loads.
        records = [json.loads(line) for line in f if line.strip()]
    data = pd.DataFrame(records)

    if len(data) == 0:
        print(f"empty file: {filename}")
        return data

    params = filename_to_obj(filename)
    for k, v in params.items():
        data[k] = v

    data = data.explode("evaluations")
    # After explode the index still holds the record position; keep it as a column.
    data = data.rename_axis("question_idx").reset_index()

    # Expand the evaluation objects once (the row-wise apply is the expensive step),
    # then expand every evaluation group into prefixed metric columns.
    evaluations = data["evaluations"].apply(pd.Series)
    pieces = [data.drop(columns=["evaluations"])]
    for col in evaluations.columns:
        pieces.append(evaluations[col].apply(pd.Series).add_prefix(f"{col}/"))

    # A single concat instead of re-concatenating the growing frame per group.
    return pd.concat(pieces, axis=1)


# os.listdir returns entries in arbitrary order; sort so that the load order and
# the file used to derive the parameter names are deterministic across runs.
files = sorted(os.listdir(RESULTS_DIR))
if not files:
    raise FileNotFoundError(f"no result files found in {RESULTS_DIR}")
# The parameter names are taken from the first file name.
params_names = list(filename_to_obj(files[0]).keys())
# ignore_index gives the combined frame unique row labels instead of repeating
# each file's own 0..n labels.
all_results = pd.concat([results_as_pandas(f) for f in tqdm(files)], ignore_index=True)
all_results.head()

100%|██████████| 27/27 [00:14<00:00,  1.90it/s]


Unnamed: 0,question_idx,question_id,llm,prompt_id,temperature,nli,ellm,sim,citations/ais_recall,citations/ais_precision,...,citations/supported,citations/citations,citations/correct_citations,citations/out_of_range,correctness/answer_overlap,correctness/answer_entail,correctness/citations_recall,correctness/citations_precision,quality/answer_relevance,quality/new_question
0,0,5abab42e55429955dce3eed2,gpt-3.5-turbo-0125,1,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,0.0,1.0,...,"[0, 0]","[[], [3]]","[[], [True]]","[0, 0]",0.5,0.0,0.5,1.0,0.730736,What is the relationship between Jordan Subban...
1,0,5abab42e55429955dce3eed2,gpt-3.5-turbo-0125,1,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,0.0,1.0,...,"[0, 0]","[[], [3]]","[[], [True]]","[0, 0]",0.5,0.0,0.5,1.0,0.822013,What professional hockey player was drafted fi...
2,0,5abab42e55429955dce3eed2,gpt-3.5-turbo-0125,1,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,0.0,1.0,...,"[0, 0]","[[], [3]]","[[], [True]]","[0, 0]",0.5,0.0,0.5,1.0,0.730736,What is the relationship between Jordan Subban...
3,1,5a761900554299109176e648,gpt-3.5-turbo-0125,1,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1.0,1.0,...,[1],[[1]],[[True]],[0],1.0,1.0,0.5,1.0,0.965509,What is the name of the lobbying group that wa...
4,1,5a761900554299109176e648,gpt-3.5-turbo-0125,1,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1.0,1.0,...,[1],[[1]],[[True]],[0],1.0,1.0,0.5,1.0,0.974511,What is the name of the lobbying group that wa...


In [3]:
all_obj_cols = all_results.select_dtypes(include=["object"]).columns
# Object-typed columns to keep: the run parameters and the question identifier.
keep_obj_cols = {*params_names, "question_id"}
# Preserve the frame's column order so the printed list is deterministic, and do
# not fail when "question_id" is absent from the object-typed columns.
drop_obj_cols = [col for col in all_obj_cols if col not in keep_obj_cols]
print(f"Dropping columns: {drop_obj_cols}")
all_num_results = all_results.drop(columns=drop_obj_cols)

Dropping columns: ['citations/correct_citations', 'citations/sentences', 'quality/new_question', 'citations/citations', 'citations/supported', 'citations/out_of_range']


In [4]:
# Rows whose question index is below EVAL_SIZE form the evaluation split;
# all remaining rows form the training split.
question_idx = all_num_results["question_idx"]
eval_split = all_num_results[question_idx.lt(EVAL_SIZE)]
train_split = all_num_results[question_idx.ge(EVAL_SIZE)]

In [5]:
def aggregate(split, group_keys=None):
    """Aggregate per-evaluation rows into one row per parameter combination.

    The rows are first grouped by the parameter columns plus ``question_id`` and
    reduced to the mean and standard deviation of every remaining column. Those
    per-question statistics are then averaged over the questions of each
    parameter combination, and the number of questions is recorded.

    Args:
        split: DataFrame containing ``question_idx``, ``question_id``, the
            parameter columns and the columns to aggregate.
        group_keys: Names of the parameter columns to group by. Defaults to the
            notebook-level ``params_names`` so existing calls keep working.

    Returns:
        A DataFrame indexed by ``group_keys`` whose columns are
        ``(metric, "mean")`` / ``(metric, "std")`` pairs plus ``n_questions``.
    """
    if group_keys is None:
        # Backward-compatible fallback to the notebook-level parameter list.
        group_keys = params_names
    group_keys = list(group_keys)

    # question_idx is only a positional identifier and must not be averaged.
    split = split.drop(columns=["question_idx"])
    results_with_std_for_each_question = split.groupby([*group_keys, "question_id"]).agg(["mean", "std"])
    # Group again by the parameter levels of the index to average over questions.
    results_for_each_model = results_with_std_for_each_question.groupby(group_keys)
    results = results_for_each_model.mean()
    results["n_questions"] = results_for_each_model.size()
    return results


eval_results, train_results = aggregate(eval_split), aggregate(train_split)

# Every evaluated configuration is expected to cover the same number of questions.
n_distinct_question_counts = eval_results["n_questions"].nunique()
if n_distinct_question_counts != 1:
    print("Warning: not all rows in evaluation have the same number of examples")

In [6]:
display(Markdown("### Prompts comparison"))
# Keep only the Mistral-7B runs; this subset is reused by the temperature comparison.
llm_level = eval_results.index.get_level_values("llm")
parameter_results = eval_results[llm_level == "Mistral-7B-Instruct-v0.2"]
# Fix the temperature (the index level is compared as a string) so only the prompt varies.
temperature_level = parameter_results.index.get_level_values("temperature")
parameter_results[temperature_level == "0.1"]

### Prompts comparison

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,citations/ais_recall,citations/ais_recall,citations/ais_precision,citations/ais_precision,citations/n_sentences,citations/n_sentences,citations/n_total_citations,citations/n_total_citations,citations/n_correct_citations,citations/n_correct_citations,...,correctness/answer_overlap,correctness/answer_entail,correctness/answer_entail,correctness/citations_recall,correctness/citations_recall,correctness/citations_precision,correctness/citations_precision,quality/answer_relevance,quality/answer_relevance,n_questions
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,...,std,mean,std,mean,std,mean,std,mean,std,Unnamed: 26_level_1
llm,prompt_id,temperature,nli,ellm,sim,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2
Mistral-7B-Instruct-v0.2,1,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,0.575644,0.03656,0.871517,0.03835,2.573333,0.212475,3.016667,0.372709,2.676667,0.302403,...,0.014915,0.886667,0.017321,0.751667,0.025981,0.721619,0.050125,0.720431,0.02834,100
Mistral-7B-Instruct-v0.2,2,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,0.50154,0.066146,0.862167,0.034978,2.473333,0.204254,2.883333,0.333316,2.536667,0.298675,...,0.027905,0.86,0.023094,0.77,0.043301,0.670627,0.046655,0.704898,0.030174,100
Mistral-7B-Instruct-v0.2,3,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,0.580222,0.045033,0.890076,0.024787,1.98,0.156519,2.416667,0.282902,2.15,0.24094,...,0.017321,0.863333,0.011547,0.665,0.034641,0.658333,0.04237,0.658884,0.027177,100


In [7]:
display(Markdown("### Temperature comparison"))
# Fix the prompt (the index level is compared as a string) so only the temperature varies.
prompt_level = parameter_results.index.get_level_values("prompt_id")
parameter_results[prompt_level == "1"]

### Temperature comparison

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,citations/ais_recall,citations/ais_recall,citations/ais_precision,citations/ais_precision,citations/n_sentences,citations/n_sentences,citations/n_total_citations,citations/n_total_citations,citations/n_correct_citations,citations/n_correct_citations,...,correctness/answer_overlap,correctness/answer_entail,correctness/answer_entail,correctness/citations_recall,correctness/citations_recall,correctness/citations_precision,correctness/citations_precision,quality/answer_relevance,quality/answer_relevance,n_questions
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,...,std,mean,std,mean,std,mean,std,mean,std,Unnamed: 26_level_1
llm,prompt_id,temperature,nli,ellm,sim,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2
Mistral-7B-Instruct-v0.2,1,0.01,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,0.574825,0.027143,0.889365,0.016922,2.576667,0.130745,2.966667,0.223121,2.666667,0.212073,...,0.01299,0.883333,0.017321,0.758333,0.014434,0.744357,0.023419,0.734959,0.015812,100
Mistral-7B-Instruct-v0.2,1,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,0.575644,0.03656,0.871517,0.03835,2.573333,0.212475,3.016667,0.372709,2.676667,0.302403,...,0.014915,0.886667,0.017321,0.751667,0.025981,0.721619,0.050125,0.720431,0.02834,100
Mistral-7B-Instruct-v0.2,1,0.2,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,0.544493,0.081991,0.877352,0.050804,2.626667,0.320651,3.0,0.60033,2.63,0.482049,...,0.029793,0.886667,0.005774,0.77,0.051962,0.741825,0.050897,0.725103,0.03312,100
Mistral-7B-Instruct-v0.2,1,0.3,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,0.566712,0.084729,0.863762,0.066155,2.546667,0.455952,2.913333,0.668395,2.536667,0.568113,...,0.045707,0.883333,0.023094,0.776667,0.059848,0.749746,0.067481,0.723474,0.047856,100
Mistral-7B-Instruct-v0.2,1,0.4,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,0.580572,0.150051,0.877947,0.108934,2.623333,0.659426,3.006667,0.822505,2.623333,0.761222,...,0.044607,0.886667,0.028868,0.775,0.096603,0.738476,0.099659,0.721409,0.04985,100
Mistral-7B-Instruct-v0.2,1,0.5,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,0.569651,0.144597,0.88101,0.085049,2.51,0.579914,2.903333,0.792816,2.57,0.68593,...,0.073768,0.88,0.034641,0.761667,0.082169,0.737151,0.114113,0.707704,0.059824,100
Mistral-7B-Instruct-v0.2,1,0.6,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,0.519373,0.168079,0.856773,0.110804,2.583333,0.631519,3.0,0.936496,2.453333,0.813529,...,0.049038,0.87,0.040415,0.758333,0.12181,0.738574,0.125848,0.715341,0.064925,100
Mistral-7B-Instruct-v0.2,1,0.7,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,0.555853,0.182757,0.858316,0.115458,2.55,0.585797,2.996667,0.863642,2.553333,0.708564,...,0.047054,0.873333,0.028868,0.77,0.118923,0.73623,0.132095,0.727172,0.05884,100
Mistral-7B-Instruct-v0.2,1,0.8,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,0.505895,0.195393,0.842402,0.131938,2.6,0.702563,3.03,1.059032,2.506667,0.900543,...,0.062564,0.86,0.086603,0.751667,0.119697,0.71973,0.123992,0.725166,0.070362,100
Mistral-7B-Instruct-v0.2,1,0.9,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,0.572745,0.204617,0.876745,0.12727,2.38,0.696808,2.583333,0.846552,2.32,0.789139,...,0.074434,0.856667,0.057735,0.751667,0.14913,0.731516,0.152158,0.695873,0.073964,100


In [8]:
display(Markdown("### Evaluation results"))
# Restrict to one configuration (prompt "1", temperature "0.1"; both index levels
# are compared as strings) so that the rows differ only by model.
prompt_level = eval_results.index.get_level_values("prompt_id")
eval_display = eval_results[prompt_level == "1"]
temperature_level = eval_display.index.get_level_values("temperature")
eval_display = eval_display[temperature_level == "0.1"]
# Take prompt_id out of the index, then rank by mean citation recall, best first.
eval_display = remove_index(eval_display, "prompt_id").sort_values(
    by=("correctness/citations_recall", "mean"),
    ascending=False,
)
eval_display

### Evaluation results

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,prompt_id,citations/ais_recall,citations/ais_recall,citations/ais_precision,citations/ais_precision,citations/n_sentences,citations/n_sentences,citations/n_total_citations,citations/n_total_citations,citations/n_correct_citations,...,correctness/answer_overlap,correctness/answer_entail,correctness/answer_entail,correctness/citations_recall,correctness/citations_recall,correctness/citations_precision,correctness/citations_precision,quality/answer_relevance,quality/answer_relevance,n_questions
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,mean,std,mean,std,mean,std,mean,std,mean,...,std,mean,std,mean,std,mean,std,mean,std,Unnamed: 25_level_1
llm,temperature,nli,ellm,sim,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2
mixtral-8x7b-instruct-v0.1.Q8_0,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.604452,0.084171,0.921198,0.053556,2.746667,0.294585,2.893333,0.438508,2.696667,...,0.007698,0.856667,0.023094,0.871667,0.046188,0.821722,0.066027,0.711417,0.048184,100
gpt-3.5-turbo-0125,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.737556,0.072604,0.995833,0.002887,2.02,0.214022,2.143333,0.207846,2.13,...,0.01299,0.91,0.011547,0.853333,0.023094,0.941667,0.023671,0.751901,0.041098,100
Mixtral-8x7B-Instruct-v0.1,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.6535,0.061199,0.928603,0.029692,2.1,0.148564,2.29,0.205801,2.12,...,0.001443,0.896667,0.011547,0.825,0.034641,0.875556,0.037528,0.726039,0.030875,100
gpt-4-turbo,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.733013,0.045401,0.985,0.005774,1.593333,0.170111,1.79,0.090331,1.76,...,0.007661,0.9,0.011547,0.795,0.028868,0.938778,0.010585,0.747313,0.033935,100
Meta-Llama-3-70B-Instruct,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.691944,0.049556,0.955556,0.017802,1.463333,0.063509,1.88,0.069282,1.796667,...,0.020651,0.873333,0.017321,0.785,0.017321,0.907556,0.009623,0.752517,0.027829,100
Meta-Llama-3-8B-Instruct,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.492702,0.060547,0.95121,0.022033,2.186667,0.194752,2.04,0.18071,1.91,...,0.0,0.903333,0.005774,0.753333,0.023094,0.907556,0.018283,0.720451,0.03169,100
Mistral-7B-Instruct-v0.2,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.575644,0.03656,0.871517,0.03835,2.573333,0.212475,3.016667,0.372709,2.676667,...,0.014915,0.886667,0.017321,0.751667,0.025981,0.721619,0.050125,0.720431,0.02834,100
qwen1_5-110b-chat,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.528167,0.095486,0.976111,0.026943,1.976667,0.222073,1.673333,0.127017,1.623333,...,0.020207,0.893333,0.017321,0.735,0.051962,0.929889,0.023286,0.737396,0.055224,100
zephyr-7b-beta,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.40669,0.090558,0.910524,0.068665,3.28,0.395182,2.57,0.653154,2.343333,...,0.028674,0.866667,0.046188,0.626667,0.080056,0.727884,0.088494,0.715053,0.041852,100
Phi-3-mini-4k-instruct,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.252796,0.030503,0.578455,0.06958,8.4,2.654658,1.633333,0.29618,1.49,...,0.012317,0.78,0.023094,0.451667,0.051188,0.457889,0.064275,0.663574,0.03725,100


In [9]:
def show_cleaned_results(short_eval_display):
    """Return a compact view of an aggregated results table.

    The ``temperature``, ``nli``, ``ellm`` and ``sim`` levels are passed, one at
    a time, through ``remove_index``, and the result is restricted to the
    headline metric columns listed below.

    Args:
        short_eval_display: Aggregated results frame to condense.

    Returns:
        The frame after index removal, containing only the selected metrics.
    """
    for level in ("temperature", "nli", "ellm", "sim"):
        short_eval_display = remove_index(short_eval_display, level)

    important_columns = [
        "citations/ais_recall",
        "citations/ais_precision",
        "correctness/answer_overlap",
        "correctness/answer_entail",
        "correctness/citations_recall",
        "correctness/citations_precision",
        "quality/answer_relevance",
    ]
    return short_eval_display[important_columns]


# Render the condensed view of the evaluation table as the cell output.
show_cleaned_results(eval_display)

Unnamed: 0_level_0,citations/ais_recall,citations/ais_recall,citations/ais_precision,citations/ais_precision,correctness/answer_overlap,correctness/answer_overlap,correctness/answer_entail,correctness/answer_entail,correctness/citations_recall,correctness/citations_recall,correctness/citations_precision,correctness/citations_precision,quality/answer_relevance,quality/answer_relevance
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
llm,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
mixtral-8x7b-instruct-v0.1.Q8_0,0.604452,0.084171,0.921198,0.053556,0.882598,0.007698,0.856667,0.023094,0.871667,0.046188,0.821722,0.066027,0.711417,0.048184
gpt-3.5-turbo-0125,0.737556,0.072604,0.995833,0.002887,0.884244,0.01299,0.91,0.011547,0.853333,0.023094,0.941667,0.023671,0.751901,0.041098
Mixtral-8x7B-Instruct-v0.1,0.6535,0.061199,0.928603,0.029692,0.909923,0.001443,0.896667,0.011547,0.825,0.034641,0.875556,0.037528,0.726039,0.030875
gpt-4-turbo,0.733013,0.045401,0.985,0.005774,0.877302,0.007661,0.9,0.011547,0.795,0.028868,0.938778,0.010585,0.747313,0.033935
Meta-Llama-3-70B-Instruct,0.691944,0.049556,0.955556,0.017802,0.865923,0.020651,0.873333,0.017321,0.785,0.017321,0.907556,0.009623,0.752517,0.027829
Meta-Llama-3-8B-Instruct,0.492702,0.060547,0.95121,0.022033,0.917846,0.0,0.903333,0.005774,0.753333,0.023094,0.907556,0.018283,0.720451,0.03169
Mistral-7B-Instruct-v0.2,0.575644,0.03656,0.871517,0.03835,0.872812,0.014915,0.886667,0.017321,0.751667,0.025981,0.721619,0.050125,0.720431,0.02834
qwen1_5-110b-chat,0.528167,0.095486,0.976111,0.026943,0.867315,0.020207,0.893333,0.017321,0.735,0.051962,0.929889,0.023286,0.737396,0.055224
zephyr-7b-beta,0.40669,0.090558,0.910524,0.068665,0.853718,0.028674,0.866667,0.046188,0.626667,0.080056,0.727884,0.088494,0.715053,0.041852
Phi-3-mini-4k-instruct,0.252796,0.030503,0.578455,0.06958,0.711068,0.012317,0.78,0.023094,0.451667,0.051188,0.457889,0.064275,0.663574,0.03725


In [10]:
display(Markdown("### Training results"))
# train_results is the aggregate() output for the training split; the
# "prompt_id" level is passed through remove_index before display
# (presumably taking it out of the index - confirm against common.utils).
train_display = remove_index(train_results, "prompt_id")
train_display

### Training results

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,prompt_id,citations/ais_recall,citations/ais_recall,citations/ais_precision,citations/ais_precision,citations/n_sentences,citations/n_sentences,citations/n_total_citations,citations/n_total_citations,citations/n_correct_citations,...,correctness/answer_overlap,correctness/answer_entail,correctness/answer_entail,correctness/citations_recall,correctness/citations_recall,correctness/citations_precision,correctness/citations_precision,quality/answer_relevance,quality/answer_relevance,n_questions
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,mean,std,mean,std,mean,std,mean,std,mean,...,std,mean,std,mean,std,mean,std,mean,std,Unnamed: 25_level_1
llm,temperature,nli,ellm,sim,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2
mixtral-8x7b-instruct-v0.1.Q8_0,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.62231,0.075736,0.901109,0.037999,2.654258,0.295822,3.021252,0.412071,2.755955,...,0.019962,0.875624,0.022234,0.844066,0.040573,0.798038,0.049704,0.724411,0.038496,4674
qwen1_5-110b-chat,0.1,t5_xxl_true_nli_mixture,Mistral-7B-Instruct-v0.2,all-MiniLM-L6-v2,1,0.56492,0.11713,0.96412,0.030177,1.936333,0.256917,1.789667,0.213576,1.687333,...,0.022317,0.874333,0.025403,0.737167,0.053828,0.909575,0.036772,0.725493,0.043379,1000
