In [4]:
from pathlib import Path
# Directory that the evaluation cells below read their input files from.
evaluations_dir_path = Path("/home/watson_chua/efs/hansard_finetuning/data/evaluations")
# Model identifiers; they are interpolated into the evaluation file names.
models_to_compare = [
    "gpt4_zero_shot",
    "gpt4_one_shot",
    "llama3",
    "gemma2",
]

In [2]:
# Every unordered pair of models (in list order), except pairs in which both
# names start with "gpt4".
pairwise_combination = [
    (first_model, second_model)
    for position, first_model in enumerate(models_to_compare)
    for second_model in models_to_compare[position + 1:]
    if not (first_model.startswith("gpt4") and second_model.startswith("gpt4"))
]
pairwise_combination

[('gpt4_zero_shot', 'llama3'),
 ('gpt4_zero_shot', 'gemma2'),
 ('gpt4_one_shot', 'llama3'),
 ('gpt4_one_shot', 'gemma2'),
 ('llama3', 'gemma2')]

## Convert Evaluation String to JSON

In [6]:
import sys
sys.path.append("../")
from llm_utils.post_processing import clean_json_output

In [3]:
import json
import pandas as pd


def convert_evaluation_to_df(input_path):
    """Load a JSONL evaluation file into a DataFrame of cleaned evaluations.

    Every non-blank line of the file is parsed as a JSON object. The value
    stored under its ``"evaluation"`` key is passed through
    ``clean_json_output`` and the results become the rows of the returned
    frame, in file order.

    Parameters
    ----------
    input_path : pathlib.Path
        Location of the ``.jsonl`` file; must expose ``.open``.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line, built from whatever ``clean_json_output``
        returns for that line's ``"evaluation"`` value.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    KeyError
        If a parsed record has no ``"evaluation"`` key.
    """
    # Explicit UTF-8 so decoding does not depend on the machine's locale.
    with input_path.open("r", encoding="utf-8") as f:
        # Iterate the handle directly and skip whitespace-only lines: a stray
        # blank line (common at the end of JSONL files) would otherwise make
        # json.loads raise and abort the whole load.
        records = [json.loads(line) for line in f if line.strip()]
    # NOTE(review): clean_json_output is presumably turning the raw evaluation
    # string into a dict -- confirm against llm_utils.post_processing.
    df_eval = pd.DataFrame([clean_json_output(record["evaluation"]) for record in records])
    return df_eval

## Gemini Preference Eval

In [9]:
# For each model pair, load its evaluation file and print the tally of the
# "winner" column.
for model_1, model_2 in pairwise_combination:
    evaluation_file = evaluations_dir_path / f"{model_1}_{model_2}_evaluation.jsonl"
    df_eval = convert_evaluation_to_df(evaluation_file)
    print(f"{model_1} vs. {model_2}")
    winner_counts = df_eval["winner"].value_counts()
    print(winner_counts)

gpt4_zero_shot vs. llama3
winner
b    212
a     92
Name: count, dtype: int64
gpt4_zero_shot vs. gemma2
winner
b    221
a     83
Name: count, dtype: int64
gpt4_one_shot vs. llama3
winner
b    192
a    112
Name: count, dtype: int64
gpt4_one_shot vs. gemma2
winner
b    205
a     99
Name: count, dtype: int64
llama3 vs. gemma2
winner
b    188
a    116
Name: count, dtype: int64


## Compare Ragas

In [52]:
output_root_dir = "/home/watson_chua/efs/axolotl/data/evaluations/hansard/"


def compare_ragas(subdir, llm1):
    """Print the mean of the numeric columns of one ``*_ragas_claude.csv`` file.

    The file read is ``output_root_dir + subdir + "/" + "<llm1>_ragas_claude.csv"``.
    Rows with a missing value in any of ``faithfulness``, ``answer_relevancy``
    or ``answer_correctness`` are dropped before averaging.

    Parameters
    ----------
    subdir : str
        Sub-directory name appended to ``output_root_dir``.
    llm1 : str
        Model name used as the prefix of the CSV file name.

    Returns
    -------
    None
        The means are printed, not returned.
    """
    csv_name = "{llm1}_ragas_claude.csv".format(llm1=llm1)
    # output_root_dir already ends with "/", so only one separator is added.
    csv_location = "".join([output_root_dir, subdir, "/", csv_name])
    ragas_scores = pd.read_csv(csv_location)
    required_metrics = ["faithfulness", "answer_relevancy", "answer_correctness"]
    complete_rows = ragas_scores.dropna(how="any", axis=0, subset=required_metrics)
    print(complete_rows.mean(numeric_only=True))

In [11]:
# For each model, print the mean of the numeric columns in its RAGAS CSV,
# using only rows where all four listed metrics are present.
ragas_metric_columns = ["faithfulness", "answer_relevancy", "answer_similarity", "answer_correctness"]
for model_1 in models_to_compare:
    df_ragas = pd.read_csv(evaluations_dir_path / f"{model_1}_ragas.csv")
    print(model_1)
    complete_rows = df_ragas.dropna(how="any", axis=0, subset=ragas_metric_columns)
    print(complete_rows.mean(numeric_only=True))


gpt4_zero_shot
faithfulness          0.855116
answer_relevancy      0.662348
answer_similarity     0.785876
answer_correctness    0.653498
dtype: float64
gpt4_one_shot
faithfulness          0.840792
answer_relevancy      0.669053
answer_similarity     0.784777
answer_correctness    0.663694
dtype: float64
llama3
faithfulness          0.862831
answer_relevancy      0.616783
answer_similarity     0.807248
answer_correctness    0.783219
dtype: float64
gemma2
faithfulness          0.860389
answer_relevancy      0.609027
answer_similarity     0.801928
answer_correctness    0.782969
dtype: float64
