## Evaluation pipeline

Load dataset with queries and answers, and the resulting dataset from inference pipeline.

Run all to compute ROUGE and RAGAs evaluation metrics with summarized tables.

In [2]:
import pandas as pd
import seaborn as sns
import os

#Load dataset with queries and ground truth answers
df_vragen = pd.read_excel('Vragen HAFIR.xlsx')

#Load dataset from inference pipeline result
df_llm_response = pd.read_pickle("full_dataframe_pickle")

In [None]:
#Add query and answer column into inference dataset
df_vragen = df_vragen.rename(columns={"Vraag": "query"})
df_merged = pd.merge(df_llm_response, df_vragen[["query","Antwoord"]], on='query', how='left')


### ROUGE evaluation

In [6]:
#Remap column names

df = df_merged
#Define mappings for column renaming
embedding_model_map = {
    "custom-vectorizer-384": "multilingual-e5-small",
    "openai-vectorizer-text-embedding-3-large": "text-embedding-3-large",
    "openai-vectorizer-text-embedding-ada-002": "text-embedding-ada-002"
}

search_algorithm_map = {
    "exhaustiveknn": "KNN",
    "hnsw": "HNSW"
}

#Create new columns for clarity of the techniques
df['embedding_model'] = df['idx_name'].apply(
    lambda x: next((val for key, val in embedding_model_map.items() if key in x), None)
)
df['search_algorithm'] = df['idx_name'].apply(
    lambda x: next((val for key, val in search_algorithm_map.items() if key in x), None)
)
df['agentic_retrieval'] = df['method_name'].apply(
    lambda x: "Agentic" in x
)
df['reranking'] = df['method_name'].apply(
    lambda x: "with reranking" in x
)

df_merged_rouge = df[['deployment_name',
                           'embedding_model',
                           'search_algorithm',
                           'agentic_retrieval',
                           'reranking',
                           'query',
                           'iteration',
                           'response',
                           'Antwoord']]

In [None]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

def rouge_scorer_wrapper(a,b):
    result = scorer.score(a,b)
    rouge1 = result["rouge1"]
    rougeL = result["rougeL"]
    return rouge1.precision, rouge1.recall, rouge1.fmeasure, rougeL.precision, rougeL.recall, rougeL.fmeasure

#Appends ROUGE metric calculations to dataset
df_merged_rouge[['rouge1_precision', 'rouge1_recall', 'rouge1_fmeasure', 
           'rougeL_precision', 'rougeL_recall', 'rougeL_fmeasure']] = df_merged_rouge.apply(
    lambda x: pd.Series(rouge_scorer_wrapper(x.Antwoord, x.response)), 
    axis=1
)

### RAGAs evaluation

In [None]:
from ragas import SingleTurnSample 
from ragas.metrics import ResponseRelevancy
from ragas.dataset_schema import SingleTurnSample 
from ragas.metrics import Faithfulness
from ragas import SingleTurnSample
from ragas.metrics import LLMContextPrecisionWithReference
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import LLMContextRecall
from ragas.embeddings import LangchainEmbeddingsWrapper
import asyncio
import os
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)

In [None]:
import json
from azure.core.credentials import AzureKeyCredential
from azure.identity import DefaultAzureCredential
from azure.identity import get_bearer_token_provider
from openai import AzureOpenAI
from langchain.chat_models import AzureChatOpenAI
from langchain.embeddings import AzureOpenAIEmbeddings

with open("config.json", "r") as f:
    config = json.load(f)

#Set connection to an LLM
AZURE_OPENAI_API_KEY = config["AZURE_OPENAI_API_KEY"]
AZURE_OPENAI_ENDPOINT = config["AZURE_OPENAI_ENDPOINT"]

llm = AzureChatOpenAI(
    deployment_name="gpt-4o",  
    azure_endpoint=AZURE_OPENAI_ENDPOINT, 
    openai_api_key=AZURE_OPENAI_API_KEY, 
    openai_api_version="2024-12-01-preview"
)

In [None]:
#Create RAGAs scorers
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper

evaluator_llm = LangchainLLMWrapper(llm)
scorer_faith = Faithfulness(llm=evaluator_llm)
scorer_context_recall = LLMContextRecall(llm=evaluator_llm)
scorer_context_precision = LLMContextPrecisionWithReference(llm=evaluator_llm)


In [None]:
#List RAGAs scorers to use for each metric
scorers = [scorer_faith, scorer_context_recall, scorer_context_precision]
BATCH_SIZE = 10
SAVE_DIR = "batches"
CONCURRENCY_LIMIT = 10

async def ragas_evaluate_wrapper(user_input, response, reference, retrieved_contexts):
    '''
    Async wrapper for evaluation of a single row
    '''
    sample = SingleTurnSample(
        user_input=user_input,
        response=response,
        reference=reference,
        retrieved_contexts=retrieved_contexts,
    )
    scores = []
    for scorer in scorers:
        try:
            result = await scorer.single_turn_ascore(sample)
            scores.append(result)
        except Exception as e:
            print(f"Error with scorer {scorer.name}: {e}")
            scores.append(None)
    return scores

async def process_batch(batch_df, batch_index):
    '''
    Processes and saves data in batches of pickle files
    '''
    print(f"Processing batch {batch_index}")
    
    tasks = []
    for _, row in batch_df.iterrows():
        #Call the evaluate function for each row

        scores = ragas_evaluate_wrapper(
            user_input=row.query,
            response=row.response,
            reference=row.reference,
            retrieved_contexts=row.retrieved_contexts,
        )

        tasks.append(scores)
    results = await asyncio.gather(*tasks)
    batch_df["faithfulness"] = [r[0] for r in results]
    batch_df["context_recall"] = [r[1] for r in results]
    batch_df["context_precision"] = [r[2] for r in results] 
    

    os.makedirs(SAVE_DIR, exist_ok=True)
    batch_path = os.path.join(SAVE_DIR, f"batch_{batch_index}.pkl")
    batch_df.to_pickle(batch_path)
    print(f"Saved batch {batch_index} to {batch_path}")

    return batch_df

async def limited_process_batch(semaphore, batch_df, batch_index):
    '''
    Counter to limit amount of API calls and concurrent runs
    '''
    async with semaphore:
        await process_batch(batch_df, batch_index)

async def evaluate_dataframe(df):
    '''
    Runs evaluation process and saves data in batches
    '''
    semaphore = asyncio.Semaphore(CONCURRENCY_LIMIT)
    tasks = []
    
    total_rows = len(df)
    num_batches = (total_rows + BATCH_SIZE - 1) // BATCH_SIZE
    
    for i in range(num_batches):
        start = i * BATCH_SIZE
        end = min(start + BATCH_SIZE, total_rows)

        #Skip already processed batch
        batch_path = os.path.join(SAVE_DIR, f"batch_{i}.pkl")
        if os.path.exists(batch_path):
            print(f"Batch {i} already exists. Skipping.")
            continue

        batch_df = df.iloc[start:end].copy()
        tasks.append(limited_process_batch(semaphore, batch_df, i))
    await asyncio.gather(*tasks)


#Rename columns in case they are not matching
df_merged.rename(
    columns={"Antwoord": "reference", "results": "retrieved_contexts"}, inplace=True
)

asyncio.run(evaluate_dataframe(df_merged.copy()))

In [None]:
from os import listdir
from os.path import isfile, join

#Loads batches to form one dataframe
files = [join("batches", f) for f in listdir("batches") if (isfile(join("batches", f)) and ".pkl" in f)]
df_ragas_evaluation = pd.concat((pd.read_pickle(file) for file in files))

### Compute grouped scores

In [189]:
#ROUGE scores grouped by query
df_grouped_query = df_merged_rouge[["query", 'rouge1_fmeasure', 'rouge1_precision', 'rouge1_recall',  
                                    'rougeL_fmeasure', 'rougeL_precision', 'rougeL_recall']].groupby("query").mean().reset_index()

to_merge = df_grouped_query.merge(df_vragen[['query', 'Vraag Nr.']], on='query', how='left')
#Replace query content by query number
df_grouped_query['query'] = to_merge['Vraag Nr.']
df_grouped_query.sort_values(by='rouge1_fmeasure', inplace=True, ascending=False)

df_grouped_query.style.background_gradient(
    subset= ['rouge1_fmeasure', 'rouge1_precision', 'rouge1_recall', 'rougeL_fmeasure', 'rougeL_precision', 'rougeL_recall'],
    cmap=sns.color_palette("crest_r", as_cmap=True)
)


Unnamed: 0,query,rouge1_fmeasure,rouge1_precision,rouge1_recall,rougeL_fmeasure,rougeL_precision,rougeL_recall
20,13,0.435231,0.579873,0.36067,0.191167,0.259084,0.157253
9,2,0.393866,0.552922,0.327532,0.201358,0.287449,0.166467
6,14,0.365844,0.352091,0.447049,0.151216,0.145062,0.186343
3,15,0.365206,0.319856,0.472924,0.192195,0.168085,0.25021
5,20,0.352228,0.280186,0.529417,0.167449,0.132756,0.25381
7,16,0.349436,0.327821,0.410667,0.190304,0.17965,0.223262
2,10,0.347732,0.30964,0.439815,0.156579,0.139393,0.198741
0,8,0.313905,0.241211,0.50477,0.185661,0.141802,0.302048
18,4,0.307826,0.308069,0.37292,0.162363,0.16283,0.197799
21,19,0.287818,0.220038,0.538143,0.155391,0.118718,0.295714


In [10]:
def colour_rows(row):
    #Function to colour rows based on category
    if row['category'] == 'deployment_name':
        return ['background-color: lightblue'] * len(row)
    elif row['category'] == 'embedding_model':
        return ['background-color: lightgreen'] * len(row)
    elif row['category'] == 'search_algorithm':
        return ['background-color: lightyellow'] * len(row)
    elif row['category'] == 'agentic_rerank':
        return ['background-color: lightsalmon'] * len(row)
    else:
        return ['background-color: white'] * len(row)


In [None]:
df_rouge_deploy = df_merged_rouge[["deployment_name", 'rouge1_fmeasure', 'rouge1_precision', 'rouge1_recall',  'rougeL_fmeasure',
           'rougeL_precision', 'rougeL_recall']].groupby(["deployment_name"]).mean().reset_index()
df_rouge_embed = df_merged_rouge[["embedding_model", 'rouge1_fmeasure', 'rouge1_precision', 'rouge1_recall',  'rougeL_fmeasure',
           'rougeL_precision', 'rougeL_recall']].groupby(["embedding_model"]).mean().reset_index()
df_rouge_search = df_merged_rouge[["search_algorithm", 'rouge1_fmeasure', 'rouge1_precision', 'rouge1_recall',  'rougeL_fmeasure',
           'rougeL_precision', 'rougeL_recall']].groupby(["search_algorithm"]).mean().reset_index()
df_rouge_agentic_rerank = df_merged_rouge.groupby(["agentic_retrieval", "reranking"])[
    ['rouge1_fmeasure', 'rouge1_precision', 'rouge1_recall',  'rougeL_fmeasure', 'rougeL_precision', 'rougeL_recall']].mean().reset_index()

In [11]:
#Create table grouped by method category with mean ROUGE metrics
df_rouge_deploy['category'] = 'deployment_name'
df_rouge_embed['category'] = 'embedding_model'
df_rouge_search['category'] = 'search_algorithm'
df_rouge_agentic_rerank['category'] = 'agentic_rerank'

df_rouge_combined = pd.concat(
    [df_rouge_deploy, df_rouge_embed, df_rouge_search, df_rouge_agentic_rerank],
    ignore_index=True
)

df_rouge_combined = df_rouge_combined[["category", "deployment_name", "embedding_model", "search_algorithm", "agentic_retrieval", "reranking", 'rouge1_fmeasure', 'rouge1_precision', 'rouge1_recall',  'rougeL_fmeasure', 'rougeL_precision', 'rougeL_recall']]
df_rouge_combined.style.apply(colour_rows, axis=1)

Unnamed: 0,category,deployment_name,embedding_model,search_algorithm,agentic_retrieval,reranking,rouge1_fmeasure,rouge1_precision,rouge1_recall,rougeL_fmeasure,rougeL_precision,rougeL_recall
0,deployment_name,gpt-4.1,,,,,0.276196,0.217159,0.519177,0.146098,0.112356,0.293037
1,deployment_name,gpt-4o,,,,,0.293892,0.309025,0.361467,0.158682,0.166555,0.202082
2,embedding_model,,multilingual-e5-small,,,,0.285897,0.266347,0.436946,0.153205,0.141724,0.246099
3,embedding_model,,text-embedding-3-large,,,,0.284759,0.256881,0.45013,0.151321,0.134888,0.251457
4,embedding_model,,text-embedding-ada-002,,,,0.284475,0.266048,0.43389,0.152644,0.141756,0.245122
5,search_algorithm,,,HNSW,,,0.284956,0.263359,0.439879,0.152591,0.13995,0.24779
6,search_algorithm,,,KNN,,,0.285131,0.262826,0.440765,0.152189,0.138962,0.247329
7,agentic_rerank,,,,False,False,0.29541,0.277797,0.425401,0.156562,0.14589,0.237219
8,agentic_rerank,,,,True,False,0.280049,0.256844,0.443359,0.150787,0.137132,0.250622
9,agentic_rerank,,,,True,True,0.279673,0.254636,0.452206,0.149822,0.135345,0.254837


In [None]:
df_ragas_deploy = df_ragas_evaluation[["deployment_name", 'aggregated_score', 'faithfulness', 'context_recall',  'context_precision',
           'time']].groupby(["deployment_name"]).mean().reset_index()
df_ragas_embed = df_ragas_evaluation[["embedding_model", 'aggregated_score', 'faithfulness', 'context_recall',  'context_precision',
           'time']].groupby(["embedding_model"]).mean().reset_index()
df_ragas_search = df_ragas_evaluation[["search_algorithm", 'aggregated_score', 'faithfulness', 'context_recall',  'context_precision',
           'time']].groupby(["search_algorithm"]).mean().reset_index()

df_ragas_agentic_rerank = df_ragas_evaluation.groupby(["agentic_retrieval", "reranking"])[
    ['aggregated_score', 'faithfulness', 'context_recall', 'context_precision', 'time']].mean().reset_index()

In [14]:
#Create table grouped by method category with mean RAGAs metrics
df_ragas_deploy['category'] = 'deployment_name'
df_ragas_embed['category'] = 'embedding_model'
df_ragas_search['category'] = 'search_algorithm'
df_ragas_agentic_rerank['category'] = 'agentic_rerank'

df_ragas_combined = pd.concat(
    [df_ragas_deploy, df_ragas_embed, df_ragas_search, df_ragas_agentic_rerank],
    ignore_index=True
)

df_ragas_combined = df_ragas_combined[["category", "deployment_name", "embedding_model", "search_algorithm", "agentic_retrieval", "reranking", "aggregated_score", "faithfulness", "context_recall", "context_precision", "time"]]
df_ragas_combined.style.apply(colour_rows, axis=1)

Unnamed: 0,category,deployment_name,embedding_model,search_algorithm,agentic_retrieval,reranking,aggregated_score,faithfulness,context_recall,context_precision,time
0,deployment_name,gpt-4.1,,,,,0.592527,0.74605,0.485665,0.392342,10.517602
1,deployment_name,gpt-4o,,,,,0.533383,0.653075,0.463907,0.363477,6.079889
2,embedding_model,,multilingual-e5-small,,,,0.561263,0.702051,0.467823,0.373129,7.568498
3,embedding_model,,text-embedding-3-large,,,,0.578668,0.709173,0.484987,0.41134,9.224458
4,embedding_model,,text-embedding-ada-002,,,,0.548934,0.687464,0.471547,0.349261,8.103281
5,search_algorithm,,,HNSW,,,0.560439,0.68921,0.474465,0.38887,8.261583
6,search_algorithm,,,KNN,,,0.565472,0.709915,0.475107,0.366949,8.335908
7,agentic_rerank,,,,False,False,0.583253,0.705735,0.481294,0.440247,4.956362
8,agentic_rerank,,,,True,False,0.543633,0.693489,0.451211,0.336345,9.471422
9,agentic_rerank,,,,True,True,0.561979,0.699464,0.491853,0.357137,10.468453


In [162]:
#ROUGE metrics grouped by model configurations
df_grouped_model = df_merged_rouge[["deployment_name","embedding_model", "search_algorithm", "agentic_retrieval", "reranking", 'rouge1_fmeasure', 'rouge1_precision', 'rouge1_recall',  'rougeL_fmeasure',
           'rougeL_precision', 'rougeL_recall']].groupby(["deployment_name","embedding_model", "search_algorithm", "agentic_retrieval", "reranking"]).mean().reset_index()
df_grouped_model.sort_values(by='rouge1_fmeasure', inplace=True, ascending=False)

df_grouped_model.style.background_gradient(
    subset= ['rouge1_fmeasure', 'rouge1_precision', 'rouge1_recall', 'rougeL_fmeasure', 'rougeL_precision', 'rougeL_recall'],
    cmap=sns.color_palette("crest_r", as_cmap=True)
)

Unnamed: 0,deployment_name,embedding_model,search_algorithm,agentic_retrieval,reranking,rouge1_fmeasure,rouge1_precision,rouge1_recall,rougeL_fmeasure,rougeL_precision,rougeL_recall
21,gpt-4o,multilingual-e5-small,KNN,False,False,0.308185,0.326965,0.356548,0.164253,0.173713,0.194863
18,gpt-4o,multilingual-e5-small,HNSW,False,False,0.306566,0.329769,0.34794,0.164581,0.176458,0.194038
24,gpt-4o,text-embedding-3-large,HNSW,False,False,0.305784,0.319201,0.359165,0.164123,0.170001,0.197861
30,gpt-4o,text-embedding-ada-002,HNSW,False,False,0.304838,0.327744,0.349064,0.168091,0.178681,0.201301
33,gpt-4o,text-embedding-ada-002,KNN,False,False,0.303696,0.32487,0.347762,0.162044,0.174174,0.191712
25,gpt-4o,text-embedding-3-large,HNSW,True,False,0.298374,0.288625,0.395693,0.162778,0.154955,0.224119
27,gpt-4o,text-embedding-3-large,KNN,False,False,0.297344,0.309501,0.361192,0.1598,0.166046,0.200393
20,gpt-4o,multilingual-e5-small,HNSW,True,True,0.29425,0.308524,0.366846,0.158264,0.169134,0.204222
35,gpt-4o,text-embedding-ada-002,KNN,True,True,0.293478,0.305817,0.358879,0.158065,0.163674,0.199639
12,gpt-4.1,text-embedding-ada-002,HNSW,False,False,0.291404,0.238615,0.4843,0.153145,0.12256,0.273465


In [39]:
#Calculate aggregates score and remap columns
df_ragas_evaluation['aggregated_score'] = (2 * final_df['faithfulness'] + final_df['context_recall'] + final_df['context_precision'])/4
df = df_ragas_evaluation

#Define mappings for column renaming
embedding_model_map = {
    "custom-vectorizer-384": "multilingual-e5-small",
    "openai-vectorizer-text-embedding-3-large": "text-embedding-3-large",
    "openai-vectorizer-text-embedding-ada-002": "text-embedding-ada-002"
}

search_algorithm_map = {
    "exhaustiveknn": "KNN",
    "hnsw": "HNSW"
}

#Create new columns for clarity of the techniques
df['embedding_model'] = df['idx_name'].apply(
    lambda x: next((val for key, val in embedding_model_map.items() if key in x), None)
)
df['search_algorithm'] = df['idx_name'].apply(
    lambda x: next((val for key, val in search_algorithm_map.items() if key in x), None)
)
df['agentic_retrieval'] = df['method_name'].apply(
    lambda x: "Agentic" in x
)
df['reranking'] = df['method_name'].apply(
    lambda x: "with reranking" in x
)

df_ragas_evaluation_stats = df[['deployment_name',
                           'embedding_model',
                           'search_algorithm',
                           'agentic_retrieval',
                           'reranking', "faithfulness", "context_recall", "context_precision", "aggregated_score"
                           ]]

rouge_statistics = df_merged_rouge[['deployment_name',
                           'embedding_model',
                           'search_algorithm',
                           'agentic_retrieval',
                           'reranking', "rouge1_precision", "rouge1_recall", "rouge1_fmeasure", "rougeL_precision", "rougeL_recall","rougeL_fmeasure"
                           ]]

#Export pickle for statistical tests
df_ragas_evaluation_stats.to_pickle("rouge_statistics.pkl")
rouge_statistics.to_pickle("rouge_statistics.pkl")


In [248]:
#RAGAs scores grouped by query

final_df_grouped_iteration = df_ragas_evaluation[["deployment_name","idx_name", "method_name", "query", 'faithfulness', 'context_recall', 'context_precision', 'time']].groupby(["deployment_name","idx_name", "method_name", "query"]).mean().reset_index()
final_df_grouped_query = final_df_grouped_iteration.groupby("query")[['faithfulness', 'context_recall', 'context_precision']].mean().reset_index()

merged_df = final_df_grouped_query.merge(df_vragen[['query', 'Vraag Nr.']], on='query', how='left')
#Replace query content by query number
final_df_grouped_query['query'] = merged_df['Vraag Nr.']
final_df_grouped_query.sort_values(by='query', inplace=True)


final_df_grouped_query['aggregated_score'] = (2 * final_df_grouped_query['faithfulness'] + final_df_grouped_query['context_recall'] + final_df_grouped_query['context_precision'])/4

final_df_grouped_query = final_df_grouped_query[['query',
                           'aggregated_score',
                           'faithfulness',
                           'context_recall',
                           'context_precision']]

final_df_grouped_query = final_df_grouped_query.sort_values(by='aggregated_score', ascending=False)



final_df_grouped_query.style.background_gradient(
    subset= ['faithfulness', 'context_recall', 'context_precision', 'aggregated_score'],
    cmap=sns.color_palette("crest_r", as_cmap=True)
)

Unnamed: 0,query,aggregated_score,faithfulness,context_recall,context_precision
20,13,0.879696,0.808407,0.909057,0.992913
12,18,0.787279,0.759618,0.976852,0.65303
6,14,0.71428,0.694136,0.780093,0.688758
1,3,0.712788,0.850009,0.401852,0.74928
0,8,0.702501,0.631876,0.898148,0.648105
18,4,0.700259,0.687951,0.91358,0.511554
16,5,0.68895,0.725984,0.990741,0.313091
3,15,0.665727,0.893901,0.388889,0.486216
5,20,0.624912,0.75061,0.54798,0.450447
7,16,0.550377,0.839835,0.096296,0.425541


In [257]:
#RAGAs metrics grouped by model configurations
final_df_grouped_model = final_df_grouped_iteration.groupby(["deployment_name","idx_name", "method_name"])[['faithfulness', 'context_recall', 'context_precision', 'time']].mean().reset_index()
df = final_df_grouped_model

#Define mappings for column renaming
embedding_model_map = {
    "custom-vectorizer-384": "multilingual-e5-small",
    "openai-vectorizer-text-embedding-3-large": "text-embedding-3-large",
    "openai-vectorizer-text-embedding-ada-002": "text-embedding-ada-002"
}

search_algorithm_map = {
    "exhaustiveknn": "KNN",
    "hnsw": "HNSW"
}

#Create new columns for clarity of the techniques
df['embedding_model'] = df['idx_name'].apply(
    lambda x: next((val for key, val in embedding_model_map.items() if key in x), None)
)
df['search_algorithm'] = df['idx_name'].apply(
    lambda x: next((val for key, val in search_algorithm_map.items() if key in x), None)
)
df['agentic_retrieval'] = df['method_name'].apply(
    lambda x: "Agentic" in x
)
df['reranking'] = df['method_name'].apply(
    lambda x: "with reranking" in x
)

final_df_grouped_model = df[['deployment_name',
                           'embedding_model',
                           'search_algorithm',
                           'agentic_retrieval',
                           'reranking',
                           'faithfulness',
                           'context_recall',
                           'context_precision',
                           'time']]

#Add aggretated score, giving faithfulness a weight of 2
final_df_grouped_model['aggregated_score'] = (2 * final_df_grouped_model['faithfulness'] + final_df_grouped_model['context_recall'] + final_df_grouped_model['context_precision'])/4

#Reorder
final_df_grouped_model = final_df_grouped_model[['deployment_name',
                           'embedding_model',
                           'search_algorithm',
                           'agentic_retrieval',
                           'reranking',
                           'aggregated_score',
                           'faithfulness',
                           'context_recall',
                           'context_precision',
                           'time']]

#Sort by highest aggregated score
final_df_grouped_model = final_df_grouped_model.sort_values(by='aggregated_score', ascending=False)

final_df_grouped_model_styled = (
    final_df_grouped_model.style
    .background_gradient(cmap="crest_r", subset= ['faithfulness', 'context_recall', 'context_precision', 'aggregated_score'])
    .background_gradient(cmap="crest", subset= ["time"])
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df_grouped_model['aggregated_score'] = (2 * final_df_grouped_model['faithfulness'] + final_df_grouped_model['context_recall'] + final_df_grouped_model['context_precision'])/4
