<h1 align="center">
  <a href="https://www.nlga.niedersachsen.de/startseite">
    <img width="300" src="https://www.nlga.niedersachsen.de/assets/image/246974" alt="NLGA">
  </a>
</h1>

## Final visualization of the comparison between different LLM providers

**Overview**: In this notebook, we will load the results of the experiments conducted on the NLGA dataset and visualize the comparison between the responses generated by the several LLMs. We will also display the average scores of the evaluation metrics for each model.

### Install Dependencies

In [20]:
# %pip install polars

In [21]:
import os
from dotenv import load_dotenv
# import polars as pl 
import pandas as pd

In [22]:
load_dotenv()

CONFIG = {
    "RESULTS_DIR": "./results/",
}

file_names = [
    'gpt_35_vs_luminous_supreme_experiment.jsonl',
    # 'llm_AA_vs_openai_experiment_small.jsonl',
    'nous_hermes_2_mixtral_8x7b_dpo_vs_luminous_supreme_experiment.jsonl',
    'mixtral_8x7b_instruct_v0.1_vs_luminous_supreme_experiment.jsonl',
    'meta_llama_3_8b_instruct_vs_luminous_supreme_experiment.jsonl',
    'gpt_4_vs_luminous_supreme_experiment.jsonl',
    
]

In [23]:
def load_and_merge_jsonl(files, results_dir):
    dataframes = []
    
    for file_name in files:
        file_path = os.path.join(results_dir, file_name)
        df = pl.read_ndjson(file_path)
        dataframes.append(df)
    
    if dataframes:
        print(f"Loaded {len(dataframes)} files. Merging...")
        combined_df = pl.concat(dataframes, how="diagonal")
        return combined_df
    else:
        print("No files loaded. Returning empty DataFrame.")
        return pl.DataFrame()


combined_df = load_and_merge_jsonl(file_names, CONFIG['RESULTS_DIR'])
combined_df

Loaded 5 files. Merging...


score_response_relevance_model_gpt-35-turbo-16k-deployment,score_response_match_model_luminous-supreme-control,score_response_match_recall_model_luminous-supreme-control,score_factual_accuracy_model_luminous-supreme-control,response_model_gpt-35-turbo-16k-deployment,explanation_response_conciseness_model_gpt-35-turbo-16k-deployment,score_Strict_Context_adherence_model_luminous-supreme-control,score_valid_response_model_gpt-35-turbo-16k-deployment,explanation_response_matching_model_luminous-supreme-control,explanation_response_completeness_wrt_context_model_luminous-supreme-control,score_response_match_precision_model_luminous-supreme-control,score_Strict_Context_adherence_model_gpt-35-turbo-16k-deployment,explanation_Strict_Context_adherence_model_luminous-supreme-control,explanation_factual_accuracy_model_gpt-35-turbo-16k-deployment,ground_truth_model_gpt-35-turbo-16k-deployment,explanation_valid_response_model_gpt-35-turbo-16k-deployment,explanation_response_completeness_wrt_context_model_gpt-35-turbo-16k-deployment,score_response_conciseness_model_gpt-35-turbo-16k-deployment,ground_truth_model_luminous-supreme-control,response_model_luminous-supreme-control,explanation_response_matching_model_gpt-35-turbo-16k-deployment,score_response_relevance_model_luminous-supreme-control,score_response_match_precision_model_gpt-35-turbo-16k-deployment,context_model_gpt-35-turbo-16k-deployment,score_response_conciseness_model_luminous-supreme-control,score_response_match_model_gpt-35-turbo-16k-deployment,question,explanation_Strict_Context_adherence_model_gpt-35-turbo-16k-deployment,explanation_valid_response_model_luminous-supreme-control,score_response_match_recall_model_gpt-35-turbo-16k-deployment,score_response_completeness_wrt_context_model_luminous-supreme-control,context_model_luminous-supreme-control,score_factual_accuracy_model_gpt-35-turbo-16k-deployment,explanation_factual_accuracy_model_luminous-supreme-control,explanation_response_relevance_model_gpt-35-turbo-16k-deployment,explanation_response_relevance_model_luminous-supreme-control,score_response_completeness_wrt_context_model_gpt-35-turbo-16k-deployment,…,response_model_meta/meta-llama-3-8b-instruct,score_factual_accuracy_model_meta/meta-llama-3-8b-instruct,score_Strict_Context_adherence_model_meta/meta-llama-3-8b-instruct,explanation_response_relevance_model_meta/meta-llama-3-8b-instruct,ground_truth_model_meta/meta-llama-3-8b-instruct,score_response_match_precision_model_meta/meta-llama-3-8b-instruct,score_response_relevance_model_meta/meta-llama-3-8b-instruct,score_response_completeness_wrt_context_model_meta/meta-llama-3-8b-instruct,explanation_Strict_Context_adherence_model_meta/meta-llama-3-8b-instruct,explanation_factual_accuracy_model_meta/meta-llama-3-8b-instruct,score_response_match_recall_model_meta/meta-llama-3-8b-instruct,explanation_valid_response_model_meta/meta-llama-3-8b-instruct,score_response_conciseness_model_meta/meta-llama-3-8b-instruct,explanation_response_conciseness_model_meta/meta-llama-3-8b-instruct,score_valid_response_model_meta/meta-llama-3-8b-instruct,explanation_response_matching_model_meta/meta-llama-3-8b-instruct,context_model_meta/meta-llama-3-8b-instruct,explanation_response_completeness_wrt_context_model_meta/meta-llama-3-8b-instruct,explanation_factual_accuracy_model_gpt4,score_valid_response_model_gpt4,explanation_response_matching_model_gpt4,score_factual_accuracy_model_gpt4,score_response_conciseness_model_gpt4,explanation_valid_response_model_gpt4,response_model_gpt4,score_response_completeness_wrt_context_model_gpt4,ground_truth_model_gpt4,score_response_match_recall_model_gpt4,score_response_relevance_model_gpt4,explanation_response_relevance_model_gpt4,score_Strict_Context_adherence_model_gpt4,context_model_gpt4,score_response_match_precision_model_gpt4,explanation_response_conciseness_model_gpt4,score_response_match_model_gpt4,explanation_response_completeness_wrt_context_model_gpt4,explanation_Strict_Context_adherence_model_gpt4
f64,f64,f64,f64,str,str,f64,f64,str,str,f64,f64,str,str,str,str,str,f64,str,str,str,f64,f64,str,f64,f64,str,str,str,f64,f64,str,f64,str,str,str,f64,…,str,f64,f64,str,str,f64,f64,f64,str,str,f64,str,f64,str,f64,str,str,str,str,f64,str,f64,f64,str,str,f64,str,f64,f64,str,f64,str,f64,str,f64,str,str
1.0,0.0,0.0,,"""Um sich vor Legionellen zu schützen, sollten Maßn…","""{  ""Reasoning"": ""The response provides relevan…",1.0,1.0,"""Information Recall: 0.0{  ""Result"": [  …","""{  ""Reasoning"": ""The context provides detailed…",,1.0,"""{  ""Reasoning"": ""The response strictly adheres…","""{  ""Result"": [  {  ""Fact"": ""…","""Kaltes Wasser muss kalt sein, d.h. < 20 °C Warm…","""{  ""Reasoning"": ""The response provides detaile…","""{  ""Reasoning"": ""The response includes all the…",1.0,"""Kaltes Wasser muss kalt sein, d.h. < 20 °C Warm…","""'Ihre Anfrage kann nicht mit den bereitgestellten…","""Information Recall: 0.5{  ""Result"": [  …",0.0,0.3,"""[Doc Nr. 1] Legionellen Vorkommen und Vermeidung …",0.5,0.428571,"""Wie kann ich mich vor Legionellen schützen?""","""{  ""Reasoning"": ""The response strictly adheres…","""{  ""Reasoning"": ""The response is asking for mo…",0.5,0.0,"""[Doc Nr. 1] Legionellen Vorkommen und Vermeidung …",0.9,"""{  ""Result"": [] }""","""Response Precision: 1.0{  ""Reasoning"": ""The re…","""Response Precision: 0.5{  ""Reasoning"": ""The re…",1.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1.0,0.672,0.7,0.6,"""Der Name Legionellen stammt von einem Vorfall im …","""{  ""Reasoning"": ""The response directly answers…",1.0,1.0,"""Information Recall: 0.7{  ""Result"": [  …","""{  ""Reasoning"": ""The response correctly identi…",0.6,0.0,"""{  ""Reasoning"": ""The response provides informa…","""{  ""Result"": [  {  ""Fact"": ""…","""Im Juni 1976 erkrankten 221 Teilnehmer eines Tref…","""{  ""Reasoning"": ""The response 'Der Name Legion…","""{  ""Reasoning"": ""The response correctly identi…",1.0,"""Im Juni 1976 erkrankten 221 Teilnehmer eines Tref…","""Der Name Legionellen leitet sich von der Legionel…","""Information Recall: 0.625{  ""Result"": [  …",1.0,1.0,"""[Doc Nr. 1] Jedoch sind kranke und ältere Mensche…",1.0,0.689655,"""Warum heißen Legionellen so?""","""{  ""Reasoning"": ""The response provides informa…","""{  ""Reasoning"": ""The response 'Der Name Legion…",0.625,1.0,"""[Doc Nr. 1] Jedoch sind kranke und ältere Mensche…",1.0,"""{  ""Result"": [  {  ""Fact"": ""…","""Response Precision: 1.0{  ""Reasoning"": ""The re…","""Response Precision: 1.0{  ""Reasoning"": ""The re…",1.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1.0,0.756757,0.7,1.0,"""Die Anzeichen für Borreliose können sein: eine mi…","""{  ""Reasoning"": ""The response accurately provi…",1.0,1.0,"""Information Recall: 0.7{  ""Result"": [  …","""{  ""Reasoning"": ""The response accurately captu…",1.0,1.0,"""{  ""Reasoning"": ""The response strictly adheres…","""{  ""Result"": [  {  ""Fact"": ""…","""Wanderröte in etwa 50 % der Fälle: nach einigen…","""{  ""Reasoning"": ""The response provides a list …","""{  ""Reasoning"": ""The response accurately captu…",1.0,"""Wanderröte in etwa 50 % der Fälle: nach einigen…","""Die Anzeichen für Lyme-Borreliose können sich in …","""Information Recall: 0.2{  ""Result"": [  …",1.0,0.8,"""[Doc Nr. 1] Risiko Zeckenstich Borreliose Was sin…",1.0,0.246154,"""Was sind Anzeichen für Borreliose?""","""{  ""Reasoning"": ""The response strictly adheres…","""{  ""Reasoning"": ""The response provides detaile…",0.2,1.0,"""[Doc Nr. 1] Risiko Zeckenstich Borreliose Was sin…",1.0,"""{  ""Result"": [  {  ""Fact"": ""…","""Response Precision: 1.0{  ""Reasoning"": ""The re…","""Response Precision: 1.0{  ""Reasoning"": ""The re…",1.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1.0,0.727273,1.0,0.8,"""Nein, Lyme-Krankheit und Borreliose sind nicht da…","""{  ""Reasoning"": ""The response accurately answe…",1.0,1.0,"""Information Recall: 1.0{  ""Result"": [  …","""{  ""Reasoning"": ""The response correctly identi…",0.4,1.0,"""{  ""Reasoning"": ""The response strictly adheres…","""{  ""Result"": [  {  ""Fact"": ""…","""Ja. (die durch Bakterien verursachte Borreliose (…","""{  ""Reasoning"": ""The response 'Nein, Lyme-Kran…","""{  ""Reasoning"": ""The response is incorrect. Th…",1.0,"""Ja. (die durch Bakterien verursachte Borreliose (…","""Die Lyme Disease und die Borreliosis sind das sel…","""Information Recall: 0.25{  ""Result"": [  …",1.0,0.25,"""[Doc Nr. 1] Dieses Erkrankungsbild besteht in 15 …",1.0,0.25,"""Sind die Lyme-Krankheit und Borreliose das selbe?…","""{  ""Reasoning"": ""The response strictly adheres…","""{  ""Reasoning"": ""The response provides informa…",0.25,1.0,"""[Doc Nr. 1] Dieses Erkrankungsbild besteht in 15 …",0.75,"""{  ""Result"": [  {  ""Fact"": ""…","""Response Precision: 1.0{  ""Reasoning"": ""The re…","""Response Precision: 1.0{  ""Reasoning"": ""The re…",0.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1.0,0.444444,0.5,1.0,"""Nein, die schnelle Entfernung einer Zecke bietet …","""{  ""Reasoning"": ""The response directly answers…",1.0,1.0,"""Information Recall: 0.5{  ""Result"": [  …","""{  ""Reasoning"": ""The response correctly incorp…",0.333333,1.0,"""{  ""Reasoning"": ""The response strictly adheres…","""{  ""Result"": [  {  ""Fact"": ""…","""Nein/kaum. Im Gegensatz zu der Infektion mit Borr…","""{  ""Reasoning"": ""The response 'Nein, die schne…","""{  ""Reasoning"": ""The response correctly incorp…",1.0,"""Nein/kaum. Im Gegensatz zu der Infektion mit Borr…","""Die schnelle Entfernung einer angehefteten Zecke …","""Information Recall: 0.5{  ""Result"": [  …",1.0,0.333333,"""[Doc Nr. 1] Risiko Zeckenstich FSME Schutz vor FS…",1.0,0.444444,"""Schützt die schnelle Entfernung einer Zecke vor F…","""{  ""Reasoning"": ""The response strictly adheres…","""{  ""Reasoning"": ""The response provides informa…",0.5,1.0,"""[Doc Nr. 1] Risiko Zeckenstich FSME Schutz vor FS…",1.0,"""{  ""Result"": [  {  ""Fact"": ""…","""Response Precision: 1.0{  ""Reasoning"": ""The re…","""Response Precision: 1.0{  ""Reasoning"": ""The re…",1.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1.0,0.0,0.0,,"""Ja, die FSME-Erkrankung kann in schweren Fällen t…","""{  ""Reasoning"": ""The response provides relevan…",1.0,1.0,"""Information Recall: 0.0{  ""Result"": [  …","""{  ""Reasoning"": ""The response is incorrect bec…",,1.0,"""{  ""Reasoning"": ""The response strictly adheres…","""{  ""Result"": [  {  ""Fact"": ""…","""Bei einem großen Teil der Infizierten (> 70%) und…","""{  ""Reasoning"": ""The response 'Ja, die FSME-Er…","""{  ""Reasoning"": ""The response correctly incorp…",1.0,"""Bei einem großen Teil der Infizierten (> 70%) und…","""Ihre Anfrage kann nicht mit dem bereitgestellten …","""Information Recall: 0.5{  ""Result"": [  …",0.0,0.4,"""[Doc Nr. 1] www.nlga.niedersachsen.de FSME-Erkran…",0.5,0.470588,"""Ist FSME tödlich?""","""{  ""Reasoning"": ""The response strictly adheres…","""{  ""Reasoning"": ""The response is asking for mo…",0.5,0.5,"""[Doc Nr. 1] www.nlga.niedersachsen.de FSME-Erkran…",1.0,"""{  ""Result"": [] }""","""Response Precision: 1.0{  ""Reasoning"": ""The re…","""Response Precision: 0.5{  ""Reasoning"": ""The re…",1.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1.0,0.842105,0.8,1.0,"""Eine Bescheinigung über einen durchgeführten Coro…","""{  ""Reasoning"": ""The response accurately lists…",1.0,1.0,"""Information Recall: 0.8{  ""Result"": [  …","""{  ""Reasoning"": ""The response correctly includ…",1.0,1.0,"""{  ""Reasoning"": ""The response strictly adheres…","""{  ""Result"": [  {  ""Fact"": ""…","""Die Bescheinigung muss Name, Vorname, Geburtsdatu…","""{  ""Reasoning"": ""The response provides a list …","""{  ""Reasoning"": ""The response accurately lists…",1.0,"""Die Bescheinigung muss Name, Vorname, Geburtsdatu…","""Die Bescheinigung über einen durchgeführten Coron…","""Information Recall: 1.0{  ""Result"": [  …",1.0,1.0,"""[Doc Nr. 1] (2) 1Die Person, die den Test gemäß A…",1.0,1.0,"""Welche Daten muss eine Bescheinigung über einen d…","""{  ""Reasoning"": ""The response strictly adheres…","""{  ""Reasoning"": ""The response provides detaile…",1.0,1.0,"""[Doc Nr. 1] (2) 1Die Person, die den Test gemäß A…",1.0,"""{  ""Result"": [  {  ""Fact"": ""…","""Response Precision: 1.0{  ""Reasoning"": ""The re…","""Response Precision: 1.0{  ""Reasoning"": ""The re…",1.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1.0,0.0,0.0,,"""Ja, als jemand im Alter von 80 Jahren gehören Sie…","""{  ""Reasoning"": ""The response directly answers…",1.0,1.0,"""Information Recall: 0.0{  ""Result"": [  …","""{  ""Reasoning"": ""The response does not incorpo…",,1.0,"""{  ""Reasoning"": ""The response adheres to the g…","""{  ""Result"": [  {  ""Fact"": ""…","""Ja. Personen im Alter von ? 80 Jahren gehören zum…","""{  ""Reasoning"": ""The response 'Ja, als jemand …","""{  ""Reasoning"": ""The response correctly identi…",1.0,"""Ja. Personen im Alter von ? 80 Jahren gehören zum…","""Ihre Anfrage kann aufgrund fehlender Informatione…","""Information Recall: 1.0{  ""Result"": [  …",0.0,0.25,"""[Doc Nr. 1] 3 1. ALLGEMEINE FRAGEN ZUR IMPFUN…",0.5,0.571429,"""Ich bin 80. Kann ich demnächst geimpft werden?""","""{  ""Reasoning"": ""The response strictly adheres…","""{  ""Reasoning"": ""The response is asking for mo…",1.0,0.0,"""[Doc Nr. 1] 3 1. ALLGEMEINE FRAGEN ZUR IMPFUN…",0.8,"""{  ""Result"": [] }""","""Response Precision: 1.0{  ""Reasoning"": ""The re…","""Response Precision: 0.5{  ""Reasoning"": ""The re…",1.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1.0,0.0,0.0,,"""Ja, Sie können sich auf Borkum gegen Corona impfe…","""{  ""Reasoning"": ""The response directly answers…",1.0,1.0,"""Information Recall: 0.0{  ""Result"": [  …","""{  ""Reasoning"": ""The context provided does not…",,1.0,"""{  ""Reasoning"": ""The response adheres to the g…","""{  ""Result"": [  {  ""Fact"": ""…","""Nein. Inselbewohnerinnen und -bewohner müssen sic…","""{  ""Reasoning"": ""The response 'Ja, Sie können …","""{  ""Reasoning"": ""The context does not provide …",1.0,"""Nein. Inselbewohnerinnen und -bewohner müssen sic…","""Ihre Anfrage kann aufgrund fehlender Informatione…","""Information Recall: 0.16666666666666666{  ""Res…",0.0,0.0,"""[Doc Nr. 1] 3 1. ALLGEMEINE FRAGEN ZUR IMPFUN…",1.0,0.0,"""Kann ich mich auf Borkum gegen Corona impfen lass…","""{  ""Reasoning"": ""The response strictly adheres…","""{  ""Reasoning"": ""The response is not providing…",0.166667,0.0,"""[Doc Nr. 1] 3 1. ALLGEMEINE FRAGEN ZUR IMPFUN…",0.0,"""{  ""Result"": [] }""","""Response Precision: 1.0{  ""Reasoning"": ""The re…","""Response Precision: 1.0{  ""Reasoning"": ""The re…",0.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1.0,0.0,0.0,,"""Für weitere Informationen zur Corona-Impfung könn…","""{  ""Reasoning"": ""The response directly address…",0.0,1.0,"""Information Recall: 0.0{  ""Result"": [  …","""{  ""Reasoning"": ""The context provides multiple…",,1.0,"""{  ""Reasoning"": ""The response is not adhering …","""{  ""Result"": [  {  ""Fact"": ""…","""Informationen zur Impfung können vorab z.B. unter…","""{  ""Reasoning"": ""The response provides a list …","""{  ""Reasoning"": ""The response correctly identi…",1.0,"""Informationen zur Impfung können vorab z.B. unter…","""Ihre Anfrage kann aufgrund fehlender Informatione…","""Information Recall: 0.7{  ""Result"": [  …",0.0,1.0,"""[Doc Nr. 1] 9 6. KOMMUNIKATION UND INFORMATION…",1.0,0.756757,"""Wo erhalte ich weitere Informationen zur Corona-I…","""{  ""Reasoning"": ""The response provides informa…","""{  ""Reasoning"": ""The response 'Ihre Anfrage ka…",0.7,0.0,"""[Doc Nr. 1] 9 6. KOMMUNIKATION UND INFORMATION…",1.0,"""{  ""Result"": [] }""","""Response Precision: 1.0{  ""Reasoning"": ""The re…","""Response Precision: 1.0{  ""Reasoning"": ""The re…",1.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
# combined_df.select(pl.col("question"), pl.col("response_model_gpt-35-turbo-16k-deployment"), pl.col("explanation_response_matching_model_luminous-supreme-control"))

In [26]:
def display_average_scores(df):
    score_columns = [col for col in df.columns if 'score' in col]
    data_for_table = []
    
    for column in score_columns:
        average = df[column].drop_nans().mean()
        parts = column.split('_model_')
        
        if len(parts) != 2:
            print(f"Warning: '{column}' does not follow the expected naming convention.")
            continue
        
        metric_name = parts[0].replace('score_', '').replace('_', ' ').capitalize()
        model_name = parts[1]
        
        data_for_table.append({
            "Model": model_name,
            "Metric": metric_name,
            "Average Score": average
        })
    
    results_table = pl.DataFrame(data_for_table)
    return results_table

In [27]:
pl.Config.set_tbl_rows(32)
pl.Config.set_fmt_str_lengths(50)
display_average_scores(combined_df)

Model,Metric,Average Score
str,str,f64
"""gpt-35-turbo-16k-deployment""","""Response relevance""",0.733333
"""luminous-supreme-control""","""Response match""",0.234748
"""luminous-supreme-control""","""Response match recall""",0.27524
"""luminous-supreme-control""","""Factual accuracy""",0.545321
"""luminous-supreme-control""","""Strict context adherence""",0.931034
"""gpt-35-turbo-16k-deployment""","""Valid response""",0.8
"""luminous-supreme-control""","""Response match precision""",0.390946
"""gpt-35-turbo-16k-deployment""","""Strict context adherence""",0.914286
"""gpt-35-turbo-16k-deployment""","""Response conciseness""",0.914286
"""luminous-supreme-control""","""Response relevance""",0.539048


### Aggregated Metrics by Model

In [30]:
import polars as pl
import os

def aggregate_metrics_by_model(df):
    # Filter score columns directly when melting, no pre-filtering required
    score_columns = [col for col in df.columns if 'score' in col]
    model_scores = df.melt(id_vars=["question"], value_vars=score_columns)
    
    # Parse out the metric and model from the column names
    model_scores = model_scores.with_columns([
        pl.col("variable").str.split("_model_").alias("parts"),
        pl.col("parts")[1].alias("model"),
        pl.col("parts")[0].str.replace("score_", "", regex=False).alias("metric")
    ]).drop("parts")
    
    # Calculate the average scores by model and metric
    aggregated = model_scores.groupby(["model", "metric"]).agg([
        pl.col("value").mean().alias("Average Score")
    ])
    
    return aggregated


aggregated_by_model = aggregate_metrics_by_model(combined_df)
print(aggregated_by_model)

TypeError: 'Expr' object is not subscriptable

### Aggregated Metrics by Metric Across Models

In [31]:
def aggregate_metrics_by_metric(df):
    score_columns = [col for col in df.columns if 'score' in col]
    metric_scores = df.melt(id_vars=["question"], value_vars=score_columns)
    
    # Parse out the metric and model from the column names
    metric_scores = metric_scores.with_columns([
        pl.col("variable").str.split("_model_").alias("parts"),
        pl.col("parts")[1].alias("model"),
        pl.col("parts")[0].str.replace("score_", "", regex=False).alias("metric")
    ]).drop("parts")
    
    # Calculate the average scores by metric and model
    aggregated = metric_scores.groupby(["metric", "model"]).agg([
        pl.col("value").mean().alias("Average Score")
    ])
    
    return aggregated


aggregated_by_metric = aggregate_metrics_by_metric(combined_df)

print(aggregated_by_metric)


TypeError: 'Expr' object is not subscriptable