### Datasets preprocessing:

Datasets that need some preprocessing before they can be used in `TruBenchmarkExperiment` class:
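1. Snowflake IT (internal): open question — use both the rephrased and the regular versions? This dataset should be used for all 3 metrics in the triad (answer relevance, context relevance, groundedness)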
2. SummEval (CNN and DailyMail summarizations with annotation) for groundedness
3. QAGS (CNN and DailyMail with Turkers' annotation) for groundedness
4. QAGS (XSUM with Turkers' annotation) for groundedness
5. MSMARCO V2 for context relevance
6. HotPot QA for answer relevance 



In [11]:
import ast
import random

import pandas as pd
from trulens.benchmark.benchmark_frameworks.experiments.dataset_preprocessing import (
    generatate_snowflake_it_golden_set_groundedness,
)
from trulens.benchmark.benchmark_frameworks.experiments.dataset_preprocessing import (
    generate_qags_golden_set_groundedness,
)
from trulens.benchmark.benchmark_frameworks.experiments.dataset_preprocessing import (
    generate_snowflake_it_golden_set_answer_relevance,
)
from trulens.benchmark.benchmark_frameworks.experiments.dataset_preprocessing import (
    generate_snowflake_it_golden_set_context_relevance,
)
from trulens.benchmark.benchmark_frameworks.experiments.dataset_preprocessing import (
    generate_summeval_groundedness_golden_set,
)

# Pin the random seed before any golden set is built, so that any sampling the
# generators may do (not visible here) is reproducible on a fresh kernel.
random.seed(42)


snowflake_it_file_path = "data/snowflake_it_v3.csv"

# --- Snowflake IT: one frame plus a list of ground-truth labels per metric ---
snowflake_it_for_answer_relevance = pd.DataFrame(
    list(
        generate_snowflake_it_golden_set_answer_relevance(
            snowflake_it_file_path
        )
    )
)
snowflake_it_answer_relevance_true_labels = list(
    snowflake_it_for_answer_relevance["expected_score"]
)

snowflake_it_for_context_relevance = pd.DataFrame(
    list(
        generate_snowflake_it_golden_set_context_relevance(
            snowflake_it_file_path
        )
    )
)
snowflake_it_for_context_relevance_true_labels = list(
    snowflake_it_for_context_relevance["expected_score"]
)


snowflake_it_for_groundness = pd.DataFrame(
    list(
        generatate_snowflake_it_golden_set_groundedness(snowflake_it_file_path)
    )
)
snowflake_it_for_groundness_true_labels = list(
    snowflake_it_for_groundness["expected_score"]
)

# --- SummEval (groundedness) ---
# Run the generator exactly once and derive both the labels and the frame from
# the same records. Calling it a second time would parse the file again and,
# if the generator samples, could yield rows that no longer line up with
# `summeval_true_labels`.
summeval_list = list(
    generate_summeval_groundedness_golden_set("data/summeval_test.json")
)

summeval_true_labels = [entry["expected_score"] for entry in summeval_list]

summeval = pd.DataFrame(summeval_list)

# --- QAGS CNN/DailyMail (groundedness) ---
qags_cnn_dm = pd.DataFrame(
    list(generate_qags_golden_set_groundedness("data/qags_mturk_cnndm.jsonl"))
)

# Read the label column directly, as done for the Snowflake IT sets above.
qags_cnn_dm_true_labels = list(qags_cnn_dm["expected_score"])


# --- QAGS XSum (groundedness) ---
qags_xsum = pd.DataFrame(
    list(generate_qags_golden_set_groundedness("data/qags_mturk_xsum.jsonl"))
)

qags_xsum_true_labels = list(qags_xsum["expected_score"])
# The misspelled name is what the experiment definitions further down
# reference, so it is kept as an alias of the correctly spelled list.
qqags_xsum_true_labels = qags_xsum_true_labels

In [None]:
# random.seed(42)

# from datasets import load_dataset

# ds = load_dataset("nixiesearch/ms-marco-hard-negatives")
# ms_marco_hard_neg = pd.DataFrame(ds)


In [13]:
from trulens.benchmark.benchmark_frameworks.experiments.dataset_preprocessing import generate_balanced_ms_marco_hard_negatives_dataset


random.seed(42)
# The two commented-out lines below show how the balanced CSV was produced. They
# need `ms_marco_hard_neg`, which is only created by the commented-out
# `load_dataset` cell above, so by default the saved CSV is loaded instead.
# NOTE(review): the commented `to_csv` call writes to the working directory,
# while `read_csv` below reads from `data/` — move the file if regenerating.
# ms_marco_hard_neg_balanced = generate_balanced_ms_marco_hard_negatives_dataset(ms_marco_hard_neg['train'], 400)
# ms_marco_hard_neg_balanced.to_csv("ms_marco_hard_neg_balanced.csv", index=False)
ms_marco_hard_neg_balanced = pd.read_csv("data/ms_marco_hard_neg_balanced.csv")
# Last expression of the cell: renders the frame as the cell output.
ms_marco_hard_neg_balanced

Unnamed: 0,query,expected_response,expected_score
0,)what was the immediate impact of the success ...,The presence of communication amid scientific ...,1
1,)what was the immediate impact of the success ...,The pivotal engineering and scientific success...,0
2,_________ justice is designed to repair the ha...,The approach is based on a theory of justice t...,1
3,_________ justice is designed to repair the ha...,Retributive justice is a theory of justice whi...,0
4,what color is amber urine,"Colorâurine can be a variety of colors, most...",1
...,...,...,...
395,how much does it cost to replace a foundation ...,the average cost of a foundation repair the av...,0
396,what is glycohemoglobin,test overview a glycohemoglobin test or hemogl...,1
397,what is glycohemoglobin,"The hemoglobin A1c test, also called HbA1c, gl...",0
398,stalactites definition,A stalagmite (UK /ËstaelÉÉ¡maÉªt/ ËstÃ¦lÉÉ...,1


In [14]:
from trulens.benchmark.benchmark_frameworks.experiments.dataset_preprocessing import (
    generate_ms_marco_context_relevance_benchmark,
)

random.seed(42)

# Materialise the benchmark generator once so it can be split by label.
ms_marco = list(generate_ms_marco_context_relevance_benchmark())

# Partition by ground-truth label, keeping the generator's order within each
# class (the order matters for the seeded `random.sample` calls below).
score_1_entries, score_0_entries = [], []
for entry in ms_marco:
    if entry["expected_score"] == 1:
        score_1_entries.append(entry)
    elif entry["expected_score"] == 0:
        score_0_entries.append(entry)

# At most 150 entries per class; fewer if either class is smaller than that.
num_samples_per_group = min(len(score_1_entries), len(score_0_entries), 150)

# Positives are drawn before negatives so the RNG is consumed in a fixed order.
sampled_score_1 = random.sample(score_1_entries, num_samples_per_group)
sampled_score_0 = random.sample(score_0_entries, num_samples_per_group)

# Merge both classes and shuffle so labels are not grouped together.
balanced_sample = [*sampled_score_1, *sampled_score_0]
random.shuffle(balanced_sample)

# Guard: downstream cells (and the variable name) assume exactly 300 rows.
assert len(balanced_sample) == 300

# Report the class balance of the final sample.
positive_count = sum(1 for e in balanced_sample if e["expected_score"] == 1)
negative_count = sum(1 for e in balanced_sample if e["expected_score"] == 0)
print(f"Number of entries with expected_score = 1: {positive_count}")
print(f"Number of entries with expected_score = 0: {negative_count}")

ms_marco_balanced_sample_300 = pd.DataFrame(balanced_sample)

Number of entries with expected_score = 1: 150
Number of entries with expected_score = 0: 150


In [15]:
# Render the balanced MS MARCO V2 sample built in the previous cell.
ms_marco_balanced_sample_300

Unnamed: 0,query_id,query,expected_response,expected_score
0,429239,is there poison ivy in oregon,Poison oak is common to western Oregon and Was...,1
1,632923,what does bbb rating mean for a bond,Definition of Baa3 Rating in the Financial Dic...,0
2,151780,direct material price variance formula,Direct Material Price Variance: = Actual Quant...,1
3,1028902,what is vesd,"VESDA is a laser based smoke detector, which m...",1
4,993874,which of the following planets has the shortes...,Neptune has the longest year( Pluto would be b...,0
...,...,...,...,...
295,1040065,what is the elevation of orlando,Elevation: 106 feet. Land area: 93.5 square mi...,0
296,695081,what is a plateau period,pla•teau. 1. a land area having a relatively l...,1
297,728820,what is cease fire mean,"A ceasefire (or truce), also called cease fire...",1
298,427635,is the tv show lucifer renewed,FOX Renewed TV Series Lucifer for Second Seaso...,1


### Load preprocessed datasets from BEIR - start w/ Hotpot QA 

In [16]:
from trulens.benchmark.benchmark_frameworks.dataset.beir_loader import (
    TruBEIRDataLoader,
)

# Load HotpotQA through the BEIR loader into a DataFrame; later cells read its
# `query`, `expected_response` and `expected_chunks` columns.
# NOTE(review): `download=True` presumably fetches the dataset into
# `data_folder` ("./") when it is not already present — confirm against
# TruBEIRDataLoader before running offline.
beir_data_loader = TruBEIRDataLoader(data_folder="./", dataset_name="hotpotqa")
hotpotqa = beir_data_loader.load_dataset_to_df(download=True)

In [17]:
random.seed(42)

# Fixed 200-row sample of HotpotQA; `random_state` pins pandas' own sampler,
# while the seed above pins the `random.choice` draws used for negatives.
hotpotqa_raw_subset = hotpotqa.sample(n=200, random_state=42)

subset_queries = hotpotqa_raw_subset["query"]

# (query, response) pairs for the whole subset: the pool for negative answers.
all_responses = list(zip(subset_queries, hotpotqa_raw_subset["expected_response"]))

# (query, chunk text) pairs for the whole subset: the pool for negative contexts.
all_contexts = [
    (query, chunk["text"])
    for query, chunks in zip(subset_queries, hotpotqa_raw_subset["expected_chunks"])
    for chunk in chunks
]

# --- Answer relevance: one positive and one negative record per query ---
answer_relevance_records = []
for query, response in all_responses:
    # Positive example: the query paired with its own expected response.
    answer_relevance_records.append({
        "query": query,
        "expected_response": response,
        "expected_score": 1,
    })

    # Negative example: a response that belongs to a different query.
    responses_of_other_queries = [r for q, r in all_responses if q != query]
    answer_relevance_records.append({
        "query": query,
        "expected_response": random.choice(responses_of_other_queries),
        "expected_score": 0,
    })

# --- Context relevance: every expected chunk plus as many sampled negatives ---
context_relevance_records = []
for query, chunks in zip(subset_queries, hotpotqa_raw_subset["expected_chunks"]):
    # Positive examples: the chunks annotated for this query (score should be 1).
    positive_examples = [
        {
            "query": query,
            "expected_response": chunk["text"],
            "expected_score": chunk["expected_score"],
        }
        for chunk in chunks
    ]

    # The candidate pool only depends on the query, so build it once and draw
    # one negative per positive from it.
    contexts_of_other_queries = [c for q, c in all_contexts if q != query]
    negative_examples = [
        {
            "query": query,
            "expected_response": random.choice(contexts_of_other_queries),
            "expected_score": 0,
        }
        for _ in positive_examples
    ]

    # Positives first, then negatives, for each query.
    context_relevance_records.extend(positive_examples)
    context_relevance_records.extend(negative_examples)


# Ground-truth labels, in the same order as the rows of the frames below.
hotpotqa_subset_for_context_relevance_true_labels = [
    record["expected_score"] for record in context_relevance_records
]
hotpotqa_subset_for_answer_relevance_true_labels = [
    record["expected_score"] for record in answer_relevance_records
]

hotpotqa_subset_for_context_relevance = pd.DataFrame(context_relevance_records)

hotpotqa_subset_for_answer_relevance = pd.DataFrame(answer_relevance_records)

In [18]:
# Render the HotpotQA context-relevance set: for each query, its expected
# chunks followed by the same number of chunks sampled from other queries.
hotpotqa_subset_for_context_relevance

Unnamed: 0,query,expected_response,expected_score
0,VIVA Media AG changed it's name in 2004. What ...,"VIVA Media GmbH (until 2004 ""VIVA Media AG"") i...",1
1,VIVA Media AG changed it's name in 2004. What ...,"A Gesellschaft mit beschränkter Haftung (] , a...",1
2,VIVA Media AG changed it's name in 2004. What ...,"Yemoja (Yoruba: ""Yemọja"" ) is a major water de...",0
3,VIVA Media AG changed it's name in 2004. What ...,"Jonathan Monroe ""Jonny"" Craig (born March 26, ...",0
4,Which of Jonny Craig and Pete Doherty has been...,"Jonathan Monroe ""Jonny"" Craig (born March 26, ...",1
...,...,...,...
795,The King who opened Newcastle Civic Center in ...,Walk All Over Me is a Canadian film released i...,0
796,When was the Western Germanic language spoken ...,The Leda is a river in north-western Germany i...,1
797,When was the Western Germanic language spoken ...,Old Frisian is a West Germanic language spoken...,1
798,When was the Western Germanic language spoken ...,Bishop's Stortford Football Club is a football...,0


### Set up feedback LLM providers 

We will experiment with 2 current OpenAI models and a mix of commercial and open source models available in Cortex

In [19]:
import os

import snowflake.connector
from trulens.providers.cortex import Cortex
from trulens.providers.openai import OpenAI

# Snowflake credentials are read from the environment; a missing variable
# raises KeyError here, before any connection attempt is made.
snowflake_connection_parameters = dict(
    account=os.environ["SNOWFLAKE_ACCOUNT"],
    user=os.environ["SNOWFLAKE_USER"],
    password=os.environ["SNOWFLAKE_USER_PASSWORD"],
)
snowflake_connection = snowflake.connector.connect(
    **snowflake_connection_parameters
)

# OpenAI-backed feedback providers.
gpt_4o = OpenAI(model_engine="gpt-4o")
gpt_4o_mini = OpenAI(model_engine="gpt-4o-mini")

# Cortex-backed feedback providers; all three share the one connection above.
snowflake_arctic = Cortex(snowflake_connection, model_engine="snowflake-arctic")
mistral_large = Cortex(snowflake_connection, model_engine="mistral-large")
llama3_1_8b = Cortex(snowflake_connection, model_engine="llama3.1-8b")

# Provider groups consumed by the experiment driver.
CORTEX_PROVIDERS = [snowflake_arctic, llama3_1_8b, mistral_large]
OPENAI_PROVIDERS = [gpt_4o, gpt_4o_mini]
ALL_PROVIDERS = [*CORTEX_PROVIDERS, *OPENAI_PROVIDERS]

In [20]:
from trulens.core import TruSession

# Open the TruLens session (the cell output shows it initialising against
# sqlite:///default.sqlite).
# NOTE(review): `reset_database()` presumably clears previously logged apps and
# records — treat this cell as destructive and confirm before re-running it
# against a database you want to keep.
session = TruSession()
session.reset_database()

🦑 Initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `TruSession` to prevent this.


Updating app_name and app_version in apps table: 0it [00:00, ?it/s]
Updating app_id in records table: 0it [00:00, ?it/s]
Updating app_json in apps table: 0it [00:00, ?it/s]


### Snowflake IT dataset experiment runs:


In [22]:
import math
import time

from trulens.apps.basic import TruBasicApp
from trulens.benchmark.benchmark_frameworks.experiments.dataset_preprocessing import (
    read_results,
)
from trulens.benchmark.benchmark_frameworks.experiments.dataset_preprocessing import (
    write_results,
)


def run_feedback_experiment(
    feedback_func_wrapper, app_name, app_version, dataset_df, true_labels
):
    """Score every row of a dataset with a feedback function and save the results.

    The feedback function is wrapped in a `TruBasicApp` and called once per
    row with the row's `query` and `expected_response` values. Rows whose call
    raises are reported and skipped; rows whose score is NaN are skipped too.
    The remaining scores, their ground-truth labels and per-call latencies are
    written to `results/{app_name}_{app_version}_results.csv`.

    Args:
        feedback_func_wrapper: callable taking two positional arguments and
            returning a numeric score.
        app_name: app name passed to `TruBasicApp`; also part of the file name.
        app_version: app version passed to `TruBasicApp`; also part of the
            file name.
        dataset_df: DataFrame with `query` and `expected_response` columns.
        true_labels: ground-truth labels, positionally aligned with
            `dataset_df`.

    Raises:
        ValueError: if `dataset_df` and `true_labels` differ in length.
    """
    import os  # local import so this cell does not rely on another cell's imports

    if len(dataset_df) != len(true_labels):
        raise ValueError("dataset df must have the same length as labels")

    results_path = f"results/{app_name}_{app_version}_results.csv"
    # Make sure the output directory exists before the (long) scoring loop, so
    # a missing directory cannot cost the whole run at the final write.
    os.makedirs(os.path.dirname(results_path), exist_ok=True)

    tru_wrapped_basic_app = TruBasicApp(
        feedback_func_wrapper, app_name=app_name, app_version=app_version
    )

    generated_scores, labels, latencies = [], [], []
    nan_rows, failed_rows = 0, 0
    for i in range(len(dataset_df)):
        row = dataset_df.iloc[i]
        arg_1 = row["query"]
        arg_2 = row["expected_response"]
        try:
            with tru_wrapped_basic_app as _:
                # perf_counter is monotonic, so the measured latency cannot be
                # distorted by system clock adjustments.
                start_time = time.perf_counter()
                score = tru_wrapped_basic_app.app(arg_1, arg_2)
                end_time = time.perf_counter()

                true_score = true_labels[i]

                if math.isnan(score):
                    # NaN scores carry no signal; leave them out of the results.
                    nan_rows += 1
                else:
                    generated_scores.append(score)
                    labels.append(true_score)
                    latencies.append(end_time - start_time)
        except Exception as e:
            # Best effort: one failing row must not abort the whole experiment.
            failed_rows += 1
            print(
                f"Error {e} in run_feedback_experiment row {i} with first arg {arg_1} and second arg {arg_2}"
            )

    if nan_rows or failed_rows:
        # Make dropped rows visible; they shrink the evaluated sample.
        print(
            f"{app_name} - {app_version}: skipped {nan_rows} NaN and {failed_rows} failed rows out of {len(dataset_df)}"
        )

    write_results(
        generated_scores,
        labels,
        latencies,
        results_path,
    )

In [23]:
import concurrent.futures
import traceback

from trulens.feedback.v2.feedback import Groundedness


def runn_all_experiments_for_provider(provider):
    """
    Run the currently enabled benchmark experiments for one feedback provider.

    Builds provider-bound feedback wrappers (context relevance, answer
    relevance, groundedness), describes each experiment as a dict holding the
    arguments for `run_feedback_experiment`, and executes the enabled group.
    Only the groundedness group is enabled at the moment; the answer- and
    context-relevance lists are still built, so their datasets must exist.

    Args:
        provider: feedback provider exposing `model_engine` and the feedback
            methods called by the wrappers below.
    """

    def context_relevance_binary(input, output) -> float:
        """Binary (0/1) context relevance of `output` for the question `input`."""
        score_and_reasons = provider.context_relevance_with_cot_reasons(
            question=input,
            context=output,
            criteria="A relevant context to the question should get a score of 1, and an irrelevant context should get a score of 0. The score can only be either 0 or 1 (binary).",
            min_score_val=0,
            max_score_val=1,
        )
        return score_and_reasons[0]

    # Legacy TruBenchmarkExperiment-based run, kept for reference:
    # context_relevance_binary_run = create_benchmark_experiment_app(
    #     app_name=f"Snowflake IT  - {provider.model_engine}",
    #     app_version='context_relevance_binary',
    #     benchmark_experiment=TruBenchmarkExperiment(
    #         feedback_fn=context_relevance_binary,
    #         agg_funcs=snowflake_it_metrics,
    #         benchmark_params=benchmark_params
    #     )
    # )

    # with context_relevance_binary_run as recording:
    #     feedback_res = context_relevance_binary_run.app(snowflake_it_for_context_relevance)
    #     print(f'feedback results: {feedback_res}')
    #     write_results(feedback_scores=feedback_res, file_name=f"results/{provider.model_engine}-context_relevance_binary_feedback_scores.csv")

    def answer_relevance_binary(input, output) -> float:
        """Binary (0/1) relevance of the response `output` to the prompt `input`."""
        return provider.relevance(
            prompt=input,
            response=output,
            criteria="A relevant response to the prompt should get a score of 1, and an irrelevant response should get a score of 0. The score can only be either 0 or 1 (binary).",
            min_score_val=0,
            max_score_val=1,
        )

    # Legacy TruBenchmarkExperiment-based run, kept for reference:
    # answer_relevance_binary_run = create_benchmark_experiment_app(
    #     app_name=f"Snowflake IT - {provider.model_engine}",
    #     app_version='answer_relevance_binary',
    #     benchmark_experiment=TruBenchmarkExperiment(
    #         feedback_fn=answer_relevance_binary,
    #         agg_funcs=snowflake_it_metrics,
    #         benchmark_params=benchmark_params
    #     )
    # )

    # with answer_relevance_binary_run as recording:
    #     feedback_res = answer_relevance_binary_run.app(snowflake_it_for_answer_relevance)
    #     write_results(feedback_scores=feedback_res, file_name=f"results/{provider.model_engine}-answer_relevance_binary_feedback_scores.csv")

    def groundedness_binary(input, output) -> float:
        """Binary (0/1) groundedness of the statement `output` in the source `input`.

        Only referenced by the commented-out Snowflake IT groundedness entry.
        """
        score_and_reasons = provider.groundedness_measure_with_cot_reasons(
            source=input,
            statement=output,
            criteria="A grounded response to the query should get a score of 1, and an ungrounded response should get a score of 0. The score can only be either 0 or 1 (binary).",
            min_score_val=0,
            max_score_val=1,
            use_sent_tokenize=True,
        )
        return score_and_reasons[0]

    def groundedness_likert_4(input, output) -> float:
        """Groundedness of `output` in `input` on the provider's default scale."""
        score_and_reasons = provider.groundedness_measure_with_cot_reasons(
            source=input,
            statement=output,
            use_sent_tokenize=True,
        )
        return score_and_reasons[0]

    # Each entry holds the arguments for one `run_feedback_experiment` call.
    context_relevance_experiments = [
        {
            "feedback_fn": context_relevance_binary,
            "app_name": f"Snowflake IT balanced - {provider.model_engine}",
            "app_version": "context_relevance_binary",
            "dataset_df": snowflake_it_for_context_relevance,
            "true_labels": snowflake_it_for_context_relevance_true_labels,
        },
        {
            "feedback_fn": context_relevance_binary,
            "app_name": f"Hotpot QA (800 samples) - {provider.model_engine}",
            "app_version": "context_relevance_binary",
            "dataset_df": hotpotqa_subset_for_context_relevance,
            "true_labels": hotpotqa_subset_for_context_relevance_true_labels,
        },
        {
            "feedback_fn": context_relevance_binary,
            "app_name": f"MS MARCO hard negatives (first 400 samples) - {provider.model_engine}",
            "app_version": "context_relevance_binary",
            "dataset_df": ms_marco_hard_neg_balanced,
            "true_labels": list(ms_marco_hard_neg_balanced["expected_score"]),
        },
        {
            # MS MARCO V2 for context relevance
            "feedback_fn": context_relevance_binary,
            "app_name": f"MS MARCO V2 balanced (300 samples) - {provider.model_engine}",
            "app_version": "context_relevance_binary",
            "dataset_df": ms_marco_balanced_sample_300,
            "true_labels": list(ms_marco_balanced_sample_300["expected_score"]),
        },
    ]

    groundedness_experiments = [
        # Disabled: Snowflake IT with the binary groundedness wrapper.
        # {
        #     "feedback_fn": groundedness_binary,
        #     "app_name": f"Snowflake IT balanced - {provider.model_engine}",
        #     "app_version": "groundedness_binary",
        #     "dataset_df": snowflake_it_for_groundness,
        #     "true_labels": snowflake_it_for_groundness_true_labels,
        # },
        {
            "feedback_fn": groundedness_likert_4,
            "app_name": f"SummEval (bucketed samples) - {provider.model_engine}",
            "app_version": "groundedness_likert4",
            "dataset_df": summeval,
            # Labels are read from the frame itself so they match its rows.
            "true_labels": list(summeval["expected_score"]),
        },
        {
            "feedback_fn": groundedness_likert_4,
            "app_name": f"QAGS CNN_DM (bucketed samples) - {provider.model_engine}",
            "app_version": "groundedness_likert4",
            "dataset_df": qags_cnn_dm,
            "true_labels": qags_cnn_dm_true_labels,
        },
        {
            "feedback_fn": groundedness_likert_4,
            "app_name": f"QAGS XSum (bucketed samples) - {provider.model_engine}",
            "app_version": "groundedness_likert4",
            "dataset_df": qags_xsum,
            "true_labels": qqags_xsum_true_labels,
        },
    ]

    answer_relevance_experiments = [
        {
            "feedback_fn": answer_relevance_binary,
            "app_name": f"Snowflake IT balanced - {provider.model_engine}",
            "app_version": "answer_relevance_binary",
            "dataset_df": snowflake_it_for_answer_relevance,
            "true_labels": snowflake_it_answer_relevance_true_labels,
        },
        {
            "feedback_fn": answer_relevance_binary,
            "app_name": f"Hotpot QA (400 samples) - {provider.model_engine}",
            "app_version": "answer_relevance_binary",
            "dataset_df": hotpotqa_subset_for_answer_relevance,
            "true_labels": hotpotqa_subset_for_answer_relevance_true_labels,
        },
    ]

    # Groups to execute. To run everything, use
    # answer_relevance_experiments + context_relevance_experiments + groundedness_experiments
    enabled_experiments = groundedness_experiments

    for exp in enabled_experiments:
        print(f"Running experiment: {exp['app_name']} - {exp['app_version']}")
        if "groundedness" in exp["app_version"]:
            # Log the prompt the groundedness scores were produced with.
            print(f"Groundedness system prompt: {Groundedness.system_prompt}")

        run_feedback_experiment(
            feedback_func_wrapper=exp["feedback_fn"],
            app_name=exp["app_name"],
            app_version=exp["app_version"],
            dataset_df=exp["dataset_df"],
            true_labels=exp["true_labels"],
        )


with concurrent.futures.ThreadPoolExecutor() as executor:
    # One task per provider. Each future is mapped back to its provider so a
    # failure can be attributed when several providers run at the same time.
    future_to_provider = {
        executor.submit(runn_all_experiments_for_provider, provider): provider
        for provider in ALL_PROVIDERS
    }

    # Handle tasks as they finish; one failing provider does not stop the rest.
    for future in concurrent.futures.as_completed(future_to_provider):
        failed_provider = future_to_provider[future]
        try:
            # `result()` re-raises any exception raised inside the worker.
            future.result()
        except Exception as e:
            traceback.print_exc()
            print(
                f"An error occurred for provider {getattr(failed_provider, 'model_engine', failed_provider)}: {e}"
            )

Running experiment: SummEval (bucketed samples) - snowflake-arctic - groundedness_likert4
Groundedness system prompt: You are an INFORMATION OVERLAP classifier; providing the overlap of information (entailment or groundedness) between the source and statement.
Respond only as a number from 0 to 3, where 0 is the lowest score according to the criteria and 3 is the highest possible score.


    You should score the groundedness of the statement based on the following criteria:
    - Statements that are directly supported by the source should be considered grounded and should get a high score.
    - Statements that are not directly supported by the source should be considered not grounded and should get a low score.
    - Statements of doubt, that admissions of uncertainty or not knowing the answer are considered abstention, and should be counted as the most overlap and therefore get a max score of 3.
    
Never elaborate.
Running experiment: SummEval (bucketed samples) - llama3.1-8b - gr

WARNI [trulens.core.app] Unsure what the main input string is for the call to _call with args <BoundArguments (input='(CNN)Donald Sterling\'s racist remarks cost him an NBA team last year. But now it\'s his former female companion who has lost big. A Los Angeles judge has ordered V. Stiviano to pay back more than $2.6 million in gifts after Sterling\'s wife sued her. In the lawsuit, Rochelle "Shelly" Sterling accused Stiviano of targeting extremely wealthy older men. She claimed Donald Sterling used the couple\'s money to buy Stiviano a Ferrari, two Bentleys and a Range Rover, and that he helped her get a $1.8 million duplex. Who is V. Stiviano? Stiviano countered that there was nothing wrong with Donald Sterling giving her gifts and that she never took advantage of the former Los Angeles Clippers owner, who made much of his fortune in real estate. Shelly Sterling was thrilled with the court decision Tuesday, her lawyer told CNN affiliate KABC. "This is a victory for the Sterling famil

Error Tried to find int or float number using pattern ([+-]?[0-9]+\.[0-9]*|[1-9][0-9]*|0) in
  Please provide the statement sentence you'd like me to evaluate. I'll respond with the template you specified. in run_feedback_experiment row 2 with first arg (CNN)Donald Sterling's racist remarks cost him an NBA team last year. But now it's his former female companion who has lost big. A Los Angeles judge has ordered V. Stiviano to pay back more than $2.6 million in gifts after Sterling's wife sued her. In the lawsuit, Rochelle "Shelly" Sterling accused Stiviano of targeting extremely wealthy older men. She claimed Donald Sterling used the couple's money to buy Stiviano a Ferrari, two Bentleys and a Range Rover, and that he helped her get a $1.8 million duplex. Who is V. Stiviano? Stiviano countered that there was nothing wrong with Donald Sterling giving her gifts and that she never took advantage of the former Los Angeles Clippers owner, who made much of his fortune in real estate. Shelly 

WARNI [trulens.core.app] Unsure what the main input string is for the call to _call with args <BoundArguments (input='(CNN)Donald Sterling\'s racist remarks cost him an NBA team last year. But now it\'s his former female companion who has lost big. A Los Angeles judge has ordered V. Stiviano to pay back more than $2.6 million in gifts after Sterling\'s wife sued her. In the lawsuit, Rochelle "Shelly" Sterling accused Stiviano of targeting extremely wealthy older men. She claimed Donald Sterling used the couple\'s money to buy Stiviano a Ferrari, two Bentleys and a Range Rover, and that he helped her get a $1.8 million duplex. Who is V. Stiviano? Stiviano countered that there was nothing wrong with Donald Sterling giving her gifts and that she never took advantage of the former Los Angeles Clippers owner, who made much of his fortune in real estate. Shelly Sterling was thrilled with the court decision Tuesday, her lawyer told CNN affiliate KABC. "This is a victory for the Sterling famil

### Metrics computation

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix


def plot_confusion_matrix(
    true_labels, predicted_scores, threshold=0.5, title="Confusion Matrix"
):
    """Plot a 2x2 confusion matrix of thresholded scores against true labels.

    Args:
        true_labels: ground-truth labels; values >= `threshold` count as 1,
            so labels that are already 0/1 are left unchanged.
        predicted_scores: predicted scores; values >= `threshold` count as 1.
        threshold: cut-off used to binarise both inputs.
        title: figure title.
    """
    # Binarise predictions and ground truth with the same cut-off.
    predicted_labels = [
        1 if score >= threshold else 0 for score in predicted_scores
    ]
    binary_true_labels = [1 if label >= threshold else 0 for label in true_labels]

    # Pin the class order to [0, 1]: without it the matrix shrinks to 1x1 when
    # only one class is present and no longer matches the tick labels below.
    cm = confusion_matrix(binary_true_labels, predicted_labels, labels=[0, 1])

    # Draw the matrix as an annotated heatmap (rows: truth, columns: prediction).
    plt.figure(figsize=(6, 4))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        cbar=False,
        xticklabels=["Predicted 0", "Predicted 1"],
        yticklabels=["True 0", "True 1"],
    )
    plt.title(title)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.show()

In [None]:
from trulens.feedback import GroundTruthAggregator

# Compute classification/agreement metrics and a confusion matrix for the
# Hotpot QA answer-relevance run of every provider.
for provider_name in [
    "gpt-4o-mini",
    "gpt-4o",
    "snowflake-arctic",
    "llama3.1-8b",
    "mistral-large",
]:
    file_path = f"results/Hotpot QA (400 samples) - {provider_name}_answer_relevance_binary_results.csv"
    scores, labels, latencies = read_results(file_path)
    f_recall = GroundTruthAggregator(labels).recall
    f_precision = GroundTruthAggregator(labels).precision
    f_f1_score = GroundTruthAggregator(labels).f1_score

    # Binarise labels and scores at 0.5 for the agreement metrics.
    binary_labels = [1 if label >= 0.5 else 0 for label in labels]
    binary_scores = [1 if score >= 0.5 else 0 for score in scores]

    f_cohens_kappa = GroundTruthAggregator(binary_labels).cohens_kappa
    f_auc = GroundTruthAggregator(labels).auc

    f_mae = GroundTruthAggregator(labels).mae
    f_pearson = GroundTruthAggregator(labels).pearson_correlation
    f_spearman = GroundTruthAggregator(labels).spearman_correlation
    f_matthews = GroundTruthAggregator(binary_labels).matthews_correlation

    recall = f_recall(scores)
    precision = f_precision(scores)
    f1_score = f_f1_score(scores)
    mae = f_mae(scores)
    pearson = f_pearson(scores)
    spearman = f_spearman(scores)
    # NOTE(review): Cohen's kappa receives the raw scores while Matthews
    # receives `binary_scores` — confirm this asymmetry is intended.
    cohens_kappa = f_cohens_kappa(scores)
    auc = f_auc(scores)
    matthews = f_matthews(binary_scores)

    # Exclude calls slower than 20 seconds from the average. A new list is
    # built instead of calling `remove()` while iterating, which skips the
    # element right after every removal and lets outliers slip through.
    kept_latencies = [latency for latency in latencies if not latency > 20]
    # Every call may have been an outlier (or the file empty); report NaN
    # instead of dividing by zero.
    avg_latency = (
        sum(kept_latencies) / len(kept_latencies)
        if kept_latencies
        else float("nan")
    )

    # print(f"{provider_name}: mae: {mae:.4f}, pearson: {pearson:.4f}, spearman: {spearman:.4f}, Cohen's Kappa: {cohens_kappa:.4f}")
    print(
        f"{provider_name}: recall: {recall:.4f}, precision: {precision:.4f}, f1: {f1_score:.4f}, Cohen's Kappa: {cohens_kappa:.4f}, Matthews: {matthews:.4f}, AUC: {auc:.4f}, avg_latency: {avg_latency:.4f}"
    )
    print("\n")

    plot_confusion_matrix(
        labels, scores, title=f"Confusion Matrix - {provider_name}"
    )

### Loading results of internal evaluation runs scraped from Cortex Chat orchestrator 

In [None]:
# Load the scraped Cortex evaluation output; the path is relative to the
# notebook's working directory.
cortex_eval_df = pd.read_csv("eval_scrape_mistral-large_output_1727118011.csv")

In [None]:
# Inspect the available columns; the scoring loop further down reads at least
# `query` and `golden`.
cortex_eval_df.columns

In [None]:
from trulens.providers.openai import OpenAI

# NOTE(review): despite its name, `gpt_4o` is bound to the "gpt-4o-mini" engine
# here, and this assignment rebinds the `gpt_4o` created in the provider setup
# cell. Confirm which model is intended before comparing results.
gpt_4o = OpenAI(model_engine="gpt-4o-mini")

# Scoring rubric passed as `criteria` to the context relevance feedback
# (0-3 scale). The text is sent to the model verbatim, indentation included.
context_relevant_likert_4_criteria = """
        - CONTEXT that is IRRELEVANT to the QUESTION should score 0.
        - CONTEXT that is RELEVANT to some of the QUESTION should score of 1.
        - CONTEXT that is RELEVANT to most of the QUESTION should get a score of 2.
        - CONTEXT that is RELEVANT to the entirety of the QUESTION should get a score of 3, which is the full mark.
        - CONTEXT must be relevant and helpful for answering the entire QUESTION to get a score of 3.
        """


def trulens_context_relevance(query, context) -> float:
    """Score how relevant `context` is to `query` on the 0-3 rubric.

    Returns the first element of the provider's result, or -1 when the call
    fails for any reason (the error is printed, not raised).
    """
    try:
        score_and_reasons = gpt_4o.context_relevance_with_cot_reasons(
            question=query,
            context=context,
            max_score_val=3,
            min_score_val=0,
            criteria=context_relevant_likert_4_criteria,
        )
        return score_and_reasons[0]
    except Exception as err:
        # Best effort: report the failure and fall back to the -1 sentinel.
        print(f"Error in trulens_context_relevance: {err}")
        return -1


# Scoring rubric passed as `criteria` to the answer relevance feedbacks, which
# are called with min_score_val=0 and max_score_val=3. The first rule used to
# ask for "a score of 4", which is outside that scale and contradicted the
# rules below; it now names the real maximum, 3.
answer_relevant_likert_4_criteria = """
        - RESPONSE must be relevant to the entire PROMPT to get a score of 3.
        - RELEVANCE score should increase as the RESPONSE provides RELEVANT context to more parts of the PROMPT.
        - RESPONSE that is RELEVANT to none of the PROMPT should get a score of 0.
        - RESPONSE that is RELEVANT to some of the PROMPT should get as score of 1 or 2. Higher score indicates more RELEVANCE.
        - RESPONSE that is RELEVANT to the entire PROMPT should get a score of 3.
        - RESPONSE that is RELEVANT and answers the entire PROMPT completely should get a score of 3.
        - RESPONSE that confidently FALSE should get a score of 0.
        - RESPONSE that is only seemingly RELEVANT should get a score of 0.
        - Answers that intentionally do not answer the question, such as 'I don't know' and model refusals, should also be counted as the least RELEVANT and get a score of 0.
    """


def trulens_answer_relevance(query, response) -> float:
    """Score how relevant ``response`` is to ``query`` using the ``gpt_4o`` provider.

    Calls the provider's ``relevance`` method with a 0-3 score range and the
    ``answer_relevant_likert_4_criteria`` rubric.

    Args:
        query: The prompt the response is judged against.
        response: The response being judged.

    Returns:
        Whatever ``relevance`` returns, or ``-1`` if the call raised. The error
        is printed, not re-raised.
    """
    try:
        relevance_score = gpt_4o.relevance(
            prompt=query,
            response=response,
            min_score_val=0,
            max_score_val=3,
            criteria=answer_relevant_likert_4_criteria,
        )
    except Exception as err:
        print(f"Error in trulens_answer_relevance: {err}")
        relevance_score = -1
    return relevance_score


def trulens_answer_relevance_cot(query, response) -> float:
    """Score answer relevance with the chain-of-thought variant of the provider call.

    Calls ``gpt_4o.relevance_with_cot_reasons`` with a 0-3 score range and the
    ``answer_relevant_likert_4_criteria`` rubric, then returns the first
    element of the result (presumably the score; the rest is discarded).

    Args:
        query: The prompt the response is judged against.
        response: The response being judged.

    Returns:
        The first element of the provider's result, or ``-1`` if anything in
        the call raised. The error is printed, not re-raised.
    """
    try:
        return gpt_4o.relevance_with_cot_reasons(
            prompt=query,
            response=response,
            min_score_val=0,
            max_score_val=3,
            criteria=answer_relevant_likert_4_criteria,
        )[0]
    except Exception as e:
        # Fix: the message used to name trulens_answer_relevance (copy-paste),
        # which made failures of this helper look like failures of the
        # non-CoT one in the notebook output.
        print(f"Error in trulens_answer_relevance_cot: {e}")
        return -1


# Scoring rubric passed as `criteria` to the groundedness call in
# trulens_groundedness, which scores on a 0-3 scale
# (min_score_val=0, max_score_val=3).
likert4_groundedness_criteria = """You should score the groundedness of the statement based on the following criteria:
    - Statements that are directly supported by the source should be considered grounded and should get a high score.
    - Statements that are not directly supported by the source should be considered not grounded and should get a low score.
    - Statements of doubt, that admissions of uncertainty or not knowing the answer are considered abstention, and should be counted as the most overlap and therefore get a max score."""


def trulens_groundedness(context, response) -> float:
    """Score how well ``response`` is grounded in ``context`` using the ``gpt_4o`` provider.

    Calls ``groundedness_measure_with_cot_reasons`` with
    ``use_sent_tokenize=True``, a 0-3 score range and the
    ``likert4_groundedness_criteria`` rubric, then returns the first element of
    the result (presumably the score; the rest is discarded).

    Args:
        context: The source text the statement is checked against.
        response: The statement whose groundedness is judged.

    Returns:
        The first element of the provider's result, or ``-1`` if anything in
        the call raised. The error is printed, not re-raised.
    """
    try:
        measured = gpt_4o.groundedness_measure_with_cot_reasons(
            source=context,
            statement=response,
            use_sent_tokenize=True,
            min_score_val=0,
            max_score_val=3,
            criteria=likert4_groundedness_criteria,
        )
        # Indexing stays inside the try so an unexpected result shape also
        # falls back to the sentinel.
        return measured[0]
    except Exception as err:
        print(f"Error in trulens_groundedness: {err}")
        return -1


# Score every row of cortex_eval_df with the TruLens helpers defined above.
# Each list receives exactly one value per row, in row order.
(
    context_relevance_scores,
    answer_relevance_scores,
    groundedness_scores,
    answer_relevance_cot_scores,
) = [], [], [], []

for i, row in cortex_eval_df.iterrows():
    query = row["query"]
    # `golden` is stored as the text of a Python literal holding the chunks.
    context_chunks = list(ast.literal_eval(row["golden"]))
    llm_response = row["llm_answer"]

    assert query and llm_response, "query and llm_response should not be empty"

    print(
        f"ROW {i}: query: {query}\n , llm_response: {llm_response}\ng, context_chunks: {context_chunks} \n\n"
    )
    # Answer relevance is scored once per row, with and without CoT reasons.
    answer_relevance_score = trulens_answer_relevance(query, llm_response)
    answer_relevance_cot_score = trulens_answer_relevance_cot(
        query, llm_response
    )
    answer_relevance_scores.append(answer_relevance_score)
    answer_relevance_cot_scores.append(answer_relevance_cot_score)
    if len(context_chunks) > 0:
        # Context relevance and groundedness are scored per chunk and then
        # averaged into one value per row.
        # NOTE(review): the helpers return -1 on failure and those sentinels
        # are averaged in as-is — confirm that this is intended.
        _context_relevance_scores_per_query = []
        _groundedness_scores_per_query = []
        for context in context_chunks:
            if (
                context
                == "Country Work-from-home budget (USD) Welcome period (mo)\nPoland $350 12 "
            ):
                # Fix: this used to assign to the misspelled name `contex`,
                # so the rewritten chunk was never passed to the scorers.
                context = "Country Work-from-home budget (USD) Welcome period in Poland $350 12"
            _context_relevance_scores_per_query.append(
                trulens_context_relevance(query, context)
            )
            _groundedness_scores_per_query.append(
                trulens_groundedness(context, llm_response)
            )

        context_relevance_scores.append(
            sum(_context_relevance_scores_per_query)
            / len(_context_relevance_scores_per_query)
        )
        groundedness_scores.append(
            sum(_groundedness_scores_per_query)
            / len(_groundedness_scores_per_query)
        )
    else:
        # Rows without any context chunk get 0 for both context-based scores.
        context_relevance_scores.append(0)
        groundedness_scores.append(0)

# Every row must have produced exactly one value in each score list.
assert (
    len(context_relevance_scores)
    == len(answer_relevance_scores)
    == len(groundedness_scores)
    == len(cortex_eval_df)
    == len(answer_relevance_cot_scores)
)

# save scores to csv
cortex_eval_df["context_relevance_scores"] = context_relevance_scores
cortex_eval_df["answer_relevance_scores"] = answer_relevance_scores
cortex_eval_df["groundedness_scores"] = groundedness_scores
cortex_eval_df["answer_relevance_scores_cot"] = answer_relevance_cot_scores
cortex_eval_df.to_csv(
    "cortex_eval_df_with_trulens_scores_relevance_cot.csv", index=False
)

In [None]:
# Per-row difference between the plain and the chain-of-thought answer
# relevance scores (plain minus CoT), shown as this cell's output.
cortex_eval_df["answer_relevance_scores"].sub(
    cortex_eval_df["answer_relevance_scores_cot"]
)

In [None]:
# Column means scaled by 100 so they can be printed as percentages.
# The answer relevance figure is taken from the chain-of-thought scores.
answer_relevance_mean = 100 * cortex_eval_df["answer_relevance_scores_cot"].mean()
context_relevance_mean = 100 * cortex_eval_df["context_relevance_scores"].mean()
groundedness_mean = 100 * cortex_eval_df["groundedness_scores"].mean()

print(f"Answer Relevance Mean Score: {answer_relevance_mean:.2f}%")
print(f"Context Relevance Mean Score: {context_relevance_mean:.2f}%")
print(f"Groundedness Mean Score: {groundedness_mean:.2f}%")

### Correlation test with Cortex's GT based metrics
#### Cortex GT-based metrics:

accuracy_llm: {-1, 0, 1, 2}, llm_citation_f1 [-1, 1.0], gt_citation_f1 [-1, 1.0], gris_llm_answer [0.0, 1.0] <-> answer relevance, context relevance, and groundedness (Likert 4)

anls, gris_anls [0.0, 1.0] 
retrieval_ndcg_at_1 [0.0, 1.0], retrieval_hit_rate_at_1 BINARY, retrieval_ndcg_at_3 [0.0, 1.0], retrieval_hit_rate_at_3 BINARY <-> context relevance 

adjusted_llm_answer vs llm_answer?


In [None]:
# Display the column index of cortex_eval_df as this cell's output.
cortex_eval_df.columns

In [None]:
def _binarize(score):
    """Return 1 when ``score`` is at least 0.5, otherwise 0."""
    return 1 if score >= 0.5 else 0


# Add a thresholded 0/1 copy of each TruLens score column.
# The answer relevance column is derived from the chain-of-thought scores.
for _binary_col, _score_col in (
    ("context_relevance_scores_binary", "context_relevance_scores"),
    ("answer_relevance_scores_binary", "answer_relevance_scores_cot"),
    ("groundedness_scores_binary", "groundedness_scores"),
):
    cortex_eval_df[_binary_col] = cortex_eval_df[_score_col].apply(_binarize)

In [None]:
from scipy.stats import spearmanr
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import matthews_corrcoef

# Agreement between the binary retrieval-hit columns and the binarised
# TruLens context relevance score, as Matthews correlation and Cohen's kappa.
_context_relevance_binary = cortex_eval_df["context_relevance_scores_binary"]

# retrieval_hit_at_1 vs context_relevance_scores_binary
_hit_at_1 = cortex_eval_df["retrieval_hit_at_1"]
mcc_hit_at_1_context_relevance = matthews_corrcoef(
    _hit_at_1, _context_relevance_binary
)
cohens_kappa_context_relevance = cohen_kappa_score(
    _hit_at_1, _context_relevance_binary
)
print(
    f"Matthews Correlation Coefficient (retrieval_hit_at_1 vs context_relevance_scores_binary): {mcc_hit_at_1_context_relevance:.4f}"
)
print(
    f"Cohen's Kappa (retrieval_hit_at_1 vs context_relevance_scores_binary): {cohens_kappa_context_relevance:.4f}"
)

# retrieval_hit_at_3 vs context_relevance_scores_binary
# Note: cohens_kappa_context_relevance is rebound here, so after this cell it
# holds the hit@3 value.
_hit_at_3 = cortex_eval_df["retrieval_hit_at_3"]
mcc_hit_at_3_context_relevance = matthews_corrcoef(
    _hit_at_3, _context_relevance_binary
)
cohens_kappa_context_relevance = cohen_kappa_score(
    _hit_at_3, _context_relevance_binary
)
print(
    f"Matthews Correlation Coefficient (retrieval_hit_at_3 vs context_relevance_scores_binary): {mcc_hit_at_3_context_relevance:.4f}"
)
print(
    f"cohen's Kappa (retrieval_hit_at_3 vs context_relevance_scores_binary): {cohens_kappa_context_relevance:.4f}"
)

In [None]:
# Rescale accuracy_llm by halving each value.
# NOTE(review): the markdown above lists accuracy_llm as {-1, 0, 1, 2}, so a
# raw -1 becomes -0.5 here — confirm how -1 should be treated.
cortex_eval_df["accuracy_llm_normalized"] = cortex_eval_df["accuracy_llm"].apply(
    lambda raw_accuracy: raw_accuracy / 2
)

In [None]:
# Columns of cortex_eval_df that the Spearman correlation cell iterates over
# and compares against the TruLens scores.
REAL_VALUED_COLS = [
    "accuracy_llm_normalized",
    "llm_citation_f1",
    "gt_citation_f1",
    "gris_llm_answer",
    "anls",
    "gris_anls",
    "retrieval_ndcg_at_3",
    "retrieval_ndcg_at_1",
]

In [None]:
# Result column title -> TruLens score column it is computed from.
# The answer relevance correlation uses the chain-of-thought scores.
_SCORE_COLUMN_BY_RESULT_KEY = {
    "Spearman correlation with context_relevance_scores": "context_relevance_scores",
    "Spearman correlation with answer_relevance_scores": "answer_relevance_scores_cot",
    "Spearman correlation with groundedness_scores": "groundedness_scores",
}

# One list per output column; the metric name column comes first.
results = {"Cortex GT-based metrics": []}
results.update({result_key: [] for result_key in _SCORE_COLUMN_BY_RESULT_KEY})

# For each Cortex metric, correlate it with every TruLens score column.
for col_name in REAL_VALUED_COLS:
    metric_values = cortex_eval_df[col_name]
    # spearmanr returns (correlation, p-value); only the correlation is kept.
    correlations = [
        spearmanr(metric_values, cortex_eval_df[score_col])[0]
        for score_col in _SCORE_COLUMN_BY_RESULT_KEY.values()
    ]

    results["Cortex GT-based metrics"].append(col_name)
    for result_key, correlation in zip(_SCORE_COLUMN_BY_RESULT_KEY, correlations):
        results[result_key].append(correlation)

# Tabulate the correlations and show them as this cell's output.
results_df = pd.DataFrame(results)
results_df

In [None]:
# Confusion matrix of the binarised answer relevance score against ANLS
# thresholded at 0.5.
_anls_binary = cortex_eval_df["anls"].apply(lambda x: 1 if x >= 0.5 else 0)
plot_confusion_matrix(
    cortex_eval_df["answer_relevance_scores_binary"],
    _anls_binary,
    title="Confusion Matrix - Answer Relevance vs ANLS",
)

In [None]:
# Confusion matrix of the binarised answer relevance score against the
# normalised LLM accuracy thresholded at 0.5.
_accuracy_llm_binary = cortex_eval_df["accuracy_llm_normalized"].apply(
    lambda x: 1 if x >= 0.5 else 0
)
plot_confusion_matrix(
    cortex_eval_df["answer_relevance_scores_binary"],
    _accuracy_llm_binary,
    title="Confusion Matrix - Answer Relevance vs Accuracy LLM Normalized",
)

### Comparison with Phoenix / Arize

In [None]:
from trulens.feedback import GroundTruthAggregator

# For each provider, load its saved context relevance results, compare the
# scores with the labels, and plot a confusion matrix.
for provider_name in [
    "gpt-4o-mini",
    "gpt-4o",
    "snowflake-arctic",
    "llama3.1-8b",
    "mistral-large",
]:
    file_path = f"results/Snowflake IT balanced - {provider_name}_context_relevance_binary_results.csv"
    scores, labels, latencies = read_results(file_path)

    # Threshold both the scores and the labels at 0.5.
    # NOTE(review): `scores` is rebound to the 0/1 values, so mae, pearson,
    # spearman and auc below also receive the binarised scores — confirm.
    scores = [1 if score >= 0.5 else 0 for score in scores]
    binary_labels = [1 if label >= 0.5 else 0 for label in labels]

    # Classification-style metrics use the binarised labels.
    f_recall = GroundTruthAggregator(binary_labels).recall
    f_precision = GroundTruthAggregator(binary_labels).precision
    f_f1_score = GroundTruthAggregator(binary_labels).f1_score

    f_cohens_kappa = GroundTruthAggregator(binary_labels).cohens_kappa
    # The remaining metrics use the original labels.
    f_auc = GroundTruthAggregator(labels).auc

    f_mae = GroundTruthAggregator(labels).mae
    f_pearson = GroundTruthAggregator(labels).pearson_correlation
    f_spearman = GroundTruthAggregator(labels).spearman_correlation

    recall = f_recall(scores)
    precision = f_precision(scores)
    f1_score = f_f1_score(scores)
    mae = f_mae(scores)
    pearson = f_pearson(scores)
    spearman = f_spearman(scores)
    cohens_kappa = f_cohens_kappa(scores)
    auc = f_auc(scores)

    # Drop latency outliers above 20 before averaging (presumably seconds,
    # going by the original commented-out warning text — confirm).
    # Fix: the old code called latencies.remove() while iterating the same
    # list, which skips the element after each removal and can leave
    # outliers in place. Build a filtered list instead.
    latencies = [latency for latency in latencies if latency <= 20]
    # Guard against an empty list (no samples, or every sample was an outlier).
    avg_latency = (
        sum(latencies) / len(latencies) if latencies else float("nan")
    )

    print(
        f"{provider_name}: mae: {mae:.4f}, pearson: {pearson:.4f}, spearman: {spearman:.4f}, Cohen's Kappa: {cohens_kappa:.4f}"
    )
    print(
        f"{provider_name}: recall: {recall:.4f}, precision: {precision:.4f}, f1: {f1_score:.4f}, Cohen's Kappa: {cohens_kappa:.4f}, avg_latency: {avg_latency:.4f}"
    )
    print("\n")

    plot_confusion_matrix(
        binary_labels, scores, title=f"Confusion Matrix - {provider_name}"
    )