### Datasets preprocessing:

Datasets that need some preprocessing before they can be used with the `TruBenchmarkExperiment` class:
1. Snowflake IT (internal): both the rephrased and the regular versions (to be confirmed); this dataset should be used for all 3 metrics in the triad
2. SummEval (CNN and DailyMail summarizations with annotation) for groundedness
3. QAGS (CNN and DailyMail with Turkers' annotation) for groundedness
4. QAGS (XSUM with Turkers' annotation) for groundedness
5. MSMARCO V2 for context relevance
6. HotPot QA for answer relevance 



In [None]:
import ast
import csv
import json
import random

import pandas as pd

# Pin random seed
random.seed(42)


# SummEval
def generate_summeval_groundedness_golden_set(file_path):
    """Yield groundedness golden-set records from a SummEval JSON-lines file.

    Every non-blank line of the file is parsed as one JSON object. For each
    machine summary on a line, one record is yielded with the keys:

    - ``query``: the line's ``"text"`` value (empty string if absent),
    - ``expected_response``: the machine summary,
    - ``expected_score``: the human consistency score rescaled from
      [1, 5] to [0, 1] and rounded to two decimals,
    - ``human_score``: the raw consistency score.

    Lines that are not valid JSON, that lack ``machine_summaries`` or
    ``consistency``, or whose two lists differ in length are reported with
    ``print`` and skipped. Blank lines are skipped silently.

    Args:
        file_path: Path to the JSON-lines file.

    Yields:
        dict: One record per (line, machine summary) pair.
    """

    def calculate_expected_score(normalized_metrics_lst, weights_lst):
        """Return the weighted mean of the metrics, rounded to 2 decimals."""
        if len(normalized_metrics_lst) != len(weights_lst):
            raise ValueError("metrics and weights must have the same length")
        weighted_sum = sum(
            metric * weight
            for metric, weight in zip(normalized_metrics_lst, weights_lst)
        )
        return round(weighted_sum / sum(weights_lst), 2)

    with open(file_path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue

            # Each line is a separate JSON object
            try:
                row = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"JSON decode error: {e}. Check the line format.")
                continue

            # Ensure the expected keys exist in the JSON
            try:
                summaries = row["machine_summaries"]
                consistency = row["consistency"]
            except KeyError as e:
                print(
                    f"Key error: {e}. Please check if the keys exist in the JSON file."
                )
                continue

            # Explicit check instead of `assert`, which is stripped under -O
            if len(summaries) != len(consistency):
                print(
                    "Assertion error: Mismatch in lengths of machine_summaries and consistency. The lengths of 'machine_summaries' and 'consistency' do not match."
                )
                continue

            query = row.get("text", "")  # Default to empty string if key not found
            for summary, human_score in zip(summaries, consistency):
                yield {
                    "query": query,
                    "expected_response": summary,
                    "expected_score": calculate_expected_score(
                        [(human_score - 1) / 4],  # Normalize from [1, 5] to [0, 1]
                        [1.0],
                    ),
                    "human_score": human_score,
                }


# Snowflake IT dataset


def generatate_snowflake_it_golden_set_groundedness(file_path):
    """Build a groundedness golden set from the Snowflake IT CSV file.

    The CSV must provide ``golden`` (a Python-literal list of context chunks)
    and ``expected_response`` columns. For every row with a valid ``golden``
    list the result contains:

    - one positive record (``expected_score`` 1) per golden chunk, with the
      chunk as ``query`` (the source) and the row's expected response as
      ``expected_response`` (the statement);
    - one negative record (``expected_score`` 0) pairing the row's expected
      response with a chunk picked at random from the other rows, when any
      such chunk exists.

    Rows whose ``golden`` cell cannot be parsed, or is not a list, are
    reported with ``print`` and skipped entirely; they are also excluded from
    the pool of negative chunks.

    Args:
        file_path: Path to the CSV file (read as UTF-8).

    Returns:
        list[dict]: Records with ``query``, ``expected_response`` and
        ``expected_score`` keys.
    """
    with open(file_path, mode="r", encoding="utf-8") as f:
        all_rows = list(csv.DictReader(f))

    # Parse every 'golden' cell exactly once. Re-parsing the other rows for
    # each row is quadratic, and lets a single malformed row suppress the
    # negative example of every other row.
    parsed_rows = []
    for row in all_rows:
        try:
            expected_chunks = ast.literal_eval(row["golden"])
            if not isinstance(expected_chunks, list):
                raise ValueError("Golden column should be a list")
        except (ValueError, SyntaxError) as e:
            print(f"Error parsing golden column: {e}")
            continue
        parsed_rows.append((row, expected_chunks))

    res = []
    for row, expected_chunks in parsed_rows:
        for expected_chunk in expected_chunks:
            res.append({
                "query": expected_chunk,  # source
                "expected_response": row["expected_response"],  # statement
                "expected_score": 1,  # golden chunks are always considered grounded
            })

        # Candidate negative contexts: chunks belonging to other queries.
        # Rows are compared by content, so exact duplicate rows never supply
        # negatives for each other.
        other_chunks = [
            chunk
            for other_row, other_row_chunks in parsed_rows
            if other_row != row
            for chunk in other_row_chunks
        ]

        if other_chunks:
            negative_chunk = random.choice(other_chunks)
            res.append({
                "query": negative_chunk,
                "expected_response": row["expected_response"],  # statement (not grounded by the chunk)
                "expected_score": 0,  # Negative example, score = 0
            })

    return res


def generate_snowflake_it_golden_set_answer_relevance(file_path):
    """Build an answer-relevance golden set from the Snowflake IT CSV file.

    The CSV must provide ``query`` and ``expected_response`` columns. For each
    row the result contains:

    - one record pairing the query with its own expected response, labelled 0
      when the response contains the abstention sentence and 1 otherwise;
    - one easy negative record (``expected_score`` 0) pairing the query with
      an expected response picked at random from a different row. The
      negative is omitted when no other row exists, e.g. a single-row file.

    Args:
        file_path: Path to the CSV file (read as UTF-8).

    Returns:
        list[dict]: Records with ``query``, ``expected_response`` and
        ``expected_score`` keys.
    """
    # Responses containing this sentence are treated as abstentions.
    abstention_sentence = "I don’t know the answer to that question."

    with open(file_path, mode="r", encoding="utf-8") as f:
        # All rows are kept in memory for negative example generation
        all_rows = list(csv.DictReader(f))

    res = []
    for row in all_rows:
        # Abstentions get answer relevance 0, every other response gets 1
        ground_truth_score = (
            0 if abstention_sentence in row["expected_response"] else 1
        )
        res.append({
            "query": row["query"],
            "expected_response": row["expected_response"],
            "expected_score": ground_truth_score,
        })

        # Rows are compared by content, so exact duplicates are excluded too
        other_responses = [
            r["expected_response"] for r in all_rows if r != row
        ]
        if not other_responses:
            # random.choice() raises IndexError on an empty sequence
            continue

        res.append({
            "query": row["query"],
            "expected_response": random.choice(other_responses),  # Orthogonal response
            "expected_score": 0,  # Label answer relevance as 0 for negative examples
        })

    return res


def generate_snowflake_it_golden_set_context_relevance(file_path):
    """Build a context-relevance golden set from the Snowflake IT CSV file.

    The CSV must provide ``query`` and ``golden`` (a Python-literal list of
    context chunks) columns. For every row with a valid ``golden`` list the
    result contains:

    - one positive record (``expected_score`` 1) per golden chunk, with the
      row's query as ``query`` and the chunk as ``expected_response``;
    - one negative record (``expected_score`` 0) pairing the query with a
      chunk picked at random from the other rows, when any such chunk exists.

    Rows whose ``golden`` cell cannot be parsed, or is not a list, are
    reported with ``print`` and skipped entirely; they are also excluded from
    the pool of negative chunks.

    Args:
        file_path: Path to the CSV file (read as UTF-8).

    Returns:
        list[dict]: Records with ``query``, ``expected_response`` and
        ``expected_score`` keys.
    """
    with open(file_path, mode="r", encoding="utf-8") as f:
        all_rows = list(csv.DictReader(f))

    # Step 1: parse every 'golden' cell exactly once. Re-parsing the other
    # rows for each row is quadratic, and lets a single malformed row
    # suppress the negative example of every other row.
    parsed_rows = []
    for row in all_rows:
        try:
            expected_chunks = ast.literal_eval(row["golden"])
            if not isinstance(expected_chunks, list):
                raise ValueError("Golden column should be a list")
        except (ValueError, SyntaxError) as e:
            print(
                f"Error parsing golden column for query '{row['query']}': {e}"
            )
            continue
        parsed_rows.append((row, expected_chunks))

    res = []
    for row, expected_chunks in parsed_rows:
        # Step 2: positive examples
        for expected_chunk in expected_chunks:
            res.append({
                "query": row["query"],
                "expected_response": expected_chunk,
                "expected_score": 1,  # Positive example, score = 1
            })

        # Step 3: one negative example per query, drawn from the chunks of
        # other queries. Rows are compared by content, so exact duplicate
        # rows never supply negatives for each other.
        other_chunks = [
            chunk
            for other_row, other_row_chunks in parsed_rows
            if other_row != row
            for chunk in other_row_chunks
        ]

        if other_chunks:
            negative_chunk = random.choice(other_chunks)
            res.append({
                "query": row["query"],
                "expected_response": negative_chunk,  # Orthogonal/negative context
                "expected_score": 0,  # Negative example, score = 0
            })

    return res


def generate_qags_golden_set_groundedness(file_path):
    """Yield groundedness golden-set records from a QAGS JSON-lines file.

    Each line is one JSON object holding an ``article`` and a list of
    ``summary_sentences``. One record is yielded per summary sentence:

    - ``query``: the article text,
    - ``expected_response``: the summary sentence,
    - ``expected_score``: the fraction of annotator responses equal to
      "yes" (case-insensitive).

    Args:
        file_path: Path to the JSON-lines file (read as UTF-8).

    Yields:
        dict: One record per (article, summary sentence) pair.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            record = json.loads(raw_line)
            article = record["article"]

            # Flatten: one output record per annotated summary sentence
            for sentence_info in record["summary_sentences"]:
                sentence = sentence_info["sentence"]
                votes = [
                    annotation["response"]
                    for annotation in sentence_info["responses"]
                ]
                # A "yes" vote counts as 1, anything else as 0
                yes_votes = sum(vote.lower() == "yes" for vote in votes)

                yield {
                    "query": article,
                    "expected_response": sentence,
                    "expected_score": yes_votes / len(votes),
                }


# Directory holding the raw benchmark data files loaded below.
# NOTE: this is a machine-specific absolute path; adjust it for your checkout.
_EXPERIMENTS_DATA_DIR = "/Users/dhuang/Documents/git/trulens/src/benchmark/trulens/benchmark/benchmark_frameworks/experiments/data"

# --- Snowflake IT: one DataFrame and one label list per triad metric ---
snowflake_it_file_path = f"{_EXPERIMENTS_DATA_DIR}/snowflake_it_v3.csv"

snowflake_it_for_answer_relevance = pd.DataFrame(
    generate_snowflake_it_golden_set_answer_relevance(snowflake_it_file_path)
)
snowflake_it_answer_relevance_true_labels = list(
    snowflake_it_for_answer_relevance["expected_score"]
)

snowflake_it_for_context_relevance = pd.DataFrame(
    generate_snowflake_it_golden_set_context_relevance(snowflake_it_file_path)
)
snowflake_it_for_context_relevance_true_labels = list(
    snowflake_it_for_context_relevance["expected_score"]
)

snowflake_it_for_groundness = pd.DataFrame(
    generatate_snowflake_it_golden_set_groundedness(snowflake_it_file_path)
)
snowflake_it_for_groundness_true_labels = list(
    snowflake_it_for_groundness["expected_score"]
)

# --- SummEval (groundedness) ---
# The file is parsed once; the labels and the DataFrame are both derived from
# the same list of records.
summeval_list = list(
    generate_summeval_groundedness_golden_set(
        f"{_EXPERIMENTS_DATA_DIR}/summeval_test.json"
    )
)
summeval_true_labels = [entry["expected_score"] for entry in summeval_list]
summeval = pd.DataFrame(summeval_list)

# --- QAGS CNN/DailyMail (groundedness) ---
qags_cnn_dm_list = list(
    generate_qags_golden_set_groundedness(
        f"{_EXPERIMENTS_DATA_DIR}/qags_mturk_cnndm.jsonl"
    )
)
qags_cnn_dm = pd.DataFrame(qags_cnn_dm_list)
qags_cnn_dm_true_labels = [
    entry["expected_score"] for entry in qags_cnn_dm_list
]

# --- QAGS XSum (groundedness) ---
qags_xsum_list = list(
    generate_qags_golden_set_groundedness(
        f"{_EXPERIMENTS_DATA_DIR}/qags_mturk_xsum.jsonl"
    )
)
qags_xsum = pd.DataFrame(qags_xsum_list)
# The name (including the doubled "q") is kept unchanged so that anything
# referring to `qqags_xsum_true_labels` keeps working.
qqags_xsum_true_labels = [entry["expected_score"] for entry in qags_xsum_list]

In [None]:
random.seed(42)


def generate_ms_marco_context_relevance_benchmark(
    file_path="/Users/dhuang/Documents/git/trulens/src/benchmark/trulens/benchmark/benchmark_frameworks/experiments/data/ms_marco_v2_1_val.parquet",
):
    df = pd.read_parquet(file_path, engine="pyarrow")  # or engine='fastparquet'

    for _, row in df.iterrows():
        assert len(row["passages"]["is_selected"]) == len(
            row["passages"]["passage_text"]
        )

        if sum(row["passages"]["is_selected"]) < 1:
            # currently we only consider sample with one passage marked as relevant (there are samples where zero passage_text is selected)
            continue
        for i, passage_text in enumerate(row["passages"]["passage_text"]):
            yield {
                "query_id": row["query_id"],
                "query": row["query"],
                "expected_response": passage_text,
                "expected_score": row["passages"]["is_selected"][
                    i
                ],  # Binary relevance
            }


ms_marco = list(generate_ms_marco_context_relevance_benchmark())

# Partition the records by their binary relevance label
score_1_entries, score_0_entries = [], []
for entry in ms_marco:
    if entry["expected_score"] == 1:
        score_1_entries.append(entry)
    elif entry["expected_score"] == 0:
        score_0_entries.append(entry)

# Draw the same number of records from each label group, at most 150 each
num_samples_per_group = min(
    150, len(score_1_entries), len(score_0_entries)
)

sampled_score_1 = random.sample(score_1_entries, num_samples_per_group)
sampled_score_0 = random.sample(score_0_entries, num_samples_per_group)

# Merge both groups and shuffle so the labels are interleaved
balanced_sample = [*sampled_score_1, *sampled_score_0]
random.shuffle(balanced_sample)

# Sanity check: this only holds when both groups supplied a full 150 records
assert len(balanced_sample) == 300

# Report the label distribution of the final dataset
positive_count = sum(
    1 for sampled in balanced_sample if sampled["expected_score"] == 1
)
negative_count = sum(
    1 for sampled in balanced_sample if sampled["expected_score"] == 0
)
print(f"Number of entries with expected_score = 1: {positive_count}")
print(f"Number of entries with expected_score = 0: {negative_count}")

ms_marco_balanced_sample_300 = pd.DataFrame(balanced_sample)

### Load preprocessed datasets from BEIR - start w/ Hotpot QA 

In [None]:
from trulens.benchmark.benchmark_frameworks.dataset.beir_loader import (
    TruBEIRDataLoader,
)

# Load the HotpotQA dataset into a DataFrame through the TruLens BEIR loader.
# NOTE(review): `download=True` presumably fetches the dataset into
# `data_folder` when it is not already present — confirm against
# TruBEIRDataLoader.
beir_data_loader = TruBEIRDataLoader(data_folder="./", dataset_name="hotpotqa")
hotpotqa = beir_data_loader.load_dataset_to_df(download=True)

In [None]:
# Re-seed so the negative examples drawn in this cell are reproducible
random.seed(42)

# Fixed 200-query subset of HotpotQA
hotpotqa_raw_subset = hotpotqa.sample(n=200, random_state=42)

# (query, response) and (query, context text) pairs over the whole subset;
# these are the pools that negative examples are drawn from
all_responses = []
all_contexts = []
for idx, row in hotpotqa_raw_subset.iterrows():
    all_responses.append((row["query"], row["expected_response"]))
    for context in row["expected_chunks"]:
        all_contexts.append((row["query"], context["text"]))

hotpotqa_subset_for_answer_relevance = []
hotpotqa_subset_for_context_relevance = []

# Answer relevance: one positive and one negative example per query
for idx, row in hotpotqa_raw_subset.iterrows():
    current_query = row["query"]

    # Positive example: the query with its own expected response
    hotpotqa_subset_for_answer_relevance.append({
        "query": current_query,
        "expected_response": row["expected_response"],  # Positive response
        "expected_score": 1,  # Positive example, score = 1
    })

    # Negative example: a random response that belongs to another query
    unrelated_responses = [
        response
        for query, response in all_responses
        if query != current_query
    ]
    negative_response = random.choice(unrelated_responses)
    hotpotqa_subset_for_answer_relevance.append({
        "query": current_query,
        "expected_response": negative_response,  # Negative response
        "expected_score": 0,  # Negative example, score = 0
    })

# Context relevance: one negative example per positive context of each query
for idx, row in hotpotqa_raw_subset.iterrows():
    current_query = row["query"]

    # Positive examples carry the score stored on the chunk itself
    positive_examples = [
        {
            "query": current_query,
            "expected_response": context["text"],  # Positive context
            "expected_score": context["expected_score"],  # Should be 1
        }
        for context in row["expected_chunks"]
    ]

    # Negative examples: random contexts that belong to other queries
    unrelated_contexts = [
        text for query, text in all_contexts if query != current_query
    ]
    negative_examples = [
        {
            "query": current_query,
            "expected_response": random.choice(unrelated_contexts),  # Negative context
            "expected_score": 0,  # Negative example, score = 0
        }
        for _ in positive_examples
    ]

    # Positives first, then negatives, for every query
    hotpotqa_subset_for_context_relevance.extend(positive_examples)
    hotpotqa_subset_for_context_relevance.extend(negative_examples)

# Extract the labels before the record lists are replaced by DataFrames
hotpotqa_subset_for_context_relevance_true_labels = [
    entry["expected_score"] for entry in hotpotqa_subset_for_context_relevance
]
hotpotqa_subset_for_answer_relevance_true_labels = [
    entry["expected_score"] for entry in hotpotqa_subset_for_answer_relevance
]

hotpotqa_subset_for_context_relevance = pd.DataFrame(
    hotpotqa_subset_for_context_relevance
)
hotpotqa_subset_for_answer_relevance = pd.DataFrame(
    hotpotqa_subset_for_answer_relevance
)

In [None]:
# Display the HotpotQA context-relevance DataFrame as the cell output.
hotpotqa_subset_for_context_relevance

In [None]:
# Fixed 200-row SummEval subset and its ground-truth scores, in row order
summeval_subset = summeval.sample(n=200, random_state=42)
summeval_subset_true_labels = summeval_subset["expected_score"].tolist()

### Set up feedback LLM providers 

We will experiment with 2 current OpenAI models and a mix of commercial and open-source models available in Cortex.

In [None]:
import os

import snowflake.connector
from trulens.providers.cortex import Cortex
from trulens.providers.openai import OpenAI

# Snowflake credentials are read from environment variables; a missing
# variable raises KeyError here, before any connection is attempted.
snowflake_connection_parameters = {
    "account": os.environ["SNOWFLAKE_ACCOUNT"],
    "user": os.environ["SNOWFLAKE_USER"],
    "password": os.environ["SNOWFLAKE_USER_PASSWORD"],
}
# A single connection object is shared by all Cortex providers below.
snowflake_connection = snowflake.connector.connect(
    **snowflake_connection_parameters
)


# OpenAI-backed feedback providers.
# NOTE(review): presumably these pick up the OpenAI API key from the
# environment — confirm against the provider documentation.
gpt_4o = OpenAI(model_engine="gpt-4o")
gpt_4o_mini = OpenAI(model_engine="gpt-4o-mini")

# Cortex-backed feedback providers, one per model engine.
snowflake_arctic = Cortex(snowflake_connection, model_engine="snowflake-arctic")
mistral_large = Cortex(snowflake_connection, model_engine="mistral-large")
llama3_1_8b = Cortex(snowflake_connection, model_engine="llama3.1-8b")

# Provider groups; ALL_PROVIDERS lists the Cortex providers first.
CORTEX_PROVIDERS = [snowflake_arctic, llama3_1_8b, mistral_large]
OPENAI_PROVIDERS = [gpt_4o, gpt_4o_mini]
ALL_PROVIDERS = CORTEX_PROVIDERS + OPENAI_PROVIDERS

In [None]:
from trulens.core import TruSession

# Create the TruLens session used by the experiment apps.
# NOTE(review): reset_database() presumably wipes previously logged records
# so the runs start from a clean state — confirm before running against a
# database you want to keep.
session = TruSession()
session.reset_database()

### Snowflake IT dataset experiment runs:


In [None]:
import math
import time
from typing import List, Tuple

from trulens.apps.basic import TruBasicApp


def write_results(
    feedback_scores: List[float],
    labels: List[float | int],
    latencies: List[float],
    file_name: str,
):
    assert len(feedback_scores) == len(labels)

    with open(file_name, mode="w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(feedback_scores)
        writer.writerow(labels)
        writer.writerow(latencies)


def read_results(
    file_name: str,
) -> Tuple[List[float | int], List[float | int], List[float]]:
    with open(file_name, "r") as file:
        reader = csv.reader(file)
        for index, row in enumerate(reader):
            if index == 0:
                # First row contains scores
                scores = list(map(float, row))  # Convert strings to floats
            elif index == 1:
                # Second row contains labels
                labels = list(map(float, row))  # Convert strings to floats
            elif index == 2:
                # Third row contains latencies
                latencies = list(map(float, row))
    return scores, labels, latencies


def run_feedback_experiment(
    feedback_func_wrapper, app_name, app_version, dataset_df, true_labels
):
    """Score every dataset row with a feedback function and save the results.

    The feedback function is wrapped in a ``TruBasicApp`` and called once per
    row with the row's ``query`` and ``expected_response`` values. For every
    call that returns a non-NaN score, the score, the matching ground-truth
    label and the wall-clock latency of the call are collected. Rows whose
    call raises are reported with ``print`` and skipped. The collected values
    are written to ``results/{app_name}_{app_version}_results.csv``.

    Args:
        feedback_func_wrapper: Callable taking ``(query, expected_response)``
            and returning a numeric score.
        app_name: App name passed to ``TruBasicApp``; part of the file name.
        app_version: App version passed to ``TruBasicApp``; part of the file
            name.
        dataset_df: DataFrame with ``query`` and ``expected_response``
            columns.
        true_labels: Ground-truth labels, positionally aligned with
            ``dataset_df``.

    Raises:
        ValueError: If ``dataset_df`` and ``true_labels`` differ in length.
    """
    if len(dataset_df) != len(true_labels):
        raise ValueError("dataset df must have the same length as labels")

    tru_wrapped_basic_app = TruBasicApp(
        feedback_func_wrapper, app_name=app_name, app_version=app_version
    )

    generated_scores = []
    labels = []
    latencies = []

    for i in range(len(dataset_df)):
        record = dataset_df.iloc[i]
        arg_1 = record["query"]
        arg_2 = record["expected_response"]
        try:
            with tru_wrapped_basic_app as _:
                started_at = time.time()
                score = tru_wrapped_basic_app.app(arg_1, arg_2)
                elapsed = time.time() - started_at

                true_score = true_labels[i]

                # NaN scores are dropped together with their label/latency
                if math.isnan(score):
                    continue

                generated_scores.append(score)
                labels.append(true_score)
                latencies.append(elapsed)
        except Exception as e:
            # Best effort: report the failing row and move on to the next one
            print(
                f"Error {e} in run_feedback_experiment row {i} with first arg {arg_1} and second arg {arg_2}"
            )

    write_results(
        generated_scores,
        labels,
        latencies,
        f"results/{app_name}_{app_version}_results.csv",
    )

In [None]:
# Smoke test 1: groundedness of a double-negation statement with binary
# criteria (max_score_val=1), arguments passed by keyword.
score = gpt_4o_mini.groundedness_measure_with_cot_reasons(
    source="This is a test. Earth is round",
    statement="Earth is not not round",
    criteria="A grounded response to the query should get a score of 1, and an ungrounded response should get a score of 0. The score can only be either 0 or 1 (binary).",
    max_score_val=1,
)
print(score)
# Smoke test 2: the same source/statement pair passed positionally, with
# custom multi-line criteria and max_score_val=3.
score = gpt_4o_mini.groundedness_measure_with_cot_reasons(
    "This is a test. Earth is round",
    "Earth is not not round",
    criteria=""" You should score the groundedness of the statement based on the following criteria:
    - Statements that are directly supported by the source should be considered grounded and should get a high score.
    - Statements that are not directly supported by the source should be considered not grounded and should get a low score.
    - Statements of doubt, that admissions of uncertainty or not knowing the answer are considered abstention, and should be counted as the most overlap and therefore get a max score.""",
    max_score_val=3,
)
print(score)

In [None]:
from trulens.feedback.v2.feedback import Groundedness

# Keep references to the criteria and system prompt defined on the
# Groundedness feedback class, for inspection and comparison.
default_groundedness_criteria = Groundedness.criteria
default_groundedness_system_prompt = Groundedness.system_prompt

# Custom criteria text passed to the groundedness feedback when it is scored
# with max_score_val=3 (used by `groundedness_likert_4`).
likert4_groundedness_criteria = """You should score the groundedness of the statement based on the following criteria:
    - Statements that are directly supported by the source should be considered grounded and should get a high score.
    - Statements that are not directly supported by the source should be considered not grounded and should get a low score.
    - Statements of doubt, that admissions of uncertainty or not knowing the answer are considered abstention, and should be counted as the most overlap and therefore get a max score."""

In [None]:
# Display the default groundedness system prompt as the cell output.
default_groundedness_system_prompt

In [None]:
import concurrent.futures
import traceback

from trulens.feedback.v2.feedback import Groundedness


def runn_all_experiments_for_provider(provider):
    """
    Runs all experiments for a given provider.

    Builds provider-bound feedback wrappers (context relevance, answer
    relevance and groundedness), pairs each with the module-level benchmark
    datasets and their true labels, and runs the enabled experiments one
    after another through `run_feedback_experiment`, which writes one
    results CSV per experiment.

    Args:
        provider: Feedback provider exposing `model_engine`,
            `context_relevance_with_cot_reasons`, `relevance` and
            `groundedness_measure_with_cot_reasons`.

    Returns:
        None.
    """

    def context_relevance_binary(input, output) -> float:
        """Score `output` as context for the question `input` on a 0/1 scale."""
        # NOTE(review): `[0]` presumably selects the score from a
        # (score, reasons) result — confirm against the provider API.
        return provider.context_relevance_with_cot_reasons(
            question=input,
            context=output,
            criteria="A relevant context to the question should get a score of 1, and an irrelevant context should get a score of 0. The score can only be either 0 or 1 (binary).",
            min_score_val=0,
            max_score_val=1,
        )[0]

    # Run context relevance binary experiment
    # context_relevance_binary_run = create_benchmark_experiment_app(
    #     app_name=f"Snowflake IT  - {provider.model_engine}",
    #     app_version='context_relevance_binary',
    #     benchmark_experiment=TruBenchmarkExperiment(
    #         feedback_fn=context_relevance_binary,
    #         agg_funcs=snowflake_it_metrics,
    #         benchmark_params=benchmark_params
    #     )
    # )

    # with context_relevance_binary_run as recording:
    #     feedback_res = context_relevance_binary_run.app(snowflake_it_for_context_relevance)
    #     print(f'feedback results: {feedback_res}')
    #     write_results(feedback_scores=feedback_res, file_name=f"results/{provider.model_engine}-context_relevance_binary_feedback_scores.csv")

    # # Similar pattern for answer relevance binary experiment
    def answer_relevance_binary(input, output) -> float:
        """Score `output` as a response to the prompt `input` on a 0/1 scale."""
        # Unlike the other wrappers, the provider result is returned as-is
        # (no `[0]` indexing).
        return provider.relevance(
            prompt=input,
            response=output,
            criteria="A relevant response to the prompt should get a score of 1, and an irrelevant response should get a score of 0. The score can only be either 0 or 1 (binary).",
            min_score_val=0,
            max_score_val=1,
        )

    # answer_relevance_binary_run = create_benchmark_experiment_app(
    #     app_name=f"Snowflake IT - {provider.model_engine}",
    #     app_version='answer_relevance_binary',
    #     benchmark_experiment=TruBenchmarkExperiment(
    #         feedback_fn=answer_relevance_binary,
    #         agg_funcs=snowflake_it_metrics,
    #         benchmark_params=benchmark_params
    #     )
    # )

    # with answer_relevance_binary_run as recording:
    #     feedback_res = answer_relevance_binary_run.app(snowflake_it_for_answer_relevance)
    #     write_results(feedback_scores=feedback_res, file_name=f"results/{provider.model_engine}-answer_relevance_binary_feedback_scores.csv")

    def groundedness_binary(input, output) -> float:
        """Score the statement `output` against the source `input` on a 0/1 scale."""
        return provider.groundedness_measure_with_cot_reasons(
            source=input,
            statement=output,
            criteria="A grounded response to the query should get a score of 1, and an ungrounded response should get a score of 0. The score can only be either 0 or 1 (binary).",
            min_score_val=0,
            max_score_val=1,
            use_sent_tokenize=True,
        )[0]

    def groundedness_likert_4(input, output) -> float:
        """Score the statement `output` against the source `input` on a 0-3 scale."""
        # Currently only referenced by the commented-out experiments below.
        return provider.groundedness_measure_with_cot_reasons(
            source=input,
            statement=output,
            use_sent_tokenize=True,
            min_score_val=0,
            max_score_val=3,
            criteria=likert4_groundedness_criteria,
        )[0]

    # Define a function to wrap the run_feedback_experiment call
    def run_experiment(
        feedback_fn, app_name, app_version, dataset_df, true_labels
    ):
        """Forward one experiment description to `run_feedback_experiment`."""
        run_feedback_experiment(
            feedback_func_wrapper=feedback_fn,
            app_name=app_name,
            app_version=app_version,
            dataset_df=dataset_df,
            true_labels=true_labels,
        )

    # Each experiment is a dict of keyword values for `run_experiment`.
    # `app_name` and `app_version` also determine the results file name.
    context_relevance_experiments = [
        {
            "feedback_fn": context_relevance_binary,
            "app_name": f"Snowflake IT balanced - {provider.model_engine}",
            "app_version": "context_relevance_binary",
            "dataset_df": snowflake_it_for_context_relevance,
            "true_labels": snowflake_it_for_context_relevance_true_labels,
        },
        {
            "feedback_fn": context_relevance_binary,
            "app_name": f"Hotpot QA (800 samples) - {provider.model_engine}",
            "app_version": "context_relevance_binary",
            "dataset_df": hotpotqa_subset_for_context_relevance,
            "true_labels": hotpotqa_subset_for_context_relevance_true_labels,
        },
        # {
        #     # MS MARCO V2 for context relevance
        #     'feedback_fn': context_relevance_binary,
        #     'app_name': f"MS MARCO V2 balanced (300 samples) - {provider.model_engine}",
        #     'app_version': 'context_relevance_binary',
        #     'dataset_df': ms_marco_balanced_sample_300,
        #     'true_labels': [row["expected_score"] for _, row in ms_marco_balanced_sample_300.iterrows()]
        # },
    ]

    groundedness_experiments = [
        {
            "feedback_fn": groundedness_binary,
            "app_name": f"Snowflake IT balanced - {provider.model_engine}",
            "app_version": "groundedness_binary",
            "dataset_df": snowflake_it_for_groundness,
            "true_labels": snowflake_it_for_groundness_true_labels,
        },
        # {
        #     'feedback_fn': groundedness_likert_4,
        #     'app_name': f"SummEval (200 samples) - {provider.model_engine}",
        #     'app_version': 'groundedness_likert4',
        #     'dataset_df': summeval_subset,
        #     'true_labels': [row["expected_score"] for _, row in summeval_subset.iterrows()]
        # },
        # {
        #     'feedback_fn': groundedness_likert_4,
        #     'app_name': f"QAGS CNN_DM - {provider.model_engine}",
        #     'app_version': 'groundedness_likert4',
        #     'dataset_df': qags_cnn_dm,
        #     'true_labels': qags_cnn_dm_true_labels
        # },
        # {
        #     'feedback_fn': groundedness_likert_4,
        #     'app_name': f"QAGS XSum - {provider.model_engine}",
        #     'app_version': 'groundedness_likert4',
        #     'dataset_df': qags_xsum,
        #     'true_labels': qqags_xsum_true_labels
        # }
    ]
    answer_relevance_experiments = [
        {
            "feedback_fn": answer_relevance_binary,
            "app_name": f"Snowflake IT balanced - {provider.model_engine}",
            "app_version": "answer_relevance_binary",
            "dataset_df": snowflake_it_for_answer_relevance,
            "true_labels": snowflake_it_answer_relevance_true_labels,
        },
        {
            "feedback_fn": answer_relevance_binary,
            "app_name": f"Hotpot QA (400 samples) - {provider.model_engine}",
            "app_version": "answer_relevance_binary",
            "dataset_df": hotpotqa_subset_for_answer_relevance,
            "true_labels": hotpotqa_subset_for_answer_relevance_true_labels,
        },
    ]

    # Experiments run sequentially within a provider: answer relevance first,
    # then context relevance, then groundedness.
    for exp in (
        answer_relevance_experiments
        + context_relevance_experiments
        + groundedness_experiments
    ):
        print(f"Running experiment: {exp['app_name']} - {exp['app_version']}")
        if "groundedness" in exp["app_version"]:
            # Log the system prompt in effect for groundedness runs
            print(f"Groundedness system prompt: {Groundedness.system_prompt}")

        run_experiment(
            exp["feedback_fn"],
            exp["app_name"],
            exp["app_version"],
            exp["dataset_df"],
            exp["true_labels"],
        )


# for provider in ALL_PROVIDERS:
#     runn_all_experiments_for_provider(provider)
#     # Run the experiments in parallel using ThreadPoolExecutor
#     with concurrent.futures.ThreadPoolExecutor() as executor:
#         # Submit tasks to the executor
#         futures = [
#             executor.submit(run_experiment,
#                             exp['feedback_fn'],
#                             exp['app_name'],
#                             exp['app_version'],
#                             exp['dataset_df'],
#                             exp['true_labels'])
#             for exp in experiments
#         ]

#         # Optionally, gather results or handle exceptions
#         for future in concurrent.futures.as_completed(futures):
#             try:
#                 future.result()  # This will re-raise any exceptions caught during execution
#             except Exception as e:
#                 traceback.print_exc()
#                 print(f"An error occurred: {e}")


# Run the full experiment suite for every provider, one worker thread each
with concurrent.futures.ThreadPoolExecutor() as pool:
    pending_runs = [
        pool.submit(runn_all_experiments_for_provider, provider)
        for provider in ALL_PROVIDERS
    ]

    # Surface failures as soon as each provider's run finishes
    for finished_run in concurrent.futures.as_completed(pending_runs):
        try:
            # .result() re-raises any exception raised inside the worker
            result = finished_run.result()
        except Exception as e:
            traceback.print_exc()
            print(f"An error occurred: {e}")

In [None]:
from trulens.feedback import GroundTruthAggregator

# Latencies above this many seconds are treated as outliers and left out of
# the average latency.
MAX_LATENCY_SECONDS = 20

# For each provider, load its saved context-relevance (binary) results and
# print classification metrics plus the average latency.
for provider_name in [
    "gpt-4o-mini",
    "gpt-4o",
    "snowflake-arctic",
    "llama3.1-8b",
    "mistral-large",
]:
    file_path = f"/Users/dhuang/Documents/git/trulens/src/benchmark/trulens/benchmark/benchmark_frameworks/experiments/results/Snowflake IT balanced - {provider_name}_context_relevance_binary_results.csv"
    scores, labels, latencies = read_results(file_path)

    # One aggregator over the raw labels serves every metric except Cohen's
    # kappa, which is computed against labels thresholded at 0.5.
    aggregator = GroundTruthAggregator(labels)
    binary_labels = [1 if label >= 0.5 else 0 for label in labels]

    recall = aggregator.recall(scores)
    precision = aggregator.precision(scores)
    f1_score = aggregator.f1_score(scores)
    mae = aggregator.mae(scores)
    pearson = aggregator.pearson_correlation(scores)
    spearman = aggregator.spearman_correlation(scores)
    cohens_kappa = GroundTruthAggregator(binary_labels).cohens_kappa(scores)
    # auc = aggregator.auc(scores)

    # Build a filtered list rather than calling remove() while iterating:
    # removing during iteration skips the element that follows each removed
    # one, so back-to-back outliers would slip through.
    latencies = [
        latency for latency in latencies if latency <= MAX_LATENCY_SECONDS
    ]
    # Every sample may have been an outlier (or the file may be empty).
    avg_latency = (
        sum(latencies) / len(latencies) if latencies else float("nan")
    )

    # print(f"{provider_name}: mae: {mae:.4f}, pearson: {pearson:.4f}, spearman: {spearman:.4f}, Cohen's Kappa: {cohens_kappa:.4f}")
    print(
        f"{provider_name}: recall: {recall:.4f}, precision: {precision:.4f}, f1: {f1_score:.4f}, Cohen's Kappa: {cohens_kappa:.4f}, avg_latency: {avg_latency:.4f}"
    )
    print("\n")

### TruLens vs RAGAS comparison

RAGAS metrics and their TruLens equivalents:

- faithfulness (RAGAS) <-> groundedness (TruLens)



In [None]:
import os

from datasets import Dataset
from ragas import evaluate
from ragas.cost import get_token_usage_for_openai
from ragas.llms import llm_factory
from ragas.metrics import faithfulness

# LLM handle built by ragas' llm_factory for the "gpt-4o-mini" model; the
# same object is passed explicitly to evaluate() in later cells.
langchain_llm = llm_factory(model="gpt-4o-mini")

# Point the shared `faithfulness` metric object at that LLM as well.
faithfulness.llm = langchain_llm


# data_samples = {
#     'question': ['When was the first super bowl?', 'Who won the most super bowls?'],
#     'answer': ['The first superbowl was held on Jan 15, 1967', 'The most super bowls have been won by The New England Patriots'],
#     'contexts' : [['The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles,'],
#     ['The Green Bay Packers...Green Bay, Wisconsin.','The Packers compete...Football Conference']],
# }

# dataset = Dataset.from_dict(data_samples)

# score = evaluate(dataset,metrics=[faithfulness], llm=langchain_llm,  token_usage_parser=get_token_usage_for_openai,
# )

In [None]:
# Total cost of the RAGAS run. Rates are per token and use the gpt-4o-mini
# pricing (0.15 / 0.60 per 1M input / output tokens) that the other cost
# calculations in this notebook use, because `langchain_llm` is configured as
# gpt-4o-mini; the previous 5 / 15 per 1M rates did not match that model.
score.total_cost(
    cost_per_input_token=0.15 / 1e6, cost_per_output_token=0.6 / 1e6
)

In [None]:
def trulens_groundedness(input, output) -> float:
    """Score how well `output` is grounded in `input`.

    Delegates to `gpt_4o_mini.groundedness_measure_with_cot_reasons` with
    sentence tokenization enabled, a 0-3 score range and the
    `likert4_groundedness_criteria` criteria.

    Args:
        input: Source text, forwarded as `source`.
        output: Statement to check, forwarded as `statement`.

    Returns:
        The first element of the provider's result.
    """
    scoring_options = {
        "use_sent_tokenize": True,
        "min_score_val": 0,
        "max_score_val": 3,
        "criteria": likert4_groundedness_criteria,
    }
    feedback_result = gpt_4o_mini.groundedness_measure_with_cot_reasons(
        source=input,
        statement=output,
        **scoring_options,
    )
    # Only the leading element is returned; the rest of the result is dropped.
    return feedback_result[0]

In [None]:
def ragas_experiment(
    dataset_df,
):
    """Run RAGAS faithfulness over a golden dataset and print the average cost.

    Each row's `query` is used as the single context and its
    `expected_response` as the answer; the row index (as a string) stands in
    for the question.

    Args:
        dataset_df: DataFrame with `query` and `expected_response` columns.

    Returns:
        The result object returned by ragas `evaluate`.
    """
    data_samples = {"question": [], "answer": [], "contexts": []}
    for i, row in dataset_df.iterrows():
        data_samples["question"].append(str(i))
        data_samples["answer"].append(row["expected_response"])
        data_samples["contexts"].append([row["query"]])

    ragas_dataset = Dataset.from_dict(data_samples)

    score = evaluate(
        ragas_dataset,
        metrics=[faithfulness],
        llm=langchain_llm,
        token_usage_parser=get_token_usage_for_openai,
    )

    # Average over the number of samples actually evaluated; a hard-coded
    # divisor is only correct for datasets of exactly that size.
    num_samples = len(data_samples["answer"])
    total_cost = score.total_cost(
        cost_per_input_token=0.15 / 1e6, cost_per_output_token=0.6 / 1e6
    )
    avg_cost = total_cost / num_samples if num_samples else 0.0
    print(f"Average cost per sample: {avg_cost}")

    return score


def trulens_experiment(
    dataset_df,
):
    """Score every row of a golden dataset with `trulens_groundedness`.

    Each row's `query` is used as the source text and its
    `expected_response` as the statement to check.

    Args:
        dataset_df: DataFrame with `query` and `expected_response` columns.

    Returns:
        NumPy array with one groundedness score per row, in row order.
    """
    import numpy as np

    # Collect all (source, statement) pairs first, then score them, so every
    # row is read before the first scoring call is made.
    source_statement_pairs = [
        (record["query"], record["expected_response"])
        for _, record in dataset_df.iterrows()
    ]
    return np.array([
        trulens_groundedness(source, statement)
        for source, statement in source_statement_pairs
    ])


# Run the RAGAS faithfulness experiment on both QAGS datasets; each call also
# prints its average cost per sample.
ragas_cnn_score = ragas_experiment(qags_cnn_dm)
ragas_xsum_score = ragas_experiment(qags_xsum)

In [None]:
# Display the RAGAS results for QAGS CNN/DM in tabular form.
ragas_cnn_score.to_pandas()

In [None]:
# Load the saved TruLens gpt-4o-mini groundedness (likert4) results for the
# two QAGS datasets. NOTE: `latencies` is overwritten by the second call, so
# only the XSum latencies remain afterwards.
trulens_cnn_scores, cnn_labels, latencies = read_results(
    "/Users/dhuang/Documents/git/trulens/src/benchmark/trulens/benchmark/benchmark_frameworks/experiments/results/QAGS CNN_DM - gpt-4o-mini_groundedness_likert4_results.csv"
)
trulens_xsum_scores, xsum_labels, latencies = read_results(
    "/Users/dhuang/Documents/git/trulens/src/benchmark/trulens/benchmark/benchmark_frameworks/experiments/results/QAGS XSum - gpt-4o-mini_groundedness_likert4_results.csv"
)

In [None]:
# Sanity check: number of QAGS CNN/DM labels loaded from the results file.
len(cnn_labels)

In [None]:
import numpy as np

# Mean absolute error of TruLens groundedness and RAGAS faithfulness on
# QAGS CNN/DM.
# NOTE(review): the TruLens error is measured against `cnn_labels` (from the
# results file) while the RAGAS error uses `qags_cnn_dm_true_labels` -- confirm
# both hold the same labels in the same order.
true_scores = np.array(cnn_labels)

trulens_abs_errors = np.abs(trulens_cnn_scores - true_scores)
mae_trulens = np.mean(trulens_abs_errors)

ragas_faithfulness = ragas_cnn_score.to_pandas()["faithfulness"]
ragas_abs_errors = np.abs(ragas_faithfulness - qags_cnn_dm_true_labels)
mae_ragas = np.mean(ragas_abs_errors)

print(f"Trulens MAE: {mae_trulens:.4f}, Ragas MAE: {mae_ragas:.4f}")

In [None]:
# Convert the SummEval subset into the question / answer / contexts layout
# passed to RAGAS: the row index (as a string) is the question, the
# `expected_response` is the answer and the `query` is the only context.
summeval_records = [
    (str(idx), record["expected_response"], [record["query"]])
    for idx, record in summeval_subset.iterrows()
]
summeval_ragas_data_samples = {
    "question": [question for question, _, _ in summeval_records],
    "answer": [answer for _, answer, _ in summeval_records],
    "contexts": [contexts for _, _, contexts in summeval_records],
}

summeval_ragas_dataset = Dataset.from_dict(summeval_ragas_data_samples)

# Score faithfulness with the shared LLM, tracking OpenAI token usage so the
# cost can be computed afterwards.
score = evaluate(
    summeval_ragas_dataset,
    metrics=[faithfulness],
    llm=langchain_llm,
    token_usage_parser=get_token_usage_for_openai,
)

In [None]:
# Average RAGAS cost per SummEval sample. Divide by the number of samples that
# were actually evaluated; a hard-coded 200 is only right for a subset of
# exactly that size.
num_samples = len(summeval_ragas_data_samples["contexts"])
total_cost = score.total_cost(
    cost_per_input_token=0.15 / 1e6, cost_per_output_token=0.6 / 1e6
)
avg_cost = total_cost / num_samples if num_samples else 0.0
avg_cost

In [None]:
# Display the RAGAS results for the SummEval subset in tabular form.
score.to_pandas()

In [None]:
# Sanity check: number of SummEval samples prepared for RAGAS.
len(summeval_ragas_data_samples["contexts"])

In [None]:
# Score the same SummEval samples with TruLens groundedness. Scores are
# appended one at a time so that anything computed before a failure is kept.
ff_scores = []
for context_list, answer in zip(
    summeval_ragas_data_samples["contexts"],
    summeval_ragas_data_samples["answer"],
):
    # Each sample carries exactly one context: the source document.
    ff_scores.append(trulens_groundedness(context_list[0], answer))

In [None]:
# Sanity check: number of TruLens groundedness scores collected.
len(ff_scores)

In [None]:
import numpy as np

# Convert the TruLens scores, the RAGAS faithfulness column and the SummEval
# labels to NumPy arrays so the MAE cell can subtract them elementwise.
ff_scores = np.array(ff_scores)
ragas_scores = np.array(score.to_pandas()["faithfulness"])

true_scores = np.array(summeval_subset_true_labels)

In [None]:
# Mean absolute error of each framework's scores against the SummEval labels.
trulens_abs_errors = np.abs(ff_scores - true_scores)
ragas_abs_errors = np.abs(ragas_scores - true_scores)

mae_trulens = np.mean(trulens_abs_errors)
mae_ragas = np.mean(ragas_abs_errors)

print(f"Trulens MAE: {mae_trulens:.4f}, Ragas MAE: {mae_ragas:.4f}")