## TruLens vs RAGAS performance analysis for groundedness

In this notebook, we analyze the performance of TruLens' current groundedness feedback function against comparable or equivalent implementations from other evaluation frameworks (here, RAGAS `faithfulness`), using the same model as the LLM judge.


### Definitions
1. TruLens `groundedness`: evaluates whether a response is fully supported by the source or retrieved contexts.

2. RAGAS `faithfulness`: measures the factual consistency of the generated answer against the given context ([source](https://docs.ragas.io/en/stable/concepts/metrics/faithfulness.html)).





In [None]:
# ! pip install trulens-core trulens-providers-openai ragas

In [None]:
# import os
# os.environ["OPENAI_API_KEY"] = "sk-..."

### Prepare public benchmark datasets

In [None]:
import pandas as pd
from trulens.benchmark.benchmark_frameworks.experiments.dataset_preprocessing import (
    generate_qags_golden_set_groundedness,
)
from trulens.benchmark.benchmark_frameworks.experiments.dataset_preprocessing import (
    generate_summeval_groundedness_golden_set,
)

def _expected_scores(golden_df):
    """Return the ``expected_score`` value of every row, in row order."""
    return [row["expected_score"] for _, row in golden_df.iterrows()]


def _binarize_labels(scores):
    """Map each score to 1 when it is >= 0.5, otherwise to 0."""
    return [1 if score >= 0.5 else 0 for score in scores]


# Load the three golden sets; each generator's output is materialized
# into a DataFrame.
qags_cnn_dm = pd.DataFrame(
    list(generate_qags_golden_set_groundedness("data/qags_mturk_cnndm.jsonl"))
)
qags_xsum = pd.DataFrame(
    list(generate_qags_golden_set_groundedness("data/qags_mturk_xsum.jsonl"))
)
summeval = pd.DataFrame(
    list(
        generate_summeval_groundedness_golden_set(
            "data/summeval_test.json", max_samples_per_bucket=200
        )
    )
)

# summeval_subset = summeval.sample(n=200, random_state=42)
# The full SummEval frame is currently used as the "subset".
summeval_subset = summeval

# Real-valued ground-truth labels and their binary counterparts.
summeval_subset_true_labels = _expected_scores(summeval_subset)
summeval_subset_true_labels_binary = _binarize_labels(
    summeval_subset_true_labels
)

qags_cnn_dm_true_labels = _expected_scores(qags_cnn_dm)
qags_cnn_dm_true_labels_binary = _binarize_labels(qags_cnn_dm_true_labels)

qags_xsum_true_labels = _expected_scores(qags_xsum)
qags_xsum_true_labels_binary = _binarize_labels(qags_xsum_true_labels)

In [None]:
from trulens.benchmark.benchmark_frameworks.experiments.dataset_preprocessing import (
    visualize_expected_score_distribution,
)

# Sanity check that the expected scores are reasonably balanced.
# Only the QAGS XSum labels are inspected in this cell.
visualize_expected_score_distribution(qags_xsum_true_labels)

# Number of QAGS XSum samples (displayed as the cell output).
len(qags_xsum_true_labels)

## Setup TruLens groundedness experiments

In [None]:
import math
import time

from datasets import Dataset
import numpy as np
from trulens.apps.basic import TruBasicApp
from trulens.benchmark.benchmark_frameworks.experiments.dataset_preprocessing import (
    write_results,
)
from trulens.providers.openai import OpenAI

# Shared gpt-4o-mini provider used as the LLM judge by the TruLens
# groundedness wrappers defined in this cell.
gpt_4o_mini = OpenAI(model_engine="gpt-4o-mini")


def trulens_groundedness(input, output) -> float:
    """Score how well ``output`` is grounded in ``input`` with the gpt-4o-mini judge.

    Args:
        input: Source/context text, forwarded as ``source``.
        output: Statement to verify, forwarded as ``statement``.

    Returns:
        The first element of the provider's result, i.e. the score; the
        remaining elements (presumably the chain-of-thought reasons) are
        discarded.
    """
    judged = gpt_4o_mini.groundedness_measure_with_cot_reasons(
        source=input, statement=output, use_sent_tokenize=True
    )
    return judged[0]


def trulens_groundedness_binary(input, output) -> float:
    """Binary (0 or 1) variant of the groundedness check with the gpt-4o-mini judge.

    The score range is restricted to 0..1 and the judge is given a custom
    criteria prompt asking for a strictly binary verdict.

    Args:
        input: Source/context text, forwarded as ``source``.
        output: Statement to verify, forwarded as ``statement``.

    Returns:
        The first element of the provider's result, i.e. the score.
    """
    binary_criteria = "A grounded response to the query should get a score of 1, and an ungrounded response should get a score of 0. The score can only be either 0 or 1 (binary)."
    judged = gpt_4o_mini.groundedness_measure_with_cot_reasons(
        source=input,
        statement=output,
        use_sent_tokenize=True,
        min_score_val=0,
        max_score_val=1,
        criteria=binary_criteria,
    )
    return judged[0]


def run_trulens_experiment(
    feedback_func_wrapper, app_name, app_version, dataset_df, true_labels
):
    """Score every row of ``dataset_df`` with a feedback function and save the results.

    Each row's ``query`` and ``expected_response`` values are passed to
    ``feedback_func_wrapper`` through a ``TruBasicApp`` wrapper. Scores,
    labels and per-row latencies are handed to ``write_results`` with the
    target path ``results/{app_name}_{app_version}_results.csv``.

    Args:
        feedback_func_wrapper: Callable ``(input, output) -> float`` to wrap.
        app_name: Application name given to ``TruBasicApp`` and used in the
            results file name.
        app_version: Application version given to ``TruBasicApp`` and used in
            the results file name.
        dataset_df: Frame with ``query`` and ``expected_response`` columns.
        true_labels: Ground-truth score for each row, in row order.

    Raises:
        ValueError: If ``dataset_df`` and ``true_labels`` differ in length.
    """
    if len(dataset_df) != len(true_labels):
        raise ValueError("dataset df must have the same length as labels")
    tru_wrapped_basic_app = TruBasicApp(
        feedback_func_wrapper, app_name=app_name, app_version=app_version
    )

    generated_scores, labels, latencies = [], [], []
    for i in range(len(dataset_df)):
        row = dataset_df.iloc[i]
        arg_1 = row["query"]
        arg_2 = row["expected_response"]
        true_score = true_labels[i]

        # Bind start_time before the try block: if entering the recording
        # context fails, the latency computation below would otherwise raise
        # UnboundLocalError (first row) or reuse the previous row's start.
        start_time = time.perf_counter()
        try:
            with tru_wrapped_basic_app as _:
                # Restart the clock once recording is active so successful
                # rows time the feedback call itself.
                start_time = time.perf_counter()
                score = tru_wrapped_basic_app.app(arg_1, arg_2)

                if math.isnan(score):
                    score = 0  # if there is an NAN, we assume the score is 0
        except Exception as e:
            print(
                f"Error {e} in run_feedback_experiment row {i} with first arg {arg_1} and second arg {arg_2}"
            )
            score = 0  # if there is an error, we assume the score is 0

        end_time = time.perf_counter()
        generated_scores.append(score)
        labels.append(true_score)
        latencies.append(end_time - start_time)

    write_results(
        generated_scores,
        labels,
        latencies,
        f"results/{app_name}_{app_version}_results.csv",
    )


# One binary-groundedness run per benchmark dataset; all runs share the same
# app name and differ only in version tag, frame and labels.
for _app_version, _dataset_df, _true_labels in (
    ("summeval-subset", summeval_subset, summeval_subset_true_labels),
    ("qags-cnn-dm", qags_cnn_dm, qags_cnn_dm_true_labels),
    ("qags-xsum", qags_xsum, qags_xsum_true_labels),
):
    run_trulens_experiment(
        feedback_func_wrapper=trulens_groundedness_binary,
        app_name="groundedness-binary-10102024",
        app_version=_app_version,
        dataset_df=_dataset_df,
        true_labels=_true_labels,
    )

### Note: in all our dataframes (CNN/DM, XSum, and SummEval), the `expected_score` column is the true label for the groundedness score, `query` corresponds to the context, and `expected_response` corresponds to the response.

In [None]:
# Preview only the first rows instead of dumping the whole frame into the output.
qags_cnn_dm.head()

## Setup RAGAS faithfulness experiments

In [None]:
from ragas import evaluate
from ragas.cost import get_token_usage_for_openai
from ragas.llms import llm_factory
from ragas.metrics import faithfulness

# LLM handle built by ragas' llm_factory for the gpt-4o-mini model, so RAGAS
# uses the same judge model as the TruLens provider.
langchain_llm = llm_factory(model="gpt-4o-mini")

# Attach that LLM to the imported `faithfulness` metric object.
faithfulness.llm = langchain_llm


def ragas_experiment(
    dataset_df,
):
    """Run RAGAS ``faithfulness`` over a golden-set frame and report the average cost.

    Each row becomes one RAGAS sample: the row index (as a string) is the
    ``question``, ``expected_response`` is the ``answer`` and ``query`` is
    the single entry of ``contexts``.

    Args:
        dataset_df: Frame with ``query`` and ``expected_response`` columns.

    Returns:
        The result object returned by ``ragas.evaluate``.
    """
    data_samples = {"question": [], "answer": [], "contexts": []}
    for i, row in dataset_df.iterrows():
        data_samples["question"].append(str(i))
        data_samples["answer"].append(row["expected_response"])
        data_samples["contexts"].append([row["query"]])

    ragas_dataset = Dataset.from_dict(data_samples)

    score = evaluate(
        ragas_dataset,
        metrics=[faithfulness],
        llm=langchain_llm,
        token_usage_parser=get_token_usage_for_openai,
    )

    # hard-coded cost per token values for OpenAI gpt-4o-mini
    total_cost = score.total_cost(
        cost_per_input_token=0.15 / 1e6, cost_per_output_token=0.6 / 1e6
    )
    # Average over the samples actually evaluated rather than a fixed 200,
    # so the figure stays correct for datasets of any size.
    n_samples = len(dataset_df)
    avg_cost = total_cost / n_samples if n_samples else 0.0
    print(f"Average cost per sample: {avg_cost}")

    return score


# RAGAS faithfulness on QAGS CNN/DM, followed by that dataset's labels.
ragas_cnn_score = ragas_experiment(qags_cnn_dm)
qags_cnn_dm_true_labels = [
    row["expected_score"] for _, row in qags_cnn_dm.iterrows()
]

# Use the same `>= 0.5` cut-off as every other label binarization in this
# notebook; a strict `>` here would silently redefine the binary labels
# built in the data-preparation cell (a label of exactly 0.5 would flip).
qags_cnn_dm_true_labels_binary = [
    1 if label >= 0.5 else 0 for label in qags_cnn_dm_true_labels
]

# RAGAS faithfulness on QAGS XSum, followed by that dataset's labels.
ragas_xsum_score = ragas_experiment(qags_xsum)
qags_xsum_true_labels = [
    row["expected_score"] for _, row in qags_xsum.iterrows()
]

qags_xsum_true_labels_binary = [
    1 if label >= 0.5 else 0 for label in qags_xsum_true_labels
]

In [None]:
# Preview only the first rows of the per-sample RAGAS results.
ragas_cnn_score.to_pandas().head()

### Benchmarking with real-valued output scores (both TruLens' feedback scores and RAGAS scores are normalized to 0.0 to 1.0)

In [None]:
from trulens.benchmark.benchmark_frameworks.experiments.dataset_preprocessing import (
    read_results,
)

# Load previously saved TruLens groundedness results (likert4 variant) for the
# two QAGS datasets.
# NOTE(review): these CSV files are not written by any run_trulens_experiment
# call in this notebook (those write "groundedness-binary-10102024_*" files),
# so they must already exist under results/ - confirm where they come from.
# `latencies` is assigned by both calls, so only the XSum latencies remain.
trulens_cnn_scores, cnn_labels, latencies = read_results(
    "results/QAGS CNN_DM - gpt-4o-mini_groundedness_likert4_results.csv"
)
trulens_xsum_scores, xsum_labels, latencies = read_results(
    "results/QAGS XSum - gpt-4o-mini_groundedness_likert4_results.csv"
)

In [None]:
def _mean_abs_error(predicted, actual):
    """Mean of the element-wise absolute difference ``|predicted - actual|``."""
    return np.mean(np.abs(predicted - actual))


# QAGS CNN/DM: TruLens is compared against the labels stored with its
# results; RAGAS is compared against the labels taken from the golden set.
cnn_true_scores = np.array(cnn_labels)
mae_trulens_cnn = _mean_abs_error(trulens_cnn_scores, cnn_true_scores)
mae_ragas_cnn = _mean_abs_error(
    ragas_cnn_score.to_pandas()["faithfulness"], qags_cnn_dm_true_labels
)

print(
    f"QAGS CNN/DM: Trulens MAE: {mae_trulens_cnn:.4f}, Ragas MAE: {mae_ragas_cnn:.4f}"
)


# QAGS XSum: same comparison.
xsum_true_scores = np.array(xsum_labels)
mae_trulens_xsum = _mean_abs_error(trulens_xsum_scores, xsum_true_scores)
mae_ragas_xsum = _mean_abs_error(
    ragas_xsum_score.to_pandas()["faithfulness"], qags_xsum_true_labels
)

print(
    f"QAGS XSum: Trulens MAE: {mae_trulens_xsum:.4f}, Ragas MAE: {mae_ragas_xsum:.4f}"
)

In [None]:
# Build the RAGAS sample dict for SummEval: the row index (as a string) is the
# question, expected_response is the answer, and query is the only context.
_summeval_rows = list(summeval_subset.iterrows())
summeval_ragas_data_samples = {
    "question": [str(row_index) for row_index, _ in _summeval_rows],
    "answer": [
        row_values["expected_response"] for _, row_values in _summeval_rows
    ],
    "contexts": [[row_values["query"]] for _, row_values in _summeval_rows],
}

summeval_ragas_dataset = Dataset.from_dict(summeval_ragas_data_samples)

# Score faithfulness with the shared LLM, tracking token usage so the cost
# can be computed afterwards.
ragas_summeval_scores = evaluate(
    summeval_ragas_dataset,
    metrics=[faithfulness],
    llm=langchain_llm,
    token_usage_parser=get_token_usage_for_openai,
)

In [None]:
# Average RAGAS cost per evaluated SummEval sample. Divide by the number of
# rows actually evaluated instead of a fixed 200: `summeval_subset` is the
# full SummEval frame, whose size is not guaranteed to be 200.
summeval_sample_count = len(summeval_subset)
avg_cost = (
    ragas_summeval_scores.total_cost(
        # hard-coded cost per token values for OpenAI gpt-4o-mini
        cost_per_input_token=0.15 / 1e6,
        cost_per_output_token=0.6 / 1e6,
    )
    / summeval_sample_count
    if summeval_sample_count
    else 0.0
)
avg_cost

### Benchmarking with binary output scores (both TruLens' feedback scores and RAGAS scores are thresholded at 0.5 into 0 or 1)

In [None]:
from trulens.benchmark.benchmark_frameworks.experiments.dataset_preprocessing import (
    compute_binary_classification_metrics,
)
from trulens.feedback.groundtruth import GroundTruthAggregator

# --- QAGS CNN/Daily Mail -----------------------------------------------------
# Load the saved binary-groundedness run, threshold scores and labels at 0.5,
# then report the classification metrics and the Spearman correlation.
trulens_cnn_dm_scores, trulens_cnn_dm_labels, trulens_cnn_dm_latencies = (
    read_results("results/groundedness-binary-10102024_qags-cnn-dm_results.csv")
)
trulens_cnn_dm_scores_binary = [
    1 if score >= 0.5 else 0 for score in trulens_cnn_dm_scores
]
trulens_cnn_dm_labels_binary = [
    1 if label >= 0.5 else 0 for label in trulens_cnn_dm_labels
]
print(len(trulens_cnn_dm_scores_binary), len(trulens_cnn_dm_labels_binary))

spearman_cor = GroundTruthAggregator(
    trulens_cnn_dm_labels
).spearman_correlation(trulens_cnn_dm_scores)

compute_binary_classification_metrics(
    "TruLens QAGS CNN/Daily Mail",
    trulens_cnn_dm_labels_binary,
    trulens_cnn_dm_scores_binary,
    trulens_cnn_dm_latencies,
)
print(f"TruLens QAGS CNN/Daily Mail: {spearman_cor}")


# --- QAGS XSum ---------------------------------------------------------------
# Note: this rebinds `trulens_xsum_scores` and `xsum_labels` to the binary run.
trulens_xsum_scores, xsum_labels, trulens_xsum_latencies = read_results(
    "results/groundedness-binary-10102024_qags-xsum_results.csv"
)
trulens_xsum_scores_binary = [
    1 if score >= 0.5 else 0 for score in trulens_xsum_scores
]
trulens_xsum_labels_binary = [1 if label >= 0.5 else 0 for label in xsum_labels]
spearman_cor = GroundTruthAggregator(xsum_labels).spearman_correlation(
    trulens_xsum_scores
)

print(len(trulens_xsum_scores_binary), len(trulens_xsum_labels_binary))

compute_binary_classification_metrics(
    "TruLens QAGS XSum",
    trulens_xsum_labels_binary,
    trulens_xsum_scores_binary,
    trulens_xsum_latencies,
)
print(f"TruLens QAGS XSum: {spearman_cor}")

# --- SummEval ----------------------------------------------------------------
trulens_summeval_scores, summeval_labels, trulens_summeval_latencies = (
    read_results(
        "results/groundedness-binary-10102024_summeval-subset_results.csv"
    )
)
trulens_summeval_binary = [
    1 if score >= 0.5 else 0 for score in trulens_summeval_scores
]
trulens_summeval_labels_binary = [
    1 if label >= 0.5 else 0 for label in summeval_labels
]
print(len(trulens_summeval_binary), len(trulens_summeval_labels_binary))
# Call spearman_correlation with the scores, as for the two QAGS datasets.
# The original assigned the bound method itself, so the SummEval
# correlation was never computed or reported.
spearman_cor = GroundTruthAggregator(summeval_labels).spearman_correlation(
    trulens_summeval_scores
)

compute_binary_classification_metrics(
    "TruLens SummEval subset",
    trulens_summeval_labels_binary,
    trulens_summeval_binary,
    trulens_summeval_latencies,
)
print(f"TruLens SummEval subset: {spearman_cor}")

In [None]:
# Binarize RAGAS faithfulness with the same `>= 0.5` rule that is applied to
# the TruLens scores and to all labels, so a score of exactly 0.5 is treated
# identically for both frameworks. Any NaN score compares False and therefore
# maps to 0. No latencies are available for RAGAS, hence the empty lists.

# --- QAGS CNN/DM -------------------------------------------------------------
ragas_cnn_dm_scores_binary = [
    1 if score >= 0.5 else 0
    for score in ragas_cnn_score.to_pandas()["faithfulness"]
]
ragas_cnn_dm_labels_binary = [
    1 if label >= 0.5 else 0 for label in qags_cnn_dm_true_labels
]
print(len(ragas_cnn_dm_scores_binary), len(ragas_cnn_dm_labels_binary))
compute_binary_classification_metrics(
    "Ragas QAGS CNN/DM",
    ragas_cnn_dm_labels_binary,
    ragas_cnn_dm_scores_binary,
    [],
)

# --- QAGS XSum ---------------------------------------------------------------
ragas_xsum_scores_binary = [
    1 if score >= 0.5 else 0
    for score in ragas_xsum_score.to_pandas()["faithfulness"]
]
ragas_xsum_labels_binary = [
    1 if label >= 0.5 else 0 for label in qags_xsum_true_labels
]
print(len(ragas_xsum_scores_binary), len(ragas_xsum_labels_binary))
compute_binary_classification_metrics(
    "Ragas QAGS XSum", ragas_xsum_labels_binary, ragas_xsum_scores_binary, []
)

# --- SummEval ----------------------------------------------------------------
ragas_summeval_scores_binary = [
    1 if score >= 0.5 else 0
    for score in ragas_summeval_scores.to_pandas()["faithfulness"]
]
ragas_summeval_labels_binary = [
    1 if label >= 0.5 else 0 for label in summeval_subset_true_labels
]
print(len(ragas_summeval_scores_binary), len(ragas_summeval_labels_binary))
compute_binary_classification_metrics(
    "Ragas SummEval subset",
    ragas_summeval_labels_binary,
    ragas_summeval_scores_binary,
    [],
)