## TruLens vs RAGAS vs MLflow performance comparison for groundedness

In this notebook, we compare the performance of TruLens' current groundedness feedback function with comparable or equivalent implementations from other evaluation frameworks, using the same model as the LLM judge in every case.


### Definitions
1. TruLens `groundedness`: evaluates whether a response is fully supported by the source or retrieved contexts.

2. RAGAS `faithfulness`: measures the factual consistency of the generated answer against the given context [source](https://docs.ragas.io/en/stable/concepts/metrics/faithfulness.html)

3. MLflow `faithfulness`: assesses how factually consistent the output is with the provided context
[source](https://mlflow.org/docs/latest/python_api/mlflow.metrics.html#mlflow.metrics.genai.faithfulness)





In [None]:
#! pip install -q trulens-core trulens-providers-openai ragas mlflow

In [None]:
import os

# Placeholder value: replace "sk-..." with a real OpenAI API key before running.
os.environ["OPENAI_API_KEY"] = "sk-..."

In [None]:
from trulens.core import TruSession

# Create the TruLens session (also used later to launch the dashboard).
session = TruSession()
# NOTE(review): presumably clears previously recorded apps/records so the
# notebook starts from a clean database — confirm against the TruSession docs.
session.reset_database()

### Prepare 3 public benchmark datasets: QAGS CNN/Daily Mail, QAGS XSum, and SummEval

In [None]:
import pandas as pd
from trulens.benchmark.benchmark_frameworks.experiments.dataset_preprocessing import (
    generate_qags_golden_set_groundedness,
)
from trulens.benchmark.benchmark_frameworks.experiments.dataset_preprocessing import (
    generate_summeval_groundedness_golden_set,
)

def _binarize(labels, threshold=0.5):
    """Return 1 for every label that is >= threshold and 0 otherwise."""
    return [1 if label >= threshold else 0 for label in labels]


# Load the three golden sets; every frame carries the ground-truth label in
# its "expected_score" column.
qags_cnn_dm = pd.DataFrame(
    list(generate_qags_golden_set_groundedness("data/qags_mturk_cnndm.jsonl"))
)

qags_xsum = pd.DataFrame(
    list(generate_qags_golden_set_groundedness("data/qags_mturk_xsum.jsonl"))
)

summeval = pd.DataFrame(
    list(
        generate_summeval_groundedness_golden_set(
            "data/summeval_test.json", max_samples_per_bucket=200
        )
    )
)

# Read the label column directly instead of materializing one Series per row
# with iterrows() just to pick a single field out of it.
summeval_true_labels = summeval["expected_score"].tolist()
summeval_true_labels_binary = _binarize(summeval_true_labels)

qags_cnn_dm_true_labels = qags_cnn_dm["expected_score"].tolist()
qags_cnn_dm_true_labels_binary = _binarize(qags_cnn_dm_true_labels)

qags_xsum_true_labels = qags_xsum["expected_score"].tolist()
qags_xsum_true_labels_binary = _binarize(qags_xsum_true_labels)

# Stack the datasets and their labels in the same order (CNN/DM, XSum,
# SummEval) so that row i of the frame lines up with label i.
combined_dataset = pd.concat(
    [qags_cnn_dm, qags_xsum, summeval], ignore_index=True
)
combined_true_labels = (
    qags_cnn_dm_true_labels + qags_xsum_true_labels + summeval_true_labels
)

assert len(combined_dataset) == len(combined_true_labels)
print(f"Total number of samples: {len(combined_dataset)}")

In [None]:
from trulens.benchmark.benchmark_frameworks.experiments.dataset_preprocessing import (
    visualize_expected_score_distribution,
)

# Visualize the ground-truth score distribution of the combined dataset to
# check that the expected scores are as balanced as possible.
visualize_expected_score_distribution(combined_true_labels)

## Set up experiments with the TruLens `TruBasicApp` recorder

In [None]:
from datasets import Dataset
import mlflow
from mlflow.metrics.genai import faithfulness as faithfulness_mlflow
from ragas import evaluate
from ragas.cost import get_token_usage_for_openai
from ragas.llms import llm_factory
from ragas.metrics import faithfulness as faithfulness_ragas
from trulens.apps.basic import TruBasicApp
from trulens.providers.openai import OpenAI

# Name of the OpenAI model used as the LLM judge throughout this notebook.
OPENAI_LLM_NAME = "gpt-4o-mini"
# TruLens OpenAI provider bound to that model.
gpt_4o_mini = OpenAI(model_engine=OPENAI_LLM_NAME)


def trulens_groundedness(context: str, response: str, gt_score: float) -> str:
    """Score `response` against `context` with the TruLens groundedness measure.

    Calls the provider's chain-of-thought groundedness measure with sentence
    tokenization enabled and keeps only the first element of its result.

    Args:
        context: Source text passed as `source`.
        response: Statement to evaluate, passed as `statement`.
        gt_score: Ground-truth score, echoed back in the return value.

    Returns:
        The string "<score>;<gt_score>".
    """
    outcome = gpt_4o_mini.groundedness_measure_with_cot_reasons(
        source=context,
        statement=response,
        use_sent_tokenize=True,
    )
    score = outcome[0]
    return f"{score};{gt_score}"


# LLM wrapper produced by RAGAS' factory for the shared judge model.
langchain_llm = llm_factory(model=OPENAI_LLM_NAME)
# Attach the judge to the RAGAS faithfulness metric. The MLflow metric is
# configured separately through its `model=` argument, so setting an `llm`
# attribute on the MLflow factory function would have no effect.
faithfulness_ragas.llm = langchain_llm


def ragas_faithfulness(context: str, response: str, gt_score: float) -> str:
    """Score `response` against `context` with the RAGAS faithfulness metric.

    Args:
        context: Source text the response is checked against.
        response: Generated text to evaluate.
        gt_score: Ground-truth score, echoed back in the return value.

    Returns:
        The string "<faithfulness score>;<gt_score>".
    """
    data_samples = {
        # The benchmark rows have no user question; supply placeholder text.
        "question": ["dummy text"],
        "answer": [response],
        # RAGAS expects a list of context passages per sample, so the single
        # context string is wrapped in its own list.
        "contexts": [[context]],
    }
    ragas_dataset = Dataset.from_dict(data_samples)

    score_dict = evaluate(
        ragas_dataset,
        metrics=[faithfulness_ragas],
        llm=langchain_llm,
        token_usage_parser=get_token_usage_for_openai,
    )

    return f"{score_dict['faithfulness']};{gt_score}"


# MLflow faithfulness metric judged by the same OpenAI model. No examples are
# supplied, because the other metrics are evaluated zero-shot as well.
faithfulness_metric = faithfulness_mlflow(
    model=f"openai:/{OPENAI_LLM_NAME}"
)


def mlflow_faithfulness(context: str, response: str, gt_score: float) -> str:
    """Score `response` against `context` with MLflow's faithfulness metric.

    A one-row evaluation frame is scored inside a fresh MLflow run, and the
    reported mean faithfulness is rescaled with (score - 1) / 4.

    Args:
        context: Source text placed in the "context" column.
        response: Generated text placed in the "predictions" column.
        gt_score: Ground-truth score, echoed back in the return value.

    Returns:
        The string "<normalized score>;<gt_score>".
    """
    single_row = pd.DataFrame({
        # Placeholder: the inputs (user queries) are not used for the
        # faithfulness evaluation.
        "inputs": ["dummy text"],
        "predictions": [response],
        "context": [context],
    })

    with mlflow.start_run():
        evaluation = mlflow.evaluate(
            data=single_row,
            predictions="predictions",
            extra_metrics=[faithfulness_metric],
            evaluators="default",
        )

    raw_score = evaluation.metrics["faithfulness/v1/mean"]
    # Normalize the score to be between 0 and 1.
    normalized_score = (raw_score - 1) / 4.0

    return f"{normalized_score};{gt_score}"


def run_experiment_and_record(
    evaluate_func_wrapper, app_name, app_version, dataset_df, true_labels
):
    """Run an evaluation function over every dataset row under a TruBasicApp.

    For each row, the "query" value, the "expected_response" value and the
    matching ground-truth label are passed to the wrapped function. A failure
    on one row is printed and the loop continues with the next row.

    Args:
        evaluate_func_wrapper: Callable wrapped by `TruBasicApp`.
        app_name: App name given to the recorder.
        app_version: App version given to the recorder.
        dataset_df: Frame with "query" and "expected_response" columns.
        true_labels: Ground-truth labels, one per row of `dataset_df`.

    Raises:
        ValueError: If `dataset_df` and `true_labels` differ in length.
    """
    if len(dataset_df) != len(true_labels):
        raise ValueError("dataset df must have the same length as labels")

    recorder = TruBasicApp(
        evaluate_func_wrapper, app_name=app_name, app_version=app_version
    )

    for row_idx, gt_label in enumerate(true_labels):
        row = dataset_df.iloc[row_idx]
        first_arg = row["query"]
        second_arg = row["expected_response"]

        try:
            # Each call runs inside the recorder's context.
            with recorder:
                recorder.app(first_arg, second_arg, gt_label)
        except Exception as exc:
            # Best effort: report the failing row and keep going.
            print(
                f"Error {exc} in run_feedback_experiment row {row_idx} with first arg {first_arg} and second arg {second_arg}"
            )

In [None]:
# Run the three evaluators over the same combined dataset, in the order
# TruLens, RAGAS, MLflow, each recorded under its own app name.
for _evaluator, _app_name in (
    (trulens_groundedness, "trulens-groundedness"),
    (ragas_faithfulness, "ragas-faithfulness"),
    (mlflow_faithfulness, "mlflow-faithfulness"),
):
    run_experiment_and_record(
        evaluate_func_wrapper=_evaluator,
        app_name=_app_name,
        app_version="10302024",
        dataset_df=combined_dataset,
        true_labels=combined_true_labels,
    )

In [None]:
# Launch the TruLens dashboard for this session.
session.run_dashboard()

#### Note about column name mapping: in all our dataframes (CNN/DM, XSum, and SummEval), the `expected_score` column is the ground-truth (true) label for the groundedness score, `query` corresponds to the context, and `expected_response` corresponds to the response.

In [None]:
# Display the combined dataset as the cell output.
combined_dataset

### Benchmarking with real-valued output scores (both TruLens' feedback scores and RAGAS scores are normalized to 0.0 to 1.0)

In [None]:
from trulens.benchmark.benchmark_frameworks.experiments.dataset_preprocessing import (
    read_results,
)

# Load scores, labels and latencies from the saved likert-4 groundedness
# result CSVs for QAGS CNN/DM and QAGS XSum.
# NOTE(review): `latencies` is assigned by both calls, so only the XSum value
# is kept after this cell.
trulens_cnn_scores, cnn_labels, latencies = read_results(
    "results/QAGS CNN_DM - gpt-4o-mini_groundedness_likert4_results.csv"
)
trulens_xsum_scores, xsum_labels, latencies = read_results(
    "results/QAGS XSum - gpt-4o-mini_groundedness_likert4_results.csv"
)

### Benchmarking with binary output scores (both TruLens' feedback scores and RAGAS scores are cast to 0 and 1)

In [None]:
from trulens.benchmark.benchmark_frameworks.experiments.dataset_preprocessing import (
    compute_binary_classification_metrics,
)
from trulens.feedback.groundtruth import GroundTruthAggregator

# QAGS CNN/Daily Mail: load the recorded results, threshold scores and labels
# at 0.5, then report classification metrics and the Spearman correlation of
# the raw (unthresholded) values.
_cnn_dm_results = read_results(
    "results/groundedness-binary-10102024_qags-cnn-dm_results.csv"
)
trulens_cnn_dm_scores, trulens_cnn_dm_labels, trulens_cnn_dm_latencies = (
    _cnn_dm_results
)
trulens_cnn_dm_scores_binary = [
    int(score >= 0.5) for score in trulens_cnn_dm_scores
]
trulens_cnn_dm_labels_binary = [
    int(label >= 0.5) for label in trulens_cnn_dm_labels
]
print(len(trulens_cnn_dm_scores_binary), len(trulens_cnn_dm_labels_binary))

_cnn_dm_aggregator = GroundTruthAggregator(trulens_cnn_dm_labels)
spearman_cor = _cnn_dm_aggregator.spearman_correlation(trulens_cnn_dm_scores)

compute_binary_classification_metrics(
    "TruLens QAGS CNN/Daily Mail",
    trulens_cnn_dm_labels_binary,
    trulens_cnn_dm_scores_binary,
    trulens_cnn_dm_latencies,
)
print(f"TruLens QAGS CNN/Daily Mail: {spearman_cor}")


# QAGS XSum: load the recorded results, threshold scores and labels at 0.5,
# then report classification metrics and the Spearman correlation of the raw
# (unthresholded) values.
_xsum_results = read_results(
    "results/groundedness-binary-10102024_qags-xsum_results.csv"
)
trulens_xsum_scores, xsum_labels, trulens_xsum_latencies = _xsum_results
trulens_xsum_scores_binary = [
    int(score >= 0.5) for score in trulens_xsum_scores
]
trulens_xsum_labels_binary = [int(label >= 0.5) for label in xsum_labels]

_xsum_aggregator = GroundTruthAggregator(xsum_labels)
spearman_cor = _xsum_aggregator.spearman_correlation(trulens_xsum_scores)

print(len(trulens_xsum_scores_binary), len(trulens_xsum_labels_binary))

compute_binary_classification_metrics(
    "TruLens QAGS XSum",
    trulens_xsum_labels_binary,
    trulens_xsum_scores_binary,
    trulens_xsum_latencies,
)
print(f"TruLens QAGS XSum: {spearman_cor}")

# SummEval subset: load the recorded results, threshold scores and labels at
# 0.5, then report classification metrics and the Spearman correlation of the
# raw (unthresholded) values.
trulens_summeval_scores, summeval_labels, trulens_summeval_latencies = (
    read_results(
        "results/groundedness-binary-10102024_summeval-subset_results.csv"
    )
)
trulens_summeval_binary = [
    1 if score >= 0.5 else 0 for score in trulens_summeval_scores
]
trulens_summeval_labels_binary = [
    1 if label >= 0.5 else 0 for label in summeval_labels
]
print(len(trulens_summeval_binary), len(trulens_summeval_labels_binary))

# Call the method with the scores so that `spearman_cor` holds the
# correlation value rather than the bound method object.
spearman_cor = GroundTruthAggregator(summeval_labels).spearman_correlation(
    trulens_summeval_scores
)

compute_binary_classification_metrics(
    "TruLens SummEval subset",
    trulens_summeval_labels_binary,
    trulens_summeval_binary,
    trulens_summeval_latencies,
)
# Report the correlation in the same way as for the two QAGS datasets.
print(f"TruLens SummEval subset: {spearman_cor}")