### TruLens vs RAGAS comparison

RAGAS metrics and their TruLens equivalents:

- `faithfulness` (RAGAS) <-> `groundedness` (TruLens)



In [None]:
# ! pip install trulens-core trulens-providers-openai ragas

In [None]:
# os.environ["OPENAI_API_KEY"] = "sk-..."

In [None]:
from datasets import Dataset
import numpy as np
from ragas import evaluate
from ragas.cost import get_token_usage_for_openai
from ragas.llms import llm_factory
from ragas.metrics import faithfulness
from trulens.providers.openai import OpenAI

# RAGAS-side judge model, built with the llm_factory helper from ragas.llms.
langchain_llm = llm_factory(model="gpt-4o-mini")

# Attach the judge model to the faithfulness metric; the same object is also
# passed to evaluate() through its `llm` argument in later cells.
faithfulness.llm = langchain_llm

# TruLens-side provider, configured with the same model name so both
# frameworks are judged by gpt-4o-mini.
gpt_4o_mini = OpenAI(model_engine="gpt-4o-mini")


def trulens_groundedness(input, output) -> float:
    """Evaluate ``output`` against ``input`` with the TruLens groundedness check.

    Args:
        input: Text forwarded to the provider as ``source``.
        output: Text forwarded to the provider as ``statement``.

    Returns:
        The first element of the provider's result. Presumably this is the
        numeric score, with the chain-of-thought reasons after it -- confirm
        against the TruLens provider API.
    """
    result = gpt_4o_mini.groundedness_measure_with_cot_reasons(
        source=input, statement=output, use_sent_tokenize=True
    )
    score = result[0]
    return score


def trulens_answer_relevance(input, output) -> float:
    """Evaluate ``output`` as a response to ``input`` with TruLens relevance.

    Args:
        input: Text forwarded to the provider as ``prompt``.
        output: Text forwarded to the provider as ``response``.

    Returns:
        The first element of the provider's result. Presumably this is the
        numeric score, with the chain-of-thought reasons after it -- confirm
        against the TruLens provider API.
    """
    result = gpt_4o_mini.relevance_with_cot_reasons(prompt=input, response=output)
    score = result[0]
    return score

### Prepare public benchmark datasets

In [None]:
import pandas as pd
from trulens.benchmark.benchmark_frameworks.experiments.dataset_preprocessing import (
    generate_qags_golden_set_groundedness,
)
from trulens.benchmark.benchmark_frameworks.experiments.dataset_preprocessing import (
    generate_summeval_groundedness_golden_set,
)

# QAGS CNN/DM split: records produced by the preprocessing helper from a local
# JSONL file, materialized into a DataFrame.
qags_cnn_dm = pd.DataFrame(
    list(generate_qags_golden_set_groundedness("data/qags_mturk_cnndm.jsonl"))
)

# QAGS XSum split, loaded the same way.
qags_xsum = pd.DataFrame(
    list(generate_qags_golden_set_groundedness("data/qags_mturk_xsum.jsonl"))
)

# SummEval golden set; later cells read its "query", "expected_response" and
# "expected_score" columns.
summeval = pd.DataFrame(
    list(generate_summeval_groundedness_golden_set("data/summeval_test.json"))
)

# Fixed-seed 200-row sample so the SummEval comparison is reproducible.
summeval_subset = summeval.sample(n=200, random_state=42)
# Ground-truth labels in the same row order as the sample. Reading the column
# directly avoids building a Series per row with iterrows() just to pull out
# one field.
summeval_subset_true_labels = summeval_subset["expected_score"].tolist()

In [None]:
def ragas_experiment(
    dataset_df,
):
    """Run the RAGAS faithfulness metric over a groundedness golden set.

    Each row's ``query`` is used as the single context and its
    ``expected_response`` as the answer; the row index (as a string) stands in
    for the question. The average cost per sample is printed as a side effect.

    Args:
        dataset_df: DataFrame with ``query`` and ``expected_response`` columns.

    Returns:
        The result object returned by ``ragas.evaluate``.
    """
    data_samples = {"question": [], "answer": [], "contexts": []}
    for i, row in dataset_df.iterrows():
        data_samples["question"].append(str(i))
        data_samples["answer"].append(row["expected_response"])
        data_samples["contexts"].append([row["query"]])

    ragas_dataset = Dataset.from_dict(data_samples)

    score = evaluate(
        ragas_dataset,
        metrics=[faithfulness],
        llm=langchain_llm,
        token_usage_parser=get_token_usage_for_openai,
    )

    total_cost = score.total_cost(
        cost_per_input_token=0.15 / 1e6, cost_per_output_token=0.6 / 1e6
    )
    # Average over the rows actually evaluated. This used to divide by a
    # hard-coded 200, which is only right for a dataset of exactly 200 rows.
    num_samples = len(dataset_df)
    avg_cost = total_cost / num_samples if num_samples else 0.0
    print(f"Average cost per sample: {avg_cost}")

    return score


def trulens_experiment(
    dataset_df,
):
    """Score every row of a groundedness golden set with TruLens.

    Each row's ``query`` is passed as the source text and its
    ``expected_response`` as the statement to ``trulens_groundedness``.

    Args:
        dataset_df: DataFrame with ``query`` and ``expected_response`` columns.

    Returns:
        A NumPy array with one groundedness result per row, in row order.
    """
    # The earlier version first copied the rows into a RAGAS-style dict
    # (including an unused "question" list) and then indexed back into it;
    # scoring straight from the rows yields the same values.
    ff_scores = [
        trulens_groundedness(row["query"], row["expected_response"])
        for _, row in dataset_df.iterrows()
    ]
    return np.array(ff_scores)


# Score both QAGS splits with RAGAS faithfulness; each call also prints the
# average cost per sample.
ragas_cnn_score = ragas_experiment(qags_cnn_dm)
ragas_xsum_score = ragas_experiment(qags_xsum)

In [None]:
# Display the CNN/DM RAGAS result as a pandas table (cell output only).
ragas_cnn_score.to_pandas()

In [None]:
from trulens.benchmark.benchmark_frameworks.experiments.dataset_preprocessing import (
    read_results,
)

# TruLens groundedness scores and labels for the QAGS splits are read from CSV
# files under results/ (presumably produced by an earlier benchmark run --
# confirm where these files come from).
# NOTE: `latencies` is rebound by the second call, so only the XSum value is
# left afterwards.
trulens_cnn_scores, cnn_labels, latencies = read_results(
    "results/QAGS CNN_DM - gpt-4o-mini_groundedness_likert4_results.csv"
)
trulens_xsum_scores, xsum_labels, latencies = read_results(
    "results/QAGS XSum - gpt-4o-mini_groundedness_likert4_results.csv"
)

In [None]:
# Mean absolute error of each framework against the QAGS CNN/DM labels.
# Only the CNN/DM split is compared here; the XSum scores loaded and computed
# above are not used in this cell.
true_scores = np.array(cnn_labels)
mae_trulens = np.mean(np.abs(trulens_cnn_scores - true_scores))
# NOTE(review): the RAGAS operand here is a pandas Series, while the SummEval
# comparison converts to a NumPy array first. The two paths may treat NaN
# scores differently -- confirm before comparing the two MAE figures.
mae_ragas = np.mean(
    np.abs(ragas_cnn_score.to_pandas()["faithfulness"] - true_scores)
)

print(f"Trulens MAE: {mae_trulens:.4f}, Ragas MAE: {mae_ragas:.4f}")

In [None]:
# Reshape the SummEval sample into the column layout handed to RAGAS: the row
# label stands in for the question, "expected_response" is the answer, and the
# row's "query" value is the single context.
summeval_ragas_data_samples = {
    "question": [str(label) for label in summeval_subset.index],
    "answer": summeval_subset["expected_response"].tolist(),
    "contexts": [[source] for source in summeval_subset["query"]],
}

summeval_ragas_dataset = Dataset.from_dict(summeval_ragas_data_samples)

# Run RAGAS faithfulness with token usage tracking enabled so the cost can be
# computed from the result afterwards.
score = evaluate(
    summeval_ragas_dataset,
    metrics=[faithfulness],
    llm=langchain_llm,
    token_usage_parser=get_token_usage_for_openai,
)

In [None]:
# Average RAGAS cost per evaluated SummEval sample. The divisor is taken from
# the sampled frame rather than repeating the literal 200, so it stays correct
# if the sample size is changed.
avg_cost = score.total_cost(
    cost_per_input_token=0.15 / 1e6, cost_per_output_token=0.6 / 1e6
) / len(summeval_subset)
avg_cost

In [None]:
# TruLens groundedness for each SummEval sample, in dataset order. Each
# "contexts" entry is a one-element list, so its first item is the source text.
ff_scores = [
    trulens_groundedness(context_list[0], answer)
    for context_list, answer in zip(
        summeval_ragas_data_samples["contexts"],
        summeval_ragas_data_samples["answer"],
    )
]

In [None]:
# Mean absolute error of each framework against the SummEval labels.
true_scores = np.array(summeval_subset_true_labels)
ff_scores = np.array(ff_scores)
ragas_scores = np.array(score.to_pandas()["faithfulness"])

# Same |prediction - label| average for both frameworks, TruLens first.
mae_trulens, mae_ragas = (
    np.mean(np.abs(predicted - true_scores))
    for predicted in (ff_scores, ragas_scores)
)

print(f"Trulens MAE: {mae_trulens:.4f}, Ragas MAE: {mae_ragas:.4f}")