### TruLens vs RAGAS comparison

RAGAS metrics and their TruLens equivalents:

- faithfulness (RAGAS) <-> groundedness (TruLens)



In [None]:
import os

from datasets import Dataset
from ragas import evaluate
from ragas.cost import get_token_usage_for_openai
from ragas.llms import llm_factory
from ragas.metrics import faithfulness
from trulens.providers.openai import OpenAI
import numpy as np

# Judge LLM for RAGAS, built by ragas' llm_factory for gpt-4o-mini.
# NOTE(review): despite the name, this is whatever llm_factory returns --
# confirm whether it is actually a LangChain wrapper.
langchain_llm = llm_factory(model="gpt-4o-mini")

# Attach the judge to the metric object; the same LLM is also passed
# explicitly as `llm=` to the evaluate() calls in the cells below.
faithfulness.llm = langchain_llm

# TruLens provider configured with the same model name as the RAGAS judge.
gpt_4o_mini = OpenAI(model_engine="gpt-4o-mini")
# Custom scoring criteria forwarded as `criteria=` by trulens_groundedness(),
# which requests scores in the 0-3 range (min_score_val=0, max_score_val=3).
# This text is a runtime prompt: editing it changes the evaluation.
likert4_groundedness_criteria = """You should score the groundedness of the statement based on the following criteria:
    - Statements that are directly supported by the source should be considered grounded and should get a high score.
    - Statements that are not directly supported by the source should be considered not grounded and should get a low score.
    - Statements of doubt, that admissions of uncertainty or not knowing the answer are considered abstention, and should be counted as the most overlap and therefore get a max score."""

# data_samples = {
#     'question': ['When was the first super bowl?', 'Who won the most super bowls?'],
#     'answer': ['The first superbowl was held on Jan 15, 1967', 'The most super bowls have been won by The New England Patriots'],
#     'contexts' : [['The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles,'],
#     ['The Green Bay Packers...Green Bay, Wisconsin.','The Packers compete...Football Conference']],
# }

# dataset = Dataset.from_dict(data_samples)

# score = evaluate(dataset,metrics=[faithfulness], llm=langchain_llm,  token_usage_parser=get_token_usage_for_openai,
# )

def trulens_groundedness(input, output) -> float:
    """Score how well ``output`` is grounded in ``input`` with TruLens.

    The parameter names are kept as-is for caller compatibility, even though
    ``input`` shadows the builtin of the same name inside this function.

    Args:
        input: Source text, passed to the provider as ``source``.
        output: Statement to check, passed to the provider as ``statement``.

    Returns:
        The first element of the value returned by
        ``groundedness_measure_with_cot_reasons`` (presumably the score, with
        the chain-of-thought reasons after it -- confirm against the TruLens
        provider documentation).
    """
    feedback_kwargs = {
        "source": input,
        "statement": output,
        "use_sent_tokenize": True,
        # Likert-4 style scale: scores are requested in the 0..3 range.
        "min_score_val": 0,
        "max_score_val": 3,
        "criteria": likert4_groundedness_criteria,
    }
    result = gpt_4o_mini.groundedness_measure_with_cot_reasons(
        **feedback_kwargs
    )
    return result[0]

In [None]:
def ragas_experiment(
    dataset_df,
):
    """Run the RAGAS faithfulness metric over a benchmark DataFrame.

    Each row becomes one RAGAS sample: the row index (as a string) is used as
    the ``question``, ``expected_response`` as the ``answer`` and ``query`` as
    the single entry of ``contexts``.

    Args:
        dataset_df: DataFrame with ``query`` and ``expected_response`` columns.

    Returns:
        The result object returned by ``ragas.evaluate``.

    Side effects:
        Prints the average evaluation cost per sample.
    """
    data_samples = {"question": [], "answer": [], "contexts": []}
    for i, row in dataset_df.iterrows():
        data_samples["question"].append(str(i))
        data_samples["answer"].append(row["expected_response"])
        data_samples["contexts"].append([row["query"]])

    ragas_dataset = Dataset.from_dict(data_samples)

    score = evaluate(
        ragas_dataset,
        metrics=[faithfulness],
        llm=langchain_llm,
        token_usage_parser=get_token_usage_for_openai,
    )

    # Per-token rates of 0.15 and 0.6 per million input/output tokens
    # (presumably the gpt-4o-mini USD list prices -- verify before reuse).
    total_cost = score.total_cost(
        cost_per_input_token=0.15 / 1e6, cost_per_output_token=0.6 / 1e6
    )
    # Average over the samples actually evaluated. The previous hard-coded
    # divisor of 200 was only right for datasets with exactly 200 rows.
    num_samples = len(data_samples["question"])
    avg_cost = total_cost / num_samples if num_samples else 0.0
    print(f"Average cost per sample: {avg_cost}")

    return score


def trulens_experiment(
    dataset_df,
):
    """Score every row of a benchmark DataFrame with TruLens groundedness.

    For each row, ``query`` is used as the source text and
    ``expected_response`` as the statement to check.

    Args:
        dataset_df: DataFrame with ``query`` and ``expected_response`` columns.

    Returns:
        NumPy array holding one ``trulens_groundedness`` result per row, in
        row order.
    """
    # Only the (source, statement) pair is needed here, so the RAGAS-style
    # question/answer/contexts dictionary and the index loop are dropped.
    ff_scores = [
        trulens_groundedness(row["query"], row["expected_response"])
        for _, row in dataset_df.iterrows()
    ]
    return np.array(ff_scores)


# Run the RAGAS faithfulness evaluation on both QAGS datasets.
# NOTE(review): qags_cnn_dm and qags_xsum are not defined in the visible
# cells; they must already be loaded and provide the `query` and
# `expected_response` columns that ragas_experiment() reads.
ragas_cnn_score = ragas_experiment(qags_cnn_dm)
ragas_xsum_score = ragas_experiment(qags_xsum)

In [None]:
# Notebook display: show the RAGAS results for the CNN/DM run in tabular form.
ragas_cnn_score.to_pandas()

In [None]:
# Load previously saved TruLens groundedness results (scores, labels and
# latencies) for both QAGS datasets from CSV files.
# NOTE(review): read_results is not defined in the visible cells.
# NOTE(review): these are absolute paths on one developer's machine; the cell
# will fail elsewhere until they are replaced with a configurable location.
trulens_cnn_scores, cnn_labels, latencies = read_results(
    "/Users/dhuang/Documents/git/trulens/src/benchmark/trulens/benchmark/benchmark_frameworks/experiments/results/QAGS CNN_DM - gpt-4o-mini_groundedness_likert4_results.csv"
)
# NOTE(review): `latencies` is rebound here, so the CNN/DM latencies loaded
# above are discarded -- confirm that is intended.
trulens_xsum_scores, xsum_labels, latencies = read_results(
    "/Users/dhuang/Documents/git/trulens/src/benchmark/trulens/benchmark/benchmark_frameworks/experiments/results/QAGS XSum - gpt-4o-mini_groundedness_likert4_results.csv"
)

In [None]:
# numpy is already imported in the first cell; repeating the import is harmless.
import numpy as np

# Mean absolute error (MAE) of each framework's scores on the CNN/DM data.
true_scores = np.array(cnn_labels)
mae_trulens = np.mean(np.abs(trulens_cnn_scores - true_scores))
# NOTE(review): the RAGAS error is measured against qags_cnn_dm_true_labels
# (not defined in the visible cells), while the TruLens error above uses
# cnn_labels loaded from the results CSV -- confirm both hold the same labels
# in the same order, otherwise the two MAE values are not comparable.
mae_ragas = np.mean(
    np.abs(
        ragas_cnn_score.to_pandas()["faithfulness"] - qags_cnn_dm_true_labels
    )
)

print(f"Trulens MAE: {mae_trulens:.4f}, Ragas MAE: {mae_ragas:.4f}")

In [None]:
# Convert the SummEval subset into the column layout RAGAS expects: the row
# index (as a string) stands in for the question, `expected_response` is the
# answer and `query` is the only context.
summeval_rows = list(summeval_subset.iterrows())
summeval_ragas_data_samples = {
    "question": [str(label) for label, _ in summeval_rows],
    "answer": [record["expected_response"] for _, record in summeval_rows],
    "contexts": [[record["query"]] for _, record in summeval_rows],
}

summeval_ragas_dataset = Dataset.from_dict(summeval_ragas_data_samples)

# Evaluate faithfulness and track token usage so the cost can be computed
# from `score` afterwards.
score = evaluate(
    summeval_ragas_dataset,
    llm=langchain_llm,
    metrics=[faithfulness],
    token_usage_parser=get_token_usage_for_openai,
)


In [None]:
# Average RAGAS evaluation cost per SummEval sample, using per-token rates of
# 0.15 and 0.6 per million input/output tokens (presumably the gpt-4o-mini
# USD list prices -- verify before reuse).
# The divisor is the number of samples that were built for evaluation rather
# than a hard-coded 200, so the average stays correct if the subset changes.
avg_cost = (
    score.total_cost(
        cost_per_input_token=0.15 / 1e6, cost_per_output_token=0.6 / 1e6
    )
    / len(summeval_ragas_data_samples["question"])
)
avg_cost

In [None]:
# Score every SummEval sample with TruLens groundedness. Results are appended
# one at a time so that scores gathered before a failure remain available.
ff_scores = []
for sample_contexts, sample_answer in zip(
    summeval_ragas_data_samples["contexts"],
    summeval_ragas_data_samples["answer"],
):
    # Each contexts entry is a one-element list; its element is the source.
    ff_scores.append(trulens_groundedness(sample_contexts[0], sample_answer))

In [None]:


# Mean absolute error (MAE) of both frameworks against the SummEval labels.
true_scores = np.array(summeval_subset_true_labels)
ff_scores = np.array(ff_scores)
ragas_scores = np.array(score.to_pandas()["faithfulness"])

# NOTE(review): these are plain ndarray means, so a NaN in either score array
# would make the corresponding MAE NaN.
mae_trulens = np.abs(ff_scores - true_scores).mean()
mae_ragas = np.abs(ragas_scores - true_scores).mean()

print(f"Trulens MAE: {mae_trulens:.4f}, Ragas MAE: {mae_ragas:.4f}")