In [None]:
from trulens.core.session import TruSession

# Create the TruLens session object used for the rest of this notebook.
session = TruSession()
# NOTE(review): reset_database() presumably clears previously recorded apps and
# records so the benchmark starts from a clean slate — confirm before pointing
# this notebook at a shared database.
session.reset_database()

In [None]:
import random

import pandas as pd
from trulens.benchmark.benchmark_frameworks.dataset.beir_loader import (
    TruBEIRDataLoader,
)

# Seed the stdlib RNG so the negative-response sampling below is reproducible.
random.seed(42)

beir_data_loader = TruBEIRDataLoader(data_folder="./", dataset_name="hotpotqa")

# NOTE(review): download=True presumably fetches the BEIR files into
# data_folder when they are not already present — confirm against the loader.
hotpotqa = beir_data_loader.load_dataset_to_df(download=True)

# Fixed-seed sample so the same 200 rows are benchmarked on every run.
hotpotqa_raw_subset = hotpotqa.sample(n=200, random_state=42)

# (query, expected_response) pairs; also the pool negatives are drawn from.
all_responses = [
    (row["query"], row["expected_response"])
    for idx, row in hotpotqa_raw_subset.iterrows()
]

# Records are accumulated in a plain list under its own name and converted to
# a DataFrame once at the end, so no name changes type half-way through.
answer_relevance_records = []

for query, positive_response in all_responses:
    # Positive example: the query paired with its own expected response.
    answer_relevance_records.append({
        "query": query,
        "expected_response": positive_response,  # Positive response
        "expected_score": 1,  # Positive example, score = 1
    })

    # Negative example: a response that belongs to a different query.
    # Candidates equal to this row's own expected response are excluded as
    # well; otherwise a correct answer could end up labelled 0.
    negative_candidates = [
        candidate_response
        for candidate_query, candidate_response in all_responses
        if candidate_query != query and candidate_response != positive_response
    ]
    negative_response = random.choice(negative_candidates)

    answer_relevance_records.append({
        "query": query,
        "expected_response": negative_response,  # Negative response
        "expected_score": 0,  # Negative example, score = 0
    })


# Ground-truth labels, in the same order as the rows of the DataFrame below.
hotpotqa_subset_for_answer_relevance_true_labels = [
    record["expected_score"] for record in answer_relevance_records
]

hotpotqa_subset_for_answer_relevance = pd.DataFrame(answer_relevance_records)

In [None]:
import asyncio

from langchain_openai import OpenAIEmbeddings
from ragas import SingleTurnSample
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import llm_factory
from ragas.metrics import ResponseRelevancy

# LLM handle passed to the ragas ResponseRelevancy scorer in
# ragas_response_relevance (built with ragas' llm_factory).
langchain_llm = llm_factory(model="gpt-4o")
# Embedding model passed to the same scorer: OpenAIEmbeddings() is constructed
# with no arguments and wrapped in ragas' LangchainEmbeddingsWrapper.
evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())


def ragas_response_relevance(query: str, response: str, gt_score: float) -> str:
    """Score ``response`` against ``query`` with ragas ``ResponseRelevancy``.

    Args:
        query: The user question.
        response: The candidate answer to evaluate.
        gt_score: Ground-truth label for this pair; it is not used for scoring
            and is only echoed back in the returned string.

    Returns:
        ``"<ragas score>;<gt_score>"`` — the evaluator's score and the
        ground-truth label joined by a semicolon.
    """
    import concurrent.futures
    import contextvars

    sample = SingleTurnSample(
        user_input=query, response=response, retrieved_contexts=[]
    )

    scorer = ResponseRelevancy(
        llm=langchain_llm, embeddings=evaluator_embeddings
    )

    def _score_blocking():
        # The coroutine is created here so that it is built and awaited on the
        # same thread / event loop.
        return asyncio.run(scorer.single_turn_ascore(sample))

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread: asyncio.run() is safe to call.
        res = _score_blocking()
    else:
        # A loop is already running in this thread (the normal situation in a
        # Jupyter kernel), where asyncio.run() raises RuntimeError. Run the
        # coroutine on a worker thread instead, carrying over the caller's
        # context variables.
        caller_context = contextvars.copy_context()
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
            res = pool.submit(caller_context.run, _score_blocking).result()

    return f"{res};{gt_score}"

In [None]:
# Smoke test: score a single dummy query/response pair (ground-truth label 0)
# before launching the full experiment.
ragas_response_relevance("TEST", "ANSWER", 0)

In [None]:
from trulens.apps.basic import TruBasicApp
from trulens.core import Feedback
from trulens.core import Provider

# Cut-off used by CustomTermFeedback to binarize the ground-truth score:
# a ground-truth score of 0.5 or above counts as a positive (relevant) label.
THRESHOLD = 0.5


# Cut-off applied to the evaluator's own score when turning it into a 0/1
# prediction (the ground-truth score is binarized with THRESHOLD instead).
_FEEDBACK_SCORE_CUTOFF = 0.5


def _parse_scored_output(output: str) -> tuple:
    """Split a ``"<feedback_score>;<gt_score>"`` string into two floats."""
    fields = output.split(";")
    return float(fields[0]), float(fields[1])


def _binarize_scored_output(output: str) -> tuple:
    """Return ``(predicted, actual)`` as 0/1 ints for a scored output string."""
    feedback_score, gt_score = _parse_scored_output(output)
    predicted = 1 if feedback_score >= _FEEDBACK_SCORE_CUTOFF else 0
    actual = 1 if gt_score >= THRESHOLD else 0
    return predicted, actual


class CustomTermFeedback(Provider):
    """Feedback functions over an app output of the form ``"<score>;<gt>"``.

    Each method receives the string produced by the wrapped evaluation
    function (evaluator score and ground-truth score joined by ``;``) and
    returns a single per-record float: a confusion-matrix indicator, the
    absolute error, or one of the two raw scores.
    """

    def true_positive(self, output: str) -> float:
        """Return 1.0 if both prediction and ground truth are positive."""
        return 1.0 if _binarize_scored_output(output) == (1, 1) else 0.0

    def true_negative(self, output: str) -> float:
        """Return 1.0 if both prediction and ground truth are negative."""
        return 1.0 if _binarize_scored_output(output) == (0, 0) else 0.0

    def false_positive(self, output: str) -> float:
        """Return 1.0 if the prediction is positive but the truth is negative."""
        return 1.0 if _binarize_scored_output(output) == (1, 0) else 0.0

    def false_negative(self, output: str) -> float:
        """Return 1.0 if the prediction is negative but the truth is positive."""
        return 1.0 if _binarize_scored_output(output) == (0, 1) else 0.0

    def term_absolute_error(self, output: str) -> float:
        """Return ``|feedback_score - gt_score|`` on the raw scores."""
        feedback_score, gt_score = _parse_scored_output(output)
        return abs(feedback_score - gt_score)

    def raw_gt_score(self, output: str) -> float:
        """Return the ground-truth score (second field) as a float."""
        # Only the second field is parsed, so this still works when the first
        # field is not a valid number.
        return float(output.split(";")[1])

    def raw_feedback_score(self, output: str) -> float:
        """Return the evaluator's score (first field) as a float."""
        return float(output.split(";")[0])


custom_term_feedback = CustomTermFeedback()


def _on_output_feedback(implementation, name, higher_is_better):
    """Wrap a provider method in a named Feedback evaluated on the app output."""
    return Feedback(
        implementation, name=name, higher_is_better=higher_is_better
    ).on_output()


# Confusion-matrix indicators.
f_tp = _on_output_feedback(
    custom_term_feedback.true_positive, "True Positive", True
)
f_tn = _on_output_feedback(
    custom_term_feedback.true_negative, "True Negative", True
)
f_fp = _on_output_feedback(
    custom_term_feedback.false_positive, "False Positive", False
)
f_fn = _on_output_feedback(
    custom_term_feedback.false_negative, "False Negative", False
)

# Error between the evaluator score and the ground truth.
f_abs_err = _on_output_feedback(
    custom_term_feedback.term_absolute_error, "Absolute Error", False
)

# Raw scores, recorded as-is.
f_raw_gt_score = _on_output_feedback(
    custom_term_feedback.raw_gt_score, "Raw GT Score", True
)
f_raw_feedback_score = _on_output_feedback(
    custom_term_feedback.raw_feedback_score, "Raw Feedback Score", True
)

# Every feedback attached to each recorded call, in a fixed order.
CUSTOM_FEEDBACK_FUNCS = [
    f_tp,
    f_tn,
    f_fp,
    f_fn,
    f_abs_err,
    f_raw_gt_score,
    f_raw_feedback_score,
]


def run_experiment_and_record(
    evaluate_func_wrapper, app_name, app_version, dataset_df, true_labels
):
    """Call the evaluation function on every dataset row under a TruBasicApp.

    Args:
        evaluate_func_wrapper: Callable invoked as
            ``(query, expected_response, true_label)`` for each row.
        app_name: App name passed to ``TruBasicApp``.
        app_version: App version passed to ``TruBasicApp``.
        dataset_df: Frame with ``"query"`` and ``"expected_response"`` columns.
        true_labels: Ground-truth labels, indexable by row position and of the
            same length as ``dataset_df``.

    Raises:
        ValueError: If ``dataset_df`` and ``true_labels`` differ in length.
    """
    row_count = len(dataset_df)
    if row_count != len(true_labels):
        raise ValueError("dataset df must have the same length as labels")

    recorder = TruBasicApp(
        evaluate_func_wrapper,
        app_name=app_name,
        app_version=app_version,
        feedbacks=CUSTOM_FEEDBACK_FUNCS,
    )

    for i in range(row_count):
        row = dataset_df.iloc[i]
        arg_1, arg_2 = row["query"], row["expected_response"]
        arg_3 = true_labels[i]

        # Best effort: a failure on one row is reported and the loop moves on.
        try:
            with recorder:
                recorder.app(arg_1, arg_2, arg_3)
        except Exception as e:
            print(
                f"Error {e} in run_feedback_experiment row {i} with first arg {arg_1} and second arg {arg_2}"
            )

In [None]:
# Run the ragas answer-relevance scorer over every positive/negative pair and
# record each call (plus the custom feedbacks) with TruLens.
# NOTE(review): app_name looks like a run date (MMDDYYYY) while app_version
# carries the evaluator name — confirm this is the intended naming.
run_experiment_and_record(
    evaluate_func_wrapper=ragas_response_relevance,
    app_version="ragas-answer-relevance",
    app_name="12202024",
    dataset_df=hotpotqa_subset_for_answer_relevance,
    true_labels=hotpotqa_subset_for_answer_relevance_true_labels,
)