In [None]:
import random

import pandas as pd
from trulens.benchmark.benchmark_frameworks.dataset.beir_loader import (
    TruBEIRDataLoader,
)

# Seed the global RNG so the negative sampling below is reproducible when the
# cell is re-run from a fresh kernel.
random.seed(42)

beir_data_loader = TruBEIRDataLoader(data_folder="./", dataset_name="hotpotqa")

# NOTE(review): presumably downloads the dataset into `data_folder` on first
# use and returns a DataFrame with at least "query" and "expected_response"
# columns (both are read below) — confirm against the loader.
hotpotqa = beir_data_loader.load_dataset_to_df(download=True)

# Fixed-size, fixed-seed sample so that every run scores the same 200 queries.
hotpotqa_raw_subset = hotpotqa.sample(n=200, random_state=42)

# (query, response) pairs used as the pool from which negatives are drawn.
all_responses = [
    (row["query"], row["expected_response"])
    for idx, row in hotpotqa_raw_subset.iterrows()
]


def _normalize_response(response):
    """Return a case- and whitespace-insensitive key for comparing responses."""
    return str(response).strip().lower()


# Records are collected in a plain list first and converted to a DataFrame
# once at the end, so a single name never holds two different kinds of object.
answer_relevance_records = []

for idx, row in hotpotqa_raw_subset.iterrows():
    # Positive example: the query paired with its own expected response.
    answer_relevance_records.append({
        "query": row["query"],
        "expected_response": row["expected_response"],  # Positive response
        "expected_score": 1,  # Positive example, score = 1
    })

    # Negative example: a response borrowed from a different query. Responses
    # whose text matches this row's own answer are excluded as well; otherwise
    # a repeated answer (e.g. the same short answer shared by two queries)
    # could be sampled and a correct response would be labelled 0.
    positive_key = _normalize_response(row["expected_response"])
    negative_candidates = [
        r
        for q, r in all_responses
        if q != row["query"] and _normalize_response(r) != positive_key
    ]

    if not negative_candidates:
        # random.choice raises IndexError on an empty sequence; skip the
        # negative for this row instead of aborting the whole cell.
        print(f"No negative candidate available for query {row['query']!r}")
        continue

    negative_response = random.choice(negative_candidates)

    answer_relevance_records.append({
        "query": row["query"],
        "expected_response": negative_response,  # Negative response
        "expected_score": 0,  # Negative example, score = 0
    })


# Ground-truth labels in the same order as the rows of the DataFrame below.
hotpotqa_subset_for_answer_relevance_true_labels = [
    entry["expected_score"] for entry in answer_relevance_records
]

hotpotqa_subset_for_answer_relevance = pd.DataFrame(answer_relevance_records)

In [None]:
import os

from trulens.core import TruSession
from trulens.providers.openai import AzureOpenAI
from trulens.providers.openai import OpenAI

# Open the TruLens session and reset its database before running experiments.
session = TruSession()
session.reset_database()

# Azure OpenAI connection settings, all taken from environment variables; a
# missing variable raises KeyError at this point.
_azure_openai_settings = {
    "deployment_name": os.environ["AZURE_OPENAI_DEPLOYMENT"],  # gpt-4o
    "api_key": os.environ["AZURE_OPENAI_API_KEY"],
    "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"],
    "api_version": os.environ["OPENAI_API_VERSION"],
}
az_openai_provider = AzureOpenAI(**_azure_openai_settings)

# Provider used by the answer-relevance wrapper defined in this cell.
openai_provider = OpenAI(model_engine="gpt-4o")


def trulens_optimized_answer_relevance(
    prompt: str, response: str, gt_score: float
) -> str:
    """Score a prompt/response pair and pack the result with its ground truth.

    Args:
        prompt: The query text handed to the relevance feedback.
        response: The candidate answer to be judged against ``prompt``.
        gt_score: The ground-truth label for this pair; passed through
            unchanged into the returned string.

    Returns:
        A semicolon-separated string ``"<score>;<gt_score>;N/A"``, where
        ``<score>`` is whatever ``openai_provider.relevance`` returned.
    """
    relevance_score = openai_provider.relevance(
        prompt=prompt,
        response=response,
    )
    fields = (relevance_score, gt_score, "N/A")
    return ";".join(f"{field}" for field in fields)


# Smoke test: one call to the provider before the full experiment is run.
trulens_optimized_answer_relevance(
    prompt="hey wuz good?",
    response="hi im doing well",
    gt_score=1.0,
)

In [None]:
from trulens.apps.basic import TruBasicApp
from trulens.core import Feedback
from trulens.core import Provider

# Cut-off used to binarize BOTH the feedback score and the ground-truth score:
# values at or above it count as positive (1), values below as negative (0).
THRESHOLD = 0.5


class CustomTermFeedback(Provider):
    """Confusion-matrix style feedbacks computed from a packed output string.

    Every public method takes ``output``, a semicolon-separated string whose
    first field is the feedback score and whose second field is the
    ground-truth score. Any further fields are ignored.
    """

    def _binary_labels(self, output: str) -> tuple:
        """Return ``(predicted, actual)`` as 0/1 labels parsed from ``output``.

        Both scores are compared against the shared ``THRESHOLD`` so that
        changing the constant moves the prediction and ground-truth cut-offs
        together.

        Raises:
            ValueError: If either of the first two fields is not a float.
            IndexError: If ``output`` has fewer than two fields.
        """
        fields = output.split(";")
        feedback_score, gt_score = float(fields[0]), float(fields[1])
        predicted = 1 if feedback_score >= THRESHOLD else 0
        actual = 1 if gt_score >= THRESHOLD else 0
        return predicted, actual

    def _matches(self, output: str, predicted: int, actual: int) -> float:
        """Return 1.0 when ``output`` binarizes to exactly this label pair."""
        return 1.0 if self._binary_labels(output) == (predicted, actual) else 0.0

    def true_positive(self, output: str) -> float:
        """Return 1.0 if both prediction and ground truth are positive."""
        return self._matches(output, predicted=1, actual=1)

    def true_negative(self, output: str) -> float:
        """Return 1.0 if both prediction and ground truth are negative."""
        return self._matches(output, predicted=0, actual=0)

    def false_positive(self, output: str) -> float:
        """Return 1.0 if the prediction is positive but the truth is negative."""
        return self._matches(output, predicted=1, actual=0)

    def false_negative(self, output: str) -> float:
        """Return 1.0 if the prediction is negative but the truth is positive."""
        return self._matches(output, predicted=0, actual=1)

    def raw_gt_score(self, output: str) -> float:
        """Return the ground-truth score (second field) without binarizing."""
        return float(output.split(";")[1])

    def raw_feedback_score(self, output: str) -> float:
        """Return the feedback score (first field) without binarizing."""
        return float(output.split(";")[0])


custom_term_feedback = CustomTermFeedback()


def _output_feedback(implementation, name, higher_is_better):
    """Build a Feedback around ``implementation`` and select it on the output."""
    return Feedback(
        implementation,
        name=name,
        higher_is_better=higher_is_better,
    ).on_output()


# Confusion-matrix terms: one feedback per outcome.
f_tp = _output_feedback(
    custom_term_feedback.true_positive, "True Positive", True
)
f_tn = _output_feedback(
    custom_term_feedback.true_negative, "True Negative", True
)
f_fp = _output_feedback(
    custom_term_feedback.false_positive, "False Positive", False
)
f_fn = _output_feedback(
    custom_term_feedback.false_negative, "False Negative", False
)

# Unthresholded scores, recorded alongside the binary terms.
f_raw_gt_score = _output_feedback(
    custom_term_feedback.raw_gt_score, "Raw GT Score", True
)
f_raw_feedback_score = _output_feedback(
    custom_term_feedback.raw_feedback_score, "Raw Feedback Score", True
)

# Full set of feedbacks attached to each experiment run.
CUSTOM_FEEDBACK_FUNCS = [
    f_tp,
    f_tn,
    f_fp,
    f_fn,
    f_raw_gt_score,
    f_raw_feedback_score,
]


def run_answer_relevance_experiment(
    func_wrapper, dataset_df, app_name, app_version="dataset_name"
):
    """Run ``func_wrapper`` over every dataset row under a TruBasicApp recorder.

    Args:
        func_wrapper: Callable invoked as ``(prompt, response, gt_score)``.
        dataset_df: DataFrame with "query", "expected_response" and
            "expected_score" columns.
        app_name: Name under which the wrapped app is registered.
        app_version: Version label for the wrapped app; it is formatted into
            a string before being passed on.

    Returns:
        None. A row whose call raises is reported with ``print`` and skipped,
        so one failure does not stop the remaining rows.
    """
    recorder = TruBasicApp(
        func_wrapper,
        app_name=app_name,
        app_version=f"{app_version}",
        feedbacks=CUSTOM_FEEDBACK_FUNCS,
    )

    for i, row in dataset_df.iterrows():
        prompt, response, gt_score = (
            row["query"],
            row["expected_response"],
            row["expected_score"],
        )

        try:
            # Calls made inside the recorder context are the ones captured.
            with recorder:
                recorder.app(prompt, response, gt_score)
        except Exception as e:
            # Best effort: report the failing row and move on to the next one.
            message = f"Error {e} in run_feedback_experiment row {i} with query {prompt} and response {response}"
            print(message)

In [None]:
# Run the answer-relevance wrapper over the HotpotQA positive/negative pairs.
run_answer_relevance_experiment(
    func_wrapper=trulens_optimized_answer_relevance,
    dataset_df=hotpotqa_subset_for_answer_relevance,
    app_name="Answer Relevance (optimized)",
    app_version="HotpotQA",
)