In [None]:
import random

import pandas as pd
from trulens.benchmark.benchmark_frameworks.dataset.beir_loader import (
    TruBEIRDataLoader,
)

# Seed the stdlib RNG; `random.choice` is used later for negative sampling.
random.seed(42)

# Load HotpotQA into a DataFrame through the BEIR loader (download=True).
beir_data_loader = TruBEIRDataLoader(dataset_name="hotpotqa", data_folder="./")
hotpotqa = beir_data_loader.load_dataset_to_df(download=True)

# Fixed-seed sample of 200 rows used for the whole experiment.
hotpotqa_raw_subset = hotpotqa.sample(n=200, random_state=42)

# (query, expected_response) pairs, in row order, for every sampled row.
all_responses = list(
    zip(
        hotpotqa_raw_subset["query"],
        hotpotqa_raw_subset["expected_response"],
    )
)


# Build a balanced binary dataset: every sampled row contributes one positive
# (its own gold answer, score 1) and one negative (another row's answer,
# score 0).
hotpotqa_subset_for_answer_relevance = []

for idx, row in hotpotqa_raw_subset.iterrows():
    # Positive examples for answer relevance
    hotpotqa_subset_for_answer_relevance.append({
        "query": row["query"],
        "expected_response": row["expected_response"],  # Positive response
        "expected_score": 1,  # Positive example, score = 1
    })

    # Negative examples for answer relevance (random unrelated response).
    # The candidate must come from a different query AND differ from this
    # row's gold answer: short answers repeat across queries, and a "negative"
    # identical to the gold answer would be a mislabeled positive.
    negative_candidates = [
        r
        for q, r in all_responses
        if q != row["query"] and r != row["expected_response"]
    ]
    negative_response = random.choice(negative_candidates)

    hotpotqa_subset_for_answer_relevance.append({
        "query": row["query"],
        "expected_response": negative_response,  # Negative response
        "expected_score": 0,  # Negative example, score = 0
    })


# Labels in the same order as the rows of the DataFrame built below.
hotpotqa_subset_for_answer_relevance_true_labels = [
    entry["expected_score"] for entry in hotpotqa_subset_for_answer_relevance
]

# Rebind the name from the list of records to the final DataFrame.
hotpotqa_subset_for_answer_relevance = pd.DataFrame(
    hotpotqa_subset_for_answer_relevance
)

In [None]:
from sklearn.model_selection import train_test_split


def balanced_split(
    df,
    label_column="expected_score",
    train_size=0.6,
    dev_size=0.2,
    test_size=0.2,
    random_state=42,
):
    """
    Partition a DataFrame into label-stratified train / dev / test subsets.

    The split is done in two stratified passes: the test portion is carved
    off first, then the remainder is divided into train and dev.

    Args:
        df (pd.DataFrame): Frame to partition.
        label_column (str): Column whose values are used for stratification.
        train_size (float): Fraction of rows assigned to the training split.
        dev_size (float): Fraction of rows assigned to the dev split.
        test_size (float): Fraction of rows assigned to the test split.
        random_state (int): Seed passed to both `train_test_split` calls.

    Returns:
        tuple: `(train_df, dev_df, test_df)` DataFrames.

    Raises:
        AssertionError: If the three fractions do not sum to 1.0 (within 1e-5).
    """
    total_fraction = train_size + dev_size + test_size
    assert abs(total_fraction - 1.0) < 1e-5, "Sizes must sum to 1.0"

    # Pass 1: hold out the test rows, stratified on the label column.
    remainder_df, test_df = train_test_split(
        df,
        test_size=test_size,
        stratify=df[label_column],
        random_state=random_state,
    )

    # The dev fraction must be re-expressed relative to what is left.
    dev_fraction_of_remainder = dev_size / (train_size + dev_size)

    # Pass 2: divide the remainder into train and dev, again stratified.
    train_df, dev_df = train_test_split(
        remainder_df,
        test_size=dev_fraction_of_remainder,
        stratify=remainder_df[label_column],
        random_state=random_state,
    )

    return train_df, dev_df, test_df


# 40 / 30 / 30 stratified split of the positive/negative relevance dataset.
train_df, dev_df, test_df = balanced_split(
    hotpotqa_subset_for_answer_relevance,
    test_size=0.3,
    dev_size=0.3,
    train_size=0.4,
)

# Report the size of each split, one per line.
print(
    f"Train size: {len(train_df)}",
    f"Dev size: {len(dev_df)}",
    f"Test size: {len(test_df)}",
    sep="\n",
)

In [None]:
# from trulens.benchmark.benchmark_frameworks.experiments.dataset_preprocessing import (
#     visualize_expected_score_distribution,
# )

# hotpotqa_gt_scores = [
#     entry["expected_score"] for _, entry in hotpotqa_subset_for_answer_relevance.iterrows()
# ]
# visualize_expected_score_distribution(hotpotqa_gt_scores)

# train_scores = [entry["expected_score"] for _, entry in train_df.iterrows()]
# visualize_expected_score_distribution(train_scores)

# dev_scores = [entry["expected_score"] for _, entry in dev_df.iterrows()]
# visualize_expected_score_distribution(dev_scores)

# test_scores = [entry["expected_score"] for _, entry in test_df.iterrows()]
# visualize_expected_score_distribution(test_scores)

### Implement TruLens' `answer_relevance` in AdalFlow

In [None]:
from trulens.feedback.v2.feedback import PromptResponseRelevance

# Show the stock TruLens relevance system prompt; it is the starting value of
# the trainable prompt parameter in the pipeline defined below.
print(PromptResponseRelevance.system_prompt)

In [None]:
from typing import Dict, Optional, Union

import adalflow as adal
from adalflow.optim.types import ParameterType
import pandas as pd
from trulens.feedback import generated as feedback_generated

# Jinja-style prompt template for the relevance generator. It takes three
# variables: `system_prompt`, `few_shot_demos` (the "examples" section is only
# rendered when this is not none) and `user_prompt`.
few_shot_template = r"""<START_OF_SYSTEM_PROMPT>
{{system_prompt}}
{# Few shot demos #}
{% if few_shot_demos is not none %}
Here are some examples:
{{few_shot_demos}}
{% endif %}
<END_OF_SYSTEM_PROMPT>
<START_OF_USER>
{{user_prompt}}
<END_OF_USER>
"""


class AnswerRelevanceTaskPipeline(adal.Component):
    """AdalFlow task that grades how relevant a RESPONSE is to a PROMPT.

    Wraps a single `adal.Generator` whose system prompt and few-shot demos are
    trainable parameters; the system prompt starts from the TruLens
    `PromptResponseRelevance` prompt. The generator output is post-processed
    into a score by `parse_output`.
    """

    def __init__(self, model_client: adal.ModelClient, model_kwargs: Dict):
        super().__init__()

        # Trainable instruction text, seeded with the stock TruLens prompt.
        instruction_param = adal.Parameter(
            data=PromptResponseRelevance.system_prompt,
            role_desc="To give task instruction to the language model in the system prompt",
            requires_opt=True,
            param_type=ParameterType.PROMPT,
            instruction_to_optimizer="You can try to show examples to see if it helps. Also focus on the case when prediction is 1 but ground truth is 0 (False positives)",
        )
        # Trainable few-shot demos; empty until the demo optimizer fills them.
        demos_param = adal.Parameter(
            data=None,
            role_desc="To provide few shot demos to the language model",
            requires_opt=True,
            param_type=ParameterType.DEMOS,
        )
        trainable_prompt_kwargs = {
            "system_prompt": instruction_param,
            "few_shot_demos": demos_param,
        }

        self.evaluate_relevance = adal.Generator(
            model_client=model_client,
            model_kwargs=model_kwargs,
            template=few_shot_template,
            prompt_kwargs=trainable_prompt_kwargs,
            use_cache=True,
            output_processors=self.parse_output,
        )

    # Converts the raw completion text to a rating on the 0-3 scale via
    # TruLens' `re_configured_rating`, then divides by 3.
    @adal.fun_to_component
    def parse_output(response: str):
        rating = feedback_generated.re_configured_rating(
            response,
            min_score_val=0,
            max_score_val=3,
        )
        return rating / 3

    def call(
        self,
        prompt: str,
        response: str,
        id: Optional[str] = None,
    ) -> Union[adal.GeneratorOutput, adal.Parameter]:
        """Grade one (prompt, response) pair.

        Args:
            prompt: The user query being answered.
            response: The candidate answer to grade.
            id: Optional identifier forwarded to the generator.

        Returns:
            Whatever the underlying generator returns for this input.
        """
        filled_user_prompt = PromptResponseRelevance.user_prompt.format(
            prompt=prompt, response=response
        )
        generator_inputs = {"user_prompt": filled_user_prompt}

        return self.evaluate_relevance(prompt_kwargs=generator_inputs, id=id)

In [None]:
from adalflow.components.model_client.openai_client import OpenAIClient

# az_gpt_4o_model = {
#     "model_client": AzureOpenAIClient(),
#     "model_kwargs": {
#         "model": os.environ["AZURE_OPENAI_DEPLOYMENT"],
#         "max_tokens": 4000,
#         "temperature": 0.0,
#         "top_p": 0.99,
#         "frequency_penalty": 0,
#         "presence_penalty": 0,
#         "stop": None,
#     },
# }

# Shared GPT-4o configuration: the client plus the request parameters. The
# two keys match the constructor arguments of the pipeline / AdalComponent.
gpt_4o_model = dict(
    model_client=OpenAIClient(),
    model_kwargs=dict(
        model="gpt-4o",
        max_tokens=4000,
        temperature=0.0,
        top_p=0.99,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    ),
)
task_pipeline = AnswerRelevanceTaskPipeline(
    model_client=gpt_4o_model["model_client"],
    model_kwargs=gpt_4o_model["model_kwargs"],
)
print(task_pipeline)

# Smoke test: grade a deliberately irrelevant answer.
output = task_pipeline(response="ha!", prompt="Is apple safe to eat?")
output

In [None]:
# Switch this standalone pipeline instance into training mode.
# NOTE(review): AnswerRelevanceAdalComponent constructs its own pipeline, so
# this call does not affect the instance that the Trainer later optimizes.
task_pipeline.train()  # set to train mode

In [None]:
# Display the training split produced by `balanced_split`.
train_df

In [None]:
from dataclasses import dataclass
from dataclasses import field
import uuid

from adalflow.datasets.types import Example


@dataclass
class HotpotQAData(Example):
    """A dataclass for representing examples in the HotpotQA dataset."""

    # Fresh UUID4 string per instance unless an id is supplied explicitly.
    id: str = field(
        default_factory=lambda: str(uuid.uuid4()),
        metadata={"desc": "The unique identifier of the example", "type": "id"},
    )

    # Question text shown to the grader as the PROMPT.
    query: Optional[str] = field(
        default=None,
        metadata={"desc": "The query from user."},
    )

    # Answer text shown to the grader as the RESPONSE.
    expected_response: Optional[str] = field(
        default=None,
        metadata={"desc": "The expected answer to the query."},
    )

    # Ground-truth relevance label for (query, expected_response).
    expected_score: Optional[float] = field(
        default=None,
        metadata={"desc": "The expected relevance score for the answer."},
    )


def _rows_to_examples(split_df):
    """Wrap every row of a split DataFrame in a `HotpotQAData` example."""
    examples = []
    for _, row in split_df.iterrows():
        examples.append(
            HotpotQAData(
                query=row["query"],
                expected_response=row["expected_response"],
                expected_score=row["expected_score"],
            )
        )
    return examples


# One example list per split, in the row order of the source DataFrame.
train_dataset = _rows_to_examples(train_df)
val_dataset = _rows_to_examples(dev_df)
test_dataset = _rows_to_examples(test_df)


def answer_relevance_eval_fn(y: float, y_gt: float) -> float:
    """Exact-match metric: 1.0 when the prediction equals the label, else 0.0."""
    if y == y_gt:
        return 1.0
    return 0.0


def weighted_relevance_loss(
    y: float, y_gt: float, false_negative_weight
) -> float:
    """
    Score a prediction against its label, penalizing false negatives hardest.

    Despite the name, the returned value is a *score* in [0, 1] where higher
    is better (the convention for textual losses, unlike a typical ML loss).
    `y` and `y_gt` are compared as given; no thresholding is applied here.

    Args:
        y: Predicted relevance value.
        y_gt: Ground-truth relevance value.
        false_negative_weight: Penalty weight `w` for predictions below the
            label, relative to a weight of 1.0 for any other mismatch.

    Returns:
        1.0 for a correct prediction, `1 - w / (w + 1)` for a false negative
        (`y_gt > y`), and `1 - 1 / (w + 1)` for any other mismatch.
    """
    # Identify the type of error
    if y_gt > y:  # False negatives
        penalty = false_negative_weight
    elif y != y_gt and not (
        y > 1 and y_gt == 1
    ):  # Other mismatches (false positives); y > 1 with y_gt == 1 is exempt
        penalty = 1.0
    else:
        # Correct predictions get the best score. This used to return 0.0,
        # which ranked a correct answer below every kind of error.
        return 1.0

    # Normalize the penalty to keep the result in [0, 1]
    normalized_loss = penalty / (false_negative_weight + 1.0)

    return (
        1 - normalized_loss
    )  # textual loss higher the better (UNLIKE typical ML loss)


class AnswerRelevanceAdalComponent(adal.AdalComponent):
    """Trainer-facing wrapper around `AnswerRelevanceTaskPipeline`.

    Supplies the task, the exact-match evaluation function and the weighted
    text loss, and tells the trainer how to turn a `HotpotQAData` sample into
    task / loss / eval inputs.

    NOTE(review): the loss receives the raw score from the generator, while
    `prepare_eval` first thresholds it at 0.5 into a 0/1 label.
    """

    def __init__(
        self,
        model_client: adal.ModelClient,
        model_kwargs: Dict,
        backward_engine_model_config: Dict = None,
        teacher_model_config: Dict = None,
        text_optimizer_model_config: Dict = None,
    ):
        relevance_task = AnswerRelevanceTaskPipeline(model_client, model_kwargs)

        # False negatives are weighted 3x relative to other mismatches.
        text_loss = adal.EvalFnToTextLoss(
            eval_fn=lambda y, y_gt: weighted_relevance_loss(
                y, y_gt, false_negative_weight=3.0
            ),
            eval_fn_desc="Give a lower score when the model gives the wrong rating comparing to ground truth to penalize both false negatives and false positives  (i.e. incorrectly classifying a relevant response as not relevant or vice versa).",
        )

        super().__init__(
            task=relevance_task,
            eval_fn=answer_relevance_eval_fn,
            loss_fn=text_loss,
        )
        # Model configs consumed by the configure_* hooks below; each must be
        # a dict by the time its hook is called.
        self.backward_engine_model_config = backward_engine_model_config
        self.teacher_model_config = teacher_model_config
        self.text_optimizer_model_config = text_optimizer_model_config

    def prepare_task(self, sample: HotpotQAData):
        """Return the task callable and its keyword arguments for `sample`."""
        task_kwargs = dict(
            prompt=sample.query,
            response=sample.expected_response,
            id=sample.id,
        )
        return self.task.call, task_kwargs

    def prepare_loss(self, sample: HotpotQAData, pred: adal.Parameter):
        """Return the loss callable and its inputs for one prediction."""
        # Ground truth wrapped as a non-trainable parameter.
        y_gt = adal.Parameter(
            name="y_gt",
            data=sample.expected_score,
            eval_input=sample.expected_score,
            requires_opt=False,
        )

        # Use the parsed score only when it is a float; otherwise fall back to 0.
        parsed_score = 0
        if (
            pred
            and pred.full_response
            and isinstance(pred.full_response.data, float)
        ):
            parsed_score = pred.full_response.data
        pred.eval_input = parsed_score

        loss_inputs = {"kwargs": {"y": pred, "y_gt": y_gt}}
        return self.loss_fn, loss_inputs

    def prepare_eval(self, sample: HotpotQAData, y_pred: adal.GeneratorOutput):
        """Return the eval callable and a thresholded 0/1 label for `y_pred`.

        The label is -1 when the output carries no numeric score, so it can
        never match the ground truth.
        """
        has_numeric_score = (
            y_pred
            and y_pred.data is not None
            and isinstance(y_pred.data, (int, float))
        )
        if has_numeric_score:
            y_label = 1 if y_pred.data > 0.5 else 0
        else:
            y_label = -1
        print(
            f"y_pred: {y_pred}, y_label: {y_label}, sample.expected_score: {sample.expected_score}"
        )
        eval_inputs = {"y": y_label, "y_gt": sample.expected_score}
        return self.eval_fn, eval_inputs

    def configure_backward_engine(self):
        """Configure the backward engine from `backward_engine_model_config`."""
        engine_config = self.backward_engine_model_config
        super().configure_backward_engine_helper(**engine_config)

    def configure_teacher_generator(self):
        """Configure the teacher generator from `teacher_model_config`."""
        teacher_config = self.teacher_model_config
        super().configure_teacher_generator_helper(**teacher_config)

    def configure_optimizers(self):
        """Return the text optimizers followed by the demo optimizers."""
        text_optimizers = super().configure_text_optimizer_helper(
            **self.text_optimizer_model_config
        )
        demo_optimizers = super().configure_demo_optimizer_helper()
        return text_optimizers + demo_optimizers

In [None]:
def diagnose(
    model_client: adal.ModelClient,
    model_kwargs: Dict,
) -> None:
    """Run the trainer's diagnose pass on the train, val and test datasets.

    Builds a fresh `AnswerRelevanceAdalComponent` (no optimizer configs are
    passed) and calls `Trainer.diagnose` once per split, in that order.

    Args:
        model_client: Client used by the task pipeline.
        model_kwargs: Request parameters for the model.

    Returns:
        None. The results of `Trainer.diagnose` are not collected here.
    """
    adal_component = AnswerRelevanceAdalComponent(model_client, model_kwargs)
    trainer = adal.Trainer(adaltask=adal_component)

    splits = (
        ("train", train_dataset),
        ("val", val_dataset),
        ("test", test_dataset),
    )
    for split_name, split_examples in splits:
        trainer.diagnose(dataset=split_examples, split=split_name)


diagnose(**gpt_4o_model)

In [None]:
def train(
    train_batch_size=4,  # larger batch size is not that effective, probably because of llm's lost in the middle
    raw_shots: int = 0,
    bootstrap_shots: int = 1,
    max_steps=1,
    num_workers=4,
    strategy="random",
    optimization_order="sequential",
    debug=False,
    resume_from_ckpt=None,
    exclude_input_fields_from_bootstrap_demos=False,
):
    """Optimize the relevance prompt and few-shot demos with `adal.Trainer`.

    The same `gpt_4o_model` configuration is used for the task, the teacher,
    the text optimizer and the backward engine. All arguments are forwarded to
    `adal.Trainer` or `Trainer.fit`; weighted sampling is always enabled.
    Fits on the module-level train / val / test datasets.
    """
    adal_component = AnswerRelevanceAdalComponent(
        **gpt_4o_model,
        backward_engine_model_config=gpt_4o_model,
        text_optimizer_model_config=gpt_4o_model,
        teacher_model_config=gpt_4o_model,
    )
    print(adal_component)

    trainer_options = dict(
        adaltask=adal_component,
        train_batch_size=train_batch_size,
        max_steps=max_steps,
        num_workers=num_workers,
        strategy=strategy,
        optimization_order=optimization_order,
        raw_shots=raw_shots,
        bootstrap_shots=bootstrap_shots,
        weighted_sampling=True,
        exclude_input_fields_from_bootstrap_demos=exclude_input_fields_from_bootstrap_demos,
        debug=debug,
    )
    trainer = adal.Trainer(**trainer_options)
    print(trainer)

    trainer.fit(
        train_dataset=train_dataset,
        val_dataset=val_dataset,
        test_dataset=test_dataset,
        resume_from_ckpt=resume_from_ckpt,
        debug=debug,
    )

In [None]:
# Launch the optimization run: up to 15 steps with the "constrained" strategy,
# batch size 6, one raw and one bootstrapped demo, with input fields excluded
# from the bootstrapped demos. This issues LLM requests through the configured
# OpenAI client.
train(
    train_batch_size=6,
    debug=False,
    max_steps=15,
    strategy="constrained",
    raw_shots=1,
    bootstrap_shots=1,
    exclude_input_fields_from_bootstrap_demos=True,
)

In [None]:
# System prompt text for the relevance grader, kept as one string with escaped
# newlines. Presumably copied from the output of the optimization run above —
# TODO confirm provenance.
optimized_system_prompt = """You are a RELEVANCE grader; providing the relevance of the given RESPONSE to the given PROMPT.\nRespond only as a number from 0 to 3, where 0 is the lowest score according to the criteria and 3 is the highest possible score.\n\nA few additional scoring guidelines:\n\n- Long RESPONSES should score equally well as short RESPONSES.\n\n- RESPONSE must be relevant to the entire PROMPT to get a maximum score of 3.\n- RELEVANCE score should increase as the RESPONSE provides RELEVANT context to more parts of the PROMPT.\n- RESPONSE that is RELEVANT to none of the PROMPT should get a minimum score of 0.\n- RESPONSE that is RELEVANT and answers the entire PROMPT completely should get a score of 3.\n- RESPONSE that confidently FALSE should get a score of 0.\n- RESPONSE that is only seemingly RELEVANT should get a score of 0.\n- Answers that intentionally do not answer the question, such as 'I don't know' and model refusals, should also be counted as the least RELEVANT and get a score of 0.\n\n- Pay special attention to avoid false negatives by recognizing partial relevance, even if the RESPONSE is not fully aligned with the PROMPT.\n\n- Never elaborate."""

In [None]:
from trulens.core import TruSession
from trulens.providers.openai import OpenAI

# Open a TruLens session and reset its database before recording this run.
# NOTE(review): reset_database() presumably discards previously logged
# records — confirm before running against a database you want to keep.
session = TruSession()
session.reset_database()
# az_openai_provider = AzureOpenAI(
#     deployment_name=os.environ["AZURE_OPENAI_DEPLOYMENT"],  # gpt-4o
#     api_key=os.environ["AZURE_OPENAI_API_KEY"],
#     azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
#     api_version=os.environ["OPENAI_API_VERSION"],
# )

# Provider used to score relevance in the experiment below.
openai_provider = OpenAI(model_engine="gpt-4o")


def trulens_optimized_answer_relevance(
    prompt: str, response: str, gt_score: float
) -> str:
    """Score relevance with the optimized system prompt and pack the result.

    The provider's stock `relevance` feedback never sees
    `optimized_system_prompt`, so the score is generated directly from the
    optimized system prompt and the same user-prompt template that the
    training pipeline used, on the same 0-3 scale.

    Args:
        prompt: The query.
        response: The candidate answer to grade.
        gt_score: Ground-truth relevance label, passed through unchanged.

    Returns:
        The string "<score>;<gt_score>;N/A", which the custom feedback
        functions parse by splitting on ";".
    """
    score = openai_provider.generate_score(
        system_prompt=optimized_system_prompt,
        user_prompt=PromptResponseRelevance.user_prompt.format(
            prompt=prompt, response=response
        ),
        min_score_val=0,
        max_score_val=3,
    )
    return f"{score};{gt_score};N/A"


# Smoke test on a single hand-written pair.
trulens_optimized_answer_relevance("hey wuz good?", "hi im doing well", 1.0)

In [None]:
# Display the full positive/negative relevance dataset (all splits combined).
hotpotqa_subset_for_answer_relevance

In [None]:
from trulens.apps.basic import TruBasicApp
from trulens.core import Feedback
from trulens.core import Provider

# Cut-off at or above which a score counts as "relevant" (binary label 1).
THRESHOLD = 0.5


def _binarize_scores(output: str):
    """Parse "<feedback>;<gt>;..." and threshold both values at THRESHOLD.

    Returns:
        tuple: `(binary_score, binary_gt_score)`, each 0 or 1.

    Raises:
        ValueError / IndexError: If either of the first two ";"-separated
            fields is missing or not a number.
    """
    fields = output.split(";")
    feedback_score, gt_score = float(fields[0]), float(fields[1])
    binary_score = 1 if feedback_score >= THRESHOLD else 0
    binary_gt_score = 1 if gt_score >= THRESHOLD else 0
    return binary_score, binary_gt_score


class CustomTermFeedback(Provider):
    """Confusion-matrix feedback functions over a packed "score;gt;..." output.

    Each of the first four methods returns 1.0 when the record falls in the
    named confusion-matrix cell and 0.0 otherwise. Both the feedback score and
    the ground truth are binarized with the same `THRESHOLD`.
    """

    def true_positive(self, output: str) -> float:
        binary_score, binary_gt_score = _binarize_scores(output)
        return 1.0 if binary_score == 1 and binary_gt_score == 1 else 0.0

    def true_negative(self, output: str) -> float:
        binary_score, binary_gt_score = _binarize_scores(output)
        return 1.0 if binary_score == 0 and binary_gt_score == 0 else 0.0

    def false_positive(self, output: str) -> float:
        binary_score, binary_gt_score = _binarize_scores(output)
        return 1.0 if binary_score == 1 and binary_gt_score == 0 else 0.0

    def false_negative(self, output: str) -> float:
        binary_score, binary_gt_score = _binarize_scores(output)
        return 1.0 if binary_score == 0 and binary_gt_score == 1 else 0.0

    def raw_gt_score(self, output: str) -> float:
        """Return the un-thresholded ground-truth value (second field)."""
        return float(output.split(";")[1])

    def raw_feedback_score(self, output: str) -> float:
        """Return the un-thresholded feedback score (first field)."""
        return float(output.split(";")[0])


custom_term_feedback = CustomTermFeedback()


def _on_output_feedback(implementation, name, higher_is_better):
    """Build a named `Feedback` around `implementation`, bound to the app output."""
    return Feedback(
        implementation,
        name=name,
        higher_is_better=higher_is_better,
    ).on_output()


# Confusion-matrix cells: hits should be high, misses should be low.
f_tp = _on_output_feedback(
    custom_term_feedback.true_positive, "True Positive", True
)
f_tn = _on_output_feedback(
    custom_term_feedback.true_negative, "True Negative", True
)
f_fp = _on_output_feedback(
    custom_term_feedback.false_positive, "False Positive", False
)
f_fn = _on_output_feedback(
    custom_term_feedback.false_negative, "False Negative", False
)

# Raw values, recorded alongside the confusion-matrix cells.
f_raw_gt_score = _on_output_feedback(
    custom_term_feedback.raw_gt_score, "Raw GT Score", True
)
f_raw_feedback_score = _on_output_feedback(
    custom_term_feedback.raw_feedback_score, "Raw Feedback Score", True
)

CUSTOM_FEEDBACK_FUNCS = [
    f_tp,
    f_tn,
    f_fp,
    f_fn,
    f_raw_gt_score,
    f_raw_feedback_score,
]


def run_answer_relevance_experiment(
    func_wrapper, dataset_df, app_name, app_version="dataset_name"
):
    """Run `func_wrapper` over every row of `dataset_df` under TruLens recording.

    Args:
        func_wrapper: Callable taking `(prompt, response, gt_score)`; it is
            wrapped in a `TruBasicApp` with `CUSTOM_FEEDBACK_FUNCS` attached.
        dataset_df: DataFrame with `query`, `expected_response` and
            `expected_score` columns.
        app_name: App name passed to `TruBasicApp`.
        app_version: App version passed to `TruBasicApp`, converted to a string.

    Returns:
        None.
    """
    tru_wrapped_app = TruBasicApp(
        func_wrapper,
        app_name=app_name,
        app_version=str(app_version),
        feedbacks=CUSTOM_FEEDBACK_FUNCS,
    )

    for i, row in dataset_df.iterrows():
        prompt = row["query"]
        response = row["expected_response"]
        gt_score = row["expected_score"]

        try:
            with tru_wrapped_app:
                tru_wrapped_app.app(prompt, response, gt_score)

        except Exception as e:
            # Deliberately best-effort: report the failing row and continue so
            # one bad call does not abort the whole experiment.
            print(
                f"Error {e} in run_answer_relevance_experiment row {i} with query {prompt} and response {response}"
            )

In [None]:
# Record the relevance experiment over the complete positive/negative dataset.
# NOTE(review): this frame is the one that was split into train/dev/test, so
# the run also scores rows that were used to fit the prompt — confirm whether
# `test_df` was intended for a held-out evaluation.
run_answer_relevance_experiment(
    trulens_optimized_answer_relevance,
    hotpotqa_subset_for_answer_relevance,
    "Answer Relevance (optimized)",
    "HotpotQA",
)