In [None]:
import getpass
import os

from trulens.providers.openai import AzureOpenAI

# Non-secret Azure OpenAI connection settings used throughout this notebook.
os.environ["AZURE_OPENAI_DEPLOYMENT"] = "sfc-cortex-analyst-dev"
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://sfc-apim-sweden.azure-api.net"
os.environ["OPENAI_API_VERSION"] = "2023-07-01-preview"

# The API key is a secret and must never be hardcoded in the notebook: take it
# from the environment and, if it is absent, prompt for it without echoing.
# NOTE(review): a literal key was previously committed on this line; treat it
# as compromised and rotate it.
if not os.environ.get("AZURE_OPENAI_API_KEY"):
    os.environ["AZURE_OPENAI_API_KEY"] = getpass.getpass(
        "Azure OpenAI API key: "
    )

# TruLens feedback provider backed by the Azure OpenAI deployment above.
az_openai_provider = AzureOpenAI(
    deployment_name=os.environ["AZURE_OPENAI_DEPLOYMENT"],  # gpt-4o
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_version=os.environ["OPENAI_API_VERSION"],
)

In [None]:
!pip install datasets ir_datasets
!pip install seaborn

In [None]:
!pip install huggingface_hub

In [None]:
from trulens.benchmark.benchmark_frameworks.experiments.dataset_preprocessing import (
    generate_balanced_llm_aggrefact_benchmark,
)

# Load the "dev" and "test" splits of the LLM-AggreFact benchmark through the
# TruLens preprocessing helper. Only the test split is consumed further down
# (it is the input to generate_exact_splits_from_df).
llm_aggrefact_dev_df = generate_balanced_llm_aggrefact_benchmark(split="dev")
llm_aggrefact_test_df = generate_balanced_llm_aggrefact_benchmark(
    split="test"
)  # the one used for evaluation
# Last expression of the cell: render the dev split for a quick visual check.
llm_aggrefact_dev_df

In [None]:
# Render the dev split again (same frame the loading cell already displays).
llm_aggrefact_dev_df

In [None]:
# cnn_aggrefact_dev_df = llm_aggrefact_dev_df[llm_aggrefact_dev_df['dataset'] == 'AggreFact-CNN']
# ragtruth_test_df = llm_aggrefact_test_df[llm_aggrefact_test_df['dataset'] == 'RAGTruth'] # the one selected for data slice

In [None]:
import pandas as pd


def generate_exact_splits_from_df(
    df, train_size=300, dev_size=200, test_size=600, random_seed=40
):
    """Build label-balanced, non-overlapping train/dev/test splits of exact sizes.

    For every distinct value of ``df["dataset"]`` the rows are down-sampled so
    that each ``label`` value contributes the same number of rows (the count of
    the rarest label in that dataset). The balanced subsets are concatenated,
    shuffled, and then cut into consecutive slices: test first, then dev, then
    train.

    Args:
        df: DataFrame with at least the columns ``dataset`` and ``label``.
        train_size: Exact number of rows in the returned train split.
        dev_size: Exact number of rows in the returned dev split.
        test_size: Exact number of rows in the returned test split.
        random_seed: Seed used both for per-label sampling and for the shuffle.

    Returns:
        A ``(train_df, dev_df, test_df)`` tuple of DataFrames. The frames keep
        the index of the shuffled balanced frame (they are not re-indexed).

    Raises:
        ValueError: If a requested size is negative, if ``df`` yields no rows
            to balance, or if the balanced data has fewer rows than
            ``train_size + dev_size + test_size``.
    """
    if min(train_size, dev_size, test_size) < 0:
        raise ValueError("Split sizes must be non-negative.")

    # Balance each dataset by sampling an equal number of instances per label.
    balanced_dfs = []
    for dataset_name in df["dataset"].unique():
        df_subset = df[df["dataset"] == dataset_name]
        min_count = df_subset["label"].value_counts().min()
        # Sample every label group explicitly instead of going through
        # ``groupby(...).apply``: pandas deprecated passing the grouping column
        # to ``apply`` and newer releases exclude it, which would silently drop
        # ``label`` from the result. Each group is sampled with the same seed,
        # so the selected rows match the previous implementation.
        label_samples = [
            label_group.sample(min_count, random_state=random_seed)
            for _, label_group in df_subset.groupby("label")
        ]
        if not label_samples:
            # No usable label in this dataset (e.g. all labels missing).
            continue
        balanced_dfs.append(pd.concat(label_samples, ignore_index=True))

    if not balanced_dfs:
        raise ValueError("Input DataFrame has no rows to balance.")

    # Concatenate all balanced subsets into a single DataFrame.
    balanced_df = pd.concat(balanced_dfs, ignore_index=True)

    # Ensure the dataset is large enough for the requested sizes.
    total_required_size = train_size + dev_size + test_size
    if len(balanced_df) < total_required_size:
        raise ValueError(
            "Balanced dataset size is smaller than the requested split sizes."
        )

    # Shuffle so that the consecutive slices below mix datasets and labels.
    balanced_df = balanced_df.sample(
        frac=1, random_state=random_seed
    ).reset_index(drop=True)

    # Positional, non-overlapping slices: test, then dev, then train.
    dev_start = test_size
    train_start = dev_start + dev_size
    test_df = balanced_df.iloc[:dev_start]
    dev_df = balanced_df.iloc[dev_start:train_start]
    train_df = balanced_df.iloc[train_start : train_start + train_size]

    return train_df, dev_df, test_df


# Balance the LLM-AggreFact test benchmark and cut it into train/dev/test splits.
# NOTE: the name `train` is rebound further down in this notebook by the
# `train()` function, so re-run this cell before reusing the `train` DataFrame.
train, dev, test = generate_exact_splits_from_df(
    llm_aggrefact_test_df, train_size=200, dev_size=300, test_size=300
)


# Display sizes
print(f"Train size: {len(train)}")
print(f"Dev size: {len(dev)}")
print(f"Test size: {len(test)}")

# Optionally save to CSV
train.to_csv("train.csv", index=False)
dev.to_csv("dev.csv", index=False)
test.to_csv("test.csv", index=False)


def _to_eval_records(split_df):
    """Map rows of a split to the record schema used for evaluation.

    Each row's ``doc`` becomes ``query``, ``label`` becomes ``expected_score``
    and ``claim`` becomes ``expected_response``.

    Args:
        split_df: DataFrame with the columns ``doc``, ``label`` and ``claim``.

    Returns:
        A list with one dict per row, in row order.
    """
    return [
        {
            "query": example.doc,
            "expected_score": example.label,
            "expected_response": example.claim,
        }
        for _, example in split_df.iterrows()
    ]


data_train = _to_eval_records(train)
data_dev = _to_eval_records(dev)
data_test = _to_eval_records(test)

df_train = pd.DataFrame(data_train)

df_dev = pd.DataFrame(data_dev)

df_test = pd.DataFrame(data_test)

print(
    f"len(df_train): {len(df_train)}; len(df_dev): {len(df_dev)}; len(df_test): {len(df_test)}"
)

### Implement TruLens' `groundedness_with_cot_reasons` in AdalFlow

In [None]:
# Count how many dev examples carry each expected_score value.
df_dev.expected_score.value_counts()

In [None]:
import re
from typing import Dict, Optional, Tuple, Union
import warnings

import adalflow as adal
from adalflow.optim.types import ParameterType
import nltk
import pandas as pd
from trulens.feedback import generated as feedback_generated

nltk.download("punkt_tab", quiet=True)


# Prompt template (Jinja-style placeholders) passed to the generator below.
# It renders `system_prompt`, an optional "Here are some examples" section that
# is emitted only when `few_shot_demos` is not none, and `user_prompt`, each
# wrapped in the START/END marker tags.
few_shot_template = r"""<START_OF_SYSTEM_PROMPT>
{{system_prompt}}
{# Few shot demos #}
{% if few_shot_demos is not none %}
Here are some examples:
{{few_shot_demos}}
{% endif %}
<END_OF_SYSTEM_PROMPT>
<START_OF_USER>
{{user_prompt}}
<END_OF_USER>
"""


class GroundednessTaskPipeline(adal.Component):
    """AdalFlow task pipeline that scores how well a hypothesis is grounded in a premise.

    The pipeline wraps a single ``adal.Generator`` whose system prompt and
    few-shot demos are declared as optimizable ``adal.Parameter`` objects, and
    whose raw text output is parsed by ``parse_single_groundedness_output``
    into a ``(score, reason)`` pair.
    """

    def __init__(self, model_client: adal.ModelClient, model_kwargs: Dict):
        """Create the optimizable prompt parameters and the generator.

        Args:
            model_client: Client forwarded to ``adal.Generator``.
            model_kwargs: Model keyword arguments forwarded to ``adal.Generator``.
        """
        super().__init__()

        # Instruction prompt; marked as optimizable (requires_opt=True).
        system_prompt = adal.Parameter(
            # data=Groundedness.system_prompt,
            data="""
            You are an INFORMATION OVERLAP classifier; providing the overlap of information (entailment or groundedness) between the source and statement.

            Respond only as a number from 0 to 3, where 0 is the lowest score according to the criteria and 3 is the highest possible score.\n\nYou should score the groundedness of the statement based on the following criteria:\n\n- Statements that are directly supported by the source should be considered grounded and should get a high score.\n\n- Statements that are not directly supported by the source should be considered not grounded and should get a low score.\n\n- Statements of doubt, admissions of uncertainty, or not knowing the answer are considered abstention, and should be counted as the most overlap and therefore get a max score of 3.\n\n- Consider indirect or implicit evidence, or the context of the statement, to avoid penalizing potentially factual claims due to lack of explicit support.\n\n- Be cautious of false positives; ensure that high scores are only given when there is clear supporting evidence.\n\n- Pay special attention to cases where the prediction is 1 but the ground truth is 0, and ensure that indirect evidence is not mistaken for direct support.\n\nNever elaborate.
            """,
            role_desc="To give task instruction to the language model in the system prompt",
            requires_opt=True,
            param_type=ParameterType.PROMPT,
            instruction_to_optimizer="You can try to show examples to see if it helps. Also focus on the case when prediction is 1 but ground truth is 0 (False positives)",
        )
        # Few-shot demonstrations; starts empty and is also marked optimizable.
        few_shot_demos = adal.Parameter(
            data=None,
            role_desc="To provide few shot demos to the language model",
            requires_opt=True,  # Changed to True for few-shot learning
            param_type=ParameterType.DEMOS,
        )

        self.evaluate_hypothesis = adal.Generator(
            model_client=model_client,
            model_kwargs=model_kwargs,
            template=few_shot_template,
            prompt_kwargs={
                "system_prompt": system_prompt,
                "few_shot_demos": few_shot_demos,
            },
            use_cache=True,
            output_processors=self.parse_single_groundedness_output,
        )

    @adal.fun_to_component
    def parse_single_groundedness_output(response: str) -> Tuple[float, Dict]:
        """Parse the raw model text into a normalized score and a reason dict.

        The rating is extracted with ``feedback_generated.re_configured_rating``
        (requested range 0..3) and divided by 3.

        Args:
            response: Raw text produced by the model (may be empty or None).

        Returns:
            ``(score, reason)`` where:

            * empty response -> ``(0, {"reason": "No response generated."})``;
            * response without "Supporting Evidence" -> the rating parsed from
              the whole response divided by 3, and an empty dict (a
              ``UserWarning`` is emitted);
            * otherwise -> the rating of the last line containing "Score"
              divided by 3 (``-1`` if no such line exists), and a dict whose
              ``"reason"`` holds the collected "Criteria" / "Supporting
              Evidence" text with every ``Score: <n>`` rewritten to ``n / 3``.

        NOTE(review): any exception raised by ``re_configured_rating`` on
        unparsable text propagates to the caller -- confirm how the generator
        handles that.
        """
        if not response:
            return 0, {"reason": "No response generated."}

        if "Supporting Evidence" not in response:
            score = (
                feedback_generated.re_configured_rating(
                    response,
                    min_score_val=0,
                    max_score_val=3,
                )
            ) / 3
            warnings.warn(
                "No supporting evidence provided. Returning score only.",
                UserWarning,
            )
            return score, {}

        lines = response.split("\n")

        # The last line mentioning "Score" wins; -1 marks "no score line found".
        score = -1
        for line in lines:
            if "Score" in line:
                score = (
                    feedback_generated.re_configured_rating(
                        line,
                        min_score_val=0,
                        max_score_val=3,
                    )
                ) / 3

        # Single pass collecting the (possibly multi-line) Criteria and
        # Supporting Evidence sections. Previously this pass was nested inside
        # the score loop and re-ran once per line with the same result.
        criteria_lines = []
        supporting_evidence_lines = []
        collecting_criteria = False
        collecting_evidence = False
        for line in lines:
            if "Criteria:" in line:
                criteria_lines.append(line.split("Criteria:", 1)[1].strip())
                collecting_criteria = True
                collecting_evidence = False
            elif "Supporting Evidence:" in line:
                supporting_evidence_lines.append(
                    line.split("Supporting Evidence:", 1)[1].strip()
                )
                collecting_evidence = True
                collecting_criteria = False
            elif collecting_criteria:
                # Continuation line of the current Criteria section.
                criteria_lines.append(line.strip())
            elif collecting_evidence:
                # Continuation line of the current Supporting Evidence section
                # (this includes the trailing "Score: <n>" line).
                supporting_evidence_lines.append(line.strip())

        criteria = "\n".join(criteria_lines).strip()
        supporting_evidence = "\n".join(supporting_evidence_lines).strip()
        reason_text = (
            f"Criteria: {criteria}\n"
            f"Supporting Evidence: {supporting_evidence}"
        )

        # Rewrite every "Score: <n>" in the reason to the same 0..1 scale as the
        # returned score, so that multi-statement answers stay consistent.
        score_pattern = re.compile(r"Score:\s*(\d+(?:\.\d+)?)")
        reason_text = score_pattern.sub(
            lambda found: f"Score: {float(found.group(1)) / 3}", reason_text
        )

        return score, {"reason": reason_text}

    def call(
        self,
        premise: str,
        hypothesis: str,
        id: Optional[str] = None,
    ) -> Union[adal.GeneratorOutput, adal.Parameter]:
        """Ask the generator to judge ``hypothesis`` against ``premise``.

        Args:
            premise: Source text the hypothesis is checked against.
            hypothesis: Statement whose groundedness is evaluated.
            id: Optional identifier forwarded to the generator call.

        Returns:
            Whatever ``self.evaluate_hypothesis`` returns for the built prompt.
        """
        # TODO - add trivial statement prompt to be another parameter to optimize

        user_prompt = """SOURCE: {premise}

        Hypothesis: {hypothesis}

        Please answer with the template below for all statement sentences:

        Criteria: <Statement Sentence>
        Supporting Evidence: <Identify and describe the location in the source where the information matches the statement. Provide a detailed, human-readable summary indicating the path or key details. if nothing matches, say NOTHING FOUND. For the case where the statement is an abstention, say ABSTENTION>
        Score: <Output a number based on the scoring output space / range>
        """.format(premise=premise, hypothesis=hypothesis)

        return self.evaluate_hypothesis(
            prompt_kwargs={"user_prompt": user_prompt}, id=id
        )

In [None]:
import os

from adalflow.components.model_client.openai_client import AzureOpenAIClient

# Model configuration used to build the task pipeline here and reused further
# down for the teacher, text-optimizer and backward-engine roles. The model
# name is read from the AZURE_OPENAI_DEPLOYMENT environment variable.
az_gpt_4o_model = {
    "model_client": AzureOpenAIClient(),
    "model_kwargs": {
        "model": os.environ["AZURE_OPENAI_DEPLOYMENT"],
        "max_tokens": 4000,
        "temperature": 0.0,
        "top_p": 0.99,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "stop": None,
    },
}

# Alternative, currently disabled configuration that targets OpenAI directly.
# NOTE(review): OpenAIClient is not imported in the visible cells; import it
# before re-enabling this block.
# gpt_4o_model = {
#     "model_client": OpenAIClient(),
#     "model_kwargs": {
#         "model": "gpt-4o",
#         "max_tokens": 4000,
#         "temperature": 0.0,
#         "top_p": 0.99,
#         "frequency_penalty": 0,
#         "presence_penalty": 0,
#         "stop": None,
#     },
# }

# Instantiate the pipeline with the Azure configuration and print its structure.
task_pipeline = GroundednessTaskPipeline(**az_gpt_4o_model)
print(task_pipeline)

In [None]:
# Smoke test: run the pipeline once on a tiny premise/hypothesis pair and
# display the returned object as the cell output.
output = task_pipeline(
    premise="All fruits are edible", hypothesis=" Apple is edible"
)
output

### Start automatic prompt optimization with AdalFlow

In [None]:
task_pipeline.train()  # set to train mode

In [None]:
from dataclasses import dataclass
from dataclasses import field
import uuid

from adalflow.datasets.types import Example


@dataclass
class LLMAggreFactData(Example):
    """A dataclass for representing examples in the LLM-AggreFact dataset."""

    # Identifier of the example; a fresh UUID4 string is generated for every
    # instance that does not pass one explicitly.
    id: str = field(
        default_factory=lambda: str(uuid.uuid4()),
        metadata={"desc": "The unique identifier of the example", "type": "id"},
    )

    # Source context the response is checked against.
    query: Optional[str] = field(
        default=None,
        metadata={"desc": "The source context from the retrieved documents."},
    )

    # Response (claim) whose groundedness is evaluated.
    expected_response: Optional[str] = field(
        default=None,
        metadata={
            "desc": "The generated response to the query that its groundedness shall be evaluated."
        },
    )

    # Ground-truth groundedness score for the response.
    expected_score: Optional[float] = field(
        default=None,
        metadata={"desc": "The expected groundedness score for the answer."},
    )


def _examples_from_frame(frame):
    """Convert every row of ``frame`` into an ``LLMAggreFactData`` example.

    Args:
        frame: DataFrame with the columns ``query``, ``expected_response`` and
            ``expected_score``.

    Returns:
        A list of ``LLMAggreFactData`` objects, one per row, in row order. Each
        example gets its own auto-generated ``id``.
    """
    return [
        LLMAggreFactData(
            query=row["query"],
            expected_response=row["expected_response"],
            expected_score=row["expected_score"],
        )
        for _, row in frame.iterrows()
    ]


# One example list per split; the dev frame feeds the validation dataset.
train_dataset = _examples_from_frame(df_train)
val_dataset = _examples_from_frame(df_dev)
test_dataset = _examples_from_frame(df_test)


def groundedness_eval_fn(y: float, y_gt: float) -> float:
    """Return 1.0 when the thresholded prediction equals the ground truth, else 0.0.

    The prediction ``y`` is binarized at 0.5 (values >= 0.5 count as 1) before
    it is compared with ``y_gt``.
    """
    predicted_label = int(y >= 0.5)
    return float(predicted_label == y_gt)


def weighted_groundedness_loss(
    y: float, y_gt: float, false_positive_weight: float = 3.0
) -> float:
    """Loss that charges false positives more than other misclassifications.

    The prediction ``y`` is binarized at 0.5. A correct prediction costs 0.0,
    a false positive (predicted 1, truth 0) costs ``false_positive_weight`` and
    any other mismatch costs 1.0; the cost is then divided by
    ``false_positive_weight + 1.0``.

    Args:
        y: Predicted score.
        y_gt: Ground-truth label.
        false_positive_weight: Un-normalized cost of a false positive.

    Returns:
        The normalized loss as described above.
    """
    predicted_label = int(y >= 0.5)

    # Correct predictions carry no loss.
    if predicted_label == y_gt:
        return 0.0

    is_false_positive = predicted_label == 1 and y_gt == 0
    penalty = false_positive_weight if is_false_positive else 1.0

    # Scale by the largest-plus-one weight, as the original normalization does.
    return penalty / (false_positive_weight + 1.0)


class GroundednessAdalComponentOnLLMAggreFact(adal.AdalComponent):
    """AdalComponent that wires the groundedness pipeline to eval and loss functions.

    Evaluation uses ``groundedness_eval_fn``; the text loss wraps
    ``weighted_groundedness_loss`` with ``false_positive_weight=3.0``.

    NOTE(review): ``weighted_groundedness_loss`` returns 0.0 for a correct
    prediction, while it is passed as the ``eval_fn`` of
    ``adal.EvalFnToTextLoss``. Confirm against the AdalFlow documentation
    whether that argument is expected to be a score (higher is better); if so,
    the values fed to the optimizer are inverted.
    """

    def __init__(
        self,
        model_client: adal.ModelClient,
        model_kwargs: Dict,
        backward_engine_model_config: Optional[Dict] = None,
        teacher_model_config: Optional[Dict] = None,
        text_optimizer_model_config: Optional[Dict] = None,
    ):
        """Build the task, the eval function and the text loss.

        Args:
            model_client: Client used by the task pipeline.
            model_kwargs: Model keyword arguments used by the task pipeline.
            backward_engine_model_config: Keyword arguments unpacked into
                ``configure_backward_engine_helper``.
            teacher_model_config: Keyword arguments unpacked into
                ``configure_teacher_generator_helper``.
            text_optimizer_model_config: Keyword arguments unpacked into
                ``configure_text_optimizer_helper``.
        """
        task = GroundednessTaskPipeline(model_client, model_kwargs)
        # eval_fn = AnswerMatchAcc(type="exact_match").compute_single_item
        eval_fn = groundedness_eval_fn
        loss_fn = adal.EvalFnToTextLoss(
            eval_fn=lambda y, y_gt: weighted_groundedness_loss(
                y, y_gt, false_positive_weight=3.0
            ),
            eval_fn_desc=(
                "Weighted loss to penalize false positives - this is when the model classifies the claim as factual while it's not fully supported by the source document: "
                """y_binary = 1 if y >= 0.5 else 0
                    if y_binary == 1 and y_gt == 0:  # False positive
                        penalty = false_positive_weight
                    elif y_binary != y_gt:  # Other mismatches (false negatives)
                        penalty = 1.0
                    else:  # Correct predictions
                        return 0.0

                    # Normalize the penalty to keep the loss in [0, 1]
                    normalized_loss = penalty / (false_positive_weight + 1.0)

                    return normalized_loss
                """
            ),
        )
        # loss_fn = adal.EvalFnToTextLoss(
        #     eval_fn=eval_fn,
        #     eval_fn_desc="exact_match: 1 if y == y_gt else 0",
        # )

        super().__init__(task=task, eval_fn=eval_fn, loss_fn=loss_fn)
        self.backward_engine_model_config = backward_engine_model_config
        self.teacher_model_config = teacher_model_config
        self.text_optimizer_model_config = text_optimizer_model_config

    def prepare_task(self, sample: LLMAggreFactData):
        """Return the task callable and its keyword arguments for ``sample``."""
        return self.task.call, {
            "premise": sample.query,
            "hypothesis": sample.expected_response,
            "id": sample.id,
        }

    def prepare_loss(self, sample: LLMAggreFactData, pred: adal.Parameter):
        """Return the loss callable and its keyword arguments for ``sample``.

        ``pred.eval_input`` is set to the first element of the parsed output
        (the score), or to 0 when no parsed output is available.
        """
        # prepare the gt and pred for the loss function
        y_gt = adal.Parameter(
            name="y_gt",
            data=sample.expected_score,
            eval_input=sample.expected_score,
            requires_opt=False,
        )

        # Guard against a missing full_response *and* against its data being
        # None/empty, mirroring the check prepare_eval performs; calling len()
        # on None would otherwise raise a TypeError.
        full_response = pred.full_response if pred else None
        parsed_output = full_response.data if full_response else None
        pred.eval_input = parsed_output[0] if parsed_output else 0

        return self.loss_fn, {"kwargs": {"y": pred, "y_gt": y_gt}}

    def prepare_eval(
        self, sample: LLMAggreFactData, y_pred: adal.GeneratorOutput
    ):
        """Return the eval callable and its keyword arguments for ``sample``.

        The predicted score is the first element of ``y_pred.data`` when it is
        a float; otherwise -1 is used as the prediction.
        """
        y_label = -1
        if (
            y_pred
            and y_pred.data
            and len(y_pred.data) > 0
            and isinstance(y_pred.data[0], float)
        ):
            y_label = y_pred.data[0]
        return self.eval_fn, {"y": y_label, "y_gt": sample.expected_score}

    def configure_backward_engine(self):
        """Configure the backward engine from ``backward_engine_model_config``.

        Raises:
            TypeError: If the config is None (it cannot be unpacked).
        """
        super().configure_backward_engine_helper(
            **self.backward_engine_model_config
        )

    def configure_teacher_generator(self):
        """Configure the teacher generator from ``teacher_model_config``.

        Raises:
            TypeError: If the config is None (it cannot be unpacked).
        """
        super().configure_teacher_generator_helper(**self.teacher_model_config)

    def configure_optimizers(self):
        """Return the text optimizer(s) followed by the demo optimizer(s).

        Raises:
            TypeError: If ``text_optimizer_model_config`` is None.
        """
        to = super().configure_text_optimizer_helper(
            **self.text_optimizer_model_config
        )
        do = super().configure_demo_optimizer_helper()  # Add demo optimizer
        return to + do  # Return both text and demo optimizers

In [None]:
def diagnose(
    model_client: adal.ModelClient,
    model_kwargs: Dict,
) -> None:
    """Run ``adal.Trainer.diagnose`` on the train, val and test datasets.

    A fresh ``GroundednessAdalComponentOnLLMAggreFact`` is built from the given
    client and model kwargs (no teacher/optimizer/backward-engine configs), and
    each of the notebook-level datasets is diagnosed under its split name.

    Args:
        model_client: Client used by the task pipeline.
        model_kwargs: Model keyword arguments used by the task pipeline.

    Returns:
        None. (The function was previously annotated as returning ``Dict`` but
        never returned a value.)
    """
    adal_component = GroundednessAdalComponentOnLLMAggreFact(
        model_client, model_kwargs
    )
    trainer = adal.Trainer(adaltask=adal_component)

    splits = (
        ("train", train_dataset),
        ("val", val_dataset),
        ("test", test_dataset),
    )
    for split_name, dataset in splits:
        trainer.diagnose(dataset=dataset, split=split_name)

In [None]:
# Diagnose all three splits with the Azure model configuration.
diagnose(**az_gpt_4o_model)

In [None]:
def train(
    train_batch_size=2,
    raw_shots: int = 0,
    bootstrap_shots: int = 2,
    max_steps=1,
    num_workers=4,
    strategy="random",
    optimization_order="sequential",
    debug=False,
    resume_from_ckpt=None,
    exclude_input_fields_from_bootstrap_demos=False,
):
    """Build the AdalComponent and an ``adal.Trainer``, then fit on the notebook datasets.

    The same ``az_gpt_4o_model`` configuration is used for the task model and
    for the teacher, text-optimizer and backward-engine roles. Both the
    component and the trainer are printed before fitting.

    Author's note on ``train_batch_size``: a larger batch size is not that
    effective, probably because of the LLM's "lost in the middle" behavior.

    Args:
        train_batch_size, strategy, max_steps, num_workers, raw_shots,
        bootstrap_shots, optimization_order,
        exclude_input_fields_from_bootstrap_demos: Forwarded unchanged to
            ``adal.Trainer``.
        debug: Forwarded to both ``adal.Trainer`` and ``Trainer.fit``.
        resume_from_ckpt: Forwarded to ``Trainer.fit``.
    """
    shared_model_config = az_gpt_4o_model
    component = GroundednessAdalComponentOnLLMAggreFact(
        **shared_model_config,
        teacher_model_config=shared_model_config,
        text_optimizer_model_config=shared_model_config,
        backward_engine_model_config=shared_model_config,
    )
    print(component)

    trainer_options = dict(
        train_batch_size=train_batch_size,
        adaltask=component,
        strategy=strategy,
        max_steps=max_steps,
        num_workers=num_workers,
        raw_shots=raw_shots,
        bootstrap_shots=bootstrap_shots,
        debug=debug,
        weighted_sampling=True,
        optimization_order=optimization_order,
        exclude_input_fields_from_bootstrap_demos=exclude_input_fields_from_bootstrap_demos,
    )
    optimizer_trainer = adal.Trainer(**trainer_options)
    print(optimizer_trainer)

    # Fit on the datasets defined at notebook level.
    fit_options = dict(
        train_dataset=train_dataset,
        val_dataset=val_dataset,
        test_dataset=test_dataset,
        debug=debug,
        resume_from_ckpt=resume_from_ckpt,
    )
    optimizer_trainer.fit(**fit_options)

In [None]:
# Launch the optimization run: batches of 8, 8 steps, the "constrained"
# strategy, one raw and one bootstrapped demo, with input fields excluded from
# the bootstrapped demos.
train(
    train_batch_size=8,
    debug=False,
    max_steps=8,
    strategy="constrained",
    raw_shots=1,
    bootstrap_shots=1,
    exclude_input_fields_from_bootstrap_demos=True,
)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


def plot_confusion_matrix(tp, tn, fp, fn, title):
    """Draw a 2x2 confusion matrix as an annotated heat map and show it.

    Rows are the actual class and columns the predicted class, both ordered
    Positive then Negative, i.e. the grid is ``[[tp, fn], [fp, tn]]``.

    Args:
        tp: Number of true positives.
        tn: Number of true negatives.
        fp: Number of false positives.
        fn: Number of false negatives.
        title: Title placed above the figure.
    """
    class_names = ["Positive", "Negative"]
    tick_positions = [0, 1]
    counts = [[tp, fn], [fp, tn]]

    fig, ax = plt.subplots(figsize=(5, 5))
    heatmap = ax.matshow(counts, cmap="Blues")
    fig.colorbar(heatmap)

    ax.set_xticks(tick_positions)
    ax.set_yticks(tick_positions)
    ax.set_xticklabels(class_names)
    ax.set_yticklabels(class_names)

    # Write each count in the middle of its cell (x is the column, y the row).
    for (row, col), count in np.ndenumerate(counts):
        ax.text(
            col,
            row,
            f"{count}",
            ha="center",
            va="center",
            color="black",
            fontsize=12,
        )

    plt.title(title, pad=20)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.tight_layout()
    plt.show()


# Directory holding the exported result CSVs. Override it with the
# TRULENS_RESULTS_DIR environment variable instead of editing the notebook;
# the default is the location this notebook was originally run from.
results_dir = os.environ.get(
    "TRULENS_RESULTS_DIR", "/Users/dhuang/Documents/git/trulens"
)
csv_file = os.path.join(results_dir, "LLM AggreFact groundedness (2).csv")

# Load the per-row confusion counts recorded before prompt optimization.
df = pd.read_csv(csv_file)


# Aggregate TP, TN, FP, FN across all rows
total_tp = df["TOTAL_TP"].sum()
total_tn = df["TOTAL_TN"].sum()
total_fp = df["TOTAL_FP"].sum()
total_fn = df["TOTAL_FN"].sum()


plot_confusion_matrix(
    total_tp,
    total_tn,
    total_fp,
    total_fn,
    "Confusion Matrix (before optimization)",
)

In [None]:
# Directory holding the exported result CSVs. Override it with the
# TRULENS_RESULTS_DIR environment variable instead of editing the notebook;
# the default is the location this notebook was originally run from.
results_dir = os.environ.get(
    "TRULENS_RESULTS_DIR", "/Users/dhuang/Documents/git/trulens"
)
csv_file2 = os.path.join(results_dir, "Optimized prompt - LLM AggreFact.csv")

# Load the per-row confusion counts recorded with the optimized prompt.
# NOTE: this rebinds `df`, replacing the "before optimization" frame.
df = pd.read_csv(csv_file2)


# Aggregate TP, TN, FP, FN across all rows
total_tp = df["TOTAL_TP"].sum()
total_tn = df["TOTAL_TN"].sum()
total_fp = df["TOTAL_FP"].sum()
total_fn = df["TOTAL_FN"].sum()


plot_confusion_matrix(
    total_tp,
    total_tn,
    total_fp,
    total_fn,
    "Confusion Matrix (after optimization)",
)