### Datasets preprocessing:

Datasets that need some preprocessing before they can be used in the `TruBenchmarkExperiment` class:
1. Snowflake IT (internal): both the rephrased and the regular versions (open question); this dataset should be used for all 3 metrics in the triad
2. SummEval (CNN and DailyMail summarizations with annotation) for groundedness
3. QAGS (CNN and DailyMail with Turkers' annotation) for groundedness
4. QAGS (XSUM with Turkers' annotation) for groundedness
5. MSMARCO V2 for context relevance
6. HotPot QA for answer relevance 



In [None]:
import ast
import csv
import json

import pandas as pd


# SummEval
def generate_summeval_groundedness_golden_set(file_path):
    """Yield groundedness golden-set records from a SummEval JSON-lines file.

    Every line of the file is parsed as one JSON object. For each machine
    summary in that object one record is produced, pairing the source text
    with the summary and its human consistency annotation.

    Malformed input is reported with ``print`` and skipped instead of raised:
    lines that are not valid JSON, objects missing ``machine_summaries`` or
    ``consistency``, and objects whose two lists differ in length.

    Args:
        file_path: Path to the JSON-lines file.

    Yields:
        dict: ``query`` (the object's ``text``, or ``""`` when absent),
        ``expected_response`` (one machine summary), ``expected_score``
        (consistency mapped from [1, 5] to [0, 1], rounded to 2 decimals)
        and ``human_score`` (the raw consistency value).
    """

    def calculate_expected_score(normalized_metrics_lst, weights_lst):
        # Weighted mean of already-normalized metrics, rounded to 2 decimals.
        if len(normalized_metrics_lst) != len(weights_lst):
            raise ValueError("Each metric needs exactly one weight")
        weighted_total = sum(
            metric * weight
            for metric, weight in zip(normalized_metrics_lst, weights_lst)
        )
        return round(weighted_total / sum(weights_lst), 2)

    # Explicit UTF-8 so parsing does not depend on the platform's locale.
    with open(file_path, encoding="utf-8") as f:
        for line in f:
            # Each line is a separate JSON object
            try:
                row = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"JSON decode error: {e}. Check the line format.")
                continue

            try:
                summaries = row["machine_summaries"]
                consistency_scores = row["consistency"]
            except KeyError as e:
                print(
                    f"Key error: {e}. Please check if the keys exist in the JSON file."
                )
                continue

            # Explicit check instead of `assert`, which is removed under -O.
            if len(summaries) != len(consistency_scores):
                print(
                    "Assertion error: Mismatch in lengths of machine_summaries and consistency. The lengths of 'machine_summaries' and 'consistency' do not match."
                )
                continue

            query = row.get("text", "")  # Default to empty string if key not found
            for summary, consistency in zip(summaries, consistency_scores):
                yield {
                    "query": query,
                    "expected_response": summary,
                    "expected_score": calculate_expected_score(
                        [(consistency - 1) / 4],  # Normalize from [1, 5] to [0, 1]
                        [1.0],
                    ),
                    "human_score": consistency,
                }


# Snowflake IT dataset


def generatate_snowflake_it_golden_set_groundedness(file_path):
    """Yield groundedness golden-set records from the Snowflake IT CSV file.

    The ``golden`` column of each row is parsed as a Python list literal.
    One record is produced per list element, using the element as the source
    and the row's ``expected_response`` as the statement.

    Rows whose ``golden`` value cannot be parsed, or does not evaluate to a
    list, are reported with ``print`` and skipped.

    Args:
        file_path: Path to a CSV file with ``golden`` and
            ``expected_response`` columns.

    Yields:
        dict: ``query`` (one golden chunk), ``expected_response`` and
        ``expected_score`` (always ``1``).
    """
    # newline="" hands raw line endings to the csv module, as its
    # documentation requires, so line breaks inside quoted fields survive.
    with open(file_path, mode="r", encoding="utf-8", newline="") as f:
        for row in csv.DictReader(f):
            # Convert the 'golden' column from a string to a list
            try:
                expected_chunks = ast.literal_eval(row["golden"])
                if not isinstance(expected_chunks, list):
                    raise ValueError("Golden column should be a list")
            except (ValueError, SyntaxError) as e:
                print(f"Error parsing golden column: {e}")
                continue

            for expected_chunk in expected_chunks:
                yield {
                    "query": expected_chunk,
                    "expected_response": row["expected_response"],
                    "expected_score": 1,  # Static score as per the requirement
                }


def generate_snowflake_it_golden_set_answer_relevance(file_path):
    """Yield answer-relevance golden-set records from the Snowflake IT CSV file.

    Args:
        file_path: Path to a CSV file with ``query`` and
            ``expected_response`` columns.

    Yields:
        dict: ``query``, ``expected_response`` and ``expected_score``
        (always ``1``), one record per CSV row.
    """
    # newline="" hands raw line endings to the csv module, as its
    # documentation requires, so line breaks inside quoted fields survive.
    with open(file_path, mode="r", encoding="utf-8", newline="") as f:
        for row in csv.DictReader(f):
            yield {
                "query": row["query"],
                "expected_response": row["expected_response"],
                "expected_score": 1,  # always positive example for answer relevance
            }


def generate_snowflake_it_golden_set_context_relevance(file_path):
    """Yield context-relevance golden-set records from the Snowflake IT CSV file.

    The ``golden`` column of each row is parsed as a Python list literal.
    One record is produced per list element, pairing the row's ``query``
    with that element as the context.

    Rows whose ``golden`` value cannot be parsed, or does not evaluate to a
    list, are reported with ``print`` and skipped.

    Args:
        file_path: Path to a CSV file with ``query`` and ``golden`` columns.

    Yields:
        dict: ``query``, ``expected_response`` (one golden chunk) and
        ``expected_score`` (always ``1``).
    """
    # newline="" hands raw line endings to the csv module, as its
    # documentation requires, so line breaks inside quoted fields survive.
    with open(file_path, mode="r", encoding="utf-8", newline="") as f:
        for row in csv.DictReader(f):
            # Convert the 'golden' column from a string to a list
            try:
                expected_chunks = ast.literal_eval(row["golden"])
                if not isinstance(expected_chunks, list):
                    raise ValueError("Golden column should be a list")
            except (ValueError, SyntaxError) as e:
                print(f"Error parsing golden column: {e}")
                continue

            for expected_chunk in expected_chunks:
                yield {
                    "query": row["query"],
                    "expected_response": expected_chunk,
                    "expected_score": 1,  # Static score as per the requirement
                }


def generate_qags_golden_set_groundedness(file_path):
    """Yield groundedness golden-set records from a QAGS JSON-lines file.

    Each non-blank line is parsed as a JSON object with an ``article`` and a
    list of ``summary_sentences``. One record is produced per summary
    sentence; its score is the fraction of worker responses equal to
    ``"yes"`` (case-insensitive).

    Blank lines are ignored. A summary sentence with no worker responses is
    reported with ``print`` and skipped, because no score can be computed
    for it.

    Args:
        file_path: Path to the JSON-lines file.

    Yields:
        dict: ``query`` (the article), ``expected_response`` (one summary
        sentence) and ``expected_score`` (a float in [0, 1]).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an expected key is missing from a record.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            # Tolerate empty lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            # Parse each line as a JSON object
            data = json.loads(line)

            # The article is the source every summary sentence is judged against
            query = data["article"]

            for summary in data["summary_sentences"]:
                votes = [response["response"] for response in summary["responses"]]
                if not votes:
                    # Avoid dividing by zero when a sentence has no annotations.
                    print(
                        f"No worker responses for summary sentence, skipping: {summary['sentence']!r}"
                    )
                    continue

                # 'yes' counts as 1 and anything else as 0; the score is the mean
                yes_count = sum(vote.lower() == "yes" for vote in votes)

                yield {
                    "query": query,
                    "expected_response": summary["sentence"],
                    "expected_score": yes_count / len(votes),
                }

In [None]:
# Parse the SummEval file once; the same records feed both the label list and
# the DataFrame, so the file is not read (and its warnings not printed) twice.
summeval_file_path = "/Users/dhuang/Documents/git/trulens/src/benchmark/trulens/benchmark/benchmark_frameworks/experiments/data/summeval_test.json"

summeval_list = list(
    generate_summeval_groundedness_golden_set(summeval_file_path)
)

summeval_true_labels = [entry["expected_score"] for entry in summeval_list]

summeval = pd.DataFrame(summeval_list)

In [None]:
# Build one DataFrame per feedback type from the Snowflake IT CSV. Every
# example in this dataset is a positive one, so each label list is all ones
# with the same length as its DataFrame.
snowflake_it_file_path = "/Users/dhuang/Documents/git/trulens/src/benchmark/trulens/benchmark/benchmark_frameworks/experiments/data/snowflake_it_v3.csv"

# Answer relevance: (query, expected_response) pairs.
snowflake_it_for_answer_relevance = pd.DataFrame(
    [*generate_snowflake_it_golden_set_answer_relevance(snowflake_it_file_path)]
)
snowflake_it_answer_relevance_true_labels = [1] * len(
    snowflake_it_for_answer_relevance
)

# Context relevance: (query, golden chunk) pairs.
snowflake_it_for_context_relevance = pd.DataFrame(
    [*generate_snowflake_it_golden_set_context_relevance(snowflake_it_file_path)]
)
snowflake_it_for_context_relevance_true_labels = [1] * len(
    snowflake_it_for_context_relevance
)

# Groundedness: (golden chunk, expected_response) pairs.
snowflake_it_for_groundness = pd.DataFrame(
    [*generatate_snowflake_it_golden_set_groundedness(snowflake_it_file_path)]
)
snowflake_it_for_groundness_true_labels = [1] * len(snowflake_it_for_groundness)

In [None]:
# Materialize each QAGS split once and derive both the DataFrame and the
# label list from the records. This avoids building a Series per row with
# DataFrame.iterrows() just to read one column.
qags_cnn_dm_records = list(
    generate_qags_golden_set_groundedness(
        "/Users/dhuang/Documents/git/trulens/src/benchmark/trulens/benchmark/benchmark_frameworks/experiments/data/qags_mturk_cnndm.jsonl"
    )
)
qags_cnn_dm = pd.DataFrame(qags_cnn_dm_records)

qags_cnn_dm_true_labels = [
    record["expected_score"] for record in qags_cnn_dm_records
]


qags_xsum_records = list(
    generate_qags_golden_set_groundedness(
        "/Users/dhuang/Documents/git/trulens/src/benchmark/trulens/benchmark/benchmark_frameworks/experiments/data/qags_mturk_xsum.jsonl"
    )
)
qags_xsum = pd.DataFrame(qags_xsum_records)

qqags_xsum_true_labels = [
    record["expected_score"] for record in qags_xsum_records
]
# Correctly spelled alias; the `qqags_` name is kept for existing references.
qags_xsum_true_labels = qqags_xsum_true_labels

### Set up feedback LLM providers 

We will experiment with two current OpenAI models and a mix of commercial and open-source models available in Cortex.

In [None]:
import os

import snowflake.connector
from trulens.providers.cortex import Cortex
from trulens.providers.openai import OpenAI

# Candidate model names, kept for reference only; nothing below reads them.
# OPENAI_MODELS = ["gpt-4o", "gpt-4o-mini"]
# CORTEX_MODELS = ["snowflake-arctic", "llama3.1-8b", "llama3.1-70b", "mistral-large"]

# OpenAI-backed feedback providers.
# NOTE(review): the model name is passed positionally here, while the Cortex
# providers below pass it as `model_engine=` — confirm the positional
# argument is interpreted as the model name.
gpt_4o = OpenAI("gpt-4o")
gpt_4o_mini = OpenAI("gpt-4o-mini")

# Snowflake credentials come from the environment; a missing variable raises
# KeyError here, before any connection is attempted.
snowflake_connection_parameters = {
    "account": os.environ["SNOWFLAKE_ACCOUNT"],
    "user": os.environ["SNOWFLAKE_USER"],
    "password": os.environ["SNOWFLAKE_USER_PASSWORD"],
}
# One connection object is shared by both Cortex providers below.
snowflake_connection = snowflake.connector.connect(
    **snowflake_connection_parameters
)

# Cortex-backed feedback providers, one per model engine.
snowflake_arctic = Cortex(snowflake_connection, model_engine="snowflake-arctic")
llama3_1_8b = Cortex(snowflake_connection, model_engine="llama3.1-8b")

### Sanity check 

In [None]:
# Smoke-test groundedness scoring on the first SummEval example.
# `summeval` is a DataFrame, so the row is selected by position with `.iloc`;
# `summeval[0]` would look up a *column* labelled 0 and raise KeyError.
# NOTE(review): the criteria text (including its "socre" typo) is sent to the
# model verbatim and is left unchanged here.
first_summeval_example = summeval.iloc[0]
gpt_4o.groundedness_measure_with_cot_reasons(
    first_summeval_example["query"],
    first_summeval_example["expected_response"],
    criteria="Grounded should get a socre of 10, and non-grounded should get a score of 0. The score should be between 0 to 10",
    max_score_val=10,
    use_sent_tokenize=False,
)

In [None]:
# Smoke-test context relevance on the first Snowflake IT example.
# The dataset is a DataFrame, so the row is selected by position with `.iloc`;
# indexing it with `[0]` would look up a *column* labelled 0 and raise KeyError.
first_context_relevance_example = snowflake_it_for_context_relevance.iloc[0]
gpt_4o_mini.context_relevance_with_cot_reasons(
    first_context_relevance_example["query"],
    first_context_relevance_example["expected_response"],
)

In [None]:
import pandas as pd
from trulens.core import TruSession

# Create a TruLens session and reset its database.
# NOTE(review): reset_database() presumably discards previously logged data —
# confirm before running this cell against a database you want to keep.
session = TruSession()
session.reset_database()

In [None]:
def context_relevance_likert_4(provider, input, output) -> float:
    """Score the relevance of a context to a question with provider defaults.

    Args:
        provider: Object exposing ``context_relevance_with_cot_reasons``.
        input: The question, forwarded as ``question``.
        output: The context, forwarded as ``context``.

    Returns:
        Whatever the provider method returns, unchanged.
        NOTE(review): the ``_with_cot_reasons`` name suggests the provider may
        return reasons alongside the score — confirm it matches ``float``.
    """
    score_fn = provider.context_relevance_with_cot_reasons
    return score_fn(question=input, context=output)


def context_relevance_binary(provider, input, output) -> float:
    """Score the relevance of a context to a question on a 0/1 scale.

    Args:
        provider: Object exposing ``context_relevance``.
        input: The question, forwarded as ``question``.
        output: The context, forwarded as ``context``.

    Returns:
        Whatever ``provider.context_relevance`` returns when called with the
        binary criteria and a score range of 0 to 1.
    """
    binary_criteria = "A relevant context to the question should get a score of 1, and an irrelevant context should get a score of 0. The score should be either 0 or 1 (binary)."
    result = provider.context_relevance(
        question=input,
        context=output,
        criteria=binary_criteria,
        min_score_val=0,
        max_score_val=1,
    )
    return result


def answer_relevance_binary(provider, input, output) -> float:
    """Score the relevance of a response to a prompt on a 0/1 scale.

    Args:
        provider: Object exposing ``relevance``.
        input: The prompt, forwarded as ``prompt``.
        output: The response, forwarded as ``response``.

    Returns:
        Whatever ``provider.relevance`` returns when called with the binary
        criteria and a score range of 0 to 1.
    """
    binary_criteria = "A relevant response to the prompt should get a score of 1, and an irrelevant response should get a score of 0. The score should be either 0 or 1 (binary)."
    result = provider.relevance(
        prompt=input,
        response=output,
        criteria=binary_criteria,
        min_score_val=0,
        max_score_val=1,
    )
    return result


def groundedness_likert_4(provider, input, output) -> float:
    """Score how grounded a statement is in a source with provider defaults.

    Args:
        provider: Object exposing ``groundedness_measure_with_cot_reasons``.
        input: The source text, forwarded as ``source``.
        output: The statement to check, forwarded as ``statement``.

    Returns:
        Whatever the provider method returns, unchanged.
        NOTE(review): the ``_with_cot_reasons`` name suggests the provider may
        return reasons alongside the score — confirm it matches ``float``.
    """
    call_kwargs = {"source": input, "statement": output}
    return provider.groundedness_measure_with_cot_reasons(**call_kwargs)


def groudedness_binary(provider, input, output) -> float:
    """Score how grounded a statement is in a source on a 0/1 scale.

    Args:
        provider: Object exposing ``groundedness_measure_with_cot_reasons``.
        input: The source text, forwarded as ``source``.
        output: The statement to check, forwarded as ``statement``.

    Returns:
        Whatever the provider method returns when called with the binary
        criteria and a score range of 0 to 1.
        NOTE(review): the ``_with_cot_reasons`` name suggests the provider may
        return reasons alongside the score — confirm it matches ``float``.
    """
    binary_criteria = "A grounded response based on the source should get a socre of 1, and non-grounded one should get a score of 0. The score should be either 0 or 1 (binary)."
    result = provider.groundedness_measure_with_cot_reasons(
        source=input,
        statement=output,
        criteria=binary_criteria,
        min_score_val=0,
        max_score_val=1,
    )
    return result

In [None]:
# snowflake_it_benchmark = TruBenchmarkExperiment(