In [None]:
from trulens.core import TruSession

# Create a TruLens session and reset its database.
# NOTE(review): reset_database() presumably clears previously stored records
# so the notebook starts clean — confirm before pointing this at a shared
# database.
session = TruSession()
session.reset_database()

### Datasets preprocessing:

Datasets that need some preprocessing before they can be used in the `TruBenchmarkExperiment` class:
1. Snowflake IT (internal): use both the rephrased and the regular versions? This dataset should be used for all 3 metrics in the triad
2. SummEval (CNN and DailyMail summarizations with annotation) for groundedness
3. QAGS (CNN and DailyMail with Turkers' annotation) for groundedness
4. QAGS (XSUM with Turkers' annotation) for groundedness
5. MSMARCO V2 for context relevance
6. HotPot QA for answer relevance 



In [None]:
import json


# SummEval
def generate_summeval_groundedness_golden_set(file_path):
    """Yield groundedness golden-set entries built from a SummEval JSON dump.

    The file must contain a top-level ``"rows"`` list. Each item holds a
    ``"row"`` mapping with the keys ``"text"``, ``"machine_summaries"`` and
    ``"consistency"``, where ``"consistency"`` carries one score per machine
    summary.

    Args:
        file_path: Path to the JSON file to read.

    Yields:
        One dict per (source text, machine summary) pair with the keys
        ``"query"`` (the source text), ``"expected_response"`` (the machine
        summary), ``"expected_score"`` (the consistency score rescaled from
        [1, 5] to [0, 1] and rounded to two decimals) and ``"human_score"``
        (the raw consistency score).

    Raises:
        AssertionError: If a row has a different number of machine summaries
            and consistency scores.
    """

    def calculate_expected_score(normalized_metrics_lst, weights_lst):
        # Weighted average of already-normalized metrics, rounded to 2 places.
        assert len(normalized_metrics_lst) == len(weights_lst)
        weighted_total = sum(
            metric * weight
            for metric, weight in zip(normalized_metrics_lst, weights_lst)
        )
        return round(weighted_total / sum(weights_lst), 2)

    # JSON is UTF-8 by specification; do not depend on the platform's default
    # encoding, which would garble non-ASCII article text on some systems.
    with open(file_path, encoding="utf-8") as f:
        data = json.load(f)

    for item in data["rows"]:
        row = item["row"]
        summaries = row["machine_summaries"]
        consistency_scores = row["consistency"]

        # zip() below would silently truncate on a mismatch, so keep the
        # explicit length check.
        assert len(summaries) == len(consistency_scores)

        for summary, consistency in zip(summaries, consistency_scores):
            yield {
                "query": row["text"],
                "expected_response": summary,
                "expected_score": calculate_expected_score(
                    [
                        (consistency - 1)
                        / 4,  # normalize from [1, 5] to [0, 1]
                    ],
                    [1.0],
                ),
                "human_score": consistency,
            }