In [None]:
from trulens.benchmark.benchmark_frameworks.dataset.beir_loader import (
    TruBEIRDataLoader,
)

# Point the BEIR loader at the HotpotQA dataset, using the current working
# directory ("./") as its data folder.
beir_data_loader = TruBEIRDataLoader(
    dataset_name="hotpotqa",
    data_folder="./",
)

# download=True is forwarded to the loader; the result is the frame that the
# following cell samples from.
hotpotqa = beir_data_loader.load_dataset_to_df(download=True)

In [None]:
import random

import pandas as pd

# Build the answer-relevance benchmark: every sampled query contributes one
# positive row (its own expected response, score 1) and one negative row
# (a response borrowed from a different query, score 0).

random.seed(42)  # makes the negative-response draws below reproducible
hotpotqa_raw_subset = hotpotqa.sample(n=500, random_state=42)

# (query, expected_response) pairs; also the pool negatives are drawn from.
all_responses = list(
    zip(hotpotqa_raw_subset["query"], hotpotqa_raw_subset["expected_response"])
)

# (query, chunk text) pairs, one per expected chunk. Not referenced again in
# the cells shown here; kept so any later consumer still finds it.
all_contexts = [
    (query, chunk["text"])
    for query, chunks in zip(
        hotpotqa_raw_subset["query"], hotpotqa_raw_subset["expected_chunks"]
    )
    for chunk in chunks
]


def _normalize_answer(text):
    """Return a case- and whitespace-insensitive key for comparing responses."""
    return str(text).strip().lower()


# Normalize once up front instead of once per (query, candidate) pair.
_normalized_responses = [
    _normalize_answer(response) for _query, response in all_responses
]

answer_relevance_dataset = []

for query, expected_response in all_responses:
    # Positive example for answer relevance
    answer_relevance_dataset.append({
        "query": query,
        "response": expected_response,  # Positive response
        "expected_score": 1,  # Positive example, score = 1
    })

    # Negative example: a response taken from another query. Candidates whose
    # text matches this query's expected response are excluded as well --
    # otherwise a repeated answer (e.g. the same short answer shared by two
    # queries) would be stored with expected_score 0 although it is correct.
    expected_key = _normalize_answer(expected_response)
    negative_candidates = [
        response
        for (other_query, response), response_key in zip(
            all_responses, _normalized_responses
        )
        if other_query != query and response_key != expected_key
    ]
    if not negative_candidates:
        raise ValueError(
            f"No negative response candidates available for query: {query!r}"
        )
    negative_response = random.choice(negative_candidates)

    answer_relevance_dataset.append({
        "query": query,
        "response": negative_response,  # Negative response
        "expected_score": 0,  # Negative example, score = 0
    })

answer_relevance_df = pd.DataFrame(answer_relevance_dataset)

In [None]:
# Display the assembled answer-relevance benchmark
# (columns: query, response, expected_score).
answer_relevance_df

In [None]:
import pandas as pd
from trulens.benchmark.benchmark_frameworks.experiments.dataset_preprocessing import (
    generate_trec_dl_passage_benchmark,
)

# Build the context-relevance benchmark from the judged TREC-DL 2021 and 2022
# passage sets, persist it as CSV, and load it back from that file.

# Single source of truth for the output file (it is written and then re-read).
TREC_DL_COMBINED_CSV = "trec_dl_2021_2022_combined_scoreddocs_intervals.csv"

# Same generator settings for both years; only the dataset path differs.
_trec_samples_by_year = {
    year: list(
        generate_trec_dl_passage_benchmark(
            max_samples_per_query_per_score=4,
            dataset_path=f"msmarco-passage-v2/trec-dl-{year}/judged",
        )
    )
    for year in (2021, 2022)
}
trec_2021_samples = _trec_samples_by_year[2021]
trec_2022_samples = _trec_samples_by_year[2022]
trec_combined = trec_2021_samples + trec_2022_samples


context_relevance_df = pd.DataFrame(trec_combined)
context_relevance_df.to_csv(TREC_DL_COMBINED_CSV, index=False)

# Re-read from disk so the frame used downstream is exactly what the saved
# CSV contains.
context_relevance_df = pd.read_csv(TREC_DL_COMBINED_CSV)
print(f"Total number of samples: {len(context_relevance_df)}")

In [None]:
# Make the `expected_response` values also available under the `context`
# column name.
context_relevance_df["context"] = context_relevance_df["expected_response"]

# Collapse the expected scores into binary labels: values of at least 0.5
# become 1, everything else becomes 0.
context_relevance_df["expected_score"] = (
    context_relevance_df["expected_score"].ge(0.5).astype("int64")
)

context_relevance_df

In [None]:
from trulens.benchmark.benchmark_frameworks.experiments.dataset_preprocessing import (
    generate_balanced_llm_aggrefact_benchmark,
)

llm_aggrefact_df = generate_balanced_llm_aggrefact_benchmark(split="test")

# Get 500 samples from each class.
# Each label group is sampled on its own with the same seed and the pieces are
# concatenated. This selects the same rows as
# `groupby("label").apply(lambda x: x.sample(...))`, but does not rely on
# `apply` passing the grouping column through to the result -- a behaviour
# pandas has deprecated. Without that column, `label` would be lost after the
# index reset and the lookups below would fail.
groundedness_df = pd.concat(
    [
        label_group.sample(n=500, random_state=42)
        for _label, label_group in llm_aggrefact_df.groupby("label")
    ],
    ignore_index=True,
)

# Verify the balance
print(groundedness_df["label"].value_counts())

In [None]:
# Display the class-balanced groundedness sample (500 rows per label).
groundedness_df

In [None]:
# Copy the source columns to the column names shared with the other benchmark
# frames: doc -> context, claim -> response, label -> expected_score.
_groundedness_column_aliases = {
    "context": "doc",
    "response": "claim",
    "expected_score": "label",
}
for _alias, _source in _groundedness_column_aliases.items():
    groundedness_df[_alias] = groundedness_df[_source]

groundedness_df

In [None]:
from trulens.benchmark.benchmark_frameworks.experiments.dataset_preprocessing import (
    visualize_expected_score_distribution,
)

# Pass each benchmark's expected_score column to the visualization helper,
# in order: answer relevance, context relevance, groundedness.
visualize_expected_score_distribution(answer_relevance_df["expected_score"])
visualize_expected_score_distribution(context_relevance_df["expected_score"])
visualize_expected_score_distribution(groundedness_df["expected_score"])

In [None]:
# Count the rows per expected_score value in the context-relevance benchmark
# to see how evenly the classes are represented.
context_relevance_df[
    "expected_score"
].value_counts()  # best effort to balance the classes, given the data annotation quality

In [None]:
from trulens.feedback.groundtruth import GroundTruthAggregator

# Bind the four aggregate metrics (recall, precision, F1, Cohen's kappa) to
# the answer-relevance labels. A separate GroundTruthAggregator is constructed
# for every metric, each from its own freshly materialized label list.
f_recall, f_precision, f_f1_score, f_cohens_kappa = [
    getattr(
        GroundTruthAggregator(answer_relevance_df["expected_score"].to_list()),
        metric_name,
    )
    for metric_name in ("recall", "precision", "f1_score", "cohens_kappa")
]

In [None]:
# Requires the third-party `krippendorff` package; uncomment to install it:
# %pip install krippendorff
import krippendorff
import numpy as np

# Toy ratings matrix: three rows of scores over five items, where np.nan
# marks a rating that is missing.
_scores_row_1 = [1, 2, np.nan, 4, 5]  # llm judge scores
_scores_row_2 = [1, 2, 3, 4, np.nan]
_scores_row_3 = [1, 2, 3, 4, 5]
ratings_with_nan = np.array([_scores_row_1, _scores_row_2, _scores_row_3])

# Agreement across the rows, treating the scores as ordinal values.
alpha = krippendorff.alpha(
    ratings_with_nan,
    level_of_measurement="ordinal",
)
print(f"Krippendorff's Alpha (with missing data): {alpha}")