In [None]:
# Import the ground-truth agreement and benchmark aggregation utilities
from trulens_eval.feedback import GroundTruthAgreement, BenchmarkAggregator
from trulens_eval import Tru
import numpy as np

# Tru instance used by the later cells to reset the database, build the
# benchmark experiments, and read the leaderboard.
tru = Tru()

# Golden set: one dict per (query, response, expected_score) triple. The
# expected score is the label the benchmark compares generated scores against.
golden_set = [
    dict(query=query, response=response, expected_score=expected_score)
    for query, response, expected_score in (
        (
            "who are the Apple's competitors?",
            "Apple competitors include Samsung, Google, and Microsoft.",
            1.0,
        ),
        (
            "what is the capital of France?",
            "Paris is the capital of France.",
            1.0,
        ),
        (
            "what is the capital of Spain?",
            "I love going to Spain.",
            0,
        ),
    )
]

# Wrap the golden set in a GroundTruthAgreement object.
ground_truth = GroundTruthAgreement(golden_set)

In [None]:
# Reset the TruLens database before running the benchmarks.
# NOTE(review): this presumably discards previously logged apps/records, so
# avoid running it against a database you want to keep — confirm.
tru.reset_database()

In [None]:
from trulens_eval.feedback import Cortex

# Feedback provider whose context-relevance methods are benchmarked below,
# configured with the "snowflake-arctic" model engine.
provider = Cortex(model_engine="snowflake-arctic")

In [None]:
from typing import Tuple


def context_relevance_ff_to_score(input, output, temperature):
    """Score `output` as context for the question `input`.

    Thin adapter around `provider.context_relevance`.

    Args:
        input: Passed to the provider as `question`.
        output: Passed to the provider as `context`.
        temperature: Forwarded unchanged as the provider's `temperature`.

    Returns:
        Whatever `provider.context_relevance` returns (presumably a single
        relevance score — confirm against the provider).
    """
    relevance_score = provider.context_relevance(
        question=input,
        context=output,
        temperature=temperature,
    )
    return relevance_score


def context_relevance_ff_to_score_with_confidence(
    input, output, temperature
) -> Tuple[float, float]:
    """Score `output` as context for `input`, with a second returned value.

    Thin adapter around `provider.context_relevance_verb_confidence`.

    Args:
        input: Passed to the provider as `question`.
        output: Passed to the provider as `context`.
        temperature: Forwarded unchanged as the provider's `temperature`.

    Returns:
        The provider's result, annotated as a pair of floats (presumably
        `(score, confidence)` — confirm the ordering against the provider).
    """
    score_and_confidence = provider.context_relevance_verb_confidence(
        question=input,
        context=output,
        temperature=temperature,
    )
    return score_and_confidence

### Collect all prompts and responses from the golden set, and pass the expected scores to BenchmarkAggregator as ground truth labels

In [None]:
# Pull the queries and responses out of the golden set, preserving its order.
# Comprehensions avoid index bookkeeping and do not leak loop temporaries into
# the notebook namespace.
prompts = [entry["query"] for entry in golden_set]
responses = [entry["response"] for entry in golden_set]

# The expected scores serve as the ground-truth labels for the aggregator.
true_labels = [entry["expected_score"] for entry in golden_set]

# Aggregation function bound to these labels (`mae` is presumably mean absolute
# error — confirm against BenchmarkAggregator).
mae_agg_func = BenchmarkAggregator(true_labels=true_labels).mae

In [None]:
from trulens_eval.feedback.benchmark_frameworks.tru_benchmark_experiment import (
    BenchmarkParams,
)

# Configure a benchmark experiment under the app id "benchmark_arctic":
#   - ground truth: the golden set defined above
#   - scoring function: context_relevance_ff_to_score
#   - aggregation: mae_agg_func
# temperature=0 is supplied through BenchmarkParams (presumably forwarded to
# the scoring function's `temperature` argument — confirm).
tru_benchmark_arctic = tru.BenchmarkExperiment(
    app_id="benchmark_arctic",
    ground_truth=golden_set,
    feedback_to_score_fn=context_relevance_ff_to_score,
    agg_funcs=[mae_agg_func],
    benchmark_params=BenchmarkParams(temperature=0),
)

In [None]:
# Run the experiment inside its recording context and keep the collected
# feedback scores. `recording` is bound by the context manager but is not used
# in this cell.
with tru_benchmark_arctic as recording:
    feedback_res = tru_benchmark_arctic.app.collect_feedback_scores()

In [None]:
feedback_res  # display the feedback scores collected from our context relevance feedback function

In [None]:
# Show the leaderboard after the first experiment.
# NOTE(review): an empty app_ids list presumably means "all apps" — confirm.
tru.get_leaderboard(app_ids=[])

In [None]:
# Second experiment: same golden set and temperature, but scored with the
# confidence-returning feedback function and aggregated with `ece`
# (presumably expected calibration error — confirm against BenchmarkAggregator).
ece_agg_func = BenchmarkAggregator(true_labels=true_labels).ece
tru_benchmark_arctic_calibration = tru.BenchmarkExperiment(
    app_id="benchmark_arctic with calibration",
    ground_truth=golden_set,
    feedback_to_score_fn=context_relevance_ff_to_score_with_confidence,
    agg_funcs=[ece_agg_func],
    benchmark_params=BenchmarkParams(temperature=0),
)

In [None]:
# Run the calibration experiment inside its recording context and keep the
# collected feedback scores. `recording` is rebound here and again unused.
with tru_benchmark_arctic_calibration as recording:
    feedback_results = (
        tru_benchmark_arctic_calibration.app.collect_feedback_scores()
    )

In [None]:
# Display the scores collected by the calibration experiment.
feedback_results

In [None]:
# Show the leaderboard again now that the calibration experiment has run.
# NOTE(review): an empty app_ids list presumably means "all apps" — confirm.
tru.get_leaderboard(app_ids=[])