In [None]:
# !pip install -q scikit-learn litellm

In [None]:
# Import the temperature-scaling benchmark runner and the MS MARCO context relevance test-case generator
from benchmark_frameworks.eval_as_recommendation import (
    run_benchmark_with_temp_scaling,
)
from test_cases import generate_ms_marco_context_relevance_benchmark
from trulens.core import TruSession

# Reset the TruLens session database before running the benchmark.
# NOTE(review): reset_database() presumably wipes previously logged records —
# confirm before running this against a shared database.
TruSession().reset_database()

In [None]:
import os

# Placeholder credentials: replace every value below with your own before
# running, and never commit real secrets to version control.
os.environ["OPENAI_API_KEY"] = "sk-..."
os.environ["SNOWFLAKE_ACCOUNT"] = "xxx-xxx"  # account identifier, i.e. the xxx-xxx in xxx-xxx.snowflakecomputing.com
os.environ["SNOWFLAKE_USER"] = "xxx"
os.environ["SNOWFLAKE_USER_PASSWORD"] = "xxx"
# NOTE(review): the three variables below are set here but are not included in
# the connection parameters built in the next cell — confirm whether the
# session is expected to pick them up some other way.
os.environ["SNOWFLAKE_DATABASE"] = "xxx"
os.environ["SNOWFLAKE_SCHEMA"] = "xxx"
os.environ["SNOWFLAKE_WAREHOUSE"] = "xxx"

In [None]:
from snowflake.snowpark import Session
from trulens.core.utils.keys import check_keys

# Validate that the Snowflake credentials are available before connecting.
# NOTE(review): check_keys presumably raises when a key is unset — confirm.
check_keys("SNOWFLAKE_ACCOUNT", "SNOWFLAKE_USER", "SNOWFLAKE_USER_PASSWORD")

# Snowpark connection options, read from the environment variables set above.
connection_params = dict(
    account=os.environ["SNOWFLAKE_ACCOUNT"],
    user=os.environ["SNOWFLAKE_USER"],
    password=os.environ["SNOWFLAKE_USER_PASSWORD"],
)


# Open the Snowpark session that the Cortex provider is constructed with below.
session_builder = Session.builder.configs(connection_params)
snowpark_session = session_builder.create()

### Set up initial model providers as evaluators for meta evaluation

We will start with GPT-4o as the benchmark evaluator and add Mistral Large (served through Snowflake Cortex) for comparison.

In [None]:
from trulens.providers.cortex import Cortex
from trulens.providers.openai import OpenAI

# Evaluator backed by OpenAI's "gpt-4o" model engine.
gpt4o = OpenAI(model_engine="gpt-4o")
# Evaluator backed by the "mistral-large" model engine, reached through the
# Snowpark session created above.
mistral = Cortex(snowpark_session, model_engine="mistral-large")

In [None]:
# Smoke-test the GPT-4o provider on a single question/context pair; the cell
# displays whatever the call returns.
gpt4o.context_relevance_with_cot_reasons(
    "who is the guy calling?", "some guy calling saying his name is Danny"
)

In [None]:
# Smoke-test the verbalized-confidence variant with GPT-4o: the call result is
# unpacked into a relevance score and a confidence value.
score, confidence = gpt4o.context_relevance_verb_confidence(
    "who is steve jobs", "apple founder is steve jobs"
)
print(f"score: {score}, confidence: {confidence}")

In [None]:
# Same smoke test against the Mistral provider, this time passing an explicit
# sampling temperature.
score, confidence = mistral.context_relevance_verb_confidence(
    "who is the guy calling?",
    "some guy calling saying his name is Danny",
    temperature=0.5,
)
print(f"score: {score}, confidence: {confidence}")

In [None]:
# The MS MARCO sample is split across five JSON files numbered 1 through 5.
dataset_paths = [
    f"./datasets/ms_marco/ms_marco_train_v2.1_{file_index}.json"
    for file_index in range(1, 6)
]

# Flatten the examples produced for every file into one list, in file order.
benchmark_data = [
    example
    for path in dataset_paths
    for example in generate_ms_marco_context_relevance_benchmark(path)
]

In [None]:
import pandas as pd

# Materialize the collected benchmark examples as a DataFrame; this `df` is
# what the temperature-scaling cell below passes to the benchmark runner.
df = pd.DataFrame(benchmark_data)

# Print the number of non-null entries in each column.
print(df.count())

In [None]:
# Preview the first rows of the benchmark DataFrame.
df.head()

### Temperature Scaling 

In [None]:
from trulens.providers.openai import OpenAI

# Sampling temperatures at which each evaluator is benchmarked.
temperatures = [0, 0.3, 0.7, 1]


def wrapped_relevance_gpt4o(input, output, temperature):
    """Score context relevance with the GPT-4o provider.

    Maps this wrapper's ``input`` / ``output`` arguments onto the provider's
    ``question`` / ``context`` keywords and forwards ``temperature`` as-is.

    Returns:
        Whatever ``gpt4o.context_relevance_verb_confidence`` returns; earlier
        cells unpack that result as ``(score, confidence)``.
    """
    provider_kwargs = {
        "question": input,
        "context": output,
        "temperature": temperature,
    }
    return gpt4o.context_relevance_verb_confidence(**provider_kwargs)


def wrapped_relevance_mistral(input, output, temperature):
    """Score context relevance with the Mistral (Cortex) provider.

    Maps this wrapper's ``input`` / ``output`` arguments onto the provider's
    ``question`` / ``context`` keywords and forwards ``temperature`` as-is.

    Returns:
        Whatever ``mistral.context_relevance_verb_confidence`` returns; earlier
        cells unpack that result as ``(score, confidence)``.
    """
    provider_kwargs = {
        "question": input,
        "context": output,
        "temperature": temperature,
    }
    return mistral.context_relevance_verb_confidence(**provider_kwargs)


# Evaluators to benchmark, keyed by the model name used to label them.
feedback_functions = dict(
    [
        ("gpt-4o", wrapped_relevance_gpt4o),
        ("mistral-large", wrapped_relevance_mistral),
    ]
)

# One backoff value per evaluator, all zero.
# NOTE(review): presumably a retry/rate-limit delay consumed by
# run_benchmark_with_temp_scaling; the unit is not visible here — confirm.
backoffs_by_functions = dict.fromkeys(feedback_functions, 0)

In [None]:
import concurrent.futures

# Run the benchmark for every temperature concurrently, one task per temperature.
k = 1  #  MS MARCO specific


def _run_benchmark_at(temp):
    """Run the temperature-scaling benchmark for a single temperature."""
    return run_benchmark_with_temp_scaling(
        df, feedback_functions, temp, k, backoffs_by_functions
    )


with concurrent.futures.ThreadPoolExecutor() as pool:
    pending = [pool.submit(_run_benchmark_at, temp) for temp in temperatures]
    # Calling result() re-raises any exception from the worker thread, so a
    # failed run surfaces here instead of being dropped silently.
    for finished in concurrent.futures.as_completed(pending):
        finished.result()

### Visualization of calibration


In [None]:
import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve


def plot_reliability_diagram(
    csv_file,
    temperature,
    ece_value,
    brier_score,
    model_name="GPT-4o",
    n_bins=5,
):
    """Plot a reliability diagram for one benchmark run on the current axes.

    The CSV is read without a header row and its four columns are taken, in
    order, as ``query_id``, ``relevance_score``, ``confidence_score`` and
    ``true_label``. A prediction counts as correct when the relevance score
    thresholded at 0.5 equals the true label; the plotted curve compares the
    reported confidence with how often predictions were actually correct.

    The function draws with the pyplot state machine and neither creates a new
    figure nor calls ``plt.show()``, so repeated calls in one cell overlay
    their curves on the same axes.

    Args:
        csv_file: Path to the per-example results CSV (no header row).
        temperature: Temperature of the run; used only in the legend and title.
        ece_value: Pre-computed expected calibration error shown on the plot.
        brier_score: Pre-computed Brier score shown on the plot.
        model_name: Model name shown in the title. Defaults to ``"GPT-4o"``,
            which reproduces the previously hard-coded title.
        n_bins: Number of bins passed to ``calibration_curve``. Defaults to 5,
            the previously hard-coded value.
    """
    data = pd.read_csv(
        csv_file,
        header=None,
        names=["query_id", "relevance_score", "confidence_score", "true_label"],
    )

    # 1 where the thresholded relevance prediction matches the label, else 0.
    predicted_label = (data["relevance_score"] >= 0.5).astype(int)
    is_correct = (predicted_label == data["true_label"]).astype(int)

    # Per confidence bin: observed accuracy and mean reported confidence.
    prob_true, prob_pred = calibration_curve(
        is_correct, data["confidence_score"], n_bins=n_bins
    )

    # Plot reliability diagram
    plt.plot(
        prob_pred,
        prob_true,
        marker="o",
        linewidth=1,
        label=f"Temperature {temperature}",
    )
    plt.plot([0, 1], [0, 1], linestyle="--", label="Perfectly calibrated")

    # Annotate the pre-computed summary metrics in the lower-right area.
    plt.text(
        0.6,
        0.2,
        f"ECE: {ece_value:.4f}",
        bbox=dict(facecolor="white", alpha=0.5),
    )
    plt.text(
        0.6,
        0.1,
        f"Brier score: {brier_score:.4f}",
        bbox=dict(facecolor="white", alpha=0.5),
    )

    # Labels and title
    plt.xlabel("Confidence bins")
    plt.ylabel("Accuracy bins")
    plt.title(f"Reliability Diagram for {model_name} with t={temperature}")
    plt.legend()

In [None]:
# Inputs for the reliability diagram of the GPT-4o run at temperature 0.
csv_file = "results/gpt-4o-t_0-benchmark_eval_results.csv"
# NOTE(review): these metrics are hard-coded — presumably copied from the
# output of an earlier benchmark run; confirm they still match the CSV above.
ece = 0.25978426229508195
brier_score = 0.23403157255616272

In [None]:
# Draw the reliability diagram for the temperature-0 run defined above.
plot_reliability_diagram(csv_file, 0, ece, brier_score)

In [None]:
import pandas as pd

# List of temperatures and corresponding CSV files
temperatures = [0, 0.3, 0.7, 1]
csv_files = [
    "consolidated_results_verbalized_ece_t_0.csv",
    "consolidated_results_verbalized_ece_t_0.3.csv",
    "consolidated_results_verbalized_ece_t_0.7.csv",
    "consolidated_results_verbalized_ece_t_1.csv",
]

# Load and combine data. The comprehension keeps its loop variables local, so
# the notebook-level `df` (benchmark examples) and `csv_file` (reliability
# diagram input) defined in earlier cells are not overwritten.
data = [
    pd.read_csv(results_path).assign(Temperature=temp)
    for temp, results_path in zip(temperatures, csv_files)
]

combined_data = pd.concat(data)

# Plotting
bar_width = 0.1
function_names = combined_data["Function Name"].unique()

# x position of the centre of each temperature's group of bars.
group_centers = [
    t + bar_width * (len(function_names) - 1) / 2 for t in temperatures
]


def plot_metric_by_temperature(ax, metric, title):
    """Draw grouped bars of ``metric`` per temperature, one bar per function.

    Args:
        ax: Matplotlib axes to draw on.
        metric: Column of ``combined_data`` to plot; also used as y-axis label.
        title: Title for the axes.
    """
    for i, function_name in enumerate(function_names):
        subset = combined_data[combined_data["Function Name"] == function_name]
        ax.bar(
            # Shift each function's bars sideways so they sit next to each other.
            [t + i * bar_width for t in temperatures],
            subset[metric],
            width=bar_width,
            label=function_name,
        )
    ax.set_title(title)
    ax.set_xlabel("Temperature")
    ax.set_ylabel(metric)
    # Label every group with its temperature (previously only the first
    # subplot had these ticks).
    ax.set_xticks(group_centers)
    ax.set_xticklabels(temperatures)
    ax.legend()


fig, axes = plt.subplots(3, 1, figsize=(14, 8))
plot_metric_by_temperature(
    axes[0], "Precision@1", "Precision@1 (higher the better)"
)
plot_metric_by_temperature(axes[1], "ECE", "ECE (lower the better)")
plot_metric_by_temperature(
    axes[2], "Brier Score", "Brier Score (lower the better)"
)

fig.tight_layout()
plt.show()

In [None]:
# Benchmarked temperatures; each has one consolidated metrics CSV whose file
# name ends with the temperature value.
temperatures = [0, 0.3, 0.7, 1]
csv_files = [
    f"consolidated_results_verbalized_ece_t_{temp}.csv" for temp in temperatures
]

In [None]:
# Load and combine data: read each temperature's consolidated metrics and tag
# its rows with the temperature they belong to. The comprehension keeps its
# loop variables local, so the notebook-level `df` (benchmark examples) and
# `csv_file` defined in earlier cells are not overwritten.
data = [
    pd.read_csv(results_path).assign(Temperature=temp)
    for temp, results_path in zip(temperatures, csv_files)
]

combined_data = pd.concat(data)

In [None]:
# Average every numeric metric per (function, temperature) pair.
# numeric_only=True keeps this working on pandas >= 2.0, where averaging a
# non-numeric column raises instead of silently dropping it.
combined_data.groupby(["Function Name", "Temperature"]).mean(numeric_only=True)