In [None]:
# pip install -q scikit-learn litellm

In [None]:
# Import the benchmark helpers (ECE computation, passage scoring) and the MS MARCO context relevance test-case generator
from benchmark_frameworks.eval_as_recommendation import compute_ece
from benchmark_frameworks.eval_as_recommendation import score_passages
from test_cases import generate_ms_marco_context_relevance_benchmark

from trulens_eval import Tru

# Reset the TruLens database before the benchmark is run.
Tru().reset_database()

# Gather the context relevance examples of MS MARCO shards 1 through 5
# into one flat list, in shard order.
benchmark_data = [
    example
    for shard_index in range(1, 6)
    for example in generate_ms_marco_context_relevance_benchmark(
        f"./datasets/ms_marco/ms_marco_train_v2.1_{shard_index}.json"
    )
]

In [None]:
import os

# Provider credentials read from the environment by the feedback providers.
# The "..." values are placeholders: replace them with real keys before running.
os.environ.update(
    {
        "OPENAI_API_KEY": "...",
        "HUGGINGFACE_API_KEY": "...",
        "ANTHROPIC_API_KEY": "...",
        "TOGETHERAI_API_KEY": "...",
    }
)

In [None]:
import pandas as pd

# Tabulate the benchmark examples and report how many query groups they form.
df = pd.DataFrame(benchmark_data)

rows_per_query = df.groupby("query_id").count()
print(len(rows_per_query))

In [None]:
# Preview the leading rows (at most five) of every query's group.
query_groups = df.groupby("query_id")
query_groups.head(n=5)

#### Define feedback functions for context relevance to be evaluated

### Visualization

### Temperature Scaling 

In [None]:
from trulens_eval.feedback import LiteLLM
from trulens_eval.feedback import OpenAI

# Temperature values swept by the benchmark loop further down; each one is
# passed to `score_passages` as `temperature`.
temperatures = [0, 0.3, 0.7, 1]
# GPT-3.5 Turbo feedback provider
turbo = OpenAI(model_engine="gpt-3.5-turbo")


def wrapped_relevance_turbo_t(input, output, temperature):
    """Score context relevance with the GPT-3.5-Turbo provider (`turbo`).

    Thin adapter around `turbo.context_relevance_confidence_verb_2s_top1`;
    the three arguments are forwarded positionally and unchanged.

    Args:
        input: First argument of the provider call (presumably the query —
            TODO confirm against the provider method).
        output: Second argument of the provider call (presumably the passage
            being judged — TODO confirm).
        temperature: Third argument of the provider call.

    Returns:
        Whatever the provider method returns.
    """
    feedback = turbo.context_relevance_confidence_verb_2s_top1
    return feedback(input, output, temperature)


# GPT-4 Turbo feedback provider
gpt4 = OpenAI(model_engine="gpt-4-1106-preview")


def wrapped_relevance_gpt4_t(input, output, temperature):
    """Score context relevance with the GPT-4 Turbo provider (`gpt4`).

    Thin adapter around `gpt4.context_relevance_confidence_verb_2s_top1`;
    the three arguments are forwarded positionally and unchanged.

    Args:
        input: First argument of the provider call (presumably the query —
            TODO confirm against the provider method).
        output: Second argument of the provider call (presumably the passage
            being judged — TODO confirm).
        temperature: Third argument of the provider call.

    Returns:
        Whatever the provider method returns.
    """
    feedback = gpt4.context_relevance_confidence_verb_2s_top1
    return feedback(input, output, temperature)


# Claude Instant 1 feedback provider, accessed through LiteLLM
claude_1 = LiteLLM(model_engine="claude-instant-1")


def wrapped_relevance_claude1_t(input, output, temperature):
    """Score context relevance with the Claude Instant 1 provider (`claude_1`).

    Thin adapter around `claude_1.context_relevance_confidence_verb_2s_top1`;
    the three arguments are forwarded positionally and unchanged.

    Args:
        input: First argument of the provider call (presumably the query —
            TODO confirm against the provider method).
        output: Second argument of the provider call (presumably the passage
            being judged — TODO confirm).
        temperature: Third argument of the provider call.

    Returns:
        Whatever the provider method returns.
    """
    # The result must be returned: without it the wrapper always yields None,
    # unlike the GPT wrappers, and no score reaches the caller.
    return claude_1.context_relevance_confidence_verb_2s_top1(
        input, output, temperature
    )


# Claude 2 feedback provider, accessed through LiteLLM
claude_2 = LiteLLM(model_engine="claude-2")


def wrapped_relevance_claude2_t(input, output, temperature):
    """Score context relevance with the Claude 2 provider (`claude_2`).

    Thin adapter around `claude_2.context_relevance_confidence_verb_2s_top1`;
    the three arguments are forwarded positionally and unchanged.

    Args:
        input: First argument of the provider call (presumably the query —
            TODO confirm against the provider method).
        output: Second argument of the provider call (presumably the passage
            being judged — TODO confirm).
        temperature: Third argument of the provider call.

    Returns:
        Whatever the provider method returns.
    """
    # The result must be returned: without it the wrapper always yields None,
    # unlike the GPT wrappers, and no score reaches the caller.
    return claude_2.context_relevance_confidence_verb_2s_top1(
        input, output, temperature
    )


# Feedback functions to benchmark, keyed by the model name that is recorded
# in the result rows. The Claude entries are currently disabled.
feedback_functions = {
    "GPT-3.5-Turbo": wrapped_relevance_turbo_t,
    "GPT-4-Turbo": wrapped_relevance_gpt4_t,
    # 'Claude-1': wrapped_relevance_claude1_t,
    # 'Claude-2': wrapped_relevance_claude2_t,
}

# Per-model backoff value handed to `score_passages` as its fourth positional
# argument (presumably a delay between API calls, in seconds — TODO confirm
# against `score_passages`). Keys must match those of `feedback_functions`.
backoffs_by_functions = {
    "GPT-3.5-Turbo": 0,
    "GPT-4-Turbo": 0.5,
    # 'Claude-1': 1.5,
    # 'Claude-2': 1.5,
}

In [None]:
# ECE table of every temperature, keyed by temperature. `results_df` is
# rebuilt on each pass of the loop, so on its own it only reflects the last
# temperature once the loop has finished.
results_by_temperature = {}

for temp in temperatures:
    # Running the benchmark: (model name, ECE) rows for the current temperature
    results = []
    result_columns = [f"Model-t-{temp}", "ECE"]

    for name, func in feedback_functions.items():
        try:
            scores, true_relevance = score_passages(
                df,
                name,
                func,
                # models without a configured backoff fall back to 0.5
                backoffs_by_functions.get(name, 0.5),
                n=1,
                temperature=temp,
            )
            ece_value = compute_ece(scores, true_relevance)

            results.append((name, ece_value))
            print(f"Finished running feedback function name {name}")

            # Checkpoint after every model so that a later failure does not
            # lose the scores already computed for this temperature.
            print("Saving results...")
            tmp_results_df = pd.DataFrame(results, columns=result_columns)
            tmp_results_df.to_csv(f"results_verbalized_ece_t_{temp}.csv")
            print(tmp_results_df)
        except Exception as e:
            # Best effort: report the failure and carry on with the next model.
            print(
                f"Failed to run benchmark for feedback function name {name} due to {e}"
            )

    # Convert results to DataFrame for display
    results_df = pd.DataFrame(results, columns=result_columns)
    results_by_temperature[temp] = results_df

In [None]:
# NOTE(review): `results_df` is rebuilt for every temperature in the loop above,
# so this file only contains the rows of the last temperature processed.
results_df.to_csv("results_verbalized_ece_temp_scaling.csv")

In [None]:
# Load saved temperature-scaling results for GPT-3.5 and GPT-4.
# NOTE(review): no cell visible in this notebook writes these two files —
# presumably they come from an earlier run; verify they exist before running.
gpt35_results_path = "results_temp_scaling_gpt-3.5.csv"
results_df_1 = pd.read_csv(gpt35_results_path)

gpt4_results_path = "results_temp_scaling_gpt-4.csv"
results_df_2 = pd.read_csv(gpt4_results_path)

In [None]:
# Display the table loaded from "results_temp_scaling_gpt-3.5.csv".
results_df_1

In [None]:
# Display the table loaded from "results_temp_scaling_gpt-4.csv".
results_df_2

In [None]:
import matplotlib.pyplot as plt

# Plot the benchmark metrics stored in `results_df`.
#
# The benchmark loop in this notebook only records ECE and names its model
# column "Model-t-<temperature>", and `K` is not defined anywhere in it. The
# ranking panel (nDCG / Recall@K / Precision@1) is therefore drawn only when
# those columns and `K` are actually available; the ECE panel is always drawn.

# Column used to label the bars: a plain "Model" column wins, otherwise the
# first column whose name starts with "Model" (e.g. "Model-t-1").
model_columns = [
    col for col in results_df.columns if str(col).startswith("Model")
]
if not model_columns:
    raise KeyError(
        "results_df needs a 'Model' or 'Model-t-<temperature>' column to label the bars"
    )
model_col = "Model" if "Model" in model_columns else model_columns[0]

# `K` is optional: it may have been defined in another cell or not at all.
try:
    recall_col = f"Recall@{K}"  # noqa: F821
except NameError:
    recall_col = None

ranking_cols = [
    col
    for col in ("nDCG", recall_col, "Precision@1")
    if col is not None and col in results_df.columns
]

# Two stacked panels when ranking metrics exist (12x10, as before), else one.
n_panels = 2 if ranking_cols else 1
fig, axes = plt.subplots(
    n_panels, 1, figsize=(12, 5 * n_panels), squeeze=False
)

# Graph for nDCG, Recall@K, and Precision@1
ax1 = None
if ranking_cols:
    ax1 = results_df.plot(
        x=model_col,
        y=ranking_cols,
        kind="bar",
        rot=45,
        ax=axes[0, 0],
    )
    ax1.set_title("Feedback Function Performance (Higher is Better)")
    ax1.set_ylabel("Score")
    ax1.legend(loc="upper left")

# Graph for ECE
ax2 = results_df.plot(
    x=model_col,
    y=["ECE"],
    kind="bar",
    rot=45,
    color="orange",
    ax=axes[-1, 0],
)
ax2.set_title("Feedback Function Calibration (Lower is Better)")
ax2.set_ylabel("ECE")

fig.tight_layout()
plt.show()