In [None]:
# pip install -q scikit-learn litellm


In [None]:
# Import TruLens and the MS MARCO context relevance benchmark helpers
from trulens_eval import Tru
from test_cases import generate_ms_marco_context_relevance_benchmark
from benchmark_frameworks.eval_as_recommendation import score_passages, compute_ndcg, compute_ece, recall_at_k, precision_at_k
# Reset the TruLens database before the benchmark starts.
Tru().reset_database()

# Gather the benchmark rows from the five MS MARCO train shards
# (ms_marco_train_v2.1_1.json ... ms_marco_train_v2.1_5.json) into one flat list.
benchmark_data = [
    row
    for shard in range(1, 6)
    for row in generate_ms_marco_context_relevance_benchmark(
        f"./datasets/ms_marco/ms_marco_train_v2.1_{shard}.json"
    )
]


In [None]:
import os
# Placeholder credentials, one per provider. Replace each "..." locally and
# never commit real keys to version control.
os.environ.update(
    {
        "OPENAI_API_KEY": "...",
        "HUGGINGFACE_API_KEY": "...",
        "ANTHROPIC_API_KEY": "...",
        "TOGETHERAI_API_KEY": "...",
    }
)

In [None]:
import pandas as pd
import numpy as np
# Tabulate the benchmark rows and report how many distinct queries they cover.
df = pd.DataFrame(benchmark_data)
print(df["query_id"].nunique())

In [None]:
# Preview up to the first five rows of every query group.
df.groupby("query_id").head(n=5)

#### Define feedback functions for context relevance to be evaluated

In [None]:
from trulens_eval.feedback import OpenAI, LiteLLM



# GPT 3.5
turbo = OpenAI(model_engine="gpt-3.5-turbo")


def wrapped_relevance_turbo(input, output):
    """Forward ``(input, output)`` to the gpt-3.5-turbo provider.

    Thin adapter around ``turbo.qs_relevance_confidence_verb_2s_top1``; the
    provider's result is returned unchanged.
    """
    relevance = turbo.qs_relevance_confidence_verb_2s_top1(input, output)
    return relevance

# GPT 4 turbo
gpt4 = OpenAI(model_engine="gpt-4-1106-preview")


def wrapped_relevance_gpt4(input, output):
    """Forward ``(input, output)`` to the gpt-4-1106-preview provider.

    Thin adapter around ``gpt4.qs_relevance_confidence_verb_2s_top1``; the
    provider's result is returned unchanged.
    """
    relevance = gpt4.qs_relevance_confidence_verb_2s_top1(input, output)
    return relevance


# Anthropic
claude_1 = LiteLLM(model_engine="claude-instant-1")


def wrapped_relevance_claude1(input, output):
    """Forward ``(input, output)`` to the claude-instant-1 provider.

    Thin adapter around ``claude_1.qs_relevance_confidence_verb_2s_top1``; the
    provider's result is returned unchanged.
    """
    relevance = claude_1.qs_relevance_confidence_verb_2s_top1(input, output)
    return relevance

claude_2 = LiteLLM(model_engine="claude-2")


def wrapped_relevance_claude2(input, output):
    """Forward ``(input, output)`` to the claude-2 provider.

    Thin adapter around ``claude_2.qs_relevance_confidence_verb_2s_top1``; the
    provider's result is returned unchanged.
    """
    relevance = claude_2.qs_relevance_confidence_verb_2s_top1(input, output)
    return relevance


# Meta
# NOTE(review): the variable name says 13B, but the model string selects
# Llama-2-7B-32K-Instruct -- confirm which one is intended.
llama_2_13b = LiteLLM(model_engine="together_ai/togethercomputer/Llama-2-7B-32K-Instruct")


def wrapped_relevance_llama2(input, output):
    """Forward ``(input, output)`` to the Together AI Llama-2 provider.

    Thin adapter around ``llama_2_13b.qs_relevance_confidence_verb_2s_top1``;
    the provider's result is returned unchanged.
    """
    relevance = llama_2_13b.qs_relevance_confidence_verb_2s_top1(input, output)
    return relevance

# One row per feedback function under test: (display name, wrapper, backoff).
# The backoff is the value handed to score_passages for that model
# (presumably a delay in seconds -- confirm against score_passages).
_feedback_registry = [
    ('GPT-3.5-Turbo', wrapped_relevance_turbo, 0.5),
    ('GPT-4-Turbo', wrapped_relevance_gpt4, 0.5),
    ('Claude-1', wrapped_relevance_claude1, 1.5),
    ('Claude-2', wrapped_relevance_claude2, 1.5),
    ('Llama-2', wrapped_relevance_llama2, 2),
]

# Display name -> feedback function.
feedback_functions = {label: fn for label, fn, _ in _feedback_registry}

# Display name -> backoff.
backoffs_by_functions = {label: backoff for label, _, backoff in _feedback_registry}

In [None]:

# Running the benchmark: score the passages with every feedback function and
# record nDCG, ECE, Recall@K and Precision@1 for each one.
K = 5  # cutoff for Recall@K
result_columns = ['Model', 'nDCG', 'ECE', f'Recall@{K}', 'Precision@1']

results = []               # one (model, nDCG, ECE, Recall@K, Precision@1) tuple per finished model
intermediate_results = []  # snapshot of the results table taken after each finished model

for model_name, feedback_fn in feedback_functions.items():
    try:
        # Models without a configured backoff fall back to 0.5.
        backoff = backoffs_by_functions.get(model_name, 0.5)
        scores, true_relevance = score_passages(df, model_name, feedback_fn, backoff, n=1)

        ndcg_value = compute_ndcg(scores, true_relevance)
        ece_value = compute_ece(scores, true_relevance)
        # Precision and recall are computed per (scores, relevance) pair, then averaged.
        precision_k = np.mean([
            precision_at_k(predicted, actual, 1)
            for predicted, actual in zip(scores, true_relevance)
        ])
        recall_k = np.mean([
            recall_at_k(predicted, actual, K)
            for predicted, actual in zip(scores, true_relevance)
        ])

        results.append((model_name, ndcg_value, ece_value, recall_k, precision_k))
        print(f"Finished running feedback function name {model_name}")

        print("Saving results...")
        snapshot_df = pd.DataFrame(results, columns=result_columns)
        print(snapshot_df)
        intermediate_results.append(snapshot_df)
    except Exception as err:
        # Best-effort: a failure for one model is reported and the loop moves on.
        print(f"Failed to run benchmark for feedback function name {model_name} due to {err}")

# Convert results to DataFrame for display
results_df = pd.DataFrame(results, columns=result_columns)


In [None]:
# Display the per-model metrics table (Model, nDCG, ECE, Recall@K, Precision@1).
results_df

### Visualization

In [None]:
# Persist the metrics table to the working directory. The file holds every
# model present in results_df, not only the Claude rows its name suggests.
results_csv_path = "results_claude.csv"
results_df.to_csv(results_csv_path)

In [None]:

import matplotlib.pyplot as plt


# Requires results_df (with the Model, nDCG, ECE, Recall@K and Precision@1
# columns) and K to be defined already.
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(12, 10))

# Top panel: nDCG, Recall@K and Precision@1 side by side for each model.
results_df.plot(
    x='Model',
    y=['nDCG', f'Recall@{K}', 'Precision@1'],
    kind='bar',
    ax=ax1,
    rot=45,
)
ax1.set_title('Feedback Function Performance (Higher is Better)')
ax1.set_ylabel('Score')
ax1.legend(loc='upper left')

# Bottom panel: ECE for each model.
results_df.plot(
    x='Model',
    y=['ECE'],
    kind='bar',
    ax=ax2,
    color='orange',
    rot=45,
)
ax2.set_title('Feedback Function Calibration (Lower is Better)')
ax2.set_ylabel('ECE')

fig.tight_layout()
plt.show()


