In [None]:
from experiment import load_dataset
import pandas as pd
            
# Load the three splits of the "climaQA" dataset through the project's loader.
# NOTE(review): the result is unpacked as (train, test, dev) — confirm this
# matches the order in which load_dataset returns the splits.
train, test, dev = load_dataset("climaQA")

In [None]:
# Evaluation subset: the first 100 rows of the test split whose label is 1.
true_test = test.loc[test['label'].eq(1)].iloc[:100]

# Candidate pool: every distinct query present in the (whole) test split.
queries = test['query'].unique()

In [None]:
import os  # required for os.environ below; it was never imported in this notebook

from tqdm import tqdm
from transformers import pipeline

# Text-classification pipeline for the fine-tuned checkpoint. The Hub token is
# read from the HUB_TOKEN environment variable (KeyError if it is not set).
# The padding / truncation / max_length arguments are given at construction time.
pipe = pipeline(
    "text-classification",
    model="tcalamai/climaQA_42_distilRoBERTa",
    token=os.environ['HUB_TOKEN'],
    padding="max_length",
    truncation=True,
    max_length=512,
    device_map="auto",
)

# For every evaluated row, pair its text with each candidate query so the
# model scores all (text, query) combinations; remember the row's true query.
inputs_lists = []
label_lists = []

for text, true_query in zip(true_test['text'], true_test['query']):
    inputs_lists.append([{'text': text, 'text_pair': q} for q in queries])
    label_lists.append(true_query)

# One pipeline call per evaluated row; each call scores len(queries) pairs.
outputs_lists = []

for input_list in tqdm(inputs_lists, desc="Running predictions"):
    outputs_lists.append(pipe(input_list))

In [None]:
# numpy is needed for np.array below; without this import the cell fails on a
# fresh kernel because the first numpy import only appears in a later cell.
import numpy as np

# For each evaluated row, rank every candidate query by the model output and
# record the 1-based position of the row's true query.
ranks = []
for output_list, query in zip(outputs_lists, label_lists):

    scoring_df = pd.DataFrame(output_list)
    scoring_df['queries'] = queries

    # Rows predicted as label 0 get their score replaced by 1 - score, so that
    # (presumably) every score expresses confidence in label 1.
    # NOTE(review): this comparison only matches if the pipeline emits the
    # integer 0 as label; if the model config uses string labels such as
    # "LABEL_0" no row is flipped — confirm against the model's id2label.
    predicted_negative = scoring_df['label'] == 0
    scoring_df.loc[predicted_negative, "score"] = 1 - scoring_df.loc[predicted_negative, "score"]

    # Highest label first, then highest score first; the fresh 0..n-1 index is
    # the zero-based rank of each candidate.
    scoring_df = scoring_df.sort_values(['label', 'score'], ascending=[False, False]).reset_index(drop=True)

    ranks.append(1 + scoring_df[scoring_df['queries'] == query].index[0])

ranks = np.array(ranks)

In [None]:
import numpy as np

# Mean reciprocal rank of the model: the average of 1/rank over `ranks`.
reciprocal_ranking = 1.0 / np.asarray(ranks)
mrr = reciprocal_ranking.mean()

mrr

In [None]:
import numpy as np

def calculate_mrr(ranks):
    """Return the Mean Reciprocal Rank (MRR) of ``ranks``.

    Only strictly positive ranks are used: entries that are zero, negative or
    NaN are dropped before averaging, so they neither contribute a reciprocal
    nor count toward the denominator.
    """
    rank_values = np.asarray(ranks)
    positive_ranks = rank_values[rank_values > 0]
    return np.mean(1 / positive_ranks)

def bootstrap_mrr(ranks, n_iterations=1000, confidence_level=0.95, seed=None):
    """Percentile-bootstrap confidence interval for the MRR of ``ranks``.

    Parameters
    ----------
    ranks : sequence or array
        Observed ranks. Each bootstrap replicate draws ``len(ranks)`` of them
        with replacement.
    n_iterations : int, default 1000
        Number of bootstrap replicates.
    confidence_level : float, default 0.95
        Central coverage of the interval; the bounds are the
        ``(1 - confidence_level) / 2`` and ``1 - (1 - confidence_level) / 2``
        quantiles of the replicate MRRs.
    seed : int or None, default None
        If given, resampling uses a private ``np.random.default_rng(seed)``
        so the result is reproducible. If None, the global NumPy random state
        is used (the previous behaviour).

    Returns
    -------
    tuple
        ``(ci_lower, ci_upper)``. Only the interval bounds are returned, not
        the individual bootstrap samples.
    """
    # Both the np.random module and a Generator expose a compatible .choice().
    sampler = np.random if seed is None else np.random.default_rng(seed)
    n = len(ranks)

    # MRR of each resample (same size as the input, drawn with replacement).
    mrr_samples = [
        calculate_mrr(sampler.choice(ranks, size=n, replace=True))
        for _ in range(n_iterations)
    ]

    # Symmetric tails around the central `confidence_level` mass.
    lower_percentile = (1 - confidence_level) / 2
    upper_percentile = 1 - lower_percentile
    ci_lower, ci_upper = np.percentile(
        mrr_samples, [lower_percentile * 100, upper_percentile * 100]
    )

    return ci_lower, ci_upper

# Bootstrap settings; the baseline cells further down reuse these two names.
n_iterations = 1000  # Number of bootstrap iterations
confidence_level = 0.95  # Confidence level for the interval

# Seed NumPy's global random state so the bootstrap intervals are the same on
# every Restart & Run All (the resampling draws from this global state).
np.random.seed(42)

# Confidence interval for the model's MRR. `ranks` is the array of model ranks
# computed earlier in the notebook; it is deliberately NOT reassigned here, so
# the interval describes the model rather than a placeholder list.
ci_lower, ci_upper = bootstrap_mrr(ranks, n_iterations, confidence_level)
print(f"{int(confidence_level * 100)}% Confidence Interval: [{ci_lower:.4f}, {ci_upper:.4f}]")


In [None]:
# Fixed-ranking (popularity) baseline: rank the queries once, by how often they
# occur among positive training rows, and use that same ranking for every
# positive test row.
query_share = train.loc[train['label'] == 1, 'query'].value_counts(normalize=True)
fixed_ranking = query_share.reset_index()

# value_counts() sorts by descending frequency, so the position in its index is
# the rank. Reading the index directly avoids depending on the column names
# produced by reset_index(), which differ between pandas 1.x and 2.x.
map_ranking_distrib = {
    query: rank for rank, query in enumerate(query_share.index, start=1)
}

# Rank of the true query for each positive test row.
# NOTE(review): a test query that never appears among positive training rows
# maps to NaN; the mean below then becomes NaN while bootstrap_mrr skips such
# rows — confirm how unseen queries should be scored.
baseline_ranks = test.loc[test['label'] == 1, 'query'].map(map_ranking_distrib)

# Report the point estimate explicitly; as a bare mid-cell expression it was
# computed but never displayed.
baseline_mrr = np.mean(1 / baseline_ranks)
print(f"Fixed-ranking baseline MRR: {baseline_mrr:.4f}")

ci_lower, ci_upper = bootstrap_mrr(baseline_ranks.values, n_iterations, confidence_level)
print(f"{int(confidence_level * 100)}% Confidence Interval: [{ci_lower:.4f}, {ci_upper:.4f}]")

In [None]:
# Random baseline: give every positive test row a rank drawn uniformly from
# 1..K, where K is the number of distinct queries among positive test rows.
positive_queries = test.loc[test['label'] == 1, 'query']
n_candidates = len(positive_queries.unique())

random_ranking = np.random.randint(
    low=1,
    high=n_candidates + 1,  # upper bound is exclusive
    size=len(positive_queries),
)

ci_lower, ci_upper = bootstrap_mrr(random_ranking, n_iterations, confidence_level)
print(f"{int(confidence_level * 100)}% Confidence Interval: [{ci_lower:.4f}, {ci_upper:.4f}]")