In [1]:
import pandas as pd

# 1) Load the ranked responses. The code below relies on an 'ID' column
#    (prompt identifier), a 'Model' column (author of the response) and one
#    score column per judge.
df = pd.read_csv(r'C:\Users\zoghla\Desktop\Yosr Capstone\LLM-Ensemble-Model-main\Full Dataset Ranking\ranked_responses_final.csv')

# 2) Map each “judge” to the column where it scored responses
judges = {
    'openai/gpt-4o':                'openai/gpt-4o',
    'anthropic/claude-3.5-sonnet':  'anthropic/claude-3.5-sonnet',
    'deepseek/deepseek-chat':       'deepseek/deepseek-chat',
    'perplexity/sonar':             'perplexity/sonar'
}

# 3) Compute, for each judge, how often it ranks its *own* response highest.
#    One groupby per judge replaces the former per-prompt Python loop and
#    matches how the later cells derive each judge's top choice.
self_win_rates = {}
for judge_model, score_col in judges.items():
    # Row label of the best-scored response for every prompt ID.
    # NOTE: idxmax returns the first row holding the maximum, so ties are
    # resolved by row order in the CSV.
    top_rows = df.groupby('ID')[score_col].idxmax()
    winners = df.loc[top_rows, 'Model']

    total = len(winners)
    self_wins = int((winners == judge_model).sum())
    # Guard the empty-dataset case instead of raising ZeroDivisionError.
    self_win_rates[judge_model] = (
        self_wins / total * 100.0 if total else float('nan')
    )

# 4) Print out the self-preference rates
print("Self-preference (self-win) rates over all prompts:")
for jm, rate in self_win_rates.items():
    print(f"  {jm:>30s}: {rate:5.2f}%")

Self-preference (self-win) rates over all prompts:
                   openai/gpt-4o: 29.87%
     anthropic/claude-3.5-sonnet: 42.31%
          deepseek/deepseek-chat: 43.13%
                perplexity/sonar: 21.63%


In [2]:
import pandas as pd
import itertools

# 1) Read the complete set of ranked responses
df = pd.read_csv(r'C:\Users\zoghla\Desktop\Yosr Capstone\LLM-Ensemble-Model-main\Full Dataset Ranking\ranked_responses_final.csv')

# 2) The four judges; each judge's scores live in a column named after it
judge_names = [
    'openai/gpt-4o',
    'anthropic/claude-3.5-sonnet',
    'deepseek/deepseek-chat',
    'perplexity/sonar',
]
judges = {name: name for name in judge_names}


def _top_model_per_prompt(frame, score_col):
    """Return a Series indexed by ID holding the Model with the highest `score_col`."""
    best_rows = frame.groupby('ID')[score_col].idxmax()
    return frame.loc[best_rows].set_index('ID')['Model']


# 3) Top choice per judge per prompt:
#    top_choices[judge] is a Series indexed by ID with the Model that judge scored highest
top_choices = {
    judge_name: _top_model_per_prompt(df, score_col)
    for judge_name, score_col in judges.items()
}

top_df = pd.DataFrame(top_choices)  # one row per unique ID, one column per judge

# 4) Unanimous agreement: every judge selected the same Model for the prompt
n_prompts = len(top_df)
n_unanimous = top_df.nunique(axis=1).eq(1).sum()
print(f"Unanimous agreement: {n_unanimous}/{n_prompts} = {n_unanimous/n_prompts*100:.2f}%\n")

# 5) Agreement rate for every unordered pair of judges
print("Pairwise agreement rates:")
for j1, j2 in itertools.combinations(judges, 2):
    agree_rate = top_df[j1].eq(top_df[j2]).mean() * 100
    print(f"  {j1:>30s} vs {j2:<30s}: {agree_rate:5.2f}%")
print()

# 6) For each model: number of prompts on which exactly k judges picked it,
#    for k = 0..4 (0 = no judge picked it, 4 = every judge picked it)
max_votes = len(judges)
dist = {}
for model in judges:
    # Per prompt, how many judges chose this model as their top response
    votes_per_prompt = top_df.eq(model).sum(axis=1)
    tally = votes_per_prompt.value_counts()
    # Fill in vote counts that never occur so every model has all rows
    dist[model] = [tally.get(n_votes, 0) for n_votes in range(max_votes + 1)]

dist_df = pd.DataFrame(dist, index=range(max_votes + 1)).rename_axis('num_judges_picked')
print("Distribution of how many judges picked each model as #1:")
print(dist_df.to_string())


Unanimous agreement: 1892/5289 = 35.77%

Pairwise agreement rates:
                   openai/gpt-4o vs anthropic/claude-3.5-sonnet   : 61.69%
                   openai/gpt-4o vs deepseek/deepseek-chat        : 61.71%
                   openai/gpt-4o vs perplexity/sonar              : 62.60%
     anthropic/claude-3.5-sonnet vs deepseek/deepseek-chat        : 62.79%
     anthropic/claude-3.5-sonnet vs perplexity/sonar              : 64.06%
          deepseek/deepseek-chat vs perplexity/sonar              : 59.50%

Distribution of how many judges picked each model as #1:
                   openai/gpt-4o  anthropic/claude-3.5-sonnet  deepseek/deepseek-chat  perplexity/sonar
num_judges_picked                                                                                      
0                           3177                         2517                    2394              3733
1                           1049                          754                     833               859
2        

In [3]:
import os
import pandas as pd
import numpy as np

def fleiss_kappa(count_matrix):
    """Compute Fleiss' kappa from an items-by-categories count matrix.

    Parameters
    ----------
    count_matrix : array-like of shape (n_items, n_categories)
        Entry (i, j) is the number of raters that assigned item i to
        category j. Every row must sum to the same number of raters.

    Returns
    -------
    float
        Fleiss' kappa. NaN when every rating falls into a single category,
        because chance agreement is then 1 and kappa is 0/0.

    Raises
    ------
    ValueError
        If the matrix is not 2-D, has no items, has rows with differing
        rating totals, or has fewer than two ratings per item.
    """
    counts = np.asarray(count_matrix, dtype=float)
    if counts.ndim != 2 or counts.shape[0] == 0:
        raise ValueError("count_matrix must be a non-empty 2-D array")

    ratings_per_item = counts.sum(axis=1)
    n_raters = ratings_per_item[0]
    # The formula assumes a constant number of ratings per item; a row that
    # sums to something else means some rating fell outside the categories.
    if not np.all(ratings_per_item == n_raters):
        raise ValueError("every item must have the same number of ratings")
    if n_raters < 2:
        raise ValueError("at least two ratings per item are required")

    n_items = counts.shape[0]
    # Share of all assignments that went to each category
    category_share = counts.sum(axis=0) / (n_items * n_raters)
    # Observed agreement per item, then averaged over items
    item_agreement = (counts * (counts - 1)).sum(axis=1) / (n_raters * (n_raters - 1))
    observed = item_agreement.mean()
    expected = np.sum(category_share ** 2)

    if np.isclose(expected, 1.0):
        return float('nan')
    return float((observed - expected) / (1 - expected))


# Path to your CSV file – update this to your actual file location
csv_path = r'C:\Users\zoghla\Desktop\Yosr Capstone\LLM-Ensemble-Model-main\Full Dataset Ranking\ranked_responses_final.csv'

# Check if file exists
if not os.path.exists(csv_path):
    print(f"File not found: {csv_path}. Please update 'csv_path' to your dataset location.")
else:
    # Load the data
    df = pd.read_csv(csv_path, sep=',')

    # Define the judges and their score columns
    judges = {
        'openai/gpt-4o':               'openai/gpt-4o',
        'anthropic/claude-3.5-sonnet': 'anthropic/claude-3.5-sonnet',
        'deepseek/deepseek-chat':      'deepseek/deepseek-chat',
        'perplexity/sonar':            'perplexity/sonar'
    }

    # Compute top choice per judge per prompt
    top_choices = {
        name: df.loc[df.groupby('ID')[col].idxmax()].set_index('ID')['Model']
        for name, col in judges.items()
    }
    top_df = pd.DataFrame(top_choices)

    # Build the count matrix n_ij: rows=prompts, cols=models.
    # Each column counts, per prompt, how many judges picked that model.
    categories = list(judges.keys())
    count_matrix = np.column_stack([
        top_df.eq(cat).sum(axis=1).to_numpy() for cat in categories
    ])

    N, k = count_matrix.shape  # N items, k categories
    m = len(judges)            # number of judges (raters per prompt)

    # fleiss_kappa raises if any prompt's picks do not add up to a constant
    # rater count, e.g. when a judge picked a model missing from `categories`.
    kappa = fleiss_kappa(count_matrix)

    print(f"Fleiss' kappa: {kappa:.4f}")

Fleiss' kappa: 0.4709


In [4]:
import pandas as pd
from sklearn.metrics import cohen_kappa_score

# 1) Read the output of our ranking system
our_df = pd.read_csv('ranked_responses_final.csv')

# 2) For every prompt keep the response with the highest 'Final Score';
#    the winning model is exposed as 'our_best_model'
best_row_labels = our_df.groupby('Prompt')['Final Score'].idxmax()
top_our = (
    our_df
    .loc[best_row_labels, ['Prompt', 'Model']]
    .rename(columns={'Model': 'our_best_model'})
)

# 3) Read Gemini's rankings
gem_df = pd.read_csv('ranked_results_with_explanations.csv')

# 3.5) Sanity check: how many Gemini rows have a prompt that also occurs in our data
total_gem = len(gem_df)
prompt_is_known = gem_df['Prompt'].isin(our_df['Prompt'])
mapped = prompt_is_known.sum()
print(f"Prompt mapping: {mapped}/{total_gem} = {mapped/total_gem*100:.1f}%\n")

Prompt mapping: 500/500 = 100.0%



In [5]:
# 4) Join our top pick with Gemini's pick on the prompt text, then measure raw agreement.
#    Rows where either side has no pick are dropped before counting.
gemini_picks = gem_df[['Prompt', 'best_model']]
comparison = (
    top_our
    .merge(gemini_picks, on='Prompt', how='inner')
    .dropna(subset=['our_best_model', 'best_model'])
)

n = len(comparison)
agree = comparison['our_best_model'].eq(comparison['best_model']).sum()
print(f"Compared on {n} prompts:")
print(f"  Agreement rate: {agree}/{n} = {agree/n*100:.1f}%\n")

Compared on 499 prompts:
  Agreement rate: 364/499 = 72.9%



In [7]:
# 5) How often Gemini selected each model.
#    value_counts() skips missing values by default, so rows without a
#    best_model are not part of these totals.
print("Gemini best_model counts:")
gemini_pick_counts = gem_df['best_model'].value_counts()
print(gemini_pick_counts, "\n")

# 6) How often our system selected each model, restricted to the prompts
#    that appear in Gemini's sample
sampled_prompts = gem_df['Prompt'].unique()
in_sample = top_our['Prompt'].isin(sampled_prompts)
our_sampled = top_our.loc[in_sample]
print(f"Our system best_model counts (on {len(our_sampled)} sampled prompts):")
our_pick_counts = our_sampled['our_best_model'].value_counts()
print(our_pick_counts, "\n")

Gemini best_model counts:
best_model
deepseek/deepseek-chat         200
anthropic/claude-3.5-sonnet    177
openai/gpt-4o                   71
perplexity/sonar                51
Name: count, dtype: int64 

Our system best_model counts (on 500 sampled prompts):
our_best_model
deepseek/deepseek-chat         182
anthropic/claude-3.5-sonnet    175
openai/gpt-4o                   94
perplexity/sonar                49
Name: count, dtype: int64 



In [8]:
# 7) Chance-corrected agreement (Cohen's κ) between our system and Gemini,
#    computed over the rows of `comparison`
our_labels = comparison['our_best_model']
gemini_labels = comparison['best_model']
kappa = cohen_kappa_score(our_labels, gemini_labels)
print(f"Cohen’s κ: {kappa:.4f}")

Cohen’s κ: 0.6098
