# In [2]:  (Jupyter cell marker left over from the notebook export)
# -*- coding: utf-8 -*-
import os
import json
import numpy as np
import pandas as pd
# from scipy.stats import kendalltau # Replaced by pandas.corr
from sklearn.preprocessing import StandardScaler # Only if needed by ranking/helpers, not directly for kendall
from collections import defaultdict
import plotly.express as px
import plotly.graph_objects as go
# import plotly.figure_factory as ff # For dendrograms if kept, not primary for heatmaps
# from plotly.subplots import make_subplots # Not used in current version
import plotly.io as pio
import re

# Use the "plotly_white" template as the default for every figure created below.
pio.templates.default = "plotly_white"

# --- 1. Global Configurations ---

# Central model registry. Each entry ties together the three names one model goes by:
#   short_name   - human-readable label written into the ranking tables;
#   holmes_name  - identifier matched against the model columns of the Holmes results;
#   harness_name - directory name used to build the Harness result paths (see HARNESS_PATHS).
MODEL_CONFIG = [
    # Core Models
    {"short_name": "Qwen2.5 Base", "holmes_name": "Qwen__Qwen2.5-7B", "harness_name": "Qwen2.5-7B"},
    {"short_name": "Qwen2.5 Instruct", "holmes_name": "Qwen__Qwen2.5-7B-Instruct", "harness_name": "Qwen2.5-7B-Instruct"},
    {"short_name": "Qwen2.5 Coder", "holmes_name": "Qwen__Qwen2.5-Coder-7B", "harness_name": "Qwen2.5-Coder-7B"},
    {"short_name": "Qwen2.5 Math", "holmes_name": "Qwen__Qwen2.5-Math-7B", "harness_name": "Qwen2.5-Math-7B"},
    # Coder Merged Models (Instruct merged with Coder, one entry per merge method)
    {"short_name": "Linear (Coder)", "holmes_name": "Yuuta208__Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-linear-29", "harness_name": "Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-linear-29"},
    {"short_name": "Task Arithmetic (Coder)", "holmes_name": "Yuuta208__Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-task_arithmetic-29", "harness_name": "Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-task_arithmetic-29"},
    {"short_name": "DARE Ties (Coder)", "holmes_name": "Yuuta208__Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-dare_ties-29", "harness_name": "Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-dare_ties-29"},
    {"short_name": "Ties (Coder)", "holmes_name": "Yuuta208__Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-ties-29", "harness_name": "Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-ties-29"},
    {"short_name": "Slerp (Coder)", "holmes_name": "Yuuta208__Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-slerp-29", "harness_name": "Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-slerp-29"},
    # Math Merged Models (Instruct merged with Math, one entry per merge method)
    {"short_name": "Linear (Math)", "holmes_name": "Yuuta208__Qwen2.5-7B-Instruct-Qwen2.5-Math-7B-Merged-linear-24", "harness_name": "Qwen2.5-7B-Instruct-Qwen2.5-Math-7B-Merged-linear-24"},
    {"short_name": "Slerp (Math)", "holmes_name": "Yuuta208__Qwen2.5-7B-Instruct-Qwen2.5-Math-7B-Merged-slerp-24", "harness_name": "Qwen2.5-7B-Instruct-Qwen2.5-Math-7B-Merged-slerp-24"},
    {"short_name": "Task Arithmetic (Math)", "holmes_name": "Yuuta208__Qwen2.5-7B-Instruct-Qwen2.5-Math-7B-Merged-task_arithmetic-26", "harness_name": "Qwen2.5-7B-Instruct-Qwen2.5-Math-7B-Merged-task_arithmetic-26"},
    {"short_name": "Ties (Math)", "holmes_name": "Yuuta208__Qwen2.5-7B-Instruct-Qwen2.5-Math-7B-Merged-ties-25", "harness_name": "Qwen2.5-7B-Instruct-Qwen2.5-Math-7B-Merged-ties-25"},
    {"short_name": "DARE Ties (Math)", "holmes_name": "Yuuta208__Qwen2.5-7B-Instruct-Qwen2.5-Math-7B-Merged-dare_ties-27", "harness_name": "Qwen2.5-7B-Instruct-Qwen2.5-Math-7B-Merged-dare_ties-27"},
    {"short_name": "Della (Math)", "holmes_name": "Yuuta208__Qwen2.5-7B-Instruct-Qwen2.5-Math-7B-Merged-della-27", "harness_name": "Qwen2.5-7B-Instruct-Qwen2.5-Math-7B-Merged-della-27"},
]

# --- Derive the per-benchmark model lists and the display-name lookup from MODEL_CONFIG ---
# Entries that lack a benchmark-specific name are simply left out of that benchmark's list.
holmes_model_list = [entry['holmes_name'] for entry in MODEL_CONFIG if 'holmes_name' in entry]
harness_model_list = [entry['harness_name'] for entry in MODEL_CONFIG if 'harness_name' in entry]

# Maps every benchmark-specific identifier (Holmes or Harness) to the shared short label.
MASTER_SHORT_NAMES = {
    entry[name_key]: entry['short_name']
    for entry in MODEL_CONFIG
    for name_key in ('holmes_name', 'harness_name')
    if name_key in entry
}

# Alphabetically sorted, de-duplicated short labels: the universe of models for rank comparison.
COMPARABLE_MODEL_SHORT_NAMES = sorted({entry['short_name'] for entry in MODEL_CONFIG})


# Font sizes shared by the figures; the dict is unpacked as keyword arguments
# into ``fig.update_layout(**font_config)`` in the main section.
font_config = {
    "title_font_size": 35,
    "font_size": 20,
    "xaxis_title_font_size": 32,
    "yaxis_title_font_size": 32,
    "xaxis_tickfont_size": 26,
    "yaxis_tickfont_size": 26,
    "legend_title_font_size": 18,
    "legend_font_size": 16,
}

# NOTE(review): default_plot_height is not referenced in the visible part of the file;
# presumably a default figure height in pixels used further down - confirm.
default_plot_height = 800
# Colour scale passed to px.imshow for the correlation heatmaps.
heatmap_color_scale = 'RdBu_r'
# Directory that receives the ranking and correlation CSV files (created at start-up).
OUTPUT_DIR = "rankings_output"

# --- Helper function for cleaning model names for plots ---
def clean_plot_name(name):
    """Strip a trailing ``_<digits>`` suffix from names that contain ``"Merged_"``.

    Non-string values and strings without ``"Merged_"`` are returned unchanged.
    """
    if not isinstance(name, str):
        return name
    if "Merged_" not in name:
        return name
    return re.sub(r'_\d+$', '', name)

def clean_axis_label(label):
    """Turn an internal task key into a readable axis label.

    The ``HolmesM_`` / ``HarnessM_`` / ``HarnessS_`` key prefixes are replaced by
    ``H-Main: `` / ``Ha-Main: `` / ``Ha-Sub: ``. For Harness sub-task keys the
    first underscore after the prefix (the group/sub-task separator) becomes `` - ``.
    """
    prefix_labels = (
        ("HolmesM_", "H-Main: "),
        ("HarnessM_", "Ha-Main: "),
        ("HarnessS_", "Ha-Sub: "),
    )
    for key_prefix, display_prefix in prefix_labels:
        label = label.replace(key_prefix, display_prefix)
    return re.sub(r'Ha-Sub: ([^_]+)_', r'Ha-Sub: \1 - ', label)


# --- 2. Holmes Data Section ---
# CSV with the raw Holmes probing scores (read first by load_data_holmes).
HOLMES_ABS_DATA_FILE = "results_flash-holmes.csv"
# CSV with the grouping metadata that is merged onto the scores per subtask.
HOLMES_GROUP_DATA_FILE = "transformed_results_2.csv"
# Column names as they appear in the group file; load_data_holmes renames them
# to its internal 'holmes_*' column names.
HOLMES_MAIN_TASK_COL_FROM_GROUP_FILE = "probing dataset"
HOLMES_SUBTASK_COL_FROM_GROUP_FILE = "probe"
HOLMES_LINGUISTIC_COMPETENCY_COL = "linguistic competencies"
HOLMES_SUBTASK_PHENOMENA_COL = "linguistic phenomena" # Retained for potential future use

def process_frame_holmes(frame):
    """Normalise a freshly loaded Holmes group frame in place and return it.

    Drops the ``"Unnamed: 0"`` column if present, and copies the
    ``"linguistic subfield"`` column to the linguistic-competency column
    when only the former exists.
    """
    if "Unnamed: 0" in frame.columns:
        del frame["Unnamed: 0"]

    if "linguistic subfield" not in frame.columns:
        return frame

    if HOLMES_LINGUISTIC_COMPETENCY_COL not in frame.columns:
        frame[HOLMES_LINGUISTIC_COMPETENCY_COL] = frame["linguistic subfield"]
    return frame

def load_data_holmes(abs_filepath, group_filepath, model_list_in_file,
                     main_task_col_in_group_file, sub_task_col_in_group_file,
                     linguistic_competency_col_in_group_file, phenomena_col_in_group_file): # phenomena_col retained
    """Load Holmes scores, attach grouping metadata and build a main-task summary.

    Args:
        abs_filepath: CSV with the raw scores; 'probing_dataset' and 'model_name'
            are renamed to 'holmes_subtask_id' and 'model', and a 'score' column
            is required.
        group_filepath: CSV with the grouping metadata per subtask.
        model_list_in_file: Model identifiers expected as score columns; models
            missing from the score file get an all-NaN column.
        main_task_col_in_group_file: Group-file column renamed to
            'holmes_main_task_category'.
        sub_task_col_in_group_file: Group-file column renamed to
            'holmes_subtask_id' (the merge key).
        linguistic_competency_col_in_group_file: Group-file column renamed to
            'holmes_linguistic_competency'.
        phenomena_col_in_group_file: Group-file column renamed to
            'holmes_linguistic_phenomena' unless it is already one of the
            columns above.

    Returns:
        ``(summary_df, subtasks_df)``. ``summary_df`` holds the mean score per
        main-task category (rows) and model (columns); ``subtasks_df`` holds one
        row per subtask with its group columns and model scores. Scores are
        multiplied by 100. If the score file is missing, two empty frames are
        returned; if the group file is missing or has no subtask-id column, the
        summary is an empty frame with the model columns and the second frame
        contains the scaled per-subtask scores without group columns.
    """
    try:
        raw_abs_df = pd.read_csv(abs_filepath)
    except FileNotFoundError:
        print(f"Error: Holmes absolute scores file not found at {abs_filepath}")
        return pd.DataFrame(), pd.DataFrame()

    if "Unnamed: 0" in raw_abs_df.columns: del raw_abs_df["Unnamed: 0"]
    raw_abs_df.rename(columns={'probing_dataset': 'holmes_subtask_id', 'model_name': 'model'}, inplace=True)

    # When an 'encoding' column exists, only rows with encoding == 'full' are kept.
    if 'encoding' in raw_abs_df.columns:
        raw_abs_df = raw_abs_df[raw_abs_df['encoding'] == 'full'].copy()
    
    # Non-numeric scores become NaN; repeated (subtask, model) rows are averaged
    # before pivoting to one column per model.
    raw_abs_df['score'] = pd.to_numeric(raw_abs_df['score'], errors='coerce')
    abs_df_grouped = raw_abs_df.groupby(['holmes_subtask_id', 'model'])['score'].mean().reset_index()
    abs_pivot_df = abs_df_grouped.pivot_table(index='holmes_subtask_id', columns='model', values='score').reset_index()

    # Guarantee a column for every requested model, even if it has no scores.
    for model_col in model_list_in_file: 
        if model_col not in abs_pivot_df.columns:
            abs_pivot_df[model_col] = np.nan
    
    # Keep only the subtask id and the requested models, in the requested order.
    cols_to_keep_abs = ['holmes_subtask_id'] + [m for m in model_list_in_file if m in abs_pivot_df.columns]
    abs_scores_per_subtask_df = abs_pivot_df[cols_to_keep_abs].copy()

    try:
        raw_group_df = pd.read_csv(group_filepath)
    except FileNotFoundError:
        print(f"Error: Holmes group info file not found at {group_filepath}.")
        # Degraded result: scaled scores only, no group columns, empty summary.
        for model_col in model_list_in_file:
            if model_col in abs_scores_per_subtask_df.columns: abs_scores_per_subtask_df[model_col] *= 100
        return pd.DataFrame(columns=model_list_in_file), abs_scores_per_subtask_df 

    group_df_processed = process_frame_holmes(raw_group_df.copy())
    rename_map_group = {
        sub_task_col_in_group_file: 'holmes_subtask_id',
        linguistic_competency_col_in_group_file: 'holmes_linguistic_competency',
        main_task_col_in_group_file: 'holmes_main_task_category'
    }
    # Add phenomena if it exists and isn't already mapped (it might be the same as linguistic_competency_col)
    if phenomena_col_in_group_file not in rename_map_group:
        rename_map_group[phenomena_col_in_group_file] = 'holmes_linguistic_phenomena'


    group_df_processed.rename(columns=rename_map_group, inplace=True)

    id_cols_group = ['holmes_main_task_category', 'holmes_subtask_id', 'holmes_linguistic_competency']
    # Add phenomena to id_cols if it was successfully renamed
    if 'holmes_linguistic_phenomena' in group_df_processed.columns:
         id_cols_group.append('holmes_linguistic_phenomena')
    
    if 'probe type' in group_df_processed.columns: id_cols_group.append('probe type')
    
    # Only the metadata columns that actually exist in the group file are merged.
    id_cols_group_present = [col for col in id_cols_group if col in group_df_processed.columns]

    if 'holmes_subtask_id' not in group_df_processed.columns:
        print(f"Critical Error: Subtask ID column ('holmes_subtask_id' from '{sub_task_col_in_group_file}') not found in Holmes group file.")
        # Same degraded result as for a missing group file.
        for model_col in model_list_in_file:
            if model_col in abs_scores_per_subtask_df.columns: abs_scores_per_subtask_df[model_col] *= 100
        return pd.DataFrame(columns=model_list_in_file), abs_scores_per_subtask_df

    # One metadata row per subtask (first occurrence wins), left-joined so that
    # every scored subtask is kept even without metadata.
    group_info_to_merge = group_df_processed[id_cols_group_present].drop_duplicates(subset=['holmes_subtask_id'])
    subtasks_df_with_groups = pd.merge(abs_scores_per_subtask_df, group_info_to_merge, on='holmes_subtask_id', how='left')

    # Scale the model scores by 100.
    models_in_merged_df = [m for m in model_list_in_file if m in subtasks_df_with_groups.columns]
    for model_col in models_in_merged_df:
        subtasks_df_with_groups[model_col] = pd.to_numeric(subtasks_df_with_groups[model_col], errors='coerce') * 100

    # Summary: mean score per main-task category and model.
    summary_df = pd.DataFrame()
    if 'holmes_main_task_category' in subtasks_df_with_groups.columns and models_in_merged_df:
        summary_df = subtasks_df_with_groups.groupby('holmes_main_task_category')[models_in_merged_df].mean()
    else:
        summary_df = pd.DataFrame(columns=models_in_merged_df)
        print("Warning: 'holmes_main_task_category' not found or no models. Holmes summary will be empty.")
        
    if not summary_df.empty:
        # Drop categories whose label is a textual placeholder ('nan', 'none' or empty).
        summary_df.index = summary_df.index.fillna('Unknown_Holmes_MainTask').astype(str)
        summary_df = summary_df[~summary_df.index.str.lower().isin(['nan', 'none', ''])]

    if 'holmes_linguistic_competency' in subtasks_df_with_groups.columns:
        # Missing competencies are labelled 'Unknown_Ling_Comp'; rows whose label
        # is a textual placeholder ('nan', 'none' or empty) are removed.
        subtasks_df_with_groups['holmes_linguistic_competency'] = subtasks_df_with_groups['holmes_linguistic_competency'].fillna('Unknown_Ling_Comp').astype(str)
        subtasks_df_with_groups = subtasks_df_with_groups[~subtasks_df_with_groups['holmes_linguistic_competency'].str.lower().isin(['nan', 'none', ''])]
    else:
        print("Warning: 'holmes_linguistic_competency' column not found in Holmes subtask data.")
        subtasks_df_with_groups['holmes_linguistic_competency'] = 'Unknown_Ling_Comp'

    return summary_df, subtasks_df_with_groups

# --- 3. Harness Data Section ---
# Top-level Harness tasks for which a result file is looked up per model.
HARNESS_TASKS = ["gsm8k", "mmlu", "leaderboard"]
# Nested lookup: harness model name -> task name -> path of that run's result.json.
HARNESS_PATHS = {m: {t: f"organized_results/{t}/{m}/result.json" for t in HARNESS_TASKS} for m in harness_model_list}

def load_summary_harness(paths_dict, model_list_in_file, tasks_list):
    """Collect one headline score per (task, model) from Harness result files.

    Args:
        paths_dict: Mapping ``model -> task -> path`` of the JSON result files.
        model_list_in_file: Models that become the columns of the result.
        tasks_list: Tasks that become the rows of the result.

    Returns:
        DataFrame of scores multiplied by 100 (tasks x models). Missing files,
        unreadable files and missing metrics yield NaN; columns and rows that
        are entirely NaN are dropped.
    """
    metric_key_by_task = {"gsm8k": "exact_match,strict-match", "mmlu": "acc,none", "leaderboard": "acc_norm,none"}
    summary = pd.DataFrame(index=tasks_list, columns=model_list_in_file, dtype=float)

    for model_name in model_list_in_file:
        for task in tasks_list:
            result_path = paths_dict.get(model_name, {}).get(task)

            if result_path and os.path.isfile(result_path):
                try:
                    with open(result_path, 'r') as handle:
                        payload = json.load(handle)

                    # Prefer the per-task entry under 'results'; fall back to 'groups'.
                    if 'results' in payload and task in payload['results']:
                        task_metrics = payload['results'][task]
                    elif 'groups' in payload and task in payload['groups']:
                        task_metrics = payload['groups'][task]
                    else:
                        task_metrics = {}

                    metric_key = metric_key_by_task.get(task)
                    raw_value = task_metrics.get(metric_key, np.nan) if metric_key else np.nan
                    if raw_value is not None and not pd.isna(raw_value):
                        summary.at[task, model_name] = raw_value * 100
                    else:
                        summary.at[task, model_name] = np.nan
                except Exception as e:
                    # Best effort: a broken file only blanks its own cell.
                    print(f"Error loading Harness file {result_path}: {e}")
                    summary.at[task, model_name] = np.nan
            else:
                summary.at[task, model_name] = np.nan
                if result_path:
                    print(f"Warning: Harness file not found {result_path}")

    without_empty_models = summary.dropna(how='all', axis=1)
    return without_empty_models.dropna(how='all', axis=0)


def load_leaderboard_with_groups_harness(paths_dict, model_list_in_file):
    """Load per-subtask leaderboard scores and tag each subtask with its group.

    Args:
        paths_dict: Mapping ``model -> task -> path``; only the ``'leaderboard'``
            path of each model is used.
        model_list_in_file: Models that become the score columns.

    Returns:
        DataFrame with columns ``'subtask_cleaned'``, ``'harness_group'`` and one
        column per model (scores multiplied by 100). Subtasks without a known
        group get ``'Unknown_Harness_Group'``. An empty frame with the same
        columns is returned when no file or no score is available.
    """
    def _empty_result():
        return pd.DataFrame(columns=['subtask_cleaned', 'harness_group'] + model_list_in_file)

    # Keep only the models that have a non-empty leaderboard path configured.
    leaderboard_paths = {}
    for model_name in model_list_in_file:
        configured_path = paths_dict.get(model_name, {}).get('leaderboard')
        if configured_path:
            leaderboard_paths[model_name] = configured_path

    if not any(os.path.isfile(path) for path in leaderboard_paths.values()):
        return _empty_result()

    scores_by_subtask = defaultdict(dict)
    subtask_to_group = {}
    group_mapping_loaded = False

    for model_name, path in leaderboard_paths.items():
        if not os.path.isfile(path):
            continue
        try:
            with open(path, 'r') as handle:
                payload = json.load(handle)

            # The subtask -> group mapping is taken from the first file that provides one.
            if 'group_subtasks' in payload and not subtask_to_group and not group_mapping_loaded:
                for raw_group, raw_members in payload['group_subtasks'].items():
                    if isinstance(raw_group, str):
                        group_label = raw_group.replace('leaderboard_', '')
                    else:
                        group_label = str(raw_group)
                    for raw_member in raw_members:
                        if isinstance(raw_member, str):
                            member_label = raw_member.replace('leaderboard_', '')
                        else:
                            member_label = str(raw_member)
                        subtask_to_group[member_label] = group_label
                group_mapping_loaded = True

            for result_key, metrics in payload.get('results', {}).items():
                is_subtask_entry = (
                    isinstance(result_key, str)
                    and result_key.startswith('leaderboard_')
                    and result_key != 'leaderboard'
                )
                if not is_subtask_entry:
                    continue

                subtask_label = result_key.replace('leaderboard_', '')
                # Metric priority: acc_norm, then acc, then exact_match.
                fallback = metrics.get('exact_match,none', np.nan)
                fallback = metrics.get('acc,none', fallback)
                score = metrics.get('acc_norm,none', fallback)
                if pd.isna(score) and 'exact_match,strict-match' in metrics:
                    score = metrics.get('exact_match,strict-match', np.nan)

                if not pd.isna(score):
                    scores_by_subtask[subtask_label][model_name] = score * 100
        except Exception as e:
            print(f"Error processing Harness file {path} for model {model_name}: {e}")

    if not scores_by_subtask:
        return _empty_result()

    table = pd.DataFrame.from_dict(scores_by_subtask, orient='index')
    # Make sure every requested model has a column, then order the columns as requested.
    for model_name in model_list_in_file:
        if model_name not in table.columns:
            table[model_name] = np.nan
    model_columns = [m for m in model_list_in_file if m in table.columns]
    table = table[model_columns].copy()
    table = table.dropna(subset=model_columns, how='all')

    if table.empty:
        return _empty_result()

    table['harness_group'] = table.index.map(
        lambda subtask: subtask_to_group.get(subtask, 'Unknown_Harness_Group')
    )
    table.index.name = 'subtask_cleaned'
    table = table.reset_index()

    return table[['subtask_cleaned', 'harness_group'] + model_columns]


# --- 4. Generic Ranking Generation ---
def generate_rankings_generic(score_data, model_names_in_data_cols, short_names_map,
                              task_id_col=None, group_col_for_index=None):
    """Rank the models from best to worst score for every task in ``score_data``.

    Two input layouts are supported:

    * ``task_id_col`` given: every row carries its task identifier in the column
      ``task_id_col`` (and optionally a group label in ``group_col_for_index``).
      Rows sharing the same (group, task) are averaged before ranking.
    * ``task_id_col`` omitted: ``score_data`` is already aggregated, with tasks
      as the index and models as the columns.

    Args:
        score_data: DataFrame holding one score column per model. It is never
            modified by this function.
        model_names_in_data_cols: Candidate model column names; names that are
            not columns of ``score_data`` are ignored.
        short_names_map: Mapping from full model name to display name; models
            without an entry keep their full name.
        task_id_col: Name of the task identifier column, or ``None`` for the
            aggregated layout.
        group_col_for_index: Optional group column used as the first index
            level. If it is given but missing from ``score_data``, a warning is
            printed and the ranking is done per task only.

    Returns:
        DataFrame with the columns ``'Rank 1'``, ``'Rank 2'``, ... containing
        display names ordered by descending score (models without a score come
        last). The index is ``(group, task)``, ``task`` or ``'Task'`` depending
        on the layout. An empty DataFrame is returned when nothing can be ranked.
    """
    models_to_rank = [m for m in model_names_in_data_cols if m in score_data.columns]
    if not models_to_rank:
        print("No models to rank in generate_rankings_generic.")
        return pd.DataFrame()

    def _rank_columns(scores):
        # Highest score first; NaN scores are placed at the end of the ranking.
        ordered = scores.sort_values(ascending=False, na_position='last').index.tolist()
        display_names = [short_names_map.get(full_name, full_name) for full_name in ordered]
        return {f'Rank {pos}': name for pos, name in enumerate(display_names, start=1)}

    rankings_list = []
    if task_id_col:
        if task_id_col not in score_data.columns:
            print(f"Error: task_id_col '{task_id_col}' not found in score_data for ranking.")
            return pd.DataFrame()

        has_group_col = bool(group_col_for_index) and group_col_for_index in score_data.columns
        if has_group_col:
            grouped = score_data.groupby([group_col_for_index, task_id_col], observed=False)
        else:
            if group_col_for_index:
                print(f"Warning: group_col_for_index '{group_col_for_index}' not found. Ranking by task_id_col '{task_id_col}' only.")
            # Group on the task column directly instead of adding a temporary
            # helper column to the caller's DataFrame.
            grouped = score_data.groupby(task_id_col, observed=False)

        for group_key, task_rows in grouped:
            if has_group_col:
                current_group_val, current_task_id_val = group_key
            else:
                current_group_val, current_task_id_val = None, group_key

            model_scores = task_rows[models_to_rank]
            if model_scores.empty:
                continue
            # Average in case several rows share the same task within a group.
            scores_series = model_scores.mean(numeric_only=True)

            row = {}
            if has_group_col and current_group_val is not None:
                row[group_col_for_index] = current_group_val
            row[task_id_col] = current_task_id_val
            row.update(_rank_columns(scores_series))
            rankings_list.append(row)
    else:
        # Aggregated layout: index = tasks, columns = models.
        if score_data.empty:
            print("Score data is empty for non-task_id_col ranking.")
            return pd.DataFrame()

        for task_name_idx, row_data in score_data.iterrows():
            scores = row_data[models_to_rank].astype(float)  # numeric dtype for sorting
            row = {'Task': task_name_idx}
            row.update(_rank_columns(scores))
            rankings_list.append(row)

    if not rankings_list:
        return pd.DataFrame()
    rankings_df = pd.DataFrame(rankings_list)

    # Build the index from whichever identifier columns were produced.
    index_cols = []
    if group_col_for_index and group_col_for_index in rankings_df.columns:
        index_cols.append(group_col_for_index)
    if task_id_col and task_id_col in rankings_df.columns:
        index_cols.append(task_id_col)
    elif 'Task' in rankings_df.columns:
        index_cols.append('Task')

    if index_cols:
        try:
            # Drop rows where all index columns are NaN before indexing.
            rankings_df.dropna(subset=index_cols, how='all', inplace=True)
            if not rankings_df.empty:
                rankings_df.set_index(index_cols, inplace=True)
            else:
                print(f"Warning: Rankings_df became empty after dropping NaNs in index columns: {index_cols}")
        except KeyError as e:
            print(f"Warning: Could not set index on {index_cols} for rankings_df. Columns: {rankings_df.columns}. Error: {e}")
    return rankings_df


# --- 5. Kendall's Tau Calculation using Pandas ---
def calculate_kendall_tau_with_pandas(rank_series1, rank_series2, all_possible_models):
    """Return Kendall's tau between two rankings of model names.

    Each ranking is an ordered iterable of model names, best first. Only models
    that appear in both rankings and in ``all_possible_models`` are compared;
    a model's rank is its zero-based position in the respective ranking.

    Args:
        rank_series1: First ranking (list, tuple, pandas Series, NumPy array, ...).
        rank_series2: Second ranking, same conventions.
        all_possible_models: Names that are eligible for the comparison.

    Returns:
        Kendall's tau, or ``np.nan`` if either ranking is ``None`` or empty,
        fewer than two models are shared, or the correlation is undefined.
    """
    if rank_series1 is None or rank_series2 is None:
        return np.nan

    # Materialise first: a bare truth test raises for pandas/NumPy containers.
    ranking1 = list(rank_series1)
    ranking2 = list(rank_series2)
    if not ranking1 or not ranking2:
        return np.nan

    # Zero-based position of every name; a repeated name keeps its last position.
    position1 = {name: pos for pos, name in enumerate(ranking1)}
    position2 = {name: pos for pos, name in enumerate(ranking2)}

    shared_models = [m for m in all_possible_models if m in position1 and m in position2]
    if len(shared_models) < 2:
        return np.nan

    rank_df = pd.DataFrame({
        'rank1': [float(position1[m]) for m in shared_models],
        'rank2': [float(position2[m]) for m in shared_models],
    })
    tau = rank_df.corr(method='kendall').iloc[0, 1]
    return np.nan if pd.isna(tau) else tau


# --- 6. Main Execution ---
if __name__ == "__main__":
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print(f"Output directory: {OUTPUT_DIR}")

    # --- Load Holmes Data ---
    print("--- Loading Holmes Data ---")
    summary_holmes_df, subtasks_holmes_df = load_data_holmes(
        HOLMES_ABS_DATA_FILE, HOLMES_GROUP_DATA_FILE, holmes_model_list,
        HOLMES_MAIN_TASK_COL_FROM_GROUP_FILE, HOLMES_SUBTASK_COL_FROM_GROUP_FILE, 
        HOLMES_LINGUISTIC_COMPETENCY_COL, HOLMES_SUBTASK_PHENOMENA_COL
    )
    print(f"Holmes Summary (Main Tasks) DF loaded: {summary_holmes_df.shape}")
    print(f"Holmes Subtasks (with linguistic competencies) DF loaded: {subtasks_holmes_df.shape}")
    # print("Holmes Subtasks DF head:\n", subtasks_holmes_df.head())


    # --- Load Harness Data ---
    print("\n--- Loading Harness Data ---")
    summary_harness_df = load_summary_harness(HARNESS_PATHS, harness_model_list, HARNESS_TASKS)
    subtasks_harness_df = load_leaderboard_with_groups_harness(HARNESS_PATHS, harness_model_list)
    print(f"Harness Summary DF loaded: {summary_harness_df.shape}")
    print(f"Harness Subtasks DF loaded: {subtasks_harness_df.shape}")
    # print("Harness Subtasks DF head:\n", subtasks_harness_df.head())


    # --- Generate Holmes Rankings ---
    print("\n--- Generating Holmes Rankings ---")
    holmes_summary_model_cols = [m for m in holmes_model_list if m in summary_holmes_df.columns]
    holmes_main_ranks_df = generate_rankings_generic(summary_holmes_df, holmes_summary_model_cols, MASTER_SHORT_NAMES)
    
    holmes_subtasks_model_cols = [m for m in holmes_model_list if m in subtasks_holmes_df.columns]
    holmes_subtask_ranks_df = generate_rankings_generic(subtasks_holmes_df, holmes_subtasks_model_cols, MASTER_SHORT_NAMES,
                                                       task_id_col='holmes_subtask_id', group_col_for_index='holmes_linguistic_competency')
    
    print("Holmes Main Task Ranks DF head:\n", holmes_main_ranks_df.head())
    # print("Holmes Subtask Ranks (by Ling. Competency) DF head:\n", holmes_subtask_ranks_df.head())

    if not holmes_main_ranks_df.empty:
        holmes_main_ranks_df.to_csv(os.path.join(OUTPUT_DIR, "holmes_main_task_rankings.csv"), index=True)
        print(f"Saved Holmes main task rankings.")
    if not holmes_subtask_ranks_df.empty:
        holmes_subtask_ranks_df.to_csv(os.path.join(OUTPUT_DIR, "holmes_subtask_rankings_by_ling_competency.csv"), index=True)
        print(f"Saved Holmes subtask rankings by linguistic competency.")


    # --- Generate Harness Rankings ---
    print("\n--- Generating Harness Rankings ---")
    harness_summary_model_cols = [m for m in harness_model_list if m in summary_harness_df.columns]
    harness_subtasks_model_cols = [m for m in harness_model_list if m in subtasks_harness_df.columns]
    
    harness_main_ranks_df = generate_rankings_generic(summary_harness_df, harness_summary_model_cols, MASTER_SHORT_NAMES)
    harness_subtask_ranks_df = generate_rankings_generic(subtasks_harness_df, harness_subtasks_model_cols, MASTER_SHORT_NAMES,
                                                         task_id_col='subtask_cleaned', group_col_for_index='harness_group')
    print("Harness Main Ranks DF head:\n", harness_main_ranks_df.head())
    # print("Harness Subtask Ranks DF head:\n", harness_subtask_ranks_df.head())

    if not harness_main_ranks_df.empty:
        harness_main_ranks_df.to_csv(os.path.join(OUTPUT_DIR, "harness_main_rankings.csv"), index=True)
        print(f"Saved Harness main rankings.")
    if not harness_subtask_ranks_df.empty:
        harness_subtask_ranks_df.to_csv(os.path.join(OUTPUT_DIR, "harness_subtask_rankings_by_group.csv"), index=True)
        print(f"Saved Harness subtask rankings by group.")

    # --- Prepare Task/Subtask Lists for "Huge" Correlation Matrix ---
    # This matrix correlates Holmes MAIN tasks vs ALL Harness tasks (main and sub)
    holmes_main_task_keys_labels_for_huge_corr = []
    if not holmes_main_ranks_df.empty:
        for idx in holmes_main_ranks_df.index.tolist(): # Index is task name for main ranks
            holmes_main_task_keys_labels_for_huge_corr.append((f"HolmesM_{idx}", clean_axis_label(f"HolmesM_{idx}")))
    
    harness_all_task_keys_labels_for_huge_corr = []
    if not harness_main_ranks_df.empty:
        for idx in harness_main_ranks_df.index.tolist(): # Index is task name
            harness_all_task_keys_labels_for_huge_corr.append((f"HarnessM_{idx}", clean_axis_label(f"HarnessM_{idx}")))
    if not harness_subtask_ranks_df.empty and isinstance(harness_subtask_ranks_df.index, pd.MultiIndex):
        for group_idx, subtask_idx in harness_subtask_ranks_df.index: 
            internal_key = f"HarnessS_{group_idx}_{subtask_idx}"
            harness_all_task_keys_labels_for_huge_corr.append((internal_key, clean_axis_label(internal_key)))
    elif not harness_subtask_ranks_df.empty: # Single index case (less likely with groups)
         for idx in harness_subtask_ranks_df.index:
            internal_key = f"HarnessS_{idx}" # Assuming idx might be 'group_subtask' if not multi-index
            harness_all_task_keys_labels_for_huge_corr.append((internal_key, clean_axis_label(internal_key)))

    
    # --- Huge Correlation Map (Holmes Main Tasks vs Harness All Tasks/Subtasks) ---
    print("\n--- Calculating Huge Correlation Matrix (Holmes Main Tasks vs Harness All Tasks/Subtasks) ---")
    if holmes_main_task_keys_labels_for_huge_corr and harness_all_task_keys_labels_for_huge_corr:
        holmes_labels_for_index = [item[1] for item in holmes_main_task_keys_labels_for_huge_corr]
        harness_labels_for_columns = [item[1] for item in harness_all_task_keys_labels_for_huge_corr]
        
        huge_corr_matrix = pd.DataFrame(index=holmes_labels_for_index, columns=harness_labels_for_columns, dtype=float)

        for holmes_key, holmes_display_label in holmes_main_task_keys_labels_for_huge_corr:
            task_idx_holmes = holmes_key.replace("HolmesM_", "")
            if task_idx_holmes not in holmes_main_ranks_df.index: continue
            holmes_rank_series = holmes_main_ranks_df.loc[task_idx_holmes].dropna().tolist()
            if not holmes_rank_series: continue

            for harness_key, harness_display_label in harness_all_task_keys_labels_for_huge_corr:
                harness_rank_series = None
                if harness_key.startswith("HarnessM_"):
                    task_idx_harness = harness_key.replace("HarnessM_", "")
                    if task_idx_harness in harness_main_ranks_df.index:
                        harness_rank_series = harness_main_ranks_df.loc[task_idx_harness].dropna().tolist()
                elif harness_key.startswith("HarnessS_"):
                    # For HarnessS_group_subtask format
                    parts = harness_key.replace("HarnessS_", "").split('_', 1)
                    if len(parts) == 2:
                        group_name, subtask_name = parts[0], parts[1]
                        target_index_tuple = (group_name, subtask_name)
                        if target_index_tuple in harness_subtask_ranks_df.index:
                            harness_rank_series = harness_subtask_ranks_df.loc[target_index_tuple].dropna().tolist()
                    # Fallback for single index if necessary (though less likely with current setup)
                    elif harness_key.replace("HarnessS_", "") in harness_subtask_ranks_df.index and not isinstance(harness_subtask_ranks_df.index, pd.MultiIndex):
                        harness_rank_series = harness_subtask_ranks_df.loc[harness_key.replace("HarnessS_", "")].dropna().tolist()

                if not harness_rank_series: continue
                
                tau = calculate_kendall_tau_with_pandas(holmes_rank_series, harness_rank_series, COMPARABLE_MODEL_SHORT_NAMES)
                huge_corr_matrix.at[holmes_display_label, harness_display_label] = tau
        
        huge_corr_matrix.dropna(how='all', axis=0, inplace=True)
        huge_corr_matrix.dropna(how='all', axis=1, inplace=True)
        if not huge_corr_matrix.empty:
            # Save the correlation matrix to a CSV file
            huge_corr_csv_path = os.path.join(OUTPUT_DIR, "correlation_holmes_main_vs_harness_all.csv")
            huge_corr_matrix.to_csv(huge_corr_csv_path, index=True)
            print(f"Saved huge correlation matrix to {huge_corr_csv_path}")

            print("Huge Correlation Matrix (Holmes Main vs Harness All) (sample):\n", huge_corr_matrix.head())
            fig_huge_corr = px.imshow(huge_corr_matrix.astype(float).sort_index(axis=0).sort_index(axis=1), text_auto=".2f", aspect="auto",
                                      color_continuous_scale=heatmap_color_scale, range_color=[-1,1],
                                      title="Kendall's Tau: Holmes Main Tasks vs. Harness Tasks/Subtasks")
            fig_huge_corr.update_xaxes(tickangle=45, automargin=True)
            fig_huge_corr.update_yaxes(automargin=True)
            fig_huge_corr.update_layout(height=max(800, 20 * len(huge_corr_matrix.index)), 
                                        width=max(1000, 20 * len(huge_corr_matrix.columns)),
                                        **font_config)
            fig_huge_corr.show()
        else: print("Huge correlation matrix (Holmes Main vs Harness All) is empty after NaN drop.")
    else: print("Not enough task names for huge correlation matrix (Holmes Main vs Harness All).")


    # --- Holmes Linguistic Competencies vs Harness Leaderboard Groups Correlation Map (Group-Aggregated Taus) ---
    # For every (Holmes linguistic competency, Harness group) pair, Kendall's tau is computed
    # between each Holmes subtask ranking and each Harness subtask ranking in the pair; the
    # matrix cell holds the mean of the non-NaN taus. The matrix is saved to CSV and plotted.
    print("\n--- Calculating Holmes Linguistic Competencies vs Harness Leaderboard Groups Correlation Matrix (Group-Aggregated Taus) ---")

    required_holmes_levels = ('holmes_linguistic_competency', 'holmes_subtask_id')
    required_harness_levels = ('harness_group', 'subtask_cleaned')

    def _has_required_levels(ranks_df, required_levels):
        """Return True if ranks_df is non-empty and MultiIndexed with every required level name."""
        return (
            not ranks_df.empty
            and isinstance(ranks_df.index, pd.MultiIndex)
            and all(level in ranks_df.index.names for level in required_levels)
        )

    def _collect_rankings(group_rows_df):
        """Return each row's non-null values as a list, skipping rows with no values at all."""
        # NOTE(review): row values are presumably model short names in rank order — confirm
        # against how the *_subtask_ranks_df frames are built.
        rankings = []
        for _, row_series in group_rows_df.iterrows():
            ranks = row_series.dropna().tolist()
            if ranks:  # Only keep rows that contain actual ranks
                rankings.append(ranks)
        return rankings

    def _report_structure_problem(label, ranks_df, required_levels):
        """Print the first reason ranks_df fails the prerequisite check, if any."""
        if ranks_df.empty:
            print(f"- {label} subtask ranks DF is empty.")
        elif not isinstance(ranks_df.index, pd.MultiIndex):
            print(f"- {label} subtask ranks DF is not MultiIndexed as expected.")
        else:
            missing_levels = [level for level in required_levels if level not in ranks_df.index.names]
            if missing_levels:
                print(f"- {label} subtask ranks DF index is missing level(s): {missing_levels}")

    # Check prerequisites
    if _has_required_levels(holmes_subtask_ranks_df, required_holmes_levels) and \
       _has_required_levels(harness_subtask_ranks_df, required_harness_levels):

        # Placeholder / missing labels are excluded from both axes.
        unique_holmes_competencies = sorted([
            str(c) for c in holmes_subtask_ranks_df.index.get_level_values('holmes_linguistic_competency').unique()
            if str(c) not in ['Unknown_Ling_Comp', 'nan', 'none', '']
        ])

        unique_harness_groups = sorted([
            str(g) for g in harness_subtask_ranks_df.index.get_level_values('harness_group').unique()
            if str(g) not in ['Unknown_Harness_Group', 'nan', 'none', '']
        ])

        if not unique_holmes_competencies or not unique_harness_groups:
            print("No valid Holmes competencies or Harness groups found for group-aggregated correlation.")
        else:
            # The Harness rankings do not depend on the Holmes competency, so they are
            # collected once here instead of being rebuilt inside the competency loop.
            harness_rankings_by_group = {}
            for harness_grp_name in unique_harness_groups:
                try:
                    harness_ranks_for_group_df = harness_subtask_ranks_df.xs(harness_grp_name, level='harness_group')
                except KeyError:
                    print(f"KeyError: Harness group '{harness_grp_name}' not found in harness_subtask_ranks_df index.")
                    continue
                harness_group_subtask_rankings_list = _collect_rankings(harness_ranks_for_group_df)
                if harness_group_subtask_rankings_list:
                    harness_rankings_by_group[harness_grp_name] = harness_group_subtask_rankings_list

            group_vs_group_corr_matrix = pd.DataFrame(index=unique_holmes_competencies, columns=unique_harness_groups, dtype=float)

            for competency in unique_holmes_competencies:
                try:
                    # All subtask rows that belong to this Holmes competency
                    holmes_ranks_for_competency_df = holmes_subtask_ranks_df.xs(competency, level='holmes_linguistic_competency')
                except KeyError:
                    print(f"KeyError: Holmes competency '{competency}' not found in holmes_subtask_ranks_df index.")
                    continue
                holmes_competency_subtask_rankings_list = _collect_rankings(holmes_ranks_for_competency_df)
                if not holmes_competency_subtask_rankings_list:
                    continue

                for harness_grp_name, harness_group_subtask_rankings_list in harness_rankings_by_group.items():
                    # Calculate all pairwise Taus between subtasks of the two groups
                    current_pair_taus = []
                    for holmes_s_rank in holmes_competency_subtask_rankings_list:
                        for harness_s_rank in harness_group_subtask_rankings_list:
                            tau = calculate_kendall_tau_with_pandas(holmes_s_rank, harness_s_rank, COMPARABLE_MODEL_SHORT_NAMES)
                            if not pd.isna(tau):
                                current_pair_taus.append(tau)

                    # Cells with no valid tau stay NaN and may be dropped below.
                    if current_pair_taus:
                        group_vs_group_corr_matrix.at[competency, harness_grp_name] = np.mean(current_pair_taus)

            # Remove competencies / groups for which no tau could be computed at all.
            group_vs_group_corr_matrix.dropna(how='all', axis=0, inplace=True)
            group_vs_group_corr_matrix.dropna(how='all', axis=1, inplace=True)

            if not group_vs_group_corr_matrix.empty:
                # Save the group-aggregated correlation matrix to a CSV file
                group_corr_csv_path = os.path.join(OUTPUT_DIR, "correlation_holmes_ling_competencies_vs_harness_groups.csv")
                group_vs_group_corr_matrix.to_csv(group_corr_csv_path, index=True)
                print(f"Saved group-aggregated correlation matrix to {group_corr_csv_path}")

                print("Holmes Linguistic Competencies vs Harness Leaderboard Groups Correlation Matrix (sample):\n", group_vs_group_corr_matrix.head())
                fig_group_corr = px.imshow(
                    group_vs_group_corr_matrix.astype(float).sort_index(axis=0).sort_index(axis=1),
                    text_auto=".2f", aspect="auto",
                    color_continuous_scale=heatmap_color_scale, range_color=[-1,1],
                    title="Kendall's Tau: Holmes Ling. Competencies vs. Harness Leaderboard Groups<br>(Avg. of Subtask-Pair Taus)"
                )
                fig_group_corr.update_xaxes(tickangle=45, automargin=True, title_text="Harness Leaderboard Groups")
                fig_group_corr.update_yaxes(automargin=True, title_text="Holmes Linguistic Competencies")
                # Grow the figure with the matrix size, but never below a readable minimum.
                fig_group_corr.update_layout(
                    height=max(700, 25 * len(group_vs_group_corr_matrix.index)),
                    width=max(1500, 25 * len(group_vs_group_corr_matrix.columns)),
                    **font_config
                )
                fig_group_corr.show()
            else: print("Holmes Linguistic Competencies vs Harness Leaderboard Groups correlation matrix is empty after NaN drop.")
    else:
        print("Skipping Holmes Linguistic Competencies vs Harness Leaderboard Groups Correlation: Prerequisite data or correct DataFrame structure missing.")
        # Report, per frame, which prerequisite failed (empty, not MultiIndexed, or missing levels).
        _report_structure_problem("Holmes", holmes_subtask_ranks_df, required_holmes_levels)
        _report_structure_problem("Harness", harness_subtask_ranks_df, required_harness_levels)


    print("\n--- Script Finished ---")


Output directory: rankings_output
--- Loading Holmes Data ---
Holmes Summary (Main Tasks) DF loaded: (188, 15)
Holmes Subtasks (with linguistic competencies) DF loaded: (216, 20)

--- Loading Harness Data ---
Harness Summary DF loaded: (3, 15)
Harness Subtasks DF loaded: (42, 17)

--- Generating Holmes Rankings ---
Holmes Main Task Ranks DF head:
                                                   Rank 1          Rank 2  \
Task                                                                       
SemAntoNeg                                Linear (Coder)    Qwen2.5 Base   
arg-is-abstract                           Linear (Coder)    Ties (Coder)   
arg-is-kind                                 Qwen2.5 Math     Ties (Math)   
arg-is-particular                Task Arithmetic (Coder)  Linear (Coder)   
bioscope-negation-span-classify              Ties (Math)   Qwen2.5 Coder   

                                                  Rank 3         Rank 4  \
Task                                      


--- Calculating Holmes Linguistic Competencies vs Harness Leaderboard Groups Correlation Matrix (Group-Aggregated Taus) ---
Saved group-aggregated correlation matrix to rankings_output/correlation_holmes_ling_competencies_vs_harness_groups.csv
Holmes Linguistic Competencies vs Harness Leaderboard Groups Correlation Matrix (sample):
                  bbh      gpqa  leaderboard  math_hard      musr
discourse   0.144994  0.124298     0.199121   0.203768  0.171673
morphology  0.230869  0.209023     0.298446   0.273326  0.216708
reasoning   0.175898  0.187636     0.220652   0.267884  0.150543
semantics   0.136862  0.107985     0.175853   0.164626  0.165910
syntax      0.200198  0.169577     0.248730   0.221202  0.182099



--- Script Finished ---
