In [10]:
# -*- coding: utf-8 -*-
import os
import json
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
from scipy.cluster.hierarchy import linkage, leaves_list
from scipy.spatial.distance import pdist
from sklearn.preprocessing import StandardScaler
from collections import defaultdict
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.io as pio
import re # Added for cleaning plot names

# Make "plotly_white" the default plotly.io template so the figure built later in
# this script does not have to pass a template explicitly.
pio.templates.default = "plotly_white"

# --- Helper function for cleaning model names for plots ---
def clean_plot_name(name):
    """Return a display label for a model name.

    ``None`` becomes ``"Unknown"``; anything else is converted with ``str``.
    A trailing ``_<digits>`` suffix is removed unless the label starts with
    ``"Qwen2.5"`` and does not contain ``"Merged_"``.
    """
    if name is None:
        return "Unknown"

    label = str(name)
    # Plain Qwen2.5 names (no "Merged_" marker) are shown exactly as given.
    keeps_suffix = label.startswith("Qwen2.5") and "Merged_" not in label
    return label if keeps_suffix else re.sub(r'_\d+$', '', label)

# --- Font configuration for plots ---
# Font sizes for the combined figure. The dict is unpacked as keyword arguments
# into fig.update_layout(**font_config), so each key is a layout property path
# written with underscores (e.g. legend_title_font_size -> legend.title.font.size).
# NOTE(review): the xaxis_*/yaxis_* keys name layout.xaxis / layout.yaxis only; in a
# multi-subplot figure the remaining axes are xaxis2, yaxis2, ... — confirm these
# sizes are meant to reach every subplot.
font_config = {
    "title_font_size": 24,
    "font_size": 18,
    "xaxis_title_font_size": 18,
    "yaxis_title_font_size": 18,
    "xaxis_tickfont_size": 16,
    "yaxis_tickfont_size": 16,
    "legend_title_font_size": 24,
    "legend_font_size": 22,
}

# --- Default Plot Dimensions ---
# NOTE(review): neither value is referenced in the visible part of this file (the
# combined figure computes its own height and uses width=2000) — confirm they are
# still needed.
default_plot_height = 400
default_plot_width = 1400

# --- Core Analysis and Plotting Function ---
def _load_leaderboard_with_groups(paths_dict, model_list):
    """Collect per-model leaderboard scores from result JSON files into one frame.

    Args:
        paths_dict: Mapping ``model -> {'leaderboard': path_to_result_json}``.
        model_list: Model names to load; also fixes the order of the score columns.

    Returns:
        A DataFrame with columns ``group``, ``subtask_cleaned``, one score column
        per model (scores multiplied by 100) and ``subtask``. When no file exists
        or no score could be read, an empty DataFrame is returned instead.
    """
    model_cols = list(model_list)
    empty_columns = ['subtask'] + model_cols + ['group', 'subtask_cleaned']

    # Keep only models whose result file is configured and actually exists.
    result_files = {}
    for m in model_cols:
        fp = paths_dict.get(m, {}).get('leaderboard')
        if fp and os.path.isfile(fp):
            result_files[m] = fp
    if not result_files:
        return pd.DataFrame(columns=empty_columns)

    agg = defaultdict(dict)   # subtask key -> {model: score}
    inv_group = {}            # subtask key -> group name without 'leaderboard_'
    groups_loaded = False
    for m, fp in result_files.items():
        # Best effort per file: a broken file is reported and skipped so the
        # remaining models can still be compared.
        try:
            # Explicit encoding so reading does not depend on the platform default.
            with open(fp, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # The subtask -> group mapping is taken from the first file that has one.
            if not groups_loaded and 'group_subtasks' in data:
                for grp, subs in data['group_subtasks'].items():
                    clean_grp_name = grp.replace('leaderboard_', '') if isinstance(grp, str) else grp
                    for sub in subs:
                        inv_group[sub] = clean_grp_name
                groups_loaded = True

            for key, metrics in data.get('results', {}).items():
                if not (isinstance(key, str) and key.startswith('leaderboard_')):
                    continue
                # Metric preference: acc_norm, then acc, then exact_match.
                score = metrics.get(
                    'acc_norm,none',
                    metrics.get('acc,none', metrics.get('exact_match,none', np.nan)),
                )
                if not pd.isna(score):
                    agg[key][m] = score * 100
        except Exception as e:
            print(f"Error processing file {fp} for model {m}: {e}")

    if not agg:
        return pd.DataFrame(columns=empty_columns)

    # One row per subtask, one column per requested model (NaN where missing).
    df = pd.DataFrame.from_dict(agg, orient='index').reindex(columns=model_cols)
    df = df.dropna(how='all')
    if df.empty:
        return pd.DataFrame(columns=empty_columns)

    df['group'] = [inv_group.get(subtask, 'Unknown') for subtask in df.index]
    df['subtask_cleaned'] = df.index.str.replace('leaderboard_', '', regex=False)
    df = df.reset_index().rename(columns={'index': 'subtask'})
    return df[['group', 'subtask_cleaned'] + model_cols + ['subtask']]


def run_analysis_for_experiment(experiment_name, models, short_names, tasks, paths):
    """
    Runs the model categorization and data loading pipeline for a given experiment.

    Args:
        experiment_name: Label used in the printed progress messages.
        models: Full model names belonging to the experiment.
        short_names: Mapping from full model name to short display name. The
            short name decides the category: "Qwen2.5 Instruct", "Qwen2.5 Base",
            names starting with "Qwen2.5 Coder" / "Qwen2.5 Math" (specialist),
            everything else counts as a merged model.
        tasks: Not used by this function; kept so existing calls keep working.
        paths: Mapping ``model -> {'leaderboard': result_json_path}``.

    Returns:
        Tuple ``(subtasks_comp_df, comparison_models, instruct_model,
        coder_model, base_model)`` where ``subtasks_comp_df`` holds the main
        leaderboard tasks for the comparison models (possibly empty).

    Raises:
        SystemExit: With status 1 when no instruct or no specialist model is found.
    """
    print("\n" + "="*30)
    print(f" PROCESSING EXPERIMENT: {experiment_name.upper()} ")
    print("="*30 + "\n")

    # --- 1. Model Categorization ---
    instruct_model = None
    coder_model = None
    base_model = None
    merged_models = []
    for m_full_name in models:
        m_short = short_names.get(m_full_name, "")
        if m_short == "Qwen2.5 Instruct":
            instruct_model = m_full_name
        # Prefix match for Coder/Math to handle name variations.
        elif m_short.startswith(("Qwen2.5 Coder", "Qwen2.5 Math")):
            coder_model = m_full_name
        elif m_short == "Qwen2.5 Base":
            base_model = m_full_name
        else:
            merged_models.append(m_full_name)

    # Raise SystemExit directly instead of calling the interactive exit() helper:
    # the raise always stops execution here and reports a failing exit status.
    if not instruct_model:
        print("CRITICAL ERROR: Instruct model not identified.")
        raise SystemExit(1)
    if not coder_model:
        print(f"CRITICAL ERROR: {experiment_name} model not identified.")
        raise SystemExit(1)
    if not merged_models:
        print("WARNING: No merged models identified.")
    if not base_model:
        print("WARNING: Base model not identified in setup.")

    print(f"--- Model Categorization ({experiment_name}) ---")
    if base_model:
        print(f"Base Model: {base_model} ({short_names.get(base_model, 'N/A')})")
    print(f"Instruct Model: {instruct_model} ({short_names.get(instruct_model, 'N/A')})")
    print(f"Specialist Model: {coder_model} ({short_names.get(coder_model, 'N/A')})")
    print(f"Merged Models ({len(merged_models)}):")
    for m in merged_models:
        print(f"  - {m} ({short_names.get(m, 'N/A')})")
    print("-" * 25)

    # Base, instruct, specialist, then merged models; falsy names dropped and
    # duplicates removed while keeping first-seen order.
    candidates = [base_model, instruct_model, coder_model, *merged_models]
    comparison_models = list(dict.fromkeys(m for m in candidates if m))
    print(f"Models for comparison: {[clean_plot_name(short_names.get(m, m)) for m in comparison_models]}")
    print("-" * 25)

    # --- 2. Data Loading ---
    subtasks_df_all = _load_leaderboard_with_groups(paths, models)

    leaderboard_main_subtasks = ["leaderboard_mmlu_pro", "leaderboard_bbh", "leaderboard_gpqa", "leaderboard_math_hard", "leaderboard_ifeval", "leaderboard_musr"]
    if not subtasks_df_all.empty and 'subtask' in subtasks_df_all.columns:
        subtasks_df = subtasks_df_all[subtasks_df_all['subtask'].isin(leaderboard_main_subtasks)].copy()
    else:
        subtasks_df = pd.DataFrame()

    models_in_subtasks_data = [m for m in comparison_models if m in subtasks_df.columns]
    if not subtasks_df.empty and models_in_subtasks_data:
        present_base_cols = [c for c in ['subtask', 'subtask_cleaned', 'group'] if c in subtasks_df.columns]
        subtasks_comp_df = subtasks_df[present_base_cols + models_in_subtasks_data].copy()
    else:
        subtasks_comp_df = pd.DataFrame(columns=['subtask', 'subtask_cleaned', 'group'] + models_in_subtasks_data)

    if not subtasks_comp_df.empty:
        print(f"\n--- Main Leaderboard Tasks DataFrame ({experiment_name}) ---")
        print(subtasks_comp_df)
        print("-" * 50)
    else:
        print(f"\n--- Main Leaderboard Tasks DataFrame is empty for {experiment_name} ---")

    return subtasks_comp_df, comparison_models, instruct_model, coder_model, base_model


# --- Configuration for Each Experiment ---

# --- CODER Experiment ---
# Short display names keyed by full model name; the key order is the model order.
short_names_coder = {
    "Qwen2.5-7B": "Qwen2.5 Base",
    "Qwen2.5-7B-Instruct": "Qwen2.5 Instruct",
    "Qwen2.5-Coder-7B": "Qwen2.5 Coder",
    "Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-task_arithmetic-29": "Task Arithmetic",
    "Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-dare_ties-29": "DARE Ties",
    "Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-ties-29": "Ties",
    "Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-slerp-29": "Slerp",
    "Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-linear-29": "Linear",
}
# Base, instruct, specialist, then the merged variants.
models_coder = list(short_names_coder)
tasks_coder = ["leaderboard"]
# model -> {task -> path of that model's result file for the task}
paths_coder = {
    m: {t: f"organized_results/{t}/{m}/result.json" for t in tasks_coder}
    for m in models_coder
}

# --- MATH Experiment ---
# Short display names keyed by full model name; the key order is the model order.
short_names_math = {
    "Qwen2.5-7B": "Qwen2.5 Base",
    "Qwen2.5-7B-Instruct": "Qwen2.5 Instruct",
    "Qwen2.5-Math-7B": "Qwen2.5 Math",
    "Qwen2.5-7B-Instruct-Qwen2.5-Math-7B-Merged-task_arithmetic-26": "Task Arithmetic",
    "Qwen2.5-7B-Instruct-Qwen2.5-Math-7B-Merged-dare_ties-27": "DARE Ties",
    "Qwen2.5-7B-Instruct-Qwen2.5-Math-7B-Merged-ties-26": "Ties",
    "Qwen2.5-7B-Instruct-Qwen2.5-Math-7B-Merged-slerp-24": "Slerp",
    "Qwen2.5-7B-Instruct-Qwen2.5-Math-7B-Merged-linear-24": "Linear",
}
# Base, instruct, specialist, then the merged variants.
models_math = list(short_names_math)
tasks_math = ["leaderboard"]
# model -> {task -> path of that model's result file for the task}
paths_math = {
    m: {t: f"organized_results/{t}/{m}/result.json" for t in tasks_math}
    for m in models_math
}


# --- Run Analysis for Both Experiments ---
# Each call returns (comparison frame, comparison models, instruct, specialist, base).
(
    subtasks_comp_df_coder,
    comparison_models_coder,
    instruct_model_coder,
    specialist_model_coder,
    base_model_coder,
) = run_analysis_for_experiment(
    "Coder", models_coder, short_names_coder, tasks_coder, paths_coder
)
(
    subtasks_comp_df_math,
    comparison_models_math,
    instruct_model_math,
    specialist_model_math,
    base_model_math,
) = run_analysis_for_experiment(
    "Math", models_math, short_names_math, tasks_math, paths_math
)


# --- 7. Combined Plotting Section ---
# Three-line banner, surrounded by blank lines.
print("\n" + "="*30, " GENERATING COMBINED PLOT ", "="*30 + "\n", sep="\n")

def prepare_plot_data(df, comparison_models, short_names_map):
    """Reshape a wide per-task score frame into one row per (task, model).

    Args:
        df: Frame with a ``subtask_cleaned`` column and one score column per
            model (columns named by full model name).
        comparison_models: Full model names to include, in output order.
        short_names_map: Mapping from full model name to short display name.

    Returns:
        A DataFrame with columns ``Task``, ``Model Short Name``, ``Score`` and
        ``Model Type``. The columns are present even when there are no rows, so
        callers can index them without checking for emptiness first.
    """
    output_columns = ['Task', 'Model Short Name', 'Score', 'Model Type']
    if df.empty or 'subtask_cleaned' not in df.columns:
        return pd.DataFrame(columns=output_columns)

    # Only models that have a column with at least one non-missing score.
    models_to_plot_bar = [m for m in comparison_models if m in df.columns and not df[m].isna().all()]

    plot_data_list = []
    for task_name in sorted(df['subtask_cleaned'].unique()):
        task_data = df[df['subtask_cleaned'] == task_name]
        if task_data.empty:
            continue
        first_row = task_data.iloc[0]  # one row per task is used
        for model_full_name in models_to_plot_bar:
            score = first_row[model_full_name]
            if pd.isna(score):
                continue

            # A model without a short-name entry falls through to 'Merged'
            # instead of failing on None.startswith(...).
            s_name = short_names_map.get(model_full_name) or ""
            if s_name == "Qwen2.5 Base":
                current_model_type = "Base"
            elif s_name == "Qwen2.5 Instruct":
                current_model_type = "Instruct"
            elif s_name.startswith(("Qwen2.5 Coder", "Qwen2.5 Math")):
                current_model_type = "Specialist"
            else:
                current_model_type = "Merged"

            plot_data_list.append({
                'Task': task_name,
                'Model Short Name': clean_plot_name(short_names_map.get(model_full_name, model_full_name)),
                'Score': score,
                'Model Type': current_model_type,
            })
    return pd.DataFrame(plot_data_list, columns=output_columns)

# Prepare data for both experiments
plot_df_coder = prepare_plot_data(subtasks_comp_df_coder, comparison_models_coder, short_names_coder)
plot_df_math = prepare_plot_data(subtasks_comp_df_math, comparison_models_math, short_names_math)


def _distinct_values(plot_df, column):
    """Return the distinct values of *column*, or [] when the frame lacks it."""
    # An experiment without data can yield a frame with no columns at all;
    # treating that as "no values" lets the no-data message be reached later
    # instead of failing with a KeyError here.
    if column not in plot_df.columns:
        return []
    return list(plot_df[column].unique())


def _order_models(present_models, core_models):
    """Return core models (in the given order) followed by the other models, sorted."""
    core_present = [m for m in core_models if m in present_models]
    others = sorted(m for m in present_models if m not in core_present)
    return core_present + others


all_tasks = sorted(set(_distinct_values(plot_df_coder, 'Task')) | set(_distinct_values(plot_df_math, 'Task')))

# Model order with core models first in the category array, merged models after.
# For Coder plot
all_coder_models = _distinct_values(plot_df_coder, 'Model Short Name')
core_coder = ['Qwen2.5 Base', 'Qwen2.5 Instruct', 'Qwen2.5 Coder']
models_coder_plot = _order_models(all_coder_models, core_coder)

# For Math plot
all_math_models = _distinct_values(plot_df_math, 'Model Short Name')
core_math = ['Qwen2.5 Base', 'Qwen2.5 Instruct', 'Qwen2.5 Math']
models_math_plot = _order_models(all_math_models, core_math)


def _draw_experiment_column(fig, task, row_num, col_num, plot_df, scores_df,
                            instruct_model, specialist_model, model_order,
                            color_map, label_font_size):
    """Fill one subplot cell (one task of one experiment) of the combined figure.

    Adds one horizontal bar trace per model type, a shaded band between the
    instruct and specialist scores with a dashed line at each, and sets the
    x-axis range to 120% of the highest score. Does nothing when *plot_df* has
    no rows for *task*.
    """
    task_bars = plot_df[plot_df['Task'] == task]
    if task_bars.empty:
        return

    text_threshold = 15  # bars shorter than this get their label outside, in black
    for model_type in ("Base", "Instruct", "Specialist", "Merged"):
        bars = task_bars[task_bars['Model Type'] == model_type]
        if bars.empty:
            continue
        is_short = [score < text_threshold for score in bars['Score']]
        label_positions = ['outside' if short else 'inside' for short in is_short]
        label_colors = ['black' if short else 'white' for short in is_short]
        labels = bars.apply(lambda row: f"<b>{row['Model Short Name']}</b>: {row['Score']:.1f}%", axis=1)

        fig.add_trace(
            go.Bar(
                y=bars['Model Short Name'],
                x=bars['Score'],
                name=model_type,
                marker_color=color_map.get(model_type),
                orientation='h',
                text=labels,
                textposition=label_positions,
                insidetextanchor='middle',
                textfont=dict(size=label_font_size, color=label_colors),
                cliponaxis=False,
            ),
            row=row_num,
            col=col_num,
        )

    # Reference band and lines span every model slot of the cell.
    band_top = len(model_order) - 0.5
    score_row = scores_df[scores_df['subtask_cleaned'] == task].iloc[0]
    instruct_score = score_row.get(instruct_model)
    specialist_score = score_row.get(specialist_model)

    if pd.notna(instruct_score) and pd.notna(specialist_score):
        fig.add_shape(
            type="rect",
            x0=min(instruct_score, specialist_score),
            x1=max(instruct_score, specialist_score),
            y0=-0.5,
            y1=band_top,
            fillcolor="rgba(255, 128, 128, 0.2)",
            line_width=0,
            layer="below",
            row=row_num,
            col=col_num,
        )
        for reference_score, reference_type in (
            (instruct_score, 'Instruct'),
            (specialist_score, 'Specialist'),
        ):
            fig.add_shape(
                type="line",
                x0=reference_score,
                x1=reference_score,
                y0=-0.5,
                y1=band_top,
                line=dict(color=color_map[reference_type], dash="dash", width=2),
                layer="above",
                row=row_num,
                col=col_num,
            )

    # Dynamic x-axis: leave 20% headroom beyond the best score for outside labels.
    fig.update_xaxes(range=[0, task_bars['Score'].max() * 1.20], row=row_num, col=col_num)


if all_tasks and (models_coder_plot or models_math_plot):
    # One row per task; left cell is the Coder experiment, right cell is Math.
    subplot_titles = [
        title
        for task in all_tasks
        for title in (f"Coder - {task}", f"Math - {task}")
    ]

    fig = make_subplots(
        rows=len(all_tasks),
        cols=2,
        subplot_titles=subplot_titles,
        horizontal_spacing=0.03,
        vertical_spacing=0.05
    )

    color_map = {
        "Base": 'rgb(100, 149, 237)',
        "Instruct": 'rgb(50, 205, 50)',
        "Specialist": 'rgb(255, 165, 0)',
        "Merged": 'rgb(192, 192, 192)'
    }

    # NOTE(review): the bar labels use font size 12 in the Coder column and 22 in
    # the Math column — confirm whether that difference is intended.
    for row_num, task in enumerate(all_tasks, start=1):
        # --- Column 1: Coder ---
        _draw_experiment_column(
            fig, task, row_num, 1,
            plot_df_coder, subtasks_comp_df_coder,
            instruct_model_coder, specialist_model_coder,
            models_coder_plot, color_map, 12,
        )
        # --- Column 2: Math ---
        _draw_experiment_column(
            fig, task, row_num, 2,
            plot_df_math, subtasks_comp_df_math,
            instruct_model_math, specialist_model_math,
            models_math_plot, color_map, 22,
        )

    # Figure height scales with the number of bars per cell and the number of tasks.
    height_per_facet = max(len(models_coder_plot), len(models_math_plot)) * 20 + 40
    total_height = height_per_facet * len(all_tasks)

    fig.update_layout(
        title_text='Absolute Performance Comparison by Task: Coder vs. Math Experiments',
        barmode='stack',
        height=total_height,
        width=2000,
        showlegend=False,  # Hide the legend
        margin=dict(t=100, l=10, r=10, b=50),
        **font_config
    )

    # Fix the bar order per column; model names are shown in the bar labels instead of ticks.
    fig.update_yaxes(categoryorder='array', categoryarray=models_coder_plot, showticklabels=False, col=1)
    fig.update_yaxes(categoryorder='array', categoryarray=models_math_plot, showticklabels=False, col=2)

    # Subplot titles keep only the task part ("Coder - bbh" -> "bbh").
    fig.for_each_annotation(lambda a: a.update(text=a.text.split(" - ")[-1], font=dict(size=16)))

    # Main column titles, added after the clean-up above so they are not rewritten.
    fig.add_annotation(x=0.18, y=1.02, yanchor='bottom', text="<b>Coder Experiment</b>", showarrow=False, xref="paper", yref="paper", font=dict(size=20))
    fig.add_annotation(x=0.80, y=1.02, yanchor='bottom', text="<b>Math Experiment</b>", showarrow=False, xref="paper", yref="paper", font=dict(size=20))

    fig.show()
    print("Generated combined plot: Absolute Performance Comparison by Task: Coder vs. Math Experiments")
else:
    print("No data available to generate the combined plot. Exiting plot generation.")

print("\n--- Script Finished ---")



 PROCESSING EXPERIMENT: CODER 

--- Model Categorization (Coder) ---
Base Model: Qwen2.5-7B (Qwen2.5 Base)
Instruct Model: Qwen2.5-7B-Instruct (Qwen2.5 Instruct)
Specialist Model: Qwen2.5-Coder-7B (Qwen2.5 Coder)
Merged Models (5):
  - Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-task_arithmetic-29 (Task Arithmetic)
  - Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-dare_ties-29 (DARE Ties)
  - Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-ties-29 (Ties)
  - Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-slerp-29 (Slerp)
  - Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-linear-29 (Linear)
-------------------------
Models for comparison: ['Qwen2.5 Base', 'Qwen2.5 Instruct', 'Qwen2.5 Coder', 'Task Arithmetic', 'DARE Ties', 'Ties', 'Slerp', 'Linear']
-------------------------

--- Main Leaderboard Tasks DataFrame (Coder) ---
                  subtask subtask_cleaned        group  Qwen2.5-7B  \
0         leaderboard_bbh             bbh  leaderboard   51.432043   
25       leaderboard_gpqa           

Generated combined plot: Absolute Performance Comparison by Task: Coder vs. Math Experiments

--- Script Finished ---
