# In [3]:
# -*- coding: utf-8 -*-
import os
import json
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
from scipy.cluster.hierarchy import linkage, leaves_list
from scipy.spatial.distance import pdist
from sklearn.preprocessing import StandardScaler
from collections import defaultdict
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.io as pio
import re # Added for cleaning plot names

# Make "plotly_white" the default template so every figure created below uses it
# without having to pass a template explicitly.
pio.templates.default = "plotly_white"

# --- Helper function for cleaning model names for plots ---
def clean_plot_name(name):
    """Return a display label for *name* suitable for tables and plot legends.

    ``None`` becomes ``"Unknown"``; any other value is converted with ``str``.
    Labels that start with ``"Qwen2.5"`` and do not contain ``"Merged_"`` are
    returned as they are. Every other label loses one trailing ``_<digits>``
    suffix, e.g. ``"Linear_24"`` -> ``"Linear"`` and
    ``"Merged_Linear_24"`` -> ``"Merged_Linear"``.
    """
    if name is None:
        return "Unknown"

    label = str(name)

    # Plain reference-model labels keep their full text.
    keeps_suffix = label.startswith("Qwen2.5") and "Merged_" not in label
    if keeps_suffix:
        return label

    # Merged-model labels: drop the trailing numeric tag.
    return re.sub(r'_\d+$', '', label)

# --- Font configuration for plots ---
# Keyword arguments that are spread into ``update_layout(**font_config)`` by the
# plotting code, so all figures share the same text sizes.
font_config = dict(
    title_font_size=30,
    font_size=20,
    xaxis_title_font_size=20,
    yaxis_title_font_size=20,
    xaxis_tickfont_size=20,  # Adjusted for potentially dense plots
    yaxis_tickfont_size=20,  # Adjusted for potentially dense plots
    legend_title_font_size=20,  # Kept for other plots that might use legends
    legend_font_size=15,  # Kept for other plots
)

# --- Default Plot Dimensions ---
# Pixel sizes passed as ``height=`` / ``width=`` to ``update_layout``.
default_plot_height = 400
default_plot_width = 1400  # Added for wider plots


# --- 1. Configuration ---

# --- Paths and Model Definitions ---
# Full model identifiers. Each identifier is also the directory name used when
# building `paths` below.
models = [
    "Qwen2.5-7B",
    "Qwen2.5-7B-Instruct",
    "Qwen2.5-Coder-7B",
    "Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-task_arithmetic-29",
    "Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-dare_ties-29",
    "Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-ties-29",
    "Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-slerp-29",
    "Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-linear-29",
]

# Display labels for tables and plot legends, keyed by full model identifier.
# The three "Qwen2.5 ..." labels are also what the categorization code below
# matches on to tell the reference models apart from the merged ones.
short_names = {
    "Qwen2.5-7B": "Qwen2.5 Base",
    "Qwen2.5-7B-Instruct": "Qwen2.5 Instruct",
    "Qwen2.5-Coder-7B": "Qwen2.5 Coder",
    "Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-task_arithmetic-29": "Task Arithmetic",
    "Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-dare_ties-29": "DARE Ties",
    "Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-ties-29": "Ties",
    "Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-slerp-29": "Slerp",
    # No trailing whitespace: the label is reused verbatim in column names,
    # CSV cells and legends.
    "Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-linear-29": "Linear",
}

tasks = ["gsm8k", "mmlu", "leaderboard"] # This list defines the order of tasks
# paths[model][task] -> location of that model's result file for that task.
paths = {m: {t: f"organized_results/{t}/{m}/result.json" for t in tasks} for m in models}

# Updated model categorization logic for harness script
# Sort every configured model into base / instruct / coder / merged by its short name.
instruct_model = None
coder_model = None
base_model = None
merged_models = []
for m_full_name in models:  # Iterate through the original full model names
    m_short = short_names.get(m_full_name, "")
    if m_short == "Qwen2.5 Instruct":
        instruct_model = m_full_name
    elif m_short == "Qwen2.5 Coder":
        coder_model = m_full_name
    elif m_short == "Qwen2.5 Base":
        base_model = m_full_name
    else:
        # Anything that is not one of the three reference models counts as merged;
        # this includes identifiers that have no entry in short_names.
        merged_models.append(m_full_name)

# The instruct and coder models anchor every later comparison, so stop without them.
# SystemExit is raised directly: the `exit()` helper is injected by the `site`
# module for interactive sessions and would also report success (status 0).
if not instruct_model:
    print("CRITICAL ERROR: Instruct model not identified.")
    raise SystemExit(1)
if not coder_model:
    print("CRITICAL ERROR: Coder model not identified.")
    raise SystemExit(1)
if not merged_models:
    print("WARNING: No merged models identified.")
if not base_model:
    print("WARNING: Base model not identified in harness script setup.")


print("--- Model Categorization (Harness) ---")
if base_model:
    print(f"Base Model: {base_model} ({short_names.get(base_model, 'N/A')})")
print(f"Instruct Model: {instruct_model} ({short_names.get(instruct_model, 'N/A')})")
print(f"Coder Model: {coder_model} ({short_names.get(coder_model, 'N/A')})")
print(f"Merged Models ({len(merged_models)}):")
for m in merged_models:
    print(f"  - {m} ({short_names.get(m, 'N/A')})")
print("-" * 25)

# Comparison order: base (when configured), instruct, coder, then the merged
# models in their configured order.
comparison_models_ordered = []
if base_model:
    comparison_models_ordered.append(base_model)
comparison_models_ordered.append(instruct_model)
comparison_models_ordered.append(coder_model)
comparison_models_ordered.extend([m for m in merged_models if m])
# dict.fromkeys drops duplicates while keeping first-seen order.
comparison_models = list(dict.fromkeys(m for m in comparison_models_ordered if m))

print(f"Models for comparison (in order): {[clean_plot_name(short_names.get(m, m)) for m in comparison_models]}")
print("-" * 25)

# --- 2. Data Loading ---
def load_summary(paths_dict, model_list, task_list=None):
    """Build a task x model table of headline scores, in percent.

    For every model/task pair the result file named by ``paths_dict[model][task]``
    is read and the task's headline metric is taken from
    ``data['results'][task]``: ``exact_match,strict-match`` for gsm8k,
    ``acc,none`` for mmlu and ``acc_norm,none`` for leaderboard. The value is
    multiplied by 100. Missing paths, missing files, unreadable files and
    missing metrics leave the cell as NaN; file problems are reported on stdout.

    Args:
        paths_dict: Mapping ``model -> {task -> file path}``.
        model_list: Models to load; they become the columns, in this order.
        task_list: Tasks to load; they become the rows, in this order.
            Defaults to the module-level ``tasks`` list.

    Returns:
        DataFrame indexed by task with one column per model. Columns and rows
        that contain no value at all are dropped.
    """
    task_list = list(tasks if task_list is None else task_list)
    key_map = {"gsm8k": "exact_match,strict-match", "mmlu": "acc,none", "leaderboard": "acc_norm,none"}

    # Every cell starts out as NaN, so only successful reads need to write.
    df = pd.DataFrame(index=task_list, columns=model_list, dtype=float)
    for m in model_list:
        model_paths = paths_dict.get(m, {})
        for t in task_list:
            fp = model_paths.get(t)
            if not fp:
                continue
            if not os.path.isfile(fp):
                print(f"Warning: File not found {fp}")
                continue
            try:
                # JSON is UTF-8 by specification; do not depend on the locale default.
                with open(fp, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                val = data.get('results', {}).get(t, {}).get(key_map[t], np.nan)
                if val is not None and not pd.isna(val):
                    df.at[t, m] = val * 100
            except (OSError, ValueError, TypeError, AttributeError, KeyError) as e:
                # Best effort: a broken file only blanks its own cell.
                print(f"Error loading file {fp}: {e}")
                df.at[t, m] = np.nan
    return df.reindex(task_list).dropna(how='all', axis=1).dropna(how='all', axis=0)

def load_leaderboard_with_groups(paths_dict, model_list):
    """Collect per-subtask leaderboard scores (in percent) for several models.

    Each model's ``paths_dict[model]['leaderboard']`` file is read. Every entry
    of ``data['results']`` whose key starts with ``leaderboard_`` contributes one
    score, taken from the first available of ``acc_norm,none``, ``acc,none`` and
    ``exact_match,none`` and multiplied by 100. The subtask -> group mapping is
    taken from ``data['group_subtasks']`` of the first file that provides one,
    with the ``leaderboard_`` prefix removed from group names; subtasks without
    a group are labelled ``'Unknown'``. Files that cannot be processed are
    reported on stdout and skipped.

    Args:
        paths_dict: Mapping ``model -> {task -> file path}``.
        model_list: Models to load; each becomes a score column.

    Returns:
        DataFrame with columns ``group``, ``subtask_cleaned``, one column per
        model, and ``subtask`` (the full result key). When nothing could be
        loaded an empty DataFrame with the same columns is returned.
    """
    model_cols = list(model_list)
    # One schema for every return path, so callers can rely on the column set.
    output_cols = ['group', 'subtask_cleaned'] + model_cols + ['subtask']

    agg = defaultdict(dict)
    inv_group = {}
    groups_loaded = False
    for m in model_cols:
        fp = paths_dict.get(m, {}).get('leaderboard')
        if not fp or not os.path.isfile(fp):
            continue
        try:
            # JSON is UTF-8 by specification; do not depend on the locale default.
            with open(fp, 'r', encoding='utf-8') as f:
                data = json.load(f)
            if 'group_subtasks' in data and not inv_group and not groups_loaded:
                for grp, subs in data['group_subtasks'].items():
                    clean_grp_name = grp.replace('leaderboard_', '') if isinstance(grp, str) else grp
                    for sub in subs:
                        inv_group[sub] = clean_grp_name
                groups_loaded = True
            for key, metrics in data.get('results', {}).items():
                # The prefix test already excludes the aggregate 'leaderboard' entry.
                if isinstance(key, str) and key.startswith('leaderboard_'):
                    score = metrics.get('acc_norm,none', metrics.get('acc,none', metrics.get('exact_match,none', np.nan)))
                    if not pd.isna(score):
                        agg[key][m] = score * 100
        except (OSError, ValueError, TypeError, AttributeError, KeyError) as e:
            # Best effort: a broken file must not stop the remaining models.
            print(f"Error processing file {fp} for model {m}: {e}")

    if not agg:
        return pd.DataFrame(columns=output_cols)

    df = pd.DataFrame.from_dict(agg, orient='index')
    for m_col in model_cols:
        if m_col not in df.columns:
            df[m_col] = np.nan  # model had no usable leaderboard file
    df = df[model_cols].dropna(how='all')
    if df.empty:
        return pd.DataFrame(columns=output_cols)

    df['group'] = df.index.map(lambda x: inv_group.get(x, 'Unknown'))
    df['subtask_cleaned'] = df.index.str.replace('leaderboard_', '', regex=False)
    return df.reset_index().rename(columns={'index': 'subtask'})[output_cols]

# Raw score tables for every configured model.
summary_df = load_summary(paths, models)
subtasks_df = load_leaderboard_with_groups(paths, models)

# Main-task table restricted to the comparison models that actually have data.
models_in_summary_data = [name for name in comparison_models if name in summary_df.columns]
if models_in_summary_data:
    summary_comp_df = summary_df[models_in_summary_data].copy()
else:
    summary_comp_df = pd.DataFrame()

# Subtask table: identifier columns first, then the comparison models with data.
models_in_subtasks_data = [name for name in comparison_models if name in subtasks_df.columns]
if subtasks_df.empty or not models_in_subtasks_data:
    subtasks_comp_df = pd.DataFrame(columns=['subtask', 'subtask_cleaned', 'group'] + models_in_subtasks_data)
else:
    present_base_cols = [c for c in ['subtask', 'subtask_cleaned', 'group'] if c in subtasks_df.columns]
    subtasks_comp_df = subtasks_df[present_base_cols + models_in_subtasks_data].copy()


print("\n--- Summary DataFrame (Comparison Models) ---")
print(summary_comp_df)
print("-" * 50)
if subtasks_comp_df.empty:
    print("\n--- Subtasks DataFrame is empty or could not be loaded/filtered for comparison models ---")
else:
    print("\n--- Subtasks DataFrame (Comparison Models Head) ---")
    print(subtasks_comp_df.head())
    print("-" * 50)

# --- 3. Calculate Differences ---
# Every difference column is "instruct score minus other score". Both reference
# models must be present in the main-task table, otherwise nothing is computed.
diff_cols_main = []
diff_cols_subtasks = []
can_calc_diffs = (
    instruct_model in summary_comp_df.columns
    and coder_model in summary_comp_df.columns
)

if not can_calc_diffs:
    print("Warning: Instruct or Coder model data missing from summary_comp_df. Difference calculations involving them will be skipped or result in NaN.")
    print("Skipping difference calculations as instruct or coder model data is critically missing from summary_comp_df.")
else:
    # --- Main tasks ---
    summary_comp_df['d_coder'] = summary_comp_df[instruct_model] - summary_comp_df[coder_model]
    diff_cols_main.append('d_coder')

    for merged_m in merged_models:
        if merged_m not in summary_comp_df.columns:
            continue
        merged_short_name = clean_plot_name(short_names.get(merged_m, merged_m))
        col_name = f"d_merged_{merged_short_name}"
        summary_comp_df[col_name] = summary_comp_df[instruct_model] - summary_comp_df[merged_m]
        diff_cols_main.append(col_name)

    print("\n--- Summary DataFrame with Differences ---")
    # 'd_coder' is always present here, so there is always something to show.
    diff_cols_main_present = [col for col in diff_cols_main if col in summary_comp_df.columns]
    print(summary_comp_df[diff_cols_main_present])
    print("-" * 50)

    # --- Leaderboard subtasks ---
    if not subtasks_comp_df.empty:
        # The subtask table can lack a reference model even when the main table has it.
        subtask_has_instruct = instruct_model in subtasks_comp_df.columns
        if subtask_has_instruct and coder_model in subtasks_comp_df.columns:
            subtasks_comp_df['d_coder'] = subtasks_comp_df[instruct_model] - subtasks_comp_df[coder_model]
            diff_cols_subtasks.append('d_coder')
        else:
            subtasks_comp_df['d_coder'] = np.nan

        for merged_m in merged_models:
            if not (subtask_has_instruct and merged_m in subtasks_comp_df.columns):
                continue
            merged_short_name = clean_plot_name(short_names.get(merged_m, merged_m))
            col_name = f"d_merged_{merged_short_name}"
            subtasks_comp_df[col_name] = subtasks_comp_df[instruct_model] - subtasks_comp_df[merged_m]
            diff_cols_subtasks.append(col_name)

        print("\n--- Subtasks DataFrame with Differences (Head) ---")
        diff_cols_sub_present = [col for col in diff_cols_subtasks if col in subtasks_comp_df.columns]
        cols_to_show_sub_diff = [c for c in ['subtask_cleaned', 'group'] + models_in_subtasks_data + diff_cols_sub_present if c in subtasks_comp_df.columns]
        if cols_to_show_sub_diff:
            print(subtasks_comp_df[cols_to_show_sub_diff].head())
        else:
            print("No difference columns to show for subtasks or base columns missing.")
        print("-" * 50)


# --- 4. Ranking Generation ---
def generate_rankings(summary_data, subtask_data, model_names_for_ranking, short_names_map):
    """Rank models from best to worst score for each task, subtask and group.

    Args:
        summary_data: DataFrame indexed by task with one column per model.
        subtask_data: DataFrame with one row per subtask, one column per model
            and optionally ``subtask_cleaned`` / ``subtask`` / ``group`` columns.
        model_names_for_ranking: Models to rank; models without a column in the
            respective table are ignored.
        short_names_map: Mapping from model name to display label; labels are
            passed through ``clean_plot_name``.

    Returns:
        Dict that may contain ``'main_tasks'`` (indexed by task), ``'subtasks'``
        (indexed by group and subtask, or by subtask only) and ``'group_avg'``
        (indexed by group, ranking the per-group mean scores). Each value is a
        DataFrame whose ``Rank 1``, ``Rank 2``, ... columns hold model labels;
        models without a score are placed last.
    """
    def add_ranks(row, scores):
        # Highest score first, missing scores at the end.
        ordered_models = scores.sort_values(ascending=False, na_position='last').index.tolist()
        labels = [clean_plot_name(short_names_map.get(model, model)) for model in ordered_models]
        for position, label in enumerate(labels, start=1):
            row[f'Rank {position}'] = label
        return row

    results = {}

    # --- main tasks ---
    summary_models = [name for name in model_names_for_ranking if name in summary_data.columns]
    if summary_models:
        task_rows = [
            add_ranks({'Task': task}, summary_data.loc[task, summary_models])
            for task in summary_data.index
        ]
        if task_rows:
            results['main_tasks'] = pd.DataFrame(task_rows).set_index('Task')
        else:
            results['main_tasks'] = pd.DataFrame()
    else:
        print("No models available in summary_data for ranking.")

    # --- subtasks ---
    if subtask_data.empty:
        return results

    subtask_models = [name for name in model_names_for_ranking if name in subtask_data.columns]
    if not subtask_models:
        print("No models available in subtask_data for ranking.")
        return results

    name_col = 'subtask_cleaned' if 'subtask_cleaned' in subtask_data.columns else 'subtask'
    has_group = 'group' in subtask_data.columns

    subtask_rows = []
    for _, record in subtask_data.iterrows():
        scores = record[subtask_models]
        row = {'Subtask': record[name_col]}
        if has_group:
            row['Group'] = record['group']
        subtask_rows.append(add_ranks(row, scores))

    if subtask_rows:
        rank_table = pd.DataFrame(subtask_rows)
        grouped = has_group and 'Group' in rank_table.columns
        ordered_cols = ['Subtask']
        if grouped:
            ordered_cols.append('Group')
        ordered_cols += [f'Rank {n}' for n in range(1, len(subtask_models) + 1)]
        for col in ordered_cols:
            if col not in rank_table.columns:
                rank_table[col] = np.nan
        rank_table = rank_table[ordered_cols]
        if grouped:
            rank_table = rank_table.sort_values(by=['Group', 'Subtask']).set_index(['Group', 'Subtask'])
        else:
            rank_table = rank_table.set_index('Subtask')
        results['subtasks'] = rank_table
    else:
        results['subtasks'] = pd.DataFrame()

    # --- group averages ---
    if has_group:
        try:
            group_means = subtask_data.groupby('group')[subtask_models].mean(numeric_only=True)
            group_rows = [
                add_ranks({'Group': group_name}, group_means.loc[group_name])
                for group_name in group_means.index
            ]
            if group_rows:
                results['group_avg'] = pd.DataFrame(group_rows).set_index('Group')
            else:
                results['group_avg'] = pd.DataFrame()
        except Exception as e:
            print(f"Could not calculate group average rankings: {e}")
    return results

rankings = generate_rankings(summary_comp_df, subtasks_comp_df, comparison_models, short_names)
print("\n" + "=" * 20 + " MODEL RANKINGS " + "=" * 20)

# Echo each non-empty ranking table; only the subtask table is truncated.
for section_key, section_title, preview_rows in (
    ('main_tasks', "\n--- Main Task Rankings ---", None),
    ('subtasks', "\n--- Subtask Rankings ---", 10),
    ('group_avg', "\n--- Group Average Rankings ---", None),
):
    if section_key not in rankings or rankings[section_key].empty:
        continue
    print(section_title)
    if preview_rows is None:
        print(rankings[section_key])
    else:
        print(rankings[section_key].head(preview_rows))

# Write every non-empty ranking table to its own CSV file.
output_dir_harness = "rankings_output_harness"
os.makedirs(output_dir_harness, exist_ok=True)
print(f"\n--- Saving Rankings to CSV in '{output_dir_harness}/' ---")
for name, df_rank in rankings.items():
    if not isinstance(df_rank, pd.DataFrame) or df_rank.empty:
        continue
    try:
        csv_filename = os.path.join(output_dir_harness, f"{name}_rankings_harness.csv")
        df_rank.to_csv(csv_filename, index=True)
        print(f"Saved {name} rankings to {csv_filename}")
    except Exception as e:
        print(f"Error saving {name} rankings: {e}")
print("=" * 58)


# --- 5. Merged Model Performance Categorization & Task Scenario Table ---

def generate_performance_analysis(df_analyze, df_name_suffix, task_id_col, instruct_m, coder_m, merged_m_list, short_names_map, out_dir):
    """Categorize each merged model against the instruct and coder models, task by task.

    For every task where the merged, instruct and coder scores are all present,
    the merged model is counted as better than both reference models, worse
    than both, or between/equal to them. The function prints the resulting
    tables, writes the per-task table to
    ``task_scenario_rankings_<suffix>.csv`` inside ``out_dir``, shows a stacked
    bar chart of the counts and, when more than one merged model has data,
    a heatmap of the correlation between the models' category counts.

    Args:
        df_analyze: DataFrame with one score column per model.
        df_name_suffix: Label used in printed headers, plot titles and, lower-cased
            with spaces replaced by underscores, in the CSV file name.
        task_id_col: Column holding the task identifier, or ``None`` to use the
            DataFrame index. When an identifier occurs on several rows, only
            its first row is used.
        instruct_m: Column name of the instruct model.
        coder_m: Column name of the coder model.
        merged_m_list: Column names of the merged models; names that are not
            columns of ``df_analyze`` are reported and skipped.
        short_names_map: Mapping from model name to display label; labels are
            passed through ``clean_plot_name``.
        out_dir: Directory that receives the CSV file. It is not created here;
            a failed write is reported and does not stop the analysis.

    Returns:
        None.
    """
    print(f"\n--- Merged Model Performance Categorization for: {df_name_suffix} ---")

    if not (instruct_m and coder_m and instruct_m in df_analyze.columns and coder_m in df_analyze.columns):
        print(f"Skipping analysis for {df_name_suffix}: Instruct or Coder model data missing from DataFrame.")
        return

    if task_id_col is None:  # For summary_df, index is the task
        task_iterable = df_analyze.index

        def lookup_row(task_val):
            return df_analyze.loc[task_val]
    elif task_id_col in df_analyze.columns:  # For subtasks_df
        task_iterable = df_analyze[task_id_col].unique()
        # Index the first row of every task once, instead of filtering the whole
        # frame for each task of each merged model. Missing identifiers are left
        # out, so looking them up yields None and the task is skipped.
        rows_by_task = {}
        for _, candidate_row in df_analyze.iterrows():
            task_key = candidate_row[task_id_col]
            if not pd.isna(task_key):
                rows_by_task.setdefault(task_key, candidate_row)
        lookup_row = rows_by_task.get
    else:
        print(f"Error: Task identifier '{task_id_col}' not found for {df_name_suffix}.")
        return

    # task -> category -> labels of the merged models that fall into it
    task_categorization_for_table = {
        task_val: {"Better_than_both": [], "Worse_than_both": [], "Between_Equal": []}
        for task_val in task_iterable
    }

    analysis_results = []
    for merged_model_full_name in merged_m_list:
        if merged_model_full_name not in df_analyze.columns:
            print(f"Skipping categorization for {merged_model_full_name} ({df_name_suffix}) as it's not in the DataFrame.")
            continue

        merged_label = clean_plot_name(short_names_map.get(merged_model_full_name, merged_model_full_name))
        counts = {"Better_than_both": 0, "Worse_than_both": 0, "Between_Equal": 0}

        for task_value in task_iterable:
            task_data_row = lookup_row(task_value)
            if task_data_row is None:
                continue

            merged_score = task_data_row[merged_model_full_name]
            instruct_score = task_data_row[instruct_m]
            coder_score = task_data_row[coder_m]
            if pd.isna(merged_score) or pd.isna(instruct_score) or pd.isna(coder_score):
                continue

            if merged_score > max(instruct_score, coder_score):
                category = "Better_than_both"
            elif merged_score < min(instruct_score, coder_score):
                category = "Worse_than_both"
            else:
                # Neither above the maximum nor below the minimum.
                category = "Between_Equal"
            counts[category] += 1
            task_categorization_for_table[task_value][category].append(merged_label)

        analysis_results.append({
            "Merged Model": merged_label,
            "Better than Instruct & Coder": counts["Better_than_both"],
            "Worse than Instruct & Coder": counts["Worse_than_both"],
            "Between/Equal to Instruct & Coder": counts["Between_Equal"],
        })

    # --- per-task table ---
    task_scenario_data = [
        {
            "Task": task_name_iter,
            "Better_Count": len(categories["Better_than_both"]),
            "Better_Models": ", ".join(sorted(set(categories["Better_than_both"]))),
            "Worse_Count": len(categories["Worse_than_both"]),
            "Worse_Models": ", ".join(sorted(set(categories["Worse_than_both"]))),
            "Between_Equal_Count": len(categories["Between_Equal"]),
            "Between_Equal_Models": ", ".join(sorted(set(categories["Between_Equal"]))),
        }
        for task_name_iter, categories in task_categorization_for_table.items()
    ]
    task_scenario_df = pd.DataFrame(task_scenario_data)
    print(f"\n--- Task Scenario Ranking Table ({df_name_suffix}) ---")
    print(task_scenario_df.head())
    try:
        task_scenario_csv_filename = os.path.join(out_dir, f"task_scenario_rankings_{df_name_suffix.lower().replace(' ','_')}.csv")
        task_scenario_df.to_csv(task_scenario_csv_filename, index=False)
        print(f"Saved task scenario rankings for {df_name_suffix} to {task_scenario_csv_filename}")
    except (OSError, ValueError) as e:
        print(f"Error saving task scenario rankings for {df_name_suffix} to CSV: {e}")
    print("="*30)

    if not analysis_results:
        print(f"No merged model comparison counts generated for {df_name_suffix}.")
        print("="*58)
        return

    # --- per-model counts: table and stacked bar chart ---
    category_columns = ["Better than Instruct & Coder", "Worse than Instruct & Coder", "Between/Equal to Instruct & Coder"]
    counts_df = pd.DataFrame(analysis_results)
    print(f"\n--- Counts of Merged Model Performance Categories ({df_name_suffix} Overall) ---")
    print(counts_df)
    counts_df_melted = counts_df.melt(id_vars="Merged Model",
                                      value_vars=category_columns,
                                      var_name="Category", value_name="Number of Tasks")
    fig_counts_title = f"Merged Model Performance vs. Instruct & Coder ({df_name_suffix} Overall)"
    fig_counts = px.bar(counts_df_melted, x="Merged Model", y="Number of Tasks", color="Category",
                        title=fig_counts_title,
                        barmode='stack',
                        labels={"Number of Tasks": f"Number of {df_name_suffix}"})
    # Keep the bars in the order the merged models were analysed.
    fig_counts.update_xaxes(categoryorder="array", categoryarray=counts_df["Merged Model"].tolist())
    fig_counts.update_layout(
        height=default_plot_height + 50,
        width=default_plot_width,
        legend=dict(orientation="h", yanchor="bottom", y=-0.3, xanchor="center", x=0.5),
        **font_config
    )
    fig_counts.show()
    print(f"Generated plot: {fig_counts_title}")

    # --- correlation of category counts between merged models ---
    if len(counts_df) > 1:
        # Transpose so that every merged model is a column; corr() then compares models.
        correlation_matrix = counts_df.set_index("Merged Model")[category_columns].T.corr()
        print(f"\n--- Correlation Matrix of Performance Categories Between Merged Models ({df_name_suffix} Overall) ---")
        print(correlation_matrix)
        fig_corr_heatmap_title = f"Correlation of Perf. Categories Between Merged Models ({df_name_suffix})"
        fig_corr_heatmap = px.imshow(correlation_matrix, text_auto=True, aspect="auto",
                                     color_continuous_scale='RdBu_r', range_color=[-1,1],
                                     title=fig_corr_heatmap_title)
        fig_corr_heatmap.update_layout(width=default_plot_width, height=600, **font_config)
        fig_corr_heatmap.show()
        print(f"Generated plot: {fig_corr_heatmap_title}")
    else:
        print(f"Not enough merged models with data to calculate category correlation for {df_name_suffix}.")
    print("="*58)

# Arguments that are identical for the main-task and the subtask analysis.
common_analysis_args = dict(
    instruct_m=instruct_model,
    coder_m=coder_model,
    merged_m_list=merged_models,
    short_names_map=short_names,
    out_dir=output_dir_harness,
)

if not summary_comp_df.empty:
    # task_id_col=None: the DataFrame index identifies the task.
    generate_performance_analysis(
        summary_comp_df,
        df_name_suffix="MainTasks",
        task_id_col=None,
        **common_analysis_args
    )

if subtasks_comp_df.empty or 'subtask_cleaned' not in subtasks_comp_df.columns:
    print("Skipping performance categorization for Leaderboard Subtasks: DataFrame is empty or 'subtask_cleaned' column is missing.")
else:
    # Subtasks are identified by the 'subtask_cleaned' column.
    generate_performance_analysis(
        subtasks_comp_df,
        df_name_suffix="LeaderboardSubtasks",
        task_id_col='subtask_cleaned',
        **common_analysis_args
    )
print("="*58)


# --- 6. Original Plotting Section ---
print("\n--- Generating Original Plots (Harness Data) ---")
# Legend labels of the two reference models.
instruct_short_label = clean_plot_name(short_names.get(instruct_model, "Instruct"))
coder_short_label = clean_plot_name(short_names.get(coder_model, "Coder"))

# Plot: Difference Trends on Main Tasks (vs Instruct)
# One line per difference column: Instruct–Coder (dashed, circles) and
# Instruct–<merged model> (solid, squares).
if not summary_comp_df.empty and 'd_coder' in summary_comp_df.columns:
    fig1 = go.Figure()
    if not summary_comp_df['d_coder'].isna().all():
        fig1.add_trace(go.Scatter(
            x=summary_comp_df.index,
            y=summary_comp_df['d_coder'],
            mode='lines+markers',
            name=f'{instruct_short_label}–{coder_short_label}',
            marker=dict(symbol='circle', size=8),
            line=dict(dash='dash'),
            hovertemplate='Task: %{x}<br>Difference: %{y:.2f}%<extra></extra>',
        ))

    colors = px.colors.qualitative.Plotly
    # The Instruct–Coder trace has no explicit colour, so it is drawn in the first
    # colour of the layout colourway, which in Plotly's built-in templates is the
    # same as colors[0]. Start the merged traces after it so that no two lines
    # share a colour.
    merged_plot_idx = len(fig1.data)
    for diff_col in diff_cols_main:
        if not diff_col.startswith('d_merged_'):
            continue
        if diff_col not in summary_comp_df.columns or summary_comp_df[diff_col].isna().all():
            continue
        merged_short_name_plot = diff_col.replace('d_merged_', '')
        cleaned_merged_name_plot = clean_plot_name(merged_short_name_plot)  # Ensures _XX is removed
        fig1.add_trace(go.Scatter(
            x=summary_comp_df.index,
            y=summary_comp_df[diff_col],
            mode='lines+markers',
            name=f'{instruct_short_label}–{cleaned_merged_name_plot}',
            marker=dict(symbol='square', size=8, color=colors[merged_plot_idx % len(colors)]),
            hovertemplate='Task: %{x}<br>Difference: %{y:.2f}%<extra></extra>',
        ))
        merged_plot_idx += 1

    if fig1.data:
        fig1_title = 'Difference Trends on Main Tasks (vs Instruct) - All Merged'
        fig1.update_layout(
            title=fig1_title,
            xaxis_title='Task', yaxis_title='Performance Difference (%)',
            legend_title_text='Difference Type', hovermode='x unified',
            width=default_plot_width,
            **font_config
        )
        fig1.show()
        print(f"Generated plot: {fig1_title}")
    else:
        print("Skipping plot 'Difference Trends on Main Tasks': No data to plot.")
else:
    print("Skipping plot 'Difference Trends on Main Tasks': summary_comp_df is empty or 'd_coder' column missing.")


# Plot: Absolute Performance on Main Tasks (Line Chart)
if summary_comp_df.empty:
    print("Skipping plot 'Absolute Performance on Main Tasks (Line Chart)': summary_comp_df is empty.")
else:
    fig1_abs = go.Figure()
    colors_line = px.colors.qualitative.Plotly
    # Only models that have at least one score on the main tasks get a line.
    models_to_plot_abs = [
        name for name in comparison_models
        if name in summary_comp_df.columns and not summary_comp_df[name].isna().all()
    ]

    # (marker symbol, line dash) per model. Later assignments win, which gives the
    # precedence base > instruct > coder > merged; anything else falls back to
    # a circle on a solid line.
    trace_styles = {name: ('square', 'dot') for name in merged_models}
    trace_styles[coder_model] = ('diamond', 'dash')
    trace_styles[instruct_model] = ('circle', 'solid')
    trace_styles[base_model] = ('star', 'dashdot')

    for plot_idx, model_name_abs in enumerate(models_to_plot_abs):
        cleaned_short_name_abs = clean_plot_name(short_names.get(model_name_abs, model_name_abs))
        current_symbol, current_line_style = trace_styles.get(model_name_abs, ('circle', 'solid'))
        trace_color = colors_line[plot_idx % len(colors_line)]

        fig1_abs.add_trace(go.Scatter(
            x=summary_comp_df.index,
            y=summary_comp_df[model_name_abs],
            mode='lines+markers',
            name=cleaned_short_name_abs,
            marker=dict(symbol=current_symbol, size=8, color=trace_color),
            line=dict(dash=current_line_style),
            hovertemplate='Task: %{x}<br>Score: %{y:.2f}%<extra></extra>',
        ))

    if not fig1_abs.data:
        print("Skipping plot 'Absolute Performance on Main Tasks (Line Chart)': No data to plot.")
    else:
        fig1_abs_title = 'Absolute Performance on Main Tasks (Line Chart)'
        fig1_abs.update_layout(
            title=fig1_abs_title,
            xaxis_title='Task', yaxis_title='Performance Score (%)',
            legend_title_text='Model', hovermode='x unified',
            width=default_plot_width,
            **font_config
        )
        fig1_abs.show()
        print(f"Generated plot: {fig1_abs_title}")

# New Plot: Absolute Performance Comparison on Main Tasks (Faceted Horizontal Bar Chart with Shaded Area)
# One facet row per task, one horizontal bar per model. For every task where both
# the instruct and the coder model have a score, two dashed vertical lines mark
# those scores and the span between them is shaded.
if not summary_comp_df.empty:
    models_to_plot_bar = [m for m in comparison_models if m in summary_comp_df.columns and not summary_comp_df[m].isna().all()]
    if models_to_plot_bar:

        # Define color map based on NEW short names for legend/coloring
        color_map_specific = {}
        if base_model: color_map_specific["Qwen2.5 Base"] = 'rgb(100, 149, 237)'
        if instruct_model: color_map_specific["Qwen2.5 Instruct"] = 'rgb(50, 205, 50)'
        if coder_model: color_map_specific["Qwen2.5 Coder"] = 'rgb(255, 165, 0)'
        color_map_specific['Merged'] = 'rgb(192, 192, 192)' # For all merged types

        # Short names that get their own color; every other model is bucketed as 'Merged'.
        reference_short_names = ("Qwen2.5 Base", "Qwen2.5 Instruct", "Qwen2.5 Coder")

        plot_data_list = []
        for task_idx, task_name in enumerate(tasks):
            if task_name not in summary_comp_df.index:
                continue
            for model_idx, model_full_name in enumerate(comparison_models):
                if model_full_name not in models_to_plot_bar:
                    continue
                score = summary_comp_df.loc[task_name, model_full_name]
                if pd.isna(score):
                    continue

                s_name = short_names.get(model_full_name)
                current_model_type = s_name if s_name in reference_short_names else 'Merged'

                plot_data_list.append({
                    'Task': task_name,
                    'Model Short Name': clean_plot_name(short_names.get(model_full_name, model_full_name)), # For Y-axis
                    'Score': score,
                    'Model Type': current_model_type, # For color
                    'Task Index': task_idx,
                    'Model Index': model_idx
                })

        if not plot_data_list:
            print("Skipping 'Absolute Performance Comparison on Main Tasks (Faceted Bar Chart)': No data to plot after filtering.")
        else:
            horizontal_bar_df = pd.DataFrame(plot_data_list)
            horizontal_bar_df.sort_values(by=['Task Index', 'Model Index'], ascending=[True, True], inplace=True)

            fig_main_bar_faceted_title = 'Absolute Performance Comparison by Task'
            fig_main_bar_faceted = px.bar(
                horizontal_bar_df,
                x='Score',
                y='Model Short Name',
                color='Model Type', # Use the determined model type for color
                color_discrete_map=color_map_specific,
                orientation='h',
                title=fig_main_bar_faceted_title,
                labels={'Score': 'Performance Score (%)', 'Model Short Name': 'Model', 'Model Type': 'Model Category'},
                text='Score',
                facet_row='Task',
                category_orders={"Task": tasks}
            )

            fig_main_bar_faceted.update_traces(
                texttemplate='%{text:.2f}%',
                textposition='outside'
            )

            # clean_plot_name() strips a trailing "_NN", so distinct models can end up
            # with the same label. De-duplicate (keeping first-seen order) so the
            # category array and the category count used for shape heights match
            # the categories that are actually drawn.
            model_order_for_y = list(dict.fromkeys(
                clean_plot_name(short_names.get(m, m)) for m in comparison_models if m in models_to_plot_bar
            ))
            fig_main_bar_faceted.update_yaxes(categoryorder='array', categoryarray=model_order_for_y, title=None)
            fig_main_bar_faceted.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

            num_y_categories = len(model_order_for_y)

            tasks_actually_plotted = [t for t in tasks if t in horizontal_bar_df['Task'].unique()]
            num_total_facets = len(tasks_actually_plotted)

            for facet_idx, task_name in enumerate(tasks_actually_plotted):
                # task_name is guaranteed to be in the index (it produced at least one bar);
                # a missing model column or a NaN cell both end up as NaN here.
                instruct_score_val = summary_comp_df.loc[task_name, instruct_model] if instruct_model in summary_comp_df.columns else np.nan
                coder_score_val = summary_comp_df.loc[task_name, coder_model] if coder_model in summary_comp_df.columns else np.nan

                if pd.isna(instruct_score_val) or pd.isna(coder_score_val):
                    print(f"Skipping lines/area for task '{task_name}' due to missing Instruct/Coder scores.")
                    continue

                # Plotly Express numbers facet-row axes from the bottom up, while the
                # first task is drawn at the top, hence the reversed index.
                axis_num_suffix = num_total_facets - facet_idx

                current_xaxis_ref = f'x{axis_num_suffix}' if axis_num_suffix > 1 else 'x'
                current_yaxis_ref = f'y{axis_num_suffix}' if axis_num_suffix > 1 else 'y'

                # On a categorical axis, -0.5 .. n-0.5 spans all n categories.
                fig_main_bar_faceted.add_shape(
                    type="rect",
                    xref=current_xaxis_ref, yref=current_yaxis_ref,
                    x0=min(instruct_score_val, coder_score_val),
                    x1=max(instruct_score_val, coder_score_val),
                    y0=-0.5, y1=num_y_categories - 0.5,
                    fillcolor="rgba(255, 128, 128, 0.2)", # Light red shade
                    line_width=0,
                    layer="below"
                )
                fig_main_bar_faceted.add_shape(
                    type="line",
                    xref=current_xaxis_ref, yref=current_yaxis_ref,
                    x0=instruct_score_val, y0=-0.5,
                    x1=instruct_score_val, y1=num_y_categories - 0.5,
                    line=dict(color=color_map_specific.get('Qwen2.5 Instruct', 'green'), dash="dash", width=2),
                    layer="above"
                )
                fig_main_bar_faceted.add_shape(
                    type="line",
                    xref=current_xaxis_ref, yref=current_yaxis_ref,
                    x0=coder_score_val, y0=-0.5,
                    x1=coder_score_val, y1=num_y_categories - 0.5,
                    line=dict(color=color_map_specific.get('Qwen2.5 Coder', 'orange'), dash="dash", width=2),
                    layer="above"
                )

            # Grid settings must go through update_xaxes/update_yaxes: the
            # `xaxis_*` / `yaxis_*` layout shortcuts only address the first facet's
            # axes and would leave every other facet row on the template default.
            fig_main_bar_faceted.update_xaxes(showgrid=True)
            fig_main_bar_faceted.update_yaxes(showgrid=False)
            fig_main_bar_faceted.update_layout(
                showlegend=False, # Removed legend
                **font_config
            )

            plot_height = max(400, num_total_facets * (num_y_categories * 25 + 70))
            fig_main_bar_faceted.update_layout(height=plot_height, width=default_plot_width, margin=dict(l=150, r=50, t=50, b=50))


            fig_main_bar_faceted.show()
            print(f"Generated plot: {fig_main_bar_faceted_title} (Faceted Horizontal with Colors & Shaded Area)")
    else:
        print("Skipping 'Absolute Performance Comparison on Main Tasks (Faceted Bar Chart)': No models with data to plot.")
else:
    print("Skipping 'Absolute Performance Comparison on Main Tasks (Faceted Bar Chart)': summary_comp_df is empty.")

# Modified Plot: Leaderboard Subtasks Performance Distribution by Model (Box Plot)
# One box per model over its subtask scores, a black diamond at each model's mean,
# and dashed horizontal lines at the instruct/coder medians (with the band between
# them shaded when both medians are available).
print("\n--- Generating Leaderboard Subtasks Performance Distribution Box Plot ---")
if not subtasks_comp_df.empty and 'subtask_cleaned' in subtasks_comp_df.columns and instruct_model and coder_model:
    models_for_plot_full_names = [
        m for m in comparison_models if m in subtasks_comp_df.columns
    ]

    if models_for_plot_full_names:
        id_vars_melt = [col for col in ['subtask_cleaned', 'group', 'subtask'] if col in subtasks_comp_df.columns]
        value_vars_melt = [m for m in models_for_plot_full_names if m in subtasks_comp_df.columns]

        if not value_vars_melt:
            print("Skipping 'Leaderboard Subtasks Performance Distribution Box Plot': No valid model columns found in subtasks_comp_df for melting.")
        else:
            # Long format: one row per (subtask, model) score.
            leaderboard_subtasks_melted_df = subtasks_comp_df.melt(
                id_vars=id_vars_melt,
                value_vars=value_vars_melt,
                var_name='Model_Full_Name',
                value_name='Score'
            )
            leaderboard_subtasks_melted_df.dropna(subset=['Score'], inplace=True)

            if not leaderboard_subtasks_melted_df.empty:
                leaderboard_subtasks_melted_df['Model'] = leaderboard_subtasks_melted_df['Model_Full_Name'].map(
                    lambda x: clean_plot_name(short_names.get(x, x))
                )
                model_order_for_plot = [
                    clean_plot_name(short_names.get(m, m)) for m in comparison_models if m in value_vars_melt
                ]
                # Ensure order only contains models present in the melted data
                model_order_for_plot = [m for m in model_order_for_plot if m in leaderboard_subtasks_melted_df['Model'].unique()]


                fig_leaderboard_subtasks_box_title = 'Leaderboard Subtasks Performance Distribution by Model'
                fig_leaderboard_subtasks_box = px.box(
                    leaderboard_subtasks_melted_df,
                    x='Model',
                    y='Score',
                    color='Model', # Color by model; per-model legend entries are hidden below
                    points=False,
                    title=fig_leaderboard_subtasks_box_title,
                    labels={'Score': 'Performance Score (%) on Subtasks', 'Model': 'Model'},
                    category_orders={"Model": model_order_for_plot},
                    hover_data=['subtask_cleaned'] if 'subtask_cleaned' in leaderboard_subtasks_melted_df.columns else None
                )

                # Calculate medians for Instruct and Coder models
                instruct_median_subtasks = np.nan
                coder_median_subtasks = np.nan

                if instruct_model in value_vars_melt:
                    instruct_scores = leaderboard_subtasks_melted_df[leaderboard_subtasks_melted_df['Model_Full_Name'] == instruct_model]['Score']
                    if not instruct_scores.empty:
                        instruct_median_subtasks = instruct_scores.median()

                if coder_model in value_vars_melt:
                    coder_scores = leaderboard_subtasks_melted_df[leaderboard_subtasks_melted_df['Model_Full_Name'] == coder_model]['Score']
                    if not coder_scores.empty:
                        coder_median_subtasks = coder_scores.median()

                has_instruct_median = pd.notna(instruct_median_subtasks)
                has_coder_median = pd.notna(coder_median_subtasks)

                # Shade the band between the two medians only when both exist.
                if has_instruct_median and has_coder_median:
                    fig_leaderboard_subtasks_box.add_shape(
                        type="rect", xref="paper", yref="y", x0=0, x1=1,
                        y0=min(instruct_median_subtasks, coder_median_subtasks),
                        y1=max(instruct_median_subtasks, coder_median_subtasks),
                        fillcolor="rgba(128, 128, 128, 0.2)", line_width=0, layer="below"
                    )
                # Each median line is drawn whenever its own median is available.
                if has_instruct_median:
                    fig_leaderboard_subtasks_box.add_hline(
                        y=instruct_median_subtasks, line_dash="dash", line_color="green",
                        annotation_text=f"Instruct Median: {instruct_median_subtasks:.2f}",
                        annotation_position="bottom right", layer="above"
                    )
                if has_coder_median:
                    fig_leaderboard_subtasks_box.add_hline(
                        y=coder_median_subtasks, line_dash="dash", line_color="orange",
                        annotation_text=f"Coder Median: {coder_median_subtasks:.2f}",
                        annotation_position="top right", layer="above"
                    )

                # Calculate and add mean points for each model
                mean_scores_data = leaderboard_subtasks_melted_df.groupby('Model')['Score'].mean().reset_index()

                # Left-merge onto the plotting order so the markers follow the x-axis order.
                model_order_df = pd.DataFrame({'Model': model_order_for_plot})
                mean_scores_to_plot = pd.merge(model_order_df, mean_scores_data, on='Model', how='left')

                # A layout-level showlegend=False would hide the whole legend, including
                # the "Mean Score" entry. Hide the per-model box entries individually
                # instead (the x-axis already names the models) and keep the legend on.
                fig_leaderboard_subtasks_box.update_traces(showlegend=False, selector=dict(type='box'))

                fig_leaderboard_subtasks_box.add_trace(go.Scatter(
                    x=mean_scores_to_plot['Model'],
                    y=mean_scores_to_plot['Score'],
                    mode='markers',
                    marker=dict(symbol='diamond', size=8, color='black'),
                    name='Mean Score',
                    showlegend=True
                ))


                fig_leaderboard_subtasks_box.update_layout(
                    height=600, width=default_plot_width,
                    showlegend=True, # Only the "Mean Score" trace has a legend entry
                    **font_config
                )
                fig_leaderboard_subtasks_box.update_xaxes(tickangle=45)
                fig_leaderboard_subtasks_box.show()
                print(f"Generated plot: {fig_leaderboard_subtasks_box_title}")
            else:
                print("Skipping 'Leaderboard Subtasks Performance Distribution Box Plot': No data to plot after melting and NaN removal.")
    else:
        print("Skipping 'Leaderboard Subtasks Performance Distribution Box Plot': No relevant models found in subtasks_comp_df columns.")
else:
    print("Skipping 'Leaderboard Subtasks Performance Distribution Box Plot': subtasks_comp_df is empty, 'subtask_cleaned' column is missing, or Instruct/Coder models not defined.")


# New Plot: Absolute Performance Comparison for Leaderboard Subtasks (Faceted Horizontal Bar Chart Grid)
# Grid of horizontal bar charts: one facet row per group, one facet column per
# subtask, one bar per model. Cells where both the instruct and the coder model
# have a score get dashed reference lines at those scores and a shaded band
# between them.
if not subtasks_comp_df.empty and 'group' in subtasks_comp_df.columns and 'subtask_cleaned' in subtasks_comp_df.columns:
    print("\n--- Generating Subtask Performance Comparison Chart ---")

    # Keep only models that have a column with at least one non-NaN score.
    subtask_models_to_plot = [
        m for m in comparison_models
        if m in subtasks_comp_df.columns and not subtasks_comp_df[m].isna().all()
    ]

    if not subtask_models_to_plot:
        print("Skipping Subtask Performance Comparison Chart: No models with non-NaN data in subtasks_comp_df.")
    else:
        subtask_plot_data_list = []

        # clean_plot_name() can map distinct models to the same label; de-duplicate so
        # the category array and the category count used for shape heights agree.
        subtask_model_order_y = list(dict.fromkeys(
            clean_plot_name(short_names.get(m, m)) for m in comparison_models if m in subtask_models_to_plot
        ))

        filtered_subtasks_df_for_plot = subtasks_comp_df[['group', 'subtask_cleaned'] + subtask_models_to_plot].copy()
        filtered_subtasks_df_for_plot.dropna(subset=subtask_models_to_plot, how='all', inplace=True)

        sorted_groups_from_data = sorted(filtered_subtasks_df_for_plot['group'].unique())
        all_subtasks_sorted_from_data = sorted(filtered_subtasks_df_for_plot['subtask_cleaned'].unique())

        # Short names that get their own color; every other model is bucketed as 'Merged'.
        reference_short_names = ("Qwen2.5 Base", "Qwen2.5 Instruct", "Qwen2.5 Coder")

        for group_name in sorted_groups_from_data:
            group_data = filtered_subtasks_df_for_plot[filtered_subtasks_df_for_plot['group'] == group_name]
            for subtask_name in sorted(group_data['subtask_cleaned'].unique()):
                subtask_data_for_model = group_data[group_data['subtask_cleaned'] == subtask_name]
                for model_full_name in comparison_models:
                    if model_full_name not in subtask_models_to_plot or model_full_name not in subtask_data_for_model.columns:
                        continue
                    score_series = subtask_data_for_model[model_full_name]
                    if score_series.empty:
                        continue
                    # Only the first row is used if a (group, subtask) pair occurs more than once.
                    score = score_series.iloc[0]
                    if pd.isna(score):
                        continue

                    s_name = short_names.get(model_full_name)
                    current_model_type = s_name if s_name in reference_short_names else 'Merged'

                    subtask_plot_data_list.append({
                        'Group': group_name,
                        'Subtask': subtask_name,
                        'Model Short Name': clean_plot_name(short_names.get(model_full_name, model_full_name)),
                        'Score': score,
                        'Model Type': current_model_type # For color
                    })

        if not subtask_plot_data_list:
            print("Skipping Subtask Performance Comparison Chart: No data to plot after detailed filtering.")
        else:
            subtask_horizontal_bar_df = pd.DataFrame(subtask_plot_data_list)

            model_type_color_map = {
                "Qwen2.5 Base": 'rgb(100, 149, 237)', "Qwen2.5 Instruct": 'rgb(50, 205, 50)',
                "Qwen2.5 Coder": 'rgb(255, 165, 0)', 'Merged': 'rgb(192, 192, 192)'
            }

            # Restrict the facet lists to values that actually produced bars so the
            # row/column indices used for the shapes below match the real grid.
            plotted_groups = set(subtask_horizontal_bar_df['Group'].unique())
            plotted_subtasks = set(subtask_horizontal_bar_df['Subtask'].unique())
            facet_groups = [g for g in sorted_groups_from_data if g in plotted_groups]
            facet_subtasks = [s for s in all_subtasks_sorted_from_data if s in plotted_subtasks]
            num_facet_r = len(facet_groups)
            num_facet_c = len(facet_subtasks)

            fig_subtasks_faceted_title = 'Subtask Performance Comparison by Group and Subtask'
            # NOTE: facet_col_wrap is deliberately not passed. Plotly Express forces it
            # to 0 whenever facet_row is set, so the grid is always
            # (number of groups) x (number of subtasks).
            fig_subtasks_faceted = px.bar(
                subtask_horizontal_bar_df,
                x='Score', y='Model Short Name', color='Model Type',
                color_discrete_map=model_type_color_map, orientation='h',
                title=fig_subtasks_faceted_title,
                labels={'Score': 'Perf. Score (%)', 'Model Short Name': 'Model', 'Model Type': 'Category'},
                text='Score', facet_row='Group', facet_col='Subtask',
                category_orders={
                    "Group": facet_groups,
                    "Subtask": facet_subtasks,
                    "Model Short Name": subtask_model_order_y
                }
            )

            fig_subtasks_faceted.update_traces(texttemplate='%{text:.2f}%', textposition='outside')

            # Shrink facet labels on dense grids. This has to happen while the
            # "Subtask=" / "Group=" prefixes are still present, i.e. before they are
            # stripped; both steps are done in a single pass.
            shrink_col_labels = num_facet_c > 3
            shrink_row_labels = num_facet_r > 3

            def _tidy_facet_label(a):
                """Optionally shrink a facet label, then drop its 'Name=' prefix."""
                if (shrink_col_labels and "Subtask=" in a.text) or (shrink_row_labels and "Group=" in a.text):
                    a.update(font=dict(size=10))
                a.update(text=a.text.split("=")[-1])

            fig_subtasks_faceted.for_each_annotation(_tidy_facet_label)

            fig_subtasks_faceted.update_yaxes(matches=None, showticklabels=True, title=None, categoryorder='array', categoryarray=subtask_model_order_y)
            fig_subtasks_faceted.update_xaxes(matches=None, showticklabels=True, title=None, tickangle=0)


            num_y_cats_sub = len(subtask_model_order_y)

            for r_idx, group_val in enumerate(facet_groups):
                for c_idx, subtask_val in enumerate(facet_subtasks):
                    cell_df = subtask_horizontal_bar_df[
                        (subtask_horizontal_bar_df['Group'] == group_val) &
                        (subtask_horizontal_bar_df['Subtask'] == subtask_val)
                    ]
                    if cell_df.empty:
                        continue

                    instruct_s = np.nan
                    coder_s = np.nan

                    original_data_point = subtasks_comp_df[
                        (subtasks_comp_df['group'] == group_val) &
                        (subtasks_comp_df['subtask_cleaned'] == subtask_val)
                    ]

                    if not original_data_point.empty:
                        if instruct_model in original_data_point.columns:
                            instruct_s = original_data_point[instruct_model].iloc[0]
                        if coder_model in original_data_point.columns:
                            coder_s = original_data_point[coder_model].iloc[0]

                    if pd.isna(instruct_s) or pd.isna(coder_s):
                        continue

                    # Plotly Express numbers facet rows from the bottom, while the first
                    # group is drawn at the top, so the row index must be reversed.
                    plot_row_num_for_shape = num_facet_r - r_idx
                    plot_col_num_for_shape = c_idx + 1

                    # On a categorical axis, -0.5 .. n-0.5 spans all n categories.
                    fig_subtasks_faceted.add_shape(
                        type="rect", x0=min(instruct_s, coder_s), x1=max(instruct_s, coder_s),
                        y0=-0.5, y1=num_y_cats_sub - 0.5,
                        fillcolor="rgba(128, 128, 128, 0.2)", line_width=0, layer="below",
                        row=plot_row_num_for_shape, col=plot_col_num_for_shape
                    )
                    fig_subtasks_faceted.add_shape(
                        type="line", x0=instruct_s, y0=-0.5, x1=instruct_s, y1=num_y_cats_sub - 0.5,
                        line=dict(color=model_type_color_map.get('Qwen2.5 Instruct'), dash="dash", width=1.5), layer="above",
                        row=plot_row_num_for_shape, col=plot_col_num_for_shape
                    )
                    fig_subtasks_faceted.add_shape(
                        type="line", x0=coder_s, y0=-0.5, x1=coder_s, y1=num_y_cats_sub - 0.5,
                        line=dict(color=model_type_color_map.get('Qwen2.5 Coder'), dash="dash", width=1.5), layer="above",
                        row=plot_row_num_for_shape, col=plot_col_num_for_shape
                    )

            fig_subtasks_faceted.update_layout(showlegend=False, **font_config) # Removed legend

            # Size the figure from the real grid dimensions.
            row_height_per_facet = num_y_cats_sub * 20 + 60
            actual_cols_on_screen = num_facet_c
            actual_rows_on_screen = num_facet_r

            plot_h_wrapped = max(400, actual_rows_on_screen * row_height_per_facet + 100)

            col_width_per_facet = 250
            plot_w_wrapped = max(default_plot_width, actual_cols_on_screen * col_width_per_facet + 150)


            fig_subtasks_faceted.update_layout(
                height=plot_h_wrapped,
                width=plot_w_wrapped,
                margin=dict(l=100, t=80, b=50, r=50)
            )


            fig_subtasks_faceted.show()
            print(f"Generated plot: {fig_subtasks_faceted_title} (Grid with Colors & Shaded Area)")
else:
    print("Skipping Subtask Performance Comparison Chart: `subtasks_comp_df` is empty or critical columns missing.")


# Plot: Subtask Difference Boxplot(s)
# Box plots of per-subtask difference columns, split by 'group'. When no usable
# 'd_merged_*' column exists, a single figure shows the 'd_coder' column alone;
# otherwise one figure per 'd_merged_*' column shows it next to 'd_coder'.
plot_dcoder_col = 'd_coder' if 'd_coder' in subtasks_comp_df.columns else None
if (
    not subtasks_comp_df.empty
    and plot_dcoder_col
    and 'group' in subtasks_comp_df.columns
    and not subtasks_comp_df[plot_dcoder_col].isna().all()
):
    print("Generating Subtask Difference Boxplot(s)...")
    merged_diff_cols_sub_present = [
        col for col in diff_cols_subtasks
        if col.startswith('d_merged_')
        and col in subtasks_comp_df.columns
        and not subtasks_comp_df[col].isna().all()
    ]

    if not merged_diff_cols_sub_present:
        # Fallback: only the 'd_coder' difference is available.
        plot_data_box_im_only = subtasks_comp_df.melt(
            id_vars=['group', 'subtask_cleaned'],
            value_vars=[plot_dcoder_col],
            var_name='difference_type',
            value_name='difference_value',
        )
        plot_data_box_im_only.dropna(subset=['difference_value'], inplace=True)
        if not plot_data_box_im_only.empty:
            label_map_im_only = {plot_dcoder_col: f'{instruct_short_label}–{coder_short_label}'}
            plot_data_box_im_only['difference_label'] = plot_data_box_im_only['difference_type'].map(label_map_im_only)
            fig2_im_only_title = f'Subtask Differences: {instruct_short_label}–{coder_short_label}'
            fig2_im_only = px.box(
                plot_data_box_im_only,
                x='group',
                y='difference_value',
                color='difference_label',
                hover_data=['subtask_cleaned'],
                labels={
                    'group': 'Group',
                    'difference_value': 'Difference (%)',
                    'difference_label': 'Difference Type',
                    'subtask_cleaned': 'Subtask',
                },
                title=fig2_im_only_title,
                category_orders={"group": sorted(plot_data_box_im_only['group'].unique())},
            )
            fig2_im_only.update_xaxes(tickangle=45)
            fig2_im_only.update_layout(width=default_plot_width, **font_config)
            fig2_im_only.show()
            print(f"  - Generated plot: {fig2_im_only_title}")

    for merged_diff_col in merged_diff_cols_sub_present:
        # Column names look like 'd_merged_<short name>'; recover the display name.
        merged_short_name = merged_diff_col.replace('d_merged_', '')
        cleaned_merged_short_name = clean_plot_name(merged_short_name)
        cols_for_this_plot = [plot_dcoder_col, merged_diff_col]

        plot_data_box_single = subtasks_comp_df.melt(
            id_vars=['group', 'subtask_cleaned'],
            value_vars=cols_for_this_plot,
            var_name='difference_type',
            value_name='difference_value',
        )
        plot_data_box_single.dropna(subset=['difference_value'], inplace=True)
        if plot_data_box_single.empty or 'group' not in plot_data_box_single.columns:
            continue

        label_map = {
            plot_dcoder_col: f'{instruct_short_label}–{coder_short_label}',
            merged_diff_col: f'{instruct_short_label}–{cleaned_merged_short_name}',
        }
        plot_data_box_single['difference_label'] = plot_data_box_single['difference_type'].map(label_map)
        fig2_single_title = f'Subtask Diffs: {instruct_short_label}–{coder_short_label} vs {instruct_short_label}–{cleaned_merged_short_name}'
        fig2_single = px.box(
            plot_data_box_single,
            x='group',
            y='difference_value',
            color='difference_label',
            hover_data=['subtask_cleaned'],
            labels={
                'group': 'Group',
                'difference_value': 'Difference (%)',
                'difference_label': 'Difference Type',
                'subtask_cleaned': 'Subtask',
            },
            title=fig2_single_title,
            category_orders={"group": sorted(plot_data_box_single['group'].unique())},
        )
        fig2_single.update_xaxes(tickangle=45)
        fig2_single.update_layout(boxmode='group', width=default_plot_width, **font_config)
        fig2_single.show()
        print(f"  - Generated plot: {fig2_single_title}")
else:
    print("Skipping Subtask Difference Boxplot(s): subtasks_comp_df empty, 'd_coder' or 'group' column missing, or 'd_coder' has no data.")

# Plot: Absolute Score Boxplot(s) for Subtasks
# Box plots of absolute subtask scores per 'group'. The base/instruct/coder models
# that have a column in subtasks_df are always included; each merged model present
# in subtasks_df gets its own figure alongside them. With no merged models, a
# single figure shows the reference models only.
base_comparison_models_plot3 = list(dict.fromkeys(
    m for m in (base_model, instruct_model, coder_model)
    if m and m in subtasks_df.columns
))

if not subtasks_df.empty and 'group' in subtasks_df.columns and base_comparison_models_plot3:
    print("Generating Absolute Score Boxplot(s) for Subtasks...")
    present_merged_models_in_subtasks = [m for m in merged_models if m in subtasks_df.columns]

    if not present_merged_models_in_subtasks:
        print("  - Info: No merged models found in subtask data to pair with base/instruct/coder for boxplots.")

    if not present_merged_models_in_subtasks and base_comparison_models_plot3:
        plot_data_base_only = subtasks_df.melt(
            id_vars=['group', 'subtask_cleaned'],
            value_vars=base_comparison_models_plot3,
            var_name='model_full_name',
            value_name='score',
        )
        plot_data_base_only.dropna(subset=['score'], inplace=True)
        if not plot_data_base_only.empty and 'group' in plot_data_base_only.columns:
            plot_data_base_only['model_short_name'] = plot_data_base_only['model_full_name'].map(
                lambda x: clean_plot_name(short_names.get(x, x))
            )
            fig_abs_base_title = 'Absolute Subtask Perf: Base/Instruct/Coder Models'
            fig_abs_base = px.box(
                plot_data_base_only,
                x='group',
                y='score',
                color='model_short_name',
                hover_data=['subtask_cleaned'],
                labels={
                    'group': 'Group',
                    'score': 'Absolute Score (%)',
                    'model_short_name': 'Model',
                    'subtask_cleaned': 'Subtask',
                },
                title=fig_abs_base_title,
                category_orders={"group": sorted(plot_data_base_only['group'].unique())},
            )
            fig_abs_base.update_xaxes(tickangle=45)
            fig_abs_base.update_layout(boxmode='group', width=default_plot_width, **font_config)
            fig_abs_base.show()
            print(f"  - Generated plot: {fig_abs_base_title}")

    for merged_m in present_merged_models_in_subtasks:
        merged_short_name = short_names.get(merged_m, merged_m)
        cleaned_merged_short_name = clean_plot_name(merged_short_name)

        # Reference models plus this merged model, order-preserving and de-duplicated.
        models_for_this_plot = list(dict.fromkeys(
            m for m in base_comparison_models_plot3 + [merged_m] if m in subtasks_df.columns
        ))

        if not models_for_this_plot:
            print(f"  - Skipping boxplot for {cleaned_merged_short_name}: No valid models to plot from {models_for_this_plot}")
            continue

        plot_data_abs_single = subtasks_df.melt(
            id_vars=['group', 'subtask_cleaned'],
            value_vars=models_for_this_plot,
            var_name='model_full_name',
            value_name='score',
        )
        plot_data_abs_single.dropna(subset=['score'], inplace=True)

        if not plot_data_abs_single.empty and 'group' in plot_data_abs_single.columns:
            plot_data_abs_single['model_short_name'] = plot_data_abs_single['model_full_name'].map(
                lambda x: clean_plot_name(short_names.get(x, x))
            )
            fig_abs_single_title = f'Absolute Subtask Perf: Incl. {cleaned_merged_short_name}'
            fig_abs_single = px.box(
                plot_data_abs_single,
                x='group',
                y='score',
                color='model_short_name',
                hover_data=['subtask_cleaned'],
                labels={
                    'group': 'Group',
                    'score': 'Absolute Score (%)',
                    'model_short_name': 'Model',
                    'subtask_cleaned': 'Subtask',
                },
                title=fig_abs_single_title,
                category_orders={"group": sorted(plot_data_abs_single['group'].unique())},
            )
            fig_abs_single.update_xaxes(tickangle=45)
            fig_abs_single.update_layout(boxmode='group', width=default_plot_width, **font_config)
            fig_abs_single.show()
            print(f"  - Generated plot: {fig_abs_single_title}")
else:
    print("Skipping Absolute Score Boxplot(s) for Subtasks: subtasks_df empty, 'group' column missing, or no base/instruct/coder models with data.")


# Plot: Jointplot(s) for Subtask Differences
# For each usable 'd_merged_*' column, scatter it against 'd_coder' (one point per
# subtask row with both values present), with marginal histograms and an OLS
# trendline.
plot_dcoder_col = 'd_coder' if 'd_coder' in subtasks_comp_df.columns else None
plot_merged_diff_cols_sub = [
    c for c in diff_cols_subtasks
    if c.startswith('d_merged_')
    and c in subtasks_comp_df.columns
    and not subtasks_comp_df[c].isna().all()
]
if (
    not subtasks_comp_df.empty
    and plot_dcoder_col
    and not subtasks_comp_df[plot_dcoder_col].isna().all()
    and plot_merged_diff_cols_sub
):
    print("Generating Jointplot(s) for Subtask Differences (I-M vs I-MergedX)...")
    for merged_diff_col in plot_merged_diff_cols_sub:
        # Column names look like 'd_merged_<short name>'; recover the display name.
        merged_short_name = merged_diff_col.replace('d_merged_', '')
        cleaned_merged_short_name = clean_plot_name(merged_short_name)
        required_cols_joint = [plot_dcoder_col, merged_diff_col, 'subtask_cleaned', 'group']
        if not all(c in subtasks_comp_df.columns for c in required_cols_joint):
            continue

        # Keep only rows where both difference values are present.
        plot_data_joint = subtasks_comp_df[required_cols_joint].dropna(subset=[plot_dcoder_col, merged_diff_col])
        if plot_data_joint.empty:
            continue

        plot_data_joint['hover_name_joint'] = plot_data_joint['subtask_cleaned'] + " (Group: " + plot_data_joint['group'].astype(str) + ")"
        fig_joint_title = f'Joint Dist: (I-M) vs (I-{cleaned_merged_short_name}) Diffs'
        fig_joint = px.scatter(
            plot_data_joint,
            x=plot_dcoder_col,
            y=merged_diff_col,
            marginal_x="histogram",
            marginal_y="histogram",
            trendline="ols",
            hover_name='hover_name_joint',
            hover_data={
                plot_dcoder_col: ':.2f',
                merged_diff_col: ':.2f',
                'group': True,
                'subtask_cleaned': True,
                'hover_name_joint': False,
            },
            labels={
                plot_dcoder_col: f'{instruct_short_label}–{coder_short_label} Diff (%)',
                merged_diff_col: f'{instruct_short_label}–{cleaned_merged_short_name} Diff (%)',
                'subtask_cleaned': 'Subtask',
                'group': 'Group',
            },
            title=fig_joint_title,
        )
        fig_joint.update_layout(width=default_plot_width, **font_config)
        fig_joint.show()
        print(f"  - Generated plot: {fig_joint_title}")
else:
    print("Skipping Jointplot(s): subtasks_comp_df empty, 'd_coder' missing/all_NaN, or no valid merged diff columns.")

# --- Scatter plots (colored by group): d_coder vs. each merged-model diff, per subtask ---
# Relies on plot_dcoder_col / plot_merged_diff_cols_sub computed earlier in the script.
if (
    not subtasks_comp_df.empty
    and plot_dcoder_col
    and not subtasks_comp_df[plot_dcoder_col].isna().all()
    and plot_merged_diff_cols_sub
):
    print("Generating Scatter Plot(s) for Subtask Differences (I-M vs I-MergedX)...")
    for i, merged_diff_col in enumerate(plot_merged_diff_cols_sub):
        merged_short_name = merged_diff_col.replace('d_merged_', '')
        cleaned_merged_short_name = clean_plot_name(merged_short_name)
        required_cols_scatter = [plot_dcoder_col, merged_diff_col, 'subtask_cleaned', 'group']
        # Skip this merged model if any column needed for the figure is missing.
        if not all(c in subtasks_comp_df.columns for c in required_cols_scatter):
            continue

        # Keep only subtasks that have both difference values.
        plot_data_scatter = subtasks_comp_df[required_cols_scatter].dropna(
            subset=[plot_dcoder_col, merged_diff_col]
        )
        if plot_data_scatter.empty or 'group' not in plot_data_scatter.columns:
            continue

        fig_scatter_comp_title = f'Subtask Diffs Comp: (I-M) vs (I-{cleaned_merged_short_name})'
        fig_scatter_comp = px.scatter(
            plot_data_scatter,
            x=plot_dcoder_col,
            y=merged_diff_col,
            color='group',
            trendline="ols",
            hover_name='subtask_cleaned',
            hover_data={'group': True, plot_dcoder_col: ':.2f', merged_diff_col: ':.2f'},
            labels={
                plot_dcoder_col: f'{instruct_short_label}–{coder_short_label} Diff (%)',
                merged_diff_col: f'{instruct_short_label}–{cleaned_merged_short_name} Diff (%)',
                'subtask_cleaned': 'Subtask',
                'group': 'Group',
            },
            title=fig_scatter_comp_title,
        )
        fig_scatter_comp.update_layout(width=default_plot_width, **font_config)
        fig_scatter_comp.show()
        print(f"  - Generated plot: {fig_scatter_comp_title}")
else:
    print(f"Skipping Scatter Plot(s): subtasks_comp_df empty, 'd_coder' missing/all_NaN, or no valid merged diff columns.")

# --- Horizontal bar charts: the N largest and N smallest "impact" subtasks per merged model ---
# impact = d_coder - d_merged_X, stored as a new 'impact_<name>' column on subtasks_comp_df.
if (
    not subtasks_comp_df.empty
    and plot_dcoder_col
    and not subtasks_comp_df[plot_dcoder_col].isna().all()
    and plot_merged_diff_cols_sub
):
    print("Generating Top/Bottom Subtask Impact Plot(s)...")
    N = 5  # number of subtasks taken from each end of the impact ranking
    for i, merged_diff_col in enumerate(plot_merged_diff_cols_sub):
        merged_short_name = merged_diff_col.replace('d_merged_', '')
        cleaned_merged_short_name = clean_plot_name(merged_short_name)
        impact_col = f'impact_{cleaned_merged_short_name}'
        # NOTE: this writes the impact column into the shared subtasks_comp_df.
        subtasks_comp_df[impact_col] = subtasks_comp_df[plot_dcoder_col] - subtasks_comp_df[merged_diff_col]
        subtasks_sorted = subtasks_comp_df.dropna(subset=[impact_col]).sort_values(impact_col)

        # Need at least 2*N rows with a valid impact value to plot both ends.
        if len(subtasks_sorted) < N * 2:
            continue

        top_n = subtasks_sorted.nlargest(N, impact_col)
        bottom_n = subtasks_sorted.nsmallest(N, impact_col)
        plot_data_bar = pd.concat([top_n, bottom_n]).drop_duplicates(subset=['subtask_cleaned'])
        if plot_data_bar.empty:
            continue

        # Hover/label configuration; the y-axis already shows the subtask, so hide it in hover.
        hover_cols = {'subtask_cleaned': False, 'group': True}
        labels_dict = {'subtask_cleaned': 'Subtask', 'group': 'Group'}
        if impact_col in plot_data_bar.columns:
            hover_cols[impact_col] = ':.2f'
            labels_dict[impact_col] = f'Impact: ({coder_short_label} - {cleaned_merged_short_name}) Rel. to {instruct_short_label} (%)'
        if plot_dcoder_col in plot_data_bar.columns:
            hover_cols[plot_dcoder_col] = ':.2f'
            labels_dict[plot_dcoder_col] = 'I-M Diff (%)'
        if merged_diff_col in plot_data_bar.columns:
            hover_cols[merged_diff_col] = ':.2f'
            labels_dict[merged_diff_col] = f'I-{cleaned_merged_short_name} Diff (%)'

        fig_bar_title = f'Top/Bottom {N} Subtasks: Rel. Impact of {cleaned_merged_short_name} vs {coder_short_label}'
        fig_bar = px.bar(
            plot_data_bar,
            x=impact_col,
            y='subtask_cleaned',
            orientation='h',
            color=impact_col,
            color_continuous_scale=px.colors.diverging.RdBu,
            color_continuous_midpoint=0,
            hover_data=hover_cols,
            labels=labels_dict,
            title=fig_bar_title,
        )
        fig_bar.update_layout(
            yaxis={'categoryorder': 'total ascending'},
            width=default_plot_width,
            **font_config,
        )
        fig_bar.show()
        print(f"  - Generated plot: {fig_bar_title}")
else:
    print("Skipping Top/Bottom Subtask Impact Plot(s): subtasks_comp_df empty, 'd_coder' missing/all_NaN, or no valid merged diff columns.")

# --- Clustered heatmap + row dendrogram of the scaled (d_coder, d_merged_X) pair per subtask ---
if (
    not subtasks_comp_df.empty
    and plot_dcoder_col
    and not subtasks_comp_df[plot_dcoder_col].isna().all()
    and plot_merged_diff_cols_sub
):
    print("Generating Clustermap/Dendrogram(s) for Subtasks (I-M vs I-MergedX)...")
    for merged_diff_col in plot_merged_diff_cols_sub:
        merged_short_name = merged_diff_col.replace('d_merged_', '')
        cleaned_merged_short_name = clean_plot_name(merged_short_name)
        cluster_cols_single = [plot_dcoder_col, merged_diff_col]
        if 'subtask_cleaned' not in subtasks_comp_df.columns:
            continue

        # Rows = subtasks, columns = the two difference series; drop rows missing either value.
        diff_matrix_single = subtasks_comp_df.set_index('subtask_cleaned')[cluster_cols_single].dropna(how='any')
        if diff_matrix_single.empty or len(diff_matrix_single) <= 1:
            continue

        # Standardize each column before clustering.
        scaler = StandardScaler()
        scaled_data_single = scaler.fit_transform(diff_matrix_single.values)
        try:
            # Average-linkage clustering of the rows; leaves_list gives the heatmap row order.
            row_linkage_single = linkage(pdist(scaled_data_single), method='average', metric='euclidean')
            ordered_row_indices_single = leaves_list(row_linkage_single)
            heatmap_data_ordered_single = scaled_data_single[ordered_row_indices_single]
            ordered_row_labels_single = diff_matrix_single.index[ordered_row_indices_single].tolist()
            heatmap_col_labels_single = ['I-M', f'I-{cleaned_merged_short_name}']

            fig_heatmap_single_title = f'Clustered Heatmap: Profile for {cleaned_merged_short_name} (Scaled)'
            fig_heatmap_single = px.imshow(
                heatmap_data_ordered_single,
                labels=dict(x="Difference Type (vs Instruct)", y="Subtask", color="Scaled Value"),
                x=heatmap_col_labels_single,
                y=ordered_row_labels_single,
                aspect="auto",
                color_continuous_scale='RdBu_r',
                title=fig_heatmap_single_title,
            )
            fig_heatmap_single.update_xaxes(side="top")
            # Height grows with the number of rows (20 px per row, minimum 600).
            fig_heatmap_single.update_layout(
                height=max(600, 20 * len(ordered_row_labels_single)),
                width=default_plot_width,
                **font_config,
            )
            fig_heatmap_single.show()
            print(f"  - Generated plot: {fig_heatmap_single_title}")

            if len(scaled_data_single) > 1:
                fig_dendro_row_single_title = f'Row Dendrogram: Profile for {cleaned_merged_short_name} (Scaled)'
                fig_dendro_row_single = ff.create_dendrogram(
                    scaled_data_single,
                    orientation='right',
                    labels=diff_matrix_single.index.tolist(),
                    linkagefun=lambda dist: linkage(dist, method='average', metric='euclidean'),
                )
                fig_dendro_row_single.update_layout(
                    title=fig_dendro_row_single_title,
                    height=max(600, 20 * len(diff_matrix_single)),
                    width=default_plot_width,
                    **font_config,
                )
                fig_dendro_row_single.show()
                print(f"  - Generated plot: {fig_dendro_row_single_title}")
        except Exception as e:
            # Best effort: report the failure for this merged model and move on to the next.
            print(f"Error during clustering or plotting for {cleaned_merged_short_name}: {e}")
else:
    print("Skipping Clustermap/Dendrogram(s) for Subtasks: subtasks_comp_df empty, 'd_coder' missing/all_NaN, or no valid merged diff columns.")


# --- Main-task dendrograms: cluster rows of summary_comp_df on (d_coder, d_merged_X) ---
# 'd_coder' is usable only if the column exists and is not entirely NaN.
if 'd_coder' in summary_comp_df.columns and not summary_comp_df['d_coder'].isna().all():
    plot_dcoder_col_main = 'd_coder'
else:
    plot_dcoder_col_main = None
# Merged-model diff columns present in summary_comp_df with at least one non-NaN value.
plot_merged_diff_cols_main = [
    c for c in diff_cols_main
    if c.startswith('d_merged_') and c in summary_comp_df.columns and not summary_comp_df[c].isna().all()
]

if plot_dcoder_col_main and plot_merged_diff_cols_main:
    print("Generating Main Task Dendrogram(s) (I-M vs I-MergedX)...")
    for merged_diff_col_main in plot_merged_diff_cols_main:
        merged_short_name = merged_diff_col_main.replace('d_merged_', '')
        cleaned_merged_short_name = clean_plot_name(merged_short_name)
        cols_main_single = [plot_dcoder_col_main, merged_diff_col_main]

        main_matrix_data_single = summary_comp_df[cols_main_single].dropna(how='any')
        # Need at least two rows to build a dendrogram.
        if len(main_matrix_data_single) < 2:
            continue
        try:
            fig_dendro_main_s_title = f'Dendrogram: Main Tasks based on Profile for {cleaned_merged_short_name}'
            fig_dendro_main_s = ff.create_dendrogram(
                main_matrix_data_single.values,
                labels=main_matrix_data_single.index.tolist(),
                linkagefun=lambda dist: linkage(dist, method='ward'),
            )
            # Widen the figure when there are many labels (30 px per label).
            dynamic_width_dendro_main = max(default_plot_width, 30 * len(main_matrix_data_single.index))
            fig_dendro_main_s.update_layout(
                title=fig_dendro_main_s_title,
                yaxis_title='Distance',
                xaxis_title='Task',
                width=dynamic_width_dendro_main,
                height=default_plot_height,
                **font_config,
            )
            fig_dendro_main_s.show()
            print(f"  - Generated plot: {fig_dendro_main_s_title}")
        except Exception as e:
            # Best effort: report the failure for this merged model and continue.
            print(f"Could not generate Main Tasks Dendrogram for {cleaned_merged_short_name}: {e}")
else:
    print("Skipping Main Task Dendrogram(s): 'd_coder' on main tasks missing/all_NaN or no valid merged diff columns for main tasks.")

# --- Subtask dendrograms: cluster subtasks on (d_coder, d_merged_X) ---
# 'd_coder' is usable only if the column exists and is not entirely NaN.
if 'd_coder' in subtasks_comp_df.columns and not subtasks_comp_df['d_coder'].isna().all():
    plot_dcoder_col_sub = 'd_coder'
else:
    plot_dcoder_col_sub = None
# Merged-model diff columns present in subtasks_comp_df with at least one non-NaN value.
plot_merged_diff_cols_sub = [
    c for c in diff_cols_subtasks
    if c.startswith('d_merged_') and c in subtasks_comp_df.columns and not subtasks_comp_df[c].isna().all()
]

if not subtasks_comp_df.empty and plot_dcoder_col_sub and plot_merged_diff_cols_sub:
    print("Generating Subtask Dendrogram(s) (I-M vs I-MergedX)...")
    for merged_diff_col_sub_dendro in plot_merged_diff_cols_sub:
        merged_short_name = merged_diff_col_sub_dendro.replace('d_merged_', '')
        cleaned_merged_short_name = clean_plot_name(merged_short_name)
        cols_sub_single = [plot_dcoder_col_sub, merged_diff_col_sub_dendro]
        if 'subtask_cleaned' not in subtasks_comp_df.columns:
            continue

        # Index by subtask name and drop rows missing either difference value.
        sub_matrix_df_dendro_s = subtasks_comp_df.set_index('subtask_cleaned')[cols_sub_single].dropna(how='any')
        # Need at least two rows to build a dendrogram.
        if len(sub_matrix_df_dendro_s) < 2:
            continue

        sub_values_s = sub_matrix_df_dendro_s.values
        sub_labels_s = sub_matrix_df_dendro_s.index.tolist()
        try:
            fig_dendro_sub_s_title = f'Dendrogram: Subtasks based on Profile for {cleaned_merged_short_name}'
            fig_dendro_sub_s = ff.create_dendrogram(
                sub_values_s,
                labels=sub_labels_s,
                linkagefun=lambda dist: linkage(dist, method='ward'),
            )
            # Width and height both scale with the number of subtask labels.
            dynamic_width_dendro_sub = max(default_plot_width, 30 * len(sub_labels_s))
            fig_dendro_sub_s.update_layout(
                title=fig_dendro_sub_s_title,
                yaxis_title='Distance',
                xaxis_title='Subtask',
                height=max(600, 15 * len(sub_labels_s)),
                width=dynamic_width_dendro_sub,
                xaxis=dict(tickangle=-90),
                **font_config,
            )
            fig_dendro_sub_s.show()
            print(f"  - Generated plot: {fig_dendro_sub_s_title}")
        except Exception as e:
            # Best effort: report the failure for this merged model and continue.
            print(f"Could not generate Subtasks Dendrogram for {cleaned_merged_short_name}: {e}")
else:
    print("Skipping Subtask Dendrogram(s): subtasks_comp_df empty, 'd_coder' for subtasks missing/all_NaN, or no valid merged diff columns for subtasks.")
print("\n--- Script Finished ---")


--- Model Categorization (Harness) ---
Base Model: Qwen2.5-7B (Qwen2.5 Base)
Instruct Model: Qwen2.5-7B-Instruct (Qwen2.5 Instruct)
Coder Model: Qwen2.5-Coder-7B (Qwen2.5 Coder)
Merged Models (5):
  - Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-task_arithmetic-29 (Task Arithmetic)
  - Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-dare_ties-29 (DARE Ties)
  - Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-ties-29 (Ties)
  - Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-slerp-29 (Slerp)
  - Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-linear-29 (Linear )
-------------------------
Models for comparison (in order): ['Qwen2.5 Base', 'Qwen2.5 Instruct', 'Qwen2.5 Coder', 'Task Arithmetic', 'DARE Ties', 'Ties', 'Slerp', 'Linear ']
-------------------------

--- Summary DataFrame (Comparison Models) ---
             Qwen2.5-7B  Qwen2.5-7B-Instruct  Qwen2.5-Coder-7B  \
gsm8k         79.150872            70.583776         77.862017   
mmlu          69.733656            69.904572         65.175901   
lead

Generated plot: Merged Model Performance vs. Instruct & Coder (MainTasks Overall)

--- Correlation Matrix of Performance Categories Between Merged Models (MainTasks Overall) ---
Merged Model     Task Arithmetic  DARE Ties  Ties  Slerp  Linear 
Merged Model                                                     
Task Arithmetic              1.0        1.0   1.0    0.0      1.0
DARE Ties                    1.0        1.0   1.0    0.0      1.0
Ties                         1.0        1.0   1.0    0.0      1.0
Slerp                        0.0        0.0   0.0    1.0      0.0
Linear                       1.0        1.0   1.0    0.0      1.0


Generated plot: Correlation of Perf. Categories Between Merged Models (MainTasks)

--- Merged Model Performance Categorization for: LeaderboardSubtasks ---

--- Task Scenario Ranking Table (LeaderboardSubtasks) ---
                      Task  Better_Count Better_Models  Worse_Count  \
0                      bbh             0                          4   
1  bbh_boolean_expressions             0                          4   
2     bbh_causal_judgement             0                          5   
3   bbh_date_understanding             0                          4   
4    bbh_disambiguation_qa             1         Slerp            4   

                                       Worse_Models  Between_Equal_Count  \
0         DARE Ties, Linear , Task Arithmetic, Ties                    1   
1         DARE Ties, Linear , Task Arithmetic, Ties                    1   
2  DARE Ties, Linear , Slerp, Task Arithmetic, Ties                    0   
3         DARE Ties, Linear , Task Arithmetic, Ties   

Generated plot: Merged Model Performance vs. Instruct & Coder (LeaderboardSubtasks Overall)

--- Correlation Matrix of Performance Categories Between Merged Models (LeaderboardSubtasks Overall) ---
Merged Model     Task Arithmetic  DARE Ties      Ties     Slerp   Linear 
Merged Model                                                             
Task Arithmetic         1.000000   0.999208  0.999151  0.227901  0.999951
DARE Ties               0.999208   1.000000  0.996720  0.188982  0.998766
Ties                    0.999151   0.996720  1.000000  0.267828  0.999510
Slerp                   0.227901   0.188982  0.267828  1.000000  0.237527
Linear                  0.999951   0.998766  0.999510  0.237527  1.000000


Generated plot: Correlation of Perf. Categories Between Merged Models (LeaderboardSubtasks)

--- Generating Original Plots (Harness Data) ---


Generated plot: Difference Trends on Main Tasks (vs Instruct) - All Merged


Generated plot: Absolute Performance on Main Tasks (Line Chart)


Generated plot: Absolute Performance Comparison by Task (Faceted Horizontal with Colors & Shaded Area)

--- Generating Leaderboard Subtasks Performance Distribution Box Plot ---


Generated plot: Leaderboard Subtasks Performance Distribution by Model

--- Generating Subtask Performance Comparison Chart ---


Generated plot: Subtask Performance Comparison by Group and Subtask (Grid with Colors & Shaded Area)
Generating Subtask Difference Boxplot(s)...


  - Generated plot: Subtask Diffs: Qwen2.5 Instruct–Qwen2.5 Coder vs Qwen2.5 Instruct–Task Arithmetic


  - Generated plot: Subtask Diffs: Qwen2.5 Instruct–Qwen2.5 Coder vs Qwen2.5 Instruct–DARE Ties


  - Generated plot: Subtask Diffs: Qwen2.5 Instruct–Qwen2.5 Coder vs Qwen2.5 Instruct–Ties


  - Generated plot: Subtask Diffs: Qwen2.5 Instruct–Qwen2.5 Coder vs Qwen2.5 Instruct–Slerp


  - Generated plot: Subtask Diffs: Qwen2.5 Instruct–Qwen2.5 Coder vs Qwen2.5 Instruct–Linear 
Generating Absolute Score Boxplot(s) for Subtasks...


  - Generated plot: Absolute Subtask Perf: Incl. Task Arithmetic


  - Generated plot: Absolute Subtask Perf: Incl. DARE Ties


  - Generated plot: Absolute Subtask Perf: Incl. Ties


  - Generated plot: Absolute Subtask Perf: Incl. Slerp


  - Generated plot: Absolute Subtask Perf: Incl. Linear 
Generating Jointplot(s) for Subtask Differences (I-M vs I-MergedX)...


  - Generated plot: Joint Dist: (I-M) vs (I-Task Arithmetic) Diffs


  - Generated plot: Joint Dist: (I-M) vs (I-DARE Ties) Diffs


  - Generated plot: Joint Dist: (I-M) vs (I-Ties) Diffs


  - Generated plot: Joint Dist: (I-M) vs (I-Slerp) Diffs


  - Generated plot: Joint Dist: (I-M) vs (I-Linear ) Diffs
Generating Scatter Plot(s) for Subtask Differences (I-M vs I-MergedX)...


  - Generated plot: Subtask Diffs Comp: (I-M) vs (I-Task Arithmetic)


  - Generated plot: Subtask Diffs Comp: (I-M) vs (I-DARE Ties)


  - Generated plot: Subtask Diffs Comp: (I-M) vs (I-Ties)


  - Generated plot: Subtask Diffs Comp: (I-M) vs (I-Slerp)


  - Generated plot: Subtask Diffs Comp: (I-M) vs (I-Linear )
Generating Top/Bottom Subtask Impact Plot(s)...


  - Generated plot: Top/Bottom 5 Subtasks: Rel. Impact of Task Arithmetic vs Qwen2.5 Coder


  - Generated plot: Top/Bottom 5 Subtasks: Rel. Impact of DARE Ties vs Qwen2.5 Coder


  - Generated plot: Top/Bottom 5 Subtasks: Rel. Impact of Ties vs Qwen2.5 Coder


  - Generated plot: Top/Bottom 5 Subtasks: Rel. Impact of Slerp vs Qwen2.5 Coder


  - Generated plot: Top/Bottom 5 Subtasks: Rel. Impact of Linear  vs Qwen2.5 Coder
Generating Clustermap/Dendrogram(s) for Subtasks (I-M vs I-MergedX)...


  - Generated plot: Clustered Heatmap: Profile for Task Arithmetic (Scaled)


  - Generated plot: Row Dendrogram: Profile for Task Arithmetic (Scaled)


  - Generated plot: Clustered Heatmap: Profile for DARE Ties (Scaled)


  - Generated plot: Row Dendrogram: Profile for DARE Ties (Scaled)


  - Generated plot: Clustered Heatmap: Profile for Ties (Scaled)


  - Generated plot: Row Dendrogram: Profile for Ties (Scaled)


  - Generated plot: Clustered Heatmap: Profile for Slerp (Scaled)


  - Generated plot: Row Dendrogram: Profile for Slerp (Scaled)


  - Generated plot: Clustered Heatmap: Profile for Linear  (Scaled)


  - Generated plot: Row Dendrogram: Profile for Linear  (Scaled)
Generating Main Task Dendrogram(s) (I-M vs I-MergedX)...


  - Generated plot: Dendrogram: Main Tasks based on Profile for Task Arithmetic


  - Generated plot: Dendrogram: Main Tasks based on Profile for DARE Ties


  - Generated plot: Dendrogram: Main Tasks based on Profile for Ties


  - Generated plot: Dendrogram: Main Tasks based on Profile for Slerp


  - Generated plot: Dendrogram: Main Tasks based on Profile for Linear 
Generating Subtask Dendrogram(s) (I-M vs I-MergedX)...


  - Generated plot: Dendrogram: Subtasks based on Profile for Task Arithmetic


  - Generated plot: Dendrogram: Subtasks based on Profile for DARE Ties


  - Generated plot: Dendrogram: Subtasks based on Profile for Ties


  - Generated plot: Dendrogram: Subtasks based on Profile for Slerp


  - Generated plot: Dendrogram: Subtasks based on Profile for Linear 

--- Script Finished ---
