# In [1]:
# -*- coding: utf-8 -*-

import os
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
from scipy.cluster.hierarchy import linkage, leaves_list
from scipy.spatial.distance import pdist
from sklearn.preprocessing import StandardScaler
from collections import defaultdict
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.io as pio
import re # Added for cleaning plot names
import json # Added to handle potential JSON loading if paths were used

# Set default plotly template for better aesthetics if needed
# (assigned once here so the figures created below do not set a template themselves)
pio.templates.default = "plotly_white"

# Create directory for CSV exports
# exist_ok=True keeps re-runs from failing when the directory is already present.
csv_export_dir = "plot_data_csv_exports"
os.makedirs(csv_export_dir, exist_ok=True)

# --- Helper function for cleaning model names for plots ---
def clean_plot_name(name):
    """
    Return the label used for a model in plots and tables.

    ``None`` becomes "Unknown". Labels starting with "Qwen2.5" are returned
    unchanged; every other label loses one trailing "_<digits>" suffix
    (e.g. "Linear_29" -> "Linear").
    """
    if name is None:
        return "Unknown"

    label = str(name)
    if label.startswith("Qwen2.5"):
        # Reference models keep their full label.
        return label

    # Merged models carry a numeric suffix that is noise on an axis.
    return re.sub(r'_\d+$', '', label)

# --- Font configuration for plots ---
font_config = dict(
    title_font_size=30,
    font_size=20,
    xaxis_title_font_size=20,
    yaxis_title_font_size=20,
    xaxis_tickfont_size=20,     # sized with potentially dense plots in mind
    yaxis_tickfont_size=20,     # sized with potentially dense plots in mind
    legend_title_font_size=30,  # for the plots that show a legend
    legend_font_size=25,        # for the plots that show a legend
)

# --- Default Plot Dimensions ---
# Height and width in pixels; the width is deliberately large for wide plots.
default_plot_height, default_plot_width = 520, 2300

# --- 0. Helper Function from process_results.py (adapted) ---
def process_frame(frame):
    """
    Normalize the metadata columns of ``frame`` in place and return it.

    - Drops the "Unnamed: 0" column when present.
    - Ensures the grouping column is called "linguistic competencies": a
      "linguistic subfield" column is renamed to it, or simply dropped when
      "linguistic competencies" already exists.
    """
    if "Unnamed: 0" in frame.columns:
        frame.pop("Unnamed: 0")

    if "linguistic subfield" in frame.columns:
        # Take the legacy column out first; it is only re-added under the
        # new name when no such column exists yet.
        legacy_values = frame.pop("linguistic subfield")
        if "linguistic competencies" not in frame.columns:
            frame["linguistic competencies"] = legacy_values

    return frame

# --- 1. Configuration ---
# Switched to Coder models
# Full model identifier -> short display name. The insertion order of this
# mapping is also the order of the `models` list derived from it below.
short_names = {
    "Qwen__Qwen2.5-7B": "Qwen2.5 Base",
    "Qwen__Qwen2.5-7B-Instruct": "Qwen2.5 Instruct",
    "Qwen__Qwen2.5-Coder-7B": "Qwen2.5 Coder",
    "Yuuta208__Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-linear-29": "Linear_29",
    "Yuuta208__Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-task_arithmetic-29": "Task_Arith_29",
    "Yuuta208__Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-dare_ties-29": "DARE_Ties_29",
    "Yuuta208__Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-ties-29": "Ties_29",
    "Yuuta208__Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-slerp-29": "Slerp_29",
}

# Every model under analysis: base, instruct and coder first, then the merges.
models = list(short_names)

# Fallback order of the main task categories (linguistic competencies).
# It is replaced by summary_comp_df.index once the data has been loaded.
tasks_initial_fallback = ["Syntax", "Semantics", "Morphology", "Discourse", "Pragmatics", "Reasoning"]

# Per-model, per-task result.json locations. May be unused when the data
# comes from the CSV files below.
paths = {
    model_id: {
        task: f"organized_results/{task}/{model_id}/result.json"
        for task in tasks_initial_fallback
    }
    for model_id in models
}

# Input CSV files.
abs_data_file = "results_flash-holmes.csv"
group_data_file = "transformed_results.csv"

# Column names looked up in the group/metadata CSV.
MAIN_TASK_COL = "probing dataset"
SUBTASK_COL = "probe"
SUBTASK_GROUP_COL = "linguistic competencies"
SUBTASK_PHENOMENA_COL = "linguistic phenomena"

# Updated model categorization logic
# Sort every configured model into exactly one role, keyed on its short name.
# Anything that is not the base, instruct or coder checkpoint (including a
# model without a short name) is treated as a merged model.
instruct_model = None
coder_model = None
base_model = None
merged_models = []
for m_full_name in models:
    m_short = short_names.get(m_full_name, "")
    if m_short == "Qwen2.5 Instruct":
        instruct_model = m_full_name
    elif m_short == "Qwen2.5 Coder":
        coder_model = m_full_name
    elif m_short == "Qwen2.5 Base":
        base_model = m_full_name
    else:
        merged_models.append(m_full_name)

# Report missing roles; later sections check these variables before using them.
if not instruct_model:
    print("CRITICAL ERROR: Instruct model not identified.")
if not coder_model:
    print("CRITICAL ERROR: Coder model not identified.")
if not merged_models:
    print(f"WARNING: No merged models identified ({merged_models}).")
if not base_model:
    print("WARNING: Base model not identified.")

print("--- Model Categorization ---")
if base_model:
    print(f"Base Model: {base_model} ({short_names.get(base_model, 'N/A')})")
if instruct_model:
    print(f"Instruct Model: {instruct_model} ({short_names.get(instruct_model, 'N/A')})")
if coder_model:
    print(f"Coder Model: {coder_model} ({short_names.get(coder_model, 'N/A')})")
print(f"Merged Models ({len(merged_models)}):")
for m in merged_models:
    print(f"  - {m} ({short_names.get(m, 'N/A')})")
print("-" * 25)

# Fixed comparison order: base, instruct, coder, then the merged models.
comparison_models_ordered = [m for m in (base_model, instruct_model, coder_model) if m]
comparison_models_ordered.extend(m for m in merged_models if m)
# dict.fromkeys removes duplicates while preserving the first-seen order.
comparison_models = list(dict.fromkeys(comparison_models_ordered))

if not comparison_models:
    print("CRITICAL ERROR: No models for comparison identified. Exiting.")
    # exit() is the interactive helper installed by the `site` module; raise
    # SystemExit directly and report the failure through a non-zero status.
    raise SystemExit(1)
print(f"Models for comparison (in order): {[clean_plot_name(short_names.get(m, m)) for m in comparison_models]}") # Apply clean_plot_name here for accurate listing
print("-" * 25)

# --- 2. Data Loading ---
def load_data_from_csv(abs_filepath, group_filepath, model_list,
                       main_task_col_name_in_group_file,
                       sub_task_col_name_in_group_file,
                       group_col_name_in_group_file,
                       phenomena_col_name_in_group_file):
    """
    Build the per-subtask score table and the per-main-task summary table.

    Absolute scores are read from ``abs_filepath``, restricted to rows with
    ``encoding == 'full'`` when that column exists, averaged per
    (subtask, model) and pivoted to one column per model. Models from
    ``model_list`` that have no scores get an all-NaN column. Metadata from
    ``group_filepath`` is left-joined on the subtask name and the model
    columns are multiplied by 100.

    Args:
        abs_filepath: CSV with 'probing_dataset', 'model_name' and 'score'
            columns (optionally 'encoding').
        group_filepath: CSV with the subtask metadata.
        model_list: Model identifiers to keep, in output column order.
        main_task_col_name_in_group_file: Metadata column renamed to
            'main_task_category'.
        sub_task_col_name_in_group_file: Metadata column renamed to
            'subtask_cleaned' (the join key).
        group_col_name_in_group_file: Metadata column renamed to 'group'.
        phenomena_col_name_in_group_file: Optional metadata column carried
            along when present.

    Returns:
        ``(summary_df, subtasks_df)``. ``summary_df`` holds the mean score per
        main task category (index) and model (columns); rows without a
        category are grouped under 'Unknown_Category'. ``subtasks_df`` holds
        one row per subtask. Two empty frames are returned when the absolute
        scores file is missing. When the metadata file is missing or lacks
        the subtask column, an empty summary and the scaled absolute scores
        are returned.
    """
    try:
        raw_abs_df = pd.read_csv(abs_filepath)
    except FileNotFoundError:
        print(f"Error: Absolute scores file not found at {abs_filepath}")
        return pd.DataFrame(), pd.DataFrame()

    if "Unnamed: 0" in raw_abs_df.columns:
        del raw_abs_df["Unnamed: 0"]

    raw_abs_df.rename(columns={'probing_dataset': 'subtask_cleaned', 'model_name': 'model'}, inplace=True)

    if 'encoding' in raw_abs_df.columns:
        print(f"Filtering absolute scores for 'encoding' == 'full'. Original rows: {len(raw_abs_df)}")
        raw_abs_df = raw_abs_df[raw_abs_df['encoding'] == 'full'].copy()
        print(f"Rows after filtering for encoding == 'full': {len(raw_abs_df)}")
        if len(raw_abs_df) == 0:
            print("Warning: No rows remaining after filtering for 'encoding' == 'full'. Subsequent data might be empty.")
    else:
        print("Warning: 'encoding' column not found in absolute scores file. Cannot filter by encoding.")

    # Non-numeric scores become NaN and are ignored by the mean below.
    raw_abs_df['score'] = pd.to_numeric(raw_abs_df['score'], errors='coerce')
    abs_df_grouped = raw_abs_df.groupby(['subtask_cleaned', 'model'])['score'].mean().reset_index()
    abs_pivot_df = abs_df_grouped.pivot_table(index='subtask_cleaned', columns='model', values='score').reset_index()

    available_models_in_abs = [m for m in model_list if m in abs_pivot_df.columns]
    missing_models_from_abs = [m for m in model_list if m not in abs_pivot_df.columns]
    if missing_models_from_abs:
        print(f"Warning: Models from model_list not found in absolute scores data (after filtering/pivoting): {missing_models_from_abs}")

    # Give every requested model a column so downstream code can rely on it.
    for model_col in missing_models_from_abs:
        abs_pivot_df[model_col] = np.nan

    cols_to_keep_abs = ['subtask_cleaned'] + list(model_list)
    abs_final_df = abs_pivot_df[cols_to_keep_abs].copy()

    def _abs_only_result():
        # Fallback when the metadata cannot be used: no summary, but the
        # absolute scores are still returned, scaled like the regular path.
        for model_col in available_models_in_abs:
            abs_final_df[model_col] = abs_final_df[model_col] * 100
        return pd.DataFrame(columns=available_models_in_abs), abs_final_df

    try:
        raw_group_df = pd.read_csv(group_filepath)
    except FileNotFoundError:
        print(f"Error: Group info file not found at {group_filepath}.")
        return _abs_only_result()

    group_df_processed = process_frame(raw_group_df.copy())
    rename_map_group = {
        sub_task_col_name_in_group_file: 'subtask_cleaned',
        group_col_name_in_group_file: 'group',
        main_task_col_name_in_group_file: 'main_task_category'
    }
    group_df_processed.rename(columns=rename_map_group, inplace=True)

    id_cols_group = ['main_task_category', 'subtask_cleaned', 'group']
    # The phenomena column may have been renamed above if it shares its name
    # with one of the mapped columns, so try the renamed name first.
    original_phenomena_col_name = phenomena_col_name_in_group_file
    phenomena_col_to_check = rename_map_group.get(original_phenomena_col_name, original_phenomena_col_name)
    for candidate in (phenomena_col_to_check, original_phenomena_col_name):
        if candidate in group_df_processed.columns:
            id_cols_group.append(candidate)
            break
    if 'probe type' in group_df_processed.columns:
        id_cols_group.append('probe type')
    id_cols_group_present = [col for col in id_cols_group if col in group_df_processed.columns]

    if 'subtask_cleaned' not in group_df_processed.columns:
        print(f"Critical Error: Subtask column for merging ('subtask_cleaned') not found in group file after renaming. Expected from '{sub_task_col_name_in_group_file}'.")
        return _abs_only_result()

    # One metadata row per subtask, so the left join cannot multiply score rows.
    group_info_to_merge = group_df_processed[id_cols_group_present].drop_duplicates(subset=['subtask_cleaned'])
    subtasks_df = pd.merge(abs_final_df, group_info_to_merge, on='subtask_cleaned', how='left')

    models_in_subtasks_df = [m for m in model_list if m in subtasks_df.columns]
    for model_col in models_in_subtasks_df:
        subtasks_df[model_col] = pd.to_numeric(subtasks_df[model_col], errors='coerce') * 100

    if 'main_task_category' in subtasks_df.columns and models_in_subtasks_df:
        # Fill BEFORE casting: astype(str) can turn NaN into the literal
        # string 'nan', after which fillna() has nothing left to replace.
        subtasks_df['main_task_category'] = (
            subtasks_df['main_task_category'].fillna('Unknown_Category').astype(str)
        )
        summary_df = subtasks_df.groupby('main_task_category')[models_in_subtasks_df].mean()
    else:
        cols_for_empty_summary = [m for m in model_list if m in abs_pivot_df.columns]
        print("Warning: 'main_task_category' not found or no models for summary. Summary DF might be empty or based on limited models.")
        summary_df = pd.DataFrame(columns=cols_for_empty_summary)
        if not subtasks_df.empty and 'main_task_category' not in subtasks_df.columns:
            print("  Specifically, 'main_task_category' column is missing in the merged subtasks_df.")

    return summary_df, subtasks_df

summary_comp_df, subtasks_comp_df = load_data_from_csv(
    abs_data_file,
    group_data_file,
    comparison_models,
    MAIN_TASK_COL,
    SUBTASK_COL,
    SUBTASK_GROUP_COL,
    SUBTASK_PHENOMENA_COL,
)

# The main-task order follows the loaded summary; the static fallback list is
# only used when nothing could be loaded.
if summary_comp_df.empty:
    print("Warning: summary_comp_df is empty after loading. 'tasks' list for ordering might not be accurate.")
    tasks = tasks_initial_fallback
else:
    tasks = summary_comp_df.index.tolist()

print("\n--- Summary DataFrame (Comparison Models from Absolute Scores) ---")
if summary_comp_df.empty:
    print("Summary DataFrame is empty.")
else:
    print(summary_comp_df.head())
print("-" * 50)

if subtasks_comp_df.empty:
    print("\n--- Subtasks DataFrame is empty or could not be loaded ---")
else:
    print("\n--- Subtasks DataFrame (Comparison Models from Absolute Scores, Head) ---")
    if 'group' not in subtasks_comp_df.columns:
        print(f"Warning: 'group' column (expected from {SUBTASK_GROUP_COL}) is missing in subtasks_comp_df.")
    else:
        print(f"Subtasks grouped by: {SUBTASK_GROUP_COL} (column name in df: 'group')")
    print(subtasks_comp_df.head())
print("-" * 50)

# --- 3. Calculate Differences ---
# Every difference is "instruct score minus other model's score", so the
# instruct model has to be known and part of the comparison set.
if instruct_model is None:
    can_calc_diffs = False
    print("Critical Error: Instruct model (instruct_model) is not defined. Cannot calculate differences.")
elif instruct_model in comparison_models:
    can_calc_diffs = True
else:
    can_calc_diffs = False
    print(f"Critical Error: Instruct model '{instruct_model}' not in comparison_models. Cannot calculate differences.")

if can_calc_diffs and not summary_comp_df.empty and instruct_model not in summary_comp_df.columns:
    print(f"Warning: Instruct model '{instruct_model}' missing from summary_comp_df columns. Diff calculation for main tasks might fail.")

# Names of the difference columns that actually get created in each frame.
diff_cols_main = []
diff_cols_subtasks = []

if not can_calc_diffs:
    print("Skipping difference calculations due to earlier critical errors or missing instruct model in data.")
else:
    # ---- Main tasks (summary_comp_df) ----
    if summary_comp_df.empty:
        print("Info: summary_comp_df is empty. Skipping difference calculation for main tasks.")
    elif instruct_model not in summary_comp_df.columns:
        print(f"Warning: Instruct model '{instruct_model}' not in summary_comp_df (but summary_comp_df not empty). Cannot calculate differences for main tasks.")
    else:
        if coder_model and coder_model in summary_comp_df.columns:
            summary_comp_df['d_coder'] = summary_comp_df[instruct_model] - summary_comp_df[coder_model]
            diff_cols_main.append('d_coder')
        else:
            print(f"Warning: Coder model '{coder_model}' not in summary_comp_df or not defined. Skipping 'd_coder' for main tasks.")
            # The column still exists (all NaN) but is not listed as a diff column.
            summary_comp_df['d_coder'] = np.nan

        for merged_m in merged_models:
            if merged_m not in summary_comp_df.columns:
                print(f"Warning: Merged model {merged_m} not in summary_comp_df. Skipping diff for main tasks.")
                continue
            # Column name is built from the cleaned short name of the merged model.
            merged_short_name = clean_plot_name(short_names.get(merged_m, merged_m.split("__")[-1]))
            col_name = f"d_merged_{merged_short_name}"
            summary_comp_df[col_name] = summary_comp_df[instruct_model] - summary_comp_df[merged_m]
            diff_cols_main.append(col_name)

        if diff_cols_main:
            print("\n--- Summary DataFrame with Differences (Head) ---")
            present_main_diff_cols = [c for c in diff_cols_main if c in summary_comp_df.columns]
            if present_main_diff_cols:
                print(summary_comp_df[present_main_diff_cols].head())
            else:
                print("No diff columns for main tasks created/present.")
            print("-" * 50)

    # ---- Subtasks (subtasks_comp_df) ----
    if subtasks_comp_df.empty:
        print("Info: Subtasks DataFrame (subtasks_comp_df) empty, skipping subtask diffs.")
    elif instruct_model not in subtasks_comp_df.columns:
        print(f"Warning: Instruct model '{instruct_model}' missing from subtasks_comp_df (but subtasks_comp_df not empty). Cannot calculate subtask diffs.")
    else:
        if coder_model and coder_model in subtasks_comp_df.columns:
            subtasks_comp_df['d_coder'] = subtasks_comp_df[instruct_model] - subtasks_comp_df[coder_model]
            diff_cols_subtasks.append('d_coder')
        else:
            print(f"Warning: Coder model '{coder_model}' not in subtasks_comp_df or not defined. Skipping 'd_coder' for subtasks.")
            subtasks_comp_df['d_coder'] = np.nan

        for merged_m in merged_models:
            if merged_m not in subtasks_comp_df.columns:
                print(f"Warning: Merged model {merged_m} not in subtasks_comp_df. Skipping diff for subtasks.")
                continue
            merged_short_name = clean_plot_name(short_names.get(merged_m, merged_m.split("__")[-1]))
            col_name = f"d_merged_{merged_short_name}"
            subtasks_comp_df[col_name] = subtasks_comp_df[instruct_model] - subtasks_comp_df[merged_m]
            diff_cols_subtasks.append(col_name)

        if not diff_cols_subtasks:
            print("No diff columns for subtasks were created.")
        else:
            print("\n--- Subtasks DataFrame with Differences (Head) ---")
            present_sub_diff_cols = [col for col in diff_cols_subtasks if col in subtasks_comp_df.columns]
            base_id_cols_sub = [c for c in ['subtask_cleaned', 'group', 'main_task_category'] if c in subtasks_comp_df.columns]

            # Preview order: identifiers, raw model scores, then the differences.
            models_to_show_sub_list = [
                ref for ref in (instruct_model, coder_model)
                if ref and ref in subtasks_comp_df.columns
            ]
            models_to_show_sub_list += [mm for mm in merged_models if mm in subtasks_comp_df.columns]

            cols_to_show_sub = [
                c for c in base_id_cols_sub + models_to_show_sub_list + present_sub_diff_cols
                if c in subtasks_comp_df.columns
            ]
            if present_sub_diff_cols and cols_to_show_sub:
                print(subtasks_comp_df[cols_to_show_sub].head())
            elif not present_sub_diff_cols:
                print("No difference columns for subtasks created/present.")
            else:
                print("No columns available to display for subtask differences (or base ID columns missing).")
        print("-" * 50)

# --- 4. Ranking Generation ---
def generate_rankings(summary_data, subtask_data, model_names, short_names_map,
                      main_task_index_name='main_task_category',
                      subtask_name_col='subtask_cleaned', group_col_in_df='group'):
    """
    Rank the models by score per main task, per subtask and per subtask group.

    Within every row the models are ordered from highest to lowest score
    (NaN scores last) and reported by their cleaned short names in columns
    'Rank 1', 'Rank 2', ...

    Args:
        summary_data: DataFrame indexed by main task, one column per model.
        subtask_data: DataFrame with one row per subtask and one column per
            model, plus ``subtask_name_col`` and optionally ``group_col_in_df``.
        model_names: Models to rank; only those present as columns are used.
        short_names_map: Full model name -> short name (the full name is used
            when a model has no entry).
        main_task_index_name: Index name for the main-task table when
            ``summary_data.index`` is unnamed.
        subtask_name_col: Column holding the subtask name.
        group_col_in_df: Column holding the subtask group.

    Returns:
        dict with any of the keys 'main_tasks', 'subtasks' and 'group_avg',
        each mapping to a ranking DataFrame. A table that cannot be built is
        omitted; the dict is empty when no model column is found at all.
    """
    def _rank_columns(scores):
        # Best score first; NaN scores sink to the last ranks.
        ordered = scores.sort_values(ascending=False, na_position='last').index.tolist()
        return {
            f'Rank {pos}': clean_plot_name(short_names_map.get(model, model))
            for pos, model in enumerate(ordered, start=1)
        }

    ranking_results = {}

    models_to_rank_summary = [m for m in model_names if m in summary_data.columns] if not summary_data.empty else []
    models_to_rank_subtasks = [m for m in model_names if m in subtask_data.columns] if not subtask_data.empty else []

    if not models_to_rank_summary and not models_to_rank_subtasks:
        print("No models available in dataframes to generate rankings.")
        return ranking_results

    # --- Ranking per main task ---
    if models_to_rank_summary:
        main_rankings = []
        for task_val in summary_data.index:
            scores = summary_data.loc[task_val, models_to_rank_summary].astype(float)
            main_rankings.append({'Task': task_val, **_rank_columns(scores)})
        if main_rankings:
            main_rankings_df = pd.DataFrame(main_rankings).set_index('Task')
            main_rankings_df.index.name = summary_data.index.name or main_task_index_name
            ranking_results['main_tasks'] = main_rankings_df

    # --- Ranking per subtask ---
    if models_to_rank_subtasks and subtask_name_col in subtask_data.columns:
        has_group = group_col_in_df in subtask_data.columns
        subtask_rankings = []
        for idx, row_data in subtask_data.iterrows():
            try:
                scores = row_data[models_to_rank_subtasks].astype(float)
            except ValueError:
                print(f"Warning: Scores for subtask row {idx} (name: {row_data.get(subtask_name_col, 'N/A')}) not numeric. Skipping for ranking.")
                continue
            row = {'Subtask': row_data[subtask_name_col]}
            if has_group:
                row['Group'] = row_data[group_col_in_df]
            row.update(_rank_columns(scores))
            subtask_rankings.append(row)

        if subtask_rankings:
            rank_df_sub = pd.DataFrame(subtask_rankings)
            base_cols_sub = ['Subtask', 'Group'] if has_group else ['Subtask']
            rank_cols_sub_list = [f'Rank {i}' for i in range(1, len(models_to_rank_subtasks) + 1)]
            for r_col in rank_cols_sub_list:
                if r_col not in rank_df_sub.columns:
                    rank_df_sub[r_col] = np.nan
            rank_df_sub = rank_df_sub[base_cols_sub + rank_cols_sub_list]
            if has_group:
                rank_df_sub = rank_df_sub.sort_values(by=['Group', 'Subtask']).set_index(['Group', 'Subtask'])
            else:
                rank_df_sub = rank_df_sub.set_index('Subtask')
            ranking_results['subtasks'] = rank_df_sub

    # --- Ranking per group, on the mean score of the group's subtasks ---
    if models_to_rank_subtasks and group_col_in_df in subtask_data.columns:
        try:
            numeric_scores = subtask_data.copy()
            for m in models_to_rank_subtasks:
                numeric_scores[m] = pd.to_numeric(numeric_scores[m], errors='coerce')

            # Rows without a single usable model score carry no information.
            numeric_scores = numeric_scores.dropna(subset=models_to_rank_subtasks, how='all')
            if numeric_scores.empty:
                print("No data for group average ranking after NaN removal (all model scores were NaN).")
            else:
                avg_scores_group = numeric_scores.groupby(group_col_in_df)[models_to_rank_subtasks].mean()
                group_rankings_list = []
                for grp_name in avg_scores_group.index:
                    scores = avg_scores_group.loc[grp_name]
                    if scores.isna().all():
                        print(f"Warning: All scores for group '{grp_name}' are NaN after averaging. Skipping group ranking.")
                        continue
                    group_rankings_list.append({'Group': grp_name, **_rank_columns(scores)})
                if group_rankings_list:
                    ranking_results['group_avg'] = pd.DataFrame(group_rankings_list).set_index('Group')
        except Exception as e:
            # Deliberately best-effort: a failure here must not discard the
            # rankings that were already computed above.
            print(f"Could not calculate group average rankings: {e}")

    return ranking_results

rankings = generate_rankings(
    summary_comp_df,
    subtasks_comp_df,
    comparison_models,
    short_names,
    main_task_index_name=MAIN_TASK_COL,
    group_col_in_df='group',
)

print("\n" + "="*20 + " MODEL RANKINGS (from Absolute Scores) " + "="*20)
if 'main_tasks' in rankings and not rankings['main_tasks'].empty:
    print("\n--- Main Task Rankings ---")
    print(rankings['main_tasks'])
if 'subtasks' in rankings and not rankings['subtasks'].empty:
    # Only the head is printed for the subtask table.
    print("\n--- Subtask Rankings (Head)---")
    print(rankings['subtasks'].head())
if 'group_avg' in rankings and not rankings['group_avg'].empty:
    print("\n--- Group Average Rankings ---")
    print(rankings['group_avg'])

# Write every non-empty ranking table to its own CSV file.
output_dir = "rankings_output_flash_holmes_absolute"
os.makedirs(output_dir, exist_ok=True)
print(f"\n--- Saving Rankings to CSV in '{output_dir}/' ---")
for name, df_to_save in rankings.items():
    if not isinstance(df_to_save, pd.DataFrame) or df_to_save.empty:
        continue
    try:
        csv_filename = os.path.join(output_dir, f"{name}_rankings_fh_absolute.csv")
        df_to_save.to_csv(csv_filename, index=True)
        print(f"Saved {name} rankings to {csv_filename}")
    except Exception as e:
        # One failed export is reported; the remaining tables are still saved.
        print(f"Error saving {name} rankings: {e}")
print("="*58)

# --- 5. Merged Model Performance Categorization (vs Instruct & Coder) on Main Tasks ---
print("\n--- Merged Model Performance Categorization (vs Instruct & Coder) on Main Tasks (from Absolute Scores) ---")
merged_model_comparison_counts = []
task_scenario_ranking_data = []

if (
    instruct_model
    and coder_model
    and not summary_comp_df.empty
    and instruct_model in summary_comp_df.columns
    and coder_model in summary_comp_df.columns
):
    # For each main task: which merged models land above, below or between
    # (inclusive) the instruct and coder scores.
    tasks_categorization = {
        task: {"Better_than_both": [], "Worse_than_both": [], "Between_Equal": []}
        for task in summary_comp_df.index
    }

    for merged_m in merged_models:
        if merged_m not in summary_comp_df.columns:
            print(f"Skipping categorization for {merged_m} as it's not in summary_comp_df.")
            continue

        merged_m_short_cleaned = clean_plot_name(short_names.get(merged_m, merged_m))
        better_than_both_count = 0
        worse_than_both_count = 0
        between_equal_count = 0

        for task_idx in summary_comp_df.index:
            merged_score = summary_comp_df.loc[task_idx, merged_m]
            instruct_score = summary_comp_df.loc[task_idx, instruct_model]
            coder_score = summary_comp_df.loc[task_idx, coder_model]

            # A task is only categorized when all three scores are available.
            if pd.isna(merged_score) or pd.isna(instruct_score) or pd.isna(coder_score):
                continue

            min_ic = min(instruct_score, coder_score)
            max_ic = max(instruct_score, coder_score)

            if merged_score > max_ic:
                better_than_both_count += 1
                bucket = "Better_than_both"
            elif merged_score < min_ic:
                worse_than_both_count += 1
                bucket = "Worse_than_both"
            else:
                # min_ic <= merged_score <= max_ic
                between_equal_count += 1
                bucket = "Between_Equal"
            tasks_categorization[task_idx][bucket].append(merged_m_short_cleaned)

        merged_model_comparison_counts.append({
            "Merged Model": merged_m_short_cleaned,
            "Better than Instruct & Coder": better_than_both_count,
            "Worse than Instruct & Coder": worse_than_both_count,
            "Between/Equal to Instruct & Coder": between_equal_count,
        })

    # One row per task: counts plus the de-duplicated, sorted model names.
    for task_name, categories in tasks_categorization.items():
        task_scenario_ranking_data.append({
            "Task": task_name,
            "Better_Count": len(categories["Better_than_both"]),
            "Better_Models": ", ".join(sorted(set(categories["Better_than_both"]))),
            "Worse_Count": len(categories["Worse_than_both"]),
            "Worse_Models": ", ".join(sorted(set(categories["Worse_than_both"]))),
            "Between_Equal_Count": len(categories["Between_Equal"]),
            "Between_Equal_Models": ", ".join(sorted(set(categories["Between_Equal"]))),
        })
    task_scenario_df = pd.DataFrame(task_scenario_ranking_data)
    if task_scenario_df.empty:
        print("Task Scenario Ranking Table is empty.")
    else:
        print("\n--- Task Scenario Ranking Table (Merged Models vs Instruct & Coder) ---")
        print(task_scenario_df)
        try:
            task_scenario_csv_filename = os.path.join(output_dir, "task_scenario_merged_model_rankings_fh_absolute.csv")
            task_scenario_df.to_csv(task_scenario_csv_filename, index=False)
            print(f"Saved task scenario rankings to {task_scenario_csv_filename}")
        except Exception as e:
            print(f"Error saving task scenario rankings to CSV: {e}")
    print("="*58)

    if not merged_model_comparison_counts:
        print("No merged model comparison counts generated.")
    else:
        counts_df = pd.DataFrame(merged_model_comparison_counts)
        print("\n--- Counts of Merged Model Performance Categories (Main Tasks Overall) ---")
        print(counts_df)
        if counts_df.empty:
            print("Counts DataFrame empty, cannot generate plots/correlations.")
        else:
            categories_to_correlate = [
                "Better than Instruct & Coder",
                "Worse than Instruct & Coder",
                "Between/Equal to Instruct & Coder",
            ]

            # Stacked bar chart: tasks per category for each merged model.
            counts_df_melted = counts_df.melt(
                id_vars="Merged Model",
                value_vars=categories_to_correlate,
                var_name="Category",
                value_name="Number of Tasks",
            )
            fig_counts = px.bar(
                counts_df_melted,
                x="Merged Model",
                y="Number of Tasks",
                color="Category",
                title="Merged Model Performance vs. Instruct & Coder (Main Tasks Overall)",
                barmode='stack',
                labels={"Number of Tasks": "Number of Main Tasks"},
            )
            fig_counts.update_xaxes(categoryorder="array", categoryarray=counts_df["Merged Model"].tolist())
            fig_counts.update_layout(
                height=default_plot_height,
                width=default_plot_width,
                legend=dict(orientation="h", yanchor="bottom", y=-0.3, xanchor="center", x=0.5),
                **font_config
            )
            fig_counts.show()
            print("Generated Plot: Merged Model Performance Category Counts")

            # Export data for this plot
            counts_df_melted.to_csv(os.path.join(csv_export_dir, "merged_model_performance_counts.csv"), index=False)
            print(f"Exported plot data to: {csv_export_dir}/merged_model_performance_counts.csv")

            if len(counts_df["Merged Model"].unique()) <= 1:
                print("Not enough unique merged models for category correlation.")
            else:
                pivot_counts_df = counts_df.set_index("Merged Model")
                categories_present = [cat for cat in categories_to_correlate if cat in pivot_counts_df.columns]
                # Transposed so that .corr() compares merged models (columns)
                # across the category counts (rows).
                category_by_model = pivot_counts_df[categories_present].T
                if len(categories_present) > 1 and not category_by_model.empty and len(category_by_model.columns) > 1:
                    correlation_matrix = category_by_model.corr()
                    print("\n--- Correlation Matrix of Performance Categories Between Merged Models (Main Tasks Overall) ---")
                    print(correlation_matrix)

                    corr_heatmap_height = 600
                    fig_corr_heatmap = px.imshow(
                        correlation_matrix,
                        text_auto=True,
                        aspect="auto",
                        color_continuous_scale='RdBu_r',
                        range_color=[-1,1],
                        title="Correlation of Performance Categories Between Merged Models (Main Tasks Overall)",
                    )
                    fig_corr_heatmap.update_layout(height=corr_heatmap_height, width=default_plot_width, **font_config)
                    fig_corr_heatmap.show()
                    print("Generated Plot: Correlation Heatmap of Performance Categories")

                    # Export correlation matrix
                    correlation_matrix.to_csv(os.path.join(csv_export_dir, "performance_categories_correlation_matrix.csv"))
                    print(f"Exported correlation matrix to: {csv_export_dir}/performance_categories_correlation_matrix.csv")
                else:
                    print("Not enough categories/models with data for correlation matrix.")
elif summary_comp_df.empty:
    print("Skipping Merged Model Performance Categorization & Task Scenario Table: summary_comp_df is empty.")
else:
    print(f"Skipping Merged Model Performance Categorization & Task Scenario Table: Key models (Instruct: {instruct_model in summary_comp_df.columns if instruct_model else 'N/A'}, Coder: {coder_model in summary_comp_df.columns if coder_model else 'N/A'}) or data missing from summary_comp_df.")
print("="*58)

# --- 6. Original Plotting Section (Plots 1-8 from previous script) ---
print("\n--- Generating Original Plots (Based on Absolute Scores) ---")

# Display labels for the two reference models. Each starts from a generic fallback and is
# replaced by the cleaned short name only when the corresponding model is configured.
instruct_short_label = "Instruct"
if instruct_model:
    instruct_short_label = clean_plot_name(short_names.get(instruct_model, "Instruct"))

coder_short_label = "Coder"
if coder_model:
    coder_short_label = clean_plot_name(short_names.get(coder_model, "Coder"))

# X-axis title for the main-task plots: the summary table's index name when it has one.
xaxis_title_main_tasks = 'Main Task Category'
if not summary_comp_df.empty and summary_comp_df.index.name:
    xaxis_title_main_tasks = summary_comp_df.index.name

# Plot: Difference Trends on Main Tasks
# Draws one line per difference column of summary_comp_df ('d_coder' plus every usable
# 'd_merged_*' column listed in diff_cols_main) and exports the plotted values in long
# form (Task / Difference_Type / Difference_Value) to CSV.
if can_calc_diffs and not summary_comp_df.empty and instruct_model in summary_comp_df.columns:
    fig1 = go.Figure()
    plot_data_diff_trends = []

    # Instruct–Coder series: drawn only when the column exists and is not entirely NaN.
    if coder_model and 'd_coder' in summary_comp_df.columns and not summary_comp_df['d_coder'].isna().all():
        coder_diff_label = f'{instruct_short_label}–{coder_short_label}'
        fig1.add_trace(
            go.Scatter(
                x=summary_comp_df.index,
                y=summary_comp_df['d_coder'],
                mode='lines+markers',
                name=coder_diff_label,
                marker=dict(symbol='circle', size=8),
                line=dict(dash='dash'),
                hovertemplate=f'Task: %{{x}}<br>{coder_diff_label}: %{{y:.2f}}%<extra></extra>',
            )
        )
        plot_data_diff_trends.extend(
            {
                'Task': task,
                'Difference_Type': coder_diff_label,
                'Difference_Value': summary_comp_df.loc[task, 'd_coder'],
            }
            for task in summary_comp_df.index
        )

    # Instruct–Merged series: one trace per merged model, cycling through the Plotly palette.
    colors = px.colors.qualitative.Plotly
    merged_plot_idx = 0
    for diff_col in diff_cols_main:
        if not diff_col.startswith('d_merged_'):
            continue
        if diff_col not in summary_comp_df.columns or summary_comp_df[diff_col].isna().all():
            continue

        original_merged_short_name = diff_col.replace('d_merged_', '')
        cleaned_merged_short_name_plot = clean_plot_name(original_merged_short_name)
        merged_diff_label = f'{instruct_short_label}–{cleaned_merged_short_name_plot}'

        fig1.add_trace(
            go.Scatter(
                x=summary_comp_df.index,
                y=summary_comp_df[diff_col],
                mode='lines+markers',
                name=merged_diff_label,
                marker=dict(symbol='square', size=8, color=colors[merged_plot_idx % len(colors)]),
                hovertemplate=f'Task: %{{x}}<br>{merged_diff_label}: %{{y:.2f}}%<extra></extra>',
            )
        )
        merged_plot_idx += 1

        plot_data_diff_trends.extend(
            {
                'Task': task,
                'Difference_Type': merged_diff_label,
                'Difference_Value': summary_comp_df.loc[task, diff_col],
            }
            for task in summary_comp_df.index
        )

    if fig1.data:
        fig1_title = f'Difference Trends on Main Tasks (vs {instruct_short_label})'
        fig1.update_layout(
            title=fig1_title,
            xaxis_title=xaxis_title_main_tasks,
            yaxis_title='Performance Difference (%)',
            legend_title_text='Difference Type',
            hovermode='x unified',
            height=default_plot_height,
            width=default_plot_width,
            **font_config,
        )
        fig1.show()
        print(f"Generated plot: {fig1_title} (X-axis: {xaxis_title_main_tasks})")

        # Export data
        if plot_data_diff_trends:
            pd.DataFrame(plot_data_diff_trends).to_csv(
                os.path.join(csv_export_dir, "difference_trends_main_tasks.csv"), index=False
            )
            print(f"Exported plot data to: {csv_export_dir}/difference_trends_main_tasks.csv")
    else:
        print("Skipping Difference Trends plot: No data to plot (all difference series might be NaN or no valid diff columns).")
else:
    print(f"Skipping Difference Trends plot: Conditions not met (can_calc_diffs: {can_calc_diffs}, summary_empty: {summary_comp_df.empty}, instruct_in_summary: {instruct_model in summary_comp_df.columns if instruct_model else False}).")

# Plot: Absolute Performance on Main Tasks (Line Chart)
# One line per comparison model that has at least one score in summary_comp_df. Marker
# symbol and dash style depend on whether the model is the base, coder, instruct or a
# merged model; the plotted values are exported in long form (Task / Model / Score).
if summary_comp_df.empty:
    print("Skipping Absolute Performance plot (Absolute Scores): summary_comp_df is empty.")
else:
    fig1_abs = go.Figure()
    colors = px.colors.qualitative.Plotly
    plot_idx = 0
    models_to_plot_abs = [
        m
        for m in comparison_models
        if m and m in summary_comp_df.columns and not summary_comp_df[m].isna().all()
    ]
    plot_data_abs_main = []

    for model_name_abs in models_to_plot_abs:
        short_name_abs_cleaned = clean_plot_name(short_names.get(model_name_abs, model_name_abs))

        # Default style; it is what the instruct model (and any model matching no role) keeps.
        current_symbol, current_line_style = 'circle', 'solid'
        if model_name_abs == base_model:
            current_symbol, current_line_style = 'star', 'dashdot'
        elif model_name_abs == coder_model:
            current_symbol, current_line_style = 'diamond', 'dash'
        elif model_name_abs != instruct_model and model_name_abs in merged_models:
            current_symbol, current_line_style = 'square', 'dot'

        fig1_abs.add_trace(
            go.Scatter(
                x=summary_comp_df.index,
                y=summary_comp_df[model_name_abs],
                mode='lines+markers',
                name=short_name_abs_cleaned,
                marker=dict(symbol=current_symbol, size=8, color=colors[plot_idx % len(colors)]),
                line=dict(dash=current_line_style),
                hovertemplate=f'Task: %{{x}}<br>{short_name_abs_cleaned} Score: %{{y:.2f}}%<extra></extra>',
            )
        )
        plot_idx += 1

        # Long-form copy of this model's series for the CSV export.
        for task in summary_comp_df.index:
            export_row = {
                'Task': task,
                'Model': short_name_abs_cleaned,
                'Score': summary_comp_df.loc[task, model_name_abs],
            }
            plot_data_abs_main.append(export_row)

    if not fig1_abs.data:
        print("Skipping Absolute Performance plot: No data to plot (all model series might be NaN).")
    else:
        fig1_abs_title = 'Absolute Performance on Main Tasks'
        fig1_abs.update_layout(
            title=fig1_abs_title,
            xaxis_title=xaxis_title_main_tasks,
            yaxis_title='Performance Score (%)',
            legend_title_text='Model',
            hovermode='x unified',
            height=default_plot_height,
            width=default_plot_width,
            **font_config,
        )
        fig1_abs.show()
        print(f"Generated plot: {fig1_abs_title} (X-axis: {xaxis_title_main_tasks})")

        # Export data
        if plot_data_abs_main:
            abs_line_csv_path = os.path.join(csv_export_dir, "absolute_performance_main_tasks_line.csv")
            pd.DataFrame(plot_data_abs_main).to_csv(abs_line_csv_path, index=False)
            print(f"Exported plot data to: {csv_export_dir}/absolute_performance_main_tasks_line.csv")

# UPDATED Bar Chart for Main Task Categories (Linguistic Competencies from 'probing dataset' in summary_comp_df)
# Horizontal bars, one facet row per entry of `tasks` that has data and one bar per model.
# Every facet additionally gets a shaded band spanning the Instruct and Coder scores and a
# dashed vertical line at each of the two. The plotted rows are exported to CSV.
if summary_comp_df.empty:
    print(f"Skipping 'Absolute Performance by {MAIN_TASK_COL} (Bar Chart)': summary_comp_df is empty.")
else:
    models_to_plot_bar_main_holmes = [
        m
        for m in comparison_models
        if m in summary_comp_df.columns and not summary_comp_df[m].isna().all()
    ]

    # Bar colour per model category; a reference-model entry is added only if that model is set.
    color_map_main_bar_holmes = {}
    if base_model:
        color_map_main_bar_holmes["Qwen2.5 Base"] = 'rgb(100, 149, 237)'
    if instruct_model:
        color_map_main_bar_holmes["Qwen2.5 Instruct"] = 'rgb(50, 205, 50)'
    if coder_model:
        color_map_main_bar_holmes["Qwen2.5 Coder"] = 'rgb(255, 165, 0)'
    color_map_main_bar_holmes['Merged'] = 'rgb(192, 192, 192)'

    if not models_to_plot_bar_main_holmes:
        print(f"Skipping 'Absolute Performance by {MAIN_TASK_COL} (Bar Chart)': No models with data to plot.")
    else:
        plot_data_list_main_bar = []
        competency_order_holmes = tasks

        # Collect one record per (competency, model) pair that has a non-NaN score.
        for comp_idx, competency_name in enumerate(competency_order_holmes):
            if competency_name not in summary_comp_df.index:
                continue
            for model_idx, model_full_name in enumerate(comparison_models):
                if model_full_name not in models_to_plot_bar_main_holmes:
                    continue
                score = summary_comp_df.loc[competency_name, model_full_name]
                if pd.isna(score):
                    continue

                model_short_clean = clean_plot_name(short_names.get(model_full_name, model_full_name))

                if model_full_name == base_model:
                    model_type_for_color = "Qwen2.5 Base"
                elif model_full_name == instruct_model:
                    model_type_for_color = "Qwen2.5 Instruct"
                elif model_full_name == coder_model:
                    model_type_for_color = "Qwen2.5 Coder"
                else:
                    model_type_for_color = "Merged"

                plot_data_list_main_bar.append({
                    'Linguistic Competency': competency_name,
                    'Model Short Name': model_short_clean,
                    'Score': score,
                    'Model Type for Color': model_type_for_color,
                    'Competency Index': comp_idx,
                    'Model Index': model_idx,
                })

        if plot_data_list_main_bar:
            main_bar_df_holmes = pd.DataFrame(plot_data_list_main_bar)
            main_bar_df_holmes.sort_values(
                by=['Competency Index', 'Model Index'], ascending=[True, True], inplace=True
            )

            # Create a new column for the text inside the bar
            main_bar_df_holmes['bar_label'] = main_bar_df_holmes.apply(
                lambda row: f"{row['Model Short Name']}: {row['Score']:.2f}%",
                axis=1,
            )

            fig_main_bar_title_holmes = f'Absolute Performance Comparison by {MAIN_TASK_COL}'

            # Facet spacing: 0.02, capped at 90% of 1 / (rows - 1) when there are many rows.
            num_competencies_holmes = main_bar_df_holmes['Linguistic Competency'].nunique()
            row_spacing_val_holmes = 0.02
            if num_competencies_holmes > 1:
                max_allowed_spacing_h = 1.0 / (num_competencies_holmes - 1)
                row_spacing_val_holmes = min(0.02, max_allowed_spacing_h * 0.9)

            fig_main_bar_holmes = px.bar(
                main_bar_df_holmes,
                x='Score',
                y='Model Short Name',
                color='Model Type for Color',
                color_discrete_map=color_map_main_bar_holmes,
                orientation='h',
                title=fig_main_bar_title_holmes,
                labels={
                    'Score': 'Performance Score (%)',
                    'Model Short Name': '',
                    'Model Type for Color': 'Model Category',
                    'Linguistic Competency': MAIN_TASK_COL,
                },
                text='bar_label',  # Use the new combined label
                facet_row='Linguistic Competency',
                category_orders={"Linguistic Competency": competency_order_holmes},
                facet_row_spacing=row_spacing_val_holmes,
            )
            # Move text inside the bars and hide y-axis labels
            fig_main_bar_holmes.update_traces(textposition='inside', insidetextanchor='middle')
            model_order_y_main_holmes = [
                clean_plot_name(short_names.get(m, m))
                for m in comparison_models
                if m in models_to_plot_bar_main_holmes
            ]
            fig_main_bar_holmes.update_yaxes(
                categoryorder='array',
                categoryarray=model_order_y_main_holmes,
                title=None,
                showticklabels=False,
                ticks="",
            )
            # Facet titles arrive as "column=value"; keep only the value part.
            fig_main_bar_holmes.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

            num_y_cats_main_holmes = len(model_order_y_main_holmes)
            competencies_actually_plotted_holmes = [
                c
                for c in competency_order_holmes
                if c in main_bar_df_holmes['Linguistic Competency'].unique()
            ]
            num_total_facets_main_holmes = len(competencies_actually_plotted_holmes)

            for facet_idx, comp_name in enumerate(competencies_actually_plotted_holmes):
                # Reference scores for this facet; NaN when the model or the row is unavailable.
                instruct_s_val = np.nan
                if instruct_model in summary_comp_df.columns and comp_name in summary_comp_df.index:
                    instruct_cell = summary_comp_df.loc[comp_name, instruct_model]
                    if not pd.isna(instruct_cell):
                        instruct_s_val = instruct_cell

                coder_s_val = np.nan
                if coder_model in summary_comp_df.columns and comp_name in summary_comp_df.index:
                    coder_cell = summary_comp_df.loc[comp_name, coder_model]
                    if not pd.isna(coder_cell):
                        coder_s_val = coder_cell

                if pd.isna(instruct_s_val) or pd.isna(coder_s_val):
                    continue

                # The first competency gets the highest axis number, the last one plain 'x'/'y'.
                axis_suffix = num_total_facets_main_holmes - facet_idx
                if axis_suffix > 1:
                    xaxis_ref = f'x{axis_suffix}'
                    yaxis_ref = f'y{axis_suffix}'
                else:
                    xaxis_ref = 'x'
                    yaxis_ref = 'y'

                band_top = num_y_cats_main_holmes - 0.5
                fig_main_bar_holmes.add_shape(
                    type="rect",
                    xref=xaxis_ref,
                    yref=yaxis_ref,
                    x0=min(instruct_s_val, coder_s_val),
                    x1=max(instruct_s_val, coder_s_val),
                    y0=-0.5,
                    y1=band_top,
                    fillcolor="rgba(128, 128, 128, 0.2)",
                    line_width=0,
                    layer="below",
                )
                fig_main_bar_holmes.add_shape(
                    type="line",
                    xref=xaxis_ref,
                    yref=yaxis_ref,
                    x0=instruct_s_val,
                    y0=-0.5,
                    x1=instruct_s_val,
                    y1=band_top,
                    line=dict(color=color_map_main_bar_holmes.get('Qwen2.5 Instruct', 'green'), dash="dash", width=2),
                    layer="above",
                )
                fig_main_bar_holmes.add_shape(
                    type="line",
                    xref=xaxis_ref,
                    yref=yaxis_ref,
                    x0=coder_s_val,
                    y0=-0.5,
                    x1=coder_s_val,
                    y1=band_top,
                    line=dict(color=color_map_main_bar_holmes.get('Qwen2.5 Coder', 'orange'), dash="dash", width=2),
                    layer="above",
                )

            fig_main_bar_holmes.update_layout(
                xaxis_showgrid=True, yaxis_showgrid=False, showlegend=False, **font_config
            )
            # Height grows with the number of facets and bars per facet, with a floor of 500.
            plot_h_main_holmes = max(500, num_total_facets_main_holmes * (num_y_cats_main_holmes * 25 + 70))
            compact_plot_width = 1000  # Use a more compact width
            fig_main_bar_holmes.update_layout(
                height=plot_h_main_holmes,
                width=compact_plot_width,
                margin=dict(l=50, r=50, t=50, b=50),
            )
            fig_main_bar_holmes.show()
            print(f"Generated plot: {fig_main_bar_title_holmes} (Faceted by {MAIN_TASK_COL})")

            # Export data
            main_bar_df_holmes.to_csv(
                os.path.join(csv_export_dir, "absolute_performance_main_tasks_bar.csv"), index=False
            )
            print(f"Exported plot data to: {csv_export_dir}/absolute_performance_main_tasks_bar.csv")
        else:
            print(f"Skipping 'Absolute Performance by {MAIN_TASK_COL} (Bar Chart)': No data to plot after filtering.")

# UPDATED Bar Chart for Linguistic Competency Groups
# Horizontal bars faceted by the 'group' column of subtasks_comp_df: per-model scores are
# averaged within each group, one facet row per group and one bar per model. Every facet
# additionally gets a shaded band spanning the Instruct and Coder means and a dashed
# vertical line at each of the two. The plotted rows are exported to CSV.
if not subtasks_comp_df.empty and 'group' in subtasks_comp_df.columns:
    print(f"\n--- Generating Bar Chart by {SUBTASK_GROUP_COL} (Mean Scores) ---")

    models_to_plot_group_bar = [
        m
        for m in comparison_models
        if m in subtasks_comp_df.columns and not subtasks_comp_df[m].isna().all()
    ]

    if models_to_plot_group_bar:
        # Drop rows whose group renders as 'nan', then rows with no score for any plotted model.
        subtasks_filtered_for_group_plot = subtasks_comp_df[
            subtasks_comp_df['group'].astype(str).str.lower() != 'nan'
        ].copy()
        subtasks_filtered_for_group_plot.dropna(subset=models_to_plot_group_bar, how='all', inplace=True)

        if not subtasks_filtered_for_group_plot.empty:
            for model_col in models_to_plot_group_bar:
                subtasks_filtered_for_group_plot[model_col] = pd.to_numeric(
                    subtasks_filtered_for_group_plot[model_col], errors='coerce'
                )

            grouped_scores_by_competency = (
                subtasks_filtered_for_group_plot.groupby('group')[models_to_plot_group_bar].mean().reset_index()
            )
            # Group names are handled as strings everywhere below (ordering, facet labels),
            # so rows are looked up through this string view; comparing the raw column to a
            # stringified name would match nothing for non-string group keys.
            group_labels_as_str = grouped_scores_by_competency['group'].astype(str)

            plot_data_list_group_bar = []
            competency_group_order = sorted(group_labels_as_str.unique())

            color_map_group_bar = {
                "Qwen2.5 Base": 'rgb(100, 149, 237)',
                "Qwen2.5 Instruct": 'rgb(50, 205, 50)',
                "Qwen2.5 Coder": 'rgb(255, 165, 0)',
                'Merged': 'rgb(192, 192, 192)',
            }

            # Collect one record per (group, model) pair that has a non-NaN mean score.
            for comp_idx, competency_group_name in enumerate(competency_group_order):
                group_data = grouped_scores_by_competency[group_labels_as_str == competency_group_name]
                if group_data.empty:
                    continue
                for model_idx, model_full_name in enumerate(comparison_models):
                    if model_full_name not in models_to_plot_group_bar or model_full_name not in group_data.columns:
                        continue
                    score = group_data[model_full_name].iloc[0]
                    if pd.isna(score):
                        continue

                    model_short_clean = clean_plot_name(short_names.get(model_full_name, model_full_name))

                    model_type_for_color = "Merged"
                    if model_full_name == base_model:
                        model_type_for_color = "Qwen2.5 Base"
                    elif model_full_name == instruct_model:
                        model_type_for_color = "Qwen2.5 Instruct"
                    elif model_full_name == coder_model:
                        model_type_for_color = "Qwen2.5 Coder"

                    plot_data_list_group_bar.append({
                        'Linguistic Competency Group': competency_group_name,
                        'Model Short Name': model_short_clean,
                        'Mean Score': score,
                        'Model Type for Color': model_type_for_color,
                        'Competency Index': comp_idx,
                        'Model Index': model_idx,
                    })

            if not plot_data_list_group_bar:
                print(f"Skipping 'Absolute Performance by {SUBTASK_GROUP_COL} (Bar Chart)': No data to plot after processing.")
            else:
                group_bar_df = pd.DataFrame(plot_data_list_group_bar)
                group_bar_df.sort_values(by=['Competency Index', 'Model Index'], ascending=[True, True], inplace=True)

                # Create a new column for the text inside the bar
                group_bar_df['bar_label'] = group_bar_df.apply(
                    lambda row: f"{row['Model Short Name']}: {row['Mean Score']:.2f}%", axis=1
                )

                fig_group_bar_title = f'Mean Absolute Performance by {SUBTASK_GROUP_COL}'

                # Facet spacing: at most 0.03, capped at 90% of 1 / (rows - 1) for many rows.
                num_competency_groups = group_bar_df['Linguistic Competency Group'].nunique()
                row_spacing_group_bar = 0.02
                if num_competency_groups > 1:
                    max_allowed_spacing_gb = 1.0 / (num_competency_groups - 1)
                    row_spacing_group_bar = min(0.03, max_allowed_spacing_gb * 0.9)

                fig_group_bar = px.bar(
                    group_bar_df,
                    x='Mean Score',
                    y='Model Short Name',
                    color='Model Type for Color',
                    color_discrete_map=color_map_group_bar,
                    orientation='h',
                    title=fig_group_bar_title,
                    labels={
                        'Mean Score': 'Mean Performance Score (%)',
                        'Model Short Name': '',
                        'Model Type for Color': 'Model Category',
                        'Linguistic Competency Group': SUBTASK_GROUP_COL,
                    },
                    text='bar_label',  # Use new combined label
                    facet_row='Linguistic Competency Group',
                    category_orders={"Linguistic Competency Group": competency_group_order},
                    facet_row_spacing=row_spacing_group_bar,
                )
                fig_group_bar.update_traces(textposition='inside', insidetextanchor='middle')
                model_order_y_group = [
                    clean_plot_name(short_names.get(m, m))
                    for m in comparison_models
                    if m in models_to_plot_group_bar
                ]
                fig_group_bar.update_yaxes(
                    categoryorder='array',
                    categoryarray=model_order_y_group,
                    title=None,
                    showticklabels=False,
                    ticks="",
                )
                # Facet titles arrive as "column=value"; keep only the value part.
                fig_group_bar.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

                num_y_cats_group = len(model_order_y_group)
                # Only groups that contributed at least one bar are counted as facets here.
                # Counting every group in competency_group_order would shift the axis
                # references below whenever a group produced no rows.
                plotted_group_names = set(group_bar_df['Linguistic Competency Group'])
                competency_groups_actually_plotted = [
                    g for g in competency_group_order if g in plotted_group_names
                ]
                num_total_facets_group = len(competency_groups_actually_plotted)

                for facet_idx, comp_group_name in enumerate(competency_groups_actually_plotted):
                    instruct_s_val_group_df = grouped_scores_by_competency[group_labels_as_str == comp_group_name]

                    # Reference means for this facet; NaN when the model column or row is missing.
                    instruct_s_val_group = np.nan
                    if instruct_model in instruct_s_val_group_df.columns and not instruct_s_val_group_df[instruct_model].empty:
                        instruct_candidate = instruct_s_val_group_df[instruct_model].iloc[0]
                        if not pd.isna(instruct_candidate):
                            instruct_s_val_group = instruct_candidate

                    coder_s_val_group = np.nan
                    if coder_model in instruct_s_val_group_df.columns and not instruct_s_val_group_df[coder_model].empty:
                        coder_candidate = instruct_s_val_group_df[coder_model].iloc[0]
                        if not pd.isna(coder_candidate):
                            coder_s_val_group = coder_candidate

                    if pd.isna(instruct_s_val_group) or pd.isna(coder_s_val_group):
                        continue

                    # The first plotted group gets the highest axis number, the last one plain 'x'/'y'.
                    # NOTE(review): relies on how Plotly Express numbers facet-row axes — confirm
                    # against the Plotly version in use.
                    axis_suffix_group = num_total_facets_group - facet_idx
                    xaxis_ref_group = f'x{axis_suffix_group}' if axis_suffix_group > 1 else 'x'
                    yaxis_ref_group = f'y{axis_suffix_group}' if axis_suffix_group > 1 else 'y'

                    fig_group_bar.add_shape(
                        type="rect",
                        xref=xaxis_ref_group,
                        yref=yaxis_ref_group,
                        x0=min(instruct_s_val_group, coder_s_val_group),
                        x1=max(instruct_s_val_group, coder_s_val_group),
                        y0=-0.5,
                        y1=num_y_cats_group - 0.5,
                        fillcolor="rgba(255, 128, 128, 0.2)",
                        line_width=0,
                        layer="below",
                    )
                    fig_group_bar.add_shape(
                        type="line",
                        xref=xaxis_ref_group,
                        yref=yaxis_ref_group,
                        x0=instruct_s_val_group,
                        y0=-0.5,
                        x1=instruct_s_val_group,
                        y1=num_y_cats_group - 0.5,
                        line=dict(color=color_map_group_bar.get('Qwen2.5 Instruct', 'green'), dash="dash", width=2),
                        layer="above",
                    )
                    fig_group_bar.add_shape(
                        type="line",
                        xref=xaxis_ref_group,
                        yref=yaxis_ref_group,
                        x0=coder_s_val_group,
                        y0=-0.5,
                        x1=coder_s_val_group,
                        y1=num_y_cats_group - 0.5,
                        line=dict(color=color_map_group_bar.get('Qwen2.5 Coder', 'orange'), dash="dash", width=2),
                        layer="above",
                    )

                fig_group_bar.update_layout(xaxis_showgrid=True, yaxis_showgrid=False, showlegend=False, **font_config)
                # Height grows with the number of facets and bars per facet, with a floor of 420.
                plot_h_group = max(420, num_total_facets_group * (num_y_cats_group * 25 + 75))
                compact_plot_width = 1000  # Use a more compact width
                fig_group_bar.update_layout(
                    height=plot_h_group,
                    width=compact_plot_width,
                    margin=dict(l=50, r=50, t=50, b=50),
                )
                fig_group_bar.show()
                print(f"Generated plot: {fig_group_bar_title} (Faceted by {SUBTASK_GROUP_COL})")

                # Export data
                group_bar_df.to_csv(
                    os.path.join(csv_export_dir, "mean_performance_by_competency_groups_bar.csv"), index=False
                )
                print(f"Exported plot data to: {csv_export_dir}/mean_performance_by_competency_groups_bar.csv")
        else:
            print(f"Skipping 'Absolute Performance by {SUBTASK_GROUP_COL} (Bar Chart)': No data to plot after grouping.")
    else:
        print(f"Skipping 'Absolute Performance by {SUBTASK_GROUP_COL} (Bar Chart)': No models with data to plot.")
else:
    print(f"Skipping 'Absolute Performance by {SUBTASK_GROUP_COL} (Bar Chart)': subtasks_comp_df is empty or 'group' column missing.")

# Plot 2: Subtask Difference Boxplots
# Box plots of per-subtask differences, grouped by the 'group' column of subtasks_comp_df.
# With no usable 'd_merged_*' column a single Instruct–Coder plot is drawn; otherwise one
# plot per merged model, pairing Instruct–Coder with that model's difference column.
# Each plot's long-form data is exported to CSV.
plot_dcoder_col_sub = None
if 'd_coder' in diff_cols_subtasks and not subtasks_comp_df.empty and 'd_coder' in subtasks_comp_df.columns:
    plot_dcoder_col_sub = 'd_coder'

if not subtasks_comp_df.empty and 'group' in subtasks_comp_df.columns and instruct_model in subtasks_comp_df.columns and can_calc_diffs:
    plot_title_prefix = "Subtask Difference Boxplots"
    print(f"Generating {plot_title_prefix}: Subtask Difference Boxplot(s) grouped by '{SUBTASK_GROUP_COL}' (excluding 'nan' group)...")
    merged_diff_cols_sub_present = [
        col
        for col in diff_cols_subtasks
        if col.startswith('d_merged_') and col in subtasks_comp_df.columns and not subtasks_comp_df[col].isna().all()
    ]

    if plot_dcoder_col_sub and not subtasks_comp_df[plot_dcoder_col_sub].isna().all():
        if not merged_diff_cols_sub_present:
            plot_data_box_ic_only = subtasks_comp_df.melt(
                id_vars=['group', 'subtask_cleaned'],
                value_vars=[plot_dcoder_col_sub],
                var_name='difference_type',
                value_name='difference_value',
            )
            plot_data_box_ic_only.dropna(subset=['difference_value'], inplace=True)
            # .copy(): a column is added below, and assigning onto a boolean-filtered frame
            # triggers pandas' SettingWithCopyWarning.
            plot_data_box_ic_only = plot_data_box_ic_only[
                plot_data_box_ic_only['group'].astype(str).str.lower() != 'nan'
            ].copy()
            if not plot_data_box_ic_only.empty:
                label_map_ic_only = {plot_dcoder_col_sub: f'{instruct_short_label}–{coder_short_label}'}
                plot_data_box_ic_only['difference_label'] = plot_data_box_ic_only['difference_type'].map(label_map_ic_only)
                fig2_ic_only_title = f'{plot_title_prefix} ({SUBTASK_GROUP_COL}): {instruct_short_label}–{coder_short_label}'
                fig2_ic_only = px.box(
                    plot_data_box_ic_only,
                    x='group',
                    y='difference_value',
                    color='difference_label',
                    hover_data=['subtask_cleaned'],
                    labels={
                        'group': SUBTASK_GROUP_COL,
                        'difference_value': 'Difference (%)',
                        'difference_label': 'Difference Type',
                        'subtask_cleaned': SUBTASK_COL,
                    },
                    title=fig2_ic_only_title,
                    category_orders={"group": sorted(plot_data_box_ic_only['group'].astype(str).unique())},
                )
                fig2_ic_only.update_xaxes(tickangle=45)
                fig2_ic_only.update_layout(height=default_plot_height, width=default_plot_width, **font_config)
                fig2_ic_only.show()
                print(f"  - Generated plot: {fig2_ic_only_title}")

                # Export data
                plot_data_box_ic_only.to_csv(
                    os.path.join(csv_export_dir, "subtask_difference_boxplot_instruct_coder_only.csv"), index=False
                )
                print(f"Exported plot data to: {csv_export_dir}/subtask_difference_boxplot_instruct_coder_only.csv")
            else:
                print("  - No valid data for Instruct-Coder difference boxplot (after 'nan' group filter or all values were NaN).")
        else:
            for merged_diff_col in merged_diff_cols_sub_present:
                original_merged_short_name = merged_diff_col.replace('d_merged_', '')
                cleaned_merged_short_name_plot = clean_plot_name(original_merged_short_name)
                cols_for_this_plot = [plot_dcoder_col_sub, merged_diff_col]
                plot_data_box_single = subtasks_comp_df.melt(
                    id_vars=['group', 'subtask_cleaned'],
                    value_vars=cols_for_this_plot,
                    var_name='difference_type',
                    value_name='difference_value',
                )
                plot_data_box_single.dropna(subset=['difference_value'], inplace=True)
                # .copy() for the same reason as in the Instruct–Coder-only branch above.
                plot_data_box_single = plot_data_box_single[
                    plot_data_box_single['group'].astype(str).str.lower() != 'nan'
                ].copy()
                if not plot_data_box_single.empty:
                    # Map each raw difference column to the label shown in the legend.
                    label_map = {
                        col: (
                            f'{instruct_short_label}–{coder_short_label}'
                            if col == plot_dcoder_col_sub
                            else f'{instruct_short_label}–{clean_plot_name(col.replace("d_merged_",""))}'
                        )
                        for col in cols_for_this_plot
                    }
                    plot_data_box_single['difference_label'] = plot_data_box_single['difference_type'].map(label_map)
                    fig2_single_title = f'{plot_title_prefix} ({SUBTASK_GROUP_COL}): {instruct_short_label}–{coder_short_label} vs {instruct_short_label}–{cleaned_merged_short_name_plot}'
                    fig2_single = px.box(
                        plot_data_box_single,
                        x='group',
                        y='difference_value',
                        color='difference_label',
                        hover_data=['subtask_cleaned'],
                        labels={
                            'group': SUBTASK_GROUP_COL,
                            'difference_value': 'Difference (%)',
                            'difference_label': 'Difference Type',
                            'subtask_cleaned': SUBTASK_COL,
                        },
                        title=fig2_single_title,
                        category_orders={"group": sorted(plot_data_box_single['group'].astype(str).unique())},
                    )
                    fig2_single.update_xaxes(tickangle=45)
                    fig2_single.update_layout(boxmode='group', height=default_plot_height, width=default_plot_width, **font_config)
                    fig2_single.show()
                    print(f"  - Generated plot: {fig2_single_title}")

                    # Export data (file name derived once from the cleaned merged-model name)
                    boxplot_csv_name = f"subtask_difference_boxplot_{cleaned_merged_short_name_plot.lower().replace(' ', '_')}.csv"
                    plot_data_box_single.to_csv(os.path.join(csv_export_dir, boxplot_csv_name), index=False)
                    print(f"Exported plot data to: {csv_export_dir}/{boxplot_csv_name}")
                else:
                    print(f"  - No valid data for difference boxplot including {cleaned_merged_short_name_plot} (after 'nan' group filter or all values were NaN).")
    else:
        print(f"Skipping {plot_title_prefix} details: 'd_coder' for subtasks not available/all NaN, or merged diffs not available.")
elif not subtasks_comp_df.empty and 'group' not in subtasks_comp_df.columns:
    print(f"Skipping Subtask Difference Boxplots: Group column ('{SUBTASK_GROUP_COL}') missing.")
elif not can_calc_diffs:
    print("Skipping Subtask Difference Boxplots: Difference calculation disabled.")
else:
    print(f"Skipping Subtask Difference Boxplots: Other unmet conditions (subtasks_empty: {subtasks_comp_df.empty}, instruct_in_subtasks: {instruct_model in subtasks_comp_df.columns if instruct_model and not subtasks_comp_df.empty else 'N/A'}).")

# Plot 3: Absolute Score Boxplots
base_abs_models_plot = [m for m in [base_model, instruct_model, coder_model] if m and not subtasks_comp_df.empty and m in subtasks_comp_df.columns and not subtasks_comp_df[m].isna().all()]
if not subtasks_comp_df.empty and 'group' in subtasks_comp_df.columns and base_abs_models_plot:
    plot_title_prefix_abs = "Absolute Score Boxplots"
    print(f"Generating {plot_title_prefix_abs}: Absolute Score Boxplot(s) grouped by '{SUBTASK_GROUP_COL}' (excluding 'nan' group)...")
    present_merged_subtasks = [m for m in merged_models if m in subtasks_comp_df.columns and not subtasks_comp_df[m].isna().all()]

    if not present_merged_subtasks :
        plot_data_abs_base_only = subtasks_comp_df.melt(id_vars=['group', 'subtask_cleaned'], value_vars=base_abs_models_plot, var_name='model_full_name', value_name='score')
        plot_data_abs_base_only.dropna(subset=['score'], inplace=True)
        plot_data_abs_base_only = plot_data_abs_base_only[plot_data_abs_base_only['group'].astype(str).str.lower() != 'nan']
        if not plot_data_abs_base_only.empty:
            plot_data_abs_base_only['model_short_name'] = plot_data_abs_base_only['model_full_name'].map(lambda x: clean_plot_name(short_names.get(x,x)))
            fig_abs_base_title = f'{plot_title_prefix_abs} ({SUBTASK_GROUP_COL}): Base, Instruct & Coder'
            fig_abs_base = px.box(plot_data_abs_base_only, x='group', y='score', color='model_short_name', hover_data=['subtask_cleaned'], labels={'group': SUBTASK_GROUP_COL, 'score': 'Absolute Score (%)', 'model_short_name': 'Model', 'subtask_cleaned': SUBTASK_COL}, title=fig_abs_base_title, category_orders={"group": sorted(plot_data_abs_base_only['group'].astype(str).unique())})
            fig_abs_base.update_xaxes(tickangle=45); fig_abs_base.update_layout(boxmode='group', height=default_plot_height, width=default_plot_width, **font_config); fig_abs_base.show();
            print(f"  - Generated plot: {fig_abs_base_title}")
            
            # Export data
            plot_data_abs_base_only.to_csv(os.path.join(csv_export_dir, "absolute_score_boxplot_base_instruct_coder.csv"), index=False)
            print(f"Exported plot data to: {csv_export_dir}/absolute_score_boxplot_base_instruct_coder.csv")
        else: print("  - No valid data for Base/Instruct/Coder absolute score boxplot (after 'nan' group filter or all scores were NaN).")
    else:
        for merged_m_plot in present_merged_subtasks:
            cleaned_merged_short_name_plot = clean_plot_name(short_names.get(merged_m_plot, merged_m_plot))
            models_for_this_plot = base_abs_models_plot + [merged_m_plot]
            plot_data_abs_single = subtasks_comp_df.melt(id_vars=['group', 'subtask_cleaned'], value_vars=models_for_this_plot, var_name='model_full_name', value_name='score')
            plot_data_abs_single.dropna(subset=['score'], inplace=True)
            plot_data_abs_single = plot_data_abs_single[plot_data_abs_single['group'].astype(str).str.lower() != 'nan']
            if not plot_data_abs_single.empty:
                plot_data_abs_single['model_short_name'] = plot_data_abs_single['model_full_name'].map(lambda x: clean_plot_name(short_names.get(x,x)))
                fig_abs_single_title = f'{plot_title_prefix_abs} ({SUBTASK_GROUP_COL}): Incl. {cleaned_merged_short_name_plot}'
                fig_abs_single = px.box(plot_data_abs_single, x='group', y='score', color='model_short_name', hover_data=['subtask_cleaned'], labels={'group': SUBTASK_GROUP_COL, 'score': 'Absolute Score (%)', 'model_short_name': 'Model', 'subtask_cleaned': SUBTASK_COL}, title=fig_abs_single_title, category_orders={"group": sorted(plot_data_abs_single['group'].astype(str).unique())})
                fig_abs_single.update_xaxes(tickangle=45); fig_abs_single.update_layout(boxmode='group', height=default_plot_height, width=default_plot_width, **font_config); fig_abs_single.show();
                print(f"  - Generated plot: {fig_abs_single_title}")
                
                # Export data
                plot_data_abs_single.to_csv(os.path.join(csv_export_dir, f"absolute_score_boxplot_incl_{cleaned_merged_short_name_plot.lower().replace(' ', '_')}.csv"), index=False)
                print(f"Exported plot data to: {csv_export_dir}/absolute_score_boxplot_incl_{cleaned_merged_short_name_plot.lower().replace(' ', '_')}.csv")
            else: print(f"  - No valid data for absolute score boxplot including {cleaned_merged_short_name_plot} (after 'nan' group filter or all scores were NaN).")
elif not subtasks_comp_df.empty and 'group' not in subtasks_comp_df.columns: print(f"Skipping Absolute Score Boxplots: Group column ('{SUBTASK_GROUP_COL}') missing.")
else: print(f"Skipping Absolute Score Boxplots: Other unmet conditions (subtasks_empty: {subtasks_comp_df.empty}, base_abs_models_plot_empty: {not base_abs_models_plot}).")

# Plot 4/5: Jointplot/Scatter for Subtask Differences
# For each merged model, plot the 'd_coder' subtask difference (x) against that
# model's 'd_merged_*' difference (y) twice: once with marginal histograms and
# once coloured by group. Rows whose group stringifies to 'nan' are excluded.
# Both figures share one data frame, which is exported to CSV once per model.
plot_dcoder_col_sub = (
    'd_coder'
    if ('d_coder' in diff_cols_subtasks
        and not subtasks_comp_df.empty
        and 'd_coder' in subtasks_comp_df.columns
        and not subtasks_comp_df['d_coder'].isna().all())
    else None
)
plot_merged_diff_cols_sub_valid = [
    c for c in diff_cols_subtasks
    if (not subtasks_comp_df.empty
        and c.startswith('d_merged_')
        and c in subtasks_comp_df.columns
        and not subtasks_comp_df[c].isna().all())
]
if (not subtasks_comp_df.empty and plot_dcoder_col_sub and plot_merged_diff_cols_sub_valid
        and can_calc_diffs and 'group' in subtasks_comp_df.columns
        and 'subtask_cleaned' in subtasks_comp_df.columns):
    print("Generating Jointplot/Scatter(s) for Subtask Differences (I-C vs I-MergedX) (excluding 'nan' group)...")
    for merged_diff_col in plot_merged_diff_cols_sub_valid:
        original_merged_short_name = merged_diff_col.replace('d_merged_', '')
        cleaned_merged_short_name_plot = clean_plot_name(original_merged_short_name)
        required_cols_joint = [plot_dcoder_col_sub, merged_diff_col, 'subtask_cleaned', 'group']

        # Keep only rows where both differences are present and the group is real.
        plot_data_joint = subtasks_comp_df[required_cols_joint].dropna(
            subset=[plot_dcoder_col_sub, merged_diff_col]
        )
        # Explicit .copy(): a boolean-filtered frame is flagged as a potential
        # copy by pandas, so adding the hover column below would otherwise emit
        # SettingWithCopyWarning whenever the filter removed rows.
        plot_data_joint = plot_data_joint[
            plot_data_joint['group'].astype(str).str.lower() != 'nan'
        ].copy()

        if plot_data_joint.empty:
            print(f"  - No valid data for Jointplot/Scatter for {cleaned_merged_short_name_plot} after NaN drop and 'nan' group filter.")
            continue

        plot_data_joint['hover_name_joint'] = (
            plot_data_joint['subtask_cleaned'] + " (Group: " + plot_data_joint['group'].astype(str) + ")"
        )
        title_joint = f'Joint Dist: ({instruct_short_label}-{coder_short_label}) vs ({instruct_short_label}-{cleaned_merged_short_name_plot}) Subtask Diffs'
        labels_joint = {
            plot_dcoder_col_sub: f'{instruct_short_label}–{coder_short_label} Diff (%)',
            merged_diff_col: f'{instruct_short_label}–{cleaned_merged_short_name_plot} Diff (%)',
        }

        # Figure 1: scatter with marginal histograms and an OLS trendline.
        fig_joint = px.scatter(
            plot_data_joint,
            x=plot_dcoder_col_sub,
            y=merged_diff_col,
            marginal_x="histogram",
            marginal_y="histogram",
            trendline="ols",
            hover_name='hover_name_joint',
            hover_data={plot_dcoder_col_sub: ':.2f', merged_diff_col: ':.2f', 'group': True, 'subtask_cleaned': False, 'hover_name_joint': False},
            labels=labels_joint,
            title=title_joint,
        )
        fig_joint.update_layout(height=default_plot_height, width=default_plot_width, **font_config)
        fig_joint.show()
        print(f"  - Generated Jointplot for {cleaned_merged_short_name_plot}")

        # Figure 2: same axes, points coloured by group.
        fig_scatter_comp = px.scatter(
            plot_data_joint,
            x=plot_dcoder_col_sub,
            y=merged_diff_col,
            color='group',
            trendline="ols",
            hover_name='subtask_cleaned',
            hover_data={'group': True, plot_dcoder_col_sub: ':.2f', merged_diff_col: ':.2f'},
            labels=labels_joint,
            title=title_joint.replace("Joint Dist", "Scatter Comp"),
            color_discrete_map={'nan': 'rgba(0,0,0,0)'},
        )
        fig_scatter_comp.update_layout(height=default_plot_height, width=default_plot_width, **font_config)
        fig_scatter_comp.show()
        print(f"  - Generated Scatter Plot for {cleaned_merged_short_name_plot}")

        # Export data for both plots (same data); build the file name once so
        # the written path and the logged path cannot drift apart.
        joint_csv_name = f"jointplot_scatter_{cleaned_merged_short_name_plot.lower().replace(' ', '_')}_vs_coder.csv"
        plot_data_joint.to_csv(os.path.join(csv_export_dir, joint_csv_name), index=False)
        print(f"Exported plot data to: {csv_export_dir}/{joint_csv_name}")
else:
    print("Skipping Jointplot/Scatter plots: Conditions not met.")

# Plot 6: Top/Bottom Subtask Impact Plot
# For each merged model, "impact" is computed per subtask as
# (d_coder difference) - (d_merged_* difference). The N_top_bottom largest and
# smallest impacts are shown as a horizontal bar chart and exported to CSV.
if (not subtasks_comp_df.empty and plot_dcoder_col_sub and plot_merged_diff_cols_sub_valid
        and 'subtask_cleaned' in subtasks_comp_df.columns and can_calc_diffs
        and 'group' in subtasks_comp_df.columns):
    plot_title_prefix_impact = "Top/Bottom Subtask Impact"
    print(f"Generating {plot_title_prefix_impact} Plot(s) (excluding 'nan' group)...")
    N_top_bottom = 5

    # The 'nan'-group filter does not depend on the merged model, so compute it
    # once instead of copying and re-filtering the full frame per iteration.
    # ('group' is guaranteed to exist by the enclosing condition.)
    impact_source_df = subtasks_comp_df[subtasks_comp_df['group'].astype(str).str.lower() != 'nan']

    for merged_diff_col in plot_merged_diff_cols_sub_valid:
        original_merged_short_name = merged_diff_col.replace('d_merged_', '')
        cleaned_merged_short_name_plot = clean_plot_name(original_merged_short_name)
        impact_col_name = f'impact_{cleaned_merged_short_name_plot}'

        if impact_source_df.empty:
            print(f"  - Skipping Impact Bar Plot for {cleaned_merged_short_name_plot} as data is empty after 'nan' group filter.")
            continue

        # Work on a private copy: columns are overwritten/added below, and
        # assigning into the filtered frame directly would raise
        # SettingWithCopyWarning and leak per-model columns across iterations.
        temp_df_impact = impact_source_df.copy()
        temp_df_impact[merged_diff_col] = pd.to_numeric(temp_df_impact[merged_diff_col], errors='coerce')
        temp_df_impact[plot_dcoder_col_sub] = pd.to_numeric(temp_df_impact[plot_dcoder_col_sub], errors='coerce')
        temp_df_impact[impact_col_name] = temp_df_impact[plot_dcoder_col_sub] - temp_df_impact[merged_diff_col]
        subtasks_sorted_impact = temp_df_impact.dropna(subset=[impact_col_name]).sort_values(impact_col_name)

        # Need at least N rows for each end of the ranking.
        if len(subtasks_sorted_impact) < N_top_bottom * 2:
            print(f"  - Not enough data for Top/Bottom {N_top_bottom} Impact plot for {cleaned_merged_short_name_plot}. Found {len(subtasks_sorted_impact)} after filtering and NaN drop.")
            continue

        top_n_impact = subtasks_sorted_impact.nlargest(N_top_bottom, impact_col_name)
        bottom_n_impact = subtasks_sorted_impact.nsmallest(N_top_bottom, impact_col_name)
        # A subtask label may only appear once on the categorical y axis.
        plot_data_bar_impact = pd.concat([top_n_impact, bottom_n_impact]).drop_duplicates(subset=['subtask_cleaned'])
        if plot_data_bar_impact.empty:
            print(f"  - No data for Impact Bar Plot for {cleaned_merged_short_name_plot} after selecting top/bottom.")
            continue

        hover_cols_bar = {
            'subtask_cleaned': False,
            'group': True,
            impact_col_name: ':.2f',
            plot_dcoder_col_sub: ':.2f',
            merged_diff_col: ':.2f',
        }
        hover_cols_bar_present = {
            k: v for k, v in hover_cols_bar.items()
            if k in plot_data_bar_impact.columns or k in [plot_dcoder_col_sub, merged_diff_col]
        }
        labels_bar = {
            'subtask_cleaned': SUBTASK_COL,
            'group': SUBTASK_GROUP_COL,
            impact_col_name: f'Impact ({cleaned_merged_short_name_plot} vs {coder_short_label}) Rel. to {instruct_short_label} (%)',
        }
        fig_bar_impact_title = f'{plot_title_prefix_impact}: Top/Bottom {N_top_bottom} Subtasks - Rel. Impact of {cleaned_merged_short_name_plot} vs {coder_short_label}'
        fig_bar_impact = px.bar(
            plot_data_bar_impact,
            x=impact_col_name,
            y='subtask_cleaned',
            orientation='h',
            color=impact_col_name,
            color_continuous_scale=px.colors.diverging.RdBu,
            color_continuous_midpoint=0,
            hover_data=hover_cols_bar_present,
            labels=labels_bar,
            title=fig_bar_impact_title,
        )
        fig_bar_impact.update_layout(yaxis={'categoryorder': 'total ascending'}, height=default_plot_height, width=default_plot_width, **font_config)
        fig_bar_impact.show()
        print(f"  - Generated plot: {fig_bar_impact_title}")

        # Export data; the file name is built once so the log matches the file.
        impact_csv_name = f"top_bottom_impact_{cleaned_merged_short_name_plot.lower().replace(' ', '_')}.csv"
        plot_data_bar_impact.to_csv(os.path.join(csv_export_dir, impact_csv_name), index=False)
        print(f"Exported plot data to: {csv_export_dir}/{impact_csv_name}")
else:
    print("Skipping Top/Bottom Subtask Impact Plot(s): Conditions not met.")

# Plot 7: Clustermap/Dendrogram for Subtasks
# Per merged model: standardise the two difference columns (d_coder and the
# model's d_merged_* column), order subtasks by average-linkage hierarchical
# clustering, then show a heatmap in that order plus a row dendrogram. Both the
# scaled/ordered matrix and the unscaled input matrix are exported to CSV.
if (not subtasks_comp_df.empty and 'subtask_cleaned' in subtasks_comp_df.columns and
    plot_dcoder_col_sub and plot_merged_diff_cols_sub_valid and can_calc_diffs):
    plot_title_prefix_cluster = "Clustered Heatmap/Dendrogram"
    print(f"Generating {plot_title_prefix_cluster}(s) for Subtasks (I-C vs I-MergedX) (excluding 'nan' group)...")
    for merged_diff_col in plot_merged_diff_cols_sub_valid:
        original_merged_short_name = merged_diff_col.replace('d_merged_', '')
        cleaned_merged_short_name_plot = clean_plot_name(original_merged_short_name)
        cluster_cols_single = [plot_dcoder_col_sub, merged_diff_col]

        # Start from a fresh copy and drop rows whose group stringifies to 'nan'.
        plot7_data_source_df = subtasks_comp_df.copy()
        if 'group' in plot7_data_source_df.columns:
            keep_rows = plot7_data_source_df['group'].astype(str).str.lower() != 'nan'
            plot7_data_source_df = plot7_data_source_df[keep_rows]

        if plot7_data_source_df.empty:
            print(f"  - Skipping clustering for {cleaned_merged_short_name_plot} as data is empty after 'nan' group filter.")
            continue

        # Subtask-indexed numeric matrix; rows with any missing value are dropped.
        diff_matrix_single_prep = plot7_data_source_df.set_index('subtask_cleaned')[cluster_cols_single].copy()
        for col in cluster_cols_single:
            diff_matrix_single_prep[col] = pd.to_numeric(diff_matrix_single_prep[col], errors='coerce')
        diff_matrix_single = diff_matrix_single_prep.dropna(how='any')

        # Clustering needs at least two observations.
        if not len(diff_matrix_single) > 1:
            print(f"  - Not enough data points (>1) for {cleaned_merged_short_name_plot} for clustering (after NaN drop and 'nan' group filter).")
            continue

        try:
            scaler = StandardScaler()
            scaled_data_single = scaler.fit_transform(diff_matrix_single.values)
            column_spread = np.std(scaled_data_single, axis=0)
            if np.any(column_spread < 1e-9):
                print(f"  - Skipping clustering for {cleaned_merged_short_name_plot} due to near-zero standard deviation in one or more dimensions after scaling (possibly identical values).")
                continue

            # Leaf order of the row linkage defines the heatmap's row order.
            row_linkage_single = linkage(pdist(scaled_data_single), method='average', metric='euclidean')
            ordered_row_indices_single = leaves_list(row_linkage_single)
            heatmap_data_ordered_single = scaled_data_single[ordered_row_indices_single]
            ordered_row_labels_single = diff_matrix_single.index[ordered_row_indices_single].tolist()
            heatmap_col_labels_single = [
                f'{instruct_short_label}-{coder_short_label}',
                f'{instruct_short_label}-{cleaned_merged_short_name_plot}',
            ]

            fig_heatmap_single_title = f'{plot_title_prefix_cluster}: {cleaned_merged_short_name_plot} Profile (Scaled)'
            fig_heatmap_single = px.imshow(
                heatmap_data_ordered_single,
                labels=dict(x="Difference Type (vs Instruct)", y=SUBTASK_COL, color="Scaled Value"),
                x=heatmap_col_labels_single,
                y=ordered_row_labels_single,
                aspect="auto",
                color_continuous_scale='RdBu_r',
                title=fig_heatmap_single_title,
            )
            fig_heatmap_single.update_xaxes(side="top")
            # Grow the figure with the number of rows so labels stay legible.
            fig_heatmap_single.update_layout(
                height=max(default_plot_height, 20*len(ordered_row_labels_single)),
                width=default_plot_width,
                **font_config
            )
            fig_heatmap_single.show()
            print(f"  - Generated plot: {fig_heatmap_single_title}")

            fig_dendro_row_single_title = f'{plot_title_prefix_cluster}: Row Dendrogram - {cleaned_merged_short_name_plot} Profile (Scaled Subtask Differences)'
            fig_dendro_row_single = ff.create_dendrogram(
                scaled_data_single,
                orientation='right',
                labels=diff_matrix_single.index.tolist(),
                linkagefun=lambda x: linkage(x, method='average', metric='euclidean'),
            )
            fig_dendro_row_single.update_layout(
                title=fig_dendro_row_single_title,
                height=max(default_plot_height, 20*len(diff_matrix_single)),
                width=default_plot_width,
                **font_config
            )
            fig_dendro_row_single.show()
            print(f"  - Generated plot: {fig_dendro_row_single_title}")

            # Export clustering data (scaled values in clustered row order)
            clustering_data = pd.DataFrame(
                heatmap_data_ordered_single,
                columns=heatmap_col_labels_single,
                index=ordered_row_labels_single,
            )
            heatmap_csv_name = f"clustering_heatmap_{cleaned_merged_short_name_plot.lower().replace(' ', '_')}.csv"
            clustering_data.to_csv(os.path.join(csv_export_dir, heatmap_csv_name))
            print(f"Exported clustering data to: {csv_export_dir}/{heatmap_csv_name}")

            # Export original data used for clustering (unscaled, unordered)
            original_csv_name = f"clustering_original_data_{cleaned_merged_short_name_plot.lower().replace(' ', '_')}.csv"
            diff_matrix_single.to_csv(os.path.join(csv_export_dir, original_csv_name))
            print(f"Exported original clustering data to: {csv_export_dir}/{original_csv_name}")
        except Exception as e:
            # Best effort: one failing model must not stop the remaining plots.
            print(f"Error clustering/plotting for {cleaned_merged_short_name_plot}: {e}")
else:
    print("Skipping Clustered Heatmap/Dendrogram plots: Conditions not met.")

# Plot 8a: Main Task Dendrograms
# Per merged model: cluster the rows of summary_comp_df on the two difference
# columns (d_coder and the model's d_merged_* column) with Ward linkage, draw
# the dendrogram, and export the numeric matrix that was clustered.
plot_dcoder_col_main = (
    'd_coder'
    if (not summary_comp_df.empty
        and 'd_coder' in summary_comp_df.columns
        and not summary_comp_df['d_coder'].isna().all())
    else None
)
plot_merged_diff_cols_main_valid = [
    c for c in diff_cols_main
    if (not summary_comp_df.empty
        and c.startswith('d_merged_')
        and c in summary_comp_df.columns
        and not summary_comp_df[c].isna().all())
]
if plot_dcoder_col_main and plot_merged_diff_cols_main_valid and not summary_comp_df.empty and can_calc_diffs:
    plot_title_prefix_dendro_main = "Main Task Dendrogram"
    print(f"Generating {plot_title_prefix_dendro_main}(s) (I-C vs I-MergedX)...")
    for merged_diff_col_main in plot_merged_diff_cols_main_valid:
        original_merged_short_name = merged_diff_col_main.replace('d_merged_', '')
        cleaned_merged_short_name_plot = clean_plot_name(original_merged_short_name)
        cols_main_single = [plot_dcoder_col_main, merged_diff_col_main]

        # Coerce to numeric and keep only rows complete in both columns.
        main_matrix_data_single_prep = summary_comp_df[cols_main_single].copy()
        for col in cols_main_single:
            main_matrix_data_single_prep[col] = pd.to_numeric(main_matrix_data_single_prep[col], errors='coerce')
        main_matrix_data_single = main_matrix_data_single_prep.dropna(how='any')

        # Hierarchical clustering needs at least two observations.
        if len(main_matrix_data_single) < 2:
            print(f"  - Not enough data points (>1) for Main Task Dendrogram for {cleaned_merged_short_name_plot} after NaN drop.")
            continue

        try:
            fig_dendro_main_s_title = f'{plot_title_prefix_dendro_main}: {xaxis_title_main_tasks} based on {cleaned_merged_short_name_plot} Profile'
            fig_dendro_main_s = ff.create_dendrogram(
                main_matrix_data_single.values,
                labels=main_matrix_data_single.index.tolist(),
                linkagefun=lambda x: linkage(x, method='ward'),
            )
            # Widen the figure when there are many leaves so tick labels fit.
            dynamic_width_dendro_main = max(default_plot_width, 30 * len(main_matrix_data_single.index))
            fig_dendro_main_s.update_layout(
                title=fig_dendro_main_s_title,
                yaxis_title='Distance',
                xaxis_title='Task',
                width=dynamic_width_dendro_main,
                height=default_plot_height,
                **font_config
            )
            fig_dendro_main_s.show()
            print(f"  - Generated plot: {fig_dendro_main_s_title}")

            # Export dendrogram data; build the file name once so the log matches.
            dendro_csv_name = f"main_task_dendrogram_{cleaned_merged_short_name_plot.lower().replace(' ', '_')}.csv"
            main_matrix_data_single.to_csv(os.path.join(csv_export_dir, dendro_csv_name))
            print(f"Exported dendrogram data to: {csv_export_dir}/{dendro_csv_name}")
        except Exception as e:
            # Best effort: one failing model must not stop the remaining plots.
            print(f"Could not generate Main Tasks Dendrogram for {cleaned_merged_short_name_plot}: {e}")
else:
    # Each flag is named after the failing state, so True always means
    # "this precondition was NOT met" (the previous label for the d_coder flag
    # read as the opposite of what the boolean expressed).
    print(
        "Skipping Main Task Dendrogram(s): Conditions not met "
        f"(plot_dcoder_col_main_missing: {plot_dcoder_col_main is None}, "
        f"plot_merged_diff_cols_main_valid_empty: {not plot_merged_diff_cols_main_valid}, "
        f"summary_comp_df_empty: {summary_comp_df.empty}, or diffs not calculable)."
    )

print("Skipping Subtask Dendrograms (previously Plot 8b) as it's covered by Plot 7's row dendrograms if subtask data is used for clustering.")

# Plot 9: Absolute Scores by Linguistic Competency Group (Box Plot)
# One grouped box plot of absolute subtask scores per 'group' value, coloured
# by model category (Base / Instruct / Coder / Merged / Other). Rows whose
# group stringifies to 'nan' are excluded; the melted data is exported to CSV.
plot_title_prefix_abs_group_box = f"Absolute Score Distribution by {SUBTASK_GROUP_COL}"
print(f"\n--- Generating {plot_title_prefix_abs_group_box} (Box Plot) ---")

# Preconditions are checked one at a time so each failure gets its own message.
if subtasks_comp_df.empty:
    print(f"Skipping {plot_title_prefix_abs_group_box}: subtasks_comp_df is empty.")
elif 'group' not in subtasks_comp_df.columns:
    print(f"Skipping {plot_title_prefix_abs_group_box}: 'group' column (expected from {SUBTASK_GROUP_COL}) not in subtasks_comp_df.")
elif not SUBTASK_GROUP_COL:
    print(f"Skipping {plot_title_prefix_abs_group_box}: SUBTASK_GROUP_COL is not defined.")
elif 'subtask_cleaned' not in subtasks_comp_df.columns:
    # melt() below uses 'subtask_cleaned' as an id column and would raise
    # KeyError without it; skip with a message like the other plot sections.
    print(f"Skipping {plot_title_prefix_abs_group_box}: 'subtask_cleaned' column not in subtasks_comp_df.")
else:
    plot_data_p9_box_filtered = subtasks_comp_df[subtasks_comp_df['group'].astype(str).str.lower() != 'nan'].copy()

    if plot_data_p9_box_filtered.empty:
        print(f"Skipping {plot_title_prefix_abs_group_box}: Data is empty after filtering 'nan' group from {SUBTASK_GROUP_COL} column.")
    else:
        # Only models that have a column with at least one non-NaN score.
        models_to_plot_p9_box = [
            m for m in comparison_models
            if m in plot_data_p9_box_filtered.columns and not plot_data_p9_box_filtered[m].isna().all()
        ]

        if not models_to_plot_p9_box:
            print(f"Skipping {plot_title_prefix_abs_group_box}: No valid models found in filtered subtask data for plotting.")
        else:
            # Long format: one row per (group, subtask, model) score.
            melted_data_p9_box = plot_data_p9_box_filtered.melt(
                id_vars=['group', 'subtask_cleaned'],
                value_vars=models_to_plot_p9_box,
                var_name='model_full_name',
                value_name='score'
            )
            melted_data_p9_box.dropna(subset=['score'], inplace=True)

            if melted_data_p9_box.empty:
                print(f"Skipping {plot_title_prefix_abs_group_box}: No data to plot after melting and NaN removal.")
            else:
                melted_data_p9_box['Model Short Name'] = melted_data_p9_box['model_full_name'].map(
                    lambda x: clean_plot_name(short_names.get(x, x))
                )

                def get_model_type_for_color(full_name):
                    """Return the colour category for a full model name."""
                    if full_name == base_model:
                        return "Qwen2.5 Base"
                    if full_name == instruct_model:
                        return "Qwen2.5 Instruct"
                    if full_name == coder_model:
                        return "Qwen2.5 Coder"
                    if full_name in merged_models:
                        return 'Merged'
                    return 'Other'

                melted_data_p9_box['Model Type for Color'] = melted_data_p9_box['model_full_name'].apply(get_model_type_for_color)

                color_map_p9_box = {
                    "Qwen2.5 Base": 'rgb(100, 149, 237)',
                    "Qwen2.5 Instruct": 'rgb(50, 205, 50)',
                    "Qwen2.5 Coder": 'rgb(255, 165, 0)',
                    'Merged': 'rgb(192, 192, 192)',
                    'Other': 'grey'
                }

                fig9_box_title = plot_title_prefix_abs_group_box
                fig9_box = px.box(
                    melted_data_p9_box,
                    x='group',
                    y='score',
                    color='Model Type for Color',
                    color_discrete_map=color_map_p9_box,
                    hover_data=['subtask_cleaned', 'Model Short Name'],
                    labels={'group': SUBTASK_GROUP_COL, 'score': 'Absolute Score (%)',
                            'Model Type for Color': 'Model Category',
                            'subtask_cleaned': SUBTASK_COL},
                    title=fig9_box_title,
                    category_orders={"group": sorted(melted_data_p9_box['group'].astype(str).unique())}
                )

                fig9_box.update_xaxes(tickangle=45)
                fig9_box.update_layout(
                    boxmode='group',
                    height=max(default_plot_height, 700),
                    width=default_plot_width,
                    showlegend=False,  # Legend intentionally hidden
                    **font_config
                )
                fig9_box.show()
                print(f"Generated plot: {fig9_box_title}")

                # Export data
                melted_data_p9_box.to_csv(os.path.join(csv_export_dir, "absolute_score_distribution_by_competency_groups.csv"), index=False)
                print(f"Exported plot data to: {csv_export_dir}/absolute_score_distribution_by_competency_groups.csv")

# Closing banner: says where the CSV exports were written and what kind of data
# each plot family's file contains, then marks the end of the script.
export_summary_lines = [
    "\n--- CSV Export Summary ---",
    f"All plot data has been exported to the '{csv_export_dir}' directory.",
    "Each plot has its own CSV file with the data used to generate the visualization.",
    "The exported files include:",
    "- Line charts: data in long format with tasks/models/values",
    "- Bar charts: data with grouping variables and scores",
    "- Box plots: melted data with grouping and score variables",
    "- Scatter/joint plots: paired difference values with metadata",
    "- Impact plots: top/bottom ranked data with impact scores",
    "- Clustering plots: both original and processed (scaled/ordered) data",
    "- Dendrograms: data matrices used for hierarchical clustering",
    "- Correlation matrices: correlation coefficients between variables",
]
for summary_line in export_summary_lines:
    print(summary_line)

print("\n--- Script Finished ---")


--- Model Categorization ---
Base Model: Qwen__Qwen2.5-7B (Qwen2.5 Base)
Instruct Model: Qwen__Qwen2.5-7B-Instruct (Qwen2.5 Instruct)
Coder Model: Qwen__Qwen2.5-Coder-7B (Qwen2.5 Coder)
Merged Models (5):
  - Yuuta208__Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-linear-29 (Linear_29)
  - Yuuta208__Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-task_arithmetic-29 (Task_Arith_29)
  - Yuuta208__Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-dare_ties-29 (DARE_Ties_29)
  - Yuuta208__Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-ties-29 (Ties_29)
  - Yuuta208__Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-slerp-29 (Slerp_29)
-------------------------
Models for comparison (in order): ['Qwen2.5 Base', 'Qwen2.5 Instruct', 'Qwen2.5 Coder', 'Linear', 'Task_Arith', 'DARE_Ties', 'Ties', 'Slerp']
-------------------------
Filtering absolute scores for 'encoding' == 'full'. Original rows: 30901
Rows after filtering for encoding == 'full': 28741

--- Summary DataFrame (Comparison Models from Absolute Scores) 

Generated Plot: Merged Model Performance Category Counts
Exported plot data to: plot_data_csv_exports/merged_model_performance_counts.csv

--- Correlation Matrix of Performance Categories Between Merged Models (Main Tasks Overall) ---
Merged Model    Linear  Task_Arith  DARE_Ties      Ties     Slerp
Merged Model                                                     
Linear        1.000000    0.999988   0.974642  0.997960 -0.195075
Task_Arith    0.999988    1.000000   0.973527  0.997634 -0.199908
DARE_Ties     0.974642    0.973527   1.000000  0.986938  0.029344
Ties          0.997960    0.997634   0.986938  1.000000 -0.132068
Slerp        -0.195075   -0.199908   0.029344 -0.132068  1.000000


Generated Plot: Correlation Heatmap of Performance Categories
Exported correlation matrix to: plot_data_csv_exports/performance_categories_correlation_matrix.csv

--- Generating Original Plots (Based on Absolute Scores) ---


Generated plot: Difference Trends on Main Tasks (vs Qwen2.5 Instruct) (X-axis: main_task_category)
Exported plot data to: plot_data_csv_exports/difference_trends_main_tasks.csv


Generated plot: Absolute Performance on Main Tasks (X-axis: main_task_category)
Exported plot data to: plot_data_csv_exports/absolute_performance_main_tasks_line.csv


Generated plot: Absolute Performance Comparison by probing dataset (Faceted by probing dataset)
Exported plot data to: plot_data_csv_exports/absolute_performance_main_tasks_bar.csv

--- Generating Bar Chart by linguistic competencies (Mean Scores) ---


Generated plot: Mean Absolute Performance by linguistic competencies (Faceted by linguistic competencies)
Exported plot data to: plot_data_csv_exports/mean_performance_by_competency_groups_bar.csv
Generating Subtask Difference Boxplots: Subtask Difference Boxplot(s) grouped by 'linguistic competencies' (excluding 'nan' group)...


  - Generated plot: Subtask Difference Boxplots (linguistic competencies): Qwen2.5 Instruct–Qwen2.5 Coder vs Qwen2.5 Instruct–Linear
Exported plot data to: plot_data_csv_exports/subtask_difference_boxplot_linear.csv


  - Generated plot: Subtask Difference Boxplots (linguistic competencies): Qwen2.5 Instruct–Qwen2.5 Coder vs Qwen2.5 Instruct–Task_Arith
Exported plot data to: plot_data_csv_exports/subtask_difference_boxplot_task_arith.csv


  - Generated plot: Subtask Difference Boxplots (linguistic competencies): Qwen2.5 Instruct–Qwen2.5 Coder vs Qwen2.5 Instruct–DARE_Ties
Exported plot data to: plot_data_csv_exports/subtask_difference_boxplot_dare_ties.csv


  - Generated plot: Subtask Difference Boxplots (linguistic competencies): Qwen2.5 Instruct–Qwen2.5 Coder vs Qwen2.5 Instruct–Ties
Exported plot data to: plot_data_csv_exports/subtask_difference_boxplot_ties.csv


  - Generated plot: Subtask Difference Boxplots (linguistic competencies): Qwen2.5 Instruct–Qwen2.5 Coder vs Qwen2.5 Instruct–Slerp
Exported plot data to: plot_data_csv_exports/subtask_difference_boxplot_slerp.csv
Generating Absolute Score Boxplots: Absolute Score Boxplot(s) grouped by 'linguistic competencies' (excluding 'nan' group)...


  - Generated plot: Absolute Score Boxplots (linguistic competencies): Incl. Linear
Exported plot data to: plot_data_csv_exports/absolute_score_boxplot_incl_linear.csv


  - Generated plot: Absolute Score Boxplots (linguistic competencies): Incl. Task_Arith
Exported plot data to: plot_data_csv_exports/absolute_score_boxplot_incl_task_arith.csv


  - Generated plot: Absolute Score Boxplots (linguistic competencies): Incl. DARE_Ties
Exported plot data to: plot_data_csv_exports/absolute_score_boxplot_incl_dare_ties.csv


  - Generated plot: Absolute Score Boxplots (linguistic competencies): Incl. Ties
Exported plot data to: plot_data_csv_exports/absolute_score_boxplot_incl_ties.csv


  - Generated plot: Absolute Score Boxplots (linguistic competencies): Incl. Slerp
Exported plot data to: plot_data_csv_exports/absolute_score_boxplot_incl_slerp.csv
Generating Jointplot/Scatter(s) for Subtask Differences (I-C vs I-MergedX) (excluding 'nan' group)...


  - Generated Jointplot for Linear


  - Generated Scatter Plot for Linear
Exported plot data to: plot_data_csv_exports/jointplot_scatter_linear_vs_coder.csv


  - Generated Jointplot for Task_Arith


  - Generated Scatter Plot for Task_Arith
Exported plot data to: plot_data_csv_exports/jointplot_scatter_task_arith_vs_coder.csv


  - Generated Jointplot for DARE_Ties


  - Generated Scatter Plot for DARE_Ties
Exported plot data to: plot_data_csv_exports/jointplot_scatter_dare_ties_vs_coder.csv


  - Generated Jointplot for Ties


  - Generated Scatter Plot for Ties
Exported plot data to: plot_data_csv_exports/jointplot_scatter_ties_vs_coder.csv


  - Generated Jointplot for Slerp


  - Generated Scatter Plot for Slerp
Exported plot data to: plot_data_csv_exports/jointplot_scatter_slerp_vs_coder.csv
Generating Top/Bottom Subtask Impact Plot(s) (excluding 'nan' group)...


  - Generated plot: Top/Bottom Subtask Impact: Top/Bottom 5 Subtasks - Rel. Impact of Linear vs Qwen2.5 Coder
Exported plot data to: plot_data_csv_exports/top_bottom_impact_linear.csv


  - Generated plot: Top/Bottom Subtask Impact: Top/Bottom 5 Subtasks - Rel. Impact of Task_Arith vs Qwen2.5 Coder
Exported plot data to: plot_data_csv_exports/top_bottom_impact_task_arith.csv


  - Generated plot: Top/Bottom Subtask Impact: Top/Bottom 5 Subtasks - Rel. Impact of DARE_Ties vs Qwen2.5 Coder
Exported plot data to: plot_data_csv_exports/top_bottom_impact_dare_ties.csv


  - Generated plot: Top/Bottom Subtask Impact: Top/Bottom 5 Subtasks - Rel. Impact of Ties vs Qwen2.5 Coder
Exported plot data to: plot_data_csv_exports/top_bottom_impact_ties.csv


  - Generated plot: Top/Bottom Subtask Impact: Top/Bottom 5 Subtasks - Rel. Impact of Slerp vs Qwen2.5 Coder
Exported plot data to: plot_data_csv_exports/top_bottom_impact_slerp.csv
Generating Clustered Heatmap/Dendrogram(s) for Subtasks (I-C vs I-MergedX) (excluding 'nan' group)...


  - Generated plot: Clustered Heatmap/Dendrogram: Linear Profile (Scaled)


  - Generated plot: Clustered Heatmap/Dendrogram: Row Dendrogram - Linear Profile (Scaled Subtask Differences)
Exported clustering data to: plot_data_csv_exports/clustering_heatmap_linear.csv
Exported original clustering data to: plot_data_csv_exports/clustering_original_data_linear.csv


  - Generated plot: Clustered Heatmap/Dendrogram: Task_Arith Profile (Scaled)


  - Generated plot: Clustered Heatmap/Dendrogram: Row Dendrogram - Task_Arith Profile (Scaled Subtask Differences)
Exported clustering data to: plot_data_csv_exports/clustering_heatmap_task_arith.csv
Exported original clustering data to: plot_data_csv_exports/clustering_original_data_task_arith.csv


  - Generated plot: Clustered Heatmap/Dendrogram: DARE_Ties Profile (Scaled)


  - Generated plot: Clustered Heatmap/Dendrogram: Row Dendrogram - DARE_Ties Profile (Scaled Subtask Differences)
Exported clustering data to: plot_data_csv_exports/clustering_heatmap_dare_ties.csv
Exported original clustering data to: plot_data_csv_exports/clustering_original_data_dare_ties.csv


  - Generated plot: Clustered Heatmap/Dendrogram: Ties Profile (Scaled)


  - Generated plot: Clustered Heatmap/Dendrogram: Row Dendrogram - Ties Profile (Scaled Subtask Differences)
Exported clustering data to: plot_data_csv_exports/clustering_heatmap_ties.csv
Exported original clustering data to: plot_data_csv_exports/clustering_original_data_ties.csv


  - Generated plot: Clustered Heatmap/Dendrogram: Slerp Profile (Scaled)


  - Generated plot: Clustered Heatmap/Dendrogram: Row Dendrogram - Slerp Profile (Scaled Subtask Differences)
Exported clustering data to: plot_data_csv_exports/clustering_heatmap_slerp.csv
Exported original clustering data to: plot_data_csv_exports/clustering_original_data_slerp.csv
Generating Main Task Dendrogram(s) (I-C vs I-MergedX)...


  - Generated plot: Main Task Dendrogram: main_task_category based on Linear Profile
Exported dendrogram data to: plot_data_csv_exports/main_task_dendrogram_linear.csv


  - Generated plot: Main Task Dendrogram: main_task_category based on Task_Arith Profile
Exported dendrogram data to: plot_data_csv_exports/main_task_dendrogram_task_arith.csv


  - Generated plot: Main Task Dendrogram: main_task_category based on DARE_Ties Profile
Exported dendrogram data to: plot_data_csv_exports/main_task_dendrogram_dare_ties.csv


  - Generated plot: Main Task Dendrogram: main_task_category based on Ties Profile
Exported dendrogram data to: plot_data_csv_exports/main_task_dendrogram_ties.csv


  - Generated plot: Main Task Dendrogram: main_task_category based on Slerp Profile
Exported dendrogram data to: plot_data_csv_exports/main_task_dendrogram_slerp.csv
Skipping Subtask Dendrograms (previously Plot 8b) as it's covered by Plot 7's row dendrograms if subtask data is used for clustering.

--- Generating Absolute Score Distribution by linguistic competencies (Box Plot) ---


Generated plot: Absolute Score Distribution by linguistic competencies
Exported plot data to: plot_data_csv_exports/absolute_score_distribution_by_competency_groups.csv

--- CSV Export Summary ---
All plot data has been exported to the 'plot_data_csv_exports' directory.
Each plot has its own CSV file with the data used to generate the visualization.
The exported files include:
- Line charts: data in long format with tasks/models/values
- Bar charts: data with grouping variables and scores
- Box plots: melted data with grouping and score variables
- Scatter/joint plots: paired difference values with metadata
- Impact plots: top/bottom ranked data with impact scores
- Clustering plots: both original and processed (scaled/ordered) data
- Dendrograms: data matrices used for hierarchical clustering
- Correlation matrices: correlation coefficients between variables

--- Script Finished ---
