In [None]:
# -*- coding: utf-8 -*-

import os
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
from scipy.cluster.hierarchy import linkage, leaves_list
from scipy.spatial.distance import pdist
from sklearn.preprocessing import StandardScaler
from collections import defaultdict
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.io as pio
import re # Added for cleaning plot names
import json # Added to handle potential JSON loading if paths were used

# Use the "plotly_white" template as the default for every figure created
# after this point.
pio.templates.default = "plotly_white"

# Directory that receives the CSV exports of the plotted data. It is created
# at import/run time; exist_ok=True makes re-running the cell harmless.
csv_export_dir = "plot_data_csv_exports"
os.makedirs(csv_export_dir, exist_ok=True)

# --- Helper function for cleaning model names for plots ---
def clean_plot_name(name):
    """Return a display-friendly model label for use in plots.

    ``None`` yields ``"Unknown"``. Any other value is converted with ``str``.
    Labels starting with ``"Qwen2.5"`` are returned unchanged; every other
    label loses a trailing ``_<digits>`` suffix, e.g. ``"Linear_24"`` becomes
    ``"Linear"``.
    """
    if name is None:
        return "Unknown"

    label = str(name)
    if label.startswith("Qwen2.5"):
        # Core Qwen labels are shown exactly as given.
        return label

    # Merged-model labels carry a trailing "_XX" that is dropped for display.
    return re.sub(r'_\d+$', '', label)

# --- Font configuration for plots ---
font_config = dict(
    # Figure title and base text size.
    title_font_size=24,
    font_size=18,
    # Axis titles.
    xaxis_title_font_size=18,
    yaxis_title_font_size=18,
    # Axis tick labels.
    xaxis_tickfont_size=16,
    yaxis_tickfont_size=16,
    # Legend title and entries.
    legend_title_font_size=24,
    legend_font_size=22,
)

# --- Default Plot Dimensions ---
default_plot_height, default_plot_width = 520, 2300

# --- Helper Function from process_results.py (adapted) ---
def process_frame(frame):
    """
    Normalises the columns of a group-info DataFrame in place.

    * Removes the ``"Unnamed: 0"`` column when it is present.
    * Ensures the competency labels live under ``"linguistic competencies"``:
      when only ``"linguistic subfield"`` exists it is copied to that name
      (appended as the last column); ``"linguistic subfield"`` itself is
      removed whenever it is present.

    The frame passed in is modified and the same object is returned.
    """
    legacy_col = "linguistic subfield"
    canonical_col = "linguistic competencies"
    present = frame.columns

    if "Unnamed: 0" in present:
        del frame["Unnamed: 0"]

    if legacy_col in present:
        # Only fall back to the legacy column when the canonical one is absent.
        if canonical_col not in present:
            frame[canonical_col] = frame[legacy_col]
        del frame[legacy_col]

    return frame

# --- Core Analysis Function ---
def run_analysis_for_experiment(experiment_name, models, short_names, abs_data_file, group_data_file,
                                main_task_col, subtask_col, subtask_group_col, subtask_phenomena_col):
    """
    Runs the full data loading and analysis pipeline for a given experiment.

    Models are categorized by their short name: "Qwen2.5 Instruct" is the
    instruct model, a name starting with "Qwen2.5 Coder" or "Qwen2.5 Math" is
    the specialist model, "Qwen2.5 Base" is the base model, and every other
    model is treated as a merged model.

    Args:
        experiment_name: Label used in the console output.
        models: Iterable of full model names (column values in the scores CSV).
        short_names: Dict mapping a full model name to its short display name.
        abs_data_file: Path of the absolute-scores CSV. The columns
            'probing_dataset', 'model_name', 'encoding' and 'score' are read;
            only rows with encoding == 'full' are kept.
        group_data_file: Path of the CSV that maps subtasks to groups.
        main_task_col: Column of the group CSV renamed to 'main_task_category'.
        subtask_col: Column of the group CSV renamed to 'subtask_cleaned'
            (the merge key against the scores).
        subtask_group_col: Column of the group CSV renamed to 'group'.
        subtask_phenomena_col: Accepted for interface compatibility; not used.

    Returns:
        A 6-tuple ``(summary_df, subtasks_df, comparison_models,
        instruct_model, specialist_model, base_model)``:

        * summary_df: mean score per 'group' value (index named
          'main_task_category', one column per model), or an empty DataFrame
          when no grouped data is available.
        * subtasks_df: one row per subtask with one score column per model
          (scaled by 100) plus the merged group columns. Empty when the
          scores file is missing; without group columns when the group file
          is missing.
        * comparison_models: de-duplicated list ordered base, instruct,
          specialist, merged models.
        * instruct_model / specialist_model / base_model: the full names
          identified during categorization (base_model may be None).

    Raises:
        SystemExit: if no instruct or no specialist model can be identified.
            The exception carries the error message, so the process ends with
            a non-zero status instead of relying on the interactive-only
            ``exit()`` helper.
    """
    print("\n" + "="*30)
    print(f" PROCESSING EXPERIMENT: {experiment_name.upper()} ")
    print("="*30 + "\n")

    # --- 1. Model Categorization ---
    instruct_model = None
    specialist_model = None  # Coder or Math model
    merged_models = []
    base_model = None

    for m_full_name in models:
        m_short = short_names.get(m_full_name, "")
        if m_short == "Qwen2.5 Instruct":
            instruct_model = m_full_name
        elif m_short.startswith(("Qwen2.5 Coder", "Qwen2.5 Math")):
            specialist_model = m_full_name
        elif m_short == "Qwen2.5 Base":
            base_model = m_full_name
        else:
            # Anything that is not one of the three core models is a merge.
            merged_models.append(m_full_name)

    if not instruct_model:
        raise SystemExit("CRITICAL ERROR: Instruct model not identified.")
    if not specialist_model:
        raise SystemExit(f"CRITICAL ERROR: Specialist model for {experiment_name} not identified.")

    print(f"--- Model Categorization ({experiment_name}) ---")
    if base_model:
        print(f"Base Model: {base_model} ({short_names.get(base_model, 'N/A')})")
    print(f"Instruct Model: {instruct_model} ({short_names.get(instruct_model, 'N/A')})")
    print(f"Specialist Model: {specialist_model} ({short_names.get(specialist_model, 'N/A')})")
    print(f"Merged Models ({len(merged_models)}):")
    for m in merged_models:
        print(f"  - {m} ({short_names.get(m, 'N/A')})")
    print("-" * 25)

    # Core models first, merged models afterwards; dict.fromkeys removes
    # duplicates while keeping that order, and falsy entries are skipped.
    comparison_models_ordered = [base_model, instruct_model, specialist_model, *merged_models]
    comparison_models = list(dict.fromkeys(m for m in comparison_models_ordered if m))
    print(f"Models for comparison: {[clean_plot_name(short_names.get(m, m)) for m in comparison_models]}")
    print("-" * 25)

    # --- 2. Data Loading ---
    def load_data_from_csv(abs_filepath, group_filepath, model_list):
        """Load scores and group info; return ``(summary_df, subtasks_df)``."""
        try:
            raw_abs_df = pd.read_csv(abs_filepath)
        except FileNotFoundError:
            print(f"Error: Absolute scores file not found at {abs_filepath}")
            return pd.DataFrame(), pd.DataFrame()

        if "Unnamed: 0" in raw_abs_df.columns:
            del raw_abs_df["Unnamed: 0"]

        raw_abs_df.rename(columns={'probing_dataset': 'subtask_cleaned', 'model_name': 'model'}, inplace=True)
        raw_abs_df = raw_abs_df[raw_abs_df['encoding'] == 'full'].copy()
        raw_abs_df['score'] = pd.to_numeric(raw_abs_df['score'], errors='coerce')
        abs_df_grouped = raw_abs_df.groupby(['subtask_cleaned', 'model'])['score'].mean().reset_index()
        abs_pivot_df = abs_df_grouped.pivot_table(index='subtask_cleaned', columns='model', values='score').reset_index()

        # Models without any score still get a (NaN) column so downstream
        # code can rely on every requested model being present.
        for model_col in model_list:
            if model_col not in abs_pivot_df.columns:
                abs_pivot_df[model_col] = np.nan

        cols_to_keep_abs = ['subtask_cleaned'] + list(model_list)
        abs_final_df = abs_pivot_df[cols_to_keep_abs].copy()

        # Scale to percentages before the group file is touched, so the
        # fallback return below uses the same 0-100 scale as the normal path.
        for model_col in model_list:
            abs_final_df[model_col] = pd.to_numeric(abs_final_df[model_col], errors='coerce') * 100

        try:
            raw_group_df = pd.read_csv(group_filepath)
        except FileNotFoundError:
            print(f"Error: Group info file not found at {group_filepath}.")
            return pd.DataFrame(), abs_final_df

        group_df_processed = process_frame(raw_group_df.copy())
        rename_map_group = {
            subtask_col: 'subtask_cleaned',
            subtask_group_col: 'group',
            main_task_col: 'main_task_category'
        }
        group_df_processed.rename(columns=rename_map_group, inplace=True)

        id_cols_group = ['main_task_category', 'subtask_cleaned', 'group']
        id_cols_group_present = [col for col in id_cols_group if col in group_df_processed.columns]
        # One group row per subtask; the first occurrence wins.
        group_info_to_merge = group_df_processed[id_cols_group_present].drop_duplicates(subset=['subtask_cleaned'])

        subtasks_df = pd.merge(abs_final_df, group_info_to_merge, on='subtask_cleaned', how='left')
        models_in_subtasks_df = [m for m in model_list if m in subtasks_df.columns]

        summary_df = pd.DataFrame()
        # The summary is grouped by 'group' (the linguistic competency).
        if 'group' in subtasks_df.columns and models_in_subtasks_df:
            # Drop subtasks that could not be mapped to a competency group.
            known_subtasks = subtasks_df.dropna(subset=['group']).copy()

            # Also drop literal 'nan' strings that may remain after conversion.
            known_subtasks = known_subtasks[known_subtasks['group'].astype(str).str.lower() != 'nan']

            if not known_subtasks.empty:
                summary_df = known_subtasks.groupby('group')[models_in_subtasks_df].mean()
                # The rest of the script expects this index name.
                summary_df.index.name = 'main_task_category'

        return summary_df, subtasks_df

    summary_df, subtasks_df = load_data_from_csv(abs_data_file, group_data_file, comparison_models)

    print(f"\n--- Summary DataFrame ({experiment_name}) ---")
    if not summary_df.empty:
        print(summary_df.head())
    else:
        print("Summary DataFrame is empty.")
    print("-" * 50)

    return summary_df, subtasks_df, comparison_models, instruct_model, specialist_model, base_model

# --- Configuration ---
# Input files: absolute scores and the subtask -> group mapping.
abs_data_file = "results_flash-holmes.csv"
group_data_file = "transformed_results.csv"

# Column names passed to run_analysis_for_experiment for the group file.
MAIN_TASK_COL, SUBTASK_COL = "probing dataset", "probe"
SUBTASK_GROUP_COL, SUBTASK_PHENOMENA_COL = "linguistic competencies", "linguistic phenomena"

# --- CODER Experiment Config ---
# Please define the models you want to analyze here: each full model name
# maps to its short display name. The model list is taken from the mapping
# keys, in the order they are written.
short_names_coder = {
    "Qwen__Qwen2.5-7B": "Qwen2.5 Base",
    "Qwen__Qwen2.5-7B-Instruct": "Qwen2.5 Instruct",
    "Qwen__Qwen2.5-Coder-7B": "Qwen2.5 Coder",
    "_username___Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-task_arithmetic-29": "Task Arithmetic_29",
    "_username___Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-dare_ties-29": "DARE Ties_29",
    "_username___Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-ties-29": "Ties_29",
    "_username___Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-slerp-29": "Slerp_29",
    "_username___Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-linear-29": "Linear_29",
}
models_coder = list(short_names_coder)

# --- MATH Experiment Config ---
# Same layout as above: full model name -> short display name.
short_names_math = {
    "Qwen__Qwen2.5-7B": "Qwen2.5 Base",
    "Qwen__Qwen2.5-7B-Instruct": "Qwen2.5 Instruct",
    "Qwen__Qwen2.5-Math-7B": "Qwen2.5 Math",
    "_username___Qwen2.5-7B-Instruct-Qwen2.5-Math-7B-Merged-task_arithmetic-26": "Task Arithmetic_26",
    "_username___Qwen2.5-7B-Instruct-Qwen2.5-Math-7B-Merged-dare_ties-27": "DARE Ties_27",
    "_username___Qwen2.5-7B-Instruct-Qwen2.5-Math-7B-Merged-ties-26": "Ties_26",
    "_username___Qwen2.5-7B-Instruct-Qwen2.5-Math-7B-Merged-slerp-24": "Slerp_24",
    "_username___Qwen2.5-7B-Instruct-Qwen2.5-Math-7B-Merged-linear-24": "Linear_24",
}
models_math = list(short_names_math)

# --- Run Analysis for Both Experiments ---
# Coder experiment: the per-subtask frame (second return value) is not needed
# for the combined plot and is discarded.
(summary_df_coder, _, comparison_models_coder,
 instruct_model_coder, specialist_model_coder, base_model_coder) = run_analysis_for_experiment(
    experiment_name="Coder",
    models=models_coder,
    short_names=short_names_coder,
    abs_data_file=abs_data_file,
    group_data_file=group_data_file,
    main_task_col=MAIN_TASK_COL,
    subtask_col=SUBTASK_COL,
    subtask_group_col=SUBTASK_GROUP_COL,
    subtask_phenomena_col=SUBTASK_PHENOMENA_COL,
)

# Math experiment: same inputs, different model set.
(summary_df_math, _, comparison_models_math,
 instruct_model_math, specialist_model_math, base_model_math) = run_analysis_for_experiment(
    experiment_name="Math",
    models=models_math,
    short_names=short_names_math,
    abs_data_file=abs_data_file,
    group_data_file=group_data_file,
    main_task_col=MAIN_TASK_COL,
    subtask_col=SUBTASK_COL,
    subtask_group_col=SUBTASK_GROUP_COL,
    subtask_phenomena_col=SUBTASK_PHENOMENA_COL,
)

# --- Combined Plotting Section ---
# Three-line console banner, emitted with a single print call.
print("\n" + "="*30, " GENERATING COMBINED PLOT ", "="*30 + "\n", sep="\n")

def prepare_plot_data(df, comparison_models, short_names_map, base_model, instruct_model, specialist_model):
    """Prepares data for the combined faceted plot from a summary dataframe.

    Args:
        df: Summary DataFrame whose index is named 'main_task_category' and
            whose columns are full model names.
        comparison_models: Full model names to include, in the desired order.
        short_names_map: Dict mapping a full model name to its short name.
        base_model: Full name of the base model (may be None).
        instruct_model: Full name of the instruct model.
        specialist_model: Full name of the specialist model.

    Returns:
        A long-format DataFrame with the columns 'Task', 'model_full_name',
        'Score', 'Model Short Name' and 'Model Type' (one row per task/model
        pair). When ``df`` is empty, or no requested model has any non-NaN
        score, an empty DataFrame with those same columns is returned so that
        callers can index the columns without a KeyError.
    """
    output_columns = ['Task', 'model_full_name', 'Score', 'Model Short Name', 'Model Type']
    if df.empty:
        return pd.DataFrame(columns=output_columns)

    # Skip models that are missing from the summary or have no scores at all.
    models_to_plot = [m for m in comparison_models if m in df.columns and not df[m].isna().all()]
    if not models_to_plot:
        return pd.DataFrame(columns=output_columns)

    # Melt the dataframe to long format
    df_melted = df.reset_index().melt(
        id_vars='main_task_category',
        value_vars=models_to_plot,
        var_name='model_full_name',
        value_name='Score'
    )
    df_melted.rename(columns={'main_task_category': 'Task'}, inplace=True)

    # Map short names and model types
    df_melted['Model Short Name'] = df_melted['model_full_name'].map(
        lambda full_name: clean_plot_name(short_names_map.get(full_name, full_name))
    )

    def get_model_type(full_name):
        # Checked in this order; anything that is not a core model is a merge.
        if full_name == base_model:
            return "Base"
        if full_name == instruct_model:
            return "Instruct"
        if full_name == specialist_model:
            return "Specialist"
        return "Merged"

    df_melted['Model Type'] = df_melted['model_full_name'].apply(get_model_type)

    return df_melted

# Prepare data for both experiments
plot_df_coder = prepare_plot_data(summary_df_coder, comparison_models_coder, short_names_coder, base_model_coder, instruct_model_coder, specialist_model_coder)
plot_df_math = prepare_plot_data(summary_df_math, comparison_models_math, short_names_math, base_model_math, instruct_model_math, specialist_model_math)


def _unique_column_values(frame, column):
    """Return the distinct values of ``column``, or an empty array when ``frame`` lacks it.

    An experiment without data can yield a frame that has no columns at all;
    indexing it directly would raise KeyError before the "no data" check
    further down gets a chance to run.
    """
    if column not in frame.columns:
        return np.array([], dtype=object)
    return frame[column].unique()


# Every task that appears in at least one experiment, sorted for a stable row order.
all_tasks = sorted(set(_unique_column_values(plot_df_coder, 'Task')) | set(_unique_column_values(plot_df_math, 'Task')))

# Category order for the y-axes: merged models (alphabetical) first, then the
# core models in reverse order.
# NOTE(review): plotly appears to place the first entry of `categoryarray` at
# the bottom of a y-axis, which would put the core models at the top with
# "Qwen2.5 Base" uppermost - confirm this is the intended layout.

# For Coder plot y-axis order
all_coder_models = _unique_column_values(plot_df_coder, 'Model Short Name')
core_coder = ['Qwen2.5 Base', 'Qwen2.5 Instruct', 'Qwen2.5 Coder']
core_coder_present = [m for m in core_coder if m in all_coder_models]
merged_coder = sorted(m for m in all_coder_models if m not in core_coder_present)
models_coder_plot_order = merged_coder + core_coder_present[::-1]

# For Math plot y-axis order
all_math_models = _unique_column_values(plot_df_math, 'Model Short Name')
core_math = ['Qwen2.5 Base', 'Qwen2.5 Instruct', 'Qwen2.5 Math']
core_math_present = [m for m in core_math if m in all_math_models]
merged_math = sorted(m for m in all_math_models if m not in core_math_present)
models_math_plot_order = merged_math + core_math_present[::-1]


def _add_experiment_bars(fig, plot_df, summary_df, instruct_model, specialist_model,
                         task, row_num, col_num, n_models, color_map, text_threshold=20):
    """Draw one task's horizontal bars and reference markers into a subplot cell.

    Args:
        fig: Figure created by ``make_subplots``.
        plot_df: Long-format plot data with the columns 'Task',
            'Model Short Name', 'Score' and 'Model Type'.
        summary_df: Summary DataFrame indexed by task with one column per
            full model name; used to look up the reference scores.
        instruct_model / specialist_model: Full model names whose scores are
            drawn as dashed vertical lines, with the span between them shaded.
        task: Task whose rows are plotted.
        row_num, col_num: 1-based subplot position.
        n_models: Number of y-axis categories; sets the height of the shapes.
        color_map: Dict mapping a model type to a colour string.
        text_threshold: Scores below this value get their label outside the
            bar in black; all others get it inside the bar in white.

    Nothing is drawn when ``plot_df`` has no rows for ``task``.
    """
    if 'Task' not in plot_df.columns:
        # A frame without columns means the experiment produced no data.
        return
    df_task = plot_df[plot_df['Task'] == task]
    if df_task.empty:
        return

    text_positions = ['outside' if score < text_threshold else 'inside' for score in df_task['Score']]
    text_colors = ['black' if score < text_threshold else 'white' for score in df_task['Score']]

    fig.add_trace(go.Bar(
        y=df_task['Model Short Name'],
        x=df_task['Score'],
        marker_color=[color_map.get(t) for t in df_task['Model Type']],
        orientation='h',
        text=df_task.apply(lambda row: f"<b>{row['Model Short Name']}</b>: {row['Score']:.1f}%", axis=1),
        textposition=text_positions,
        textfont=dict(size=12, color=text_colors),
        insidetextanchor='middle',
        cliponaxis=False,
        hoverinfo='none'
    ), row=row_num, col=col_num)

    if task in summary_df.index:
        instruct_score = summary_df.loc[task, instruct_model]
        specialist_score = summary_df.loc[task, specialist_model]
    else:
        instruct_score = specialist_score = np.nan

    if pd.notna(instruct_score) and pd.notna(specialist_score):
        # Shapes span every bar slot of the cell (categories sit at 0..n-1).
        y_low, y_high = -0.5, n_models - 0.5
        fig.add_shape(type="rect", x0=min(instruct_score, specialist_score), x1=max(instruct_score, specialist_score), y0=y_low, y1=y_high, fillcolor="rgba(255, 128, 128, 0.2)", line_width=0, layer="below", row=row_num, col=col_num)
        fig.add_shape(type="line", x0=instruct_score, x1=instruct_score, y0=y_low, y1=y_high, line=dict(color=color_map['Instruct'], dash="dash", width=2), layer="above", row=row_num, col=col_num)
        fig.add_shape(type="line", x0=specialist_score, x1=specialist_score, y0=y_low, y1=y_high, line=dict(color=color_map['Specialist'], dash="dash", width=2), layer="above", row=row_num, col=col_num)

    # Leave 25% headroom for labels drawn outside the bars. Skip the explicit
    # range when there is no positive finite maximum to derive it from.
    max_score = df_task['Score'].max()
    if pd.notna(max_score) and max_score > 0:
        fig.update_xaxes(range=[0, max_score * 1.25], row=row_num, col=col_num)


if not all_tasks or (plot_df_coder.empty and plot_df_math.empty):
    print("No data available to generate the combined plot. Exiting plot generation.")
else:
    # One title per subplot cell: each task name is repeated for both columns.
    subplot_titles = [f"<b>{task}</b>" for task in all_tasks for _ in (1,2)]

    fig = make_subplots(
        rows=len(all_tasks),
        cols=2,
        subplot_titles=subplot_titles,
        horizontal_spacing=0.02,
        vertical_spacing=0.04
    )

    color_map = {
        "Base": 'rgb(100, 149, 237)',
        "Instruct": 'rgb(50, 205, 50)',
        "Specialist": 'rgb(255, 165, 0)',
        "Merged": 'rgb(192, 192, 192)'
    }

    # One subplot row per task: column 1 is Coder, column 2 is Math.
    for row_num, task in enumerate(all_tasks, start=1):
        _add_experiment_bars(
            fig, plot_df_coder, summary_df_coder, instruct_model_coder, specialist_model_coder,
            task, row_num, 1, len(models_coder_plot_order), color_map,
        )
        _add_experiment_bars(
            fig, plot_df_math, summary_df_math, instruct_model_math, specialist_model_math,
            task, row_num, 2, len(models_math_plot_order), color_map,
        )

    # Figure height grows with the number of bars per facet and the number of tasks.
    height_per_facet = max(len(models_coder_plot_order), len(models_math_plot_order)) * 25 + 40
    total_height = height_per_facet * len(all_tasks)

    fig.update_layout(
        title_text='<b>Mean Absolute Performance by Linguistic Competency: Coder vs. Math Experiments</b>',
        height=total_height,
        width=2000,
        showlegend=False,
        margin=dict(t=120, l=10, r=10, b=50),  # Extra top margin for the titles
        plot_bgcolor='white',
        **font_config
    )

    # Fix the category order per column; tick labels are hidden because each
    # bar already carries the model name in its text label.
    fig.update_yaxes(categoryorder='array', categoryarray=models_coder_plot_order, showticklabels=False, col=1)
    fig.update_yaxes(categoryorder='array', categoryarray=models_math_plot_order, showticklabels=False, col=2)
    fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightGray', zeroline=False)

    # Format subplot titles (must run before the column titles are added so
    # that those keep their own font size).
    fig.for_each_annotation(lambda a: a.update(font=dict(size=16)))

    # Add main column titles
    fig.add_annotation(x=0.25, y=1.02, yanchor='bottom', text="<b>Coder Experiment</b>", showarrow=False, xref="paper", yref="paper", font=dict(size=20))
    fig.add_annotation(x=0.75, y=1.02, yanchor='bottom', text="<b>Math Experiment</b>", showarrow=False, xref="paper", yref="paper", font=dict(size=20))

    fig.show()
    print("Generated combined plot: Mean Absolute Performance by Linguistic Competency")

    # Export the plot data to CSV; the 'Experiment' column tells the two
    # experiments apart in the combined file.
    plot_df_coder['Experiment'] = 'Coder'
    plot_df_math['Experiment'] = 'Math'
    combined_plot_data = pd.concat([plot_df_coder, plot_df_math])
    csv_filename = os.path.join(csv_export_dir, "combined_performance_coder_vs_math.csv")
    combined_plot_data.to_csv(csv_filename, index=False)
    print(f"Exported combined plot data to: {csv_filename}")


print("\n--- Script Finished ---")



 PROCESSING EXPERIMENT: CODER 

--- Model Categorization (Coder) ---
Base Model: Qwen__Qwen2.5-7B (Qwen2.5 Base)
Instruct Model: Qwen__Qwen2.5-7B-Instruct (Qwen2.5 Instruct)
Specialist Model: Qwen__Qwen2.5-Coder-7B (Qwen2.5 Coder)
Merged Models (5):
  - _username___Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-task_arithmetic-29 (Task Arithmetic_29)
  - _username___Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-dare_ties-29 (DARE Ties_29)
  - _username___Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-ties-29 (Ties_29)
  - _username___Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-slerp-29 (Slerp_29)
  - _username___Qwen2.5-7B-Instruct-Qwen2.5-Coder-7B-Merged-linear-29 (Linear_29)
-------------------------
Models for comparison: ['Qwen2.5 Base', 'Qwen2.5 Instruct', 'Qwen2.5 Coder', 'Task Arithmetic', 'DARE Ties', 'Ties', 'Slerp', 'Linear']
-------------------------

--- Summary DataFrame (Coder) ---
                    Qwen__Qwen2.5-7B  Qwen__Qwen2.5-7B-Instruct  \
main_task_category             

Generated combined plot: Mean Absolute Performance by Linguistic Competency
Exported combined plot data to: plot_data_csv_exports/combined_performance_coder_vs_math.csv

--- Script Finished ---
