In [None]:
# Import standard libraries and scientific computing packages
import os  # Import os module for file and directory path operations
import pickle  # For loading/saving Python objects in binary format
import warnings  # Warning control
from glob import glob  # Import glob to find files matching patterns

import chardet  # Import chardet to detect file encoding automatically
import matplotlib.pyplot as plt  # Plotting library for creating static, animated, and interactive visualizations
import numpy as np  # Fundamental package for numerical computations with arrays
import pandas as pd  # Powerful data manipulation and analysis library
import seaborn as sns  # Statistical data visualization built on matplotlib
import seaborn as sns # Import seaborn for advanced and easier data visualization, especially heatmaps
import shap  # SHAP for explainable AI: interpreting model predictions
import xarray as xr  # xarray for handling labeled multi-dimensional arrays (used for EEG data here)

# Import various utilities from scikit-learn for model selection, preprocessing, feature selection, metrics, and base classes
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin  # Base classes for building custom transformers and estimators
from sklearn.feature_selection import SelectKBest, f_classif  # SelectKBest selects top k features based on ANOVA F-value
from sklearn.impute import SimpleImputer  # Handles missing values by imputing (here, with mean)
from sklearn.metrics import (classification_report, accuracy_score, confusion_matrix, roc_curve, auc,
                             f1_score, precision_score, recall_score, hamming_loss, roc_auc_score)  # Various classification evaluation metrics
from sklearn.model_selection import GroupKFold, GridSearchCV  # GroupKFold for grouped cross-validation, GridSearchCV for hyperparameter tuning
from sklearn.multiclass import OneVsRestClassifier  # Multiclass classifier wrapper for multilabel problems
from sklearn.preprocessing import StandardScaler  # Standardizes features by removing the mean and scaling to unit variance
from sklearn.utils import shuffle  # Utility to shuffle arrays or sparse matrices in a consistent way
from sklearn.utils.class_weight import compute_class_weight  # Utility to compute class weights for imbalanced data

# Imbalanced-learn (imblearn) imports for handling imbalanced datasets via over- and under-sampling
from imblearn.combine import SMOTEENN  # Combined over- and under-sampling method
from imblearn.over_sampling import RandomOverSampler  # Random over-sampling
from imblearn.over_sampling import SMOTENC  # SMOTE variant for categorical features
from imblearn.pipeline import Pipeline as ImbPipeline  # Pipeline supporting imbalanced data operations
from imblearn.under_sampling import RandomUnderSampler  # Random under-sampling to balance classes

# CatBoost classifier for gradient boosting with categorical support
from catboost import CatBoostClassifier, Pool

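# NOTE: seaborn is imported twice above, and several imports (e.g. shuffle, SMOTENC, SMOTEENN, RandomUnderSampler, ImbPipeline, Pool, compute_class_weight) are not referenced in the cells shown here - could be cleaned up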


# Custom transformer class to drop columns with all missing values (NaNs)
class DropAllNaNColumns(TransformerMixin, BaseEstimator):
    """Transformer that removes columns consisting only of missing values.

    The set of columns to keep is learned in ``fit`` and re-applied by
    position in ``transform``, so data transformed later keeps exactly the
    columns that had at least one non-missing value at fit time.
    """

    def fit(self, X, y=None):
        """Record which columns of ``X`` contain at least one non-missing value.

        ``y`` is accepted for scikit-learn API compatibility and ignored.
        Returns ``self``.
        """
        X_df = pd.DataFrame(X)
        # Store a plain boolean array rather than a label-indexed Series so the
        # mask is applied by position and cannot mis-align on column labels.
        self.non_nan_cols_ = (~X_df.isna().all(axis=0)).to_numpy()
        return self

    def transform(self, X):
        """Return ``X`` as a NumPy array restricted to the columns kept in ``fit``.

        Raises:
            ValueError: if ``X`` does not have the same number of columns as
                the data seen in ``fit``.
        """
        X_df = pd.DataFrame(X)
        if X_df.shape[1] != self.non_nan_cols_.shape[0]:
            raise ValueError(
                f"X has {X_df.shape[1]} columns, but DropAllNaNColumns was fitted "
                f"on {self.non_nan_cols_.shape[0]} columns"
            )
        return X_df.iloc[:, self.non_nan_cols_].values


# Wrapper class for CatBoost classifier using OneVsRest for multilabel classification
class CatBoostOVRWrapper(ClassifierMixin, BaseEstimator):
    """scikit-learn compatible one-vs-rest wrapper around ``CatBoostClassifier``.

    ``learning_rate``, ``depth`` and ``iterations`` are explicit parameters;
    any other keyword argument is stored in ``kwargs`` and forwarded to
    ``CatBoostClassifier`` when ``fit`` is called. The fitted
    ``OneVsRestClassifier`` is available as ``model``.
    """

    def __init__(self, learning_rate=0.03, depth=6, iterations=1000, **kwargs):
        # Initialize with default CatBoost hyperparameters and additional kwargs
        self.learning_rate = learning_rate
        self.depth = depth
        self.iterations = iterations
        self.kwargs = kwargs
        self.model = None  # set by fit()

    def get_params(self, deep=True):
        """Return all parameters (explicit ones plus extra kwargs) as one flat dict.

        ``deep`` is accepted for scikit-learn API compatibility and ignored.
        """
        return {
            'learning_rate': self.learning_rate,
            'depth': self.depth,
            'iterations': self.iterations,
            **self.kwargs,
        }

    def set_params(self, **params):
        """Set parameters; unknown names are stored as extra CatBoost kwargs.

        Returns ``self``.
        """
        for key, value in params.items():
            if key in ('learning_rate', 'depth', 'iterations'):
                setattr(self, key, value)
            else:
                self.kwargs[key] = value
        return self

    def fit(self, X, y):
        """Fit one CatBoost classifier per label column of ``y``.

        Categorical features are taken from an explicit ``cat_features``
        keyword argument when one was given to the constructor; otherwise the
        column named ``'condition'`` in ``X`` (if present) is used.
        Returns ``self``.
        """
        # Work on a copy so the stored kwargs are left untouched, and pull
        # cat_features out so it is not passed to CatBoostClassifier twice
        # (once explicitly and once through **kwargs), which raises TypeError.
        extra_params = dict(self.kwargs)
        explicit_cat_features = extra_params.pop('cat_features', None)
        if explicit_cat_features is not None:
            self.cat_features = explicit_cat_features
        else:
            self.cat_features = [col for col in X.columns if col == 'condition']

        # Initialize CatBoostClassifier with parameters and categorical feature info
        base_model = CatBoostClassifier(
            learning_rate=self.learning_rate,
            depth=self.depth,
            iterations=self.iterations,
            cat_features=self.cat_features,
            **extra_params,
        )
        # Wrap with OneVsRestClassifier to support multilabel classification
        self.model = OneVsRestClassifier(base_model)
        self.model.fit(X, y)
        return self

    def predict(self, X):
        """Return the predictions of the fitted one-vs-rest model for ``X``."""
        return self.model.predict(X)

    def predict_proba(self, X):
        """Return the probability estimates of the fitted one-vs-rest model for ``X``."""
        return self.model.predict_proba(X)


# Function to perform feature selection for multilabel data by selecting top-k features per label and taking union
def multi_label_select_kbest_union(X, Y, k=10):
    """Select features for multilabel data as the union of per-label top-k sets.

    For every label column of ``Y`` a ``SelectKBest`` selector with the
    ``f_classif`` score is fitted on ``X``; the indices chosen for any label
    are merged.

    Args:
        X: 2D feature matrix (samples x features).
        Y: 2D label matrix (samples x labels), indexed as ``Y[:, i]``.
        k: number of features to keep per label, or ``'all'``. Values larger
            than the number of features are clamped to the number of features.

    Returns:
        Sorted list of the selected feature (column) indices.
    """
    n_features = np.shape(X)[1]
    if k != 'all':
        # SelectKBest rejects k > n_features; asking for more than exists
        # simply means "keep everything".
        k = min(k, n_features)

    selected_feature_indices = set()
    for label_idx in range(Y.shape[1]):
        selector = SelectKBest(score_func=f_classif, k=k)
        selector.fit(X, Y[:, label_idx])
        selected_feature_indices.update(selector.get_support(indices=True))

    return sorted(selected_feature_indices)


# === Read the pickled biomarker bundle ===
# NOTE(review): pickle.load executes arbitrary code from the file - only use with a trusted data file.
file_path = "/Volumes/NOC_Drive/biomarker_data.pkl"
with open(file_path, "rb") as file:
    hbn_data = pickle.load(file)

# The source-space entry holds both the EEG array and the cohort table
source_space = hbn_data["HBN_source_space"]
eeg_data = source_space["eeg_xdata"]
cohort_df = source_space["cohort_df"].copy()

# Derive subject ID and recording condition (EO / EC) from each EEG filename
eeg_filenames = cohort_df['eeg_filename']
cohort_df['subject_uid'] = eeg_filenames.str.extract(r'HBN-sub-(NDAR[A-Z0-9]+)')[0]
cohort_df['condition'] = eeg_filenames.str.extract(r'_(EO|EC)_')[0]

# Candidate disorder columns for the multilabel task
selected_disorders = [
    'ADHD-Combined Type',
    'ADHD-Hyperactive/Impulsive Type',
    'ADHD-Inattentive Type',
    'ADHD-Other',
    'Anxiety Disorders',
    'ASD',
    'Communication Disorders',
    'Depressive Disorders',
    'Disruptive, Impulse Control and Conduct Disorders-Other',
    'Elimination Disorders',
    'Healthy controls',
    'Intellectual Disability',
    'OCD',
    'OCD-Other',
    'Oppositional Defiant Disorder',
    'SLD (Mathematics)',
    'SLD (Reading)',
    'SLD (Written Expression)',
    'Tic Disorders',
    'Trauma and Stressor Related Disorders'
]

# Keep the disorders whose column sum reaches 15
class_counts = cohort_df[selected_disorders].sum()
valid_disorders = [name for name, count in class_counts.items() if count >= 15]
if len(valid_disorders) == 0:
    raise ValueError("No disorders have at least 15 samples!")

# Rows carrying at least one of the retained labels form the modelling cohort
has_valid_label = cohort_df[valid_disorders].sum(axis=1) >= 1
df_multi = cohort_df.loc[has_valid_label].copy()
# Multilabel target matrix, one column per retained disorder
y = df_multi[valid_disorders].to_numpy().astype(int)

# Every label column must have more than 10 positives
min_class_counts = y.sum(axis=0)
if (min_class_counts <= 10).any():
    raise ValueError(f"Not enough samples in all classes. Class counts: {min_class_counts}")

# Subject IDs and conditions attached to the EEG array's coordinates
ids = eeg_data.coords['subject_uid'].values
conditions = eeg_data.coords['condition'].values

# Lookup table: (subject_uid, condition) -> position in the EEG array
index_map = dict(zip(zip(ids, conditions), range(len(ids))))
# Function to map subset of dataframe rows to EEG data indices
def map_indices(df_subset, lookup=None):
    """Map cohort rows to positions in the EEG data.

    Args:
        df_subset: DataFrame with ``subject_uid`` and ``condition`` columns.
        lookup: dict mapping ``(subject_uid, condition)`` to an integer
            position. Defaults to the module-level ``index_map``.

    Returns:
        List of positions, in row order, for the rows whose
        ``(subject_uid, condition)`` pair is present in ``lookup``. Rows
        without a match are skipped, so the result can be shorter than
        ``df_subset``.
    """
    if lookup is None:
        lookup = index_map
    row_keys = zip(df_subset['subject_uid'], df_subset['condition'])
    positions = (lookup.get(key) for key in row_keys)
    # Compare against None explicitly: position 0 is a valid match
    return [pos for pos in positions if pos is not None]

# Keep only cohort rows that have a matching EEG recording. map_indices skips
# unmatched rows, so without this filter the feature matrix could end up with
# fewer rows than the labels, conditions and CV groups taken from df_multi.
has_eeg = np.array(
    [key in index_map for key in zip(df_multi['subject_uid'], df_multi['condition'])],
    dtype=bool,
)
if not has_eeg.all():
    print(f"Dropping {int((~has_eeg).sum())} cohort rows without a matching EEG recording")
    df_multi = df_multi[has_eeg].copy()
    y = y[has_eeg]

# Get indices in EEG data corresponding to filtered cohort subset
indices = map_indices(df_multi)
# Select EEG data matching those indices
eeg_selected = eeg_data.isel(ID=indices)

# Define frequency bands as groups of frequency-bin labels
frequency_combinations = [
    ['1.0-4.0 Hz'],  # Delta band
    ['4.0-5.1 Hz', '5.1-6.5 Hz', '6.5-8.3 Hz'],  # Theta band
    ['8.3-10.5 Hz', '10.5-13.4 Hz'],  # Alpha band
    ['13.4-17.0 Hz', '17.0-21.7 Hz', '21.7-27.6 Hz'],  # Beta band
    ['27.6-35.2 Hz', '35.2-44.8 Hz']  # Gamma band
]
frequency_labels = ['Delta', 'Theta', 'Alpha', 'Beta', 'Gamma']

# Extract frequency coordinate values from EEG data
freq_coords = eeg_selected.coords['frequency'].values
combined_freq_data = []

# Average the frequency bins belonging to each band
for band_freqs in frequency_combinations:
    # Positions of the bins whose label belongs to this band
    indices_f = [i for i, f in enumerate(freq_coords) if f in band_freqs]
    if indices_f:
        avg_data = eeg_selected.isel(frequency=indices_f).mean(dim='frequency')
        combined_freq_data.append(avg_data)
    else:
        raise ValueError(f"No matching frequencies found for band: {band_freqs}")

# Concatenate averaged frequency band data into a new dimension
combined_data = xr.concat(combined_freq_data, dim='frequency')
# Assign frequency band labels
combined_data = combined_data.assign_coords(frequency=frequency_labels)
# Rearrange dimensions to desired order
combined_data = combined_data.transpose('ID', 'source', 'frequency', 'biomarker')

# Shape of the band-averaged, transposed array. The feature names must follow
# this array (not eeg_selected, which still has the raw frequency bins and its
# own dimension order), because it is the one that gets flattened below.
n_samples, n_sources, n_freqs, n_biomarkers = combined_data.shape

# Feature names in the same (source, frequency, biomarker) order as the row-major flatten below
eeg_feature_names_all = [
    f"biomarker_{b}_freq_{f}_source_{s}"
    for s in range(n_sources)
    for f in range(n_freqs)
    for b in range(n_biomarkers)
]

# Flatten combined EEG data into 2D array (samples x features) for machine learning input
X_raw = combined_data.values.reshape(combined_data.shape[0], -1)

# Fail early if features, feature names and cohort rows do not line up
if len(eeg_feature_names_all) != X_raw.shape[1]:
    raise ValueError(
        f"Feature name count ({len(eeg_feature_names_all)}) does not match "
        f"number of EEG features ({X_raw.shape[1]})"
    )
if X_raw.shape[0] != len(df_multi):
    raise ValueError(
        f"EEG sample count ({X_raw.shape[0]}) does not match cohort row count ({len(df_multi)})"
    )

# Extract categorical condition feature and reset index for merging
condition_cat = df_multi['condition'].astype('category').reset_index(drop=True)
# Convert flattened EEG features into DataFrame
X_raw_df = pd.DataFrame(X_raw)
# Combine EEG features and categorical condition column into one DataFrame
X_df = pd.concat([X_raw_df, condition_cat], axis=1)
# Rename columns: numerical features named as string numbers, last column is 'condition'
X_df.columns = list(map(str, range(X_raw.shape[1]))) + ['condition']

cat_features = ['condition']  # Categorical feature column name

# Initialize data preprocessing objects (re-fitted on the training part of every fold)
dropper = DropAllNaNColumns()  # Drop columns with all NaN values
imputer = SimpleImputer(strategy='mean')  # Fill missing values with mean
scaler = StandardScaler()  # Standardize features

# Define parameter grid for CatBoost hyperparameter tuning via GridSearchCV
param_grid = {
    'depth': [4, 6],  # Tree depth
    'learning_rate': [0.01, 0.05],  # Learning rate
    'l2_leaf_reg': [3, 5]  # L2 regularization parameter
}

# Use GroupKFold cross-validation to split data ensuring samples from the same subject don't leak across folds
groups = df_multi['subject_uid'].values
outer_cv = GroupKFold(n_splits=10)

# Lists for storing selected features and SHAP explanations per fold
selected_feature_indices_per_fold = []
all_shap_feature_dfs = []

# Per-fold metric collectors, each initialised exactly once
outer_scores = []
auc_scores = []
f1_micro_scores = []
f1_macro_scores = []
hamming_losses = []
precisions = []
recalls = []
subset_accuracies = []
micro_f1s = []
macro_f1s = []
micro_precisions = []
micro_recalls = []
roc_aucs = []
roc_curves = []
# Loop over folds for outer cross-validation
for fold, (train_idx, test_idx) in enumerate(outer_cv.split(X_df, y[:, 0], groups=groups), 1):
    print(f"\n--- Outer Fold {fold} ---")

    # Split train/test data for this fold
    X_train = X_df.iloc[train_idx].copy()
    y_train = y[train_idx]
    X_test = X_df.iloc[test_idx].copy()
    y_test = y[test_idx]

    # Separate EEG numerical features and categorical feature in train set
    X_train_eeg = X_train.drop(columns=cat_features).values
    X_train_cat = X_train[cat_features].reset_index(drop=True)

    # Preprocessing train EEG features (fitted on the training fold only)
    X_train_eeg = dropper.fit_transform(X_train_eeg)  # Drop all-NaN columns
    X_train_eeg = imputer.fit_transform(X_train_eeg)  # Impute missing values with mean

    # Select top features per label and take the union (20% of features, capped at 1000, at least 1)
    k_features = max(1, min(int(0.2 * X_train_eeg.shape[1]), 1000))
    selected_indices = multi_label_select_kbest_union(X_train_eeg, y_train, k=k_features)
    # selected_indices refer to the columns left after dropping all-NaN columns,
    # so translate them back to original column positions before looking up names
    kept_columns = np.flatnonzero(dropper.non_nan_cols_)
    selected_feature_names = [eeg_feature_names_all[kept_columns[i]] for i in selected_indices]
    selected_feature_indices_per_fold.append(selected_indices)

    # Reduce train data to selected features
    X_train_eeg = X_train_eeg[:, selected_indices]
    X_train_eeg = scaler.fit_transform(X_train_eeg)  # Standardize features

    # Recombine EEG features and categorical feature into DataFrame
    X_train_df = pd.DataFrame(X_train_eeg, columns=[f'feat_{i}' for i in range(X_train_eeg.shape[1])])
    X_train_df = pd.concat([X_train_df.reset_index(drop=True), X_train_cat], axis=1)
    for cat_col in cat_features:
        X_train_df[cat_col] = X_train_df[cat_col].astype('category')  # Ensure categorical dtype

    # --------------------
    # Balanced oversampling per label (to address class imbalance)
    # --------------------
    X_resampled_list = []
    y_resampled_list = []

    # For each label, oversample the training set so that this label is balanced
    for i in range(y_train.shape[1]):
        ros = RandomOverSampler(random_state=42)
        X_tmp, _ = ros.fit_resample(X_train_df, y_train[:, i])

        # Carry over the complete label vector of every resampled row. Filling
        # only column i and zeroing the others would mark true positives of the
        # other labels as negatives.
        y_tmp_full = y_train[ros.sample_indices_]

        X_resampled_list.append(X_tmp)
        y_resampled_list.append(y_tmp_full)

    # Combine resampled data from all labels
    X_train_balanced = pd.concat(X_resampled_list, ignore_index=True)
    y_train_balanced = np.vstack(y_resampled_list)

    # --------------------
    # GridSearchCV for CatBoost classifier
    # --------------------
    # cat_features is not passed here: the wrapper picks up the 'condition'
    # column itself, and passing it as well made CatBoostClassifier receive the
    # keyword twice.
    model = CatBoostOVRWrapper(iterations=500, verbose=0)

    # NOTE(review): the inner 3-fold CV runs on oversampled data and is not
    # grouped by subject, so duplicated rows can land in both inner train and
    # validation folds - inner scores are likely optimistic.
    grid_search = GridSearchCV(model, param_grid, scoring='f1_micro', cv=3, n_jobs=-1, verbose=1)
    grid_search.fit(X_train_balanced, y_train_balanced)

    best_model = grid_search.best_estimator_
    print(f"Best parameters: {grid_search.best_params_}")

    # Prepare test set features (apply the transformers fitted on the training fold)
    X_test_eeg = X_test.drop(columns=cat_features).values
    X_test_eeg = dropper.transform(X_test_eeg)
    X_test_eeg = imputer.transform(X_test_eeg)
    X_test_eeg = X_test_eeg[:, selected_indices]
    X_test_eeg = scaler.transform(X_test_eeg)

    # Recombine test features and categorical feature
    X_test_df = pd.DataFrame(X_test_eeg, columns=[f'feat_{i}' for i in range(X_test_eeg.shape[1])])
    X_test_df = pd.concat([X_test_df.reset_index(drop=True), X_test[cat_features].reset_index(drop=True)], axis=1)
    for cat_col in cat_features:
        X_test_df[cat_col] = X_test_df[cat_col].astype('category')

    # Predict on test set
    y_pred = best_model.predict(X_test_df)
    y_prob = best_model.predict_proba(X_test_df)

    # Calculate various evaluation metrics for multilabel classification
    acc = accuracy_score(y_test, y_pred)
    f1_micro = f1_score(y_test, y_pred, average='micro')
    f1_macro = f1_score(y_test, y_pred, average='macro')
    precision_micro = precision_score(y_test, y_pred, average='micro')
    recall_micro = recall_score(y_test, y_pred, average='micro')
    h_loss = hamming_loss(y_test, y_pred)
    roc_auc_micro = roc_auc_score(y_test, y_prob, average='micro', multi_class='ovo')

    # Append metrics for fold
    outer_scores.append(acc)
    f1_micro_scores.append(f1_micro)
    f1_macro_scores.append(f1_macro)
    precisions.append(precision_micro)
    recalls.append(recall_micro)
    hamming_losses.append(h_loss)
    # auc_scores feeds the 'AUC' column of the metrics table; roc_aucs is kept for existing users
    auc_scores.append(roc_auc_micro)
    roc_aucs.append(roc_auc_micro)

    # Print metrics for current fold
    print(f"Accuracy: {acc:.4f}")
    print(f"F1 Micro: {f1_micro:.4f}")
    print(f"F1 Macro: {f1_macro:.4f}")
    print(f"Precision Micro: {precision_micro:.4f}")
    print(f"Recall Micro: {recall_micro:.4f}")
    print(f"Hamming Loss: {h_loss:.4f}")
    print(f"ROC AUC Micro: {roc_auc_micro:.4f}")

    # --------------------
    # SHAP Explanation
    # --------------------
    # Pass the fitted CatBoost model itself to TreeExplainer (CatBoost models
    # have no get_booster(); that is an XGBoost method). Only the classifier
    # of the first label (estimators_[0]) is explained here.
    explainer = shap.TreeExplainer(best_model.model.estimators_[0])
    shap_values = explainer.shap_values(X_test_df)

    # Convert SHAP values to DataFrame with feature names
    if isinstance(shap_values, list):
        # If a list of arrays is returned, stack horizontally
        shap_values_all = np.hstack(shap_values)
    else:
        shap_values_all = shap_values

    shap_df = pd.DataFrame(shap_values_all, columns=X_test_df.columns)
    shap_df['subject_uid'] = df_multi.iloc[test_idx]['subject_uid'].values
    shap_df['fold'] = fold
    all_shap_feature_dfs.append(shap_df)

# After all folds, concatenate SHAP explanations into one DataFrame for analysis
shap_results_df = pd.concat(all_shap_feature_dfs, ignore_index=True)
    




In [None]:
# === Save evaluation metrics to CSV ===

# The per-fold ROC AUC values are collected in roc_aucs by the cross-validation
# loop; auc_scores is only usable when it holds one value per fold. Falling back
# avoids a length-mismatch error when building the table.
fold_auc_scores = auc_scores if len(auc_scores) == len(outer_scores) else roc_aucs

# One row per outer fold, one column per metric
metrics_df = pd.DataFrame({
    'Fold': list(range(1, len(outer_scores) + 1)),  # Fold numbers from 1 up to number of folds
    'Accuracy': outer_scores,                       # Subset accuracy per fold
    'AUC': fold_auc_scores,                         # Micro-averaged ROC AUC per fold
    'F1_Micro': f1_micro_scores,                    # Micro-averaged F1 per fold
    'F1_Macro': f1_macro_scores,                    # Macro-averaged F1 per fold
    'Hamming_Loss': hamming_losses,                 # Hamming loss per fold
    'Precision': precisions,                        # Micro-averaged precision per fold
    'Recall': recalls                               # Micro-averaged recall per fold
})

# Define the full file path where the metrics CSV will be saved
metrics_csv_path = "/Users/tuanadurmayuksel/Desktop/Multi_Final/model_metrics_20com1.csv"

# Make sure the target directory exists before writing
os.makedirs(os.path.dirname(metrics_csv_path), exist_ok=True)

# Save the metrics DataFrame without the index column
metrics_df.to_csv(metrics_csv_path, index=False)

print(f"\n✅ Saved metrics to {metrics_csv_path}")


# === Save SHAP feature importances to CSV ===
# Concatenate the per-fold SHAP DataFrames, tagging each with a 1-based 'Fold' column
shap_combined_df = pd.concat([
    df.assign(Fold=fold_num + 1) for fold_num, df in enumerate(all_shap_feature_dfs)
])

# Define the full file path where the combined SHAP feature importances CSV will be saved
shap_csv_path = "/Users/tuanadurmayuksel/Desktop/Multi_Final/shap_feature_importances_20com1.csv"

os.makedirs(os.path.dirname(shap_csv_path), exist_ok=True)

# Save the combined SHAP DataFrame without the index column
shap_combined_df.to_csv(shap_csv_path, index=False)

print(f"✅ Saved SHAP importances to {shap_csv_path}")


In [None]:
# Define a function to detect the text encoding of a given file
def detect_encoding(file_path, sample_size=10000, default="utf-8"):
    """Guess the text encoding of a file with chardet.

    Args:
        file_path: path of the file to inspect.
        sample_size: number of leading bytes handed to chardet.
        default: encoding returned when chardet cannot decide, or when it
            reports plain ASCII.

    Returns:
        The detected encoding name, or ``default``.
    """
    # Only the first `sample_size` bytes are inspected
    with open(file_path, 'rb') as f:
        result = chardet.detect(f.read(sample_size))
    encoding = result['encoding']
    # chardet reports None when it cannot decide (e.g. an empty file). A result
    # of 'ascii' only describes the sample; ASCII is a subset of UTF-8, so the
    # default is the safer choice if later bytes are non-ASCII.
    if encoding is None or encoding.lower() == 'ascii':
        return default
    return encoding

# Specify the directory containing all model metric CSV files
metrics_dir = "/Users/tuanadurmayuksel/Desktop/Multi_Final"

# All CSV files starting with "model_metrics_"; sorted so the summary row order
# does not depend on filesystem listing order
metric_files = sorted(glob(os.path.join(metrics_dir, "model_metrics_*.csv")))
if not metric_files:
    raise FileNotFoundError(f"No model_metrics_*.csv files found in {metrics_dir}")

# Group per-file average metrics by subset key
grouped_metrics = {}

for file in metric_files:
    filename = os.path.basename(file)

    # Subset key = everything before the first "com", plus "com"
    # (e.g. 'model_metrics_20com1.csv' -> 'model_metrics_20com')
    subset_key = filename.split("com")[0] + "com"

    # Read the CSV using the detected encoding
    encoding = detect_encoding(file)
    df = pd.read_csv(file, encoding=encoding)

    # Average every numeric column over the rows (folds) of this file
    avg_metrics = df.mean(numeric_only=True)

    # Remember which file this average comes from
    avg_metrics["Combination"] = filename

    grouped_metrics.setdefault(subset_key, []).append(avg_metrics)

# One summary Series per subset group
summary_dfs = []

for subset_key, metrics_list in grouped_metrics.items():
    # Each row is one file's average metrics
    metrics_df = pd.DataFrame(metrics_list)

    # Adding the filename turned the Series into object dtype, so cast the
    # metric columns back to float before averaging across files
    avg_summary = metrics_df.drop(columns=["Combination"]).astype(float).mean()

    # Add metadata columns for clarity
    avg_summary["Subset"] = subset_key
    avg_summary["Num_Combinations"] = len(metrics_list)

    summary_dfs.append(avg_summary)

# Combine all subset summary Series into a single summary DataFrame
summary_df = pd.DataFrame(summary_dfs)

# Put 'Subset' and 'Num_Combinations' first, then all other columns
columns = ["Subset", "Num_Combinations"] + [col for col in summary_df.columns if col not in ["Subset", "Num_Combinations"]]
summary_df = summary_df[columns]

# Define the output file path for the combined summary CSV
summary_path = os.path.join(metrics_dir, "subset_average_metrics_summary.csv")

# Save the final summary DataFrame to a CSV without the index column
summary_df.to_csv(summary_path, index=False)

print(f"✅ Saved summary of average metrics per subset to: {summary_path}")



In [None]:
# Specify the directory where all metric CSV files are stored
metrics_dir = "/Users/tuanadurmayuksel/Desktop/Multi_Final"

# All CSV files starting with "model_metrics_"; sorted so the summary row order
# does not depend on filesystem listing order
metric_files = sorted(glob(os.path.join(metrics_dir, "model_metrics_*.csv")))
if not metric_files:
    # Without this check an empty summary file would be written silently
    raise FileNotFoundError(f"No model_metrics_*.csv files found in {metrics_dir}")

# Group per-file average metrics by subset key
grouped_metrics = {}

for file in metric_files:
    filename = os.path.basename(file)

    # Subset key = everything before the first "com", plus "com"
    # (e.g. 'model_metrics_20com1.csv' -> 'model_metrics_20com')
    subset_key = filename.split("com")[0] + "com"

    # Read the CSV file (default encoding)
    df = pd.read_csv(file)

    # Average every numeric column over the rows (folds) of this file
    avg_metrics = df.mean(numeric_only=True)

    # Store the filename as an identifier
    avg_metrics["Combination"] = filename

    grouped_metrics.setdefault(subset_key, []).append(avg_metrics)

# One summary dict per subset
summary_rows = []

for subset_key, metrics_list in grouped_metrics.items():
    # Each row is one file's average metrics
    metrics_df = pd.DataFrame(metrics_list)

    # Start the summary row with the subset name and number of files combined
    row = {"Subset": subset_key, "Num_Combinations": len(metrics_list)}

    for metric in metrics_df.columns:
        # 'Combination' is an identifier, not a metric
        if metric == "Combination":
            continue

        # Adding the filename turned the Series into object dtype, so cast
        # back to float before aggregating
        metric_values = metrics_df[metric].astype(float)

        # Mean and standard deviation across the files of this subset
        # (std is NaN when the subset has a single file)
        row[f"{metric}_mean"] = metric_values.mean()
        row[f"{metric}_std"] = metric_values.std()

    summary_rows.append(row)

# Convert the list of summary dictionaries to a DataFrame for saving
summary_df = pd.DataFrame(summary_rows)

# Define the full path for the output CSV summary file
summary_path = os.path.join(metrics_dir, "subset_average_metrics_summary1.csv")

# Save the summary DataFrame to a CSV file without the index column
summary_df.to_csv(summary_path, index=False)

print(f"✅ Saved summary with mean & std metrics per subset to: {summary_path}")


In [None]:
# Read the per-subset average metrics written by the summary step
csv_path = "/Users/tuanadurmayuksel/Desktop/Multi_Final/subset_average_metrics_summary.csv"
df = pd.read_csv(csv_path)

# Metric columns shown on the radar, with their display labels (same order)
metrics_cols = ['Accuracy', 'AUC', 'F1_Micro', 'F1_Macro', 'Precision', 'Recall', 'Hamming_Loss']
pretty_labels = ['Accuracy', 'AUC', 'F1 Micro', 'F1 Macro', 'Precision', 'Recall', 'Hamming Loss']

# One evenly spaced axis per metric; the first angle is repeated at the end
# so every polygon closes
num_vars = len(metrics_cols)
open_angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
angles = open_angles + open_angles[:1]

# Plotting / legend order of the subsets
ordered_subsets = [
    'model_metrics_20com',
    'model_metrics_15com',
    'model_metrics_7com',
    'model_metrics_3com'
]

# Legend text for each 'Subset' value
subset_to_class_size = {
    'model_metrics_20com': '20 class size',
    'model_metrics_15com': '15 class size',
    'model_metrics_7com':  '7 class size',
    'model_metrics_3com':  '3 class size'
}

# Line / fill colour for each 'Subset' value
subset_colors = {
    'model_metrics_20com': '#FF8C00',  # dark orange
    'model_metrics_15com': '#4169E1',  # royal blue
    'model_metrics_7com':  '#DC143C',  # crimson
    'model_metrics_3com':  '#228B22'   # forest green
}


# === Radar figure on polar axes ===
fig, ax = plt.subplots(figsize=(10, 10), subplot_kw={'polar': True})

# Draw one closed polygon per subset that is present in the summary table
for subset_name in ordered_subsets:
    group = df.loc[df['Subset'].eq(subset_name)]
    if len(group) == 0:
        continue  # nothing recorded for this subset

    # Column means of the subset's rows, with the first value repeated to close the polygon
    metric_means = group[metrics_cols].mean().tolist()
    values = metric_means + metric_means[:1]

    color = subset_colors[subset_name]
    label = subset_to_class_size[subset_name]

    # Outline plus translucent fill
    ax.plot(angles, values, color=color, linewidth=2.5, label=label)
    ax.fill(angles, values, color=color, alpha=0.25)

# Metric names around the circle
ax.set_thetagrids(np.degrees(angles[:-1]), pretty_labels, fontsize=30, fontweight='semibold', color="black")

# Radial ticks at six evenly spaced values from 0 to 1, printed with two decimals
radial_ticks = np.linspace(0, 1, 6)
ax.set_yticks(radial_ticks)
ax.set_yticklabels([f"{x:.2f}" for x in radial_ticks], fontsize=20, color='black')

# Dashed black radial grid
ax.yaxis.grid(True, color='black', linestyle='--', alpha=0.7, linewidth=1)

# Same 0-1 radial range for every metric
ax.set_rlim(0, 1)

# First metric at the top, remaining metrics laid out clockwise
ax.set_theta_offset(np.pi / 2)
ax.set_theta_direction(-1)

# No outer circle, white background
ax.spines['polar'].set_visible(False)
ax.set_facecolor('white')

# Legend placed outside the plot area
ax.legend(loc='upper right', bbox_to_anchor=(1.2, 1.19), title='Class Sizes', fontsize=21, title_fontsize=22)

# Tighten the layout and display the figure
plt.tight_layout()
plt.show()


In [None]:
# Hard-coded mean / std values of every metric for each class-size subset
data = {
    'Subset': ['model_metrics_20com', 'model_metrics_7com', 'model_metrics_15com', 'model_metrics_3com'],
    'Accuracy_mean': [0.0252, 0.2102, 0.0350, 0.4130],
    'Accuracy_std': [0, 0.2321, 0.0058, 0.0823],
    'AUC_mean': [0.5017, 0.5026, 0.5050, 0.5399],
    'AUC_std': [0, 0.0007, 0.0006, 0.0344],
    'F1_Micro_mean': [0.0648, 0.3369, 0.0750, 0.5616],
    'F1_Micro_std': [0, 0.1976, 0.0184, 0.0444],
    'F1_Macro_mean': [0.0188, 0.0887, 0.0220, 0.3577],
    'F1_Macro_std': [0, 0.0199, 0.0029, 0.0513],
    'Hamming_Loss_mean': [0.1166, 0.2099, 0.1100, 0.3106],
    'Hamming_Loss_std': [0, 0.0605, 0.0124, 0.0437],
    'Precision_mean': [0.4921, 0.5609, 0.5100, 0.6132],
    'Precision_std': [0, 0.0736, 0.0102, 0.0507],
    'Recall_mean': [0.0344, 0.2627, 0.0420, 0.5219],
    'Recall_std': [0, 0.2081, 0.0103, 0.0523],
}

# Table with one row per subset
df = pd.DataFrame(data)

# Pull the number in front of "com" out of the subset name and order rows by it (smallest first)
df['Class_Size'] = df['Subset'].str.extract(r'_(\d+)com', expand=False).astype(int)
df = df.sort_values('Class_Size').reset_index(drop=True)

# Metric column prefixes and the labels used for them on the x-axis
metrics = ['Accuracy', 'AUC', 'F1_Micro', 'F1_Macro', 'Hamming_Loss', 'Precision', 'Recall']
pretty_metrics = ['Accuracy', 'AUC', 'F1 Micro', 'F1 Macro', 'Hamming Loss', 'Precision', 'Recall']

# Legend text per subset
subset_rename = {
    'model_metrics_20com': '20 class',
    'model_metrics_15com': '15 class',
    'model_metrics_7com': '7 class',
    'model_metrics_3com': '3 class',
}

# Bar colour per subset
subset_colors = {
    'model_metrics_20com': '#FF8C00',  # Dark orange
    'model_metrics_15com': '#4169E1',  # Royal blue
    'model_metrics_7com': '#DC143C',   # Crimson red
    'model_metrics_3com': '#228B22',   # Forest green
}

# One group position per metric on the x-axis
x = np.arange(len(metrics))

# Width of a single bar inside a group
width = 0.18

# Wide figure for the grouped bar chart
fig, ax = plt.subplots(figsize=(14, 8))

# Iterate over each subset row to plot bars with error bars
for i, row in df.iterrows():
    subset = row['Subset']
    # Extract mean values for metrics and their std deviations
    means = row[[m + '_mean' for m in metrics]].values.astype(float)
    stds = row[[m + '_std' for m in metrics]].values.astype(float)

    # Use std dev for error bars but ignore zero std by replacing them with NaN (no error bar)
    error_bars = [std if std > 0 else np.nan for std in stds]

    # Plot bars, horizontally offset by index to avoid overlap, with error bars and caps
    ax.bar(x + i * width, means, width,
           yerr=error_bars,
           label=subset_rename.get(subset, subset),  # use readable label if available
           color=subset_colors.get(subset, 'gray'),  # default color fallback
           capsize=5,
           edgecolor='black')

# Set x-axis tick labels at center of grouped bars, with font size and rotation for readability
ax.set_xticks(x + width * (len(df) - 1) / 2)
ax.set_xticklabels(pretty_metrics, fontsize=30, fontweight='bold', rotation=25)

# Configure y-axis limits and ticks from 0 to 1.0 with labels
ax.set_ylim(0, 1.0)
ax.set_yticks(np.linspace(0, 1, 11))
ax.set_yticklabels([f"{x:.1f}" for x in np.linspace(0, 1, 11)], fontsize=30)

# Axis labels for clarity
ax.set_ylabel('Metric Values', fontsize=25)
ax.set_xlabel('Metrics', fontsize=25)

# Add legend for class sizes with title and font sizes
ax.legend(title='Class Size', fontsize=25, title_fontsize=25)

# Adjust layout to prevent overlap and tight spacing
plt.tight_layout()

# Display the plot
plt.show()




In [None]:
# Mean importance for every (Frequency, Biomarker) pair, i.e. averaged over
# all remaining rows of feature_df that share that pair.
# NOTE(review): another cell in this notebook groups feature_df by lowercase
# 'frequency' / 'biomarker'; only one spelling can match the real column
# names — confirm which one feature_df actually uses.
group_keys = ['Frequency', 'Biomarker']
agg_df = feature_df.groupby(group_keys)['importance'].mean().reset_index()

# Reshape to a matrix: one row per biomarker, one column per frequency band
heatmap_data = agg_df.pivot(index='Biomarker', columns='Frequency', values='importance')

fig, ax = plt.subplots(figsize=(10, 6))

# Annotated heatmap (three decimals per cell) on the 'viridis' colour map
sns.heatmap(heatmap_data, annot=True, fmt=".3f", cmap='viridis', ax=ax)

# Set labels after drawing so they replace the ones seaborn derives from the data
ax.set_title('Biomarker x Frequency Importance Heatmap (Averaged over Sources)')
ax.set_ylabel('Biomarker')
ax.set_xlabel('Frequency Band')

plt.tight_layout()
plt.show()



In [None]:
# Aggregate importance by frequency and biomarker, averaging over all sources
agg_df = feature_df.groupby(['frequency', 'biomarker'])['importance'].mean().reset_index()

# Pivot the DataFrame to create a matrix for the heatmap:
# rows = biomarkers, columns = frequency bands, values = average importance scores
heatmap_data = agg_df.pivot(index='biomarker', columns='frequency', values='importance')

# The pivot leaves NaN for any biomarker/frequency combination that is absent
# from feature_df. A plain np.max would then return NaN, which slips past the
# "== 0" check and turns every normalized cell into NaN, so the maximum is
# taken over the finite entries only.
importance_values = heatmap_data.to_numpy(dtype=float)
has_values = importance_values.size > 0 and not np.isnan(importance_values).all()
max_importance = np.nanmax(importance_values) if has_values else np.nan

if not has_values:
    # Empty pivot or nothing but NaN: there is nothing to normalize or draw
    print("No importance values available! Cannot plot meaningful heatmap.")
elif max_importance == 0:
    # Largest value is zero: dividing by it would be meaningless
    print("All coefficient values are zero! Cannot plot meaningful heatmap.")
else:
    # Normalize the importance values by dividing by the max value,
    # so the largest cell becomes 1 and the color scale is comparable
    heatmap_data_norm = heatmap_data / max_importance

    # Clean up frequency column names by stripping any leading/trailing whitespace
    heatmap_data_norm.columns = heatmap_data_norm.columns.str.strip()

    # Clean up biomarker index names by stripping whitespace as well
    heatmap_data_norm.index = heatmap_data_norm.index.str.strip()

    # Set the figure size for better visualization
    plt.figure(figsize=(12, 6))

    # Create the heatmap with seaborn
    ax = sns.heatmap(
        heatmap_data_norm,
        cmap="YlGnBu",            # Color map for a visually pleasant gradient
        cbar_kws={'aspect': 35},  # Colorbar appearance settings (aspect ratio)
        annot=True,               # Annotate cells with numeric values
        fmt=".3f",                # Format numbers with 3 decimal places
        annot_kws={"size": 20},   # Font size for annotations
        linecolor='black',        # Color for grid lines between cells
        linewidths=1              # Width of grid lines (seaborn's documented keyword)
    )

    # Set axis labels and font sizes
    plt.xlabel("Frequency Band", fontsize=18)
    plt.ylabel("", fontsize=18)  # Leaving y-label empty to use custom text instead

    # Customize tick labels font size, rotation, and weight for readability
    plt.xticks(fontsize=16, rotation=25, fontweight='bold')
    plt.yticks(fontsize=16, rotation=0, fontweight='bold')

    # Remove left spine (left vertical border line) for a cleaner look
    ax.spines['left'].set_visible(False)

    # Add a custom y-axis label "Biomarker" as free text in axes coordinates
    # (the real y-label is empty, so only the ax.text call below is visible)
    ax.yaxis.set_label_coords(-0.1, 0.5)
    ax.text(
        x=-0.10, y=0.40, s="Biomarker",
        fontsize=18, rotation=90,
        ha='center', va='bottom',
        transform=ax.transAxes
    )

    # Access the colorbar and set its font size and label
    cbar = ax.collections[0].colorbar
    cbar.ax.tick_params(labelsize=20)
    cbar.set_label('Normalized Importance', fontsize=20)

    # Adjust layout to prevent clipping of labels and titles
    plt.tight_layout()

    # Display the plot
    plt.show()


In [None]:
# Disorder names, used for both the rows and the columns of the matrix below
labels = [
    "ADHD-C", "ADHD-H", "ADHD-I", "ADHD-Other", "Anxiety", "ASD", "CommDis",
    "Depression", "Impulse", "Elimination", "Healthy", "ID", "OCD", "OCD-Other",
    "ODD", "SLD-Math", "SLD-Read", "SLD-Write", "Tic", "Trauma-Stress"
]

# Hand-entered 20 x 20 co-occurrence proportions; row i / column j follow the
# order of `labels`, and every diagonal entry is 1.00
co_occurrence_data = [
    [1.00, 0.11, 0.38, 0.12, 0.00, 0.13, 0.02, 0.04, 0.02, 0.08, 0.17, 0.05, 0.02, 0.04, 0.23, 0.00, 0.00, 0.02, 0.07, 0.01],
    [0.11, 1.00, 0.12, 0.04, 0.00, 0.03, 0.04, 0.04, 0.03, 0.04, 0.03, 0.04, 0.03, 0.04, 0.11, 0.05, 0.04, 0.03, 0.06, 0.01],
    [0.38, 0.12, 1.00, 0.14, 0.01, 0.03, 0.04, 0.04, 0.04, 0.05, 0.19, 0.04, 0.04, 0.04, 0.11, 0.08, 0.05, 0.05, 0.00, 0.03],
    [0.12, 0.04, 0.14, 1.00, 0.00, 0.04, 0.03, 0.03, 0.03, 0.04, 0.06, 0.03, 0.04, 0.03, 0.09, 0.04, 0.04, 0.03, 0.03, 0.03],
    [0.00, 0.00, 0.01, 0.00, 1.00, 0.03, 0.03, 0.01, 0.02, 0.03, 0.22, 0.04, 0.10, 0.03, 0.04, 0.12, 0.08, 0.04, 0.02, 0.07],
    [0.13, 0.03, 0.03, 0.04, 0.03, 1.00, 0.01, 0.04, 0.03, 0.04, 0.12, 0.06, 0.10, 0.04, 0.02, 0.04, 0.04, 0.02, 0.06, 0.04],
    [0.02, 0.03, 0.04, 0.03, 0.03, 0.01, 1.00, 0.07, 0.03, 0.03, 0.13, 0.01, 0.03, 0.02, 0.01, 0.17, 0.16, 0.12, 0.03, 0.00],
    [0.04, 0.04, 0.04, 0.02, 0.01, 0.04, 0.07, 1.00, 0.04, 0.03, 0.10, 0.05, 0.08, 0.05, 0.08, 0.05, 0.09, 0.05, 0.01, 0.07],
    [0.02, 0.03, 0.04, 0.03, 0.02, 0.03, 0.03, 0.04, 1.00, 0.03, 0.04, 0.02, 0.03, 0.02, 0.11, 0.02, 0.03, 0.04, 0.00, 0.02],
    [0.08, 0.04, 0.05, 0.04, 0.03, 0.04, 0.03, 0.03, 0.03, 1.00, 0.10, 0.02, 0.01, 0.01, 0.11, 0.02, 0.02, 0.03, 0.02, 0.01],
    [0.17, 0.03, 0.19, 0.06, 0.22, 0.12, 0.13, 0.10, 0.04, 0.10, 1.00, 0.06, 0.06, 0.04, 0.09, 0.09, 0.14, 0.08, 0.09, 0.06],
    [0.05, 0.04, 0.04, 0.03, 0.04, 0.06, 0.01, 0.05, 0.02, 0.02, 0.06, 1.00, 0.03, 0.03, 0.06, 0.03, 0.03, 0.06, 0.02, 0.01],
    [0.02, 0.03, 0.04, 0.04, 0.10, 0.10, 0.03, 0.08, 0.03, 0.01, 0.06, 0.03, 1.00, 0.06, 0.03, 0.02, 0.04, 0.04, 0.11, 0.03],
    [0.04, 0.04, 0.04, 0.03, 0.03, 0.04, 0.02, 0.05, 0.02, 0.01, 0.04, 0.03, 0.06, 1.00, 0.04, 0.03, 0.04, 0.04, 0.01, 0.03],
    [0.23, 0.11, 0.11, 0.09, 0.04, 0.02, 0.01, 0.08, 0.11, 0.11, 0.09, 0.06, 0.03, 0.04, 1.00, 0.11, 0.03, 0.03, 0.03, 0.03],
    [0.00, 0.05, 0.08, 0.04, 0.12, 0.04, 0.17, 0.05, 0.02, 0.02, 0.09, 0.03, 0.02, 0.03, 0.11, 1.00, 0.23, 0.30, 0.02, 0.04],
    [0.00, 0.04, 0.05, 0.04, 0.08, 0.04, 0.16, 0.09, 0.03, 0.02, 0.14, 0.03, 0.04, 0.04, 0.03, 0.23, 1.00, 0.26, 0.02, 0.03],
    [0.02, 0.03, 0.05, 0.03, 0.04, 0.02, 0.12, 0.05, 0.04, 0.03, 0.08, 0.06, 0.04, 0.04, 0.03, 0.30, 0.26, 1.00, 0.02, 0.03],
    [0.07, 0.06, 0.00, 0.03, 0.02, 0.06, 0.03, 0.01, 0.00, 0.02, 0.09, 0.02, 0.11, 0.01, 0.03, 0.02, 0.02, 0.02, 1.00, 0.04],
    [0.01, 0.01, 0.03, 0.03, 0.07, 0.04, 0.00, 0.07, 0.02, 0.01, 0.06, 0.01, 0.03, 0.03, 0.03, 0.04, 0.03, 0.03, 0.04, 1.00]
]

# Wrap the matrix in a labelled DataFrame so seaborn picks up the tick names
co_matrix = pd.DataFrame(co_occurrence_data, index=labels, columns=labels)

# Large canvas: twenty labels per axis need the room
plt.figure(figsize=(13, 11))

# Heatmap styling collected in one place.
# 'fmt' only affects cell annotations; no annotations are requested here
# (annot is not set), so it currently has no visible effect.
heatmap_style = {
    'fmt': ".1f",
    'cmap': "YlGnBu",                     # yellow-green-blue palette
    'square': True,                       # square cells
    'cbar_kws': {'label': 'Proportion'},  # initial colorbar label
    'linewidths': 0.8,                    # thickness of the cell separators
    'linecolor': 'gray',                  # colour of the cell separators
}
ax = sns.heatmap(co_matrix, **heatmap_style)

# Tick labels: large font, x labels tilted 45 degrees and right-aligned
plt.xticks(fontsize=23, rotation=45, ha='right')
plt.yticks(fontsize=23)

# Bold axis captions
ax.set_xlabel("Disorders", fontsize=20, weight='bold')
ax.set_ylabel("Co-occurrence", fontsize=20, weight='bold')

# Restyle the colorbar caption (bold, larger) and enlarge its tick labels
colorbar = ax.collections[0].colorbar
colorbar.set_label('Proportion', fontsize=20, weight='bold')
colorbar.ax.tick_params(labelsize=20)

ax.set_title("Disorder Co-occurrence Matrix", fontsize=18)

# Fit everything inside the figure and render it
plt.tight_layout()
plt.show()



In [None]:

# Manually entered SHAP importance values for different frequency bands and biomarkers.
# Each list holds one value per biomarker, in the order given by `index` below.
data = {
    'Alpha':         [0.41, 0.10, 0.18, 0.23],
    'Beta':          [1.00, 0.17, 0.22, 0.26],
    'Delta':         [0.57, 0.25, 0.18, 0.10],
    'Gamma':         [0.91, 0.18, 0.36, 0.29],
    'Theta':         [0.53, 0.28, 0.14, 0.19],
}
index = ['AbsolutePower', 'DFA', 'RelativePower', 'fEI']

# Create DataFrame with biomarkers as rows, frequency bands as columns
df = pd.DataFrame(data, index=index)

# Normalize the data by dividing by the maximum value in the entire DataFrame
df_norm = df / df.values.max()

# Strip stray whitespace from the frequency-band column names
df_norm.columns = df_norm.columns.str.strip()

# Shorten the two power biomarkers' row labels for brevity
df_norm = df_norm.rename(index={
    'RelativePower': 'RP',
    'AbsolutePower': 'AP'
})

# Plotting the heatmap with customized appearance
plt.figure(figsize=(12, 6))
ax = sns.heatmap(
    df_norm,
    cmap="YlGnBu",               # Color palette: yellow-green-blue
    cbar_kws={'aspect': 35},     # Aspect ratio of the color bar
    annot=True,                  # Show values on heatmap cells
    fmt=".2f",                   # Format annotation to 2 decimal places
    annot_kws={"size": 25},      # Font size for annotations
    linecolor='black',           # Color of grid lines between cells
    linewidths=1                 # Width of grid lines (seaborn's documented keyword)
)

plt.xlabel("Frequency Band", fontsize=25)
plt.ylabel("", fontsize=18)  # Left empty; the visible label is drawn with ax.text below

plt.xticks(fontsize=25, rotation=25, fontweight='bold')  # Rotate and format x-axis labels
plt.yticks(fontsize=20, rotation=0, fontweight='bold')   # Format y-axis labels

# Remove left spine (the vertical border line on the y-axis)
ax.spines['left'].set_visible(False)

# Add a vertical "Biomarker" label to the left side of the heatmap,
# positioned in axes coordinates
ax.yaxis.set_label_coords(-0.1, 0.5)
ax.text(
    x=-0.11, y=0.25, s="Biomarker",
    fontsize=25, rotation=90,
    ha='center', va='bottom',
    transform=ax.transAxes
)

# Format color bar ticks and label font size
cbar = ax.collections[0].colorbar
cbar.ax.tick_params(labelsize=20)
cbar.set_label('Normalized SHAP Importance', fontsize=22)

plt.tight_layout()
plt.show()