In [None]:
from dataloader import *
import pickle

if __name__ == '__main__':
    # `json` is only in scope here if `from dataloader import *` happens to
    # re-export it; import it explicitly so this cell runs on a fresh kernel.
    import json

    config_filepath = "config.json"

    # Read the run configuration from disk.
    with open(config_filepath, "r") as json_file:
        config_data = json.load(json_file)

    # Pull out every configured entry up front so a missing key fails early
    # with a KeyError instead of somewhere in the middle of the run.
    feats2level = config_data["feats2level"]
    allfeats_batch1 = config_data["allfeats_batch1"]
    allfeats_batch2 = config_data["allfeats_batch2"]
    features_to_load = config_data["features_to_load"]
    merged_data_pkl = config_data["merged_data_pkl"]
    base_folder = config_data["base_folder"]

    # Load merged dataset. The context manager closes the file handle, which
    # the previous bare open() call leaked.
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file - only point `merged_data_pkl` at pickles from a trusted source.
    with open(merged_data_pkl, "rb") as pickle_in:
        merged_dataset = pickle.load(pickle_in)

    # Perform train-test split
    train_dataset, test_dataset = train_test_split_by_subject(
        merged_dataset,
        feature_names=features_to_load,
        feats2level=feats2level,
        test_size=0.2,        # 20% for testing
        random_state=29       # For reproducibility
    )

    print(f"\nTraining Dataset: {len(train_dataset)} samples")
    print(f"Testing Dataset: {len(test_dataset)} samples")

    # Create DataLoaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=16,      # Adjust based on your memory constraints
        shuffle=True,       # Shuffle for training
        num_workers=4       # Adjust based on your CPU cores
    )

    test_loader = DataLoader(
        test_dataset,
        batch_size=16,      # Adjust as needed
        shuffle=False,      # No need to shuffle for testing
        num_workers=4       # Adjust based on your CPU cores
    )

    # Get NumPy arrays for scikit-learn (classification based on 'group_id')
    X_train, y_train = train_dataset.get_numpy(label_column='group_id')
    X_test, y_test = test_dataset.get_numpy(label_column='group_id')

    print("\nShape of X_train:", X_train.shape)
    print("Shape of y_train:", y_train.shape)
    print("Shape of X_test:", X_test.shape)
    print("Shape of y_test:", y_test.shape)

    print("\nFirst 5 training labels:", y_train[:5])
    print("First 5 testing labels:", y_test[:5])

    


In [None]:

import os
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from typing import List, Tuple, Dict, Optional
import re
from collections import defaultdict
from pathlib import Path
from load_feat_pd import load_feat  # Ensure this module is in your PYTHONPATH
import pandas as pd
from sklearn.model_selection import train_test_split, GroupKFold, StratifiedKFold
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    roc_auc_score,
    roc_curve,
)
from sklearn.base import BaseEstimator, clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import json

def make_serializable(obj):
    """
    Recursively converts non-serializable objects into JSON-serializable formats.

    Conversions applied:
        * NumPy arrays and NumPy scalars -> native Python lists / scalars
          (via ``.tolist()``).
        * dicts -> dicts whose values are converted recursively and whose
          NumPy-scalar keys are converted to native Python scalars
          (``json.dump`` rejects keys such as ``np.int64``).
        * lists, tuples, sets and frozensets -> lists with converted items.
        * callables -> their ``str()`` representation.
        * anything else is returned unchanged.

    Args:
        obj: The object to serialize.

    Returns:
        A serializable version of the object.
    """
    if isinstance(obj, (np.ndarray, np.generic)):
        return obj.tolist()
    if isinstance(obj, dict):
        return {
            # NumPy scalar keys (e.g. class labels) must become native types,
            # otherwise json.dump raises "keys must be str, int, float, ...".
            (k.item() if isinstance(k, np.generic) else k): make_serializable(v)
            for k, v in obj.items()
        }
    if isinstance(obj, (list, tuple, set, frozenset)):
        # JSON has no tuple/set type; all of them are emitted as lists.
        return [make_serializable(v) for v in obj]
    if callable(obj):
        return str(obj)  # Convert functions/methods to their string representation
    return obj


def nested_k_fold_cross_validation(
    dataset: Dataset,
    feature_names: List[str],
    feats2level: dict,
    model: BaseEstimator,
    outer_k: int = 5,
    inner_k: int = 3,
    random_state: int = 42,
    output_path: Optional[str] = 'nested_cv_metrics.json'
) -> Dict[str, List]:
    """
    Performs subject-wise nested k-fold cross-validation.

    The outer loop splits *subjects* (not samples) with a gender-stratified
    ``StratifiedKFold``; all samples of a subject go to the same side of the
    split. The inner loop runs ``GroupKFold`` grouped by ``subject_id`` on the
    outer training samples and reports the validation score of each fold. A
    fresh clone of ``model`` is then fitted on the whole outer training set
    and evaluated on the outer test set, predicting the ``'group_id'`` label.

    Subjects whose gender is missing are excluded from every fold.

    Args:
        dataset (Dataset): Object exposing ``merged_data``, a collection of
            per-sample records with at least ``subject_id`` and ``gender``.
        feature_names (List[str]): Feature names forwarded to SubsetFeatureDataset.
        feats2level (dict): Mapping forwarded unchanged to SubsetFeatureDataset.
        model (BaseEstimator): The machine learning model to train (must follow scikit-learn's estimator API).
        outer_k (int): Number of outer folds.
        inner_k (int): Number of inner folds.
        random_state (int): Seed for shuffling in the outer StratifiedKFold.
        output_path (Optional[str]): Where to write the metrics as JSON.
            Pass ``None`` to skip writing the file.

    Returns:
        Dict[str, List]: JSON-serializable metrics with one entry per outer
        fold under the keys 'accuracy', 'roc_auc', 'sensitivity',
        'specificity', 'confusion_matrix' and 'classification_report'.
    """
    # Convert merged data to DataFrame
    df = pd.DataFrame(dataset.merged_data)  # Assuming 'merged_data' is a list of dicts

    # One row per subject with a known gender. De-duplicating on subject_id
    # alone guarantees a subject can never be assigned to both the train and
    # the test side of a split, even if its gender is recorded inconsistently.
    subjects_df = (
        df[['subject_id', 'gender']]
        .dropna(subset=['gender'])
        .drop_duplicates(subset='subject_id')
        .reset_index(drop=True)
    )
    # Ensure 'gender' is string type so stratification sees uniform labels.
    subjects_df = subjects_df.assign(gender=subjects_df['gender'].astype(str))

    # Initialize StratifiedKFold for outer loop based on gender
    outer_cv = StratifiedKFold(n_splits=outer_k, shuffle=True, random_state=random_state)

    # Prepare data for outer loop
    X_subjects_outer = subjects_df['subject_id']
    y_subjects_outer = subjects_df['gender']

    # Initialize a dictionary to store metrics
    metrics = {
        'accuracy': [],
        'roc_auc': [],
        'sensitivity': [],
        'specificity': [],
        'confusion_matrix': [],
        'classification_report': []
    }

    # Outer Loop
    for outer_fold, (train_subjects_idx, test_subjects_idx) in enumerate(outer_cv.split(X_subjects_outer, y_subjects_outer), 1):
        print(f"\n=== Outer Fold {outer_fold} ===")

        # Extract outer train and test subject_ids
        outer_train_ids = set(subjects_df.iloc[train_subjects_idx]['subject_id'])
        outer_test_ids = set(subjects_df.iloc[test_subjects_idx]['subject_id'])

        # Assign samples to outer train and test sets based on subject_id
        outer_train_df = df[df['subject_id'].isin(outer_train_ids)].reset_index(drop=True)
        outer_test_df = df[df['subject_id'].isin(outer_test_ids)].reset_index(drop=True)

        # Create SubsetFeatureDataset instances
        outer_train_dataset = SubsetFeatureDataset(outer_train_df, feature_names, feats2level)
        outer_test_dataset = SubsetFeatureDataset(outer_test_df, feature_names, feats2level)

        print(f"Training subjects: {len(outer_train_ids)}")
        print(f"Testing subjects: {len(outer_test_ids)}")
        print(f"Total training samples: {len(outer_train_df)}")
        print(f"Total testing samples: {len(outer_test_df)}")

        # Extract features and labels for outer train and test sets
        X_outer_train, y_outer_train = outer_train_dataset.get_numpy(label_column='group_id')
        X_outer_test, y_outer_test = outer_test_dataset.get_numpy(label_column='group_id')

        # Initialize GroupKFold for inner loop based on subject_id
        inner_cv = GroupKFold(n_splits=inner_k)

        # Extract groups for inner loop
        # NOTE(review): assumes get_numpy returns rows in the same order as
        # outer_train_df so groups line up with X_outer_train - confirm.
        groups_outer_train = outer_train_df['subject_id'].values

        best_score = -np.inf
        best_model = None

        # Inner Loop: Model Validation.
        # Every inner fold trains the same hyperparameters, so "best" only
        # selects which fold scored highest; the model kept is an unfitted
        # clone that is refitted on the full outer training set below.
        for inner_fold, (inner_train_idx, inner_val_idx) in enumerate(inner_cv.split(X_outer_train, y_outer_train, groups=groups_outer_train), 1):
            print(f"  --- Inner Fold {inner_fold} ---")

            X_inner_train, X_inner_val = X_outer_train[inner_train_idx], X_outer_train[inner_val_idx]
            y_inner_train, y_inner_val = y_outer_train[inner_train_idx], y_outer_train[inner_val_idx]

            # Clone the model to ensure independence
            model_clone = clone(model)

            # Train the model on inner training set
            model_clone.fit(X_inner_train, y_inner_train)

            # Evaluate on inner validation set
            score = model_clone.score(X_inner_val, y_inner_val)
            print(f"    Inner Fold {inner_fold} Score: {score:.4f}")

            # Update best model if current model is better
            if score > best_score:
                best_score = score
                best_model = clone(model_clone)

        print(f"  Best Inner Fold Score: {best_score:.4f}")

        # Train the best model on the entire outer training set
        best_model.fit(X_outer_train, y_outer_train)

        # Predict on the outer test set
        y_pred = best_model.predict(X_outer_test)
        if hasattr(best_model, "predict_proba"):
            # Column 1 is the score of the second class; for non-binary labels
            # roc_auc_score below raises ValueError, which is reported as NaN.
            y_pred_proba = best_model.predict_proba(X_outer_test)[:, 1]
        else:
            # If model does not support predict_proba, use decision function
            y_pred_proba = best_model.decision_function(X_outer_test)
            # Rescale to [0, 1] when scores go negative; skip the rescale when
            # all scores are equal to avoid a 0/0 division producing NaNs.
            if np.any(y_pred_proba < 0):
                score_range = y_pred_proba.max() - y_pred_proba.min()
                if score_range > 0:
                    y_pred_proba = (y_pred_proba - y_pred_proba.min()) / score_range

        # Calculate Accuracy
        accuracy = accuracy_score(y_outer_test, y_pred)
        metrics['accuracy'].append(accuracy)
        print(f"  Outer Fold {outer_fold} Accuracy: {accuracy:.4f}")

        # Calculate ROC-AUC
        try:
            roc_auc = roc_auc_score(y_outer_test, y_pred_proba)
        except ValueError as e:
            print(f"    ROC-AUC Calculation Error: {e}")
            roc_auc = np.nan
        metrics['roc_auc'].append(roc_auc)
        print(f"  Outer Fold {outer_fold} ROC-AUC: {roc_auc:.4f}")

        # Generate Confusion Matrix
        conf_matrix = confusion_matrix(y_outer_test, y_pred)
        metrics['confusion_matrix'].append(conf_matrix.tolist())  # Convert ndarray to list
        print(f"  Outer Fold {outer_fold} Confusion Matrix:\n{conf_matrix}")

        # Generate Classification Report
        class_report = classification_report(y_outer_test, y_pred, output_dict=True)
        # Convert any numpy types within the dict to native Python types
        metrics['classification_report'].append(make_serializable(class_report))
        print(f"  Outer Fold {outer_fold} Classification Report:\n{classification_report(y_outer_test, y_pred)}")

        # Calculate Sensitivity and Specificity
        if conf_matrix.shape == (2, 2):
            tn, fp, fn, tp = conf_matrix.ravel()
            sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
            specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
            metrics['sensitivity'].append(sensitivity)
            metrics['specificity'].append(specificity)
            print(f"  Outer Fold {outer_fold} Sensitivity (Recall): {sensitivity:.4f}")
            print(f"  Outer Fold {outer_fold} Specificity: {specificity:.4f}")
        else:
            print("  Confusion matrix is not binary. Skipping Sensitivity and Specificity calculation.")
            metrics['sensitivity'].append(None)
            metrics['specificity'].append(None)

    # Convert all metrics to serializable formats
    serializable_metrics = make_serializable(metrics)

    if output_path is not None:
        try:
            # Serialize to a string first so a failure cannot leave a
            # half-written file behind.
            payload = json.dumps(serializable_metrics, indent=4)
            with open(output_path, 'w') as f:
                f.write(payload)
            print(f"\nNested K-Fold Cross-Validation completed and metrics saved to '{output_path}'.")
        except TypeError as e:
            print(f"\nSerialization Error: {e}")
            # Inspect the converted metrics to identify problematic entries
            for key, value in serializable_metrics.items():
                for idx, item in enumerate(value):
                    try:
                        json.dumps(item)
                    except TypeError:
                        print(f"Non-serializable object found in '{key}' at index {idx}: {item}")

    # Hand the per-fold metrics back to the caller, as the signature promises.
    return serializable_metrics



In [None]:
import pickle

from dataloader import *

# Example Usage of Nested K-Fold Cross-Validation
if __name__ == '__main__':
    # Import explicitly rather than relying on `from dataloader import *` or
    # an earlier cell having brought `json` into the kernel namespace.
    import json

    # Load run configuration
    config_filepath = "config.json"

    with open(config_filepath, "r") as json_file:
        config_data = json.load(json_file)

    # Read every configured entry up front so a missing key fails early.
    feats2level = config_data["feats2level"]
    allfeats_batch1 = config_data["allfeats_batch1"]
    allfeats_batch2 = config_data["allfeats_batch2"]
    features_to_load = config_data["features_to_load"]
    merged_data_pkl = config_data["merged_data_pkl"]
    base_folder = config_data["base_folder"]

    # Load merged dataset. The context manager closes the file handle, which
    # the previous bare open() call leaked.
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file - only point `merged_data_pkl` at pickles from a trusted source.
    with open(merged_data_pkl, "rb") as pickle_in:
        merged_dataset = pickle.load(pickle_in)
    print(f"Loaded dataset of type: {type(merged_dataset)}")

    # Single seed shared by the classifier and the outer CV shuffling.
    seed = 7

    # Initialize a classifier (e.g., RandomForestClassifier or LogisticRegression)
    classifier = RandomForestClassifier(
        n_estimators=100,          # Number of trees
        max_depth=None,            # Maximum depth of each tree
        min_samples_split=2,       # Minimum samples to split a node
        min_samples_leaf=1,        # Minimum samples at a leaf node
        random_state=seed,         # Seed for reproducibility
        class_weight='balanced'    # Adjust weights inversely proportional to class frequencies
    )

    # Perform Nested Cross-Validation
    metrics = nested_k_fold_cross_validation(
        dataset=merged_dataset,
        feature_names=features_to_load,
        feats2level=feats2level,
        model=classifier,
        outer_k=5,
        inner_k=5,
        random_state=seed
    )



In [None]:
# Post-process the per-fold metrics file that nested_k_fold_cross_validation
# writes to 'nested_cv_metrics.json'.
# NOTE(review): `average_metrics` is not defined in the visible cells -
# presumably it is provided by `from dataloader import *` and averages the
# metrics across folds; confirm it exists and what it returns.
metrics_averages = average_metrics('nested_cv_metrics.json')
