In [None]:
###############################################################################
# 1) IMPORTS & SETUP
###############################################################################
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import GroupShuffleSplit
from tensorflow.keras.models import load_model
from mapie.classification import MapieClassifier

import joblib
from sklearn.base import BaseEstimator, ClassifierMixin

###############################################################################
# 2) WRAPPERS FOR KERAS & SKLEARN MODELS
###############################################################################
class KerasClassifier(BaseEstimator, ClassifierMixin):
    """
    Wraps a pre-trained Keras model (.h5) so that it can be used
    by MAPIE's MapieClassifier in "prefit" mode.

    - __init__ only records the path and the class labels; nothing is loaded.
    - .fit(X, y) loads the .h5 model (no re-training; X and y are ignored).
    - .predict_proba(X) returns whatever the Keras model's .predict(X) yields,
      loading the model first if .fit() has not been called yet.
    - .predict(X) returns the label in `classes_` at the arg-max column of
      .predict_proba(X) for each sample.

    Extra keyword arguments passed to fit / predict_proba / predict are
    accepted and discarded.
    """
    def __init__(self, model_path, n_classes=11):
        self.model_path = model_path
        # Must be stored under the constructor argument's name:
        # BaseEstimator.get_params() (used by repr() and sklearn.clone) reads
        # every __init__ parameter back with getattr and raises AttributeError
        # when it is missing.
        self.n_classes = n_classes
        self.model_ = None
        self.classes_ = np.arange(n_classes)  # labels 0 .. n_classes - 1

    def _ensure_loaded(self):
        # Load the Keras model from disk once; later calls reuse it.
        if self.model_ is None:
            self.model_ = load_model(self.model_path)
        return self.model_

    def fit(self, X, y, **kwargs):
        # "Fitting" means (re)loading the stored weights; no training happens.
        self.model_ = load_model(self.model_path)
        return self

    def predict_proba(self, X, **kwargs):
        # Lazy loading keeps an un-fitted instance usable instead of failing
        # with "'NoneType' object has no attribute 'predict'".
        return self._ensure_loaded().predict(X)

    def predict(self, X, **kwargs):
        proba = self.predict_proba(X)
        max_idx = np.argmax(proba, axis=1)
        return self.classes_[max_idx]


class SklearnClassifierWrapper(BaseEstimator, ClassifierMixin):
    """
    Adapter around an already-trained classifier object.

    The prediction methods accept arbitrary keyword arguments (for example
    the 'prediction_type' argument used with MAPIE in this file) and drop
    them before delegating to the wrapped model.
    """
    def __init__(self, model):
        self.model = model
        self.classes_ = None

    def fit(self, X, y):
        # No training takes place: only the class labels are recorded.
        # They come from the wrapped model when it exposes `classes_`,
        # otherwise from the distinct values found in y.
        wrapped = self.model
        self.classes_ = wrapped.classes_ if hasattr(wrapped, "classes_") else np.unique(y)
        return self

    def predict_proba(self, X, **kwargs):
        # kwargs are intentionally not forwarded.
        wrapped = self.model
        return wrapped.predict_proba(X)

    def predict(self, X, **kwargs):
        # kwargs are intentionally not forwarded.
        wrapped = self.model
        return wrapped.predict(X)

###############################################################################
# 3) LOAD & SPLIT DATA
###############################################################################
def load_and_split_data(csv_path, test_size=0.2, random_state=42):
    """
    Read the CSV at `csv_path` and split its rows by participant.

    A single GroupShuffleSplit (groups = 'participant_id' column) produces
    two row sets; the first one is used as calibration data and the second
    one (sized by `test_size`) as test data.

    Returns (df_cal, df_test, X_cal, X_test, label_cols) where X_* hold only
    the feature columns and label_cols lists the label column names.
    """
    frame = pd.read_csv(csv_path)

    # Label columns: any column whose lower-cased name contains "slider".
    label_cols = [name for name in frame.columns if 'slider' in name.lower()]

    # Feature columns: names starting with hrv, eda, acc or num_ibis.
    feature_cols = frame.filter(regex='^(hrv|eda|acc|num_ibis)').columns.tolist()

    # One grouped shuffle split keyed on the participant id column.
    splitter = GroupShuffleSplit(
        n_splits=1,
        test_size=test_size,
        random_state=random_state,
    )
    group_ids = frame['participant_id'].values
    cal_rows, test_rows = next(splitter.split(frame, groups=group_ids))

    # Copies, so later edits do not touch the frame read from disk.
    df_cal = frame.iloc[cal_rows].copy()
    df_test = frame.iloc[test_rows].copy()

    return df_cal, df_test, df_cal[feature_cols], df_test[feature_cols], label_cols

###############################################################################
# 4) APPLY MAPIE WITH FALLBACK FOR DEGENERATE INTERVALS
###############################################################################
def apply_mapie_fallback(model, X_cal, y_cal, X_test, alpha=0.1):
    """
    Calibrate MAPIE around a pre-fitted classifier ("prefit" mode) and turn
    its prediction sets into per-sample label bounds.

    MapieClassifier.predict returns, next to the point predictions, a boolean
    membership array of shape (n_samples, n_classes, n_alpha). Exactly one
    alpha is passed here, so the last axis always has length 1: it is the
    alpha axis, not a (lower, upper) pair. The bounds are therefore read from
    the set itself, as the smallest and the largest class label that MAPIE
    kept for each sample.

    Arguments:
      - model: already-fitted estimator exposing `classes_` (in the column
        order of its predict_proba output); labels must be numeric.
      - X_cal, y_cal: calibration features and labels.
      - X_test: features to predict on.
      - alpha: level handed to MAPIE as `alpha=[alpha]`.

    Returns (y_pred, lower_bounds, upper_bounds). The bounds are float arrays
    with one entry per test sample; both are NaN where the prediction set is
    empty.

    Raises ValueError when the prediction-set array is not 3-D or its class
    axis does not match `model.classes_`.
    """
    mapie_clf = MapieClassifier(
        estimator=model,
        method="score",
        cv="prefit",  # do not retrain the underlying model
    )
    mapie_clf.fit(X_cal, y_cal)  # calibrate

    # NOTE(review): `prediction_type` is kept from the original call; it is
    # presumably forwarded to the estimator, whose wrappers discard it.
    y_pred, y_pis = mapie_clf.predict(X_test, alpha=[alpha], prediction_type="set")

    print("[MAPIE] y_pred shape:", y_pred.shape)
    print("[MAPIE] y_pis shape:", y_pis.shape)

    y_pis = np.asarray(y_pis)
    if y_pis.ndim != 3:
        raise ValueError(
            f"Expected prediction sets of shape (n_samples, n_classes, n_alpha), got {y_pis.shape}"
        )

    class_labels = np.asarray(model.classes_, dtype=float)
    if class_labels.shape[0] != y_pis.shape[1]:
        raise ValueError(
            f"model.classes_ has {class_labels.shape[0]} entries but the prediction sets "
            f"cover {y_pis.shape[1]} classes"
        )

    # Membership mask for the single requested alpha.
    in_set = y_pis[:, :, 0].astype(bool)
    has_member = in_set.any(axis=1)

    # Classes outside the set are masked with +/-inf so they never win the
    # min / max; this also works when `classes_` is not sorted.
    lower_bounds = np.where(in_set, class_labels, np.inf).min(axis=1)
    upper_bounds = np.where(in_set, class_labels, -np.inf).max(axis=1)

    # An empty set covers no label at all: report NaN rather than +/-inf.
    lower_bounds[~has_member] = np.nan
    upper_bounds[~has_member] = np.nan
    if not has_member.all():
        print(f"[MAPIE] {int((~has_member).sum())} empty prediction set(s) -> bounds set to NaN.")

    return y_pred, lower_bounds, upper_bounds

###############################################################################
# 5) MAIN EVALUATION LOOP (LSTM)
###############################################################################
def evaluate_pretrained_models(csv_path, models_base_dir):
    """
    Run MAPIE on every pre-trained LSTM model found under `models_base_dir`.

    1) Load & group-split the data at `csv_path`.
    2) For each sub-directory of `models_base_dir` whose name is one of the
       CSV label columns, look inside its 'lstm' sub-folder for *.h5 files.
    3) Wrap the first file (in sorted name order), apply MAPIE in prefit
       mode and write the result to
       results_mapie/<label>/lstm_intervals_<label>.csv.
    4) Write a summary of all processed models to
       results_mapie/all_results.csv. The path is fixed, so a summary left
       there by an earlier run is overwritten.

    Directory listings are sorted because os.listdir returns entries in
    arbitrary order; without sorting, the model picked when several .h5
    files exist (and the row order of the summary) could change between
    runs or machines.
    """
    df_cal, df_test, X_cal, X_test, label_cols = load_and_split_data(csv_path)

    # We'll handle only LSTM models
    model_types = ["lstm"]
    results = []

    labels_found = sorted(
        d for d in os.listdir(models_base_dir)
        if os.path.isdir(os.path.join(models_base_dir, d))
    )

    for label_name in labels_found:
        if label_name not in label_cols:
            print(f"[SKIP] {label_name} not found in CSV label columns.")
            continue

        # Labels for this target, cast to integer class ids.
        y_cal = df_cal[label_name].astype(int)
        y_test = df_test[label_name].astype(int)

        for mtype in model_types:
            subfolder_path = os.path.join(models_base_dir, label_name, mtype)
            if not os.path.isdir(subfolder_path):
                print(f"[SKIP] No '{mtype}' subfolder for label {label_name}")
                continue

            # Candidate model files (LSTM models are expected as .h5 files),
            # sorted so the choice below is reproducible.
            model_files = sorted(
                f for f in os.listdir(subfolder_path) if f.endswith(".h5")
            )

            if not model_files:
                print(f"[SKIP] No {mtype} model file found in {subfolder_path}")
                continue

            # Take the first model file in sorted order.
            model_file = model_files[0]
            full_model_path = os.path.join(subfolder_path, model_file)
            print(f"\n=== Processing Label: {label_name}, Model: {mtype}, File: {model_file} ===")

            # Load & wrap the LSTM model (fit only loads the file).
            wrapped_model = KerasClassifier(model_path=full_model_path, n_classes=11)
            wrapped_model.fit(X_cal, y_cal)

            # Apply MAPIE
            y_pred, lower, upper = apply_mapie_fallback(wrapped_model, X_cal, y_cal, X_test, alpha=0.1)

            # Save intervals
            out_dir = os.path.join("results_mapie", label_name)
            os.makedirs(out_dir, exist_ok=True)
            intervals_file = os.path.join(out_dir, f"{mtype}_intervals_{label_name}.csv")

            intervals_df = pd.DataFrame({
                "Index": np.arange(len(X_test)),
                "y_test": y_test,
                "y_pred": y_pred,
                "Lower_Bound": lower,
                "Upper_Bound": upper,
            })
            intervals_df.to_csv(intervals_file, index=False)
            print(f"[INFO] Intervals saved to {intervals_file}")

            results.append({
                "label": label_name,
                "model_type": mtype,
                "model_file": model_file,
                "interval_csv": intervals_file,
            })

    # Summarize all
    df_results = pd.DataFrame(results)
    summary_path = "results_mapie/all_results.csv"
    os.makedirs(os.path.dirname(summary_path), exist_ok=True)
    df_results.to_csv(summary_path, index=False)
    print(f"\n=== All results saved to {summary_path} ===")


###############################################################################
# 6) RUN IT
###############################################################################
if __name__ == "__main__":
    # Dataset that gets split into calibration and test rows.
    CSV_PATH = "cleaned_dataset_0307.csv"

    # Root folder scanned for one sub-directory per label.
    MODELS_BASE_DIR = "folds_original_models/fold_4/models"

    evaluate_pretrained_models(csv_path=CSV_PATH, models_base_dir=MODELS_BASE_DIR)


In [None]:
###############################################################################
# 1) IMPORTS & SETUP
###############################################################################
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import GroupShuffleSplit
from mapie.classification import MapieClassifier

import joblib
from sklearn.base import BaseEstimator, ClassifierMixin

###############################################################################
# 2) WRAPPERS FOR MODELS
###############################################################################
class SklearnClassifierWrapper(BaseEstimator, ClassifierMixin):
    """
    Adapter around an already-trained classifier object.

    The prediction methods accept arbitrary keyword arguments (for example
    the 'prediction_type' argument used with MAPIE in this file) and drop
    them before delegating to the wrapped model.
    """
    def __init__(self, model):
        self.model = model
        self.classes_ = None

    def fit(self, X, y):
        # No training takes place: only the class labels are recorded.
        # They come from the wrapped model when it exposes `classes_`,
        # otherwise from the distinct values found in y.
        wrapped = self.model
        self.classes_ = wrapped.classes_ if hasattr(wrapped, "classes_") else np.unique(y)
        return self

    def predict_proba(self, X, **kwargs):
        # kwargs are intentionally not forwarded.
        wrapped = self.model
        return wrapped.predict_proba(X)

    def predict(self, X, **kwargs):
        # kwargs are intentionally not forwarded.
        wrapped = self.model
        return wrapped.predict(X)

###############################################################################
# 3) LOAD & SPLIT DATA
###############################################################################
def load_and_split_data(csv_path, test_size=0.2, random_state=42):
    """
    Read the CSV at `csv_path` and split its rows by participant.

    A single GroupShuffleSplit (groups = 'participant_id' column) produces
    two row sets; the first one is used as calibration data and the second
    one (sized by `test_size`) as test data.

    Returns (df_cal, df_test, X_cal, X_test, label_cols) where X_* hold only
    the feature columns and label_cols lists the label column names.
    """
    frame = pd.read_csv(csv_path)

    # Label columns: any column whose lower-cased name contains "slider".
    label_cols = [name for name in frame.columns if 'slider' in name.lower()]

    # Feature columns: names starting with hrv, eda, acc or num_ibis.
    feature_cols = frame.filter(regex='^(hrv|eda|acc|num_ibis)').columns.tolist()

    # One grouped shuffle split keyed on the participant id column.
    splitter = GroupShuffleSplit(
        n_splits=1,
        test_size=test_size,
        random_state=random_state,
    )
    group_ids = frame['participant_id'].values
    cal_rows, test_rows = next(splitter.split(frame, groups=group_ids))

    # Copies, so later edits do not touch the frame read from disk.
    df_cal = frame.iloc[cal_rows].copy()
    df_test = frame.iloc[test_rows].copy()

    return df_cal, df_test, df_cal[feature_cols], df_test[feature_cols], label_cols

###############################################################################
# 4) APPLY MAPIE WITH FALLBACK FOR DEGENERATE INTERVALS
###############################################################################
def apply_mapie_fallback(model, X_cal, y_cal, X_test, alpha=0.1):
    """
    Calibrate MAPIE around a pre-fitted classifier ("prefit" mode) and turn
    its prediction sets into per-sample label bounds.

    MapieClassifier.predict returns, next to the point predictions, a boolean
    membership array of shape (n_samples, n_classes, n_alpha). Exactly one
    alpha is passed here, so the last axis always has length 1: it is the
    alpha axis, not a (lower, upper) pair. The bounds are therefore read from
    the set itself, as the smallest and the largest class label that MAPIE
    kept for each sample.

    Arguments:
      - model: already-fitted estimator exposing `classes_` (in the column
        order of its predict_proba output); labels must be numeric.
      - X_cal, y_cal: calibration features and labels.
      - X_test: features to predict on.
      - alpha: level handed to MAPIE as `alpha=[alpha]`.

    Returns (y_pred, lower_bounds, upper_bounds). The bounds are float arrays
    with one entry per test sample; both are NaN where the prediction set is
    empty.

    Raises ValueError when the prediction-set array is not 3-D or its class
    axis does not match `model.classes_`.
    """
    mapie_clf = MapieClassifier(
        estimator=model,
        method="score",
        cv="prefit",  # do not retrain the underlying model
    )
    mapie_clf.fit(X_cal, y_cal)  # calibrate

    # NOTE(review): `prediction_type` is kept from the original call; it is
    # presumably forwarded to the estimator, whose wrapper discards it.
    y_pred, y_pis = mapie_clf.predict(X_test, alpha=[alpha], prediction_type="set")

    print("[MAPIE] y_pred shape:", y_pred.shape)
    print("[MAPIE] y_pis shape:", y_pis.shape)

    y_pis = np.asarray(y_pis)
    if y_pis.ndim != 3:
        raise ValueError(
            f"Expected prediction sets of shape (n_samples, n_classes, n_alpha), got {y_pis.shape}"
        )

    class_labels = np.asarray(model.classes_, dtype=float)
    if class_labels.shape[0] != y_pis.shape[1]:
        raise ValueError(
            f"model.classes_ has {class_labels.shape[0]} entries but the prediction sets "
            f"cover {y_pis.shape[1]} classes"
        )

    # Membership mask for the single requested alpha.
    in_set = y_pis[:, :, 0].astype(bool)
    has_member = in_set.any(axis=1)

    # Classes outside the set are masked with +/-inf so they never win the
    # min / max; this also works when `classes_` is not sorted.
    lower_bounds = np.where(in_set, class_labels, np.inf).min(axis=1)
    upper_bounds = np.where(in_set, class_labels, -np.inf).max(axis=1)

    # An empty set covers no label at all: report NaN rather than +/-inf.
    lower_bounds[~has_member] = np.nan
    upper_bounds[~has_member] = np.nan
    if not has_member.all():
        print(f"[MAPIE] {int((~has_member).sum())} empty prediction set(s) -> bounds set to NaN.")

    return y_pred, lower_bounds, upper_bounds

###############################################################################
# 5) MAIN EVALUATION LOOP (LGBM & XGBOOST)
###############################################################################
def evaluate_pretrained_models(csv_path, models_base_dir):
    """
    Run MAPIE on every pre-trained LGBM / XGBoost model found under
    `models_base_dir`.

    1) Load & group-split the data at `csv_path`.
    2) For each sub-directory of `models_base_dir` whose name is one of the
       CSV label columns, look inside its 'lgbm' and 'xgb' sub-folders for
       *.pkl files.
    3) Load the first file (in sorted name order) with joblib, wrap it,
       apply MAPIE in prefit mode and write the result to
       results_mapie/<label>/<model_type>_intervals_<label>.csv.
    4) Write a summary of all processed models to
       results_mapie/all_results.csv. The path is fixed, so a summary left
       there by an earlier run is overwritten.

    Directory listings are sorted because os.listdir returns entries in
    arbitrary order; without sorting, the model picked when several .pkl
    files exist (and the row order of the summary) could change between
    runs or machines.
    """
    df_cal, df_test, X_cal, X_test, label_cols = load_and_split_data(csv_path)

    # We'll handle only LGBM and XGBoost models.
    model_types = ["lgbm", "xgb"]
    results = []

    labels_found = sorted(
        d for d in os.listdir(models_base_dir)
        if os.path.isdir(os.path.join(models_base_dir, d))
    )

    for label_name in labels_found:
        if label_name not in label_cols:
            print(f"[SKIP] {label_name} not found in CSV label columns.")
            continue

        # Labels for this target, cast to integer class ids.
        y_cal = df_cal[label_name].astype(int)
        y_test = df_test[label_name].astype(int)

        for mtype in model_types:
            subfolder_path = os.path.join(models_base_dir, label_name, mtype)
            if not os.path.isdir(subfolder_path):
                print(f"[SKIP] No '{mtype}' subfolder for label {label_name}")
                continue

            # Candidate model files (.pkl), sorted so the choice below is
            # reproducible.
            model_files = sorted(
                f for f in os.listdir(subfolder_path) if f.endswith(".pkl")
            )

            if not model_files:
                print(f"[SKIP] No {mtype} model file found in {subfolder_path}")
                continue

            # Take the first model file in sorted order.
            model_file = model_files[0]
            full_model_path = os.path.join(subfolder_path, model_file)
            print(f"\n=== Processing Label: {label_name}, Model: {mtype}, File: {model_file} ===")

            # NOTE: joblib.load unpickles the file and can execute arbitrary
            # code while doing so; only point this at model files you trust.
            loaded_model = joblib.load(full_model_path)
            wrapped_model = SklearnClassifierWrapper(loaded_model)
            wrapped_model.fit(X_cal, y_cal)  # sets self.classes_

            # Apply MAPIE
            y_pred, lower, upper = apply_mapie_fallback(wrapped_model, X_cal, y_cal, X_test, alpha=0.1)

            # Save intervals (results are saved in the same 'results_mapie' folder)
            out_dir = os.path.join("results_mapie", label_name)
            os.makedirs(out_dir, exist_ok=True)
            intervals_file = os.path.join(out_dir, f"{mtype}_intervals_{label_name}.csv")

            intervals_df = pd.DataFrame({
                "Index": np.arange(len(X_test)),
                "y_test": y_test,
                "y_pred": y_pred,
                "Lower_Bound": lower,
                "Upper_Bound": upper,
            })
            intervals_df.to_csv(intervals_file, index=False)
            print(f"[INFO] Intervals saved to {intervals_file}")

            results.append({
                "label": label_name,
                "model_type": mtype,
                "model_file": model_file,
                "interval_csv": intervals_file,
            })

    # Summarize all results
    df_results = pd.DataFrame(results)
    summary_path = "results_mapie/all_results.csv"
    os.makedirs(os.path.dirname(summary_path), exist_ok=True)
    df_results.to_csv(summary_path, index=False)
    print(f"\n=== All results saved to {summary_path} ===")

###############################################################################
# 6) RUN IT
###############################################################################
if __name__ == "__main__":
    # Dataset that gets split into calibration and test rows.
    CSV_PATH = "cleaned_dataset_0307.csv"

    # Root folder scanned for one sub-directory per label (LGBM & XGBoost models).
    MODELS_BASE_DIR = "folds_original_LGBM_XGB_models/fold_4/models"

    evaluate_pretrained_models(csv_path=CSV_PATH, models_base_dir=MODELS_BASE_DIR)
