<a href="https://colab.research.google.com/github/tousifo/ml_notebooks/blob/main/ALS_QNN_PRO_ACT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Cell 1: Setup
This first cell grabs the basics we need: it installs Qiskit, imports the pieces for the circuit and optimizer, and brings in simple tools to split and scale the data plus quick checks for error and correlation. If installing `qiskit_algorithms` fails, try `qiskit-algorithms` instead. We scale the inputs so the encoding angles stay in a small range, which makes training smoother. Last thing: set the feature map's dimension to match the number of features you end up with.

In [4]:
from sklearn.model_selection import train_test_split  # quick train/validation split
from sklearn.preprocessing import MinMaxScaler        # keep features in a compact range for angle encoding
from sklearn.metrics import mean_squared_error        # regression loss (lower is better)
from scipy.stats import pearsonr                      # correlation between predictions and targets (closer to 1 is better)
%pip install qiskit~=1.0 qiskit-machine-learning~=0.8.1 qiskit_algorithms  # pinned install; if it fails, try 'qiskit-algorithms' manually

# Qiskit Imports
from qiskit.circuit.library import ZZFeatureMap, RealAmplitudes  # feature map + ansatz for the variational circuit
from qiskit_algorithms.optimizers import COBYLA                  # gradient-free optimizer suited to noisy objectives
from qiskit_machine_learning.algorithms.regressors import VQR    # variational quantum regressor wrapper
from qiskit.primitives import Sampler                            # primitive that evaluates circuits (shot-based)
# Install pennylane and lightning plugins
%pip install pennylane pennylane-lightning[gpu] torch torchvision torchaudio



# 2nd Cell
This cell takes the raw PRO-ACT CSVs and turns them into a clean, ready-to-use dataset. It lines up each person’s timeline by their first ALSFRS visit, builds simple summary features from the first 0–90 days across the available tables (ALSFRS, FVC, vitals, labs, grip, muscle), tidies ALSFRS-R items, and uses the best FVC trial at each test time. For every signal it makes seven summaries (min, max, median, std, first, last, slope) and drops columns with lots of missing values (>30%). It doesn’t do any scaling or encoding here—you’ll do that later while training. Only people with ALSFRS measurements after both 3 months and 12 months are kept, and the target is the slope between the first record after 3 months and the first after 12 months. Tables without a time column are skipped, and messy wide/long layouts are handled as best as possible. Time columns are detected by name and treated as days, so make sure each table’s time column is expressed in days and has “delta” or “day” in its name. The cell saves `final_processed_als_data.csv` and also returns the features (`X`), target (`y`), subject IDs, and the joined data frame.


In [2]:
import pandas as pd
import numpy as np
import warnings
from typing import Dict, Optional, List

warnings.filterwarnings("ignore")  # keep notebook output tidy; data has mixed types/old columns


class ALSDataProcessor:
    """
    CV-safe preprocessing for PRO-ACT to reproduce the paper's EDA:
      - Anchor to FIRST ALSFRS visit (t=0) per subject
      - Inputs: first 3 months (0–90 days from anchor) for all longitudinal tables
      - Outcome: ALSFRS Total slope between FIRST-after-3mo and FIRST-after-12mo
      - ALSFRS-R harmonization hooks (Q10 from 10a; merge Q5a/Q5b)
      - FVC reduced to max-of-trials per test before summarization
      - Seven summaries: min, max, median, std, first, last, slope (slope=NaN if only 1 obs)
      - Drop features with >30% missing (no other transforms here — avoid leakage)
    """

    def __init__(self):
        # identifiers/time-like columns we should NOT summarize as numeric features
        self.id_and_delta_cols = {
            "subject_id",
            "alsfrs_delta",
            "fvc_delta",
            "vitals_delta",
            "labs_delta",
            "grip_delta",
            "muscle_delta",
            "onset_delta",
            "death_delta",
            "history_delta",
            "anchor_days",
            "days_from_alsfrs_anchor",
        }

    # --------- Utilities ---------

    @staticmethod
    def _find_time_col(df: pd.DataFrame) -> Optional[str]:
        """Find a time column that represents days since baseline in a table."""
        # Prefer delta
        for c in df.columns:
            lc = c.lower()
            if "delta" in lc:
                return c
        # Fallback to 'days' if present
        for c in df.columns:
            lc = c.lower()
            if "day" in lc:
                return c
        return None

    # --------- ALSFRS-R harmonization ---------

    def _convert_alsfrs_r(self, alsfrs_df: pd.DataFrame) -> pd.DataFrame:
        """
        Prepare ALSFRS table. If ALSFRS-R subitems exist, map per paper:
          - Q10 <- 10a (dyspnea). Ignore 10b/10c.
          - Merge Q5a/Q5b into Q5 if present.
        If only totals exist, this is a no-op aside from coercions.
        """
        df = alsfrs_df.copy()

        if "ALSFRS_Total" in df.columns:
            df["ALSFRS_Total"] = pd.to_numeric(df["ALSFRS_Total"], errors="coerce")

        # Try to locate subitems by loose names
        cols = {c.lower(): c for c in df.columns}

        # Q10 from 10a (dyspnea)
        for candidate in ["alsfrs_r_q10a", "q10a", "dyspnea", "alsfrs_q10a"]:
            if candidate in cols:
                df["Q10"] = pd.to_numeric(df[cols[candidate]], errors="coerce")
                break

        # Merge Q5a/Q5b
        q5a = next(
            (cols[k] for k in ["alsfrs_r_q5a", "q5a", "cutting_wout_gastrostomy"] if k in cols),
            None,
        )
        q5b = next(
            (cols[k] for k in ["alsfrs_r_q5b", "q5b", "cutting_with_gastrostomy"] if k in cols),
            None,
        )
        if q5a and q5b:
            q5a_vals = pd.to_numeric(df[q5a], errors="coerce").values
            q5b_vals = pd.to_numeric(df[q5b], errors="coerce").values
            df["Q5"] = np.nanmax(np.vstack([q5a_vals, q5b_vals]), axis=0)

        return df

    # --------- Anchoring ---------

    def _alsfrs_anchor_days(self, alsfrs_df: pd.DataFrame) -> pd.Series:
        """
        Compute per-subject anchor day = first ALSFRS visit (min delta/days).
        """
        df = alsfrs_df.copy()
        tcol = self._find_time_col(df)
        if tcol is None:
            raise ValueError("ALSFRS table lacks a time delta/days column.")

        df.rename(columns={tcol: "alsfrs_delta"}, inplace=True)
        anchor_map = df.groupby("subject_id")["alsfrs_delta"].min()
        return anchor_map

    # --------- Data I/O ---------

    def load_and_inspect_data(self, file_path: str = "") -> Dict[str, pd.DataFrame]:
        datasets: Dict[str, pd.DataFrame] = {}
        file_list = [
            "PROACT_ALSFRS.csv",
            "PROACT_FVC.csv",
            "PROACT_VITALSIGNS.csv",
            "PROACT_RILUZOLE.csv",
            "PROACT_DEMOGRAPHICS.csv",
            "PROACT_LABS.csv",
            "PROACT_DEATHDATA.csv",
            "PROACT_HANDGRIPSTRENGTH.csv",
            "PROACT_MUSCLESTRENGTH.csv",
            "PROACT_ALSHISTORY.csv",
        ]
        print("--- Loading and Inspecting Data ---")
        for file_name in file_list:
            try:
                df = pd.read_csv(file_path + file_name, on_bad_lines="skip")
                # normalize subject_id
                if "subject_id" not in df.columns:
                    potential = [c for c in df.columns if "subject" in c.lower()]
                    if potential:
                        df = df.rename(columns={potential[0]: "subject_id"})
                # coerce delta-like numeric columns
                for c in df.columns:
                    if "delta" in c.lower() or "day" in c.lower():
                        df[c] = pd.to_numeric(df[c], errors="coerce")
                datasets[file_name] = df
                print(f"✓ {file_name}: {df.shape}")
            except FileNotFoundError:
                print(f"✗ {file_name}: File not found (skipped).")
        return datasets

    # --------- Outcome ---------

    def calculate_alsfrs_slope(self, alsfrs_df: pd.DataFrame) -> pd.DataFrame:
        """
        Outcome = slope between FIRST-after-3mo and FIRST-after-12mo ALSFRS totals,
        with time anchored to first ALSFRS visit.
        """
        df = alsfrs_df.copy()
        tcol = self._find_time_col(df)
        if tcol is None:
            raise ValueError("ALSFRS table lacks a time delta/days column.")
        if "ALSFRS_Total" not in df.columns:
            raise ValueError("ALSFRS_Total missing in ALSFRS table.")

        df.rename(columns={tcol: "alsfrs_delta"}, inplace=True)
        # Anchor
        anchor_map = df.groupby("subject_id")["alsfrs_delta"].min()
        df["days_from_anchor"] = df["alsfrs_delta"] - df["subject_id"].map(anchor_map)
        df["months"] = df["days_from_anchor"] / 30.44

        df = df.sort_values(["subject_id", "months"])
        slopes = {}

        for sid, g in df.groupby("subject_id", sort=False):
            g = g.dropna(subset=["months", "ALSFRS_Total"])
            t1 = g[g["months"] > 3.0].head(1)
            t2 = g[g["months"] > 12.0].head(1)
            if not t1.empty and not t2.empty:
                t1m = float(t1["months"].iloc[0])
                t2m = float(t2["months"].iloc[0])
                t1v = float(t1["ALSFRS_Total"].iloc[0])
                t2v = float(t2["ALSFRS_Total"].iloc[0])
                if t2m > t1m:
                    slopes[sid] = (t2v - t1v) / (t2m - t1m)

        return pd.DataFrame({"subject_id": list(slopes.keys()), "alsfrs_slope": list(slopes.values())})

    # --------- FVC collapse ---------

    @staticmethod
    def _fvc_collapse_trials(df: pd.DataFrame, time_col: str) -> pd.DataFrame:
        """
        Reduce FVC per row/time to the max across trials before summarization.
        Tries to detect typical trial columns; falls back gracefully.
        """
        d = df.copy()
        # Find obvious trial columns
        trial_cols = [c for c in d.columns if "trial" in c.lower()]
        # Some datasets have explicit liters columns per trial name
        if trial_cols:
            d["FVC_Liters"] = pd.to_numeric(d[trial_cols].max(axis=1), errors="coerce")
            keep = ["subject_id", time_col, "FVC_Liters"]
            return d[keep]
        # Fallbacks: look for liters column names
        liter_like = [c for c in d.columns if "liter" in c.lower() or "fvc" in c.lower()]
        if liter_like:
            # If multiple, take row-wise max
            d["FVC_Liters"] = pd.to_numeric(d[liter_like].max(axis=1), errors="coerce")
            keep = ["subject_id", time_col, "FVC_Liters"]
            return d[keep]
        # Last resort: return as-is
        return d

    # --------- Longitudinal summarization ---------

    def create_longitudinal_features(self, df: pd.DataFrame, time_col: str, prefix: str) -> pd.DataFrame:
        """
        Create 7 summaries over [0, 90] days from ALSFRS anchor:
          min, max, median, std, first, last, slope(first→last)
        Slope remains NaN if only one observation or zero time span.
        """
        if time_col not in df.columns:
            return pd.DataFrame()

        d = df.copy()
        # Coerce numerics (but keep subject_id/time cols)
        for c in d.columns:
            if c not in {"subject_id", time_col}:
                d[c] = pd.to_numeric(d[c], errors="coerce")

        # Ensure window is 0..90 days from ALSFRS anchor (already anchored)
        d = d[(d[time_col] >= 0) & (d[time_col] <= 90)].copy()
        if d.empty:
            return pd.DataFrame()

        # Value columns (exclude identifiers/derived delta/time)
        val_cols = [
            c
            for c in d.select_dtypes(include=[np.number]).columns
            if c not in self.id_and_delta_cols and c not in {"subject_id", time_col}
        ]
        if not val_cols:
            return pd.DataFrame()

        out = []
        g = d.groupby("subject_id", as_index=True)
        for col in val_cols:
            agg = g[col].agg(["min", "max", "median", "first", "last"])
            std_ = g[col].std(ddof=0).rename("std")
            slope = g.apply(
                lambda x: (x[col].iloc[-1] - x[col].iloc[0]) / max(1e-9, (x[time_col].iloc[-1] - x[time_col].iloc[0]))
                if len(x) > 1 and (x[time_col].iloc[-1] - x[time_col].iloc[0]) > 0
                else np.nan
            ).rename("slope")
            feat = pd.concat([agg, std_, slope], axis=1)
            feat.columns = [f"{prefix}{col}_{cname}" for cname in feat.columns]
            out.append(feat)

        return pd.concat(out, axis=1).reset_index()

    # --------- Static table processing (no encoding here to avoid leakage) ---------

    @staticmethod
    def process_static_data(df: pd.DataFrame) -> pd.DataFrame:
        """
        CV-safe: DO NOT encode here. Just keep one row per subject.
        (Do categorical encoding in your modeling pipeline.)
        """
        if "subject_id" not in df.columns:
            return pd.DataFrame()
        # Keep first non-duplicated row per subject_id
        return df.drop_duplicates(subset=["subject_id"]).copy()

    # --------- Merge features ---------

    def merge_all_features(self, datasets: Dict[str, pd.DataFrame]) -> pd.DataFrame:
        if "PROACT_DEMOGRAPHICS.csv" not in datasets:
            raise ValueError("Demographics file is missing.")

        # Build ALSFRS anchor map
        alsfrs = datasets["PROACT_ALSFRS.csv"]
        anchor_map = self._alsfrs_anchor_days(alsfrs)

        # Start with demographics (static)
        final_df = self.process_static_data(datasets["PROACT_DEMOGRAPHICS.csv"])

        # Add static-ish other tables (keep CV-safe; no encodings)
        for file in ["PROACT_RILUZOLE.csv", "PROACT_ALSHISTORY.csv"]:
            if file in datasets:
                static_df = self.process_static_data(datasets[file])
                final_df = pd.merge(final_df, static_df, on="subject_id", how="left")

        # Longitudinal configs
        longitudinal = {
            "PROACT_ALSFRS.csv": "alsfrs_",
            "PROACT_FVC.csv": "fvc_",
            "PROACT_VITALSIGNS.csv": "vitals_",
            "PROACT_LABS.csv": "labs_",
            "PROACT_HANDGRIPSTRENGTH.csv": "grip_",
            "PROACT_MUSCLESTRENGTH.csv": "muscle_",
        }

        print("\n--- Generating Longitudinal Features (anchored to first ALSFRS; window = 0–90 days) ---")
        for file, prefix in longitudinal.items():
            if file not in datasets:
                continue

            df = datasets[file].copy()
            tcol = self._find_time_col(df)
            if tcol is None:
                print(f"Warning: No time delta/days column in {file}. Skipping.")
                continue

            # Anchor this table to ALSFRS first visit
            df["anchor_days"] = df["subject_id"].map(anchor_map)
            df = df[~df["anchor_days"].isna()].copy()
            df["days_from_alsfrs_anchor"] = pd.to_numeric(df[tcol], errors="coerce") - df["anchor_days"]

            # FVC special handling: collapse to max-of-trials BEFORE summarization
            if file == "PROACT_FVC.csv":
                df = self._fvc_collapse_trials(df, time_col="days_from_alsfrs_anchor")

            # Attempt to pivot long-form measurement tables (best effort)
            if file in {"PROACT_LABS.csv", "PROACT_MUSCLESTRENGTH.csv", "PROACT_HANDGRIPSTRENGTH.csv"}:
                try:
                    test_cols = [
                        c
                        for c in df.columns
                        if c not in {"subject_id", "days_from_alsfrs_anchor", "anchor_days"}
                        and any(k in c.lower() for k in ["test", "exam", "muscle", "site", "name", "strength_test"])
                    ]
                    value_cols = [
                        c
                        for c in df.columns
                        if c not in {"subject_id", "days_from_alsfrs_anchor", "anchor_days"}
                        and any(k in c.lower() for k in ["result", "value", "strength", "score"])
                    ]
                    if test_cols and value_cols:
                        tcol_name = test_cols[0]
                        vcol_name = value_cols[0]
                        df[vcol_name] = pd.to_numeric(df[vcol_name], errors="coerce")
                        df = (
                            df.pivot_table(
                                index=["subject_id", "days_from_alsfrs_anchor"],
                                columns=tcol_name,
                                values=vcol_name,
                                aggfunc="mean",
                            )
                            .reset_index()
                        )
                except Exception as e:
                    print(f"Warning: Pivoting failed for {file}: {e}")

            feats = self.create_longitudinal_features(df, "days_from_alsfrs_anchor", prefix)
            if not feats.empty:
                final_df = pd.merge(final_df, feats, on="subject_id", how="left")

        return final_df

    # --------- Eligibility ---------

    def filter_eligible_patients(self, feature_df: pd.DataFrame, alsfrs_df: pd.DataFrame) -> pd.DataFrame:
        """
        Keep subjects who have ANY ALSFRS >3 months AND >12 months AFTER the ALSFRS anchor.
        """
        df = alsfrs_df.copy()
        tcol = self._find_time_col(df)
        if tcol is None:
            raise ValueError("ALSFRS table lacks a time delta/days column.")

        df.rename(columns={tcol: "alsfrs_delta"}, inplace=True)
        anchor_map = df.groupby("subject_id")["alsfrs_delta"].min()
        df["days_from_anchor"] = df["alsfrs_delta"] - df["subject_id"].map(anchor_map)
        df["months"] = df["days_from_anchor"] / 30.44

        g = df.groupby("subject_id")["months"]
        has_t1 = g.apply(lambda s: (s > 3.0).any())
        has_t2 = g.apply(lambda s: (s > 12.0).any())
        eligible_ids = has_t1[has_t1].index.intersection(has_t2[has_t2].index)

        print(f"\nEligible patients: {len(eligible_ids)} / {df['subject_id'].nunique()}")
        return feature_df[feature_df["subject_id"].isin(eligible_ids)].copy()

    # --------- Orchestration ---------

    def run_pipeline(self, file_path: str = "") -> Optional[Dict[str, pd.DataFrame]]:
        """
        End-to-end EDA (CV-safe) that writes 'final_processed_als_data.csv'.
        No imputation/scaling/feature selection here — do that inside your CV pipeline.

        Steps, in order: load the CSVs, harmonize the ALSFRS table, compute the
        slope target, build the feature table, keep eligible subjects, inner-join
        features with the target, drop columns with more than 30% missing values,
        and write the result to the current working directory.

        Args:
            file_path: Passed straight to ``load_and_inspect_data``.

        Returns:
            A dict with keys "X" (feature frame), "y" (target series),
            "subject_ids" (series) and "raw_frame" (the frame that was written
            to disk: subject_id, target, then features). Returns None when the
            ALSFRS table could not be loaded or no target column is left.
        """
        print("====== Starting ALS Data Preprocessing Pipeline ======")
        datasets = self.load_and_inspect_data(file_path)
        if "PROACT_ALSFRS.csv" not in datasets:
            # Anchor and target both come from ALSFRS; nothing can be built without it.
            print("CRITICAL ERROR: PROACT_ALSFRS.csv not found. Aborting.")
            return None

        # ALSFRS prep + anchor
        datasets["PROACT_ALSFRS.csv"] = self._convert_alsfrs_r(datasets["PROACT_ALSFRS.csv"])

        # Outcome
        target_df = self.calculate_alsfrs_slope(datasets["PROACT_ALSFRS.csv"])
        print(f"\nCalculated ALSFRS slope for {len(target_df)} patients.")

        # Features
        full_features = self.merge_all_features(datasets)

        # Eligibility
        eligible_features = self.filter_eligible_patients(full_features, datasets["PROACT_ALSFRS.csv"])

        # Join features + target (inner join: only subjects that have both survive)
        final_df = pd.merge(eligible_features, target_df, on="subject_id", how="inner")

        # Drop features with >30% missing
        print("\n--- Handling Missing Values (Dropping cols with >30% missing) ---")
        initial_cols = len(final_df.columns)
        missing_thresh = 0.30
        # dropna(thresh=...) counts NON-missing cells, hence the (1 - threshold) conversion.
        min_non_na = int(np.ceil(len(final_df) * (1 - missing_thresh)))
        final_df = final_df.dropna(axis=1, thresh=min_non_na)
        dropped = initial_cols - len(final_df.columns)
        print(f"Dropped {dropped} columns for >{int(missing_thresh*100)}% missingness.")

        # Separate X/y (no transforms here to avoid leakage)
        if "alsfrs_slope" not in final_df.columns:
            print("No target available after merges. Aborting.")
            return None

        # Keep only rows that actually have a target value.
        y = final_df["alsfrs_slope"]
        valid = y.notna()
        final_df = final_df.loc[valid].reset_index(drop=True)

        subject_ids = final_df["subject_id"]
        y = final_df["alsfrs_slope"]
        X = final_df.drop(columns=["subject_id", "alsfrs_slope"])

        # Save CV-safe engineered dataset (raw features)
        out = pd.concat([subject_ids, y, X], axis=1)
        out.to_csv("final_processed_als_data.csv", index=False)
        print("\n✅ Saved CV-safe engineered data to 'final_processed_als_data.csv'")
        print(f"Feature matrix shape: {X.shape} | Target length: {len(y)}")

        return {"X": X, "y": y, "subject_ids": subject_ids, "raw_frame": out}


if __name__ == "__main__":
    # Entry point for this cell: run the preprocessing pipeline once and show
    # the first few feature names as a sanity check.
    # If your CSVs live elsewhere, set file_path accordingly (e.g., "C:/data/PROACT/")
    file_path = ""
    processor = ALSDataProcessor()
    processed = processor.run_pipeline(file_path=file_path)
    # run_pipeline returns None when it had to abort; only preview on success.
    if processed is not None:
        print("\nPreview of columns:", list(processed["X"].columns)[:10])
        print("Done.")


--- Loading and Inspecting Data ---
✓ PROACT_ALSFRS.csv: (73845, 20)
✓ PROACT_FVC.csv: (49110, 10)
✓ PROACT_VITALSIGNS.csv: (84721, 36)
✓ PROACT_RILUZOLE.csv: (10363, 3)
✓ PROACT_DEMOGRAPHICS.csv: (12504, 14)
✓ PROACT_LABS.csv: (2937162, 5)
✓ PROACT_DEATHDATA.csv: (5043, 3)
✓ PROACT_HANDGRIPSTRENGTH.csv: (19032, 11)
✓ PROACT_MUSCLESTRENGTH.csv: (204875, 10)
✓ PROACT_ALSHISTORY.csv: (13765, 16)

Calculated ALSFRS slope for 1897 patients.

--- Generating Longitudinal Features (anchored to first ALSFRS; window = 0–90 days) ---

Eligible patients: 3317 / 8538

--- Handling Missing Values (Dropping cols with >30% missing) ---
Dropped 1413 columns for >30% missingness.

✅ Saved CV-safe engineered data to 'final_processed_als_data.csv'
Feature matrix shape: (1897, 346) | Target length: 1897

Preview of columns: ['Demographics_Delta', 'Age', 'Race_Caucasian', 'Sex', 'Subject_used_Riluzole', 'Riluzole_use_Delta', 'Subject_ALS_History_Delta', 'Site_of_Onset', 'alsfrs_Q1_Speech_min', 'alsfrs_Q1_S

# 3rd cell
This cell trains two quick, reliable baselines on `final_processed_als_data.csv`. It does a simple 80/20 split, figures out which columns are numbers or categories, fills missing values, scales only for SVR, and one-hot encodes any categories. It then tries several parameter settings with successive halving, which drops weak candidates early, and uses a cache so repeated preprocessing steps don’t run again. It tunes both a Random Forest (no scaling needed) and an RBF SVR (with scaling). After tuning, it tests on the hold-out set and prints error and correlation with 95% confidence ranges, plus the best settings it found. It also prints a quick 50/50 blend of RF and SVR as a sanity check. If you just want speed, you can skip the SVR block—Random Forest alone is often strong here.


In [None]:
import numpy as np
import pandas as pd
from typing import Tuple, Dict
import warnings
warnings.filterwarnings("ignore")

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle

# Halving search (successive halving)
from sklearn.experimental import enable_halving_search_cv  # noqa: F401
from sklearn.model_selection import HalvingGridSearchCV

# caching
from joblib import Memory

np.random.seed(42)

# ---------- Metrics ----------
def rmse(y_true, y_pred) -> float:
    """Return the root-mean-squared error between targets and predictions."""
    mse = mean_squared_error(y_true, y_pred)
    return float(np.sqrt(mse))

def safe_pcc(y_true, y_pred) -> float:
    """
    Pearson correlation that never divides by zero.

    Both inputs are flattened to float arrays. If either one is (numerically)
    constant — standard deviation below 1e-12 — the correlation is undefined
    and 0.0 is returned instead.
    """
    truth, pred = (np.asarray(values, dtype=float).ravel() for values in (y_true, y_pred))
    if any(arr.std() < 1e-12 for arr in (truth, pred)):
        return 0.0
    corr_matrix = np.corrcoef(truth, pred)
    return float(corr_matrix[0, 1])

def bootstrap_ci(y_true, y_pred, metric_fn, n_boot=800, alpha=0.95, seed=42) -> Tuple[float, float]:
    """
    Percentile-bootstrap interval for a metric on paired targets/predictions.

    Pairs are resampled with replacement ``n_boot`` times using a generator
    seeded with ``seed``; ``metric_fn(y_true_sample, y_pred_sample)`` is
    evaluated on each resample. ``alpha`` is the coverage level (0.95 gives
    the 2.5th and 97.5th percentiles). Returns ``(low, high)``.
    """
    rng = np.random.default_rng(seed)
    truth = np.asarray(y_true).ravel()
    pred = np.asarray(y_pred).ravel()
    size = len(truth)
    positions = np.arange(size)

    # The generator is consumed lazily, so draws and metric evaluations
    # interleave one resample at a time.
    resamples = (rng.choice(positions, size=size, replace=True) for _ in range(n_boot))
    draws = [metric_fn(truth[pick], pred[pick]) for pick in resamples]

    lower_q = (1 - alpha) / 2 * 100
    upper_q = (1 + alpha) / 2 * 100
    return float(np.percentile(draws, lower_q)), float(np.percentile(draws, upper_q))

# ---------- Main ----------
def _halving_search(pipe, grid):
    """Wrap *pipe* in the successive-halving grid search shared by both baselines."""
    return HalvingGridSearchCV(
        pipe,
        grid,
        factor=3,
        resource="n_samples",
        min_resources="exhaust",
        cv=3,
        scoring="neg_root_mean_squared_error",  # optimize RMSE directly
        n_jobs=-1,
        verbose=0,
        refit=True,
        # Early rounds train on a random subsample; pin it so reruns agree
        # without relying on the global NumPy seed.
        random_state=42,
    )


def _metric_row(y_true, y_pred, rmse_seed, pcc_seed) -> Dict[str, float]:
    """Return RMSE and PCC with bootstrap 95% CIs, keyed by report column name."""
    rmse_lo, rmse_hi = bootstrap_ci(y_true, y_pred, rmse, n_boot=800, alpha=0.95, seed=rmse_seed)
    pcc_lo, pcc_hi = bootstrap_ci(y_true, y_pred, safe_pcc, n_boot=800, alpha=0.95, seed=pcc_seed)
    return {
        "RMSE": rmse(y_true, y_pred),
        "RMSE 95% CI Low": rmse_lo,
        "RMSE 95% CI High": rmse_hi,
        "PCC": safe_pcc(y_true, y_pred),
        "PCC 95% CI Low": pcc_lo,
        "PCC 95% CI High": pcc_hi,
    }


def run_classical_pipeline_fast() -> pd.DataFrame:
    """
    Tune and evaluate the Random Forest and RBF-SVR baselines.

    Reads 'final_processed_als_data.csv' from the working directory, makes an
    80/20 train/test split, tunes each model with successive-halving grid
    search (3-fold CV, RMSE scoring) and prints test-set RMSE and Pearson
    correlation with bootstrap 95% intervals, followed by the same metrics
    for a plain 50/50 average of the two models' predictions. Fitted
    preprocessing steps are cached on disk under 'sk_cache'.

    Returns:
        DataFrame indexed by model name with the per-model test metrics
        (the ensemble is printed only, not included).
    """
    print("====== FAST Classical Baselines (successive halving, cached) ======")

    # 1) Load engineered data
    df = pd.read_csv("final_processed_als_data.csv")
    print(f"✓ Loaded engineered dataset: {df.shape}")

    X = df.drop(columns=["subject_id", "alsfrs_slope"])
    y = df["alsfrs_slope"].astype(float)

    # Optional quick shuffle for better fold homogeneity
    X, y = shuffle(X, y, random_state=42)

    # 80/20 split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42
    )
    print(f"Split: train={X_train.shape[0]}, test={X_test.shape[0]}")

    # 2) Column typing (decided on the training split only)
    num_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = X_train.select_dtypes(exclude=[np.number]).columns.tolist()
    print(f"Detected numeric={len(num_cols)}, categorical={len(cat_cols)}")

    # Pipeline cache
    memory = Memory(location="sk_cache", verbose=0)

    # Preprocessors
    # Numeric: impute → (scaler only in the SVR branch; trees do not need it)
    num_rf = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
    ], memory=memory)

    num_svr = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ], memory=memory)

    if len(cat_cols) > 0:
        cat_common = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
        ], memory=memory)
        preproc_rf = ColumnTransformer(
            transformers=[("num", num_rf, num_cols), ("cat", cat_common, cat_cols)],
            remainder="drop"
        )
        preproc_svr = ColumnTransformer(
            transformers=[("num", num_svr, num_cols), ("cat", cat_common, cat_cols)],
            remainder="drop"
        )
    else:
        # No categoricals → simpler (faster) preprocessors
        preproc_rf = num_rf
        preproc_svr = num_svr

    # 3) Pipelines (with small but effective grids)
    rf_pipe = Pipeline(steps=[
        ("preprocess", preproc_rf),
        ("select", SelectKBest(score_func=f_regression, k="all")),
        ("model", RandomForestRegressor(random_state=42, n_jobs=-1))
    ], memory=memory)

    rf_grid: Dict[str, list] = {
        # keep imputer fixed (median) to avoid recomputing transforms
        "select__k": ["all", 50],              # feature count toggle
        "model__n_estimators": [250],          # enough trees, faster than 500
        "model__max_depth": [None, 12],
        "model__min_samples_leaf": [1, 2],
        "model__max_features": ["sqrt"],       # stable setting
    }

    # Successive halving (aggressive elimination reduces fits)
    rf_search = _halving_search(rf_pipe, rf_grid)
    print("\n--- Fitting RandomForest (HalvingGridSearchCV, cv=3) ---")
    rf_search.fit(X_train, y_train)
    print(f"RF best params: {rf_search.best_params_}")

    # SVR (trimmed grid; if you need even faster, comment this whole block)
    svr_pipe = Pipeline(steps=[
        ("preprocess", preproc_svr),
        ("select", SelectKBest(score_func=f_regression, k="all")),
        ("model", SVR(kernel="rbf"))
    ], memory=memory)

    svr_grid: Dict[str, list] = {
        "select__k": ["all", 50],
        "model__C": [1.0, 3.0],
        "model__epsilon": [0.1],
        "model__gamma": ["scale"],
    }

    svr_search = _halving_search(svr_pipe, svr_grid)
    print("\n--- Fitting SVR (HalvingGridSearchCV, cv=3) ---")
    svr_search.fit(X_train, y_train)
    print(f"SVR best params: {svr_search.best_params_}")

    # 4) Test-set evaluation + (faster) bootstrap CIs
    y_true = y_test.values
    predictions = {}  # model name -> test predictions, reused for the blend
    results = []
    for name, search in [("Random Forest", rf_search), ("SVR (RBF)", svr_search)]:
        y_pred = search.best_estimator_.predict(X_test)
        predictions[name] = y_pred
        results.append({"Model": name, **_metric_row(y_true, y_pred, 123, 456)})

    results_df = pd.DataFrame(results).set_index("Model")
    print("\n====== Test Set Performance (FAST mode) ======")
    print(results_df.round(4))

    # Optional quick 50–50 blend (no extra CV, no second predict pass)
    ens_pred = 0.5 * (predictions["Random Forest"] + predictions["SVR (RBF)"])
    ens_row = _metric_row(y_true, ens_pred, 789, 101112)

    print("\n--- Simple RF+SVR Avg Ensemble (FAST) ---")
    print(pd.DataFrame([ens_row], index=["RF+SVR Ensemble"]).round(4))

    return results_df


if __name__ == "__main__":
    # Entry point for this cell: fit both baselines and print their test metrics.
    run_classical_pipeline_fast()


✓ Loaded engineered dataset: (1897, 348)
Split: train=1517, test=380
Detected numeric=343, categorical=3

--- Fitting RandomForest (HalvingGridSearchCV, cv=3) ---
RF best params: {'model__max_depth': None, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 2, 'model__n_estimators': 250, 'select__k': 50}

--- Fitting SVR (HalvingGridSearchCV, cv=3) ---
SVR best params: {'model__C': 1.0, 'model__epsilon': 0.1, 'model__gamma': 'scale', 'select__k': 'all'}

                 RMSE  RMSE 95% CI Low  RMSE 95% CI High     PCC  \
Model                                                              
Random Forest  0.5905           0.5467            0.6386  0.1918   
SVR (RBF)      0.5907           0.5405            0.6404  0.2131   

               PCC 95% CI Low  PCC 95% CI High  
Model                                           
Random Forest          0.0909           0.2972  
SVR (RBF)              0.1179           0.3153  

--- Simple RF+SVR Avg Ensemble (FAST) ---
                   RMSE

# Pure QNN

In [5]:
# pure_qnn_strong_v2.py  — same pure QNN, but with runtime guards + more frequent logging
import os, time, warnings
import numpy as np
import pandas as pd
# Suppress every warning category from here on (applies process-wide).
warnings.filterwarnings("ignore")

# Default the native thread-pool environment variables to one thread each.
# setdefault() only writes a value when the variable is not already set, so a
# value exported by the user beforehand is left untouched.
# NOTE(review): presumably these need to be in place before the numeric
# libraries below are imported in order to take effect — confirm.
os.environ.setdefault("OMP_NUM_THREADS", "1")
os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
os.environ.setdefault("MKL_NUM_THREADS", "1")
os.environ.setdefault("NUMEXPR_NUM_THREADS", "1")

import pennylane as qml
from pennylane import numpy as pnp

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import pearsonr, spearmanr

# ---------- utils ----------
def set_seeds(seed=42):
    """Seed the global NumPy RNG and PennyLane's wrapped NumPy RNG with ``seed``."""
    for rng_namespace in (np.random, pnp.random):
        rng_namespace.seed(seed)

def rmse(y_true, y_pred):
    """Return the root-mean-squared error of ``y_pred`` against ``y_true`` as a float."""
    squared_error = mean_squared_error(y_true, y_pred)
    return float(np.sqrt(squared_error))

def safe_pcc(a, b):
    """Return the Pearson correlation of ``a`` and ``b``, or 0.0 when undefined.

    Both inputs are flattened to 1-D first. The fallback value 0.0 is returned
    when the correlation cannot be computed meaningfully:

    * both inputs hold fewer than two samples (including empty input),
    * either input has zero standard deviation,
    * ``pearsonr`` yields a non-finite value (e.g. NaNs in the data).

    Inputs of differing lengths are still passed on to ``pearsonr`` and any
    error it raises propagates to the caller.
    """
    a = np.asarray(a).ravel()
    b = np.asarray(b).ravel()

    # pearsonr rejects inputs with fewer than two samples; treat that case like
    # the other degenerate ones instead of raising.
    if max(a.size, b.size) < 2:
        return 0.0
    if a.std() == 0 or b.std() == 0:
        return 0.0

    v = pearsonr(a, b)[0]
    return float(v) if np.isfinite(v) else 0.0

def load_als(path="final_processed_als_data.csv"):
    """Read the engineered ALS table and split it into features and target.

    The ``alsfrs_slope`` column is the target; it and ``subject_id`` (when
    present) are removed from the feature frame. Rows whose target is NaN are
    dropped and the feature index is renumbered from zero. A short summary of
    the shapes and target distribution is printed.

    Returns:
        ``(X, y)`` where ``X`` is a DataFrame and ``y`` a float NumPy array.
    """
    table = pd.read_csv(path)
    target = table["alsfrs_slope"].astype(float).values
    features = table.drop(columns=["subject_id", "alsfrs_slope"], errors="ignore")

    has_target = ~np.isnan(target)
    X = features.loc[has_target].reset_index(drop=True)
    y = target[has_target]

    print(f"✓ Loaded: X={X.shape}, y={y.shape}")
    print(f"  Target: mean={y.mean():.3f}, std={y.std():.3f}, range=[{y.min():.3f},{y.max():.3f}]")
    return X, y

def sanitize_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` in which every text-like column is numeric.

    A column is treated as text-like when it has ``object`` dtype or one of
    pandas' string dtypes (checking only ``dtype == object`` would miss the
    dedicated string dtypes). Each such column is first parsed with
    ``pd.to_numeric``; if parsing fails, the values are cast to ``str`` and
    replaced by integer category codes from ``pd.factorize``. Because of the
    string cast, missing values in a factorized column become their own
    category rather than staying missing.

    All other columns are returned unchanged, and the input frame is not
    modified.
    """
    out = df.copy()
    for c in out.columns:
        dtype = out[c].dtype
        is_text_like = (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        )
        if not is_text_like:
            continue
        try:
            out[c] = pd.to_numeric(out[c])
        except (ValueError, TypeError):
            # Not parseable as numbers: fall back to integer category codes.
            out[c] = pd.factorize(out[c].astype(str))[0]
    return out

# ---------- feature selection ----------
def improved_feature_selection(X_df, y, k=12, seed=42):
    """Pick the ``k`` highest-scoring columns of ``X_df`` for predicting ``y``.

    Missing values are median-imputed, then every column receives three
    scores: random-forest feature importance, absolute Pearson correlation
    with ``y`` and absolute Spearman correlation with ``y``. Each score vector
    is min-max normalised and blended with weights 0.45 / 0.35 / 0.20; the
    columns with the largest blended score win.

    Args:
        X_df: Feature DataFrame (numeric columns, NaNs allowed).
        y: Target values aligned with the rows of ``X_df``.
        k: Number of columns to keep (fewer are returned if fewer exist).
        seed: ``random_state`` of the random forest.

    Returns:
        ``(idx, cols)``: positional indices into ``X_df.columns`` and the
        matching column names, best first.

    Raises:
        RuntimeError: If the columns surviving imputation cannot be mapped
            back to positions in ``X_df``.
    """
    imp = SimpleImputer(strategy="median")
    Xn = imp.fit_transform(X_df)

    # The imputer discards columns that have no observed value at all, so
    # column j of Xn is not necessarily column j of X_df. Recover the original
    # position of every surviving column so the returned indices and names
    # refer to X_df rather than to the imputed matrix.
    n_input_cols = X_df.shape[1]
    if Xn.shape[1] == n_input_cols:
        kept = np.arange(n_input_cols)
    else:
        kept = np.flatnonzero(~np.isnan(np.asarray(imp.statistics_, dtype=float)))
        if kept.size != Xn.shape[1]:
            raise RuntimeError(
                "Could not map imputed columns back to the input columns "
                f"({kept.size} candidates for {Xn.shape[1]} imputed columns)."
            )

    rf = RandomForestRegressor(
        n_estimators=400, max_features="sqrt", min_samples_leaf=3,
        random_state=seed, n_jobs=1
    ).fit(Xn, y)
    rf_scores = rf.feature_importances_

    # Constant columns get a score of 0 instead of an undefined correlation.
    varying = Xn.std(axis=0) > 0
    pearson_scores = np.array([
        abs(np.corrcoef(Xn[:, i], y)[0, 1]) if varying[i] else 0.0
        for i in range(Xn.shape[1])
    ])
    spearman_scores = np.array([
        abs(spearmanr(Xn[:, i], y)[0]) if varying[i] else 0.0
        for i in range(Xn.shape[1])
    ])
    # A NaN score (e.g. constant target) would otherwise poison the ranking.
    pearson_scores = np.nan_to_num(pearson_scores)
    spearman_scores = np.nan_to_num(spearman_scores)

    def norm(v):
        vmin, vmax = v.min(), v.max()
        return (v - vmin)/(vmax - vmin + 1e-12) if vmax > vmin else np.zeros_like(v)

    fin = 0.45*norm(rf_scores) + 0.35*norm(pearson_scores) + 0.20*norm(spearman_scores)
    top_imputed = np.argsort(fin)[::-1][:k]
    idx = kept[top_imputed]
    cols = [X_df.columns[i] for i in idx]
    print(f"✓ Selected top-{k}: {cols}")
    return idx.tolist(), cols

# ---------- angles (PLS→[-pi/3,pi/3]) ----------
def make_angles_pls(X_train, X_val, X_test, y_train, n_qubits):
    """Turn selected features into ``n_qubits`` rotation angles per sample.

    Pipeline (every transformer is fitted on the training split only):
    median imputation -> robust scaling on the 10-90 quantile range ->
    PLS projection onto ``n_qubits`` components supervised by ``y_train`` ->
    min-max scaling of the training scores to ``[-pi/3, pi/3]``.
    Validation/test values are transformed with the training fit and are not
    clipped, so they may fall outside that interval.

    Returns:
        ``(Xtr_q, Xva_q, Xte_q)`` as float64 arrays.
    """
    imputer = SimpleImputer(strategy="median")
    robust = RobustScaler(quantile_range=(10,90))
    projector = PLSRegression(n_components=n_qubits, scale=False)
    angle_scaler = MinMaxScaler(feature_range=(-np.pi/3, np.pi/3))

    def as_f64(arr):
        return arr.astype(np.float64)

    # Stage 1: impute + robust-scale (fit on train, reuse for val/test).
    train_scaled = as_f64(robust.fit_transform(imputer.fit_transform(X_train)))
    val_scaled, test_scaled = (
        as_f64(robust.transform(imputer.transform(part)))
        for part in (X_val, X_test)
    )

    # Stage 2: supervised projection to one latent score per qubit.
    projector.fit(train_scaled, y_train.reshape(-1,1))
    train_lat, val_lat, test_lat = (
        as_f64(projector.transform(part))
        for part in (train_scaled, val_scaled, test_scaled)
    )

    # Stage 3: map latent scores to rotation angles.
    Xtr_q = as_f64(angle_scaler.fit_transform(train_lat))
    Xva_q = as_f64(angle_scaler.transform(val_lat))
    Xte_q = as_f64(angle_scaler.transform(test_lat))
    return Xtr_q, Xva_q, Xte_q

# ---------- pure QNN with multi-observable readout ----------
def build_qnode(n_qubits=6, n_layers=8, device_name="lightning.qubit", use_xx=True):
    """Create the variational circuit QNode and report its readout size.

    The device named ``device_name`` is requested first; if creating it raises,
    ``default.qubit`` is used instead. The circuit measures the expectation of
    Z on every wire, ZZ on every ring-neighbour pair and, when ``use_xx`` is
    true, XX on the same pairs.

    Circuit layout for ``circuit(x, W, alpha, beta, s, layer_mask)``:
      1. RY(alpha[i]*x[i] + beta[i]) on each wire.
      2. For every layer l, with m = layer_mask[l]:
         RX/RY/RZ by m*W[l, i, 0..2] on each wire, a ring of CNOTs, then a
         re-upload RY(m*s*(alpha[i]*x[i] + beta[i])) on each wire.

    Returns:
        ``(circuit, n_obs)`` where ``n_obs`` is the number of measured
        observables.
    """
    try:
        dev = qml.device(device_name, wires=n_qubits)
    except Exception:
        dev = qml.device("default.qubit", wires=n_qubits)

    wires = range(n_qubits)
    neighbour = [(w, (w+1)%n_qubits) for w in wires]

    observables = [qml.PauliZ(w) for w in wires]
    observables += [qml.PauliZ(a) @ qml.PauliZ(b) for a, b in neighbour]
    if use_xx:
        observables += [qml.PauliX(a) @ qml.PauliX(b) for a, b in neighbour]
    n_obs = len(observables)

    # CNOT chain 0->1->...->n-1 followed by the closing link n-1->0.
    cnot_ring = [(w, w+1) for w in range(n_qubits-1)] + [(n_qubits-1, 0)]

    @qml.qnode(dev, interface="autograd", diff_method="adjoint")
    def circuit(x, W, alpha, beta, s, layer_mask):
        # Initial data encoding.
        for w in wires:
            qml.RY(alpha[w]*x[w] + beta[w], wires=w)

        for layer in range(n_layers):
            m = layer_mask[layer]
            # Trainable single-qubit rotations, scaled by the layer mask.
            for w in wires:
                qml.RX(m*W[layer,w,0], wires=w)
                qml.RY(m*W[layer,w,1], wires=w)
                qml.RZ(m*W[layer,w,2], wires=w)
            # Entangling ring.
            for control, target in cnot_ring:
                qml.CNOT(wires=[control, target])
            # Data re-upload, scaled by the mask and the trainable factor s.
            for w in wires:
                qml.RY(m*s*(alpha[w]*x[w] + beta[w]), wires=w)

        return [qml.expval(o) for o in observables]

    return circuit, n_obs

# ---------- training ----------
def train_pure_qnn(
    X_raw, y,
    topk=12,
    n_qubits=6,
    n_layers=8,
    steps=140,
    batch_size=48,
    stepsize=0.012,
    early_patience=18,
    n_restarts=3,
    bag_top=2,
    seed=42,
    use_pls=True,
    eval_every=5,
    max_batches_per_epoch=10,   # <-- cap work per epoch to avoid "silent hours"
    use_xx=True
):
    """Train a bagged pure-QNN regressor and report held-out test metrics.

    Workflow:
      1. Split 80/20 into train/test (``random_state=42``), then split the
         training part 85/15 into train/validation (``random_state=123``).
      2. Select ``topk`` features on the training part only.
      3. Convert features to rotation angles, either through the PLS pipeline
         (``use_pls=True``) or by direct impute -> robust-scale -> min-max.
      4. Normalise the target to ``[-1, 1]`` with the training min/max.
      5. For each of ``n_restarts`` random initialisations, run Adam for up to
         ``steps`` epochs (at most ``max_batches_per_epoch`` mini-batches of
         ``batch_size`` each). The step size follows a cosine decay floored at
         25% of ``stepsize`` and the per-layer mask ramps up over the epochs.
         Validation MSE is evaluated at epoch 1 and every ``eval_every``
         epochs; training stops after ``early_patience`` consecutive
         evaluations without improvement.
      6. Average the de-normalised test predictions of the ``bag_top`` restarts
         with the lowest validation MSE.

    Each checkpoint stores the layer mask it was validated with, and the same
    mask is used for the test prediction, so the circuit scored on the test
    set is the one that earned the validation score.

    Args:
        X_raw: Feature DataFrame.
        y: Target as a NumPy array aligned with ``X_raw``.
        seed: Base seed; restart ``r`` uses ``seed + 100*r``.
        use_xx: Whether to add XX observables to the readout.
        (remaining arguments are described in the workflow above)

    Returns:
        ``dict(rmse=..., pcc=..., r2=...)`` measured on the test split.

    Raises:
        RuntimeError: If no restart produced a usable checkpoint (for example
            ``n_restarts`` or ``steps`` is 0, or every validation MSE was NaN).
    """
    set_seeds(seed)
    X_tr_full, X_te_full, y_tr_full, y_te = train_test_split(X_raw, y, test_size=0.20, random_state=42)
    print(f"\n✓ Split: train={len(y_tr_full)}, test={len(y_te)}")

    X_tr_raw, X_va_raw, y_tr, y_va = train_test_split(X_tr_full, y_tr_full, test_size=0.15, random_state=123)
    print(f"✓ Train/Val split: train={len(y_tr)}, val={len(y_va)}")

    feat_idx, feat_cols = improved_feature_selection(pd.DataFrame(X_tr_raw, columns=X_raw.columns), y_tr, k=topk, seed=seed)

    def select_columns(df_like):
        df_like = pd.DataFrame(df_like, columns=X_raw.columns)
        return df_like.iloc[:, feat_idx].values

    Xtr_sel, Xva_sel, Xte_sel = select_columns(X_tr_raw), select_columns(X_va_raw), select_columns(X_te_full)

    print("\n✓ Preparing quantum angles...")
    if use_pls:
        Xtr_q, Xva_q, Xte_q = make_angles_pls(Xtr_sel, Xva_sel, Xte_sel, y_tr, n_qubits)
        print(f"  PLS→{n_qubits} angles from top-{topk} features.")
    else:
        # quantile_range is keyword-only on RobustScaler, so it must be named.
        # NOTE: on this path each sample keeps all selected features; the
        # circuit only reads the first n_qubits entries of x.
        imp = SimpleImputer(strategy="median")
        robust = RobustScaler(quantile_range=(10, 90))
        mm = MinMaxScaler((-np.pi/3, np.pi/3))
        Xtr_q = mm.fit_transform(robust.fit_transform(imp.fit_transform(Xtr_sel))).astype(np.float64)
        Xva_q = mm.transform(robust.transform(imp.transform(Xva_sel))).astype(np.float64)
        Xte_q = mm.transform(robust.transform(imp.transform(Xte_sel))).astype(np.float64)

    # Target normalisation to [-1, 1], using training statistics only.
    y_min, y_max = y_tr.min(), y_tr.max()

    def y_to_norm(v):
        return 2.0*(v - y_min)/(y_max - y_min + 1e-12) - 1.0

    def y_from_norm(vn):
        return 0.5*(vn + 1.0)*(y_max - y_min) + y_min

    y_tr_n, y_va_n = y_to_norm(y_tr), y_to_norm(y_va)

    circuit, n_obs = build_qnode(n_qubits=n_qubits, n_layers=n_layers, use_xx=use_xx)

    Xtr_arr = pnp.array(Xtr_q, requires_grad=False)
    Xva_arr = pnp.array(Xva_q, requires_grad=False)
    Xte_arr = pnp.array(Xte_q, requires_grad=False)
    ytr_arr = pnp.array(y_tr_n, requires_grad=False)
    yva_arr = pnp.array(y_va_n, requires_grad=False)

    def batch_iter(XA, yA, bs, max_batches=None):
        """Yield shuffled mini-batches, stopping after ``max_batches`` if given."""
        n = len(yA)
        idx = np.arange(n)
        pnp.random.shuffle(idx)
        count = 0
        for i in range(0, n, bs):
            if max_batches is not None and count >= max_batches:
                break
            rows = idx[i:i+bs]
            count += 1
            yield XA[rows], yA[rows]

    def layer_mask_schedule(ep, total):
        """Per-layer scaling in [0, 1]; grows with ``ep`` and with layer depth."""
        t = ep/float(total)
        base = 0.15 + 0.85*0.5*(1.0 - np.cos(np.pi*t))
        m = np.linspace(0.25, 1.0, n_layers) * base
        return pnp.array(np.clip(m, 0.0, 1.0), requires_grad=False)

    # Model helpers live outside the restart loop because the final test
    # prediction below needs pred_batch as well.
    def pred_batch(XB, W, alpha, beta, s, rw, rb, lmask):
        """Linear readout ``rw . <observables> + rb`` for every sample in XB."""
        preds = []
        for x in XB:
            o = pnp.stack(circuit(x, W, alpha, beta, s, lmask))
            preds.append(pnp.dot(rw, o) + rb)
        return pnp.stack(preds)

    def batch_loss(W, alpha, beta, s, rw, rb, XB, yB, lmask, l2=8e-5):
        """Mini-batch MSE plus an L2 penalty on W and the readout weights."""
        pr = pred_batch(XB, W, alpha, beta, s, rw, rb, lmask)
        return pnp.mean((pr - yB)**2) + l2*(pnp.sum(W**2) + pnp.sum(rw**2))

    def val_mse(W, alpha, beta, s, rw, rb, lmask):
        pv = pred_batch(Xva_arr, W, alpha, beta, s, rw, rb, lmask)
        return pnp.mean((pv - yva_arr)**2)

    restart_stats, restart_params = [], []
    print(f"\n  Training Pure QNN with {n_restarts} restarts...")
    for r in range(n_restarts):
        set_seeds(seed + 100*r)
        W     = 0.05 * pnp.array(np.random.randn(n_layers, n_qubits, 3), requires_grad=True)
        alpha = 1.00 * pnp.ones((n_qubits,), requires_grad=True)
        beta  = pnp.zeros((n_qubits,), requires_grad=True)
        s     = pnp.array(0.30, requires_grad=True)
        rw    = 0.05 * pnp.array(np.random.randn(n_obs), requires_grad=True)
        rb    = pnp.array(0.0, requires_grad=True)

        opt = qml.AdamOptimizer(stepsize=stepsize)

        best_val, bad, best_pack = 1e9, 0, None
        t_start = time.time()
        for ep in range(1, steps+1):
            lmask = layer_mask_schedule(ep, steps)
            for XB, yB in batch_iter(Xtr_arr, ytr_arr, batch_size, max_batches=max_batches_per_epoch):
                W, alpha, beta, s, rw, rb = opt.step(
                    lambda W_,a_,b_,s_,rw_,rb_: batch_loss(W_,a_,b_,s_,rw_,rb_, XB, yB, lmask),
                    W, alpha, beta, s, rw, rb
                )
            # cosine LR, floored at 25% of the base step size
            opt.stepsize = float(max(0.25, 0.5*(1+np.cos(ep*np.pi/max(1,steps)))) * stepsize)

            if ep % eval_every == 0 or ep == 1:
                mse_v = float(val_mse(W, alpha, beta, s, rw, rb, lmask))
                elapsed = time.time() - t_start
                print(f"[restart {r+1}/{n_restarts} | ep {ep:03d}] val_MSE={mse_v:.5f}  lr={opt.stepsize:.4f}  t={elapsed/60:.1f}m")
                t_start = time.time()  # the printed time is per logging interval
                if mse_v + 1e-9 < best_val:
                    best_val, bad = mse_v, 0
                    # Keep the mask with the weights: the validation score is
                    # only meaningful for this exact (parameters, mask) pair.
                    best_pack = (pnp.array(W), pnp.array(alpha), pnp.array(beta),
                                 pnp.array(s), pnp.array(rw), pnp.array(rb), lmask)
                else:
                    bad += 1
                    if bad >= early_patience:
                        print("  early stop (patience)\n")
                        break

        restart_stats.append(best_val); restart_params.append(best_pack)

    # Rank restarts by validation MSE, ignoring any that never checkpointed.
    order = np.argsort(restart_stats)
    usable = np.array([i for i in order if restart_params[i] is not None], dtype=int)
    if usable.size == 0:
        raise RuntimeError(
            "No restart produced a usable checkpoint "
            "(check n_restarts/steps and for NaN validation losses)."
        )
    keep = usable[:max(1, min(bag_top, len(usable)))]
    print(f"Selected restarts for bagging: {list((keep+1).tolist())}")

    preds_all = []
    for idx in keep:
        W, alpha, beta, s, rw, rb, lmask = restart_params[idx]
        pr_n = np.array(pred_batch(Xte_arr, W, alpha, beta, s, rw, rb, lmask))
        preds_all.append(y_from_norm(pr_n))
    y_te_pred = np.mean(preds_all, axis=0)

    te_rmse = rmse(y_te, y_te_pred); te_pcc = safe_pcc(y_te, y_te_pred); te_r2 = r2_score(y_te, y_te_pred)
    print("\n===== FINAL TEST (Pure QNN) =====")
    print(f"RMSE={te_rmse:.4f}  PCC={te_pcc:.4f}  R²={te_r2:.4f}")
    return dict(rmse=te_rmse, pcc=te_pcc, r2=te_r2)

# ----- runner -----
if __name__ == "__main__":
    t0 = time.time()

    # Load the engineered table and make every column numeric.
    X_raw, y = load_als("final_processed_als_data.csv")
    X_raw = sanitize_features(X_raw)

    # Hyper-parameters for this run, collected in one place.
    qnn_settings = dict(
        topk=12,
        n_qubits=6,                # 6–7 works well on CPU
        n_layers=8,                # slightly shallower = faster, still expressive
        steps=140,
        batch_size=48,
        stepsize=0.012,
        early_patience=18,
        n_restarts=3,
        bag_top=2,
        seed=42,
        use_pls=True,
        eval_every=5,              # more frequent feedback so it never feels “stuck”
        max_batches_per_epoch=10,  # cap work per epoch
        use_xx=True,
    )
    out = train_pure_qnn(X_raw, y, **qnn_settings)

    print(f"\n✓ Total time: {time.time()-t0:.1f}s")


✓ Loaded: X=(1897, 346), y=(1897,)
  Target: mean=-0.667, std=0.572, range=[-3.628,1.208]

✓ Split: train=1517, test=380
✓ Train/Val split: train=1289, val=228
✓ Selected top-12: ['alsfrs_ALSFRS_Total_std', 'fvc_FVC_Liters_slope', 'alsfrs_ALSFRS_Total_slope', 'alsfrs_Q1_Speech_min', 'alsfrs_Q1_Speech_last', 'fvc_FVC_Liters_std', 'alsfrs_Q1_Speech_median', 'alsfrs_Q3_Swallowing_min', 'alsfrs_Q1_Speech_max', 'alsfrs_Q3_Swallowing_std', 'vitals_Pulse_median', 'labs_Phosphorus_median']

✓ Preparing quantum angles...
  PLS→6 angles from top-12 features.

  Training Pure QNN with 3 restarts...
[restart 1/3 | ep 001] val_MSE=0.09790  lr=0.0120  t=1.2m
[restart 1/3 | ep 005] val_MSE=0.09326  lr=0.0120  t=3.3m
[restart 1/3 | ep 010] val_MSE=0.09031  lr=0.0118  t=4.1m
[restart 1/3 | ep 015] val_MSE=0.08945  lr=0.0117  t=4.0m
[restart 1/3 | ep 020] val_MSE=0.09065  lr=0.0114  t=4.1m
[restart 1/3 | ep 025] val_MSE=0.09234  lr=0.0111  t=4.3m
[restart 1/3 | ep 030] val_MSE=0.09150  lr=0.0107  t=4.1m