<a href="https://colab.research.google.com/github/tousifo/ml_notebooks/blob/main/ALS_QNN_PRO_ACT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
%pip install qiskit~=1.0 qiskit-machine-learning~=0.8.1 qiskit_algorithms

# Qiskit Imports
from qiskit.circuit.library import ZZFeatureMap, RealAmplitudes
from qiskit_algorithms.optimizers import COBYLA
from qiskit_machine_learning.algorithms.regressors import VQR
from qiskit.primitives import Sampler

Collecting qiskit~=1.0
  Downloading qiskit-1.4.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting qiskit-machine-learning~=0.8.1
  Downloading qiskit_machine_learning-0.8.4-py3-none-any.whl.metadata (13 kB)
Collecting qiskit_algorithms
  Downloading qiskit_algorithms-0.4.0-py3-none-any.whl.metadata (4.7 kB)
Collecting rustworkx>=0.15.0 (from qiskit~=1.0)
  Downloading rustworkx-0.17.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting stevedore>=3.0.0 (from qiskit~=1.0)
  Downloading stevedore-5.5.0-py3-none-any.whl.metadata (2.2 kB)
Collecting symengine<0.14,>=0.11 (from qiskit~=1.0)
  Downloading symengine-0.13.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting scipy>=1.5 (from qiskit~=1.0)
  Downloading scipy-1.15.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31

In [None]:
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
warnings.filterwarnings('ignore')
class ALSDataProcessor:
    """
    A robust class to load, clean, and process PRO-ACT data for predicting ALSFRS slope,
    replicating the methodology from the "Deep learning methods to predict amyotrophic
    lateral sclerosis disease progression" paper.
    """
    def __init__(self):
        self.label_encoders = {}
        # A list of columns to exclude from feature engineering
        self.id_and_delta_cols = [
            'subject_id', 'alsfrs_delta', 'fvc_delta', 'vitals_delta',
            'labs_delta', 'grip_delta', 'muscle_delta', 'onset_delta',
            'death_delta', 'history_delta'
        ]
    def _convert_alsfrs_r(self, alsfrs_df):
        """Convert ALSFRS-R questions to the original ALSFRS format."""
        df = alsfrs_df.copy()
        # Ensure ALSFRS_Total is numeric, coercing errors
        df['ALSFRS_Total'] = pd.to_numeric(df['ALSFRS_Total'], errors='coerce')
        return df
    def load_and_inspect_data(self, file_path=''):
        """Load all datasets and inspect their structure."""
        datasets = {}
        file_list = [
            'PROACT_ALSFRS.csv', 'PROACT_FVC.csv', 'PROACT_VITALSIGNS.csv',
            'PROACT_RILUZOLE.csv', 'PROACT_DEMOGRAPHICS.csv', 'PROACT_LABS.csv',
            'PROACT_DEATHDATA.csv', 'PROACT_HANDGRIPSTRENGTH.csv',
            'PROACT_MUSCLESTRENGTH.csv', 'PROACT_ALSHISTORY.csv' # Added missing file
        ]
        print("--- Loading and Inspecting Data ---")
        for file_name in file_list:
            try:
                df = pd.read_csv(file_path + file_name, on_bad_lines='skip')
                # --- CORRECTED RENAMING LOGIC ---
                # Check if 'subject_id' already exists. If not, find a candidate and rename only the first one found.
                if 'subject_id' not in df.columns:
                    potential_id_cols = [col for col in df.columns if 'subject' in col.lower()]
                    if potential_id_cols:
                        df.rename(columns={potential_id_cols[0]: 'subject_id'}, inplace=True)
                # --- END CORRECTION ---
                # Convert delta columns to numeric
                delta_cols = [col for col in df.columns if 'delta' in col.lower()]
                for col in delta_cols:
                    df[col] = pd.to_numeric(df[col], errors='coerce')
                datasets[file_name] = df
                print(f"✓ {file_name}: Loaded successfully with shape {df.shape}")
            except FileNotFoundError:
                print(f"✗ {file_name}: File not found. Will be skipped.")
        return datasets
    def calculate_alsfrs_slope(self, alsfrs_df):
        """Calculate the primary target variable: ALSFRS slope between months 3-12."""
        df = alsfrs_df.copy()
        df.rename(columns={c:'alsfrs_delta' for c in df.columns if 'delta' in c.lower()}, inplace=True)
        df['months'] = df['alsfrs_delta'] / 30.44
        df.sort_values(['subject_id', 'months'], inplace=True)
        slopes = {}
        for subject_id, subject_data in df.groupby('subject_id'):
            t1_candidates = subject_data[(subject_data['months'] > 3) & (subject_data['months'] <= 12)]
            t2_candidates = subject_data[subject_data['months'] >= 12]
            if not t1_candidates.empty and not t2_candidates.empty:
                t1_row = t1_candidates.iloc[0]
                t2_row = t2_candidates.iloc[0]
                t1, alsfrs_t1 = t1_row['months'], t1_row['ALSFRS_Total']
                t2, alsfrs_t2 = t2_row['months'], t2_row['ALSFRS_Total']
                if t2 > t1 and pd.notna(alsfrs_t1) and pd.notna(alsfrs_t2):
                    slope = (alsfrs_t2 - alsfrs_t1) / (t2 - t1)
                    slopes[subject_id] = slope
        return pd.DataFrame(list(slopes.items()), columns=['subject_id', 'alsfrs_slope'])
    def create_longitudinal_features(self, df, time_col, prefix):
        """Create the seven summary statistics from longitudinal data (first 3 months)."""
        df_sorted = df.sort_values(['subject_id', time_col])
        # Convert potential value columns to numeric
        potential_value_cols = [col for col in df_sorted.columns if col not in ['subject_id', time_col]]
        for col in potential_value_cols:
            df_sorted[col] = pd.to_numeric(df_sorted[col], errors='coerce')
        df_filtered = df_sorted[df_sorted[time_col] <= 90].copy()
        value_cols = [col for col in df_filtered.select_dtypes(include=np.number).columns
                      if col.lower() not in self.id_and_delta_cols]
        if not value_cols:
            return pd.DataFrame()
        summary_dfs = []
        for value_col in value_cols:
            grouped = df_filtered.groupby('subject_id')
            summary = grouped[value_col].agg(['min', 'max', 'median', 'first', 'last']).join(
                grouped[value_col].std(ddof=0).rename('std')
            )
            # Ensure there are at least two data points for slope calculation
            slope_df = grouped.apply(
                lambda g: (g[value_col].iloc[-1] - g[value_col].iloc[0]) / (g[time_col].iloc[-1] - g[time_col].iloc[0])
                if len(g) > 1 and (g[time_col].iloc[-1] - g[time_col].iloc[0]) > 0 else np.nan
            ).rename('slope')
            summary = summary.join(slope_df).fillna(0) # Fill NaN slopes with 0
            summary.columns = [f"{prefix}{value_col}_{stat}" for stat in summary.columns]
            summary_dfs.append(summary)
        return pd.concat(summary_dfs, axis=1).reset_index()
    def process_static_data(self, df):
        """Process static data files (like demographics, riluzole)."""
        processed = df.copy()
        for col in processed.select_dtypes(include=['object', 'category']).columns:
            if col != 'subject_id':
                le = self.label_encoders.setdefault(col, LabelEncoder())
                processed[col] = le.fit_transform(processed[col].astype(str))
        return processed.drop_duplicates(subset=['subject_id'])
    def merge_all_features(self, datasets):
        """Merge all static and longitudinal features into a single dataframe."""
        if 'PROACT_DEMOGRAPHICS.csv' not in datasets:
            raise ValueError("Demographics file is missing.")
        final_df = self.process_static_data(datasets['PROACT_DEMOGRAPHICS.csv'])
        static_files = ['PROACT_RILUZOLE.csv', 'PROACT_ALSHISTORY.csv'] # Added ALSHISTORY
        for file in static_files:
            if file in datasets:
                static_df = self.process_static_data(datasets[file])
                final_df = pd.merge(final_df, static_df, on='subject_id', how='left')
        longitudinal_configs = {
            'PROACT_ALSFRS.csv': 'alsfrs_',
            'PROACT_FVC.csv': 'fvc_',
            'PROACT_VITALSIGNS.csv': 'vitals_',
            'PROACT_LABS.csv': 'labs_',
            'PROACT_HANDGRIPSTRENGTH.csv': 'grip_',
            'PROACT_MUSCLESTRENGTH.csv': 'muscle_'
        }
        print("\n--- Generating Longitudinal Features (from first 3 months) ---")
        for file, prefix in longitudinal_configs.items():
            if file in datasets:
                df = datasets[file].copy()
                time_col_actual = next((c for c in df.columns if 'delta' in c.lower()), None)
                if not time_col_actual:
                    print(f"Warning: No time delta column found in {file}. Skipping.")
                    continue
                print(f"Processing {file}...")
                # Pivot long format files if necessary
                if file in ['PROACT_LABS.csv', 'PROACT_MUSCLESTRENGTH.csv', 'PROACT_HANDGRIPSTRENGTH.csv']:
                    try:
                        test_cols = [c for c in df.columns if any(keyword in c.lower() for keyword in ['test', 'exam', 'muscle', 'site', 'name', 'strength_test']) and c not in ['subject_id', time_col_actual]]
                        if test_cols:
                            test_col = test_cols[0]
                            value_cols = [c for c in df.columns if any(keyword in c.lower() for keyword in ['result', 'value', 'strength', 'score']) and c not in ['subject_id', time_col_actual]]
                            if value_cols:
                                value_col = value_cols[0]
                                df[value_col] = pd.to_numeric(df[value_col], errors='coerce')
                                df = df.pivot_table(index=['subject_id', time_col_actual], columns=test_col, values=value_col, aggfunc='mean').reset_index()
                    except Exception as e:
                        print(f"Warning: Pivoting failed for {file}: {e}")
                summary_features = self.create_longitudinal_features(df, time_col_actual, prefix)
                if not summary_features.empty:
                    final_df = pd.merge(final_df, summary_features, on='subject_id', how='left')
        return final_df
    def filter_eligible_patients(self, feature_df, alsfrs_df):
        """Filter for patients meeting the paper's criteria."""
        df = alsfrs_df.copy()
        df.rename(columns={c:'alsfrs_delta' for c in df.columns if 'delta' in c.lower()}, inplace=True)
        df['months'] = df['alsfrs_delta'] / 30.44
        eligibility = df.groupby('subject_id')['months'].agg(['min', 'max'])
        eligible_ids = eligibility[(eligibility['min'] <= 3) & (eligibility['max'] >= 12)].index
        print(f"\nFound {len(eligible_ids)} eligible patients out of {df['subject_id'].nunique()}.")
        return feature_df[feature_df['subject_id'].isin(eligible_ids)]
    def run_pipeline(self, file_path=''):
        """Execute the complete data preprocessing pipeline."""
        print("====== Starting ALS Data Preprocessing Pipeline ======")
        datasets = self.load_and_inspect_data(file_path)
        if 'PROACT_ALSFRS.csv' not in datasets:
            print("CRITICAL ERROR: PROACT_ALSFRS.csv not found. Aborting.")
            return None
        datasets['PROACT_ALSFRS.csv'] = self._convert_alsfrs_r(datasets['PROACT_ALSFRS.csv'])
        target_df = self.calculate_alsfrs_slope(datasets['PROACT_ALSFRS.csv'])
        print(f"\nCalculated ALSFRS slope for {len(target_df)} patients.")
        full_features = self.merge_all_features(datasets)
        eligible_features = self.filter_eligible_patients(full_features, datasets['PROACT_ALSFRS.csv'])
        final_df = pd.merge(eligible_features, target_df, on='subject_id', how='inner')
        print("\n--- Handling Missing Values ---")
        missing_thresh = 0.30
        initial_cols = len(final_df.columns)
        max_missing = len(final_df) * (1 - missing_thresh)
        final_df.dropna(axis=1, thresh=max_missing, inplace=True)
        print(f"Dropped {initial_cols - len(final_df.columns)} features with >{missing_thresh*100}% missing values.")
        X = final_df.drop(columns=['subject_id', 'alsfrs_slope'])
        y = final_df['alsfrs_slope']
        valid_y_mask = y.notna()
        X = X[valid_y_mask]
        y = y[valid_y_mask]
        subject_ids = final_df.loc[valid_y_mask, 'subject_id']
        imputer = SimpleImputer(strategy='median')
        X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
        print("\n--- Performing Feature Selection (Top 30 via Random Forest) ---")
        rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
        rf.fit(X_imputed, y)
        importance_df = pd.DataFrame({
            'feature': X.columns,
            'importance': rf.feature_importances_
        }).sort_values('importance', ascending=False)
        selected_features = importance_df['feature'].head(30).tolist()
        X_selected = X_imputed[selected_features]
        print("\n====== Pipeline Complete ======")
        print(f"Final feature matrix shape: {X_selected.shape}")
        print(f"Final target vector shape: {y.shape}")
        # Save the final data for the next step
        final_output = pd.concat([subject_ids.reset_index(drop=True),
                                  y.reset_index(drop=True),
                                  X_selected.reset_index(drop=True)], axis=1)
        final_output.to_csv("final_processed_als_data.csv", index=False)
        print("\n✅ Successfully saved processed data to 'final_processed_als_data.csv'")
        return {
            'X': X_selected,
            'y': y,
            'subject_ids': subject_ids,
            'feature_importance': importance_df,
        }
if __name__ == "__main__":
    # --- IMPORTANT ---
    # If your CSV files are in a different folder, change this path.
    # For example: file_path = "C:/Users/YourUser/Downloads/PROACT_data/"
    file_path = ""
    processor = ALSDataProcessor()
    processed_data = processor.run_pipeline(file_path=file_path)
    if processed_data:
        print("\n--- Top 15 Most Important Features ---")
        print(processed_data['feature_importance'].head(15))

--- Loading and Inspecting Data ---
✓ PROACT_ALSFRS.csv: Loaded successfully with shape (73845, 20)
✓ PROACT_FVC.csv: Loaded successfully with shape (49110, 10)
✓ PROACT_VITALSIGNS.csv: Loaded successfully with shape (84721, 36)
✓ PROACT_RILUZOLE.csv: Loaded successfully with shape (10363, 3)
✓ PROACT_DEMOGRAPHICS.csv: Loaded successfully with shape (12504, 14)
✓ PROACT_LABS.csv: Loaded successfully with shape (2937162, 5)
✓ PROACT_DEATHDATA.csv: Loaded successfully with shape (5043, 3)
✓ PROACT_HANDGRIPSTRENGTH.csv: Loaded successfully with shape (19032, 11)
✓ PROACT_MUSCLESTRENGTH.csv: Loaded successfully with shape (204875, 10)
✓ PROACT_ALSHISTORY.csv: Loaded successfully with shape (13765, 16)

Calculated ALSFRS slope for 2023 patients.

--- Generating Longitudinal Features (from first 3 months) ---
Processing PROACT_ALSFRS.csv...
Processing PROACT_FVC.csv...
Processing PROACT_VITALSIGNS.csv...
Processing PROACT_LABS.csv...
Processing PROACT_HANDGRIPSTRENGTH.csv...
Processing PROAC

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
import warnings

warnings.filterwarnings('ignore')

def calculate_metrics(y_true, y_pred):
    """Calculates RMSD and PCC."""
    rmsd = np.sqrt(mean_squared_error(y_true, y_pred))
    pcc, _ = pearsonr(y_true, y_pred)
    return rmsd, pcc

def run_classical_pipeline():
    """
    Loads the processed data, trains baseline models, and evaluates their performance.
    """
    print("====== Starting Classical Baseline Model Pipeline ======")

    # --- 1. Load Data ---
    try:
        data = pd.read_csv("final_processed_als_data.csv")
        print(f"✓ Successfully loaded 'final_processed_als_data.csv' with shape {data.shape}")
    except FileNotFoundError:
        print("✗ ERROR: 'final_processed_als_data.csv' not found. Please run the preprocessing script first.")
        return

    # --- 2. Prepare Data ---
    X = data.drop(columns=['subject_id', 'alsfrs_slope'])
    y = data['alsfrs_slope']

    # 80/20 Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"Data split into training ({X_train.shape[0]} samples) and testing ({X_test.shape[0]} samples).")

    # Scale data for SVR
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # --- 3. Train and Evaluate Models ---
    results = {}

    # Model 1: Random Forest Regressor
    print("\n--- Training Random Forest Regressor ---")
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    rf_model.fit(X_train, y_train)
    rf_preds = rf_model.predict(X_test)
    rf_rmsd, rf_pcc = calculate_metrics(y_test, rf_preds)
    results['Random Forest'] = {'RMSD': rf_rmsd, 'PCC': rf_pcc}
    print("✓ Training and evaluation complete.")

    # Model 2: Support Vector Regressor
    print("\n--- Training Support Vector Regressor (SVR) ---")
    svr_model = SVR(kernel='rbf', C=1.0, epsilon=0.1)
    svr_model.fit(X_train_scaled, y_train)
    svr_preds = svr_model.predict(X_test_scaled)
    svr_rmsd, svr_pcc = calculate_metrics(y_test, svr_preds)
    results['Support Vector Regressor'] = {'RMSD': svr_rmsd, 'PCC': svr_pcc}
    print("✓ Training and evaluation complete.")

    # --- 4. Display Results ---
    print("\n====== Classical Model Performance ======")
    results_df = pd.DataFrame(results).T
    print(results_df)
    print("\nReminder:")
    print("  - RMSD (Root Mean Squared Deviation): Lower is better.")
    print("  - PCC (Pearson Correlation Coefficient): Higher is better (closer to 1.0).")

    return results_df

if __name__ == "__main__":
    run_classical_pipeline()

✓ Successfully loaded 'final_processed_als_data.csv' with shape (2022, 32)
Data split into training (1617 samples) and testing (405 samples).

--- Training Random Forest Regressor ---
✓ Training and evaluation complete.

--- Training Support Vector Regressor (SVR) ---
✓ Training and evaluation complete.

                              RMSD       PCC
Random Forest             0.560428  0.266818
Support Vector Regressor  0.574893  0.246569

Reminder:
  - RMSD (Root Mean Squared Deviation): Lower is better.
  - PCC (Pearson Correlation Coefficient): Higher is better (closer to 1.0).


In [14]:
import os, time, numpy as np, pandas as pd
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings("ignore")

# CPU hygiene
for k in ["OMP_NUM_THREADS","OPENBLAS_NUM_THREADS","MKL_NUM_THREADS","NUMEXPR_NUM_THREADS"]:
    os.environ.setdefault(k, "1")
np.random.seed(42)

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.linear_model import RidgeCV, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import pearsonr

# Qiskit
from qiskit.circuit.library import ZZFeatureMap, EfficientSU2
from qiskit.quantum_info import SparsePauliOp
from qiskit.primitives import Estimator
try:
    from qiskit_aer.primitives import Estimator as AerEstimator
    AER_OK = True
except Exception:
    AER_OK = False

# ---------------- utils
def safe_pcc(a,b):
    a,b = np.asarray(a).ravel(), np.asarray(b).ravel()
    if a.std()==0 or b.std()==0: return 0.0
    v = pearsonr(a,b)[0]
    return float(v) if np.isfinite(v) else 0.0

def metrics(y_true, y_pred):
    y_pred = np.asarray(y_pred).ravel()
    rmsd = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    return rmsd, safe_pcc(y_true, y_pred), float(r2_score(y_true, y_pred))

def direction_accuracy(y_true, y_pred, band=0.10):
    def cls(y):
        if y < -band: return 0
        if y >  band: return 2
        return 1
    yt = np.array([cls(v) for v in y_true]); yp = np.array([cls(v) for v in y_pred])
    return float((yt==yp).mean())

def load_xy(path="final_processed_als_data.csv"):
    df = pd.read_csv(path)
    X = df.drop(columns=["subject_id","alsfrs_slope"], errors="ignore")
    y = df["alsfrs_slope"].values
    m = ~np.isnan(y)
    X, y = X.loc[m].reset_index(drop=True), y[m]
    print(f"✓ Data loaded: X={X.shape}, y={y.shape}")
    return X, y

def select_topk_features(X_df, y, k=16):
    imp = SimpleImputer(strategy="median")
    Xn = imp.fit_transform(X_df)
    rf = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1).fit(Xn, y)
    rf_rank = rf.feature_importances_
    # mutual information and correlation give tiny gains but cost; we keep RF ranking for speed
    idx = np.argsort(rf_rank)[::-1][:k]
    cols = [X_df.columns[i] for i in idx]
    print(f"✓ Top-{k} features: {cols}")
    return idx, cols

# ---------------- quantum helpers
def make_observables(n_qubits=4, basis="ZX", use_pairs=True):
    obs=[]
    for i in range(n_qubits):
        # Z always
        p=['I']*n_qubits; p[i]='Z'; obs.append(SparsePauliOp.from_list([("".join(p[::-1]),1.0)]))
        if basis in ("ZX","ZXY"):
            p=['I']*n_qubits; p[i]='X'; obs.append(SparsePauliOp.from_list([("".join(p[::-1]),1.0)]))
        if basis=="ZXY":
            p=['I']*n_qubits; p[i]='Y'; obs.append(SparsePauliOp.from_list([("".join(p[::-1]),1.0)]))
    if use_pairs:
        for i in range(n_qubits):
            for j in range(i+1,n_qubits):
                p=['I']*n_qubits; p[i]=p[j]='Z'
                obs.append(SparsePauliOp.from_list([("".join(p[::-1]),1.0)]))
    return obs  # len = singles + (pairs? 6 : 0)

def _idx_from_param_name(name:str)->int:
    if '[' in name and ']' in name: return int(name.split('[')[1].split(']')[0])
    if '_' in name: return int(name.split('_')[-1])
    return int(''.join(ch for ch in name if ch.isdigit()))

def build_random_sink(n_qubits=4, fmap_reps=1, ansatz_reps=2, rng=None):
    rng = np.random.default_rng(None if rng is None else rng)
    fmap = ZZFeatureMap(feature_dimension=n_qubits, reps=fmap_reps)
    ans  = EfficientSU2(num_qubits=n_qubits, reps=ansatz_reps, entanglement="linear")
    circ = fmap.compose(ans)
    # freeze random weights in ansatz
    rand_theta = {p: float(rng.normal(0,0.35)) for p in ans.parameters}
    circ = circ.assign_parameters(rand_theta, inplace=False)
    # ensure we have the feature params (x[0]..x[n-1]) from composed circuit
    feat_params = [p for p in circ.parameters if p.name.startswith("x")]
    feat_params = sorted(feat_params, key=lambda p: _idx_from_param_name(p.name))
    assert len(feat_params)==n_qubits, f"Expected {n_qubits} feature params, got {len(feat_params)}"
    return circ, feat_params

def build_estimator():
    if AER_OK:
        try:
            return AerEstimator()  # statevector default; fast and stable
        except Exception:
            pass
    return Estimator()

def qrf_features_batched(estimator, sinks, observables, X_theta, batch_size=128, desc_prefix=""):
    N = X_theta.shape[0]; T=len(sinks); D=len(observables)
    Z = np.empty((N, T*D), dtype=float)
    for t,(circ,feat_params) in enumerate(sinks):
        col0 = t*D
        for s in tqdm(range(0,N,batch_size), desc=f"{desc_prefix}Sink {t+1}/{T}", leave=False):
            e = min(N, s+batch_size)
            circuits=[]; obs_list=[]
            for i in range(s,e):
                pmap = {feat_params[k]: float(X_theta[i,k]) for k in range(len(feat_params))}
                cb = circ.assign_parameters(pmap, inplace=False)
                circuits.extend([cb]*D)
                obs_list.extend(observables)
            vals = estimator.run(circuits, obs_list).result().values
            Z[s:e, col0:col0+D] = np.array(vals).reshape(e-s, D)
    return Z

# ---------------- main (stacked ensemble)
def run_qrf_stacked(
    data_path="final_processed_als_data.csv",
    n_qubits=4,
    topk_for_proj=16,            # take more classical features to feed PLS
    pls_components=4,
    seeds=(13, 37),              # increase to 3 for ~1h runs
    zx_sinks=4,                  # sinks per seed for ZX
    zxy_sinks=3,                 # sinks per seed for ZXY
    fmap_reps=1, ansatz_reps=2,  # expressive but same-ish depth
    use_pairs=True,
    batch_size=160,              # larger batch → faster
    train_cap=None               # None = use all train rows
):
    t0=time.time()
    # ---------- data
    X, y = load_xy(data_path)
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)

    # ---------- one selection for projection
    idxK, colsK = select_topk_features(X_tr, y_tr, k=topk_for_proj)
    impK, stdK = SimpleImputer(strategy="median"), StandardScaler()
    XtrK = stdK.fit_transform(impK.fit_transform(X_tr.iloc[:, idxK]))
    XteK = stdK.transform(impK.transform(X_te.iloc[:, idxK]))

    # ---------- PLS → angles [0, π]
    pls = PLSRegression(n_components=pls_components, scale=False)
    pls.fit(XtrK, y_tr)
    Xtr_p = pls.transform(XtrK)
    Xte_p = pls.transform(XteK)
    ang = MinMaxScaler(feature_range=(0.0, np.pi))
    Xtr_th = ang.fit_transform(Xtr_p)
    Xte_th = ang.transform(Xte_p)
    assert Xtr_th.shape[1] == n_qubits == pls_components, "Set pls_components==n_qubits"

    # ---------- RF baseline on full features (OOF + full)
    imp_full, std_full = SimpleImputer(strategy="median"), StandardScaler()
    Xtr_full = std_full.fit_transform(imp_full.fit_transform(X_tr))
    Xte_full = std_full.transform(imp_full.transform(X_te))

    # ---------- quantum config: build all sinks once
    est = build_estimator()
    obs_ZX  = make_observables(n_qubits=n_qubits, basis="ZX",  use_pairs=use_pairs)
    obs_ZXY = make_observables(n_qubits=n_qubits, basis="ZXY", use_pairs=use_pairs)

    sink_specs = []  # list of (basis, (circ, feat_params))
    for sd in seeds:
        rng = np.random.default_rng(sd)
        for _ in range(zx_sinks):
            sink_specs.append(("ZX",  build_random_sink(n_qubits, fmap_reps, ansatz_reps, rng)))
        for _ in range(zxy_sinks):
            sink_specs.append(("ZXY", build_random_sink(n_qubits, fmap_reps, ansatz_reps, rng)))

    # optional train cap to speed up quantum feature build
    idx_all = np.arange(X_tr.shape[0])
    if train_cap and len(idx_all) > train_cap:
        idx_use = np.sort(np.random.default_rng(42).choice(idx_all, size=train_cap, replace=False))
    else:
        idx_use = idx_all

    # build Z for training indices we will use AND for full test
    # we’ll store full-length arrays but fill only idx_use rows for train
    D_per = (len(obs_ZX), len(obs_ZXY))
    total_D = zx_sinks*len(seeds)*D_per[0] + zxy_sinks*len(seeds)*D_per[1]
    Z_tr = np.zeros((X_tr.shape[0], total_D), dtype=float)
    Z_te = np.zeros((X_te.shape[0], total_D), dtype=float)

    col = 0
    for (basis, sink) in tqdm(sink_specs, desc="Quantum sinks"):
        obs = obs_ZX if basis=="ZX" else obs_ZXY
        # train part (only idx_use to save time)
        Zp = qrf_features_batched(est, [sink], obs, Xtr_th[idx_use], batch_size=batch_size, desc_prefix="TR ")
        Z_tr[idx_use, col:col+len(obs)] = Zp
        # test part
        Zt = qrf_features_batched(est, [sink], obs, Xte_th, batch_size=batch_size, desc_prefix="TE ")
        Z_te[:, col:col+len(obs)] = Zt
        col += len(obs)

    # concat PLS comps to quantum features (cheap extra signal)
    Z_tr = np.hstack([Z_tr, Xtr_p])
    Z_te = np.hstack([Z_te, Xte_p])

    # whiten features for linear heads
    zsc = StandardScaler()
    Z_tr_w = zsc.fit_transform(Z_tr)
    Z_te_w = zsc.transform(Z_te)

    # ---------- OOF stacking
    K = 5
    kf = KFold(n_splits=K, shuffle=True, random_state=42)
    oof_rf   = np.zeros_like(y_tr, dtype=float)
    oof_qrid = np.zeros_like(y_tr, dtype=float)
    oof_qhgb = np.zeros_like(y_tr, dtype=float)

    for fold, (tr_idx, va_idx) in enumerate(kf.split(Xtr_full), 1):
        # RF on original features
        rf = RandomForestRegressor(n_estimators=300, max_depth=None, random_state=100+fold, n_jobs=-1)
        rf.fit(Xtr_full[tr_idx], y_tr[tr_idx])
        oof_rf[va_idx] = rf.predict(Xtr_full[va_idx])

        # QRFR heads on quantum features
        ridge = RidgeCV(alphas=np.array([0.03,0.1,0.3,1.0,3.0,10.0,30.0]), cv=3)
        ridge.fit(Z_tr_w[tr_idx], y_tr[tr_idx])
        oof_qrid[va_idx] = ridge.predict(Z_tr_w[va_idx])

        hgb = HistGradientBoostingRegressor(max_depth=3, learning_rate=0.07, max_iter=400,
                                            l2_regularization=0.01, random_state=200+fold)
        hgb.fit(Z_tr_w[tr_idx], y_tr[tr_idx])
        oof_qhgb[va_idx] = hgb.predict(Z_tr_w[va_idx])

        tqdm.write(f"Fold {fold}/{K} done.")

    # meta-learner on OOF preds
    X_oof = np.c_[oof_rf, oof_qrid, oof_qhgb]
    meta = Ridge(alpha=0.3, fit_intercept=True).fit(X_oof, y_tr)

    # fit base models on FULL train for test-time
    rf_full = RandomForestRegressor(n_estimators=300, random_state=123, n_jobs=-1).fit(Xtr_full, y_tr)
    ridge_full = RidgeCV(alphas=np.array([0.03,0.1,0.3,1.0,3.0,10.0,30.0]), cv=5).fit(Z_tr_w, y_tr)
    hgb_full = HistGradientBoostingRegressor(max_depth=3, learning_rate=0.07, max_iter=500,
                                             l2_regularization=0.01, random_state=999).fit(Z_tr_w, y_tr)

    # predictions on test
    y_rf_te  = rf_full.predict(Xte_full)
    y_qr_te  = ridge_full.predict(Z_te_w)
    y_qh_te  = hgb_full.predict(Z_te_w)
    y_stack_te = meta.predict(np.c_[y_rf_te, y_qr_te, y_qh_te])

    # metrics
    def report(tag, yhat):
        rmsd, pcc, r2 = metrics(y_te, yhat)
        acc3 = direction_accuracy(y_te, yhat)*100.0
        print(f"{tag:<18} RMSD={rmsd:.4f}  PCC={pcc:.4f}  R²={r2:.4f}  ACC3={acc3:.1f}%")
        return rmsd, pcc, r2, acc3

    print("\n===== RESULTS (Stacked QRFR vs RF) =====")
    r_rf   = report("RF (baseline)", y_rf_te)
    r_qr   = report("QRFR-Ridge",   y_qr_te)
    r_qh   = report("QRFR-GBDT",    y_qh_te)
    r_meta = report("STACKED (meta)",y_stack_te)

    print("\nDims: ZX sinks/seed=%d  ZXY sinks/seed=%d  seeds=%d  -> Z_dims=%d (+PLS=%d) = %d" %
          (zx_sinks, zxy_sinks, len(seeds),
           zx_sinks*len(seeds)*len(obs_ZX) + zxy_sinks*len(seeds)*len(obs_ZXY),
           Xtr_p.shape[1],
           Z_tr.shape[1]))
    print("Time: %.1fs   AER=%s" % (time.time()-t0, str(AER_OK)))
    return dict(rf=r_rf, qridge=r_qr, qgbdt=r_qh, stacked=r_meta)

if __name__ == "__main__":
    _ = run_qrf_stacked(
        data_path="final_processed_als_data.csv",
        n_qubits=4,
        topk_for_proj=16,
        pls_components=4,
        seeds=(13, 37),      # try (13,37,91) if you have ~1h
        zx_sinks=4,          # raise to 6 if you extend time
        zxy_sinks=3,         # raise to 5 if you extend time
        fmap_reps=1,
        ansatz_reps=2,
        use_pairs=True,
        batch_size=160,
        train_cap=None       # use all training rows
    )


✓ Data loaded: X=(2022, 30), y=(2022,)
✓ Top-16 features: ['fvc_Subject_Liters_Trial_1_slope', 'alsfrs_ALSFRS_Total_slope', 'fvc_Subject_Liters_Trial_1_last', 'fvc_Subject_Liters_Trial_1_std', 'labs_AST(SGOT)_slope', 'vitals_Weight_slope', 'vitals_Pulse_std', 'Age', 'vitals_Vital_Signs_Delta_std', 'vitals_Blood_Pressure_Diastolic_std', 'alsfrs_ALSFRS_Total_std', 'labs_Sodium_slope', 'labs_Bicarbonate_slope', 'labs_Creatinine_slope', 'vitals_Pulse_median', 'labs_Laboratory_Delta_std']


Quantum sinks:   0%|          | 0/14 [00:00<?, ?it/s]

TR Sink 1/1:   0%|          | 0/11 [00:00<?, ?it/s]

TE Sink 1/1:   0%|          | 0/3 [00:00<?, ?it/s]

TR Sink 1/1:   0%|          | 0/11 [00:00<?, ?it/s]

TE Sink 1/1:   0%|          | 0/3 [00:00<?, ?it/s]

TR Sink 1/1:   0%|          | 0/11 [00:00<?, ?it/s]

TE Sink 1/1:   0%|          | 0/3 [00:00<?, ?it/s]

TR Sink 1/1:   0%|          | 0/11 [00:00<?, ?it/s]

TE Sink 1/1:   0%|          | 0/3 [00:00<?, ?it/s]

TR Sink 1/1:   0%|          | 0/11 [00:00<?, ?it/s]

TE Sink 1/1:   0%|          | 0/3 [00:00<?, ?it/s]

TR Sink 1/1:   0%|          | 0/11 [00:00<?, ?it/s]

TE Sink 1/1:   0%|          | 0/3 [00:00<?, ?it/s]

TR Sink 1/1:   0%|          | 0/11 [00:00<?, ?it/s]

TE Sink 1/1:   0%|          | 0/3 [00:00<?, ?it/s]

TR Sink 1/1:   0%|          | 0/11 [00:00<?, ?it/s]

TE Sink 1/1:   0%|          | 0/3 [00:00<?, ?it/s]

TR Sink 1/1:   0%|          | 0/11 [00:00<?, ?it/s]

TE Sink 1/1:   0%|          | 0/3 [00:00<?, ?it/s]

TR Sink 1/1:   0%|          | 0/11 [00:00<?, ?it/s]

TE Sink 1/1:   0%|          | 0/3 [00:00<?, ?it/s]

TR Sink 1/1:   0%|          | 0/11 [00:00<?, ?it/s]

TE Sink 1/1:   0%|          | 0/3 [00:00<?, ?it/s]

TR Sink 1/1:   0%|          | 0/11 [00:00<?, ?it/s]

TE Sink 1/1:   0%|          | 0/3 [00:00<?, ?it/s]

TR Sink 1/1:   0%|          | 0/11 [00:00<?, ?it/s]

TE Sink 1/1:   0%|          | 0/3 [00:00<?, ?it/s]

TR Sink 1/1:   0%|          | 0/11 [00:00<?, ?it/s]

TE Sink 1/1:   0%|          | 0/3 [00:00<?, ?it/s]

Fold 1/5 done.
Fold 2/5 done.
Fold 3/5 done.
Fold 4/5 done.
Fold 5/5 done.

===== RESULTS (Stacked QRFR vs RF) =====
RF (baseline)      RMSD=0.5579  PCC=0.2764  R²=0.0731  ACC3=86.2%
QRFR-Ridge         RMSD=0.5817  PCC=0.1774  R²=-0.0075  ACC3=85.9%
QRFR-GBDT          RMSD=0.6010  PCC=0.1445  R²=-0.0757  ACC3=85.7%
STACKED (meta)     RMSD=0.5572  PCC=0.2770  R²=0.0756  ACC3=86.2%

Dims: ZX sinks/seed=4  ZXY sinks/seed=3  seeds=2  -> Z_dims=220 (+PLS=4) = 224
Time: 1380.4s   AER=False
