<a href="https://colab.research.google.com/github/tousifo/ml_notebooks/blob/main/ALS_QNN_PRO_ACT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
%pip install qiskit~=1.0 qiskit-machine-learning~=0.8.1 qiskit_algorithms

# Qiskit Imports
from qiskit.circuit.library import ZZFeatureMap, RealAmplitudes
from qiskit_algorithms.optimizers import COBYLA
from qiskit_machine_learning.algorithms.regressors import VQR
from qiskit.primitives import Sampler



In [4]:
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
warnings.filterwarnings('ignore')
class ALSDataProcessor:
    """
    A robust class to load, clean, and process PRO-ACT data for predicting ALSFRS slope,
    replicating the methodology from the "Deep learning methods to predict amyotrophic
    lateral sclerosis disease progression" paper.
    """
    def __init__(self):
        self.label_encoders = {}
        # A list of columns to exclude from feature engineering
        self.id_and_delta_cols = [
            'subject_id', 'alsfrs_delta', 'fvc_delta', 'vitals_delta',
            'labs_delta', 'grip_delta', 'muscle_delta', 'onset_delta',
            'death_delta', 'history_delta'
        ]
    def _convert_alsfrs_r(self, alsfrs_df):
        """Convert ALSFRS-R questions to the original ALSFRS format."""
        df = alsfrs_df.copy()
        # Ensure ALSFRS_Total is numeric, coercing errors
        df['ALSFRS_Total'] = pd.to_numeric(df['ALSFRS_Total'], errors='coerce')
        return df
    def load_and_inspect_data(self, file_path=''):
        """Load all datasets and inspect their structure."""
        datasets = {}
        file_list = [
            'PROACT_ALSFRS.csv', 'PROACT_FVC.csv', 'PROACT_VITALSIGNS.csv',
            'PROACT_RILUZOLE.csv', 'PROACT_DEMOGRAPHICS.csv', 'PROACT_LABS.csv',
            'PROACT_DEATHDATA.csv', 'PROACT_HANDGRIPSTRENGTH.csv',
            'PROACT_MUSCLESTRENGTH.csv', 'PROACT_ALSHISTORY.csv' # Added missing file
        ]
        print("--- Loading and Inspecting Data ---")
        for file_name in file_list:
            try:
                df = pd.read_csv(file_path + file_name, on_bad_lines='skip')
                # --- CORRECTED RENAMING LOGIC ---
                # Check if 'subject_id' already exists. If not, find a candidate and rename only the first one found.
                if 'subject_id' not in df.columns:
                    potential_id_cols = [col for col in df.columns if 'subject' in col.lower()]
                    if potential_id_cols:
                        df.rename(columns={potential_id_cols[0]: 'subject_id'}, inplace=True)
                # --- END CORRECTION ---
                # Convert delta columns to numeric
                delta_cols = [col for col in df.columns if 'delta' in col.lower()]
                for col in delta_cols:
                    df[col] = pd.to_numeric(df[col], errors='coerce')
                datasets[file_name] = df
                print(f"✓ {file_name}: Loaded successfully with shape {df.shape}")
            except FileNotFoundError:
                print(f"✗ {file_name}: File not found. Will be skipped.")
        return datasets
    def calculate_alsfrs_slope(self, alsfrs_df):
        """Calculate the primary target variable: ALSFRS slope between months 3-12."""
        df = alsfrs_df.copy()
        df.rename(columns={c:'alsfrs_delta' for c in df.columns if 'delta' in c.lower()}, inplace=True)
        df['months'] = df['alsfrs_delta'] / 30.44
        df.sort_values(['subject_id', 'months'], inplace=True)
        slopes = {}
        for subject_id, subject_data in df.groupby('subject_id'):
            t1_candidates = subject_data[(subject_data['months'] > 3) & (subject_data['months'] <= 12)]
            t2_candidates = subject_data[subject_data['months'] >= 12]
            if not t1_candidates.empty and not t2_candidates.empty:
                t1_row = t1_candidates.iloc[0]
                t2_row = t2_candidates.iloc[0]
                t1, alsfrs_t1 = t1_row['months'], t1_row['ALSFRS_Total']
                t2, alsfrs_t2 = t2_row['months'], t2_row['ALSFRS_Total']
                if t2 > t1 and pd.notna(alsfrs_t1) and pd.notna(alsfrs_t2):
                    slope = (alsfrs_t2 - alsfrs_t1) / (t2 - t1)
                    slopes[subject_id] = slope
        return pd.DataFrame(list(slopes.items()), columns=['subject_id', 'alsfrs_slope'])
    def create_longitudinal_features(self, df, time_col, prefix):
        """Create the seven summary statistics from longitudinal data (first 3 months)."""
        df_sorted = df.sort_values(['subject_id', time_col])
        # Convert potential value columns to numeric
        potential_value_cols = [col for col in df_sorted.columns if col not in ['subject_id', time_col]]
        for col in potential_value_cols:
            df_sorted[col] = pd.to_numeric(df_sorted[col], errors='coerce')
        df_filtered = df_sorted[df_sorted[time_col] <= 90].copy()
        value_cols = [col for col in df_filtered.select_dtypes(include=np.number).columns
                      if col.lower() not in self.id_and_delta_cols]
        if not value_cols:
            return pd.DataFrame()
        summary_dfs = []
        for value_col in value_cols:
            grouped = df_filtered.groupby('subject_id')
            summary = grouped[value_col].agg(['min', 'max', 'median', 'first', 'last']).join(
                grouped[value_col].std(ddof=0).rename('std')
            )
            # Ensure there are at least two data points for slope calculation
            slope_df = grouped.apply(
                lambda g: (g[value_col].iloc[-1] - g[value_col].iloc[0]) / (g[time_col].iloc[-1] - g[time_col].iloc[0])
                if len(g) > 1 and (g[time_col].iloc[-1] - g[time_col].iloc[0]) > 0 else np.nan
            ).rename('slope')
            summary = summary.join(slope_df).fillna(0) # Fill NaN slopes with 0
            summary.columns = [f"{prefix}{value_col}_{stat}" for stat in summary.columns]
            summary_dfs.append(summary)
        return pd.concat(summary_dfs, axis=1).reset_index()
    def process_static_data(self, df):
        """Process static data files (like demographics, riluzole)."""
        processed = df.copy()
        for col in processed.select_dtypes(include=['object', 'category']).columns:
            if col != 'subject_id':
                le = self.label_encoders.setdefault(col, LabelEncoder())
                processed[col] = le.fit_transform(processed[col].astype(str))
        return processed.drop_duplicates(subset=['subject_id'])
    def merge_all_features(self, datasets):
        """Merge all static and longitudinal features into a single dataframe."""
        if 'PROACT_DEMOGRAPHICS.csv' not in datasets:
            raise ValueError("Demographics file is missing.")
        final_df = self.process_static_data(datasets['PROACT_DEMOGRAPHICS.csv'])
        static_files = ['PROACT_RILUZOLE.csv', 'PROACT_ALSHISTORY.csv'] # Added ALSHISTORY
        for file in static_files:
            if file in datasets:
                static_df = self.process_static_data(datasets[file])
                final_df = pd.merge(final_df, static_df, on='subject_id', how='left')
        longitudinal_configs = {
            'PROACT_ALSFRS.csv': 'alsfrs_',
            'PROACT_FVC.csv': 'fvc_',
            'PROACT_VITALSIGNS.csv': 'vitals_',
            'PROACT_LABS.csv': 'labs_',
            'PROACT_HANDGRIPSTRENGTH.csv': 'grip_',
            'PROACT_MUSCLESTRENGTH.csv': 'muscle_'
        }
        print("\n--- Generating Longitudinal Features (from first 3 months) ---")
        for file, prefix in longitudinal_configs.items():
            if file in datasets:
                df = datasets[file].copy()
                time_col_actual = next((c for c in df.columns if 'delta' in c.lower()), None)
                if not time_col_actual:
                    print(f"Warning: No time delta column found in {file}. Skipping.")
                    continue
                print(f"Processing {file}...")
                # Pivot long format files if necessary
                if file in ['PROACT_LABS.csv', 'PROACT_MUSCLESTRENGTH.csv', 'PROACT_HANDGRIPSTRENGTH.csv']:
                    try:
                        test_cols = [c for c in df.columns if any(keyword in c.lower() for keyword in ['test', 'exam', 'muscle', 'site', 'name', 'strength_test']) and c not in ['subject_id', time_col_actual]]
                        if test_cols:
                            test_col = test_cols[0]
                            value_cols = [c for c in df.columns if any(keyword in c.lower() for keyword in ['result', 'value', 'strength', 'score']) and c not in ['subject_id', time_col_actual]]
                            if value_cols:
                                value_col = value_cols[0]
                                df[value_col] = pd.to_numeric(df[value_col], errors='coerce')
                                df = df.pivot_table(index=['subject_id', time_col_actual], columns=test_col, values=value_col, aggfunc='mean').reset_index()
                    except Exception as e:
                        print(f"Warning: Pivoting failed for {file}: {e}")
                summary_features = self.create_longitudinal_features(df, time_col_actual, prefix)
                if not summary_features.empty:
                    final_df = pd.merge(final_df, summary_features, on='subject_id', how='left')
        return final_df
    def filter_eligible_patients(self, feature_df, alsfrs_df):
        """Filter for patients meeting the paper's criteria."""
        df = alsfrs_df.copy()
        df.rename(columns={c:'alsfrs_delta' for c in df.columns if 'delta' in c.lower()}, inplace=True)
        df['months'] = df['alsfrs_delta'] / 30.44
        eligibility = df.groupby('subject_id')['months'].agg(['min', 'max'])
        eligible_ids = eligibility[(eligibility['min'] <= 3) & (eligibility['max'] >= 12)].index
        print(f"\nFound {len(eligible_ids)} eligible patients out of {df['subject_id'].nunique()}.")
        return feature_df[feature_df['subject_id'].isin(eligible_ids)]
    def run_pipeline(self, file_path=''):
        """Execute the complete data preprocessing pipeline."""
        print("====== Starting ALS Data Preprocessing Pipeline ======")
        datasets = self.load_and_inspect_data(file_path)
        if 'PROACT_ALSFRS.csv' not in datasets:
            print("CRITICAL ERROR: PROACT_ALSFRS.csv not found. Aborting.")
            return None
        datasets['PROACT_ALSFRS.csv'] = self._convert_alsfrs_r(datasets['PROACT_ALSFRS.csv'])
        target_df = self.calculate_alsfrs_slope(datasets['PROACT_ALSFRS.csv'])
        print(f"\nCalculated ALSFRS slope for {len(target_df)} patients.")
        full_features = self.merge_all_features(datasets)
        eligible_features = self.filter_eligible_patients(full_features, datasets['PROACT_ALSFRS.csv'])
        final_df = pd.merge(eligible_features, target_df, on='subject_id', how='inner')
        print("\n--- Handling Missing Values ---")
        missing_thresh = 0.30
        initial_cols = len(final_df.columns)
        max_missing = len(final_df) * (1 - missing_thresh)
        final_df.dropna(axis=1, thresh=max_missing, inplace=True)
        print(f"Dropped {initial_cols - len(final_df.columns)} features with >{missing_thresh*100}% missing values.")
        X = final_df.drop(columns=['subject_id', 'alsfrs_slope'])
        y = final_df['alsfrs_slope']
        valid_y_mask = y.notna()
        X = X[valid_y_mask]
        y = y[valid_y_mask]
        subject_ids = final_df.loc[valid_y_mask, 'subject_id']
        imputer = SimpleImputer(strategy='median')
        X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
        print("\n--- Performing Feature Selection (Top 30 via Random Forest) ---")
        rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
        rf.fit(X_imputed, y)
        importance_df = pd.DataFrame({
            'feature': X.columns,
            'importance': rf.feature_importances_
        }).sort_values('importance', ascending=False)
        selected_features = importance_df['feature'].head(30).tolist()
        X_selected = X_imputed[selected_features]
        print("\n====== Pipeline Complete ======")
        print(f"Final feature matrix shape: {X_selected.shape}")
        print(f"Final target vector shape: {y.shape}")
        # Save the final data for the next step
        final_output = pd.concat([subject_ids.reset_index(drop=True),
                                  y.reset_index(drop=True),
                                  X_selected.reset_index(drop=True)], axis=1)
        final_output.to_csv("final_processed_als_data.csv", index=False)
        print("\n✅ Successfully saved processed data to 'final_processed_als_data.csv'")
        return {
            'X': X_selected,
            'y': y,
            'subject_ids': subject_ids,
            'feature_importance': importance_df,
        }
if __name__ == "__main__":
    # --- IMPORTANT ---
    # If your CSV files are in a different folder, change this path.
    # For example: file_path = "C:/Users/YourUser/Downloads/PROACT_data/"
    file_path = ""
    processor = ALSDataProcessor()
    processed_data = processor.run_pipeline(file_path=file_path)
    if processed_data:
        print("\n--- Top 15 Most Important Features ---")
        print(processed_data['feature_importance'].head(15))

--- Loading and Inspecting Data ---
✓ PROACT_ALSFRS.csv: Loaded successfully with shape (73845, 20)
✓ PROACT_FVC.csv: Loaded successfully with shape (49110, 10)
✓ PROACT_VITALSIGNS.csv: Loaded successfully with shape (91226, 36)
✓ PROACT_RILUZOLE.csv: Loaded successfully with shape (10363, 3)
✓ PROACT_DEMOGRAPHICS.csv: Loaded successfully with shape (12504, 14)
✓ PROACT_LABS.csv: Loaded successfully with shape (2960262, 5)
✓ PROACT_DEATHDATA.csv: Loaded successfully with shape (5043, 3)
✓ PROACT_HANDGRIPSTRENGTH.csv: Loaded successfully with shape (19032, 11)
✓ PROACT_MUSCLESTRENGTH.csv: Loaded successfully with shape (213421, 10)
✓ PROACT_ALSHISTORY.csv: Loaded successfully with shape (13765, 16)

Calculated ALSFRS slope for 2023 patients.

--- Generating Longitudinal Features (from first 3 months) ---
Processing PROACT_ALSFRS.csv...
Processing PROACT_FVC.csv...
Processing PROACT_VITALSIGNS.csv...
Processing PROACT_LABS.csv...
Processing PROACT_HANDGRIPSTRENGTH.csv...
Processing PROAC

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
import warnings

warnings.filterwarnings('ignore')

def calculate_metrics(y_true, y_pred):
    """Calculates RMSD and PCC."""
    rmsd = np.sqrt(mean_squared_error(y_true, y_pred))
    pcc, _ = pearsonr(y_true, y_pred)
    return rmsd, pcc

def run_classical_pipeline():
    """
    Loads the processed data, trains baseline models, and evaluates their performance.
    """
    print("====== Starting Classical Baseline Model Pipeline ======")

    # --- 1. Load Data ---
    try:
        data = pd.read_csv("final_processed_als_data.csv")
        print(f"✓ Successfully loaded 'final_processed_als_data.csv' with shape {data.shape}")
    except FileNotFoundError:
        print("✗ ERROR: 'final_processed_als_data.csv' not found. Please run the preprocessing script first.")
        return

    # --- 2. Prepare Data ---
    X = data.drop(columns=['subject_id', 'alsfrs_slope'])
    y = data['alsfrs_slope']

    # 80/20 Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"Data split into training ({X_train.shape[0]} samples) and testing ({X_test.shape[0]} samples).")

    # Scale data for SVR
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # --- 3. Train and Evaluate Models ---
    results = {}

    # Model 1: Random Forest Regressor
    print("\n--- Training Random Forest Regressor ---")
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    rf_model.fit(X_train, y_train)
    rf_preds = rf_model.predict(X_test)
    rf_rmsd, rf_pcc = calculate_metrics(y_test, rf_preds)
    results['Random Forest'] = {'RMSD': rf_rmsd, 'PCC': rf_pcc}
    print("✓ Training and evaluation complete.")

    # Model 2: Support Vector Regressor
    print("\n--- Training Support Vector Regressor (SVR) ---")
    svr_model = SVR(kernel='rbf', C=1.0, epsilon=0.1)
    svr_model.fit(X_train_scaled, y_train)
    svr_preds = svr_model.predict(X_test_scaled)
    svr_rmsd, svr_pcc = calculate_metrics(y_test, svr_preds)
    results['Support Vector Regressor'] = {'RMSD': svr_rmsd, 'PCC': svr_pcc}
    print("✓ Training and evaluation complete.")

    # --- 4. Display Results ---
    print("\n====== Classical Model Performance ======")
    results_df = pd.DataFrame(results).T
    print(results_df)
    print("\nReminder:")
    print("  - RMSD (Root Mean Squared Deviation): Lower is better.")
    print("  - PCC (Pearson Correlation Coefficient): Higher is better (closer to 1.0).")

    return results_df

if __name__ == "__main__":
    run_classical_pipeline()

✓ Successfully loaded 'final_processed_als_data.csv' with shape (2022, 32)
Data split into training (1617 samples) and testing (405 samples).

--- Training Random Forest Regressor ---
✓ Training and evaluation complete.

--- Training Support Vector Regressor (SVR) ---
✓ Training and evaluation complete.

                              RMSD       PCC
Random Forest             0.560428  0.266818
Support Vector Regressor  0.574893  0.246569

Reminder:
  - RMSD (Root Mean Squared Deviation): Lower is better.
  - PCC (Pearson Correlation Coefficient): Higher is better (closer to 1.0).


In [6]:
# qnn_main_fast.py
import os, numpy as np, pandas as pd
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings("ignore")

# clamp threads so CPU-only envs don't thrash
os.environ.setdefault("OMP_NUM_THREADS", "1")
os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
os.environ.setdefault("MKL_NUM_THREADS", "1")
os.environ.setdefault("NUMEXPR_NUM_THREADS", "1")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import mutual_info_regression
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import pearsonr

np.random.seed(42)

# --- Qiskit ---
from qiskit.circuit.library import ZZFeatureMap, RealAmplitudes, EfficientSU2
from qiskit.quantum_info import SparsePauliOp
from qiskit.primitives import Estimator
from qiskit_machine_learning.neural_networks import EstimatorQNN

try:
    from qiskit_aer.primitives import Estimator as AerEstimator
    AER_OK = True
except Exception:
    AER_OK = False

# --- Torch ---
import torch
from torch.utils.data import TensorDataset, DataLoader

# ---------------- utils ----------------
def safe_pcc(y_true, y_pred):
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.std()==0 or y_pred.std()==0: return 0.0
    v = pearsonr(y_true, y_pred)[0]
    return float(v) if np.isfinite(v) else 0.0

def metrics(y_true, y_pred):
    y_pred = np.asarray(y_pred).ravel()
    rmsd = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    return rmsd, safe_pcc(y_true, y_pred), float(r2_score(y_true, y_pred))

def load_xy(path="final_processed_als_data.csv"):
    df = pd.read_csv(path)
    if "alsfrs_slope" not in df.columns:
        raise ValueError("Target 'alsfrs_slope' missing.")
    X = df.drop(columns=["subject_id", "alsfrs_slope"], errors="ignore")
    y = df["alsfrs_slope"].values
    m = ~np.isnan(y)
    X, y = X.loc[m].reset_index(drop=True), y[m]
    print(f"✓ Data loaded: X={X.shape}, y={y.shape}", flush=True)
    return X, y

def select_topk_features(X, y, k=8):
    imp = SimpleImputer(strategy="median")
    Xn = imp.fit_transform(X)
    rf = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1).fit(Xn, y)
    rf_rank = rf.feature_importances_
    mi = mutual_info_regression(Xn, y, random_state=42)
    corr = np.array([abs(np.corrcoef(Xn[:, i], y)[0,1]) if Xn[:, i].std()>0 else 0.0
                     for i in range(Xn.shape[1])])
    def nz(v): m=v.max(); return v/(m+1e-8) if m>0 else v
    score = nz(rf_rank)+nz(mi)+nz(corr)
    idx = np.argsort(score)[::-1][:k]
    cols = [X.columns[i] for i in idx]
    print(f"✓ Top-{k} features: {cols}", flush=True)
    return idx, cols

# -------------- QNN builder (fast) --------------
def build_qnn(n_qubits=4, fmap_reps=1, ansatz_reps=1, use_pairs=False, use_efficient=False):
    """
    Fast EstimatorQNN for CPU:
      - outputs only <Z_i> (dim=n_qubits) when use_pairs=False (recommended for speed)
      - set use_pairs=True to add <Z_i Z_j> later (slower)
    """
    feature_map = ZZFeatureMap(feature_dimension=n_qubits, reps=fmap_reps)
    ansatz = (EfficientSU2 if use_efficient else RealAmplitudes)(
        num_qubits=n_qubits, reps=ansatz_reps, entanglement="linear"
    )
    circuit = feature_map.compose(ansatz)

    obs = []
    for i in range(n_qubits):
        p = ['I']*n_qubits; p[i]='Z'
        obs.append(SparsePauliOp.from_list([("".join(p[::-1]), 1.0)]))
    if use_pairs:
        for i in range(n_qubits):
            for j in range(i+1, n_qubits):
                p = ['I']*n_qubits; p[i]=p[j]='Z'
                obs.append(SparsePauliOp.from_list([("".join(p[::-1]), 1.0)]))

    est = AerEstimator() if AER_OK else Estimator()
    qnn = EstimatorQNN(
        estimator=est,
        circuit=circuit,
        observables=obs,
        input_params=feature_map.parameters,
        weight_params=ansatz.parameters
    )
    return qnn, len(obs)

# -------------- Hybrid head --------------
class HybridQNN(torch.nn.Module):
    def __init__(self, qnn, out_dim, hidden=16):
        super().__init__()
        from qiskit_machine_learning.connectors import TorchConnector
        self.q_layer = TorchConnector(qnn)            # (B, D)
        self.head = torch.nn.Sequential(
            torch.nn.Linear(out_dim, hidden),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden, 1),
            torch.nn.Tanh(),                          # keep in [-1,1]
        )
        for m in self.head.modules():
            if isinstance(m, torch.nn.Linear):
                torch.nn.init.normal_(m.weight, mean=0.0, std=0.05)
                torch.nn.init.zeros_(m.bias)

    def forward(self, x):
        return self.head(self.q_layer(x))

# -------------- Train loop (fast settings) --------------
def train(model, X_tr, y_tr, X_val, y_val,
          lr=0.02, wd=1e-4, batch_size=96, epochs=80, patience=12, clip=1.0, device="cpu"):
    loss_fn = torch.nn.MSELoss()
    opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    sch = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=max(epochs, 50))

    tr = TensorDataset(torch.tensor(X_tr, dtype=torch.float32),
                       torch.tensor(y_tr, dtype=torch.float32).view(-1,1))
    va = TensorDataset(torch.tensor(X_val, dtype=torch.float32),
                       torch.tensor(y_val, dtype=torch.float32).view(-1,1))
    tr_ld = DataLoader(tr, batch_size=batch_size, shuffle=True)
    va_ld = DataLoader(va, batch_size=batch_size, shuffle=False)

    model.to(device)
    best, best_state, bad = np.inf, None, 0

    for ep in range(1, epochs+1):
        model.train()
        tot = 0.0
        for xb, yb in tqdm(tr_ld, desc=f"Epoch {ep}/{epochs}", leave=False):
            xb, yb = xb.to(device), yb.to(device)
            opt.zero_grad()
            pred = model(xb)
            loss = loss_fn(pred, yb)
            loss.backward()
            if clip: torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
            opt.step()
            tot += loss.item() * xb.size(0)
        tr_loss = tot / len(tr_ld.dataset)

        model.eval()
        with torch.no_grad():
            vtot = 0.0
            for xb, yb in va_ld:
                xb, yb = xb.to(device), yb.to(device)
                pred = model(xb)
                vtot += loss_fn(pred, yb).item() * xb.size(0)
            va_loss = vtot / len(va_ld.dataset)

        tqdm.write(f"[{ep}] train={tr_loss:.5f}  val={va_loss:.5f}  lr={sch.get_last_lr()[0]:.4g}")

        if va_loss + 1e-8 < best:
            best, bad = va_loss, 0
            best_state = {k: v.detach().cpu().clone() for k,v in model.state_dict().items()}
        else:
            bad += 1
            if bad >= patience:
                tqdm.write(f"Early stopping at epoch {ep} (patience {patience}).")
                break
        sch.step()

    if best_state is not None:
        model.load_state_dict(best_state)
    model.eval()
    return model

# -------------- Pipeline --------------
def run(data_path="final_processed_als_data.csv",
        n_qubits=4, fmap_reps=1, ansatz_reps=1,
        use_pairs=False,  # set True later for more capacity (slower)
        test_size=0.2, val_frac=0.1, use_gpu=False):
    X, y = load_xy(data_path)
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=test_size, random_state=42)

    # top-8 then pick best 4 for the circuit
    idx8, cols8 = select_topk_features(X_tr, y_tr, k=8)
    imp = SimpleImputer(strategy="median")
    std = StandardScaler()
    Xtr8 = std.fit_transform(imp.fit_transform(X_tr.iloc[:, idx8]))
    Xte8 = std.transform(imp.transform(X_te.iloc[:, idx8]))

    # choose best 4 locally on train
    idx4_local, cols4 = select_topk_features(pd.DataFrame(Xtr8, columns=[X.columns[i] for i in idx8]), y_tr, k=n_qubits)
    Xtr4 = Xtr8[:, idx4_local]
    Xte4 = Xte8[:, idx4_local]

    # angles in [0, π]; targets in [-1,1]
    ang = MinMaxScaler(feature_range=(0.0, np.pi))
    Xtr_th = ang.fit_transform(Xtr4)
    Xte_th = ang.transform(Xte4)

    ysc = MinMaxScaler(feature_range=(-1.0, 1.0))
    y_tr_s = ysc.fit_transform(y_tr.reshape(-1,1)).ravel()

    # val split
    X_trn, X_val, y_trn, y_val = train_test_split(Xtr_th, y_tr_s, test_size=val_frac, random_state=123)

    # QNN + tiny head
    qnn, out_dim = build_qnn(n_qubits=n_qubits, fmap_reps=fmap_reps, ansatz_reps=ansatz_reps,
                             use_pairs=use_pairs, use_efficient=False)
    device = "cuda" if (use_gpu and torch.cuda.is_available()) else "cpu"
    print(f"Training fast QNN on device={device} (AER={AER_OK}) — out_dim={out_dim}, features={cols4}", flush=True)
    model = HybridQNN(qnn, out_dim=out_dim, hidden=16)

    model = train(model, X_trn, y_trn, X_val, y_val,
                  lr=0.02, wd=1e-4, batch_size=96, epochs=80, patience=12, clip=1.0, device=device)

    # predict
    with torch.no_grad():
        y_hat_s = model(torch.tensor(Xte_th, dtype=torch.float32).to(device)).cpu().numpy().ravel()
    y_hat = ysc.inverse_transform(y_hat_s.reshape(-1,1)).ravel()

    rmsd, pcc, r2 = metrics(y_te, y_hat)
    print("\n===== QNN (FAST) TEST METRICS =====")
    print(f"RMSD: {rmsd:.4f}")
    print(f"PCC : {pcc:.4f}")
    print(f"R²  : {r2:.4f}")
    return dict(rmsd=rmsd, pcc=pcc, r2=r2, y_true=y_te, y_pred=y_hat, q_features=cols4)

if __name__ == "__main__":
    _ = run(
        data_path="final_processed_als_data.csv",
        n_qubits=4,
        fmap_reps=1,       # keep 1 for speed (re-upload later if you want)
        ansatz_reps=1,     # keep 1 for speed (increase to 2 once it's smooth)
        use_pairs=False,   # set True later to include ZZ pairs (slower)
        test_size=0.2,
        val_frac=0.1,
        use_gpu=False
    )


✓ Data loaded: X=(2022, 30), y=(2022,)
✓ Top-8 features: ['fvc_Subject_Liters_Trial_1_std', 'alsfrs_ALSFRS_Total_std', 'fvc_Subject_Liters_Trial_1_slope', 'vitals_Vital_Signs_Delta_std', 'vitals_Pulse_median', 'fvc_Subject_Liters_Trial_1_last', 'fvc_pct_of_Normal_Trial_1_std', 'alsfrs_ALSFRS_Total_slope']
✓ Top-4 features: ['fvc_Subject_Liters_Trial_1_std', 'alsfrs_ALSFRS_Total_std', 'vitals_Vital_Signs_Delta_std', 'fvc_Subject_Liters_Trial_1_slope']
Training fast QNN on device=cpu (AER=False) — out_dim=4, features=['fvc_Subject_Liters_Trial_1_std', 'alsfrs_ALSFRS_Total_std', 'vitals_Vital_Signs_Delta_std', 'fvc_Subject_Liters_Trial_1_slope']


Epoch 1/80:   0%|          | 0/16 [00:00<?, ?it/s]

[1] train=0.09588  val=0.08504  lr=0.02


Epoch 2/80:   0%|          | 0/16 [00:00<?, ?it/s]

[2] train=0.08267  val=0.08474  lr=0.01999


Epoch 3/80:   0%|          | 0/16 [00:00<?, ?it/s]

[3] train=0.08239  val=0.08443  lr=0.01997


Epoch 4/80:   0%|          | 0/16 [00:00<?, ?it/s]

[4] train=0.08212  val=0.08554  lr=0.01993


Epoch 5/80:   0%|          | 0/16 [00:00<?, ?it/s]

[5] train=0.08287  val=0.08439  lr=0.01988


Epoch 6/80:   0%|          | 0/16 [00:00<?, ?it/s]

[6] train=0.08277  val=0.08392  lr=0.01981


Epoch 7/80:   0%|          | 0/16 [00:00<?, ?it/s]

[7] train=0.08276  val=0.08393  lr=0.01972


Epoch 8/80:   0%|          | 0/16 [00:00<?, ?it/s]

[8] train=0.08205  val=0.08469  lr=0.01962


Epoch 9/80:   0%|          | 0/16 [00:00<?, ?it/s]

[9] train=0.08113  val=0.08371  lr=0.01951


Epoch 10/80:   0%|          | 0/16 [00:00<?, ?it/s]

[10] train=0.08111  val=0.08418  lr=0.01938


Epoch 11/80:   0%|          | 0/16 [00:00<?, ?it/s]

[11] train=0.08123  val=0.08461  lr=0.01924


Epoch 12/80:   0%|          | 0/16 [00:00<?, ?it/s]

[12] train=0.08068  val=0.08549  lr=0.01908


Epoch 13/80:   0%|          | 0/16 [00:00<?, ?it/s]

[13] train=0.08086  val=0.08614  lr=0.01891


Epoch 14/80:   0%|          | 0/16 [00:00<?, ?it/s]

[14] train=0.08040  val=0.08713  lr=0.01872


Epoch 15/80:   0%|          | 0/16 [00:00<?, ?it/s]

[15] train=0.07986  val=0.08792  lr=0.01853


Epoch 16/80:   0%|          | 0/16 [00:00<?, ?it/s]

[16] train=0.07981  val=0.08746  lr=0.01831


Epoch 17/80:   0%|          | 0/16 [00:00<?, ?it/s]

[17] train=0.07971  val=0.08833  lr=0.01809


Epoch 18/80:   0%|          | 0/16 [00:00<?, ?it/s]

[18] train=0.07988  val=0.08796  lr=0.01785


Epoch 19/80:   0%|          | 0/16 [00:00<?, ?it/s]

[19] train=0.07978  val=0.08839  lr=0.0176


Epoch 20/80:   0%|          | 0/16 [00:00<?, ?it/s]

[20] train=0.07956  val=0.08992  lr=0.01734


Epoch 21/80:   0%|          | 0/16 [00:00<?, ?it/s]

[21] train=0.07956  val=0.09038  lr=0.01707
Early stopping at epoch 21 (patience 12).

===== QNN (FAST) TEST METRICS =====
RMSD: 0.5823
PCC : 0.0073
R²  : -0.0098
