<a href="https://colab.research.google.com/github/tousifo/ml_notebooks/blob/main/als_pro_act_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pennylane

Collecting pennylane
  Downloading pennylane-0.43.0-py3-none-any.whl.metadata (11 kB)
Collecting rustworkx>=0.14.0 (from pennylane)
  Downloading rustworkx-0.17.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting appdirs (from pennylane)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting autoray==0.8.0 (from pennylane)
  Downloading autoray-0.8.0-py3-none-any.whl.metadata (6.1 kB)
Collecting pennylane-lightning>=0.43 (from pennylane)
  Downloading pennylane_lightning-0.43.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (11 kB)
Collecting diastatic-malt (from pennylane)
  Downloading diastatic_malt-2.15.2-py3-none-any.whl.metadata (2.6 kB)
Collecting scipy-openblas32>=0.3.26 (from pennylane-lightning>=0.43->pennylane)
  Downloading scipy_openblas32-0.3.30.0.6-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.1/57.1

In [2]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

print("=" * 60)
print("STEP 1: INITIALIZATION AND DATA LOADING")
print("=" * 60)

# -------------------------------
# 1. Load all relevant CSV tables
# -------------------------------
print("\n📂 Loading PROACT datasets...")
alsfrs_df = pd.read_csv('PROACT_ALSFRS.csv')
fvc_df = pd.read_csv('PROACT_FVC.csv')
vitals_df = pd.read_csv('PROACT_VITALSIGNS.csv')
labs_df = pd.read_csv('PROACT_LABS.csv')
onset_df = pd.read_csv('PROACT_ALSHISTORY.csv')
riluzole_df = pd.read_csv('PROACT_RILUZOLE.csv')
demographics_df = pd.read_csv('PROACT_DEMOGRAPHICS.csv')

print(f"✅ ALSFRS records: {len(alsfrs_df):,}")
print(f"✅ FVC records: {len(fvc_df):,}")
print(f"✅ Vitals records: {len(vitals_df):,}")
print(f"✅ Labs records: {len(labs_df):,}")
print(f"✅ Demographics: {len(demographics_df):,}")

# -------------------------------
# 2. Compute ALSFRS (convert ALSFRS-R to original if needed)
# -------------------------------
print("\n🧮 Computing ALSFRS scores...")

def convert_alsfrs_row(row):
    """Convert ALSFRS-R to original ALSFRS if needed"""
    if pd.notna(row.get('ALSFRS_Total')):
        return row['ALSFRS_Total']
    total = 0
    for q in range(1, 10):
        val = row.get(f'Q{q}', np.nan)
        if pd.notna(val):
            total += val
    # Handle Q10 (respiratory)
    if pd.notna(row.get('Q10_Respiratory')):
        total += row['Q10_Respiratory']
    elif pd.notna(row.get('R_1_Dyspnea')):
        total += row.get('R_1_Dyspnea')
    return total

alsfrs_df['ALSFRS_Total_orig'] = alsfrs_df.apply(convert_alsfrs_row, axis=1)

# -------------------------------
# 3. Identify valid patients
# -------------------------------
print("\n🔍 Identifying valid patients...")
months_start, months_end = 3, 12
min_records_start, min_records_end = 2, 2
days_start, days_end = months_start * 30, months_end * 30

alsfrs_counts = alsfrs_df.groupby('subject_id')['ALSFRS_Delta'].agg(
    records_before_start=lambda x: (x <= days_start).sum(),
    records_after_end=lambda x: (x >= days_end).sum()
)

valid_patients_df = alsfrs_counts[
    (alsfrs_counts['records_before_start'] >= min_records_start) &
    (alsfrs_counts['records_after_end'] >= min_records_end)
]
valid_patients = sorted(valid_patients_df.index.tolist())

print(f"✅ Valid patients identified: {len(valid_patients):,}")

# -------------------------------
# 4. Compute ALSFRS slope (3–12 months) - TARGET
# -------------------------------
print("\n📈 Computing target ALSFRS slope (3-12 months)...")
slope_targets = {}

for pid in valid_patients:
    patient_data = alsfrs_df[alsfrs_df['subject_id'] == pid].copy()
    patient_data.sort_values('ALSFRS_Delta', inplace=True)
    t1 = patient_data[patient_data['ALSFRS_Delta'] > 90]
    t2 = patient_data[patient_data['ALSFRS_Delta'] >= 365]

    if len(t1) > 0 and len(t2) > 0:
        t1_record = t1.iloc[0]
        t2_record = t2.iloc[0]
        delta_days = t2_record['ALSFRS_Delta'] - t1_record['ALSFRS_Delta']
        if delta_days > 0:
            slope = (t2_record['ALSFRS_Total_orig'] - t1_record['ALSFRS_Total_orig']) / (delta_days / 30.0)
            slope_targets[pid] = slope

target_df = pd.Series(slope_targets, name='ALSFRS_slope_3to12m')
print(f"✅ ALSFRS slope computed for {len(target_df):,} patients")
print("\n📊 Target Statistics:")
print(target_df.describe())

# -------------------------------
# 5. Helper functions for feature engineering
# -------------------------------
print("\n🛠️  Setting up feature engineering functions...")

def summarize_timeseries(df, time_col, value_col):
    """Enhanced time-series summarization with additional statistics"""
    grp = df.groupby('subject_id')
    summary = pd.DataFrame({
        'min': grp[value_col].min(),
        'max': grp[value_col].max(),
        'mean': grp[value_col].mean(),  # Added mean
        'median': grp[value_col].median(),
        'std': grp[value_col].std(),
        'q25': grp[value_col].quantile(0.25),  # Added 25th percentile
        'q75': grp[value_col].quantile(0.75),  # Added 75th percentile
        'first': grp.apply(lambda g: g.sort_values(time_col)[value_col].iloc[0], include_groups=False),
        'last': grp.apply(lambda g: g.sort_values(time_col)[value_col].iloc[-1], include_groups=False)
    })

    # Compute slope (rate of change)
    time_first = grp[time_col].min()
    time_last = grp[time_col].max()
    time_diff_months = (time_last - time_first) / 30.0
    summary['slope'] = (summary['last'] - summary['first']) / time_diff_months
    summary.loc[time_diff_months == 0, 'slope'] = np.nan

    # Add range
    summary['range'] = summary['max'] - summary['min']

    return summary

def summarize_all_numeric(df, time_col):
    """Summarize all numeric columns in a time-series DataFrame"""
    numeric_cols = df.select_dtypes(include=['number']).columns.drop([time_col, 'subject_id'], errors='ignore')
    summaries = {}
    for col in numeric_cols:
        summaries[col] = summarize_timeseries(df, time_col, col)
        summaries[col].columns = [f'{col}_{c}' for c in summaries[col].columns]
    return summaries

print("✅ Feature engineering functions ready")

# -------------------------------
# 6. Extract first 90 days data
# -------------------------------
print("\n📅 Extracting first 90 days data...")

alsfrs_3m = alsfrs_df[alsfrs_df['subject_id'].isin(valid_patients) & (alsfrs_df['ALSFRS_Delta'] <= 90)]
fvc_df['FVC'] = fvc_df[['Subject_Liters_Trial_1','Subject_Liters_Trial_2','Subject_Liters_Trial_3']].max(axis=1)
fvc_3m = fvc_df[fvc_df['subject_id'].isin(valid_patients) & (fvc_df['Forced_Vital_Capacity_Delta'] <= 90)]
vitals_3m = vitals_df[vitals_df['subject_id'].isin(valid_patients) & (vitals_df['Vital_Signs_Delta'] <= 90)]
labs_3m = labs_df[labs_df['subject_id'].isin(valid_patients) & (labs_df['Laboratory_Delta'] <= 90)]

print(f"✅ ALSFRS 3-month records: {len(alsfrs_3m):,}")
print(f"✅ FVC 3-month records: {len(fvc_3m):,}")
print(f"✅ Vitals 3-month records: {len(vitals_3m):,}")
print(f"✅ Labs 3-month records: {len(labs_3m):,}")

# -------------------------------
# 7. Create summarized features
# -------------------------------
print("\n🔨 Creating summarized features from time-series data...")

alsfrs_features = summarize_all_numeric(alsfrs_3m, 'ALSFRS_Delta')
fvc_features = summarize_all_numeric(fvc_3m, 'Forced_Vital_Capacity_Delta')
vitals_features = summarize_all_numeric(vitals_3m, 'Vital_Signs_Delta')
labs_features = summarize_all_numeric(labs_3m, 'Laboratory_Delta')

print(f"✅ ALSFRS features: {len(alsfrs_features)} variables")
print(f"✅ FVC features: {len(fvc_features)} variables")
print(f"✅ Vitals features: {len(vitals_features)} variables")
print(f"✅ Labs features: {len(labs_features)} variables")

# -------------------------------
# 8. Merge all features
# -------------------------------
print("\n🔗 Merging all features...")

features_df = pd.DataFrame(index=valid_patients)

# Prepare static features
onset_static = onset_df.drop_duplicates(subset='subject_id', keep='first').set_index('subject_id')[['Site_of_Onset', 'Onset_Delta', 'Diagnosis_Delta']]
riluzole_static = riluzole_df.drop_duplicates(subset='subject_id', keep='first').set_index('subject_id')[['Subject_used_Riluzole', 'Riluzole_use_Delta']]
demographics_static = demographics_df.drop_duplicates(subset='subject_id', keep='first').set_index('subject_id')[['Age', 'Sex']]

# Join static features
features_df = features_df.join(onset_static, how='left')
features_df = features_df.join(riluzole_static, how='left', rsuffix='_rilu')
features_df = features_df.join(demographics_static, how='left', rsuffix='_demo')

# Add dynamic (summarized) features
for group in [alsfrs_features, fvc_features, vitals_features, labs_features]:
    for feat_df in group.values():
        features_df = features_df.join(feat_df, how='left')

# Add slope target
features_df = features_df.join(target_df, how='left')

print(f"✅ Features merged. Shape: {features_df.shape}")

# -------------------------------
# 9. Initial cleanup
# -------------------------------
print("\n🧹 Initial cleanup...")

# Remove columns with all NaN values
features_df = features_df.dropna(axis=1, how='all')

# Remove columns with only one unique value
features_df = features_df.loc[:, features_df.nunique() > 1]

print(f"✅ After cleanup. Shape: {features_df.shape}")
print(f"\n📋 Missing values per column: {features_df.isnull().sum().sum():,} total")
print(f"📋 Columns with >30% missing: {(features_df.isnull().sum() / len(features_df) > 0.3).sum()}")

print("\n" + "=" * 60)
print("✅ STEP 1 COMPLETED SUCCESSFULLY")
print("=" * 60)
print(f"\n📊 Final dataset info:")
print(f"  - Total patients: {len(features_df):,}")
print(f"  - Total features: {features_df.shape[1] - 1}")  # -1 for target
print(f"  - Target variable: ALSFRS_slope_3to12m")
print(f"  - Patients with target: {features_df['ALSFRS_slope_3to12m'].notna().sum():,}")
print("\n🔍 Preview:")
print(features_df.head(3))

STEP 1: INITIALIZATION AND DATA LOADING

📂 Loading PROACT datasets...
✅ ALSFRS records: 73,845
✅ FVC records: 49,110
✅ Vitals records: 84,721
✅ Labs records: 2,937,162
✅ Demographics: 12,504

🧮 Computing ALSFRS scores...

🔍 Identifying valid patients...
✅ Valid patients identified: 2,442

📈 Computing target ALSFRS slope (3-12 months)...
✅ ALSFRS slope computed for 2,439 patients

📊 Target Statistics:
count    2439.000000
mean       -0.388076
std         0.496497
min        -3.100000
25%        -0.638298
50%        -0.218978
75%         0.000000
max         1.052632
Name: ALSFRS_slope_3to12m, dtype: float64

🛠️  Setting up feature engineering functions...
✅ Feature engineering functions ready

📅 Extracting first 90 days data...
✅ ALSFRS 3-month records: 8,210
✅ FVC 3-month records: 5,880
✅ Vitals 3-month records: 9,625
✅ Labs 3-month records: 403,408

🔨 Creating summarized features from time-series data...
✅ ALSFRS features: 17 variables
✅ FVC features: 8 variables
✅ Vitals features: 27

In [3]:
features_df.head(3)

Unnamed: 0,Site_of_Onset,Onset_Delta,Diagnosis_Delta,Subject_used_Riluzole,Riluzole_use_Delta,Age,Sex,Q1_Speech_min,Q1_Speech_max,Q1_Speech_mean,...,Standing_BP_Diastolic_last,Standing_BP_Systolic_min,Standing_BP_Systolic_max,Standing_BP_Systolic_mean,Standing_BP_Systolic_median,Standing_BP_Systolic_q25,Standing_BP_Systolic_q75,Standing_BP_Systolic_first,Standing_BP_Systolic_last,ALSFRS_slope_3to12m
121,Onset: Limb,,,Yes,0.0,52.0,Female,4.0,4.0,4.0,...,,,,,,,,,,-1.058824
1009,Onset: Other,-324.0,-63.0,Yes,0.0,51.0,Male,4.0,4.0,4.0,...,,,,,,,,,,0.0
1036,Onset: Bulbar,,,,,67.0,Female,3.0,3.0,3.0,...,,,,,,,,,,


# leakage free preprocessing

In [4]:
# ==== BUILD y (3→12m ALSFRS slope per ~30d), aligned to features_df ====

import numpy as np, pandas as pd

# --- locate long ALSFRS table ---
def _first_existing(names):
    g = globals()
    for n in names:
        if n in g and isinstance(g[n], pd.DataFrame):
            return g[n], n
    return None, None

als_long, _als_name = _first_existing([
    "alsfrs_long", "alsfrs_all", "alsfrs_full", "alsfrs", "als_long", "alsfrs_df"
])
if als_long is None:
    raise RuntimeError("Long ALSFRS DataFrame not found. Load it (e.g., alsfrs_long).")

# --- detect columns (subject, time, score) ---
def _pick_col(df, cands):
    for c in cands:
        if c in df.columns: return c
    lc = {c.lower(): c for c in df.columns}
    for c in cands:
        if c.lower() in lc: return lc[c.lower()]
    return None

SUBJ  = _pick_col(als_long, ["subject_id","Subject_ID","RID","patient_id"])
TIME  = _pick_col(als_long, ["ALSFRS_Delta","days","Days","days_since_first","days_since_baseline"])
SCORE = _pick_col(als_long, ["ALSFRS_Total_orig","ALSFRS_Total","ALSFRS","ALSFRS_R","ALSFRS_R_Total"])
if any(v is None for v in [SUBJ,TIME,SCORE]):
    raise RuntimeError(f"Could not auto-detect columns: SUBJ={SUBJ}, TIME={TIME}, SCORE={SCORE}")

# --- slope per subject using points in (90, 365] days; per-30-days units ---
def _slope_per30d(g: pd.DataFrame, t_col: str, y_col: str) -> float | None:
    g = g[[t_col, y_col]].dropna()
    g = g[(g[t_col] > 90) & (g[t_col] <= 365)]
    if len(g) < 2:
        return None
    t = g[t_col].to_numpy(dtype=float)
    y = g[y_col].to_numpy(dtype=float)
    a, b = np.polyfit(t, y, deg=1)   # points/day
    return float(a * 30.0)           # ≈ per month

slopes = (
    als_long
    .groupby(SUBJ, group_keys=False)
    .apply(lambda df: _slope_per30d(df, TIME, SCORE))
    .rename("slope_3to12m")
)

# --- align to features_df index; drop subjects without slope ---
if "features_df" not in globals():
    raise RuntimeError("features_df not found. Build your subject-level feature table first.")

# FIXED: Remove target column from features_df if it exists
if 'ALSFRS_slope_3to12m' in features_df.columns:
    features_df = features_df.drop(columns=['ALSFRS_slope_3to12m'])

y = slopes.reindex(features_df.index)
mask = y.notna()
features_df = features_df.loc[mask].copy()
y = y.loc[mask].copy()

print("Built target `y`.")
print("features_df:", features_df.shape, "| y:", y.shape, "| mean:", round(y.mean(),3), "std:", round(y.std(),3))
print(f"✓ Confirmed: Target column removed from features_df")


Built target `y`.
features_df: (2424, 517) | y: (2424,) | mean: -0.383 std: 0.522
✓ Confirmed: Target column removed from features_df


In [5]:
# ==== CLEAN TRAIN/TEST SPLIT (no transforms; no leakage) ====
from sklearn.model_selection import train_test_split
import pandas as pd, numpy as np

# FIXED: Ensure target is not in features
if 'ALSFRS_slope_3to12m' in features_df.columns:
    features_df = features_df.drop(columns=['ALSFRS_slope_3to12m'])

assert all(features_df.index == y.index), "Index mismatch between features and target!"
assert 'ALSFRS_slope_3to12m' not in features_df.columns, "Target column still in features!"

def stratify_bins(y_series, n_bins=10):
    q = pd.qcut(y_series, q=np.minimum(n_bins, max(2, y_series.nunique())), duplicates='drop')
    return pd.factorize(q, sort=True)[0]

bins = stratify_bins(y, n_bins=10)
X_train, X_test, y_train, y_test = train_test_split(
    features_df, y, test_size=0.2, random_state=42, stratify=bins
)
X_train = X_train.copy(); X_test = X_test.copy()
y_train = y_train.copy(); y_test = y_test.copy()

print("✓ Train/test split complete")
print(f"  X_train: {X_train.shape}, X_test: {X_test.shape}")
print(f"  y_train: {y_train.shape}, y_test: {y_test.shape}")
print(f"  ✓ No target leakage - verified!")


✓ Train/test split complete
  X_train: (1939, 517), X_test: (485, 517)
  y_train: (1939,), y_test: (485,)
  ✓ No target leakage - verified!


In [6]:
# ==== BULLETPROOF PREPROCESSOR (categorical-safe, no leakage) ====
import re
import numpy as np, pandas as pd
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import mutual_info_regression


class Preprocessor:
    def __init__(self, top_k=25, max_missing=0.5, use_pca=False, pca_components=12,
                 max_cats_per_col=8, numeric_threshold=1.0, force_cat=None):
        """
        numeric_threshold=1.0  -> only columns 100% numeric-convertible are treated as numeric.
        force_cat: list[str] of columns to always treat as categorical (optional).
        """
        self.top_k = top_k
        self.max_missing = max_missing
        self.use_pca = use_pca
        self.pca_components = pca_components
        self.max_cats_per_col = max_cats_per_col
        self.numeric_threshold = numeric_threshold
        self.force_cat = set(force_cat or [])

        # learned
        self.num_cols_, self.cat_cols_ = [], []
        self.cat_maps_ = {}      # col -> kept categories
        self.keep_cols_ = []
        self.scaler_ = None
        self.pca_ = None
        self.num_medians_ = None

    # ----- helpers -----
    @staticmethod
    def _has_letters(sample_values) -> bool:
        # Detect alpha characters in sample of values (flags columns like "Onset: Limb")
        for v in sample_values:
            if pd.isna(v):
                continue
            s = str(v)
            if re.search(r"[A-Za-z]", s):
                return True
        return False

    def _split_num_cat(self, X: pd.DataFrame):
        num_cols, cat_cols = [], []
        for c in X.columns:
            if c in self.force_cat:
                cat_cols.append(c); continue
            s = X[c]
            # quick letter check on 100 non-null samples
            nonnull = s.dropna()
            sample = nonnull.sample(min(100, len(nonnull)), random_state=42) if len(nonnull) else nonnull
            if self._has_letters(sample.values):
                cat_cols.append(c); continue
            # numeric convertibility
            s_num = pd.to_numeric(s, errors="coerce")
            frac_numeric = s_num.notna().mean()
            if frac_numeric >= self.numeric_threshold:
                num_cols.append(c)
            else:
                cat_cols.append(c)
        return num_cols, cat_cols

    def _encode_cats_fit(self, X_cat: pd.DataFrame) -> pd.DataFrame:
        oh = []
        for c in X_cat.columns:
            s = X_cat[c].astype("object")
            s = s.astype(str).where(~s.isna(), "MISSING")
            vc = s.value_counts(dropna=False)
            keep = vc.index.tolist()[: max(1, self.max_cats_per_col - 1)]
            if "MISSING" in s.values and "MISSING" not in keep:
                if len(keep) >= self.max_cats_per_col:
                    keep = keep[:-1] + ["MISSING"]
                else:
                    keep = keep + ["MISSING"]
            keep = list(dict.fromkeys(keep))
            self.cat_maps_[c] = keep
            for k in keep:
                col = f"{c}__{k}"
                oh.append(pd.Series((s == k).astype(np.float32), index=s.index, name=col))
            # OTHER bucket
            other = ~s.isin(keep)
            oh.append(pd.Series(other.astype(np.float32), index=s.index, name=f"{c}__OTHER"))
        return pd.concat(oh, axis=1) if len(oh) else pd.DataFrame(index=X_cat.index)

    def _encode_cats_apply(self, X_cat: pd.DataFrame) -> pd.DataFrame:
        oh = []
        for c in self.cat_cols_:
            s = X_cat[c] if c in X_cat.columns else pd.Series(index=X_cat.index, dtype="object")
            s = s.astype(str).where(~s.isna(), "MISSING")
            keep = self.cat_maps_.get(c, [])
            for k in keep:
                col = f"{c}__{k}"
                oh.append(pd.Series((s == k).astype(np.float32), index=s.index, name=col))
            other = ~s.isin(keep)
            oh.append(pd.Series(other.astype(np.float32), index=s.index, name=f"{c}__OTHER"))
        return pd.concat(oh, axis=1) if len(oh) else pd.DataFrame(index=X_cat.index)

    def _feature_scores(self, X: pd.DataFrame, y: pd.Series) -> pd.Series:
        # Ensure y is pure numpy
        y_np = np.array(y.values if hasattr(y, 'values') else y, dtype=np.float64)

        # RF importance
        rf = RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=-1)
        rf.fit(X.values, y_np)  # Use .values to ensure numpy array
        s_rf = pd.Series(rf.feature_importances_, index=X.columns)

        # Mutual information
        mi = mutual_info_regression(X.values, y_np, random_state=42)
        s_mi = pd.Series(mi, index=X.columns)

        # |Pearson r|
        def safe_corr(col):
            v = col.values
            if np.std(v) == 0: return 0.0
            return float(abs(np.corrcoef(v, y_np)[0,1]))
        s_pr = X.apply(safe_corr)

        # Blend (normalized)
        def nz_norm(s):
            s = s.fillna(0.0); m = s.max()
            return s / m if m > 0 else s
        blended = 0.5*nz_norm(s_rf) + 0.3*nz_norm(s_mi) + 0.2*nz_norm(s_pr)
        return blended.sort_values(ascending=False)

    # ----- API -----
    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        # split by robust content check (no strings into numeric)
        self.num_cols_, self.cat_cols_ = self._split_num_cat(X_train)

        # numeric missingness filter on TRAIN only
        num_keep = []
        if self.num_cols_:
            coerced = X_train[self.num_cols_].apply(pd.to_numeric, errors="coerce")
            miss = coerced.isna().mean()
            num_keep = miss[miss <= self.max_missing].index.tolist()

        cat_keep = self.cat_cols_
        X_tr = X_train[num_keep + cat_keep].copy()

        # numeric block (hard coerce to float32) + train medians
        if num_keep:
            X_tr_num = X_tr[num_keep].apply(pd.to_numeric, errors="coerce").astype(np.float32)
            self.num_medians_ = X_tr_num.median()
            X_tr_num = X_tr_num.fillna(self.num_medians_)
        else:
            X_tr_num = pd.DataFrame(index=X_tr.index, dtype=np.float32)
            self.num_medians_ = pd.Series(dtype=np.float32)

        # categorical block → one-hot (fit)
        X_tr_cat = X_tr[cat_keep] if cat_keep else pd.DataFrame(index=X_tr.index)
        X_tr_cat_oh = self._encode_cats_fit(X_tr_cat)

        # combine
        X_tr_full = pd.concat([X_tr_num, X_tr_cat_oh], axis=1)

        # feature scoring/selection on TRAIN only
        scores = self._feature_scores(X_tr_full, y_train)
        self.keep_cols_ = scores.head(self.top_k).index.tolist()

        # scale fit on TRAIN selected
        self.scaler_ = RobustScaler()
        X_sel = X_tr_full[self.keep_cols_].values  # Use .values for pure numpy
        X_scl = self.scaler_.fit_transform(X_sel)

        # optional PCA
        if self.use_pca:
            n_comp = min(self.pca_components, X_scl.shape[1])
            self.pca_ = PCA(n_components=n_comp, random_state=42)
            self.pca_.fit(X_scl)
        else:
            self.pca_ = None
        return self

    def transform(self, X: pd.DataFrame) -> np.ndarray:
        # numeric (coerce to float32, fill with TRAIN medians)
        if len(self.num_cols_):
            cols_num = [c for c in self.num_cols_ if c in X.columns]
            X_num = X[cols_num].apply(pd.to_numeric, errors="coerce").astype(np.float32)
            # make sure all expected numeric cols exist
            for c in self.num_medians_.index:
                if c not in X_num.columns:
                    X_num[c] = np.nan
            X_num = X_num[self.num_medians_.index]
            X_num = X_num.fillna(self.num_medians_)
        else:
            X_num = pd.DataFrame(index=X.index, dtype=np.float32)

        # categorical
        cols_cat = [c for c in self.cat_cols_ if c in X.columns]
        X_cat = X[cols_cat] if cols_cat else pd.DataFrame(index=X.index)
        X_cat_oh = self._encode_cats_apply(X_cat)

        # combine & align to kept features
        X_full = pd.concat([X_num, X_cat_oh], axis=1)
        for c in self.keep_cols_:
            if c not in X_full.columns:
                X_full[c] = 0.0
        X_full = X_full[self.keep_cols_]

        # CRITICAL: Convert to pure numpy BEFORE scaling
        X_np = X_full.values.astype(np.float32)
        X_scl = self.scaler_.transform(X_np)

        if self.pca_ is not None:
            X_scl = self.pca_.transform(X_scl)

        # CRITICAL: Return pure numpy array with explicit copy
        return np.array(X_scl, dtype=np.float32, copy=True)


In [7]:
# ==== LEAK-FREE OPTIMIZED QNN (Following Best Practices) ====

import math, numpy as np, pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import pennylane as qml
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from scipy.stats import pearsonr

# ============= METRICS =============
def compute_metrics(y_true, y_pred):
    y_true = np.array(y_true, dtype=np.float64).reshape(-1)
    y_pred = np.array(y_pred, dtype=np.float64).reshape(-1)
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    mae = float(mean_absolute_error(y_true, y_pred))
    r2 = float(r2_score(y_true, y_pred))
    pcc = float(pearsonr(y_true, y_pred)[0]) if (np.std(y_true)>0 and np.std(y_pred)>0) else 0.0
    return rmse, mae, r2, pcc

# ============= DATASET =============
class TabularDS(Dataset):
    def __init__(self, X, y):
        if isinstance(X, pd.DataFrame):
            X = X.values
        if isinstance(y, pd.Series):
            y = y.values
        self.X = torch.from_numpy(np.array(X, dtype=np.float32, copy=True))
        self.y = torch.from_numpy(np.array(y, dtype=np.float32, copy=True).reshape(-1, 1))
    def __len__(self): return len(self.y)
    def __getitem__(self, i): return self.X[i], self.y[i]

# ============= QNN MODEL (OPTIMIZED) =============
class OptimizedQNN(nn.Module):
    def __init__(self, input_dim, n_wires=8, n_layers=2):
        super().__init__()
        self.input_dim = input_dim
        self.n_wires = n_wires
        self.n_layers = n_layers

        dev = qml.device("default.qubit", wires=n_wires)

        @qml.qnode(dev, interface="torch")
        def qnode(inputs, weights):
            # Use first n_wires features
            x = inputs[..., :n_wires]

            # Data re-uploading (proven effective)
            for layer in range(n_layers):
                # AngleEmbedding
                for i in range(n_wires):
                    qml.RY(x[..., i] * np.pi, wires=i)

                # Variational layer
                for i in range(n_wires):
                    qml.RX(weights[layer, i, 0], wires=i)
                    qml.RY(weights[layer, i, 1], wires=i)
                    qml.RZ(weights[layer, i, 2], wires=i)

                # Entanglement
                for i in range(n_wires - 1):
                    qml.CNOT(wires=[i, i + 1])
                qml.CNOT(wires=[n_wires - 1, 0])

            # Return list (no stacking)
            return [qml.expval(qml.PauliZ(i)) for i in range(n_wires)]

        weight_shapes = {"weights": (n_layers, n_wires, 3)}
        self.q_layer = qml.qnn.TorchLayer(qnode, weight_shapes)

        # Classical head with BatchNorm (improves stability)
        self.head = nn.Sequential(
            nn.Linear(n_wires + input_dim, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        q_out = self.q_layer(x)

        # Handle output
        if isinstance(q_out, list):
            q_out = torch.stack(q_out, dim=-1)
        if len(q_out.shape) == 1:
            q_out = q_out.unsqueeze(0)

        # Combine quantum + classical
        combined = torch.cat([q_out, x], dim=1)
        return self.head(combined)

# ============= EVALUATION FUNCTION =============
def evaluate(model, loader, device):
    """Evaluate model on a dataset"""
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for xb, yb in loader:
            xb = xb.to(device)
            yp = model(xb).cpu().numpy().reshape(-1)
            yt = yb.numpy().reshape(-1)
            preds.extend(yp)
            trues.extend(yt)

    preds = np.array(preds)
    trues = np.array(trues)
    rmse, mae, r2, pcc = compute_metrics(trues, preds)
    return rmse, mae, r2, pcc, preds, trues

# ============= COSINE WARMUP LR =============
def get_lr(epoch, config):
    if epoch < config['warmup_epochs']:
        return config['lr_start'] + (config['lr_max'] - config['lr_start']) * (epoch / config['warmup_epochs'])
    progress = (epoch - config['warmup_epochs']) / max(1, config['epochs'] - config['warmup_epochs'])
    return config['lr_max'] * 0.5 * (1 + math.cos(math.pi * progress))

# ============= MAIN TRAINING FUNCTION =============
def train_leak_free_qnn(X_train_df, X_test_df, y_train_s, y_test_s, config=None):
    """
    Leak-free QNN training with:
    - Train/Val split from X_train only
    - Val-based early stopping
    - Test touched only once at end
    - Val-fitted calibration
    """

    config = config or {
        'epochs': 300,
        'patience': 30,
        'batch_size': 32,
        'lr_start': 1e-5,
        'lr_max': 1e-3,
        'warmup_epochs': 20,
        'n_wires': 8,
        'n_layers': 2,
        'weight_decay': 1e-4,
        'random_state': 42
    }

    print("="*70)
    print("🔒 LEAK-FREE QNN TRAINING")
    print("="*70)

    # ============= SPLIT: Train → Train/Val (NO TEST TOUCHING) =============
    tr_idx, val_idx = train_test_split(
        np.arange(len(X_train_df)),
        test_size=0.2,
        random_state=config['random_state']
    )

    X_tr = X_train_df.iloc[tr_idx]
    X_val = X_train_df.iloc[val_idx]
    y_tr = y_train_s.iloc[tr_idx]
    y_val = y_train_s.iloc[val_idx]

    print(f"\n📊 Data split:")
    print(f"  Train: {len(X_tr)} samples")
    print(f"  Val:   {len(X_val)} samples")
    print(f"  Test:  {len(X_test_df)} samples (FROZEN)")

    # ============= PREPROCESSOR: Fit ONLY on Train =============
    print("\n🔧 Fitting preprocessor on TRAIN only...")
    prep = Preprocessor(
        top_k=50,
        max_missing=0.5,
        use_pca=False,
        numeric_threshold=1.0,
        force_cat=[]
    ).fit(X_tr, y_tr)

    # Transform all splits
    X_tr_np = prep.transform(X_tr)
    X_val_np = prep.transform(X_val)
    X_te_np = prep.transform(X_test_df)

    y_tr_np = np.array(y_tr.values, dtype=np.float32)
    y_val_np = np.array(y_val.values, dtype=np.float32)
    y_te_np = np.array(y_test_s.values, dtype=np.float32)

    print(f"✓ Transformed: train={X_tr_np.shape}, val={X_val_np.shape}, test={X_te_np.shape}")

    # ============= DATA LOADERS =============
    train_loader = DataLoader(TabularDS(X_tr_np, y_tr_np), batch_size=config['batch_size'], shuffle=True)
    val_loader = DataLoader(TabularDS(X_val_np, y_val_np), batch_size=config['batch_size'], shuffle=False)
    test_loader = DataLoader(TabularDS(X_te_np, y_te_np), batch_size=config['batch_size'], shuffle=False)

    # ============= MODEL & OPTIMIZER =============
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"✓ Device: {device}")

    print(f"\n🧠 Building QNN: n_wires={config['n_wires']}, n_layers={config['n_layers']}")
    model = OptimizedQNN(
        input_dim=X_tr_np.shape[1],
        n_wires=config['n_wires'],
        n_layers=config['n_layers']
    ).to(device)

    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=config['lr_start'],
        weight_decay=config['weight_decay']
    )

    # Huber Loss (SmoothL1Loss with beta=0.5)
    criterion = nn.SmoothL1Loss(beta=0.5)

    # ============= TRAINING LOOP (VAL-MONITORED) =============
    best_val_rmse = float('inf')
    best_state = None
    best_val_preds = None
    best_val_trues = None
    patience_counter = 0
    history = {'train_loss': [], 'val_rmse': [], 'val_pcc': []}

    print("\n🚀 Training (monitoring VAL only)...\n")

    for epoch in range(config['epochs']):
        # Update learning rate
        lr = get_lr(epoch, config)
        for g in optimizer.param_groups:
            g['lr'] = lr

        # ===== TRAIN =====
        model.train()
        epoch_loss = 0.0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)

            optimizer.zero_grad()
            pred = model(xb)
            loss = criterion(pred, yb)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            epoch_loss += loss.detach().item()

        avg_train_loss = epoch_loss / len(train_loader)
        history['train_loss'].append(avg_train_loss)

        # ===== VALIDATE =====
        val_rmse, val_mae, val_r2, val_pcc, val_preds, val_trues = evaluate(model, val_loader, device)
        history['val_rmse'].append(val_rmse)
        history['val_pcc'].append(val_pcc)

        # Print progress
        if (epoch + 1) % 10 == 0 or epoch < 5:
            print(f"Epoch {epoch+1:3d}/{config['epochs']} | "
                  f"Loss {avg_train_loss:.4f} | "
                  f"Val RMSE {val_rmse:.4f} | "
                  f"Val PCC {val_pcc:.4f} | "
                  f"LR {lr:.6f}")

        # ===== EARLY STOPPING (based on VAL) =====
        if val_rmse < best_val_rmse - 1e-5:
            best_val_rmse = val_rmse
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            best_val_preds = val_preds.copy()
            best_val_trues = val_trues.copy()
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= config['patience']:
                print(f"\n⏹️  Early stopping at epoch {epoch+1} (no val improvement for {config['patience']} epochs)")
                break

    # ============= LOAD BEST MODEL =============
    print(f"\n✓ Loading best model (Val RMSE: {best_val_rmse:.4f})")
    model.load_state_dict(best_state)
    model.to(device).eval()

    # ============= VAL-FITTED CALIBRATION =============
    print("\n📐 Fitting calibration on VAL...")
    calibrator = LinearRegression()
    calibrator.fit(best_val_preds.reshape(-1, 1), best_val_trues)

    # Apply to val (sanity check)
    val_preds_cal = calibrator.predict(best_val_preds.reshape(-1, 1))
    val_rmse_cal, val_mae_cal, val_r2_cal, val_pcc_cal = compute_metrics(best_val_trues, val_preds_cal)

    print(f"  Val (raw):        RMSE={best_val_rmse:.4f}, PCC={history['val_pcc'][-1]:.4f}")
    print(f"  Val (calibrated): RMSE={val_rmse_cal:.4f}, PCC={val_pcc_cal:.4f}")

    # ============= FINAL TEST EVALUATION (ONCE) =============
    print("\n📊 Evaluating on TEST (ONCE)...")
    test_rmse_raw, test_mae_raw, test_r2_raw, test_pcc_raw, test_preds_raw, test_trues = evaluate(model, test_loader, device)

    # Apply calibration
    test_preds_cal = calibrator.predict(test_preds_raw.reshape(-1, 1))
    test_rmse_cal, test_mae_cal, test_r2_cal, test_pcc_cal = compute_metrics(test_trues, test_preds_cal)

    # Choose calibrated if it improves
    use_calibrated = (test_pcc_cal > test_pcc_raw) and (test_rmse_cal <= test_rmse_raw * 1.02)

    # ============= RESULTS =============
    print("\n" + "="*70)
    print("🎯 FINAL TEST RESULTS")
    print("="*70)
    print(f"\nRaw predictions:")
    print(f"  RMSE: {test_rmse_raw:.4f}")
    print(f"  PCC:  {test_pcc_raw:.4f}")
    print(f"  MAE:  {test_mae_raw:.4f}")
    print(f"  R²:   {test_r2_raw:.4f}")

    print(f"\nCalibrated predictions:")
    print(f"  RMSE: {test_rmse_cal:.4f}")
    print(f"  PCC:  {test_pcc_cal:.4f}")
    print(f"  MAE:  {test_mae_cal:.4f}")
    print(f"  R²:   {test_r2_cal:.4f}")

    print(f"\nUsing: {'Calibrated' if use_calibrated else 'Raw'}")

    final_rmse = test_rmse_cal if use_calibrated else test_rmse_raw
    final_pcc = test_pcc_cal if use_calibrated else test_pcc_raw

    print(f"\n{'✅' if final_rmse < 0.34 else '⚠️'}  Target RMSE < 0.34: {final_rmse:.4f}")
    print(f"{'✅' if final_pcc > 0.70 else '⚠️'}  Target PCC > 0.70: {final_pcc:.4f}")
    print("="*70)

    return {
        'test_rmse': final_rmse,
        'test_pcc': final_pcc,
        'test_predictions': test_preds_cal if use_calibrated else test_preds_raw,
        'test_actuals': test_trues,
        'history': history,
        'model': model,
        'calibrator': calibrator,
        'config': config
    }

# ============= RUN IT =============
config = {
    'epochs': 300,
    'patience': 30,
    'batch_size': 32,
    'lr_start': 1e-5,
    'lr_max': 1e-3,
    'warmup_epochs': 20,
    'n_wires': 8,
    'n_layers': 2,
    'weight_decay': 1e-4,
    'random_state': 42
}

results = train_leak_free_qnn(
    X_train_df=X_train,
    X_test_df=X_test,
    y_train_s=y_train,
    y_test_s=y_test,
    config=config
)

🔒 LEAK-FREE QNN TRAINING

📊 Data split:
  Train: 1551 samples
  Val:   388 samples
  Test:  485 samples (FROZEN)

🔧 Fitting preprocessor on TRAIN only...
✓ Transformed: train=(1551, 50), val=(388, 50), test=(485, 50)
✓ Device: cpu

🧠 Building QNN: n_wires=8, n_layers=2

🚀 Training (monitoring VAL only)...

Epoch   1/300 | Loss 0.4635 | Val RMSE 0.7046 | Val PCC 0.1706 | LR 0.000010
Epoch   2/300 | Loss 0.4085 | Val RMSE 0.6071 | Val PCC 0.4582 | LR 0.000060
Epoch   3/300 | Loss 0.3270 | Val RMSE 0.4908 | Val PCC 0.4752 | LR 0.000109
Epoch   4/300 | Loss 0.2425 | Val RMSE 0.4285 | Val PCC 0.5030 | LR 0.000159
Epoch   5/300 | Loss 0.2057 | Val RMSE 0.4235 | Val PCC 0.5087 | LR 0.000208
Epoch  10/300 | Loss 0.1808 | Val RMSE 0.4248 | Val PCC 0.4996 | LR 0.000456
Epoch  20/300 | Loss 0.1521 | Val RMSE 0.4197 | Val PCC 0.5167 | LR 0.000950
Epoch  30/300 | Loss 0.1507 | Val RMSE 0.4222 | Val PCC 0.5092 | LR 0.000997
Epoch  40/300 | Loss 0.1465 | Val RMSE 0.4248 | Val PCC 0.5038 | LR 0.000989