<a href="https://colab.research.google.com/github/tousifo/ml_notebooks/blob/main/ALS_QNN_PRO_ACT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import os
import warnings

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings('ignore')

class ALSDataProcessor:
    """
    A robust class to load, clean, and process PRO-ACT data for predicting ALSFRS slope,
    replicating the methodology from the "Deep learning methods to predict amyotrophic
    lateral sclerosis disease progression" paper.
    """
    def __init__(self):
        self.label_encoders = {}
        # A list of columns to exclude from feature engineering
        self.id_and_delta_cols = [
            'subject_id', 'alsfrs_delta', 'fvc_delta', 'vitals_delta',
            'labs_delta', 'grip_delta', 'muscle_delta', 'onset_delta',
            'death_delta', 'history_delta'
        ]

    def _convert_alsfrs_r(self, alsfrs_df):
        """Convert ALSFRS-R questions to the original ALSFRS format."""
        df = alsfrs_df.copy()
        # Ensure ALSFRS_Total is numeric, coercing errors
        df['ALSFRS_Total'] = pd.to_numeric(df['ALSFRS_Total'], errors='coerce')
        return df

    def load_and_inspect_data(self, file_path=''):
        """Load all datasets and inspect their structure."""
        datasets = {}
        file_list = [
            'PROACT_ALSFRS.csv', 'PROACT_FVC.csv', 'PROACT_VITALSIGNS.csv',
            'PROACT_RILUZOLE.csv', 'PROACT_DEMOGRAPHICS.csv', 'PROACT_LABS.csv',
            'PROACT_DEATHDATA.csv', 'PROACT_HANDGRIPSTRENGTH.csv',
            'PROACT_MUSCLESTRENGTH.csv'
        ]
        print("--- Loading and Inspecting Data ---")
        for file_name in file_list:
            try:
                df = pd.read_csv(file_path + file_name)

                # --- CORRECTED RENAMING LOGIC ---
                # Check if 'subject_id' already exists. If not, find a candidate and rename only the first one found.
                if 'subject_id' not in df.columns:
                    potential_id_cols = [col for col in df.columns if 'subject' in col.lower()]
                    if potential_id_cols:
                        df.rename(columns={potential_id_cols[0]: 'subject_id'}, inplace=True)
                # --- END CORRECTION ---

                datasets[file_name] = df
                print(f"✓ {file_name}: Loaded successfully with shape {df.shape}")
            except FileNotFoundError:
                print(f"✗ {file_name}: File not found. Will be skipped.")
        return datasets

    def calculate_alsfrs_slope(self, alsfrs_df):
        """Calculate the primary target variable: ALSFRS slope between months 3-12."""
        df = alsfrs_df.copy()
        df.rename(columns={c:'alsfrs_delta' for c in df.columns if 'delta' in c.lower()}, inplace=True)
        df['months'] = df['alsfrs_delta'] / 30.44
        df.sort_values(['subject_id', 'months'], inplace=True)

        slopes = {}
        for subject_id, subject_data in df.groupby('subject_id'):
            t1_candidates = subject_data[(subject_data['months'] > 3) & (subject_data['months'] <= 12)]
            t2_candidates = subject_data[subject_data['months'] >= 12]

            if not t1_candidates.empty and not t2_candidates.empty:
                t1_row = t1_candidates.iloc[0]
                t2_row = t2_candidates.iloc[0]

                t1, alsfrs_t1 = t1_row['months'], t1_row['ALSFRS_Total']
                t2, alsfrs_t2 = t2_row['months'], t2_row['ALSFRS_Total']

                if t2 > t1 and pd.notna(alsfrs_t1) and pd.notna(alsfrs_t2):
                    slope = (alsfrs_t2 - alsfrs_t1) / (t2 - t1)
                    slopes[subject_id] = slope

        return pd.DataFrame(list(slopes.items()), columns=['subject_id', 'alsfrs_slope'])

    def create_longitudinal_features(self, df, time_col, prefix):
        """Create the seven summary statistics from longitudinal data (first 3 months)."""
        df_sorted = df.sort_values(['subject_id', time_col])
        df_filtered = df_sorted[df_sorted[time_col] <= 90].copy()

        value_cols = [col for col in df_filtered.select_dtypes(include=np.number).columns
                      if col.lower() not in self.id_and_delta_cols]

        if not value_cols:
            return pd.DataFrame()

        summary_dfs = []
        for value_col in value_cols:
            grouped = df_filtered.groupby('subject_id')
            summary = grouped[value_col].agg(['min', 'max', 'median', 'first', 'last']).join(
                grouped[value_col].std(ddof=0).rename('std')
            )

            slope_df = grouped.apply(
                lambda g: (g[value_col].iloc[-1] - g[value_col].iloc[0]) / (g[time_col].iloc[-1] - g[time_col].iloc[0])
                if len(g) > 1 and (g[time_col].iloc[-1] - g[time_col].iloc[0]) > 0 else np.nan
            ).rename('slope')

            summary = summary.join(slope_df).fillna(0)
            summary.columns = [f"{prefix}{value_col}_{stat}" for stat in summary.columns]
            summary_dfs.append(summary)

        return pd.concat(summary_dfs, axis=1).reset_index()

    def process_static_data(self, df):
        """Process static data files (like demographics, riluzole)."""
        processed = df.copy()
        for col in processed.select_dtypes(include=['object', 'category']).columns:
            if col != 'subject_id':
                le = self.label_encoders.setdefault(col, LabelEncoder())
                processed[col] = le.fit_transform(processed[col].astype(str))
        return processed.drop_duplicates(subset=['subject_id'])

    def merge_all_features(self, datasets):
        """Merge all static and longitudinal features into a single dataframe."""
        if 'PROACT_DEMOGRAPHICS.csv' not in datasets:
            raise ValueError("Demographics file is missing.")

        final_df = self.process_static_data(datasets['PROACT_DEMOGRAPHICS.csv'])

        static_files = ['PROACT_RILUZOLE.csv']
        for file in static_files:
            if file in datasets:
                static_df = self.process_static_data(datasets[file])
                final_df = pd.merge(final_df, static_df, on='subject_id', how='left')

        longitudinal_configs = {
            'PROACT_ALSFRS.csv': 'alsfrs_',
            'PROACT_FVC.csv': 'fvc_',
            'PROACT_VITALSIGNS.csv': 'vitals_',
            'PROACT_LABS.csv': 'labs_',
            'PROACT_HANDGRIPSTRENGTH.csv': 'grip_',
            'PROACT_MUSCLESTRENGTH.csv': 'muscle_'
        }

        print("\n--- Generating Longitudinal Features (from first 3 months) ---")
        for file, prefix in longitudinal_configs.items():
            if file in datasets:
                df = datasets[file].copy()
                time_col_actual = next((c for c in df.columns if 'delta' in c.lower()), None)
                if not time_col_actual:
                    print(f"Warning: No time delta column found in {file}. Skipping.")
                    continue

                print(f"Processing {file}...")
                summary_features = self.create_longitudinal_features(df, time_col_actual, prefix)
                if not summary_features.empty:
                    final_df = pd.merge(final_df, summary_features, on='subject_id', how='left')

        return final_df

    def filter_eligible_patients(self, feature_df, alsfrs_df):
        """Filter for patients meeting the paper's criteria."""
        df = alsfrs_df.copy()
        df.rename(columns={c:'alsfrs_delta' for c in df.columns if 'delta' in c.lower()}, inplace=True)
        df['months'] = df['alsfrs_delta'] / 30.44

        eligibility = df.groupby('subject_id')['months'].agg(['min', 'max'])
        eligible_ids = eligibility[(eligibility['min'] <= 3) & (eligibility['max'] >= 12)].index

        print(f"\nFound {len(eligible_ids)} eligible patients out of {df['subject_id'].nunique()}.")
        return feature_df[feature_df['subject_id'].isin(eligible_ids)]

    def run_pipeline(self, file_path=''):
        """Execute the complete data preprocessing pipeline."""
        print("====== Starting ALS Data Preprocessing Pipeline ======")
        datasets = self.load_and_inspect_data(file_path)

        if 'PROACT_ALSFRS.csv' not in datasets:
            print("CRITICAL ERROR: PROACT_ALSFRS.csv not found. Aborting.")
            return None

        datasets['PROACT_ALSFRS.csv'] = self._convert_alsfrs_r(datasets['PROACT_ALSFRS.csv'])

        target_df = self.calculate_alsfrs_slope(datasets['PROACT_ALSFRS.csv'])
        print(f"\nCalculated ALSFRS slope for {len(target_df)} patients.")

        full_features = self.merge_all_features(datasets)

        eligible_features = self.filter_eligible_patients(full_features, datasets['PROACT_ALSFRS.csv'])

        final_df = pd.merge(eligible_features, target_df, on='subject_id', how='inner')

        print("\n--- Handling Missing Values ---")
        missing_thresh = 0.30
        initial_cols = len(final_df.columns)
        max_missing = len(final_df) * (1 - missing_thresh)
        final_df.dropna(axis=1, thresh=max_missing, inplace=True)
        print(f"Dropped {initial_cols - len(final_df.columns)} features with >{missing_thresh*100}% missing values.")

        X = final_df.drop(columns=['subject_id', 'alsfrs_slope'])
        y = final_df['alsfrs_slope']

        valid_y_mask = y.notna()
        X = X[valid_y_mask]
        y = y[valid_y_mask]
        subject_ids = final_df.loc[valid_y_mask, 'subject_id']

        imputer = SimpleImputer(strategy='median')
        X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

        print("\n--- Performing Feature Selection (Top 30 via Random Forest) ---")
        rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
        rf.fit(X_imputed, y)

        importance_df = pd.DataFrame({
            'feature': X.columns,
            'importance': rf.feature_importances_
        }).sort_values('importance', ascending=False)

        selected_features = importance_df['feature'].head(30).tolist()
        X_selected = X_imputed[selected_features]

        print("\n====== Pipeline Complete ======")
        print(f"Final feature matrix shape: {X_selected.shape}")
        print(f"Final target vector shape: {y.shape}")

        # Save the final data for the next step
        final_output = pd.concat([subject_ids.reset_index(drop=True),
                                  y.reset_index(drop=True),
                                  X_selected.reset_index(drop=True)], axis=1)
        final_output.to_csv("final_processed_als_data.csv", index=False)
        print("\n✅ Successfully saved processed data to 'final_processed_als_data.csv'")

        return {
            'X': X_selected,
            'y': y,
            'subject_ids': subject_ids,
            'feature_importance': importance_df,
        }

if __name__ == "__main__":
    # --- IMPORTANT ---
    # Directory that holds the PRO-ACT CSV files; leave empty to read from the
    # current working directory. If your files are elsewhere, change this path.
    # For example: file_path = "C:/Users/YourUser/Downloads/PROACT_data/"
    # (the example ends with a path separator; keep one on your own path too).
    file_path = ""

    processor = ALSDataProcessor()
    processed_data = processor.run_pipeline(file_path=file_path)

    # run_pipeline returns None when PROACT_ALSFRS.csv could not be loaded,
    # so guard before indexing into the result.
    if processed_data:
        print("\n--- Top 15 Most Important Features ---")
        print(processed_data['feature_importance'].head(15))

--- Loading and Inspecting Data ---
✓ PROACT_ALSFRS.csv: Loaded successfully with shape (73845, 20)
✓ PROACT_FVC.csv: Loaded successfully with shape (49110, 10)
✓ PROACT_VITALSIGNS.csv: Loaded successfully with shape (84721, 36)
✓ PROACT_RILUZOLE.csv: Loaded successfully with shape (10363, 3)
✓ PROACT_DEMOGRAPHICS.csv: Loaded successfully with shape (12504, 14)
✓ PROACT_LABS.csv: Loaded successfully with shape (2937162, 5)
✓ PROACT_DEATHDATA.csv: Loaded successfully with shape (5043, 3)
✓ PROACT_HANDGRIPSTRENGTH.csv: Loaded successfully with shape (19032, 11)
✓ PROACT_MUSCLESTRENGTH.csv: Loaded successfully with shape (204875, 10)

Calculated ALSFRS slope for 2023 patients.

--- Generating Longitudinal Features (from first 3 months) ---
Processing PROACT_ALSFRS.csv...
Processing PROACT_FVC.csv...
Processing PROACT_VITALSIGNS.csv...
Processing PROACT_LABS.csv...
Processing PROACT_HANDGRIPSTRENGTH.csv...
Processing PROACT_MUSCLESTRENGTH.csv...

Found 3475 eligible patients out of 8538.



In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
import warnings

warnings.filterwarnings('ignore')

def calculate_metrics(y_true, y_pred):
    """Return ``(rmsd, pcc)`` for a set of predictions.

    ``rmsd`` is the square root of the mean squared error between ``y_true``
    and ``y_pred``; ``pcc`` is the Pearson correlation coefficient reported by
    ``scipy.stats.pearsonr`` (its p-value is discarded).
    """
    mse = mean_squared_error(y_true, y_pred)
    correlation = pearsonr(y_true, y_pred)[0]
    return np.sqrt(mse), correlation

def run_classical_pipeline():
    """Train and score the classical baselines on the preprocessed data.

    Reads ``final_processed_als_data.csv`` from the working directory, makes
    an 80/20 train/test split (seed 42), fits a random forest on the raw
    features and an RBF support vector regressor on standardised features,
    and prints RMSD and PCC on the test split for both.

    Returns:
        DataFrame indexed by model name with ``RMSD`` and ``PCC`` columns, or
        ``None`` when the input CSV does not exist.
    """
    print("====== Starting Classical Baseline Model Pipeline ======")

    # --- 1. Load Data ---
    try:
        data = pd.read_csv("final_processed_als_data.csv")
    except FileNotFoundError:
        print("✗ ERROR: 'final_processed_als_data.csv' not found. Please run the preprocessing script first.")
        return
    else:
        print(f"✓ Successfully loaded 'final_processed_als_data.csv' with shape {data.shape}")

    # --- 2. Prepare Data ---
    target = data['alsfrs_slope']
    features = data.drop(columns=['subject_id', 'alsfrs_slope'])

    # 80/20 Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )
    print(f"Data split into training ({len(X_train)} samples) and testing ({len(X_test)} samples).")

    # Standardise for the SVR only; the scaler is fit on the training split.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # --- 3. Train and Evaluate Models ---
    # (result label, banner, estimator, training inputs, test inputs)
    experiments = [
        (
            'Random Forest',
            "\n--- Training Random Forest Regressor ---",
            RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
            X_train,
            X_test,
        ),
        (
            'Support Vector Regressor',
            "\n--- Training Support Vector Regressor (SVR) ---",
            SVR(kernel='rbf', C=1.0, epsilon=0.1),
            X_train_scaled,
            X_test_scaled,
        ),
    ]

    results = {}
    for label, banner, model, train_inputs, test_inputs in experiments:
        print(banner)
        model.fit(train_inputs, y_train)
        rmsd, pcc = calculate_metrics(y_test, model.predict(test_inputs))
        results[label] = {'RMSD': rmsd, 'PCC': pcc}
        print("✓ Training and evaluation complete.")

    # --- 4. Display Results ---
    print("\n====== Classical Model Performance ======")
    results_df = pd.DataFrame(results).transpose()
    print(results_df)
    print("\nReminder:")
    print("  - RMSD (Root Mean Squared Deviation): Lower is better.")
    print("  - PCC (Pearson Correlation Coefficient): Higher is better (closer to 1.0).")

    return results_df

if __name__ == "__main__":
    # Runs the baselines; reads final_processed_als_data.csv from the current
    # working directory (the returned results table is not kept here).
    run_classical_pipeline()

✓ Successfully loaded 'final_processed_als_data.csv' with shape (2022, 32)
Data split into training (1617 samples) and testing (405 samples).

--- Training Random Forest Regressor ---
✓ Training and evaluation complete.

--- Training Support Vector Regressor (SVR) ---
✓ Training and evaluation complete.

                              RMSD       PCC
Random Forest             0.566234  0.228837
Support Vector Regressor  0.578394  0.212229

Reminder:
  - RMSD (Root Mean Squared Deviation): Lower is better.
  - PCC (Pearson Correlation Coefficient): Higher is better (closer to 1.0).
