<a href="https://colab.research.google.com/github/tousifo/ml_notebooks/blob/main/als_pro_act.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

# -------------------------------
# 1. Load all relevant CSV tables
# -------------------------------
alsfrs_df = pd.read_csv('PROACT_ALSFRS.csv')
fvc_df = pd.read_csv('PROACT_FVC.csv')
vitals_df = pd.read_csv('PROACT_VITALSIGNS.csv')
labs_df = pd.read_csv('PROACT_LABS.csv')
onset_df = pd.read_csv('PROACT_ALSHISTORY.csv')
riluzole_df = pd.read_csv('PROACT_RILUZOLE.csv')
demographics_df = pd.read_csv('PROACT_DEMOGRAPHICS.csv')

# -------------------------------
# 2. Compute ALSFRS (convert ALSFRS-R to original if needed)
# -------------------------------
def convert_alsfrs_row(row):
    if pd.notna(row.get('ALSFRS_Total')):
        return row['ALSFRS_Total']
    total = 0
    for q in range(1, 10):
        val = row.get(f'Q{q}', np.nan)
        if pd.notna(val):
            total += val
    # Handle Q10 (respiratory)
    if pd.notna(row.get('Q10_Respiratory')):
        total += row['Q10_Respiratory']
    elif pd.notna(row.get('R_1_Dyspnea')):
        total += row.get('R_1_Dyspnea')
    return total

alsfrs_df['ALSFRS_Total_orig'] = alsfrs_df.apply(convert_alsfrs_row, axis=1)

# -------------------------------
# 3. Identify valid patients
# -------------------------------
months_start, months_end = 3, 12
min_records_start, min_records_end = 2, 2
days_start, days_end = months_start * 30, months_end * 30

alsfrs_counts = alsfrs_df.groupby('subject_id')['ALSFRS_Delta'].agg(
    records_before_start=lambda x: (x <= days_start).sum(),
    records_after_end=lambda x: (x >= days_end).sum()
)

valid_patients_df = alsfrs_counts[
    (alsfrs_counts['records_before_start'] >= min_records_start) &
    (alsfrs_counts['records_after_end'] >= min_records_end)
]
valid_patients = sorted(valid_patients_df.index.tolist())

print(f"✅ Valid patients: {len(valid_patients)}")

# -------------------------------
# 4. Compute ALSFRS slope (3–12 months)
# -------------------------------
slope_targets = {}
for pid in valid_patients:
    patient_data = alsfrs_df[alsfrs_df['subject_id'] == pid].copy()
    patient_data.sort_values('ALSFRS_Delta', inplace=True)
    t1 = patient_data[patient_data['ALSFRS_Delta'] > 90]
    t2 = patient_data[patient_data['ALSFRS_Delta'] >= 365]
    if len(t1) > 0 and len(t2) > 0:
        t1_record = t1.iloc[0]
        t2_record = t2.iloc[0]
        delta_days = t2_record['ALSFRS_Delta'] - t1_record['ALSFRS_Delta']
        if delta_days > 0:
            slope = (t2_record['ALSFRS_Total_orig'] - t1_record['ALSFRS_Total_orig']) / (delta_days / 30.0)
            slope_targets[pid] = slope

target_df = pd.Series(slope_targets, name='ALSFRS_slope_3to12m')
print("✅ ALSFRS slope computed for", len(target_df), "patients")
print(target_df.describe())

# -------------------------------
# 5. Helper: summarize all numeric columns in a time-series table
# -------------------------------
def summarize_timeseries(df, time_col, value_col):
    grp = df.groupby('subject_id')
    summary = pd.DataFrame({
        'min': grp[value_col].min(),
        'max': grp[value_col].max(),
        'median': grp[value_col].median(),
        'std': grp[value_col].std(),
        'first': grp.apply(lambda g: g.sort_values(time_col)[value_col].iloc[0], include_groups=False),
        'last': grp.apply(lambda g: g.sort_values(time_col)[value_col].iloc[-1], include_groups=False)
    })
    time_first = grp[time_col].min()
    time_last = grp[time_col].max()
    time_diff_months = (time_last - time_first) / 30.0
    summary['slope'] = (summary['last'] - summary['first']) / time_diff_months
    summary.loc[time_diff_months == 0, 'slope'] = np.nan
    return summary

def summarize_all_numeric(df, time_col):
    numeric_cols = df.select_dtypes(include=['number']).columns.drop([time_col, 'subject_id'], errors='ignore') # Exclude subject_id
    summaries = {}
    for col in numeric_cols:
        summaries[col] = summarize_timeseries(df, time_col, col)
        summaries[col].columns = [f'{col}_{c}' for c in summaries[col].columns]
    return summaries

# -------------------------------
# 6. Subset to first 90 days and summarize automatically
# -------------------------------
alsfrs_3m = alsfrs_df[alsfrs_df['subject_id'].isin(valid_patients) & (alsfrs_df['ALSFRS_Delta'] <= 90)]
fvc_df['FVC'] = fvc_df[['Subject_Liters_Trial_1','Subject_Liters_Trial_2','Subject_Liters_Trial_3']].max(axis=1)
fvc_3m = fvc_df[fvc_df['subject_id'].isin(valid_patients) & (fvc_df['Forced_Vital_Capacity_Delta'] <= 90)]
vitals_3m = vitals_df[vitals_df['subject_id'].isin(valid_patients) & (vitals_df['Vital_Signs_Delta'] <= 90)]
labs_3m = labs_df[labs_df['subject_id'].isin(valid_patients) & (labs_df['Laboratory_Delta'] <= 90)]

alsfrs_features = summarize_all_numeric(alsfrs_3m, 'ALSFRS_Delta')
fvc_features = summarize_all_numeric(fvc_3m, 'Forced_Vital_Capacity_Delta')
vitals_features = summarize_all_numeric(vitals_3m, 'Vital_Signs_Delta')
labs_features = summarize_all_numeric(labs_3m, 'Laboratory_Delta')


# -------------------------------
# 7. Merge all features
# -------------------------------
features_df = pd.DataFrame(index=valid_patients)

# Select relevant columns from static tables and set index
onset_static = onset_df.drop_duplicates(subset='subject_id', keep='first').set_index('subject_id')[['Site_of_Onset', 'Onset_Delta', 'Diagnosis_Delta']]
riluzole_static = riluzole_df.drop_duplicates(subset='subject_id', keep='first').set_index('subject_id')[['Subject_used_Riluzole', 'Riluzole_use_Delta']]
demographics_static = demographics_df.drop_duplicates(subset='subject_id', keep='first').set_index('subject_id')[['Age', 'Sex']]


# Join static tables with suffixes
features_df = features_df.join(onset_static, how='left')
features_df = features_df.join(riluzole_static, how='left', rsuffix='_rilu')
features_df = features_df.join(demographics_static, how='left', rsuffix='_demo')


# Add dynamic (summarized) features
for group in [alsfrs_features, fvc_features, vitals_features, labs_features]:
    for feat_df in group.values():
        features_df = features_df.join(feat_df, how='left')


# Add slope target
features_df = features_df.join(target_df, how='left')

# -------------------------------
# 8. Clean up
# -------------------------------
features_df = features_df.dropna(axis=1, how='all')
features_df = features_df.loc[:, features_df.nunique() > 1]

print(f"✅ Final features shape: {features_df.shape}")
print(features_df.head(3))

✅ Valid patients: 2442
✅ ALSFRS slope computed for 2439 patients
count    2439.000000
mean       -0.388076
std         0.496497
min        -3.100000
25%        -0.638298
50%        -0.218978
75%         0.000000
max         1.052632
Name: ALSFRS_slope_3to12m, dtype: float64
✅ Final features shape: (2442, 330)
      Site_of_Onset  Onset_Delta  Diagnosis_Delta Subject_used_Riluzole  \
121     Onset: Limb          NaN              NaN                   Yes   
1009   Onset: Other       -324.0            -63.0                   Yes   
1036  Onset: Bulbar          NaN              NaN                   NaN   

      Riluzole_use_Delta   Age     Sex  Q1_Speech_min  Q1_Speech_max  \
121                  0.0  52.0  Female            4.0            4.0   
1009                 0.0  51.0    Male            4.0            4.0   
1036                 NaN  67.0  Female            3.0            3.0   

      Q1_Speech_median  ...  Standing_BP_Diastolic_max  \
121                4.0  ...              

In [None]:
features_df.head(3)

Unnamed: 0,Site_of_Onset,Onset_Delta,Diagnosis_Delta,Subject_used_Riluzole,Riluzole_use_Delta,Age,Sex,Q1_Speech_min,Q1_Speech_max,Q1_Speech_median,...,Standing_BP_Diastolic_max,Standing_BP_Diastolic_median,Standing_BP_Diastolic_first,Standing_BP_Diastolic_last,Standing_BP_Systolic_min,Standing_BP_Systolic_max,Standing_BP_Systolic_median,Standing_BP_Systolic_first,Standing_BP_Systolic_last,ALSFRS_slope_3to12m
121,Onset: Limb,,,Yes,0.0,52.0,Female,4.0,4.0,4.0,...,,,,,,,,,,-1.058824
1009,Onset: Other,-324.0,-63.0,Yes,0.0,51.0,Male,4.0,4.0,4.0,...,,,,,,,,,,0.0
1036,Onset: Bulbar,,,,,67.0,Female,3.0,3.0,3.0,...,,,,,,,,,,


In [None]:
# import pandas as pd

# # Load CSV files (assuming they are in the working directory or mounted drive)
# alsfrs_df = pd.read_csv('PROACT_ALSFRS.csv')
# fvc_df = pd.read_csv('PROACT_FVC.csv')
# vitals_df = pd.read_csv('PROACT_VITALSIGNS.csv')
# labs_df = pd.read_csv('PROACT_LABS.csv')
# onset_df = pd.read_csv('PROACT_ALSHISTORY.csv')
# riluzole_df = pd.read_csv('PROACT_RILUZOLE.csv')
# # (Load other tables like Demographics if available)
# demographics_df = pd.read_csv('PROACT_DEMOGRAPHICS.csv')

# # Convert ALSFRS-R to ALSFRS (original 0-40 scale) according to guidelines:contentReference[oaicite:5]{index=5}.
# def convert_alsfrs_row(row):
#     # If original ALSFRS total is present, use it
#     if pd.notna(row['ALSFRS_Total']):
#         return row['ALSFRS_Total']
#     # Otherwise, use ALSFRS-R values to compute original total
#     total = 0
#     # Sum questions 1–9 (same in ALSFRS and ALSFRS-R)
#     for q in range(1, 10):
#         total += row.get(f'Q{q}_{alsfrs_df.columns[1+ (q-1)]}', 0)  # using actual Q# columns
#     # For Q10, use R_1_Dyspnea if available (ALSFRS-R) or Q10_Respiratory if original
#     if pd.notna(row.get('Q10_Respiratory')):
#         total += row['Q10_Respiratory']
#     elif pd.notna(row.get('R_1_Dyspnea')):
#         total += row.get('R_1_Dyspnea', 0) # Added default value 0 for get
#     # Merge Q5a and Q5b (if one is NaN, use the other)
#     # (Already counted in sum above if present; ensure not double-counted)
#     return total

# alsfrs_df['ALSFRS_Total_orig'] = alsfrs_df.apply(convert_alsfrs_row, axis=1)

# # Determine patients with required ALSFRS timeline
# # ALSFRS_Delta is time (days) since first ALSFRS (baseline)

# # Define the number of months and minimum records
# months_start = 3  # First threshold in months
# months_end = 12   # Second threshold in months
# min_records_start = 2 # Minimum records before months_start
# min_records_end = 2   # Minimum records after months_end

# days_start = months_start * 30 # Convert months to days
# days_end = months_end * 30   # Convert months to days

# # Group by patient and count records within the specified timeframes
# alsfrs_counts = alsfrs_df.groupby('subject_id')['ALSFRS_Delta'].agg(
#     records_before_start=lambda x: (x <= days_start).sum(),
#     records_after_end=lambda x: (x >= days_end).sum()
# )

# # Filter patients based on minimum record counts
# valid_patients_df = alsfrs_counts[(alsfrs_counts['records_before_start'] >= min_records_start) &
#                                   (alsfrs_counts['records_after_end'] >= min_records_end)]

# valid_patients = sorted(valid_patients_df.index.tolist())

# print(f"Patients with ≥{min_records_start} ALSFRS in first {months_start} mo and ≥{min_records_end} after {months_end} mo: {len(valid_patients)}")

In [None]:
# alsfrs_counts = alsfrs_df.groupby('subject_id')['ALSFRS_Delta'].agg(
#     count_first_3m=lambda x: (x <= 90).sum(),
#     count_after_12m=lambda x: (x >= 365).sum()
# )

# # Filter for patients with at least 2 records in each window
# valid_patients_df = alsfrs_counts[(alsfrs_counts['count_first_3m'] >= 2) & (alsfrs_counts['count_after_12m'] >= 2)]
# valid_patients = sorted(valid_patients_df.index.tolist())

# print(f"Patients with ≥2 ALSFRS in first 3 mo and ≥2 after 12 mo: {len(valid_patients)}")

In [None]:
# # Count records with at least 2 ALSFRS in the first 3 months
# count_at_least_2_first_3m = alsfrs_counts[alsfrs_counts['count_first_3m'] >= 2].shape[0]
# print(f"Number of records with ≥2 ALSFRS in first 3 months: {count_at_least_2_first_3m}")

# # Count records with at least 2 ALSFRS after 12 months
# count_at_least_2_after_12m = alsfrs_counts[alsfrs_counts['count_after_12m'] >= 2].shape[0]
# print(f"Number of records with ≥2 ALSFRS after 12 months: {count_at_least_2_after_12m}")

In [None]:
# # Compute ALSFRS slope (points/month) for each valid patient
# slope_targets = {}
# for pid in valid_patients:
#     patient_data = alsfrs_df[alsfrs_df['subject_id'] == pid].copy()
#     patient_data.sort_values('ALSFRS_Delta', inplace=True)
#     # t1: first ALSFRS after 3 months (90 days), t2: first ALSFRS after 12 months (365 days)
#     t1_record = patient_data[patient_data['ALSFRS_Delta'] > 90].iloc[0]
#     t2_record = patient_data[patient_data['ALSFRS_Delta'] >= 365].iloc[0]
#     # Calculate slope in ALSFRS points per month
#     delta_days = t2_record['ALSFRS_Delta'] - t1_record['ALSFRS_Delta']
#     if delta_days > 0:
#         slope = (t2_record['ALSFRS_Total_orig'] - t1_record['ALSFRS_Total_orig']) / (delta_days / 30.0)
#         slope_targets[pid] = slope
# target_df = pd.Series(slope_targets, name='ALSFRS_slope_3to12m')
# print(target_df.describe())  # inspect target distribution


In [None]:
# import numpy as np

# # Helper to compute summary stats for a given patient series
# def summarize_timeseries(df, time_col, value_col):
#     grp = df.groupby('subject_id')
#     summary = pd.DataFrame({
#         'min': grp[value_col].min(),
#         'max': grp[value_col].max(),
#         'median': grp[value_col].median(),
#         'std': grp[value_col].std(),
#         'first': grp.apply(lambda g: g.sort_values(time_col)[value_col].iloc[0]),
#         'last': grp.apply(lambda g: g.sort_values(time_col)[value_col].iloc[-1])
#     })
#     # Compute slope = (last - first) / (time_diff in months)
#     time_first = grp[time_col].min()
#     time_last = grp[time_col].max()
#     time_diff_months = (time_last - time_first) / 30.0
#     summary['slope'] = (summary['last'] - summary['first']) / time_diff_months
#     # If only one observation, slope will be NaN (time_diff=0) as per guidelines
#     summary.loc[time_diff_months == 0, 'slope'] = np.nan
#     return summary

# # Filter data to first 90 days
# alsfrs_3m = alsfrs_df[alsfrs_df['subject_id'].isin(valid_patients) & (alsfrs_df['ALSFRS_Delta'] <= 90)]
# fvc_df['FVC'] = fvc_df[['Subject_Liters_Trial_1','Subject_Liters_Trial_2','Subject_Liters_Trial_3']].max(axis=1)
# fvc_3m = fvc_df[fvc_df['subject_id'].isin(valid_patients) & (fvc_df['Forced_Vital_Capacity_Delta'] <= 90)]
# vitals_3m = vitals_df[vitals_df['subject_id'].isin(valid_patients) & (vitals_df['Vital_Signs_Delta'] <= 90)]
# labs_3m = labs_df[labs_df['subject_id'].isin(valid_patients) & (labs_df['Laboratory_Delta'] <= 90)]

# # Compute summaries for ALSFRS (using ALSFRS_Total_orig)
# alsfrs_summary = summarize_timeseries(alsfrs_3m, 'ALSFRS_Delta', 'ALSFRS_Total_orig')
# alsfrs_summary.columns = [f'ALSFRS_{col}' for col in alsfrs_summary.columns]

# # Compute summaries for FVC (using max FVC per visit)
# fvc_summary = summarize_timeseries(fvc_3m, 'Forced_Vital_Capacity_Delta', 'FVC')
# fvc_summary.columns = [f'FVC_{col}' for col in fvc_summary.columns]

# # Vital signs: Weight, Blood Pressure (Sys/Dia), Pulse, Respiratory Rate
# vital_features = {}
# for feat in ['Weight', 'Blood_Pressure_Systolic', 'Blood_Pressure_Diastolic', 'Pulse', 'Respiratory_Rate']:
#     sub_df = vitals_3m[vitals_3m[feat].notna()]
#     vital_features[feat] = summarize_timeseries(sub_df, 'Vital_Signs_Delta', feat)
#     vital_features[feat].columns = [f'{feat}_{col}' for col in vital_features[feat].columns]

# # Lab tests: select top lab features (based on data availability >70%)
# top_lab_tests = ['ALT(SGPT)','AST(SGOT)','Creatinine','Albumin','Hemoglobin','White Blood Cell (WBC)']
# lab_features = {}
# for test in top_lab_tests:
#     sub_df = labs_3m[labs_3m['Test_Name'] == test].copy()
#     # Convert lab results to numeric (coerce non-numeric to NaN)
#     sub_df['Test_Result'] = pd.to_numeric(sub_df['Test_Result'], errors='coerce')
#     sub_df = sub_df[sub_df['Test_Result'].notna()]
#     lab_features[test] = summarize_timeseries(sub_df, 'Laboratory_Delta', 'Test_Result')
#     prefix = ''.join(ch for ch in test if ch.isalnum() or ch == '_')
#     lab_features[test].columns = [f'{prefix}_{col}' for col in lab_features[test].columns]

# # Compile all features into one DataFrame
# features_df = pd.DataFrame(index=valid_patients)
# # Static features: onset delta (time from onset to baseline), onset site, riluzole usage
# # Onset delta (in months)
# onset_df = onset_df.drop_duplicates(subset='subject_id', keep='first')  # one record per patient
# features_df['Onset_Delta_months'] = -onset_df.set_index('subject_id')['Onset_Delta'] / 30.0  # negative stored, make positive months
# # Onset site (bulbar vs limb etc.)
# features_df = features_df.join(pd.get_dummies(onset_df.set_index('subject_id')['Site_of_Onset'], prefix='Site'))
# # Riluzole usage (Yes=1, No=0)
# riluzole_usage = riluzole_df.drop_duplicates('subject_id').set_index('subject_id')['Subject_used_Riluzole'].map({'Yes':1, 'No':0})
# features_df['Riluzole'] = riluzole_usage
# features_df.fillna({'Riluzole': 0}, inplace=True)  # assume missing as No

# # Demographics
# demographics_df = demographics_df.drop_duplicates(subset='subject_id', keep='first').copy()
# features_df = features_df.join(demographics_df.set_index('subject_id')[['Age', 'Sex']], how='left')
# features_df = features_df.join(pd.get_dummies(demographics_df.set_index('subject_id')['Sex'], prefix='Sex'), how='left')

# # Merge longitudinal summary features
# features_df = features_df.join(alsfrs_summary, how='left')
# features_df = features_df.join(fvc_summary, how='left')
# for feat_df in vital_features.values():
#     features_df = features_df.join(feat_df, how='left')
# for lab_df in lab_features.values():
#     features_df = features_df.join(lab_df, how='left')

# print("Total feature columns before cleaning:", features_df.shape[1])
# features_df.head(3)


In [None]:
# Drop features with >30% missing
missing_frac = features_df.isna().mean()
drop_cols = missing_frac[missing_frac > 0.30].index
features_df.drop(columns=drop_cols, inplace=True)
print(f"Dropped {len(drop_cols)} features due to >30% missing data")

# Impute remaining missing values
for col in features_df.columns:
    if features_df[col].isna().any():
        if features_df[col].dtype in [np.float64, np.int64]:
            # continuous: use median
            features_df[col].fillna(features_df[col].median(), inplace=True)
        else:
            # discrete/categorical: use mode
            features_df[col].fillna(features_df[col].mode()[0], inplace=True)


Dropped 202 features due to >30% missing data


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  features_df[col].fillna(features_df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  features_df[col].fillna(features_df[col].median(), inplace=True)


In [None]:
# --- Align X and y on common patient IDs (prevents KeyError) ---
common_ids = features_df.index.intersection(target_df.index)

X = features_df.loc[common_ids].copy()
y = target_df.loc[common_ids].copy()

# Drop the original 'Sex' column and the 'Sex_demo' column if they exist
X = X.drop(columns=['Sex', 'Sex_demo'], errors='ignore')

# Handle categorical features (Site_of_Onset)
# One-hot encode 'Site_of_Onset'
X = pd.get_dummies(X, columns=['Site_of_Onset'], prefix='Site', dummy_na=False)

# Convert 'Subject_used_Riluzole' to numerical (1 for Yes, 0 for No)
if 'Subject_used_Riluzole' in X.columns:
    X['Subject_used_Riluzole'] = X['Subject_used_Riluzole'].map({'Yes': 1, 'No': 0})


# (Optional) sanity checks
print("X shape (before split):", X.shape)
print("y shape (before split):", y.shape)
assert X.index.equals(y.index), "Indices are not aligned!"

# --- Train/test split (split X and y together) ---
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# --- Scale features (fit on train, transform test) ---
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    index=X_train.index, columns=X_train.columns
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    index=X_test.index, columns=X_test.columns
)

print("X_train_scaled:", X_train_scaled.shape, "| X_test_scaled:", X_test_scaled.shape)
print("y_train:", y_train.shape, "| y_test:", y_test.shape)

X shape (before split): (2439, 131)
y shape (before split): (2439,)
X_train_scaled: (1951, 131) | X_test_scaled: (488, 131)
y_train: (1951,) | y_test: (488,)


In [None]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100, random_state=0)
rf.fit(X_train_scaled, y_train)
importances = pd.Series(rf.feature_importances_, index=X_train_scaled.columns).sort_values(ascending=False)
top_features_rf = importances.head(10).index.tolist()

# Pearson correlation absolute values
corr = X_train_scaled.corrwith(y_train).abs().sort_values(ascending=False)
top_features_corr = corr.head(10).index.tolist()

# Combine and choose top 8 features (taking into account both methods)
candidate_features = list(dict.fromkeys(top_features_rf + top_features_corr))
selected_features = candidate_features[:16]
print("Top features selected for QNN:", selected_features)
# Reduce feature sets to selected features
X_train_sel = X_train_scaled[selected_features]
X_test_sel = X_test_scaled[selected_features]


Top features selected for QNN: ['ALSFRS_slope_3to12m', 'Blood_Pressure_Systolic_slope', 'Q1_Speech_slope', 'Blood_Pressure_Systolic_first', 'Q5a_Cutting_without_Gastrostomy_slope', 'Blood_Pressure_Systolic_std', 'ALSFRS_Total_orig_median', 'Q9_Climbing_Stairs_slope', 'Pulse_slope', 'Pulse_std', 'ALSFRS_Total_orig_first', 'ALSFRS_Total_orig_max', 'ALSFRS_Total_orig_last', 'ALSFRS_Total_orig_min', 'ALSFRS_Total_orig_std', 'Subject_used_Riluzole']


In [None]:
len(selected_features)

16

In [None]:
!pip install pennylane  # Install PennyLane for quantum modeling
import pennylane as qml
import numpy as np

n_qubits = len(selected_features)  # number of qubits = number of selected features
dev = qml.device('default.qubit', wires=n_qubits)

# Define the variational quantum circuit (ansatz)
@qml.qnode(dev)
def quantum_model(feature_vector, weights):
    # Encode features as rotations (angle encoding using Ry rotations)
    for i in range(n_qubits):
        qml.RY(feature_vector[i], wires=i)
    # Variational layers
    # weights shape: (L, n_qubits), where L = number of layers
    for layer in weights:
        # Apply a trainable rotation on each qubit
        for i in range(n_qubits):
            qml.RY(layer[i], wires=i)
        # Apply entangling CNOT gates (chain topology)
        for i in range(n_qubits - 1):
            qml.CNOT(wires=[i, i+1])
    # Return expectation of PauliZ on first qubit as prediction
    return qml.expval(qml.PauliZ(0))

# Initialize trainable weights (e.g., 2 layers of rotations)
layers = 2
np.random.seed(0)
weights = 0.01 * np.random.randn(layers, n_qubits)

# Normalize target slopes to [-1,1] for training
y_min, y_max = y_train.min(), y_train.max()
y_train_norm = 2 * (y_train - y_min) / (y_max - y_min) - 1
y_test_norm = 2 * (y_test - y_min) / (y_max - y_min) - 1


Collecting pennylane
  Downloading pennylane-0.43.0-py3-none-any.whl.metadata (11 kB)
Collecting rustworkx>=0.14.0 (from pennylane)
  Downloading rustworkx-0.17.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting appdirs (from pennylane)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting autoray==0.8.0 (from pennylane)
  Downloading autoray-0.8.0-py3-none-any.whl.metadata (6.1 kB)
Collecting pennylane-lightning>=0.43 (from pennylane)
  Downloading pennylane_lightning-0.43.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (11 kB)
Collecting diastatic-malt (from pennylane)
  Downloading diastatic_malt-2.15.2-py3-none-any.whl.metadata (2.6 kB)
Collecting scipy-openblas32>=0.3.26 (from pennylane-lightning>=0.43->pennylane)
  Downloading scipy_openblas32-0.3.30.0.2-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.1/57.1

In [13]:
# --- QNN training (PennyLane-compatible) ---

from pennylane import numpy as pnp

# Optimizer: use 'stepsize' (not 'learning_rate')
opt = qml.AdamOptimizer(stepsize=0.1)
batch_size = 32
num_epochs = 10

# Convert data to PennyLane numpy for autograd compatibility
X_train_array = pnp.array(X_train_sel.values, dtype=float)
# Make sure y aligns with X_train_sel index
y_train_array = pnp.array(y_train_norm.loc[X_train_sel.index].values, dtype=float)

# Ensure weights are a trainable pnp array
weights = pnp.array(weights, requires_grad=True)

for epoch in range(1, num_epochs + 1):
    # Shuffle training data
    perm = pnp.random.permutation(len(X_train_array))
    X_train_array = X_train_array[perm]
    y_train_array = y_train_array[perm]

    # Mini-batch gradient descent
    for i in range(0, len(X_train_array), batch_size):
        X_batch = X_train_array[i:i + batch_size]
        y_batch = y_train_array[i:i + batch_size]

        # Batch cost must use pnp ops so PennyLane can differentiate
        def batch_cost(w):
            preds = pnp.stack([quantum_model(x, w) for x in X_batch])
            return pnp.mean((preds - y_batch) ** 2)

        # Update weights
        weights = opt.step(batch_cost, weights)

    # Monitor training loss
    if epoch % 10 == 0:
        train_preds = pnp.stack([quantum_model(x, weights) for x in X_train_array])
        train_loss = pnp.mean((train_preds - y_train_array) ** 2)
        print(f"Epoch {epoch}: Training MSE = {train_loss:.4f}")


Epoch 10: Training MSE = 0.0847


In [14]:
# Evaluate on test set
test_preds_norm = np.array([quantum_model(x, weights) for x in X_test_sel.values], dtype=float)
# Invert normalization to original scale
test_preds = 0.5 * (test_preds_norm + 1) * (y_max - y_min) + y_min

# Calculate performance metrics
from math import sqrt
test_true = y_test.values
rmsd = sqrt(((test_preds - test_true)**2).mean())  # Root Mean Squared Deviation:contentReference[oaicite:19]{index=19}
pcc = np.corrcoef(test_preds, test_true)[0, 1]     # Pearson Correlation Coefficient:contentReference[oaicite:20]{index=20}

print(f"Test RMSD: {rmsd:.3f} ALSFRS points/month")
print(f"Test PCC:  {pcc:.3f}")


Test RMSD: 0.580 ALSFRS points/month
Test PCC:  0.706


In [15]:
# Classify patients based on predicted slope
pred_fast = test_preds <= -1.0   # boolean array for fast progressors
true_fast = test_true <= -1.0
# Compute classification metrics (e.g., accuracy or confusion matrix)
accuracy = (pred_fast == true_fast).mean()
print(f"Fast progressor classification accuracy: {accuracy:.2%}")
print(f"Predicted fast progressors: {pred_fast.sum()} / {len(pred_fast)}")


Fast progressor classification accuracy: 89.96%
Predicted fast progressors: 90 / 488
