# Flatiron Health aNSCLC: Calculating patient risk scores

**OBJECTIVE: Risk scores were calculated for each patient using 4-fold cross-validation, where each patient's score was generated from a model trained on 75% of the data (excluding that patient). The model used gradient boosting survival analysis with pre-specified hyperparameters (see Jupyter notebook titled: crude_model_build). For each fold, patients in the held-out set received risk scores from a model trained on the remaining patients, ensuring unbiased prediction scores for the entire dataset.**

**OUTLINE:**
1. **Preprocessing**
2. **Calculating patient risk scores** 

## 1. Preprocessing 

In [1]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer 

In [2]:
# Function that returns number of rows and count of unique PatientIDs for a dataframe. 
def row_ID(dataframe):
    row = dataframe.shape[0]
    ID = dataframe['PatientID'].nunique()
    return row, ID

### Importing full dataframes

In [3]:
train = pd.read_csv('train_full.csv')

In [4]:
row_ID(train)

(54786, 54786)

In [5]:
test = pd.read_csv('test_full.csv')

In [6]:
row_ID(test)

(13697, 13697)

In [7]:
df = pd.concat([train, test])

In [8]:
row_ID(df)

(68483, 68483)

In [9]:
df = df.set_index('PatientID')

In [10]:
df.sample(3)

Unnamed: 0_level_0,PracticeType,gender,race,ethnicity,age,region,Histology,SmokingStatus,stage,adv_year,...,depression,elixhauser_other,icd_count,other_cancer,cns_met,bone_met,liver_met,resp_met,adrenal_met,other_met
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
F59FDCE7C7BC5,COMMUNITY,M,unknown,unknown,82,south,Squamous cell carcinoma,History of smoking,IV,2018,...,0.0,1.0,8.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
F61391D65AC5E,COMMUNITY,M,Asian,unknown,70,midwest,Non-squamous cell carcinoma,No history of smoking,II,2018,...,0.0,1.0,9.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
F6D129D5323CA,COMMUNITY,F,White,unknown,73,west,Squamous cell carcinoma,History of smoking,I,2019,...,0.0,1.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Dropping unnecessary demographic variables 

In [11]:
df = df.drop(columns = ['race',
                        'ethnicity',
                        'region',
                        'commercial',
                        'medicare',
                        'medicaid',
                        'other_insurance'])

### Converting datatypes 

In [12]:
list(df.select_dtypes(include = ['object']).columns)

['PracticeType',
 'gender',
 'Histology',
 'SmokingStatus',
 'stage',
 'ALK',
 'BRAF',
 'EGFR',
 'KRAS',
 'ROS1',
 'pdl1',
 'pdl1_n',
 'ecog_diagnosis']

In [13]:
to_be_categorical = list(df.select_dtypes(include = ['object']).columns)

In [14]:
to_be_categorical.append('adv_year')

In [15]:
# Convert variables in list to categorical.
for x in list(to_be_categorical):
    df[x] = df[x].astype('category')

In [16]:
list(df.select_dtypes(include = ['category']).columns)

['PracticeType',
 'gender',
 'Histology',
 'SmokingStatus',
 'stage',
 'adv_year',
 'ALK',
 'BRAF',
 'EGFR',
 'KRAS',
 'ROS1',
 'pdl1',
 'pdl1_n',
 'ecog_diagnosis']

In [17]:
# Convert death_status into True or False (required for scikit-survival). 
df['death_status'] = df['death_status'].astype('bool')

### Dropping unneeded labs 

In [18]:
drop_labs = [
    'albumin_avg',
    'alp_avg',
    'alt_avg',
    'ast_avg',
    'bicarb_avg',
    'bun_avg',
    'calcium_avg',
    'chloride_avg',
    'creatinine_avg',
    'hemoglobin_avg',
    'neutrophil_count_avg',
    'platelet_avg',
    'potassium_avg',
    'sodium_avg',
    'total_bilirubin_avg',
    'wbc_avg',
    'albumin_max',
    'bicarb_max',
    'bun_max',
    'chloride_max',
    'hemoglobin_max',
    'neutrophil_count_max',
    'platelet_max',
    'potassium_max',
    'sodium_max',
    'alp_min',
    'alt_min',
    'ast_min',
    'bun_min',
    'calcium_min',
    'chloride_min',
    'creatinine_min',
    'neutrophil_count_min',
    'potassium_min',
    'total_bilirubin_min',
    'albumin_std',
    'alp_std',
    'alt_std',
    'ast_std',
    'bicarb_std',
    'bun_std',
    'calcium_std',
    'chloride_std',
    'creatinine_std',
    'hemoglobin_std',
    'neutrophil_count_std',
    'platelet_std',
    'potassium_std',
    'sodium_std',
    'total_bilirubin_std',
    'wbc_std',
    'albumin_slope',
    'alp_slope',
    'alt_slope',
    'ast_slope',
    'bicarb_slope',
    'bun_slope',
    'calcium_slope',
    'chloride_slope',
    'creatinine_slope',
    'hemoglobin_slope',
    'neutrophil_count_slope',
    'platelet_slope',
    'potassium_slope',
    'sodium_slope',
    'total_bilirubin_slope',
    'wbc_slope',
    'albumin_slope_na',
    'alp_slope_na',
    'alt_slope_na',
    'ast_slope_na',
    'bicarb_slope_na',
    'bun_slope_na',
    'calcium_slope_na',
    'chloride_slope_na',
    'creatinine_slope_na',
    'hemoglobin_slope_na',
    'neutrophil_count_slope_na',
    'platelet_slope_na',
    'potassium_slope_na',
    'sodium_slope_na',
    'total_bilirubin_slope_na',
    'wbc_slope_na']

In [19]:
df.shape

(68483, 205)

In [20]:
df = df.drop(columns = drop_labs)

In [21]:
df.shape

(68483, 122)

In [22]:
df.loc[:, 'alp_max_na'] = np.where(df['alp_max'].isna(), 1, 0)
df.loc[:, 'alt_max_na'] = np.where(df['alt_max'].isna(), 1, 0)
df.loc[:, 'ast_max_na'] = np.where(df['ast_max'].isna(), 1, 0)
df.loc[:, 'calcium_max_na'] = np.where(df['calcium_max'].isna(), 1, 0)
df.loc[:, 'creatinine_max_na'] = np.where(df['creatinine_max'].isna(), 1, 0)
df.loc[:, 'total_bilirubin_max_na'] = np.where(df['total_bilirubin_max'].isna(), 1, 0)
df.loc[:, 'wbc_max_na'] = np.where(df['wbc_max'].isna(), 1, 0)
df.loc[:, 'albumin_min_na'] = np.where(df['albumin_min'].isna(), 1, 0)
df.loc[:, 'bicarb_min_na'] = np.where(df['bicarb_min'].isna(), 1, 0)
df.loc[:, 'hemoglobin_min_na'] = np.where(df['hemoglobin_min'].isna(), 1, 0)
df.loc[:, 'platelet_min_na'] = np.where(df['platelet_min'].isna(), 1, 0)
df.loc[:, 'sodium_min_na'] = np.where(df['sodium_min'].isna(), 1, 0)
df.loc[:, 'wbc_min_na'] = np.where(df['wbc_min'].isna(), 1, 0)

In [23]:
df.shape

(68483, 135)

### Separate into X and Y 

In [24]:
# 'X' datasets
df_x = df.drop(columns = ['death_status', 'timerisk_activity']) #80% of data 

In [25]:
# 'Y' datasets
# Death status and time until event needs to be stored as a structured array to be compatible with scikit-survival
y_dtypes = df[['death_status', 'timerisk_activity']].dtypes

df_y = np.array([tuple(x) for x in df[['death_status', 'timerisk_activity']].values],
                dtype = list(zip(y_dtypes.index, y_dtypes)))

### Pipeline 

In [26]:
# List of numeric variables, excluding binary variables. 
numerical_features = [
    'age',
    'delta_adv_diagnosis',
    'weight_diag',
    'bmi_diag',
    'weight_pct_change',
    'weight_slope',
    'albumin_diag',
    'alp_diag',
    'alt_diag',
    'ast_diag',
    'bicarb_diag',
    'bun_diag',
    'calcium_diag',
    'chloride_diag',
    'creatinine_diag',
    'hemoglobin_diag',
    'neutrophil_count_diag',
    'platelet_diag',
    'potassium_diag',
    'sodium_diag',
    'total_bilirubin_diag',
    'wbc_diag',
    'alp_max',
    'alt_max',
    'ast_max',
    'calcium_max',
    'creatinine_max',
    'total_bilirubin_max',
    'wbc_max',
    'albumin_min',
    'bicarb_min',
    'hemoglobin_min',
    'platelet_min',
    'sodium_min',
    'wbc_min',
    'icd_count']

# Transformer will impute column medians and then apply a standard scaler. 
numerical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'median')),
    ('std_scaler', StandardScaler())])

In [27]:
# List of categorical features.
categorical_features = list(df_x.select_dtypes(include = ['category']).columns)

# One-hot-encode categorical features.
categorical_transformer = OneHotEncoder(handle_unknown = 'ignore')

In [28]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)],
    remainder = 'passthrough')

## 2. Calculating patient risk scores 

In [29]:
from sklearn.pipeline import make_pipeline
from sksurv.ensemble import GradientBoostingSurvivalAnalysis
from sklearn.model_selection import StratifiedKFold

import time
import pickle
import os

In [30]:
df_x.shape

(68483, 133)

In [31]:
df_y.shape

(68483,)

In [32]:
# Run cross-validation on full dataset
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# For storing indices and predictions
patient_predictions = []

# Create folder for saved results if it doesn't exist
if not os.path.exists('cv_results'):
    os.makedirs('cv_results')

# Time the process
start_time = time.time()

# Get original patients IDs in the order they appear in df_x
all_patient_ids = df_x.index

for fold_idx, (train_idx, test_idx) in enumerate(cv.split(df_x, df_y)):
    # Split data for this fold
    # For df_x - use PatientIDs
    patient_ids_test = all_patient_ids[test_idx]
    X_train = df_x.drop(patient_ids_test)
    X_test = df_x.loc[patient_ids_test]
    
    # For df_y - use array indexing
    y_train = df_y[train_idx]
    y_test = df_y[test_idx]
    
    fold_model = make_pipeline(
        preprocessor,
        GradientBoostingSurvivalAnalysis(
            n_estimators=1150,
            learning_rate=0.05,
            max_depth=2,
            subsample=0.75,
            verbose=1,
            random_state=42
        )
    )
    
    # Fit model and time it
    fold_start_time = time.time()
    fold_model.fit(X_train, y_train)
    fold_time = time.time() - fold_start_time
    
    # Get predictions for test set
    test_predictions = fold_model.predict(X_test)
    
    # Store PatientIDs and their predictions
    fold_predictions = list(zip(patient_ids_test, test_predictions))
    patient_predictions.extend(fold_predictions)
    
    # Save results from this fold
    fold_results = {
        'fold': fold_idx + 1,
        'predictions': fold_predictions,
        'fold_time': fold_time,
        'timestamp': time.strftime("%Y%m%d-%H%M%S")
    }
    
    with open(f'cv_results/fold_{fold_idx + 1}_results.pkl', 'wb') as f:
        pickle.dump(fold_results, f)
    
    print(f"Fold {fold_idx + 1} completed")
    print(f"Fold training time: {fold_time/60:.2f} minutes")
    print(f"Results saved to cv_results/fold_{fold_idx + 1}_results.pkl")
    print("-------------------")

total_time = time.time() - start_time
print(f"\nTotal time: {total_time/60:.2f} minutes")

# Add predictions back to original dataframe
pred_dict = dict(patient_predictions)
df_x['risk_score'] = df_x.index.map(pred_dict)
df_x.to_csv('crude_risk_score_df_kfold.csv', index = True, header = True)

# Save final results too
final_results = {
    'all_predictions': patient_predictions,
    'total_time': total_time
}

with open('cv_results/final_results.pkl', 'wb') as f:
    pickle.dump(final_results, f)

# Check how many predictions we have
n_scores = df_x['risk_score'].notna().sum()
print(f"\nNumber of patients with risk scores: {n_scores}")
print(f"Percentage of patients with risk scores: {(n_scores/len(df_x))*100:.1f}%")



      Iter       Train Loss      OOB Improve   Remaining Time 
         1      267339.4505          36.0420          580.55m
         2      267658.7668          33.5967          580.63m
         3      266967.4651          31.3677          581.08m
         4      267218.1165          30.2169          580.40m
         5      267242.6962          29.4493          579.88m
         6      266829.2255          26.9265          579.26m
         7      266785.3275          25.7135          578.90m
         8      266398.2157          23.6673          579.01m
         9      267039.8984          23.1709          578.92m
        10      265713.7313          22.3575          578.67m
        20      266659.4433          15.0603          574.46m
        30      264912.7846          11.4847          569.19m
        40      265875.4329           9.5719          563.74m
        50      264962.4870           7.2961          558.37m
        60      264631.9584           6.7861          553.19m
       