# Flatiron Health mCRC: Final GBM build 

**OBJECTIVE: Build a gradient boosted survival model on the entirety of the Flatiron Health metastatic colorectal cancer data set. This model will be used for the webtool.**

**BACKGROUND: For details on hyperparameter tuning of the gradient boosted model, see the notebook *Machine learning crude imputation*. Missing values will be imputed with the median, given that this achieved test-set AUC performance similar to MICE.** 

**OUTLINE:**
1. **Preprocessing**
2. **Gradient boosted model** 

## 1. Preprocessing 

In [1]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer 

In [2]:
def row_ID(dataframe):
    """Return a (row count, distinct PatientID count) tuple for a dataframe.

    The dataframe must have a 'PatientID' column. Equal values in the tuple
    mean that every row belongs to a different patient.
    """
    n_rows = len(dataframe)
    n_patients = dataframe['PatientID'].nunique()
    return n_rows, n_patients

### Importing full dataframes

In [3]:
# Import training set. PatientID stays a regular column here (row_ID needs it);
# it is only set as the index later, after train and test are concatenated.
train = pd.read_csv('train_full.csv')

In [4]:
row_ID(train)

(27452, 27452)

In [5]:
# Import test set. PatientID stays a regular column here (row_ID needs it);
# it is only set as the index later, after train and test are concatenated.
test = pd.read_csv('test_full.csv')

In [6]:
row_ID(test)

(6863, 6863)

In [7]:
# Stack train and test row-wise: the final model is fit on the entire data set.
df = pd.concat([train, test])

In [8]:
row_ID(df)

(34315, 34315)

In [9]:
# Move PatientID from a column to the index so it is no longer among the feature columns.
df = df.set_index('PatientID')

In [10]:
df.sample(3)

Unnamed: 0_level_0,gender,race,ethnicity,age,p_type,region,stage,met_year,delta_met_diagnosis,crc_site,...,icd_count,other_cancer,thorax_met,peritoneum_met,liver_met,other_gi_met,cns_met,bone_met,other_met,ses
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
FC7A59F2CD290,F,White,Not Hispanic or Latino,79,COMMUNITY,northeast,IV,2016,0,colon_unknown,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
F679F358C7F31,F,White,Not Hispanic or Latino,79,COMMUNITY,northeast,unknown,2016,277,colon_left,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
F5DBD99551492,F,White,Not Hispanic or Latino,53,COMMUNITY,south,III,2016,272,colon_left,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0


### Dropping unnecessary demographic variables 

In [11]:
# Columns removed in this notebook's "unnecessary demographic variables" step.
unneeded_columns = [
    'race', 'ethnicity', 'region',
    'medicare', 'medicaid', 'medicare_medicaid', 'commercial',
    'patient_assistance', 'other_govt', 'self_pay', 'other',
    'ses']

df = df.drop(columns = unneeded_columns)

### Converting datatypes 

In [12]:
list(df.select_dtypes(include = ['object']).columns)

['gender',
 'p_type',
 'stage',
 'crc_site',
 'KRAS',
 'dMMR_MSIh',
 'NRAS',
 'BRAF',
 'ecog_diagnosis']

In [13]:
to_be_categorical = list(df.select_dtypes(include = ['object']).columns)

In [14]:
# met_year is not object-dtype, so it is added explicitly to be treated as a category.
# NOTE: re-running this cell appends a duplicate 'met_year' entry to the list.
to_be_categorical.append('met_year')

In [15]:
# Cast each column named in to_be_categorical to the pandas 'category' dtype.
for column_name in to_be_categorical:
    df[column_name] = df[column_name].astype('category')

In [16]:
list(df.select_dtypes(include = ['category']).columns)

['gender',
 'p_type',
 'stage',
 'met_year',
 'crc_site',
 'KRAS',
 'dMMR_MSIh',
 'NRAS',
 'BRAF',
 'ecog_diagnosis']

In [17]:
# Convert death_status into True or False (required for scikit-survival).
# NOTE(review): astype('bool') turns NaN into True -- this assumes death_status
# has no missing values; confirm against the source data.
df['death_status'] = df['death_status'].astype('bool')

### Dropping unneeded labs 

In [18]:
# Lab name stems shared by the summary-statistic columns listed below.
_all_labs = [
    'albumin', 'alp', 'alt', 'ast', 'bicarb', 'bun', 'calcium', 'cea',
    'chloride', 'creatinine', 'hemoglobin', 'neutrophil_count', 'platelet',
    'potassium', 'sodium', 'total_bilirubin', 'wbc']

# Labs whose *_max column is dropped (the other labs keep their max).
_drop_max_labs = [
    'albumin', 'bicarb', 'bun', 'calcium', 'chloride', 'hemoglobin',
    'neutrophil_count', 'platelet', 'potassium', 'sodium']

# Labs whose *_min column is dropped (the other labs keep their min).
_drop_min_labs = [
    'alp', 'alt', 'ast', 'bun', 'calcium', 'cea', 'chloride', 'creatinine',
    'neutrophil_count', 'potassium', 'total_bilirubin']

# Columns to drop: every *_avg, the selected *_max / *_min, and every
# *_std, *_slope and *_slope_na column.
drop_labs = (
    [f'{lab}_avg' for lab in _all_labs]
    + [f'{lab}_max' for lab in _drop_max_labs]
    + [f'{lab}_min' for lab in _drop_min_labs]
    + [f'{lab}_{suffix}'
       for suffix in ('std', 'slope', 'slope_na')
       for lab in _all_labs])

In [19]:
df.shape

(34315, 211)

In [20]:
df = df.drop(columns = drop_labs)

In [21]:
df.shape

(34315, 122)

In [22]:
# Retained lab max/min columns that each get a missingness indicator.
retained_lab_stats = [
    'alp_max', 'alt_max', 'ast_max', 'cea_max', 'creatinine_max',
    'total_bilirubin_max', 'wbc_max',
    'albumin_min', 'bicarb_min', 'hemoglobin_min', 'platelet_min',
    'sodium_min', 'wbc_min']

# Add a '<column>_na' column per entry: 1 where the value is missing, else 0.
for lab_stat in retained_lab_stats:
    df.loc[:, lab_stat + '_na'] = np.where(df[lab_stat].isna(), 1, 0)

In [23]:
df.shape

(34315, 135)

### Separate into X and Y 

In [24]:
# 'X' dataset: every column except the two outcome columns.
df_x = df.drop(columns = ['death_status', 'timerisk_activity']) # full cohort (train + test combined)

In [25]:
# 'Y' dataset
# scikit-survival expects the outcome as a NumPy structured array with one
# field for the event indicator and one for the time until event.
outcome = df[['death_status', 'timerisk_activity']]
y_dtypes = outcome.dtypes

# Field names and dtypes are taken from the two outcome columns.
df_y = np.array(
    list(outcome.itertuples(index = False, name = None)),
    dtype = list(zip(y_dtypes.index, y_dtypes)))

### Pipeline 

In [26]:
# Numeric (non-binary) variables, grouped by kind; the order is the order
# in which they are handed to the numeric transformer.
numerical_features = (
    ['age', 'delta_met_diagnosis',
     'weight_diag', 'bmi_diag', 'weight_pct_change', 'weight_slope']
    + [lab + '_diag' for lab in (
        'albumin', 'alp', 'alt', 'ast', 'bicarb', 'bun', 'calcium', 'cea',
        'chloride', 'creatinine', 'hemoglobin', 'neutrophil_count',
        'platelet', 'potassium', 'sodium', 'total_bilirubin', 'wbc')]
    + [lab + '_max' for lab in (
        'alp', 'alt', 'ast', 'cea', 'creatinine', 'total_bilirubin', 'wbc')]
    + [lab + '_min' for lab in (
        'albumin', 'bicarb', 'hemoglobin', 'platelet', 'sodium', 'wbc')]
    + ['icd_count'])

# Two-step transformer for the numeric columns:
#   1. 'imputer'    - replace missing values with the column median learned at fit time
#   2. 'std_scaler' - standardize each column with StandardScaler
numerical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'median')),
    ('std_scaler', StandardScaler())])

In [27]:
# Names of every category-dtype column in the feature matrix.
categorical_features = df_x.select_dtypes(include = ['category']).columns.tolist()

# One-hot encode those columns; a category not seen during fit is ignored
# at transform time rather than raising an error.
categorical_transformer = OneHotEncoder(handle_unknown = 'ignore')

In [28]:
# Route each column group to its transformer:
#   'num' -> median imputation + standard scaling for numerical_features
#   'cat' -> one-hot encoding for categorical_features
# remainder = 'passthrough' keeps every other column unchanged
# (presumably the binary indicator columns -- verify against df_x).
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)],
    remainder = 'passthrough')

## 2. Gradient boosted model 

In [29]:
from sklearn.pipeline import make_pipeline

from sksurv.ensemble import GradientBoostingSurvivalAnalysis

from joblib import dump, load 

In [30]:
df_x.shape

(34315, 133)

In [31]:
df_y.shape

(34315,)

In [32]:
# Final pipeline: preprocessing followed by a gradient boosted survival model.
# random_state is fixed for reproducibility; verbose = 1 prints the per-iteration
# training log shown below.
# NOTE(review): the hyperparameter values presumably come from the tuning notebook
# referenced in the background section -- confirm.
gbm_final_colorectal = make_pipeline(preprocessor, GradientBoostingSurvivalAnalysis(n_estimators = 1125,
                                                                                    learning_rate = 0.05,
                                                                                    max_depth = 4,
                                                                                    subsample = 0.75,
                                                                                    verbose = 1,
                                                                                    random_state = 42))

# Fit on the full cohort (train + test combined). This is a long-running cell.
gbm_final_colorectal.fit(df_x, df_y)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1      148066.2197          27.7535          269.96m
         2      148166.3434          26.6813          269.09m
         3      147963.1549          24.3110          268.63m
         4      148491.6911          24.1694          268.26m
         5      147617.2430          23.2766          268.03m
         6      148216.7670          22.5511          267.83m
         7      147992.7694          22.1337          267.60m
         8      147595.6790          20.1758          267.37m
         9      147428.4555          19.5667          267.14m
        10      147375.9559          19.3977          266.88m
        20      146524.3325          13.2721          264.38m
        30      146372.0103           9.7547          261.95m
        40      147047.2328           6.9774          259.53m
        50      146128.7197           5.8765          257.15m
        60      145641.6966           4.8564          254.74m
       

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('std_scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'delta_met_diagnosis',
                                                   'weight_diag', 'bmi_diag',
                                                   'weight_pct_change',
                                                   'weight_slope',
                                                   'albumin_diag', 'alp_diag',
                                                   'alt_diag', 'ast_diag',
                                            

In [33]:
# Persist the fitted pipeline (preprocessor + model) to disk with joblib.
dump(gbm_final_colorectal, 'gbm_final_colorectal.joblib') 

['gbm_final_colorectal.joblib']