# Flatiron Health aNSCLC: Final GBM build 

**OBJECTIVE: Build a gradient boosted survival model on the entirety of the Flatiron Health advanced NSCLC data set. This model will be used for the webtool.**

**BACKGROUND: For details on hyperparameter tuning of the gradient boosted model, see the notebook *Machine learning crude imputation*. Missing values are crudely imputed here, because crude imputation gave test-set AUC performance similar to MICE.**

**OUTLINE:**
1. **Preprocessing**
2. **Gradient boosted model** 

## 1. Preprocessing 

In [1]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer 

In [2]:
def row_ID(dataframe):
    """Return the row count and the number of distinct patients in a dataframe.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a 'PatientID' column.

    Returns
    -------
    tuple
        (number of rows, number of unique 'PatientID' values). The two values
        are equal when every patient appears on exactly one row.
    """
    return len(dataframe), dataframe['PatientID'].nunique()

### Importing full dataframes

In [3]:
# Load the training partition from the working directory.
train = pd.read_csv('train_full.csv')

In [4]:
row_ID(train)

(54786, 54786)

In [5]:
# Load the test partition from the working directory.
test = pd.read_csv('test_full.csv')

In [6]:
row_ID(test)

(13697, 13697)

In [7]:
# Stack train and test row-wise so the final model is built on the entire data set.
# concat keeps each file's own row labels, so labels repeat until a new index is set.
df = pd.concat([train, test])

In [8]:
row_ID(df)

(68483, 68483)

In [9]:
# Move PatientID out of the columns and into the index, so it identifies rows
# without appearing among the feature columns.
df = df.set_index('PatientID')

In [10]:
df.sample(3)

Unnamed: 0_level_0,PracticeType,gender,race,ethnicity,age,region,Histology,SmokingStatus,stage,adv_year,...,depression,elixhauser_other,icd_count,other_cancer,cns_met,bone_met,liver_met,resp_met,adrenal_met,other_met
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
F7132E00BC2CB,COMMUNITY,F,Black or African American,unknown,69,south,Non-squamous cell carcinoma,No history of smoking,IV,2012,...,0.0,1.0,7.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
F771094962C02,COMMUNITY,F,White,unknown,68,midwest,Non-squamous cell carcinoma,No history of smoking,IIIA,2019,...,0.0,1.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
F4EF0CE105876,ACADEMIC,F,White,unknown,75,unknown,Non-squamous cell carcinoma,History of smoking,I,2018,...,1.0,1.0,102.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Dropping unnecessary demographic variables 

In [11]:
# Demographic and insurance columns that are removed before modelling.
unneeded_demographics = [
    'race', 'ethnicity', 'region',
    'commercial', 'medicare', 'medicaid', 'other_insurance',
]
df = df.drop(columns = unneeded_demographics)

### Converting datatypes 

In [12]:
list(df.select_dtypes(include = ['object']).columns)

['PracticeType',
 'gender',
 'Histology',
 'SmokingStatus',
 'stage',
 'ALK',
 'BRAF',
 'EGFR',
 'KRAS',
 'ROS1',
 'pdl1',
 'pdl1_n',
 'ecog_diagnosis']

In [13]:
# Start from every column pandas read in with the generic 'object' dtype.
to_be_categorical = df.select_dtypes(include = 'object').columns.tolist()

In [14]:
# adv_year is not an object column, so it is added by hand to be cast to
# categorical alongside the others.
# Guarded so that re-running this cell does not append a duplicate entry.
if 'adv_year' not in to_be_categorical:
    to_be_categorical.append('adv_year')

In [15]:
# Cast every listed column to the pandas 'category' dtype in a single call;
# columns that are not listed keep their current dtype and position.
df = df.astype({column_name: 'category' for column_name in to_be_categorical})

In [16]:
list(df.select_dtypes(include = ['category']).columns)

['PracticeType',
 'gender',
 'Histology',
 'SmokingStatus',
 'stage',
 'adv_year',
 'ALK',
 'BRAF',
 'EGFR',
 'KRAS',
 'ROS1',
 'pdl1',
 'pdl1_n',
 'ecog_diagnosis']

In [17]:
# Convert death_status into True or False (required for scikit-survival).
# NOTE(review): astype('bool') turns a missing value (NaN) into True, which would
# be recorded as a death event — confirm death_status has no missing values.
df['death_status'] = df['death_status'].astype('bool')

### Dropping unneeded labs 

In [18]:
# Lab summary columns to discard. Names are assembled as <lab>_<summary>, in this
# order: every *_avg, the selected *_max, the selected *_min, then every *_std,
# *_slope and *_slope_na.
all_labs = [
    'albumin', 'alp', 'alt', 'ast', 'bicarb', 'bun', 'calcium', 'chloride',
    'creatinine', 'hemoglobin', 'neutrophil_count', 'platelet', 'potassium',
    'sodium', 'total_bilirubin', 'wbc']

# Only these labs have their *_max / *_min summary dropped; for every other lab
# the corresponding summary column is left in the dataframe.
dropped_max_labs = [
    'albumin', 'bicarb', 'bun', 'chloride', 'hemoglobin', 'neutrophil_count',
    'platelet', 'potassium', 'sodium']
dropped_min_labs = [
    'alp', 'alt', 'ast', 'bun', 'calcium', 'chloride', 'creatinine',
    'neutrophil_count', 'potassium', 'total_bilirubin']

drop_labs = (
    [lab + '_avg' for lab in all_labs]
    + [lab + '_max' for lab in dropped_max_labs]
    + [lab + '_min' for lab in dropped_min_labs]
    + [lab + suffix
       for suffix in ('_std', '_slope', '_slope_na')
       for lab in all_labs])

In [19]:
df.shape

(68483, 205)

In [20]:
# Remove every lab summary column named in drop_labs.
df = df.drop(columns = drop_labs)

In [21]:
df.shape

(68483, 122)

In [22]:
# Add a 0/1 missingness flag named '<column>_na' for each retained *_max / *_min
# lab column (1 = value is missing). The flags are appended in the order listed
# here: the *_max columns first, then the *_min columns.
retained_lab_summaries = [
    'alp_max', 'alt_max', 'ast_max', 'calcium_max', 'creatinine_max',
    'total_bilirubin_max', 'wbc_max',
    'albumin_min', 'bicarb_min', 'hemoglobin_min', 'platelet_min',
    'sodium_min', 'wbc_min',
]
for lab_column in retained_lab_summaries:
    df.loc[:, lab_column + '_na'] = np.where(df[lab_column].isna(), 1, 0)

In [23]:
df.shape

(68483, 135)

### Separate into X and Y 

In [24]:
# 'X' dataset: every column except the two outcome columns.
# df is the concatenation of the train and test files, so this covers the full
# data set rather than a partition of it.
df_x = df.drop(columns = ['death_status', 'timerisk_activity'])

In [25]:
# 'Y' dataset
# scikit-survival expects the outcome as a NumPy structured array with one record
# per patient: the event indicator first, then the time until event.
outcome_frame = df[['death_status', 'timerisk_activity']]
y_dtypes = outcome_frame.dtypes

# Field names and dtypes are taken from the dataframe columns themselves.
record_dtype = list(zip(y_dtypes.index, y_dtypes))
df_y = np.array(list(map(tuple, outcome_frame.values)), dtype = record_dtype)

### Pipeline 

In [26]:
# Numeric predictors, excluding binary variables.
# The order is significant: the ColumnTransformer emits the 'num' columns in
# exactly this order.
numerical_features = [
    # age, timing and weight columns
    'age', 'delta_adv_diagnosis',
    'weight_diag', 'bmi_diag', 'weight_pct_change', 'weight_slope',
    # *_diag lab columns
    'albumin_diag', 'alp_diag', 'alt_diag', 'ast_diag',
    'bicarb_diag', 'bun_diag', 'calcium_diag', 'chloride_diag',
    'creatinine_diag', 'hemoglobin_diag', 'neutrophil_count_diag', 'platelet_diag',
    'potassium_diag', 'sodium_diag', 'total_bilirubin_diag', 'wbc_diag',
    # *_max lab columns
    'alp_max', 'alt_max', 'ast_max', 'calcium_max',
    'creatinine_max', 'total_bilirubin_max', 'wbc_max',
    # *_min lab columns
    'albumin_min', 'bicarb_min', 'hemoglobin_min',
    'platelet_min', 'sodium_min', 'wbc_min',
    # icd_count
    'icd_count',
]

# Two-step numeric transformer: fill missing values with the column median,
# then standardise each column.
numerical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'median')),
    ('std_scaler', StandardScaler()),
])

In [27]:
# Categorical predictors: every column of df_x with the pandas 'category' dtype.
categorical_features = df_x.select_dtypes(include = 'category').columns.tolist()

# One-hot encode them. With handle_unknown = 'ignore', a level that was not seen
# during fitting is encoded as all zeros at transform time instead of raising.
categorical_transformer = OneHotEncoder(handle_unknown = 'ignore')

In [28]:
# Route each column group to its transformer. Columns that appear in neither
# group are passed through unchanged (remainder = 'passthrough').
column_routes = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers = column_routes,
                                 remainder = 'passthrough')

## 2. Gradient boosted model 

In [29]:
from sklearn.pipeline import make_pipeline

from sksurv.ensemble import GradientBoostingSurvivalAnalysis

from joblib import dump, load 

In [30]:
df_x.shape

(68483, 133)

In [30]:
df_y.shape

(68483,)

In [31]:
# Gradient boosted survival model; hyperparameter tuning is described in the
# notebook "Machine learning crude imputation". random_state is fixed so the
# row subsampling (subsample = 0.75) is reproducible.
gbm_estimator = GradientBoostingSurvivalAnalysis(
    n_estimators = 1150,
    learning_rate = 0.05,
    max_depth = 2,
    subsample = 0.75,
    verbose = 1,
    random_state = 42)

# Chain preprocessing and the estimator so both are fitted and saved together.
gbm_final_lung = make_pipeline(preprocessor, gbm_estimator)

# Fit on all of df_x / df_y. This is the long-running step of the notebook.
gbm_final_lung.fit(df_x, df_y)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1      366565.6717          47.8134         1042.75m
         2      367080.5085          45.9179         1036.95m
         3      367103.9753          44.1083         1034.00m
         4      366774.0141          41.1388         1032.40m
         5      366684.9908          38.1312         1030.88m
         6      366177.2551          36.2076         1029.55m
         7      366926.7864          34.5375         1028.27m
         8      366440.4940          32.3351         1027.00m
         9      366173.5435          33.1226         1026.06m
        10      365306.2828          30.6347         1025.02m
        20      365349.6679          20.1921         1017.05m
        30      365328.7948          15.7504         1009.20m
        40      364634.9764          12.2010         1001.49m
        50      364657.3110          11.0786          993.41m
        60      362916.2055           8.1341          985.18m
       

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('std_scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'delta_adv_diagnosis',
                                                   'weight_diag', 'bmi_diag',
                                                   'weight_pct_change',
                                                   'weight_slope',
                                                   'albumin_diag', 'alp_diag',
                                                   'alt_diag', 'ast_diag',
                                            

In [32]:
# Save the fitted pipeline (preprocessing + model) to the working directory.
# NOTE(review): joblib files are pickle-based — only load this artifact from a trusted source.
dump(gbm_final_lung, 'gbm_final_lung.joblib') 

['gbm_final_lung.joblib']