# Overview

This notebook contains code used to develop the model.

The flow of the code is as follows:
1. Setup - load packages
2. Ingest the data 
3. Inspect the data
4. Imputation and encoding
5. Model selection loop
6. Inspect the results
7. View the best model
8. Save the results

# Setup

In [None]:
#Standard Library Imports
import itertools
import json
import os
import re
import warnings

#Third Party Library Imports
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import uniform, randint
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.impute import MissingIndicator
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, accuracy_score, make_scorer, average_precision_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

#Local Imports
from utils import *

# Define Global Parameters

In [2]:
#Global settings
n_jobs = -1
k_fold = 5 
n_repeats = 100
n_iter = 500
seed = 2020
np.random.seed(seed)
verbose = False
return_train_score =  False
summary_dict = {}
key_dict = {1:'Resolved',0:'Unresolved'}
test_size = 0.25
drop_first = True
missing_indicator = False

# Ingest Data

In [3]:
outcome = pd.read_csv('processed_data/outcome.csv')
predictors = pd.read_csv('processed_data/predictors.csv')
categorical_features = pd.read_csv('processed_data/categorical_features.csv').values.tolist()
categorical_features = [item for sublist in categorical_features for item in sublist]
numeric_features = pd.read_csv('processed_data/numeric_features.csv').values.tolist()
numeric_features = [item for sublist in numeric_features for item in sublist]

# Inspect Data

In [4]:
print(f'Dataset size: {len(outcome)}')
print()
print('Class Breakdown, count:')
print(outcome['lvtstatus'].value_counts())
print()
print('Class Breakdown, %:')
print(outcome['lvtstatus'].value_counts(normalize=True))

Dataset size: 244

Class Breakdown, count:
1    156
0     88
Name: lvtstatus, dtype: int64

Class Breakdown, %:
1    0.639344
0    0.360656
Name: lvtstatus, dtype: float64


# Obtain Training Data Predictors (x_train)

The x_train matrix is used to get mean and standard deviations used in standardization & scaling, and median imputation of scalar variables. It is also used to get the mode for categorical variables for imputation.

In [5]:
x_train,_,_,_ = train_test_split(predictors,outcome,test_size=test_size,random_state=seed,stratify=outcome)
train_indices = x_train.index

# Full List Of Covariates Before Onehot Encoding

In [6]:
print('All predictors:')
list(x_train)

All predictors:


['Age, years',
 'Sex',
 'Height, cm',
 'Weight, kg',
 'Body Mass Index',
 'Diabetes Mellitus/Prediabetes',
 'Chronic Kidney Disease',
 'Venous Thromboembolism',
 'Cerebrovascular Accident/Transient Ischemic Attack',
 'Heart Failure',
 'Post-AMI Atrial Fibrillation',
 'Post-AMI Cardiogenic Shock',
 'Cardiopulmonary Resuscitation',
 'Peak Troponin I, ng/dL',
 'Hemoglobin, g/dL',
 'White Blood Cell Count, 10^9/L',
 'Lymphocyte Count, 10^9/L',
 'Neutrophil Count, 10^9/L',
 'Platelet Count, 10^9/dL',
 'Prothrombin Time, seconds',
 'International Normalized Ratio',
 'Activated Partial Thromboplastin Time, seconds',
 'Creatinine, mmol/L',
 'ACS Type',
 'Visual Ejection Fraction, %',
 'Left Ventricle Internal Diameter At End-diastole, mm',
 'Left Ventricle Internal Diameter At End-systole, mm',
 'Left Ventricle Outflow Tract, mm',
 'Wall Motion Abnormality',
 'Left Ventricular Aneurysm',
 'LV Thrombus Mobility',
 'Protrusion',
 'Aspirin Use',
 'Second Antiplatelet Agent',
 'Coronary Artery Dis

In [7]:
def impute_and_encode(df,train_indices,categorical_features=categorical_features):
    """
    Takes a dataframe and perform univariate imputation by column
    
    Parameters
    ----------
    df: pandas.DataFrame
        Dataset to be imputed.
    train_indices: array-like
        An array of indices for training data - used to fit SimpleImputer obtain
    categorical_features: list
        An list of strings containing column names for categorical objects. Used to determine type of imputation and whether centering and scaling is necessary
    
    Returns
    -------
    imputed_df: pandas.DataFrame
        A dataframe containing the imputed and scaled dataset
        
    """
    imputed_df = pd.DataFrame()
    for column in df.columns:
        if df[column].isna().sum() != 0:
            array = df[column].values.reshape(-1, 1)
            if column in numeric_features: 
                si = SimpleImputer(strategy='median',missing_values=np.nan,add_indicator=missing_indicator)
                si.fit(array[train_indices])
                out = si.transform(array)
            else:
                si = SimpleImputer(strategy='most_frequent',missing_values=np.nan,add_indicator=missing_indicator)
                si.fit(array[train_indices])
                out = si.transform(array)
            if out.shape[1] == 1:
                out = out.flatten()
                imputed_df[column] = out
            else:
                imputed_df[column] = out[:,0]
                imputed_df[column+'_missing'] = out[:,1].astype('bool') 
        else:
            imputed_df[column] = df[column]
    
    for column in df.columns:
        if column not in categorical_features:
            array = imputed_df[column].values.reshape(-1, 1)
            std_scaler = StandardScaler()
            std_scaler.fit(array[train_indices])
            out = std_scaler.transform(array)
            out = out.flatten()
            imputed_df[column] = out
    
    for varname in categorical_features:
        onehot = pd.get_dummies(imputed_df[varname],prefix=varname,prefix_sep='_',drop_first=drop_first)
        imputed_df = imputed_df.drop(varname,axis=1).join(onehot)
    return imputed_df
predictors = impute_and_encode(predictors,train_indices=train_indices)

# Full List of Covariates After Onehot Encoding

In [8]:
predictors

Unnamed: 0,"Age, years","Height, cm","Weight, kg",Body Mass Index,"Peak Troponin I, ng/dL","Hemoglobin, g/dL","White Blood Cell Count, 10^9/L","Lymphocyte Count, 10^9/L","Neutrophil Count, 10^9/L","Platelet Count, 10^9/dL",...,Protrusion_Yes,Aspirin Use_Yes,Second Antiplatelet Agent_Yes,Coronary Artery Disease_No Vessel Disease,Coronary Artery Disease_Single Vessel Disease,Coronary Artery Disease_Triple Vessel Disease,Number of Culprit Arteries_1.0,Number of Culprit Arteries_2.0,Number of Culprit Arteries_3.0,Revascularization Procedure_Yes
0,0.300690,-0.280067,-0.534568,-0.532665,-0.684497,-1.833510,-0.933802,1.924035,-1.768132,-0.475690,...,0,1,1,0,0,1,0,0,1,1
1,0.221856,1.294908,1.051381,0.462623,-0.552544,0.843878,-0.853208,0.756168,-1.301658,0.077017,...,0,1,1,0,0,0,0,1,0,1
2,-0.329984,1.426155,1.805673,1.152554,-0.747631,-0.823175,0.652760,0.212974,0.564239,2.123208,...,0,1,1,0,0,0,0,1,0,1
3,0.773695,-0.411315,-1.804617,-1.972613,-0.825860,-1.782993,0.454727,-1.661046,-0.404364,2.323124,...,0,1,0,0,0,1,1,0,0,0
4,-0.014647,1.557403,0.193937,-0.509024,-0.825584,-0.570591,-1.415067,0.059069,-1.595804,-0.899041,...,0,1,1,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,-0.960657,0.048053,-1.243732,-0.183215,-0.798155,0.641811,1.983722,-0.149156,-0.019597,-0.969599,...,0,1,1,0,1,0,1,0,0,1
240,-2.222005,0.376172,0.458262,0.286799,-0.798155,0.035610,-0.058776,-0.149156,-0.019597,1.323547,...,0,1,1,0,0,0,0,1,0,1
241,-1.591331,0.113676,-0.515227,-0.674917,-0.798155,0.136644,-0.095619,-0.149156,-0.019597,-0.117019,...,0,1,1,0,0,0,1,0,0,1
242,-0.645320,-1.592546,1.102956,2.248635,-0.798155,0.439744,0.201430,-0.149156,-0.019597,-0.028821,...,0,1,1,0,1,0,1,0,0,1


In [9]:
list(predictors)

['Age, years',
 'Height, cm',
 'Weight, kg',
 'Body Mass Index',
 'Peak Troponin I, ng/dL',
 'Hemoglobin, g/dL',
 'White Blood Cell Count, 10^9/L',
 'Lymphocyte Count, 10^9/L',
 'Neutrophil Count, 10^9/L',
 'Platelet Count, 10^9/dL',
 'Prothrombin Time, seconds',
 'International Normalized Ratio',
 'Activated Partial Thromboplastin Time, seconds',
 'Creatinine, mmol/L',
 'Visual Ejection Fraction, %',
 'Left Ventricle Internal Diameter At End-diastole, mm',
 'Left Ventricle Internal Diameter At End-systole, mm',
 'Left Ventricle Outflow Tract, mm',
 'Sex_Male',
 'Diabetes Mellitus/Prediabetes_Yes',
 'Chronic Kidney Disease_Yes',
 'Venous Thromboembolism_Yes',
 'Cerebrovascular Accident/Transient Ischemic Attack_Yes',
 'Heart Failure_Yes',
 'Post-AMI Atrial Fibrillation_Yes',
 'Post-AMI Cardiogenic Shock_Yes',
 'Cardiopulmonary Resuscitation_Yes',
 'ACS Type_STEMI',
 'Wall Motion Abnormality_Regional',
 'Left Ventricular Aneurysm_Yes',
 'LV Thrombus Mobility_Yes',
 'Protrusion_Yes',
 'A

# Train Test Split

In [10]:
x_train,x_test,y_train,y_test = train_test_split(predictors,outcome,test_size=test_size,random_state=seed,stratify=outcome)
y_train = y_train.values.flatten()
y_test = y_test.values.flatten()
batch_size = len(x_train)

In [11]:
pd.DataFrame(x_train).to_csv('processed_data/x_train.csv')
pd.DataFrame(x_test).to_csv('processed_data/x_test.csv')
pd.DataFrame(y_train).to_csv('processed_data/y_train.csv')
pd.DataFrame(y_test).to_csv('processed_data/y_test.csv')

In [12]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(183, 41)
(61, 41)
(183,)
(61,)


# Model Selection

The following cell contain a wrapper function used to perform model selection using a randomised search algorithm.

In [13]:
def model_selection(summary_dict,model_lst,param_dict,technique,x_train=x_train,y_train=y_train,x_test=x_test,y_test=y_test,n_iter=n_iter,k_fold=k_fold,n_repeats=n_repeats):
    """
    A wrapper function for the model selection loop
    
    Parameters
    ----------
    summary_dict: dict
        An empty dictionary used to store results.
    model_lst: list
        A list of tuples containing ('model_name',model), models are sklearn estimators
    param_dict: dict
        A dictionary containing model parameter distributions - to be passed to RandomizedSearchCV
    technique: str
        A string indicating technique used. Only relevant if testing techniques such as oversampling/SMOTE.
    x_train: array-like
        An array training set predictors
    y_train: array-like
        An array containing training set labels
    x_test: array-like
        An array containing test set predictors
    y_test: array-like
        An array containing test set labels
    n_iter: int
        Number of crossvalidation iterations - to be passed to RandomizedSearchCV. Defaults to n_iter parameter at top of script
    k_fold: int
        Number of crossvalidation folds - to be passed to RandomizedSearchCV. Defaults to k_fold parameter at top of script
    n_repeats: int
        Number of crossvalidation repeats - to be passed to RandomizedSearchCV. Defaults to n_repeats parameter at top of script
        
    Returns
    -------
    summary_dict: pandas.DataFrame
        A dataframe containing the best model object and associated crossvalidation results
    result_table: pandas.DataFrame
        A dataframe containing all model objects and associated crossvalidation results
    """
    iterations = n_iter
    
    #Full list of scoring metrics, but only roc_auc is used in the end
    scoring = {'roc_auc':'roc_auc','average_precision':'average_precision','accuracy': 'accuracy'}
    
    #Create an empty list used to store the results
    result_list = []
    
    #Loop through the list of models
    for name, model in model_lst:

        #Define the cross-validation folds
        cv = RepeatedStratifiedKFold(n_splits=k_fold,n_repeats=n_repeats)
        
        #Set AUROC as the optimizing metric
        refit_score = 'roc_auc'
        
        #Set the estimator as the model currently being optimized
        estimator = model

        #Create the RandomizedSearchCV object
        search = RandomizedSearchCV(estimator,param_distributions=param_dict.get(name),random_state=seed,cv=cv,n_iter=iterations,n_jobs=n_jobs,
                                      scoring=scoring,refit=refit_score,verbose=verbose,return_train_score=return_train_score)
        
        #Begin the grid search process
        search.fit(x_train, y_train)

        #Calculate some metrics on the full training dataset (purely for diagnostics)
        y_pred = search.best_estimator_.predict(x_train)
        
        print('Classification report of best model:')
        print(classification_report(y_true=y_train,y_pred=y_pred))
        print(f'CV score of best model: {search.best_score_}')
        print()
    
        #Append the results of the best model to results_list
        result_list.append((name,search,search.best_score_,search.cv_results_))
        
        ##End of loop
    
    #The following code tidies result_list in to a dataframe
    result_table = pd.DataFrame(result_list,columns=['name','model','scores','score_dict'])
    best_model_index = result_table['scores']==max(result_table['scores'])
    model_name = result_table['name'][best_model_index].values.tolist()[0]
    best_model = result_table['model'][best_model_index].values.tolist()[0]
    summary_dict[technique] = {'Model':model_name}
    metrics = ['mean_test_roc_auc','mean_test_average_precision','mean_test_accuracy']
    if hasattr(best_model,'best_score_'):
        best_score = best_model.best_score_ 
        for key in [key for key in best_model.cv_results_.keys() if key in metrics]:
            summary_dict[technique][key.split('mean_test_')[1]] = best_model.cv_results_[key][best_model.best_index_]
        summary_dict[technique]['model obj'] = best_model.best_estimator_
    else:
        best_score = mean_cv_roc_auc
        summary_dict[technique]['model obj'] = best_model
        for key in [key for key in best_search.cv_results_.keys() if key in metrics]:
            summary_dict[technique][key.split('mean_test_')[1]] = result_table['score_dict'][best_model_index].get(key)
    
    #Find the overall results
    print(f"Best Cross-Validation score: {best_score}")        
    
    return summary_dict, result_table

In [None]:
#This cell runs the model selection loop 

#Create model objects
logistic = SGDClassifier(loss='log',random_state=seed)
gbm = GradientBoostingClassifier(random_state=seed)
classifier_list = [('lr',logistic),('gbm',gbm)]

#Define the hyperparameter search space
params = {'lr':{'alpha':uniform(1e-5,10),
                'penalty':['l1', 'l2', 'elasticnet'],
                'l1_ratio':uniform(0.01,0.30),
                'class_weight':[None,'balanced']},
          'gbm':{'loss':['deviance','exponential'],
                 'learning_rate':uniform(0.003, 0.3),
                 'n_estimators':randint(5, 2000),
                 'subsample':uniform(0.5, 0.5),
                 'criterion':['friedman_mse','mse','mae'],
                 'min_samples_split':randint(2,20),
                 'min_samples_leaf':randint(2,20),
                 'max_depth':randint(2,10),
                 'max_features':['sqrt', 'log2']}}

#Run the model selection loop
summary_dict, conventional_results = model_selection(summary_dict=summary_dict,model_lst=classifier_list,param_dict=params,technique='conventional')

# Results

In [15]:
summary = pd.DataFrame.from_dict(summary_dict,orient='index').applymap(lambda cell: np.round(cell,2) if isinstance(cell,float) else cell)
summary.to_csv(f'results/train_summary_results.csv')
conventional_results.sort_values(by=['scores'],ascending=False).reset_index(drop=True)

Unnamed: 0,Model,roc_auc,average_precision,accuracy,model obj
conventional,gbm,0.75,0.82,0.71,"([DecisionTreeRegressor(ccp_alpha=0.0, criteri..."


# Best Model

In [18]:
best_model = summary['model obj'][summary['roc_auc'] == max(summary['roc_auc'])][0]
best_technique = summary.index[summary['roc_auc'] == max(summary['roc_auc'])][0]
joblib.dump(best_model,f'pickled_objects/{protocol}_best_model.pkl')
conventional_modified.to_json('results/train_results.csv')
print(f'Best Model: {best_model}')



Best Model: GradientBoostingClassifier(ccp_alpha=0.0, criterion='mae', init=None,
                           learning_rate=0.0101437649430095, loss='exponential',
                           max_depth=2, max_features='sqrt',
                           max_leaf_nodes=None, min_impurity_decrease=0.0,
                           min_impurity_split=None, min_samples_leaf=18,
                           min_samples_split=17, min_weight_fraction_leaf=0.0,
                           n_estimators=282, n_iter_no_change=None,
                           presort='deprecated', random_state=2020,
                           subsample=0.6753887541590888, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
