In [1]:
#Standard Library Imports
import itertools
import json
import os
import re
import warnings

#Third Party Library Imports
import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import uniform, randint
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.impute import MissingIndicator
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, accuracy_score, make_scorer, average_precision_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
import tensorflow.keras.regularizers as regularizers
import xgboost as xgb

#Local Imports
from utils import *

In [2]:
#Global settings
protocol = 'LVTRES'
n_jobs = -1
k_fold = 5
n_repeats = 50
n_iter = 200
seed = 1
np.random.seed(seed)
verbose = False
return_train_score =  False
summary_dict = {}
key_dict = {1:'Resolved',0:'Unresolved'}
test_size = 0.25
drop_first = True
missing_indicator = False

In [3]:
outcome = pd.read_csv('processed_data/outcome.csv')
predictors = pd.read_csv('processed_data/predictors.csv')
categorical_features = pd.read_csv('processed_data/categorical_features.csv').values.tolist()
categorical_features = [item for sublist in categorical_features for item in sublist]
numeric_features = pd.read_csv('processed_data/numeric_features.csv').values.tolist()
numeric_features = [item for sublist in numeric_features for item in sublist]

In [4]:
print(f'Dataset size: {len(outcome)}')
print()
print('Class Breakdown, count:')
print(outcome['lvtstatus'].value_counts())
print()
print('Class Breakdown, %:')
print(outcome['lvtstatus'].value_counts(normalize=True))

Dataset size: 244

Class Breakdown, count:
1    156
0     88
Name: lvtstatus, dtype: int64

Class Breakdown, %:
1    0.639344
0    0.360656
Name: lvtstatus, dtype: float64


In [5]:
x_train,_,_,_ = train_test_split(predictors,outcome,test_size=test_size,random_state=seed,stratify=outcome)
train_indices = x_train.index

In [6]:
print('All predictors:')
list(x_train)

All predictors:


['Age, years',
 'Sex',
 'Height, cm',
 'Weight, kg',
 'Body Mass Index',
 'Diabetes Mellitus/Prediabetes',
 'Chronic Kidney Disease',
 'Venous Thromboembolism',
 'Cerebrovascular Accident/Transient Ischemic Attack',
 'Heart Failure',
 'Post-AMI Atrial Fibrillation',
 'Post-AMI Cardiogenic Shock',
 'Cardiopulmonary Resuscitation',
 'Peak Troponin I, ng/dL',
 'Hemoglobin, g/dL',
 'White Blood Cell Count, 10^9/L',
 'Lymphocyte Count, 10^9/L',
 'Neutrophil Count, 10^9/L',
 'Platelet Count, 10^9/dL',
 'Prothrombin Time, seconds',
 'International Normalized Ratio',
 'Activated Partial Thromboplastin Time, seconds',
 'Creatinine, mmol/L',
 'ACS Type',
 'Visual Ejection Fraction, %',
 'Left Ventricle Internal Diameter At End-diastole, mm',
 'Left Ventricle Internal Diameter At End-systole, mm',
 'Left Ventricle Outflow Tract, mm',
 'Wall Motion Abnormality',
 'Left Ventricular Aneurysm',
 'LV Thrombus Mobility',
 'Protrusion',
 'Aspirin Use',
 'Second Antiplatelet Agent',
 'Coronary Artery Dis

In [7]:
def impute_and_encode(df,train_indices,categorical_features=categorical_features):
    """
    Takes a dataframe and perform univariate imputation by column
    
    Parameters
    ----------
    df: pandas.DataFrame
        Dataset to be imputed.
    train_indices: array-like
        An array of indices for training data - used to fit SimpleImputer obtain
    categorical_features: list
        An list of strings containing column names for categorical objects. Used to determine type of imputation and whether centering and scaling is necessary
    
    Returns
    -------
    imputed_df: pandas.DataFrame
        A dataframe containing the imputed and scaled dataset
        
    """
    imputed_df = pd.DataFrame()
    for column in df.columns:
        if df[column].isna().sum() != 0:
            array = df[column].values.reshape(-1, 1)
            if column in numeric_features: 
                si = SimpleImputer(strategy='median',missing_values=np.nan,add_indicator=missing_indicator)
                si.fit(array[train_indices])
                out = si.transform(array)
            else:
                si = SimpleImputer(strategy='most_frequent',missing_values=np.nan,add_indicator=missing_indicator)
                si.fit(array[train_indices])
                out = si.transform(array)
            if out.shape[1] == 1:
                out = out.flatten()
                imputed_df[column] = out
            else:
                imputed_df[column] = out[:,0]
                imputed_df[column+'_missing'] = out[:,1].astype('bool') 
        else:
            imputed_df[column] = df[column]
    
    for column in df.columns:
        if column not in categorical_features:
            array = imputed_df[column].values.reshape(-1, 1)
            std_scaler = StandardScaler()
            std_scaler.fit(array[train_indices])
            out = std_scaler.transform(array)
            out = out.flatten()
            imputed_df[column] = out
    
    for varname in categorical_features:
        onehot = pd.get_dummies(imputed_df[varname],prefix=varname,prefix_sep='_',drop_first=drop_first)
        imputed_df = imputed_df.drop(varname,axis=1).join(onehot)
    return imputed_df
predictors = impute_and_encode(predictors,train_indices=train_indices)

In [8]:
predictors

Unnamed: 0,"Age, years","Height, cm","Weight, kg",Body Mass Index,"Peak Troponin I, ng/dL","Hemoglobin, g/dL","White Blood Cell Count, 10^9/L","Lymphocyte Count, 10^9/L","Neutrophil Count, 10^9/L","Platelet Count, 10^9/dL",...,Protrusion_Yes,Aspirin Use_Yes,Second Antiplatelet Agent_Yes,Coronary Artery Disease_No Vessel Disease,Coronary Artery Disease_Single Vessel Disease,Coronary Artery Disease_Triple Vessel Disease,Number of Culprit Arteries_1.0,Number of Culprit Arteries_2.0,Number of Culprit Arteries_3.0,Revascularization Procedure_Yes
0,0.305389,-0.232911,-0.532608,-0.561564,-0.765259,-1.934962,-0.929700,2.032006,-1.786166,-0.380517,...,0,1,1,0,0,1,0,0,1,1
1,0.227117,1.138331,1.132862,0.468128,-0.634885,0.861938,-0.845878,0.823217,-1.335505,0.191755,...,0,1,1,0,0,0,0,1,0,1
2,-0.320787,1.252602,1.924976,1.181908,-0.827637,-0.879528,0.720400,0.260990,0.467139,2.310381,...,0,1,1,0,0,0,0,1,0,1
3,0.775021,-0.347182,-1.866338,-2.051287,-0.904930,-1.882190,0.514437,-1.678693,-0.468629,2.517373,...,0,1,0,0,1,0,1,0,0,0
4,-0.007699,1.366872,0.232425,-0.537106,-0.904658,-0.615670,-1.430238,0.101692,-1.619680,-0.818853,...,0,1,1,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,-0.946963,0.109899,-1.277331,-0.162810,-0.877557,0.650851,2.104663,-0.170051,-0.013662,-0.891909,...,0,1,1,0,1,0,1,0,0,1
240,-2.199314,0.338440,0.510004,0.286225,-0.877557,0.017591,-0.019630,-0.170051,-0.013662,1.482412,...,0,1,1,0,0,0,0,1,0,1
241,-1.573138,0.109899,-0.512297,-0.708734,-0.877557,0.070362,-0.122612,-0.170051,-0.013662,-0.124821,...,0,1,1,0,0,0,1,0,0,1
242,-0.633875,-1.375614,1.187024,2.315876,-0.877557,0.439764,0.250995,-0.170051,-0.013662,0.082171,...,0,1,1,0,1,0,1,0,0,1


In [9]:
list(predictors)

['Age, years',
 'Height, cm',
 'Weight, kg',
 'Body Mass Index',
 'Peak Troponin I, ng/dL',
 'Hemoglobin, g/dL',
 'White Blood Cell Count, 10^9/L',
 'Lymphocyte Count, 10^9/L',
 'Neutrophil Count, 10^9/L',
 'Platelet Count, 10^9/dL',
 'Prothrombin Time, seconds',
 'International Normalized Ratio',
 'Activated Partial Thromboplastin Time, seconds',
 'Creatinine, mmol/L',
 'Visual Ejection Fraction, %',
 'Left Ventricle Internal Diameter At End-diastole, mm',
 'Left Ventricle Internal Diameter At End-systole, mm',
 'Left Ventricle Outflow Tract, mm',
 'Sex_Male',
 'Diabetes Mellitus/Prediabetes_Yes',
 'Chronic Kidney Disease_Yes',
 'Venous Thromboembolism_Yes',
 'Cerebrovascular Accident/Transient Ischemic Attack_Yes',
 'Heart Failure_Yes',
 'Post-AMI Atrial Fibrillation_Yes',
 'Post-AMI Cardiogenic Shock_Yes',
 'Cardiopulmonary Resuscitation_Yes',
 'ACS Type_STEMI',
 'Wall Motion Abnormality_Regional',
 'Left Ventricular Aneurysm_Yes',
 'LV Thrombus Mobility_Yes',
 'Protrusion_Yes',
 'A

In [10]:
x_train,x_test,y_train,y_test = train_test_split(predictors,outcome,test_size=test_size,random_state=seed,stratify=outcome)
y_train = y_train.values.flatten()
y_test = y_test.values.flatten()
batch_size = len(x_train)

In [11]:
pd.DataFrame(x_train).to_csv('processed_data/x_train.csv')
pd.DataFrame(x_test).to_csv('processed_data/x_test.csv')
pd.DataFrame(y_train).to_csv('processed_data/y_train.csv')
pd.DataFrame(y_test).to_csv('processed_data/y_test.csv')

In [12]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(183, 41)
(61, 41)
(183,)
(61,)


In [13]:
def model_selection(summary_dict,model_lst,param_dict,technique,x_train=x_train,y_train=y_train,x_test=x_test,y_test=y_test,n_iter=n_iter,k_fold=k_fold,n_repeats=n_repeats):
    """
    A wrapper function for the model selection loop
    
    Parameters
    ----------
    summary_dict: dict
        An empty dictionary used to store results.
    model_lst: list
        A list of tuples containing ('model_name',model), models are sklearn estimators
    param_dict: dict
        A dictionary containing model parameter distributions - to be passed to RandomizedSearchCV
    technique: str
        A string indicating technique used. Only relevant if testing techniques such as oversampling/SMOTE.
    x_train: array-like
        An array training set predictors
    y_train: array-like
        An array containing training set labels
    x_test: array-like
        An array containing test set predictors
    y_test: array-like
        An array containing test set labels
    n_iter: int
        Number of crossvalidation iterations - to be passed to RandomizedSearchCV. Defaults to n_iter parameter at top of script
    k_fold: int
        Number of crossvalidation folds - to be passed to RandomizedSearchCV. Defaults to k_fold parameter at top of script
    n_repeats: int
        Number of crossvalidation repeats - to be passed to RandomizedSearchCV. Defaults to n_repeats parameter at top of script
        
    Returns
    -------
    summary_dict: pandas.DataFrame
        A dataframe containing the best model object and associated crossvalidation results
    result_table: pandas.DataFrame
        A dataframe containing all model objects and associated crossvalidation results
    """
    iterations = n_iter
    scoring = {'roc_auc':'roc_auc','average_precision':'average_precision','accuracy': 'accuracy','f1':'f1'}
    
    result_list = []
    for name, model in model_lst:

        cv = RepeatedStratifiedKFold(n_splits=k_fold,n_repeats=n_repeats)
        refit_score = 'roc_auc'
        
        estimator = model
        if name =='DNN':
            n_jobs = 1
            x_train = x_train.to_numpy().astype('float64')
            if iterations > 10:
                iterations = np.amin([iterations,100])
        elif name == 'lgb':
            x_train.columns = [re.sub(r'\W+', '', colname) for colname in x_train.columns]
        else:
            n_jobs = -1

        search = RandomizedSearchCV(estimator,param_distributions=param_dict.get(name),random_state=seed,cv=cv,n_iter=iterations,n_jobs=n_jobs,
                                      scoring=scoring,refit=refit_score,verbose=verbose,return_train_score=return_train_score)
        search.fit(x_train, y_train)

        print(f'Model: {name}')
        if search.best_score_ == 0:
            print('No candidate models met minimum requirements')
        else:
            y_pred = search.best_estimator_.predict(x_train)
            if name == 'DNN':
                y_pred = y_pred > 0.5

            print('Classification report of best model:')
            print(classification_report(y_true=y_train,y_pred=y_pred))
            print(f'CV score of best model: {search.best_score_}')

        result_list.append((name,search,search.best_score_,search.cv_results_))
        print()
    
    print(f'Model: Ensemble')
    unpacked_results = [(r[0],r[1].best_estimator_) for r in result_list if r[0] not in ['svm','DNN']]
    ensemble = VotingClassifier(unpacked_results,voting='soft')
    cross_v = cross_validate(ensemble,x_train,y_train,cv=cv,n_jobs=n_jobs,scoring=scoring)
    ensemble.fit(x_train,y_train)
    y_pred = ensemble.predict(x_train)
    print(classification_report(y_true=y_train,y_pred=y_pred))
    score_dict = {}
    for i in range(k_fold*n_repeats):
        score_dict[f'split{i}_test_roc_auc'] = cross_v['test_roc_auc']
        score_dict[f'split{i}_test_average_precision'] = cross_v['test_average_precision']
        score_dict[f'split{i}_test_accuracy'] = cross_v['test_accuracy']
        score_dict[f'split{i}_test_f1_score'] = cross_v['test_f1']
    mean_cv_roc_auc = np.mean(cross_v['test_roc_auc'])
    print(f'Cross-Validation Score:{mean_cv_roc_auc}')
    result_list.append(('Ensemble',ensemble,mean_cv_roc_auc,score_dict))
    print()
    
    result_table = pd.DataFrame(result_list,columns=['name','model','scores','score_dict'])
    
    best_model_index = result_table['scores']==max(result_table['scores'])
    model_name = result_table['name'][best_model_index].values.tolist()[0]
    best_model = result_table['model'][best_model_index].values.tolist()[0]
    
    summary_dict[technique] = {'Model':model_name}
    
    metrics = ['mean_test_roc_auc','mean_test_average_precision','mean_test_accuracy','mean_test_f1_score']
    
    if hasattr(best_model,'best_score_'):
        best_score = best_model.best_score_ 
        for key in [key for key in best_model.cv_results_.keys() if key in metrics]:
            summary_dict[technique][key.split('mean_test_')[1]] = best_model.cv_results_[key][best_model.best_index_]
        summary_dict[technique]['model obj'] = best_model.best_estimator_
    else:
        best_score = mean_cv_roc_auc
        summary_dict[technique]['model obj'] = best_model
        for key in [key for key in best_search.cv_results_.keys() if key in metrics]:
            summary_dict[technique][key.split('mean_test_')[1]] = result_table['score_dict'][best_model_index].get(key)
        
    print(f"Best Cross-Validation score: {best_score}")        
    
    return summary_dict, result_table

# Model Selection

In [14]:
logistic = SGDClassifier(loss='log',random_state=seed)
svm = SGDClassifier(loss='hinge',random_state=seed)
gbm = GradientBoostingClassifier(random_state=seed)

classifier_list = [('lr',logistic),('svm',svm),('gbm',gbm)]
params = {'lr':{'alpha':uniform(1e-5,10),
                'penalty':['l1', 'l2', 'elasticnet'],
                'l1_ratio':uniform(0.01,0.30),
                'class_weight':[None,'balanced']},
          'svm':{'alpha':uniform(1e-5,10),
                 'class_weight':[None,'balanced']},
          'rf':{'bootstrap':[True,False],
                'criterion':['gini','entropy'],
                'max_depth':randint(2,10),
                'max_features':['sqrt', 'log2'],
                'min_samples_leaf':randint(2,20),
                'min_samples_split':randint(2,20),
                'n_estimators':randint(5, 2000),
                'class_weight':[None,'balanced']},
          'gbm':{'loss':['deviance','exponential'],
                 'learning_rate':uniform(0.003, 0.3),
                 'n_estimators':randint(5, 2000),
                 'subsample':uniform(0.5, 0.5),
                 'criterion':['friedman_mse','mse','mae'],
                 'min_samples_split':randint(2,20),
                 'min_samples_leaf':randint(2,20),
                 'max_depth':randint(2,10),
                 'max_features':['sqrt', 'log2']}}

summary_dict, conventional_results = model_selection(summary_dict=summary_dict,model_lst=classifier_list,param_dict=params,technique='conventional')

Model: lr
Classification report of best model:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        66
           1       0.64      1.00      0.78       117

    accuracy                           0.64       183
   macro avg       0.32      0.50      0.39       183
weighted avg       0.41      0.64      0.50       183

CV score of best model: 0.6928318903318904



  _warn_prf(average, modifier, msg_start, len(result))


Model: svm
Classification report of best model:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        66
           1       0.64      1.00      0.78       117

    accuracy                           0.64       183
   macro avg       0.32      0.50      0.39       183
weighted avg       0.41      0.64      0.50       183

CV score of best model: 0.6889310966810968



  _warn_prf(average, modifier, msg_start, len(result))


Model: rf
Classification report of best model:
              precision    recall  f1-score   support

           0       0.95      0.30      0.46        66
           1       0.72      0.99      0.83       117

    accuracy                           0.74       183
   macro avg       0.83      0.65      0.65       183
weighted avg       0.80      0.74      0.70       183

CV score of best model: 0.7037914862914864

Model: gbm
Classification report of best model:
              precision    recall  f1-score   support

           0       0.81      0.59      0.68        66
           1       0.80      0.92      0.86       117

    accuracy                           0.80       183
   macro avg       0.81      0.76      0.77       183
weighted avg       0.80      0.80      0.79       183

CV score of best model: 0.6963697691197692

Model: xgb
Classification report of best model:
              precision    recall  f1-score   support

           0       0.90      0.65      0.75        66
      

In [15]:
summary = pd.DataFrame.from_dict(summary_dict,orient='index').applymap(lambda cell: np.round(cell,2) if isinstance(cell,float) else cell)
summary.to_csv(f'results.csv')
summary

Unnamed: 0,Model,roc_auc,average_precision,accuracy,model obj
conventional,rf,0.7,0.81,0.66,"(DecisionTreeClassifier(ccp_alpha=0.0, class_w..."


In [16]:
#This loop extracts the 95% confidence intervals for performance metrics obtained during repeated k-fold crossvalidation
confidence_intervals = []
for index in range(len(conventional_results)):
    cv_dict = {}
    name = conventional_results['name']
    roc_list = []
    for i in range(k_fold*n_repeats):
        roc_list.append(conventional_results['score_dict'][index].get(f'split{i}_test_roc_auc'))
    array = np.array(roc_list)
    array = array.flatten()
    lowerbound = np.quantile(array,0.025)
    upperbound = np.quantile(array,0.975)
    confidence_intervals.append((lowerbound,upperbound))
confidence_intervals = pd.DataFrame(confidence_intervals,columns=['0.025%','0.975%'])
conventional_results = conventional_results.join(confidence_intervals)

In [17]:
conventional_results.sort_values(by=['scores'],ascending=False).reset_index(drop=True)

Unnamed: 0,name,model,scores,score_dict,0.025%,0.975%
0,rf,RandomizedSearchCV(cv=RepeatedStratifiedKFold(...,0.703791,"{'mean_fit_time': [2.278679951667786, 0.662842...",0.428571,0.916667
1,gbm,RandomizedSearchCV(cv=RepeatedStratifiedKFold(...,0.69637,"{'mean_fit_time': [0.5662438645362854, 0.35783...",0.375,0.896104
2,lr,RandomizedSearchCV(cv=RepeatedStratifiedKFold(...,0.692832,"{'mean_fit_time': [0.003051578998565674, 0.003...",0.480519,0.87013
3,svm,RandomizedSearchCV(cv=RepeatedStratifiedKFold(...,0.688931,"{'mean_fit_time': [0.0029326467514038087, 0.00...",0.441558,0.896104
4,Ensemble,"VotingClassifier(estimators=[('lr',\n ...",0.678415,"{'split0_test_roc_auc': [0.8928571428571428, 0...",0.430556,0.904762
5,xgb,RandomizedSearchCV(cv=RepeatedStratifiedKFold(...,0.66713,"{'mean_fit_time': [0.2494613356590271, 0.25274...",0.369048,0.861111
6,lgb,RandomizedSearchCV(cv=RepeatedStratifiedKFold(...,0.637144,"{'mean_fit_time': [0.024657958984375, 0.033246...",0.38961,0.805195


In [18]:
best_model = summary['model obj'][summary['roc_auc'] == max(summary['roc_auc'])][0]
best_technique = summary.index[summary['roc_auc'] == max(summary['roc_auc'])][0]
joblib.dump(best_model,f'pickled_objects/{protocol}_best_model.pkl')
conventional_results.to_csv('train_results.csv',index=False)
print(f'Best Model: {best_model}')

Model: RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=2, max_features='log2',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=12, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=114,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

Classification report on test set:
              precision    recall  f1-score   support

           0       0.82      0.41      0.55        22
           1       0.74      0.95      0.83        39

    accuracy                           0.75        61
   macro avg       0.78      0.68      0.69        61
weighted avg       0.77      0.75      0.73        61

