In [32]:
import matplotlib.pyplot as plt
from sklearn import linear_model
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_extraction import DictVectorizer
import pandas as pd
import eli5
from eli5.sklearn import PermutationImportance

%matplotlib inline

In [2]:
train = pd.read_csv('df_train2.csv') # dummified dataset
test = pd.read_csv('df_test2.csv')

In [3]:
train.head()

Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_2,diag_3,...,A1Cresult_0,A1Cresult_1,diag_1_1.0,diag_1_2.0,diag_1_3.0,diag_1_4.0,diag_1_5.0,diag_1_6.0,diag_1_7.0,diag_1_8.0
0,2,-0.438603,0.805968,-0.80931,0.277594,-0.264763,-0.205982,-0.329314,4.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,4,-0.777582,0.051157,-0.238557,0.036471,-0.264763,-0.205982,-0.329314,4.0,1.0,...,0,0,0,0,0,0,0,0,0,0
2,5,-1.116561,0.403402,-0.80931,-0.928017,-0.264763,-0.205982,-0.329314,8.0,4.0,...,0,0,0,0,0,0,0,0,0,1
3,6,-0.438603,-0.603012,2.615208,0.036471,-0.264763,-0.205982,-0.329314,1.0,4.0,...,0,0,1,0,0,0,0,0,0,0
4,7,-0.099625,1.359497,-0.238557,0.639277,-0.264763,-0.205982,-0.329314,1.0,0.0,...,0,0,1,0,0,0,0,0,0,0


In [4]:
test.head()

Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_2,diag_3,...,A1Cresult_0,A1Cresult_1,diag_1_1.0,diag_1_2.0,diag_1_3.0,diag_1_4.0,diag_1_5.0,diag_1_6.0,diag_1_7.0,diag_1_8.0
0,3,-0.777582,-1.609427,2.044455,-0.325212,1.592509,-0.205982,1.224358,4.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,9,2.951182,1.258855,0.332196,1.483204,-0.264763,-0.205982,-0.329314,1.0,0.0,...,0,0,1,0,0,0,0,0,0,0
2,5,1.595268,0.202119,0.332196,0.157032,-0.264763,-0.205982,-0.329314,1.0,5.0,...,0,0,0,0,0,1,0,0,0,0
3,8,-0.438603,-1.206861,1.473702,0.277594,-0.264763,-0.205982,-0.329314,2.0,1.0,...,0,0,0,0,0,0,0,0,0,1
4,8,-1.116561,0.504044,-0.80931,-0.686895,-0.264763,-0.205982,-0.329314,4.0,0.0,...,0,0,0,1,0,0,0,0,0,0


In [5]:
# machine learning
import warnings

from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import log_loss, accuracy_score, mean_squared_error, r2_score, precision_score, recall_score, roc_auc_score

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from boruta import BorutaPy
from xgboost import XGBClassifier, XGBRanker

from scipy.stats import skew

print(__doc__)

# Definitions
pd.set_option('display.float_format', lambda x: '%.3f' % x)
%matplotlib inline
warnings.filterwarnings('ignore')

Automatically created module for IPython interactive environment


In [6]:
# Create X and y for train and test sets 
X = train.drop('readmitted', axis=1)
y = train['readmitted']

X_test = test.drop('readmitted', axis=1)
y_test = test['readmitted']

In [7]:
# split train data set in train and test sets for model training
X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.20, random_state=42)

In [8]:
classifiers = [KNeighborsClassifier(3),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis()
]

# Logging for Visual Comparison
log_cols=["Classifier", "Accuracy", "Precision", "Recall", "AUC", "Log Loss"]
log = pd.DataFrame(columns=log_cols)

for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__
    
    print("="*30)
    print(name)
    
    print('****Results****')
    train_predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, train_predictions)
    pre = precision_score(y_test, train_predictions)
    rec = recall_score(y_test, train_predictions)
    AUC = roc_auc_score(y_test, train_predictions)
    print("Accuracy: {:.4%}".format(acc))
    print("Precision: {:.4%}".format(pre))
    print("Recall: {:.4%}".format(rec))
    print("AUC: {}".format(AUC))
    
    train_predictions = clf.predict_proba(X_test)
    ll = log_loss(y_test, train_predictions)
    print("Log Loss: {}".format(ll))
    
    log_entry = pd.DataFrame([[name, acc*100, pre*100, rec*100, AUC, ll]], columns=log_cols)
    log = log.append(log_entry)
    
print("="*30)

KNeighborsClassifier
****Results****
Accuracy: 93.6782%
Precision: 5.2083%
Recall: 2.1692%
AUC: 0.5013191863147989
Log Loss: 1.3319151154997568
DecisionTreeClassifier
****Results****
Accuracy: 86.5375%
Precision: 5.9583%
Recall: 13.0152%
AUC: 0.5155051513896307
Log Loss: 4.6498515657165855
RandomForestClassifier
****Results****
Accuracy: 95.3760%
Precision: 37.5000%
Recall: 0.6508%
AUC: 0.5029920708023123
Log Loss: 0.6920168429071297
AdaBoostClassifier
****Results****
Accuracy: 95.3960%
Precision: 0.0000%
Recall: 0.0000%
AUC: 0.5
Log Loss: 0.6699656013401631
GradientBoostingClassifier
****Results****
Accuracy: 95.3560%
Precision: 16.6667%
Recall: 0.2169%
AUC: 0.5008228734053493
Log Loss: 0.18929930806700315
GaussianNB
****Results****
Accuracy: 78.5379%
Precision: 8.2591%
Recall: 36.2256%
AUC: 0.584027898894327
Log Loss: 0.8657813125460656
LinearDiscriminantAnalysis
****Results****
Accuracy: 95.3860%
Precision: 45.4545%
Recall: 1.0846%
AUC: 0.505108923140649
Log Loss: 0.1892069556547436

In [9]:
log

Unnamed: 0,Classifier,Accuracy,Precision,Recall,AUC,Log Loss
0,KNeighborsClassifier,93.678,5.208,2.169,0.501,1.332
0,DecisionTreeClassifier,86.538,5.958,13.015,0.516,4.65
0,RandomForestClassifier,95.376,37.5,0.651,0.503,0.692
0,AdaBoostClassifier,95.396,0.0,0.0,0.5,0.67
0,GradientBoostingClassifier,95.356,16.667,0.217,0.501,0.189
0,GaussianNB,78.538,8.259,36.226,0.584,0.866
0,LinearDiscriminantAnalysis,95.386,45.455,1.085,0.505,0.189
0,QuadraticDiscriminantAnalysis,4.844,4.615,100.0,0.501,32.866


In [10]:
sgd = SGDClassifier()
rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()
dtc = DecisionTreeClassifier()
abc = AdaBoostClassifier()
xgb = XGBClassifier()

In [11]:
model=abc.fit(X_train, y_train)
perm = PermutationImportance(abc, random_state=1).fit(X_train, y_train)

In [12]:
eli5.show_weights(perm, top=None)

Weight,Feature
0.0002  ± 0.0001,x63
0.0000  ± 0.0001,x56
0.0000  ± 0.0000,x58
0.0000  ± 0.0000,x73
0.0000  ± 0.0000,x0
0.0000  ± 0.0000,x43
0.0000  ± 0.0000,x42
0  ± 0.0000,x32
0  ± 0.0000,x31
0  ± 0.0000,x34


In [None]:
time_in_hospital, num_procedures, number_emergency, admission_type_id

In [34]:
from sklearn.pipeline import make_pipeline

In [36]:

vec = DictVectorizer()
pipeline = make_pipeline(vec, rfc)

def evaluate(_clf):
    scores = cross_val_score(_clf, X_dev.values, y_dev.values, scoring='accuracy', cv=10)
    print('Accuracy: {:.3f} ± {:.3f}'.format(np.mean(scores), 2 * np.std(scores)))
    _clf.fit(X_train, y_train)  # so that parts of the original pipeline are fitted

evaluate(pipeline)

AttributeError: 'numpy.ndarray' object has no attribute 'items'

In [13]:
model3 = rfc.fit(X_train, y_train)
perm3 = PermutationImportance(rfc, random_state=1).fit(X_train, y_train)
eli5.show_weights(perm3, top=None)

Weight,Feature
0.0360  ± 0.0013,x44
0.0312  ± 0.0009,x43
0.0309  ± 0.0011,x39
0.0308  ± 0.0007,x41
0.0308  ± 0.0010,x46
0.0291  ± 0.0008,x42
0.0286  ± 0.0012,x0
0.0252  ± 0.0005,x2
0.0252  ± 0.0007,x56
0.0220  ± 0.0003,x40


#### Explore using BorutaPy to Select and Rank Features

In [15]:
# load X and y
# NOTE BorutaPy accepts numpy arrays only, hence the .values attribute
X = X_train.values
y = y_train.values
y = y.ravel()

# define random forest classifier, with utilising all cores and
# sampling in proportion to y labels
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)

# define Boruta feature selection method
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1)

# find all relevant features - 5 features should be selected
feat_selector.fit(X, y)

# check selected features - first 5 features are selected
feat_selector.support_

# check ranking of features
feat_selector.ranking_

# call transform() on X to filter it down to selected features
X_filtered = feat_selector.transform(X)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	81
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	81
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	81
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	81
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	81
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	81
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	81
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	12
Tentative: 	2
Rejected: 	67
Iteration: 	9 / 100
Confirmed: 	12
Tentative: 	2
Rejected: 	67
Iteration: 	10 / 100
Confirmed: 	12
Tentative: 	2
Rejected: 	67
Iteration: 	11 / 100
Confirmed: 	12
Tentative: 	2
Rejected: 	67
Iteration: 	12 / 100
Confirmed: 	12
Tentative: 	2
Rejected: 	67
Iteration: 	13 / 100
Confirmed: 	12
Tentative: 	2
Rejected: 	67
Iteration: 	14 / 100
Confirmed: 	12
Tentative: 	2
Rejected: 	67
Iteration: 	15 / 100
Confirmed: 	12
Tentative: 	2
Rejected: 	67
Iteration: 	16 / 100
Confirmed: 	12
Tentative: 	2
Reject

In [16]:
feat_selector.support_

array([ True,  True,  True, False,  True, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False,  True,  True,  True,  True,
       False,  True, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False])

In [17]:
feat_selector.ranking_

array([ 1,  1,  1, 11,  1, 20, 21,  3, 13, 12,  1, 19, 52, 48, 55, 40, 64,
       22, 43, 57, 39, 44, 53, 64, 64, 57, 14, 54, 64, 64, 64, 64, 33, 10,
       15,  7, 22,  2, 27,  1,  4,  1,  1,  1,  1,  9,  1,  7, 50, 37, 50,
       49, 24, 18, 64, 45,  1, 33, 38, 46, 64, 64, 64, 16, 35, 30, 57, 42,
       64, 47, 28, 25, 25,  7,  5, 30, 30, 36, 17, 40, 32])

In [18]:
X_filtered = feat_selector.transform(X)

In [19]:
X_filtered

array([[ 7.        , -0.43860341, -0.35140873, ..., 63.        ,
        27.        ,  0.        ],
       [ 6.        , -0.09962485, -0.20044649, ..., 30.        ,
        20.        ,  0.        ],
       [ 6.        ,  0.57833227, -0.09980499, ..., 18.        ,
        18.        ,  0.        ],
       ...,
       [ 8.        , -0.77758197,  1.00725143, ..., 56.        ,
        14.        ,  0.        ],
       [ 7.        ,  0.23935371,  1.56077964, ..., 49.        ,
        35.        ,  0.        ],
       [ 7.        , -1.11656053, -2.11263485, ..., 35.        ,
         5.        ,  0.        ]])

In [20]:
print ('\n Initial features: ', X_train.columns.tolist() )

# number of selected features
print ('\n Number of selected features:')
print (feat_selector.n_features_)

feature_df = pd.DataFrame(X_train.columns.tolist(), columns=['features'])
feature_df['rank']=feat_selector.ranking_
feature_df = feature_df.sort_values('rank', ascending=True).reset_index(drop=True)
print ('\n Top %d features:' % feat_selector.n_features_)
print (feature_df.head(feat_selector.n_features_))
# feature_df.to_csv('boruta-feature-ranking.csv', index=False)

# check ranking of features
print ('\n Feature ranking:')
print (feat_selector.ranking_)



 Initial features:  ['age', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_2', 'diag_3', 'number_diagnoses', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'insulin', 'glyburide.metformin', 'glipizide.metformin', 'glimepiride.pioglitazone', 'metformin.rosiglitazone', 'metformin.pioglitazone', 'change', 'diabetesMed', 'numchange', 'num_meds', 'number_emergency_log1p', 'number_inpatient_log1p', 'number_outpatient_log1p', 'num_medications|time_in_hospital', 'num_medications|num_procedures', 'time_in_hospital|num_lab_procedures', 'num_medications|num_lab_procedures', 'num_medications|number_diagnoses', 'age|number_diagnoses', 'change|num_medications', 'number_diagnoses|time_in_hospital', 'num_medications|numchange', 'rac

#### Use Synthetic Minority Over-Sampling Technique to balance train data set (90:10 -> 50:50)

In [21]:
from imblearn.over_sampling import SMOTE
from collections import Counter
print('Original dataset shape {}'.format(Counter(y_train)))
sm = SMOTE(random_state=42)
X_train_new, y_train_new = sm.fit_sample(X_train, y_train)
print('New dataset shape {}'.format(Counter(y_train_new)))

Original dataset shape Counter({0: 37885, 1: 3522})
New dataset shape Counter({0: 37885, 1: 37885})


In [22]:
classifiers = [KNeighborsClassifier(3),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis()
]

# Logging for Visual Comparison
log_cols=["Classifier", "Accuracy", "Precision", "Recall", "AUC", "Log Loss"]
log = pd.DataFrame(columns=log_cols)

for clf in classifiers:
    clf.fit(X_train_new, y_train_new)
    name = clf.__class__.__name__
    
    print("="*30)
    print(name)
    
    print('****Results****')
    train_predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, train_predictions)
    pre = precision_score(y_test, train_predictions)
    rec = recall_score(y_test, train_predictions)
    AUC = roc_auc_score(y_test, train_predictions)
    print("Accuracy: {:.4%}".format(acc))
    print("Precision: {:.4%}".format(pre))
    print("Recall: {:.4%}".format(rec))
    print("AUC: {}".format(AUC))
    
    train_predictions = clf.predict_proba(X_test)
    ll = log_loss(y_test, train_predictions)
    print("Log Loss: {}".format(ll))
    
    log_entry = pd.DataFrame([[name, acc*100, pre*100, rec*100, AUC, ll]], columns=log_cols)
    log = log.append(log_entry)
    
print("="*30)

KNeighborsClassifier
****Results****
Accuracy: 71.8865%
Precision: 4.6261%
Recall: 26.0304%
AUC: 0.5006501687759114
Log Loss: 5.048523582614294
DecisionTreeClassifier
****Results****
Accuracy: 86.6973%
Precision: 7.6774%
Recall: 17.1367%
AUC: 0.535955491484901
Log Loss: 4.594592046142118
RandomForestClassifier
****Results****
Accuracy: 95.2961%
Precision: 18.7500%
Recall: 0.6508%
AUC: 0.5025733103333007
Log Loss: 0.6214956935030046
AdaBoostClassifier
****Results****
Accuracy: 95.3960%
Precision: 0.0000%
Recall: 0.0000%
AUC: 0.5
Log Loss: 0.6748945180467008
GradientBoostingClassifier
****Results****
Accuracy: 95.3960%
Precision: 0.0000%
Recall: 0.0000%
AUC: 0.5
Log Loss: 0.21311327462080326
GaussianNB
****Results****
Accuracy: 42.0254%
Precision: 5.3028%
Recall: 68.7636%
AUC: 0.547492410534233
Log Loss: 2.263192696224542
LinearDiscriminantAnalysis
****Results****
Accuracy: 66.2539%
Precision: 7.7836%
Recall: 58.3514%
AUC: 0.624933348048994
Log Loss: 0.6541171408582589
QuadraticDiscrimin

In [23]:
log

Unnamed: 0,Classifier,Accuracy,Precision,Recall,AUC,Log Loss
0,KNeighborsClassifier,71.887,4.626,26.03,0.501,5.049
0,DecisionTreeClassifier,86.697,7.677,17.137,0.536,4.595
0,RandomForestClassifier,95.296,18.75,0.651,0.503,0.621
0,AdaBoostClassifier,95.396,0.0,0.0,0.5,0.675
0,GradientBoostingClassifier,95.396,0.0,0.0,0.5,0.213
0,GaussianNB,42.025,5.303,68.764,0.547,2.263
0,LinearDiscriminantAnalysis,66.254,7.784,58.351,0.625,0.654
0,QuadraticDiscriminantAnalysis,5.043,4.624,100.0,0.502,32.792
