In [1]:
import matplotlib.pyplot as plt
from sklearn import linear_model
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import eli5
from eli5.sklearn import PermutationImportance

%matplotlib inline

In [2]:
train = pd.read_csv('../data/le_train.csv', index_col=0)
test = pd.read_csv('../data/le_test.csv', index_col=0)

In [3]:
train.head()

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,...,pioglitazone,rosiglitazone,acarbose,miglitol,tolazamide,insulin,glyburide.metformin,change,diabetesMed,readmitted
1,2,0,1,1,1,7,3,71,59,0,...,1,1,1,1,0,3,1,0,1,0
2,2,1,3,1,1,7,2,71,44,1,...,1,1,1,1,0,3,1,0,1,0
3,2,1,4,1,1,7,1,71,51,0,...,1,1,1,1,0,2,1,0,1,0
4,2,1,5,2,1,2,3,71,31,6,...,1,1,1,1,0,2,1,1,1,0
5,2,1,6,3,1,2,4,71,70,1,...,1,1,1,1,0,2,1,0,1,0


In [4]:
test.head()

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,...,pioglitazone,rosiglitazone,acarbose,miglitol,tolazamide,insulin,glyburide.metformin,change,diabetesMed,readmitted
0,0,0,2,1,1,7,2,71,11,5,...,1,1,1,1,0,1,1,1,1,0
1,2,0,8,2,1,4,13,71,68,2,...,1,1,1,1,0,2,1,0,1,0
2,0,0,4,1,1,7,9,71,47,2,...,1,1,1,1,0,2,1,1,1,0
3,2,0,4,1,3,7,7,11,60,0,...,1,1,1,1,0,0,1,0,1,1
4,0,0,7,3,1,2,3,71,19,4,...,1,1,1,1,0,2,1,0,1,0


In [5]:
# Create X and y for train and test sets 
X_train = train.drop('readmitted', axis=1)
y_train = train['readmitted']

X_test = test.drop('readmitted', axis=1)
y_test = test['readmitted']

In [6]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [7]:
# machine learning
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import skew
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, precision_score, recall_score, roc_auc_score
import warnings
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from boruta import BorutaPy
from xgboost import XGBClassifier, XGBRanker
print(__doc__)

# Definitions
pd.set_option('display.float_format', lambda x: '%.3f' % x)
%matplotlib inline
warnings.filterwarnings('ignore')

Automatically created module for IPython interactive environment


In [8]:
classifiers = [KNeighborsClassifier(3),
#     SVC(kernel="rbf", C=0.025, probability=True),
#     NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis()
]

# Logging for Visual Comparison
log_cols=["Classifier", "Accuracy", "Precision", "Recall", "AUC", "Log Loss"]
log = pd.DataFrame(columns=log_cols)

for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__
    
    print("="*30)
    print(name)
    
    print('****Results****')
    train_predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, train_predictions)
    pre = precision_score(y_test, train_predictions)
    rec = recall_score(y_test, train_predictions)
    AUC = roc_auc_score(y_test, train_predictions)
    print("Accuracy: {:.4%}".format(acc))
    print("Precision: {:.4%}".format(pre))
    print("Recall: {:.4%}".format(rec))
    print("AUC: {}".format(AUC))
    
    train_predictions = clf.predict_proba(X_test)
    ll = log_loss(y_test, train_predictions)
    print("Log Loss: {}".format(ll))
    
    log_entry = pd.DataFrame([[name, acc*100, pre*100, rec*100, AUC, ll]], columns=log_cols)
    log = log.append(log_entry)
    
print("="*30)

KNeighborsClassifier
****Results****
Accuracy: 86.2110%
Precision: 17.0437%
Recall: 5.1270%
AUC: 0.5094100626851196
Log Loss: 2.8204798062992618
DecisionTreeClassifier
****Results****
Accuracy: 79.3662%
Precision: 16.7681%
Recall: 20.0091%
AUC: 0.535469290866086
Log Loss: 7.126676985991073
RandomForestClassifier
****Results****
Accuracy: 88.3047%
Precision: 34.4828%
Recall: 1.8149%
AUC: 0.5068323899851579
Log Loss: 1.1889028203266792
AdaBoostClassifier
****Results****
Accuracy: 88.5083%
Precision: 53.0612%
Recall: 1.1797%
AUC: 0.5052198604996156
Log Loss: 0.675931239332307
GradientBoostingClassifier
****Results****
Accuracy: 88.5135%
Precision: 54.7619%
Recall: 1.0436%
AUC: 0.5046572807993748
Log Loss: 0.3375117894105859
GaussianNB
****Results****
Accuracy: 82.8747%
Precision: 22.8831%
Recall: 20.5989%
AUC: 0.557859149135246
Log Loss: 0.7240534089332293
LinearDiscriminantAnalysis
****Results****
Accuracy: 88.1533%
Precision: 36.8421%
Recall: 4.1289%
AUC: 0.5160422417032243
Log Loss: 0.

In [9]:
log

Unnamed: 0,Classifier,Accuracy,Precision,Recall,AUC,Log Loss
0,KNeighborsClassifier,86.211,17.044,5.127,0.509,2.82
0,DecisionTreeClassifier,79.366,16.768,20.009,0.535,7.127
0,RandomForestClassifier,88.305,34.483,1.815,0.507,1.189
0,AdaBoostClassifier,88.508,53.061,1.18,0.505,0.676
0,GradientBoostingClassifier,88.514,54.762,1.044,0.505,0.338
0,GaussianNB,82.875,22.883,20.599,0.558,0.724
0,LinearDiscriminantAnalysis,88.153,36.842,4.129,0.516,0.345
0,QuadraticDiscriminantAnalysis,83.204,23.523,20.417,0.559,0.719


In [10]:
sgd = SGDClassifier()
rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()
dtc = DecisionTreeClassifier()
abc = AdaBoostClassifier()
xgb = XGBClassifier()

In [11]:
classifiers = [sgd, rfc, gbc, dtc, abc]
for clf in classifiers:
    model = clf.fit(X_train, y_train)
    name = clf.__class__.__name__
    
    print("="*30)
    print(name)
    
    perm = PermutationImportance(clf, random_state=1).fit(X_train, y_train)
    eli5.show_weights(perm, top=None)

print("="*30)

SGDClassifier
RandomForestClassifier
GradientBoostingClassifier
DecisionTreeClassifier
AdaBoostClassifier


In [22]:
eli5.show_weights(perm, top=None)

Weight,Feature
0.0005  ± 0.0002,x13
0.0000  ± 0.0000,x25
0.0000  ± 0.0000,x7
0.0000  ± 0.0001,x16
0.0000  ± 0.0000,x11
0  ± 0.0000,x0
0  ± 0.0000,x23
0  ± 0.0000,x5
0  ± 0.0000,x9
0  ± 0.0000,x34


In [24]:
model=abc.fit(X_train, y_train)
perm = PermutationImportance(abc, random_state=1).fit(X_train, y_train)

In [25]:
eli5.show_weights(perm, top=None)

Weight,Feature
0.0005  ± 0.0002,x13
0.0000  ± 0.0000,x25
0.0000  ± 0.0000,x7
0.0000  ± 0.0001,x16
0.0000  ± 0.0000,x11
0  ± 0.0000,x0
0  ± 0.0000,x23
0  ± 0.0000,x5
0  ± 0.0000,x9
0  ± 0.0000,x34


In [None]:
time_in_hospital, num_procedures, number_emergency, admission_type_id

In [12]:
model2 = sgd.fit(X_train, y_train)
print('Coefficients: \n', sgd.coef_)
perm2 = PermutationImportance(sgd, random_state=1).fit(X_train, y_train)

Coefficients: 
 [[-0.31283075  1.1470461   3.20651522 -1.64236146  3.41506906  0.23462307
  -1.17311533  1.87698452 -1.25132301 -3.25865368  0.54745382 -0.26069229
   2.24195373 10.45376101 -1.30346147  0.59959228  0.78207688  2.4765768
  -0.54745382  4.61425361 -3.5454152   3.85824596  0.26069229  0.52138459
  -2.37229988  0.99063072  0.18248461 -0.7038692  -0.41710767  0.20855384
   0.96456149 -0.13034615  0.52138459  1.19918455 -0.13034615  8.83746878]]


In [21]:
eli5.show_weights(perm2, top=None)

Weight,Feature
0.0068  ± 0.0014,x7
0.0038  ± 0.0009,x13
0.0023  ± 0.0013,x4
0.0018  ± 0.0003,x3
0.0014  ± 0.0004,x14
0.0010  ± 0.0009,x2
0.0010  ± 0.0005,x6
0.0010  ± 0.0002,x17
0.0002  ± 0.0003,x5
0.0002  ± 0.0002,x20


In [26]:
model3 = rfc.fit(X_train, y_train)
perm3 = PermutationImportance(rfc, random_state=1).fit(X_train, y_train)
eli5.show_weights(perm3, top=None)

Weight,Feature
0.0562  ± 0.0006,x6
0.0552  ± 0.0010,x13
0.0544  ± 0.0009,x10
0.0541  ± 0.0012,x4
0.0501  ± 0.0008,x8
0.0485  ± 0.0005,x16
0.0422  ± 0.0010,x14
0.0405  ± 0.0009,x2
0.0385  ± 0.0010,x15
0.0373  ± 0.0005,x17


Explore using BorutaPy to Select and Rank Features

In [29]:
# load X and y
# NOTE BorutaPy accepts numpy arrays only, hence the .values attribute
X = X_train.values
y = y_train.values
y = y.ravel()

# define random forest classifier, with utilising all cores and
# sampling in proportion to y labels
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)

# define Boruta feature selection method
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1)

# find all relevant features - 5 features should be selected
feat_selector.fit(X, y)

# check selected features - first 5 features are selected
feat_selector.support_

# check ranking of features
feat_selector.ranking_

# call transform() on X to filter it down to selected features
X_filtered = feat_selector.transform(X)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	36
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	36
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	36
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	36
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	36
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	36
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	36
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	13
Tentative: 	5
Rejected: 	18
Iteration: 	9 / 100
Confirmed: 	13
Tentative: 	5
Rejected: 	18
Iteration: 	10 / 100
Confirmed: 	13
Tentative: 	5
Rejected: 	18
Iteration: 	11 / 100
Confirmed: 	13
Tentative: 	5
Rejected: 	18
Iteration: 	12 / 100
Confirmed: 	14
Tentative: 	2
Rejected: 	20
Iteration: 	13 / 100
Confirmed: 	14
Tentative: 	2
Rejected: 	20
Iteration: 	14 / 100
Confirmed: 	14
Tentative: 	2
Rejected: 	20
Iteration: 	15 / 100
Confirmed: 	14
Tentative: 	2
Rejected: 	20
Iteration: 	16 / 100
Confirmed: 	14
Tentative: 	2
Reject

In [30]:
feat_selector.support_

array([False, False,  True, False,  True, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True, False,  True,  True,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False,  True, False, False, False])

In [31]:
feat_selector.ranking_

array([10, 14,  1,  4,  1,  5,  1,  1,  1,  2,  1,  1,  1,  1,  1,  3,  1,
        1,  7,  6,  1, 14, 19, 21, 14,  9, 11, 12, 16, 19, 21, 21,  1, 17,
        8,  2])

In [32]:
X_filtered = feat_selector.transform(X)

In [33]:
X_filtered

array([[ 1,  1,  3, ...,  9,  1,  3],
       [ 3,  1,  2, ...,  7,  1,  3],
       [ 4,  1,  1, ...,  5,  1,  2],
       ...,
       [ 8,  4,  5, ...,  9,  1,  2],
       [ 8,  3, 10, ...,  9,  1,  3],
       [ 7,  1,  6, ...,  9,  1,  1]])

In [37]:
print ('\n Initial features: ', X_train.columns.tolist() )

# number of selected features
print ('\n Number of selected features:')
print (feat_selector.n_features_)

feature_df = pd.DataFrame(X_train.columns.tolist(), columns=['features'])
feature_df['rank']=feat_selector.ranking_
feature_df = feature_df.sort_values('rank', ascending=True).reset_index(drop=True)
print ('\n Top %d features:' % feat_selector.n_features_)
print (feature_df.head(feat_selector.n_features_))
# feature_df.to_csv('boruta-feature-ranking.csv', index=False)

# check ranking of features
print ('\n Feature ranking:')
print (feat_selector.ranking_)



 Initial features:  ['race', 'gender', 'age', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'time_in_hospital', 'medical_specialty', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'glipizide', 'glyburide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'tolazamide', 'insulin', 'glyburide.metformin', 'change', 'diabetesMed']

 Number of selected features:
14

 Top 14 features:
                    features  rank
0           number_diagnoses     1
1                    insulin     1
2                        age     1
3                  metformin     1
4   discharge_disposition_id     1
5                     diag_3     1
6           time_in_hospital     1
7          medical_specialty     1
8         num_lab_procedures     1
9    