In [124]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
import sklearn.tree as tree

# Classifiers 
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Folder (relative to the notebook) that figures are saved into;
# plot_confusion_matrix prepends it to every output file name.
dir_out = '../plots/'

In [39]:
# Load the raw table and preview it before any transformation.
df = pd.read_csv("../data/heart.csv")
display(df.head())

# Categorical columns expanded into indicator columns.
# 'ca' was considered for encoding as well but is kept as a plain numeric column.
onehot_vars = ['cp', 'restecg', 'slope', 'thal']

# Encode every categorical column in one call. Values are cast to str first so
# the indicator columns are named '<column>_<value>'; drop_first removes the
# first level of each variable.
df_add = pd.get_dummies(df[onehot_vars].astype(str), prefix=onehot_vars, drop_first=True)

# Reassemble: untouched feature columns, then the indicator columns,
# and finally the target as the last column.
df_target = df['target']
df = pd.concat([df.drop(onehot_vars + ['target'], axis=1), df_add, df_target], axis=1)
display(df.head())

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,ca,cp_1,cp_2,cp_3,restecg_1,restecg_2,slope_1,slope_2,thal_1,thal_2,thal_3,target
0,63,1,145,233,1,150,0,2.3,0,0,0,1,0,0,0,0,1,0,0,1
1,37,1,130,250,0,187,0,3.5,0,0,1,0,1,0,0,0,0,1,0,1
2,41,0,130,204,0,172,0,1.4,0,1,0,0,0,0,0,1,0,1,0,1
3,56,1,120,236,0,178,0,0.8,0,1,0,0,1,0,0,1,0,1,0,1
4,57,0,120,354,0,163,1,0.6,0,0,0,0,1,0,0,1,0,1,0,1


In [127]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
RANDOM_STATE_DATA = 0
y = df['target']
X = df.drop('target', axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE_DATA
)


In [126]:
# Tune a random forest with an exhaustive grid search using 5-fold cross-validation.
RANDOM_STATE = 42
param_grid = {
    'max_depth': [2, 5, 10],
    'n_estimators': [10, 20, 50, 100],
}
clf = RandomForestClassifier(n_jobs=-1, oob_score=True, random_state=RANDOM_STATE)
CV_clf = GridSearchCV(clf, param_grid, cv=5, verbose=3)
CV_clf.fit(X_train, y_train)

# Report the best parameter combination and the score it achieved in the search.
print(CV_clf.best_params_)
print(CV_clf.best_score_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] max_depth=2, n_estimators=10 ....................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ......... max_depth=2, n_estimators=10, score=0.66, total=   2.7s
[CV] max_depth=2, n_estimators=10 ....................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.9s remaining:    0.0s


[CV] ......... max_depth=2, n_estimators=10, score=0.75, total=   0.1s
[CV] max_depth=2, n_estimators=10 ....................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.1s remaining:    0.0s


[CV] ....... max_depth=2, n_estimators=10, score=0.8125, total=   0.1s
[CV] max_depth=2, n_estimators=10 ....................................
[CV]  max_depth=2, n_estimators=10, score=0.7916666666666666, total=   0.1s
[CV] max_depth=2, n_estimators=10 ....................................
[CV] ......... max_depth=2, n_estimators=10, score=0.75, total=   0.1s
[CV] max_depth=2, n_estimators=20 ....................................
[CV] .......... max_depth=2, n_estimators=20, score=0.7, total=   0.1s
[CV] max_depth=2, n_estimators=20 ....................................
[CV]  max_depth=2, n_estimators=20, score=0.7916666666666666, total=   0.1s
[CV] max_depth=2, n_estimators=20 ....................................
[CV] ....... max_depth=2, n_estimators=20, score=0.8125, total=   0.1s
[CV] max_depth=2, n_estimators=20 ....................................
[CV] ........ max_depth=2, n_estimators=20, score=0.875, total=   0.1s
[CV] max_depth=2, n_estimators=20 .................................

[CV]  max_depth=10, n_estimators=100, score=0.8333333333333334, total=   0.2s
{'max_depth': 5, 'n_estimators': 50}
0.8264462809917356


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:   18.8s finished


In [128]:
# Best hyper-parameters found by the grid search (the search object is CV_clf).
CV_clf.best_params_

{'max_depth': 5, 'n_estimators': 50}

In [129]:
# Refit a forest on the whole training split with the settings chosen by the
# grid search (CV_clf). Only max_depth and n_estimators are carried over.
best_params = CV_clf.best_params_
clf = RandomForestClassifier(
    random_state=RANDOM_STATE,
    max_depth=best_params['max_depth'],
    n_estimators=best_params['n_estimators'],
)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [144]:
def plot_confusion_matrix(cm, clf_name, cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map and save it as a PNG.

    The image is written to ``dir_out + 'cm ' + clf_name + '.png'``; the output
    folder is created if it does not exist yet. The figure is closed afterwards,
    so nothing is shown inline and nothing is returned.

    Parameters
    ----------
    cm : array-like
        Two-dimensional matrix of values; every cell is annotated with its
        value formatted to one decimal place.
    clf_name : str
        Text appended to the plot title and used in the output file name.
    cmap : matplotlib colormap, optional
        Colormap handed to ``ax.matshow``.
    """
    import os  # local import: only needed to guarantee the output folder exists

    # NOTE: these rcParams assignments are global and persist after the call.
    plt.rcParams["figure.figsize"] = (8, 8)  # (w, h)
    plt.rcParams.update({'font.size': 18})
    fig, ax = plt.subplots()

    cax = ax.matshow(cm, cmap=cmap)
    for (i, j), z in np.ndenumerate(cm):
        # Row index i goes on the y axis, column index j on the x axis.
        ax.text(j, i, '{:0.1f}'.format(z), ha='center', va='center', fontsize=15)

    fig.colorbar(cax)
    ax.set_title('Confusion Matrix: ' + clf_name)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    fig.tight_layout()

    # savefig raises if the target folder is missing, so create it first.
    os.makedirs(dir_out, exist_ok=True)
    fig.savefig(dir_out + 'cm ' + clf_name + '.png')
    # Close this specific figure so repeated calls do not accumulate open figures.
    plt.close(fig)

# Class order for the confusion matrices. It is defined here because no
# module-level `labels` variable exists anywhere else in the notebook.
labels = [0, 1]

# Save one confusion-matrix figure per data split for the fitted random forest.
for split_name, X_split, y_split in (('Train', X_train, y_train),
                                     ('Test', X_test, y_test)):
    cm = confusion_matrix(y_split, clf.predict(X_split), labels=labels)
    plot_confusion_matrix(cm=cm, clf_name='Random Forest ({})'.format(split_name))


In [131]:
# Training-set confusion matrix for the fitted random forest (`clf`);
# rows are true classes and columns are predicted classes, in the order [0, 1].
confusion_matrix(y_train, clf.predict(X_train), labels=[0, 1])


array([[ 98,  13],
       [  4, 127]])