In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
import sklearn.tree as tree
from sklearn.metrics import plot_confusion_matrix

# Classifiers 
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

import os

# Output directory (relative path) that the plt.savefig calls in this notebook
# prepend to their file names. Kept as a plain string with a trailing slash
# because those calls build paths with `dir_out + '<name>.png'`.
dir_out = '../plots/'

# Create the directory up front: savefig does not create missing folders, so a
# fresh checkout without ../plots/ would otherwise fail at the first figure.
# exist_ok=True keeps re-running this cell harmless.
os.makedirs(dir_out, exist_ok=True)

In [2]:
# Load the raw table and preview it before any transformation.
df = pd.read_csv("../data/heart.csv")
display(df.head())

# Categorical columns that get one-hot encoded.
# 'ca' is currently NOT encoded and stays a single column — TODO confirm intended.
onehot_vars = ['cp', 'restecg', 'slope', 'thal']

# Encode every categorical column in one call. Values are cast to str first so
# they are treated as labels; drop_first removes one level per variable.
onehot_block = pd.get_dummies(
    df[onehot_vars].astype(str),
    prefix=onehot_vars,
    drop_first=True,
)

# Reassemble: untouched feature columns, then the dummy columns, then the
# target as the last column.
kept_cols = [col for col in df.columns if col not in onehot_vars and col != 'target']
df = pd.concat([df[kept_cols], onehot_block, df[['target']]], axis=1)
display(df.head())

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,ca,cp_1,cp_2,cp_3,restecg_1,restecg_2,slope_1,slope_2,thal_1,thal_2,thal_3,target
0,63,1,145,233,1,150,0,2.3,0,0,0,1,0,0,0,0,1,0,0,1
1,37,1,130,250,0,187,0,3.5,0,0,1,0,1,0,0,0,0,1,0,1
2,41,0,130,204,0,172,0,1.4,0,1,0,0,0,0,0,1,0,1,0,1
3,56,1,120,236,0,178,0,0.8,0,1,0,0,1,0,0,1,0,1,0,1
4,57,0,120,354,0,163,1,0.6,0,0,0,0,1,0,0,1,0,1,0,1


In [3]:
# Seed used for every data-splitting step so the partitions are reproducible.
RANDOM_STATE_DATA = 0

# Features are every column except the label; the label is the 'target' column.
y = df['target']
X = df.drop(columns=['target'])

# Hold out 20% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE_DATA,
)


In [6]:
# SVM (RBF kernel): exhaustive grid search over C and gamma with 5-fold
# cross-validation on the training split.
# NOTE(review): X_train is passed in as-is (no feature scaling is applied before
# this point); RBF SVMs are generally sensitive to feature scale, so consider
# standardizing the inputs — confirm whether this is intentional.
RANDOM_STATE_MODEL = 42

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'gamma': [0.001, 0.01, 0.1, 1],
}
clf = svm.SVC(kernel='rbf', random_state=RANDOM_STATE_MODEL)
CV_clf = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, verbose=3)
CV_clf.fit(X_train, y_train)

# Report the winning parameter combination and its mean cross-validated score.
print(CV_clf.best_params_, CV_clf.best_score_, sep='\n')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] C=0.001, gamma=0.001 ............................................
[CV] ................ C=0.001, gamma=0.001, score=0.551, total=   0.0s
[CV] C=0.001, gamma=0.001 ............................................
[CV] ................ C=0.001, gamma=0.001, score=0.531, total=   0.0s
[CV] C=0.001, gamma=0.001 ............................................
[CV] ................ C=0.001, gamma=0.001, score=0.542, total=   0.0s
[CV] C=0.001, gamma=0.001 ............................................
[CV] ................ C=0.001, gamma=0.001, score=0.542, total=   0.0s
[CV] C=0.001, gamma=0.001 ............................................
[CV] ................ C=0.001, gamma=0.001, score=0.542, total=   0.0s
[CV] C=0.001, gamma=0.01 .............................................
[CV] ................. C=0.001, gamma=0.01, score=0.551, total=   0.0s
[CV] C=0.001, gamma=0.01 .............................................
[CV] ..........

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV] ................... C=0.01, gamma=0.1, score=0.542, total=   0.0s
[CV] C=0.01, gamma=0.1 ...............................................
[CV] ................... C=0.01, gamma=0.1, score=0.542, total=   0.0s
[CV] C=0.01, gamma=0.1 ...............................................
[CV] ................... C=0.01, gamma=0.1, score=0.542, total=   0.0s
[CV] C=0.01, gamma=1 .................................................
[CV] ..................... C=0.01, gamma=1, score=0.551, total=   0.0s
[CV] C=0.01, gamma=1 .................................................
[CV] ..................... C=0.01, gamma=1, score=0.531, total=   0.0s
[CV] C=0.01, gamma=1 .................................................
[CV] ..................... C=0.01, gamma=1, score=0.542, total=   0.0s
[CV] C=0.01, gamma=1 .................................................
[CV] ..................... C=0.01, gamma=1, score=0.542, total=   0.0s
[CV] C=0.01, gamma=1 .................................................
[CV] .

[CV] ....................... C=10, gamma=1, score=0.531, total=   0.0s
[CV] C=10, gamma=1 ...................................................
[CV] ....................... C=10, gamma=1, score=0.542, total=   0.0s
[CV] C=10, gamma=1 ...................................................
[CV] ....................... C=10, gamma=1, score=0.542, total=   0.0s
[CV] C=10, gamma=1 ...................................................
[CV] ....................... C=10, gamma=1, score=0.542, total=   0.0s
{'C': 10, 'gamma': 0.001}
0.6487244897959183


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.6s finished


In [7]:
# Rebuild the RBF-kernel SVM with the C and gamma chosen by the grid search
# and train it on the training split.
RANDOM_STATE_MODEL = 42
best_params = CV_clf.best_params_
clf = svm.SVC(
    kernel='rbf',
    random_state=RANDOM_STATE_MODEL,
    C=best_params['C'],
    gamma=best_params['gamma'],
)
# Left as the last expression so the fitted estimator is shown as cell output.
clf.fit(X_train, y_train)

SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False)

In [8]:
# Class ids and the tick labels shown for them on the confusion-matrix axes.
labels = [0, 1]
label_names = ['0', '1']

# Global figure styling used by the plots saved from here on.
plt.rcParams.update({"figure.figsize": (8, 8), 'font.size': 18})

# One confusion matrix per split, each written to its own PNG under dir_out.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2 — confirm the pinned scikit-learn version before upgrading.
cm_jobs = (
    ('Confusion Matrix:\n SVM (Train)', 'CM_SVM_train.png', X_train, y_train),
    ('Confusion Matrix:\n SVM (Test)', 'CM_SVM_test.png', X_test, y_test),
)
for cm_title, cm_file, X_split, y_split in cm_jobs:
    plot_confusion_matrix(
        clf,
        X_split,
        y_split,
        labels=labels,
        display_labels=label_names,
        cmap=plt.cm.Blues,
    )
    plt.title(cm_title, fontsize=20)
    plt.tight_layout()
    plt.savefig(dir_out + cm_file)
    # Close the figure so it is not rendered inline and does not accumulate.
    plt.close()


In [12]:
from plot_learning_curve import plot_learning_curve
from sklearn.model_selection import ShuffleSplit

In [11]:
# Learning curve for the tuned SVM, evaluated on 100 random 80/20 splits of
# the full data set (X, y) and saved as a PNG under dir_out.
cv = ShuffleSplit(
    n_splits=100,
    test_size=0.2,
    random_state=RANDOM_STATE_DATA,
)
title = 'Learning Curve of SVM'

# train_sizes is not passed, so plot_learning_curve uses its own default.
plot_learning_curve(
    clf,
    title,
    X,
    y,
    ylim=(0.5, 1.01),
    cv=cv,
    n_jobs=4,
)
plt.savefig(dir_out + 'LC_SVM.png')
plt.close()