<a href="https://colab.research.google.com/github/vincentjunitio00/Heart_Disease_Clustering/blob/main/Heart_Disease_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import library

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report

import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('/content/heart.csv')
display(df)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


Save target value in 'y' and feature values in 'X'.

In [5]:
X = df.drop(['target'], axis=1)

In [6]:
y = df['target'].values

# Supervised Learning

Split data into train and test set.

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, random_state=42)

1. KNN

In [48]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [49]:
prediction_knn = knn.predict(X_test)

In [50]:
print(classification_report(y_test, prediction_knn))

              precision    recall  f1-score   support

           0       0.69      0.62      0.65        29
           1       0.69      0.75      0.72        32

    accuracy                           0.69        61
   macro avg       0.69      0.69      0.69        61
weighted avg       0.69      0.69      0.69        61



In [69]:
param_grid = {'n_neighbors':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}

grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
grid.fit(X_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [70]:
grid_knn = grid.best_estimator_

In [72]:
prediction_grid_knn = grid_knn.predict(X_test)
print(classification_report(y_test, prediction_grid_knn))

              precision    recall  f1-score   support

           0       0.71      0.59      0.64        29
           1       0.68      0.78      0.72        32

    accuracy                           0.69        61
   macro avg       0.69      0.68      0.68        61
weighted avg       0.69      0.69      0.69        61



Try KNN with features that are important.

In [29]:
dc = DecisionTreeClassifier()
dc.fit(X_train, y_train)

importance = dc.feature_importances_
for i,v in enumerate(importance):
	print('Feature: %s, Score: %.5f' % (X_train.columns[int(i)],v))

Feature: age, Score: 0.11925
Feature: sex, Score: 0.03488
Feature: cp, Score: 0.22526
Feature: trestbps, Score: 0.06905
Feature: chol, Score: 0.11743
Feature: fbs, Score: 0.01669
Feature: restecg, Score: 0.02862
Feature: thalach, Score: 0.03676
Feature: exang, Score: 0.07542
Feature: oldpeak, Score: 0.09240
Feature: slope, Score: 0.03854
Feature: ca, Score: 0.11848
Feature: thal, Score: 0.02721


Only take the score > 0.05.

In [73]:
X_train_important = X_train[['age', 'cp', 'trestbps', 'chol', 'thalach', 'exang', 'oldpeak', 'ca']]

In [74]:
knn_1 = KNeighborsClassifier()
knn_1.fit(X_train_important, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [75]:
X_test_important = X_test[['age', 'cp', 'trestbps', 'chol', 'thalach', 'exang', 'oldpeak', 'ca']]
pred_1 = knn_1.predict(X_test_important)

In [76]:
print(classification_report(y_test, pred_1))

              precision    recall  f1-score   support

           0       0.69      0.62      0.65        29
           1       0.69      0.75      0.72        32

    accuracy                           0.69        61
   macro avg       0.69      0.69      0.69        61
weighted avg       0.69      0.69      0.69        61



In [79]:
param_grid = {'n_neighbors':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'metric':['minkowski', 'euclidean']}

grid_knn = GridSearchCV(knn_1, param_grid, cv=10, scoring='accuracy')
grid_knn.fit(X_train_important, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'metric': ['minkowski', 'euclidean'],
                         'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [80]:
prediction_grid_knn = grid_knn.predict(X_test_important)
print(classification_report(y_test, prediction_grid_knn))

              precision    recall  f1-score   support

           0       0.71      0.59      0.64        29
           1       0.68      0.78      0.72        32

    accuracy                           0.69        61
   macro avg       0.69      0.68      0.68        61
weighted avg       0.69      0.69      0.69        61



2. SVM

In [81]:
svc = SVC()
svc.fit(X_train, y_train)

pred_svc = svc.predict(X_test)

print(classification_report(y_test, pred_svc))

              precision    recall  f1-score   support

           0       0.79      0.52      0.62        29
           1       0.67      0.88      0.76        32

    accuracy                           0.70        61
   macro avg       0.73      0.70      0.69        61
weighted avg       0.73      0.70      0.69        61



In [82]:
svc

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [86]:
param_grid = {'C':[0.1, 1, 10, 100], 'kernel':['rbf', 'linear']}

grid_svc = GridSearchCV(svc, param_grid, cv=5, scoring='accuracy')
grid_svc.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.1, 1, 10, 100], 'kernel': ['rbf', 'linear']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [87]:
prediction_grid_svc = grid_svc.predict(X_test)
print(classification_report(y_test, prediction_grid_svc))

              precision    recall  f1-score   support

           0       0.84      0.90      0.87        29
           1       0.90      0.84      0.87        32

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.87      0.87      0.87        61



In [89]:
from xgboost import XGBClassifier
xgbc = XGBClassifier()

xgbc.fit(X_train, y_train)
pred_xgbc = xgbc.predict(X_test)

print(classification_report(y_test, pred_xgbc))

              precision    recall  f1-score   support

           0       0.83      0.86      0.85        29
           1       0.87      0.84      0.86        32

    accuracy                           0.85        61
   macro avg       0.85      0.85      0.85        61
weighted avg       0.85      0.85      0.85        61



In [90]:
xgbc

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [91]:
param_grid = {'max_depth':[3, 4, 5, 6], 'min_child_weight': [1, 2, 3]}
grid_xgbc = GridSearchCV(xgbc, param_grid, cv=5, scoring='accuracy')
grid_xgbc.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': [3, 4, 5, 6],
                         'min_child_weight': [1, 2, 3]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score

In [92]:
prediction_grid_xgbc = grid_xgbc.predict(X_test)
print(classification_report(y_test, prediction_grid_xgbc))

              precision    recall  f1-score   support

           0       0.81      0.90      0.85        29
           1       0.90      0.81      0.85        32

    accuracy                           0.85        61
   macro avg       0.85      0.85      0.85        61
weighted avg       0.86      0.85      0.85        61



# Conclusion

We tried to train and predicted the data with KNN, SVM, and Xgboost. The proposed paper performed classification with Decision Tree, Logistic Regression, Random Forest, and Naive Bayes.

Our accuracy for KNN was 0.69, SVM was 0.87, and Xgboost was 0.85. Our svm model performed better than the proposed paper's Decision Tree, Logistic Regression, and Naive Bayes by 0.02%. Random Forest still outperformed the rest of the algorithm.