In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scikitplot as skplt
from imblearn.over_sampling import SMOTE

import pandas as pd
from sklearn.preprocessing import StandardScaler # standardization
from sklearn.preprocessing import LabelEncoder # Label --> Number

from sklearn.model_selection import train_test_split # Training/Test split

from sklearn.tree import DecisionTreeClassifier # Decision Tree
from sklearn.naive_bayes import MultinomialNB # Naive Bayes

from sklearn.metrics import confusion_matrix, classification_report # Confusion Matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.metrics import roc_curve

In [2]:
# Load the contents of heart.csv (path is relative to the working directory).
# NOTE(review): the frame is named `bank_df` even though the file is heart.csv;
# the name is kept because the following cells reference it.
bank_df = pd.read_csv(filepath_or_buffer='heart.csv')

# Preview the first five rows as a quick sanity check on the load.
bank_df.head(n=5)

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [3]:
# Predictor columns fed to the classifiers in the cells that follow.
feature_columns = ['BMI', 'Smoker', 'MentHlth', 'Age', 'Education', 'Income']
X = bank_df[feature_columns]

In [4]:
# Target label column; in this data it takes the values 0.0 and 1.0.
target_column = 'HeartDiseaseorAttack'
y = bank_df[target_column]

In [5]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=1,
)

In [6]:
# Class balance of the training labels at this point in the notebook
# (SMOTE resampling is applied in a later cell).
y_train.value_counts()

HeartDiseaseorAttack
0.0    160851
1.0     16725
Name: count, dtype: int64

In [7]:
# Baseline: decision tree fitted on the training split as produced by
# train_test_split (no resampling yet).
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split, so an unseeded tree can differ from run to run.
tree = DecisionTreeClassifier(random_state=1)
tree.fit(X_train, y_train)

# Hard class predictions and per-class probabilities on the held-out split.
y_pred = tree.predict(X_test)
y_pred_p = tree.predict_proba(X_test)

In [8]:
# Confusion matrix as a labelled table: rows are the actual classes,
# columns are the predicted classes.
cm_counts = confusion_matrix(y_test, y_pred)
row_labels = ['True[0]', 'True[1]']
col_labels = ['Predict[0]', 'Predict[1]']
confmat = pd.DataFrame(cm_counts, index=row_labels, columns=col_labels)
confmat

Unnamed: 0,Predict[0],Predict[1]
True[0],66505,2431
True[1],6565,603


In [9]:
# Heading line followed by the per-class precision / recall / F1 table
# for the current test-set predictions.
print('Classification Report', classification_report(y_test, y_pred), sep='\n')

Classification Report
              precision    recall  f1-score   support

         0.0       0.91      0.96      0.94     68936
         1.0       0.20      0.08      0.12      7168

    accuracy                           0.88     76104
   macro avg       0.55      0.52      0.53     76104
weighted avg       0.84      0.88      0.86     76104



In [10]:
# Headline scores for the current test-set predictions, three decimals each.
# Each entry pairs the output template with the scikit-learn metric to call.
metric_lines = (
    ('accuracy: %.3f', accuracy_score),
    ('precision: %.3f', precision_score),
    ('recall: %.3f', recall_score),
    ('F1: %.3f', f1_score),
)
for template, scorer in metric_lines:
    print(template % scorer(y_test, y_pred))

accuracy: 0.882
precision: 0.199
recall: 0.084
F1: 0.118


In [11]:
# Balance the classes by synthesising minority-class rows with SMOTE. Only the
# training split is resampled; X_test / y_test are not touched here.
# random_state is pinned so the synthetic rows are the same on every run.
# NOTE: this rebinds X_train / y_train, so re-running an earlier cell after
# this one will see the resampled data rather than the original split.
oversample = SMOTE(random_state=1)
X_train, y_train = oversample.fit_resample(X_train, y_train)

# Confirm both classes now have the same number of training rows.
y_train.value_counts()

HeartDiseaseorAttack
1.0    160851
0.0    160851
Name: count, dtype: int64

In [12]:
# Refit a fresh decision tree on whatever X_train / y_train currently hold
# (the SMOTE-resampled data when the notebook is run top to bottom).
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split, so an unseeded tree can differ from run to run.
tree = DecisionTreeClassifier(random_state=1)
tree.fit(X_train, y_train)

# Hard class predictions and per-class probabilities on the held-out split.
y_pred = tree.predict(X_test)
y_pred_p = tree.predict_proba(X_test)

In [13]:
# Confusion matrix for the latest predictions as a labelled table:
# rows are the actual classes, columns are the predicted classes.
cm_counts = confusion_matrix(y_test, y_pred)
row_labels = ['True[0]', 'True[1]']
col_labels = ['Predict[0]', 'Predict[1]']
confmat = pd.DataFrame(cm_counts, index=row_labels, columns=col_labels)
confmat

Unnamed: 0,Predict[0],Predict[1]
True[0],59346,9590
True[1],5093,2075


In [59]:
# Heading line followed by the per-class precision / recall / F1 table
# for the latest test-set predictions.
print('Classification Report', classification_report(y_test, y_pred), sep='\n')

Classification Report
              precision    recall  f1-score   support

         0.0       0.92      0.86      0.89     68936
         1.0       0.18      0.29      0.22      7168

    accuracy                           0.81     76104
   macro avg       0.55      0.58      0.56     76104
weighted avg       0.85      0.81      0.83     76104



In [60]:
# Headline scores for the latest test-set predictions, three decimals each.
# Each entry pairs the output template with the scikit-learn metric to call.
metric_lines = (
    ('accuracy: %.3f', accuracy_score),
    ('precision: %.3f', precision_score),
    ('recall: %.3f', recall_score),
    ('F1: %.3f', f1_score),
)
for template, scorer in metric_lines:
    print(template % scorer(y_test, y_pred))

accuracy: 0.810
precision: 0.181
recall: 0.290
F1: 0.223
