In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scikitplot as skplt
from imblearn.over_sampling import SMOTE

import pandas as pd
from sklearn.preprocessing import StandardScaler # stanardization
from sklearn.preprocessing import LabelEncoder # Label --> Number
from sklearn.preprocessing import minmax_scale

from sklearn.model_selection import train_test_split, cross_val_predict # Training/Test split

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

from sklearn.linear_model import LogisticRegression #LR
import statsmodels.api as sm

from sklearn.tree import DecisionTreeClassifier # Decision Tree
from sklearn.naive_bayes import MultinomialNB # Naive Bayes

from sklearn.svm import LinearSVC, SVC #SVM

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.metrics import roc_curve

### Question 1: Load the "heart.csv" dataset.

In [2]:
# Load the data: read heart.csv (path relative to the working directory) into a DataFrame
heart_df = pd.read_csv('heart.csv')
# Preview the first rows as this cell's displayed output
heart_df.head()

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


### Question 2: Split features into input and output.

In [3]:
# Input: the predictor columns selected as model features
feature_columns = ['BMI', 'Smoker', 'MentHlth', 'Age', 'Education', 'Income']
X = heart_df[feature_columns]
X.head()

Unnamed: 0,BMI,Smoker,MentHlth,Age,Education,Income
0,40.0,1.0,18.0,9.0,4.0,3.0
1,25.0,1.0,0.0,7.0,6.0,1.0
2,28.0,0.0,30.0,9.0,4.0,8.0
3,27.0,0.0,0.0,11.0,3.0,6.0
4,24.0,0.0,3.0,11.0,5.0,4.0


In [4]:
# Output: the label column the classifiers are trained to predict
target_column = 'HeartDiseaseorAttack'
y = heart_df[target_column]
y.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: HeartDiseaseorAttack, dtype: float64

### Question 3: Split data into training and test data

In [5]:
# Hold out 20% of the rows as the test set. Stratifying on y keeps the class
# proportions the same in both partitions, and random_state pins the shuffle
# so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

### Question 4: Use Naïve Bayes, draw a confusion matrix and show the f1-score

In [6]:
# Multinomial Naive Bayes baseline on the original training split.
# alpha=1.0 is the smoothing value, written out explicitly.
mnb = MultinomialNB(alpha=1.0).fit(X_train, y_train)
y_pred_NB = mnb.predict(X_test)

# Confusion matrix as a labelled table: rows = actual class, columns = predicted class.
confmat_NB = pd.DataFrame(
    data=confusion_matrix(y_test, y_pred_NB),
    index=['True[0]', 'True[1]'],
    columns=['Predict[0]', 'Predict[1]'],
)
print(confmat_NB)

# Headline metrics on the held-out test set, three decimals each.
print('accuracy: %.3f' % accuracy_score(y_test, y_pred_NB))
for fmt, scorer in (
    ('precision: %.3f', precision_score),
    ('recall: %.3f', recall_score),
    ('F1: %.3f', f1_score),
):
    print(fmt % scorer(y_true=y_test, y_pred=y_pred_NB))

         Predict[0]  Predict[1]
True[0]       41380        4577
True[1]        3899         880
accuracy: 0.833
precision: 0.161
recall: 0.184
F1: 0.172


### Question 5: Use Logistic Regression, draw a confusion matrix and show the f1-score

In [7]:
# Baseline logistic regression with scikit-learn's default hyper-parameters,
# trained on the original (imbalanced) training split.
logistic = LogisticRegression()
logistic.fit(X_train, y_train)
y_pred_LR = logistic.predict(X_test)

# Construct the confusion matrix: rows are the actual class, columns the predicted class.
confmat_LR = pd.DataFrame(confusion_matrix(y_test, y_pred_LR),
                      index=['True[0]','True[1]'],
                      columns=['Predict[0]', 'Predict[1]'])
print(confmat_LR)
print('accuracy: %.3f' % accuracy_score(y_test, y_pred_LR))
# This model flags very few rows as positive. zero_division=0 (the same setting
# the SVM cell uses) keeps precision/recall/F1 defined as 0 instead of raising
# an UndefinedMetricWarning should a split ever produce no positive predictions.
# The reported values are unchanged whenever at least one positive is predicted.
print('precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred_LR, zero_division=0))
print('recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred_LR, zero_division=0))
print('F1: %.3f' % f1_score(y_true=y_test, y_pred=y_pred_LR, zero_division=0))

         Predict[0]  Predict[1]
True[0]       45912          45
True[1]        4745          34
accuracy: 0.906
precision: 0.430
recall: 0.007
F1: 0.014


### Question 6: Use SVM, draw a confusion matrix and show the f1-score

In [8]:
# Linear SVM baseline on the original training split. dual=False is set
# explicitly (solve the primal problem); every other hyper-parameter is left
# at its library default.
svm = LinearSVC(dual=False).fit(X_train, y_train)
y_pred_SVM = svm.predict(X_test)

# Confusion matrix as a labelled table: rows = actual class, columns = predicted class.
confmat_SVM = pd.DataFrame(
    data=confusion_matrix(y_test, y_pred_SVM),
    index=['True[0]', 'True[1]'],
    columns=['Predict[0]', 'Predict[1]'],
)
print(confmat_SVM)

# zero_division=0 reports 0 (without a warning) when no positives are predicted.
print('accuracy: %.3f' % accuracy_score(y_test, y_pred_SVM))
for fmt, scorer in (
    ('precision: %.3f', precision_score),
    ('recall: %.3f', recall_score),
    ('F1: %.3f', f1_score),
):
    print(fmt % scorer(y_true=y_test, y_pred=y_pred_SVM, zero_division=0))

         Predict[0]  Predict[1]
True[0]       45957           0
True[1]        4779           0
accuracy: 0.906
precision: 0.000
recall: 0.000
F1: 0.000


### Question 7: Use SMOTE and repeat the process

In [9]:
# Oversample with SMOTE on the TRAINING split only; X_test / y_test are not
# resampled. random_state=1 fixes the generated samples for reproducibility.
smote = SMOTE(random_state=1)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
# Check the class counts after resampling
y_train_smote.value_counts()

HeartDiseaseorAttack
0.0    183830
1.0    183830
Name: count, dtype: int64

In [10]:
# Naive Bayes after SMOTE: same model settings as the baseline, but fitted on
# the resampled training data and scored on the untouched test split.
mnb_smote = MultinomialNB(alpha=1.0).fit(X_train_smote, y_train_smote)
y_pred_NB_smote = mnb_smote.predict(X_test)

# Rows = actual class, columns = predicted class.
confmat_NB_smote = pd.DataFrame(
    data=confusion_matrix(y_test, y_pred_NB_smote),
    index=['True[0]', 'True[1]'],
    columns=['Predict[0]', 'Predict[1]'],
)
print(confmat_NB_smote)

print('accuracy: %.3f' % accuracy_score(y_test, y_pred_NB_smote))
for fmt, scorer in (
    ('precision: %.3f', precision_score),
    ('recall: %.3f', recall_score),
    ('F1: %.3f', f1_score),
):
    print(fmt % scorer(y_true=y_test, y_pred=y_pred_NB_smote))

         Predict[0]  Predict[1]
True[0]       36962        8995
True[1]        3039        1740
accuracy: 0.763
precision: 0.162
recall: 0.364
F1: 0.224


In [11]:
# Logistic Regression after SMOTE: default hyper-parameters, fitted on the
# resampled training data and scored on the untouched test split.
logistic_smote = LogisticRegression().fit(X_train_smote, y_train_smote)
y_pred_LR_smote = logistic_smote.predict(X_test)

# Rows = actual class, columns = predicted class.
confmat_LR_smote = pd.DataFrame(
    data=confusion_matrix(y_test, y_pred_LR_smote),
    index=['True[0]', 'True[1]'],
    columns=['Predict[0]', 'Predict[1]'],
)
print(confmat_LR_smote)

print('accuracy: %.3f' % accuracy_score(y_test, y_pred_LR_smote))
for fmt, scorer in (
    ('precision: %.3f', precision_score),
    ('recall: %.3f', recall_score),
    ('F1: %.3f', f1_score),
):
    print(fmt % scorer(y_true=y_test, y_pred=y_pred_LR_smote))

         Predict[0]  Predict[1]
True[0]       30643       15314
True[1]        1255        3524
accuracy: 0.673
precision: 0.187
recall: 0.737
F1: 0.298


In [12]:
# SVM after SMOTE: LinearSVC with dual=False (as in the baseline cell), fitted
# on the resampled training data and scored on the untouched test split.
svm_smote = LinearSVC(dual=False).fit(X_train_smote, y_train_smote)
y_pred_SVM_smote = svm_smote.predict(X_test)

# Rows = actual class, columns = predicted class.
confmat_SVM_smote = pd.DataFrame(
    data=confusion_matrix(y_test, y_pred_SVM_smote),
    index=['True[0]', 'True[1]'],
    columns=['Predict[0]', 'Predict[1]'],
)
print(confmat_SVM_smote)

print('accuracy: %.3f' % accuracy_score(y_test, y_pred_SVM_smote))
for fmt, scorer in (
    ('precision: %.3f', precision_score),
    ('recall: %.3f', recall_score),
    ('F1: %.3f', f1_score),
):
    print(fmt % scorer(y_true=y_test, y_pred=y_pred_SVM_smote))

         Predict[0]  Predict[1]
True[0]       30168       15789
True[1]        1199        3580
accuracy: 0.665
precision: 0.185
recall: 0.749
F1: 0.297


### Question 8: Choose Logistic Regression and improve the model

In [13]:
# Tuned logistic regression, fitted on the ORIGINAL (non-SMOTE) training split.
# It uses an elastic-net penalty (which requires the saga solver) with an even
# L1/L2 mix, and weights the positive class more heavily instead of resampling.
# NOTE(review): the 7.2 weight looks hand-tuned; the notebook does not show how
# it was chosen - confirm it was not selected by peeking at the test split.
logistic_1 = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    l1_ratio=0.5,
    class_weight={0: 1, 1: 7.2},
    max_iter=10000,
)
logistic_1.fit(X_train, y_train)
y_pred_LR_1 = logistic_1.predict(X_test)

# Rows = actual class, columns = predicted class.
confmat_LR_1 = pd.DataFrame(
    data=confusion_matrix(y_test, y_pred_LR_1),
    index=['True[0]', 'True[1]'],
    columns=['Predict[0]', 'Predict[1]'],
)
print(confmat_LR_1)

print('accuracy: %.3f' % accuracy_score(y_test, y_pred_LR_1))
for fmt, scorer in (
    ('precision: %.3f', precision_score),
    ('recall: %.3f', recall_score),
    ('F1: %.3f', f1_score),
):
    print(fmt % scorer(y_true=y_test, y_pred=y_pred_LR_1))

         Predict[0]  Predict[1]
True[0]       34363       11594
True[1]        1710        3069
accuracy: 0.738
precision: 0.209
recall: 0.642
F1: 0.316
