In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scikitplot as skplt
from imblearn.over_sampling import SMOTE

import pandas as pd
from sklearn.preprocessing import StandardScaler # stanardization
from sklearn.preprocessing import LabelEncoder # Label --> Number
from sklearn.preprocessing import minmax_scale

from sklearn.model_selection import train_test_split, cross_val_predict # Training/Test split

import statsmodels.api as sm


from sklearn.tree import DecisionTreeClassifier,plot_tree # Decision Tree
from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.metrics import roc_curve

### Question 1: Load the "heart.csv" dataset.

In [2]:
# Load the data
heart_df = pd.read_csv('heart.csv')
heart_df.head()

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


### Question 2: Split features into input and output.

In [3]:
# Input
X = heart_df[['BMI', 'Smoker', 'MentHlth', 'Age', 'Education', 'Income']]
X.head()

Unnamed: 0,BMI,Smoker,MentHlth,Age,Education,Income
0,40.0,1.0,18.0,9.0,4.0,3.0
1,25.0,1.0,0.0,7.0,6.0,1.0
2,28.0,0.0,30.0,9.0,4.0,8.0
3,27.0,0.0,0.0,11.0,3.0,6.0
4,24.0,0.0,3.0,11.0,5.0,4.0


In [4]:
# Output 
y = heart_df['HeartDiseaseorAttack']
y.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: HeartDiseaseorAttack, dtype: float64

### Question 3: Split data into training and test data

In [5]:
# Split training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

### Question 4: Use Random Forest with different estimators

In [6]:
# n_estimators = 10
RF_10 = RandomForestClassifier(n_estimators=10)
RF_10.fit(X_train, y_train)

# n_estimators = 50
RF_50 = RandomForestClassifier(n_estimators=50)
RF_50.fit(X_train, y_train)

# n_estimators = 100
RF_100 = RandomForestClassifier(n_estimators=100)
RF_100.fit(X_train, y_train)

# n_estimators = 200
RF_200 = RandomForestClassifier(n_estimators=200)
RF_200.fit(X_train, y_train)

# n_estimators = 500
RF_500 = RandomForestClassifier(n_estimators=500)
RF_500.fit(X_train, y_train)

# n_estimators = 1000
RF_1000 = RandomForestClassifier(n_estimators=1000)
RF_1000.fit(X_train, y_train)

### Compare f1 scores for training data

In [7]:
y1_pred_10 = RF_10.predict(X_train)
y1_pred_50 = RF_50.predict(X_train)
y1_pred_100 = RF_100.predict(X_train)
y1_pred_200 = RF_200.predict(X_train)
y1_pred_500 = RF_500.predict(X_train)
y1_pred_1000 = RF_1000.predict(X_train)

print('F1_10: %.3f' % f1_score(y_true=y_train, y_pred=y1_pred_10))
print('F1_50: %.3f' % f1_score(y_true=y_train, y_pred=y1_pred_50))
print('F1_100: %.3f' % f1_score(y_true=y_train, y_pred=y1_pred_100))
print('F1_200: %.3f' % f1_score(y_true=y_train, y_pred=y1_pred_200))
print('F1_500: %.3f' % f1_score(y_true=y_train, y_pred=y1_pred_500))
print('F1_1000: %.3f' % f1_score(y_true=y_train, y_pred=y1_pred_1000))

F1_10: 0.453
F1_50: 0.481
F1_100: 0.480
F1_200: 0.476
F1_500: 0.472
F1_1000: 0.472


### Compare f1 scores for test data

In [8]:
y2_pred_10 = RF_10.predict(X_test)
y2_pred_50 = RF_50.predict(X_test)
y2_pred_100 = RF_100.predict(X_test)
y2_pred_200 = RF_200.predict(X_test)
y2_pred_500 = RF_500.predict(X_test)
y2_pred_1000 = RF_1000.predict(X_test)

print('F1_10: %.3f' % f1_score(y_true=y_test, y_pred=y2_pred_10))
print('F1_50: %.3f' % f1_score(y_true=y_test, y_pred=y2_pred_50))
print('F1_100: %.3f' % f1_score(y_true=y_test, y_pred=y2_pred_100))
print('F1_200: %.3f' % f1_score(y_true=y_test, y_pred=y2_pred_200))
print('F1_500: %.3f' % f1_score(y_true=y_test, y_pred=y2_pred_500))
print('F1_1000: %.3f' % f1_score(y_true=y_test, y_pred=y2_pred_1000))

F1_10: 0.092
F1_50: 0.085
F1_100: 0.086
F1_200: 0.081
F1_500: 0.084
F1_1000: 0.082


### Question 5: Use XGBoost with different estimators

In [9]:
# n_estimators = 10
XG_10 = XGBClassifier(n_estimators=10)
XG_10.fit(X_train,y_train)

# n_estimators = 50
XG_50 = XGBClassifier(n_estimators=50)
XG_50.fit(X_train,y_train)

# n_estimators = 100
XG_100 = XGBClassifier(n_estimators=100)
XG_100.fit(X_train,y_train)

# n_estimators = 200
XG_200 = XGBClassifier(n_estimators=200)
XG_200.fit(X_train,y_train)

# n_estimators = 500
XG_500 = XGBClassifier(n_estimators=500)
XG_500.fit(X_train,y_train)

# n_estimators = 1000
XG_1000 = XGBClassifier(n_estimators=1000)
XG_1000.fit(X_train,y_train)

### Compare f1 scores for training data

In [10]:
y1_pred_10_XG = XG_10.predict(X_train)
y1_pred_50_XG = XG_50.predict(X_train)
y1_pred_100_XG = XG_100.predict(X_train)
y1_pred_200_XG = XG_200.predict(X_train)
y1_pred_500_XG= XG_500.predict(X_train)
y1_pred_1000_XG = XG_1000.predict(X_train)

print('F1_10: %.3f' % f1_score(y_true=y_train, y_pred=y1_pred_10_XG))
print('F1_50: %.3f' % f1_score(y_true=y_train, y_pred=y1_pred_50_XG))
print('F1_100: %.3f' % f1_score(y_true=y_train, y_pred=y1_pred_100_XG))
print('F1_200: %.3f' % f1_score(y_true=y_train, y_pred=y1_pred_200_XG))
print('F1_500: %.3f' % f1_score(y_true=y_train, y_pred=y1_pred_500_XG))
print('F1_1000: %.3f' % f1_score(y_true=y_train, y_pred=y1_pred_1000_XG))

F1_10: 0.001
F1_50: 0.012
F1_100: 0.024
F1_200: 0.047
F1_500: 0.105
F1_1000: 0.172


### Compare f1 scores for test data

In [11]:
y2_pred_10_XG = XG_10.predict(X_test)
y2_pred_50_XG = XG_50.predict(X_test)
y2_pred_100_XG = XG_100.predict(X_test)
y2_pred_200_XG = XG_200.predict(X_test)
y2_pred_500_XG = XG_500.predict(X_test)
y2_pred_1000_XG = XG_1000.predict(X_test)

print('F1_10: %.3f' % f1_score(y_true=y_test, y_pred=y2_pred_10_XG))
print('F1_50: %.3f' % f1_score(y_true=y_test, y_pred=y2_pred_50_XG))
print('F1_100: %.3f' % f1_score(y_true=y_test, y_pred=y2_pred_100_XG))
print('F1_200: %.3f' % f1_score(y_true=y_test, y_pred=y2_pred_200_XG))
print('F1_500: %.3f' % f1_score(y_true=y_test, y_pred=y2_pred_500_XG))
print('F1_1000: %.3f' % f1_score(y_true=y_test, y_pred=y2_pred_1000_XG))

F1_10: 0.000
F1_50: 0.003
F1_100: 0.008
F1_200: 0.012
F1_500: 0.026
F1_1000: 0.042


### Conclusion: For Ramdom Forest, the best f1 score for training data (0.481) is generated by n_estimators = 50; the best f1 socre for testing data (0.092) is generated by n_estimators = 10. For XGBoost, the best f1 score for training data (0.172) is generated by n_estimators = 1000; the best f1 socre for testing data (0.042) is generated by n_estimators = 1.