In [None]:
import os
import numpy as np 
import pandas as pd 
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
warnings.filterwarnings("ignore")
pd.set_option("display.max_rows",50)
from sklearn import preprocessing
import matplotlib as mpl
mpl.style.use('ggplot')
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
import keras
from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
import keras.backend as K
import tensorflow as tf

sns.set_style('darkgrid')
mpl.rcParams['figure.figsize'] = [15,10]

In [None]:
df = pd.read_csv("../input/heart-failure-prediction/heart.csv")

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.ST_Slope.value_counts()

In [None]:
df.ExerciseAngina.value_counts()

In [None]:
df.RestingECG.value_counts()

In [None]:
df.ChestPainType.value_counts()

In [None]:
df.isnull().sum()

In [None]:
df.describe()

# Check the correlation between the features

In [None]:
# Correlation is only defined for numeric data. The frame also holds object-dtype
# columns (they are selected and encoded further down), and pandas >= 2.0 raises
# on them instead of silently dropping them, so select the numeric columns
# explicitly. This form works on old and new pandas versions alike.
correlation = df.select_dtypes(include=np.number).corr()
sns.heatmap(correlation, annot=True, cmap='RdBu')

In [None]:
# Store the target as a categorical column so the plots below treat it as discrete groups.
df['HeartDisease'] = pd.Categorical(df['HeartDisease'])

# Data Visualization

In [None]:
sns.histplot(data = df ,x = 'Age', hue = 'HeartDisease')
plt.title("Distribution of Heart Diseases over Age")

In [None]:
px.histogram(data_frame = df, x = 'Sex', color = 'HeartDisease', title="Distribution of Heart Diseases over Sex",barmode="group")

In [None]:
sns.histplot(data = df ,x = 'RestingECG', hue = 'HeartDisease',palette = 'Set1')
plt.title("Distribution of Heart Diseases over RestingECG")

In [None]:
px.histogram(data_frame = df, x = 'ChestPainType', color = 'HeartDisease', title="Distribution of Heart Diseases over ChestPainType",barmode="group")


In [None]:
# pairplot is a figure-level function: it creates its own grid of axes and
# already adds a legend for the hue variable.
sns.pairplot(df, hue="HeartDisease")
# Title the whole figure. plt.title would only label the last panel, and
# plt.legend("HeartDisease") would use each character of the string as a
# separate legend label, so neither is called here.
plt.suptitle("Looking for Insites in Data", y=1.02)
plt.show()

# **Check the Distribution and Skewness of the features**

In [None]:
# One histogram (with a KDE overlay) per column, all on one figure.
# The grid height is derived from the number of columns, so adding a column to
# the frame does not overflow a hard-coded 4x3 layout.
n_grid_cols = 3
n_grid_rows = -(-len(df.columns) // n_grid_cols)  # ceiling division
plt.figure(figsize=(15,10))
for i,col in enumerate(df.columns,1):
    plt.subplot(n_grid_rows,n_grid_cols,i)
    plt.title(f"Distribution of {col} Data")
    sns.histplot(df[col],kde=True)
# Lay the panels out once, after they all exist, instead of on every iteration.
plt.tight_layout()
plt.show()
    

# Check the Outliers using Box Plots

In [None]:
fig = px.box(df,y="RestingBP",x="HeartDisease",title=f"Distrubution of RestingBP")
fig.show()

In [None]:
fig = px.box(df,y="Age",x="HeartDisease",title=f"Distrubution of Age",color="Sex")
fig.show()

In [None]:
fig = px.box(df,y="Cholesterol",x="HeartDisease",title=f"Distrubution of Cholesterol",color="Sex")
fig.show()

In [None]:
fig = px.box(df,y="Oldpeak",x="HeartDisease",title=f"Distrubution of Oldpeak")
fig.show()

In [None]:
fig = px.box(df,y="MaxHR",x="HeartDisease",title=f"Distrubution of MaxHR",color="Sex")
fig.show()

# RobustScaler will be used so that outliers do not cause training problems for the non-tree algorithms

In [None]:
numeric_columns = df.select_dtypes(include=np.number).columns.tolist()
numeric_columns

In [None]:
object_columns = df.select_dtypes(include='object').columns.tolist()
object_columns

In [None]:
features = ['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS','RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope']
target = 'HeartDisease'

In [None]:
X = df[features]
y = df[target]

In [None]:
# Label-encode only the object-dtype (string) columns.
# Applying LabelEncoder to every column would also replace the numeric features
# with the rank of each distinct value, discarding their real magnitudes.
# Rebuilding X from df keeps the cell idempotent and avoids writing into a
# slice of the original frame.
encoder = LabelEncoder()
X = df[features].copy()
for column in X.select_dtypes(include='object').columns:
    X[column] = encoder.fit_transform(X[column])
X

In [None]:
# Hold out 15% of the rows for testing, keeping the class ratio in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
    stratify=y,
)

# Tree Algorithms

In [None]:
# Column names of the training frame used by the tree models.
feature_col_tree = list(X_train.columns)

In [None]:
# Imports and plot configuration for the tree-model section.
from sklearn.metrics import confusion_matrix 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_validate
from sklearn import metrics   # Additional scikit-learn metric functions
from sklearn.model_selection import GridSearchCV   # Performing grid search
# NOTE: this rebinds `plt`, which the first cell bound to matplotlib.pyplot.
import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
# Default figure size for the figures created from here on.
rcParams['figure.figsize'] = 15, 15
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Feature columns passed to modelfit for the tree models (the training frame's column names).
predictors = feature_col_tree
def modelfit(alg, dtrain, predictors, performCV=True, printFeatureImportance=True, cv_folds=10, target=None):
    """Fit ``alg`` on the training frame and print a short training report.

    Parameters
    ----------
    alg : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in place.
    dtrain : pandas.DataFrame
        Training features.
    predictors : sequence of str
        Columns of ``dtrain`` that are passed to the estimator.
    performCV : bool, default True
        Also run ``cross_validate`` with accuracy scoring and print fold statistics.
    printFeatureImportance : bool, default True
        Plot ``alg.feature_importances_`` as a bar chart when the fitted
        estimator exposes that attribute.
    cv_folds : int, default 10
        Passed to ``cross_validate`` as ``cv``.
    target : array-like, optional
        Labels for the rows of ``dtrain``. When omitted, the notebook-level
        ``y_train`` is used, which is what existing callers rely on.

    Returns
    -------
    None
        The report is printed and, optionally, plotted.
    """
    # Fall back to the notebook global only when no labels were passed explicitly.
    if target is None:
        target = y_train

    train_features = dtrain[predictors]

    # Fit the algorithm on the data
    alg.fit(train_features, target)

    # Accuracy on the data the model was fitted on (optimistic by construction).
    train_predictions = alg.predict(train_features)
    accuracy = metrics.accuracy_score(np.array(target), np.array(train_predictions))

    # Print model report:
    print("\nModel Report")
    print("Accuracy : \n" + f'{accuracy}')

    # Perform cross-validation and summarise the per-fold accuracies instead of
    # dumping the raw result dict (which also contains fit and score timings).
    if performCV:
        cv_result = cross_validate(alg, train_features, target, cv=cv_folds, scoring='accuracy')
        fold_scores = np.asarray(cv_result['test_score'])
        print(
            "CV Score : "
            f"mean={fold_scores.mean():.4f} | std={fold_scores.std():.4f} | "
            f"min={fold_scores.min():.4f} | max={fold_scores.max():.4f}"
        )

    # Print Feature Importance (only tree-style estimators expose it):
    if printFeatureImportance:
        if hasattr(alg, 'feature_importances_'):
            feat_imp = pd.Series(alg.feature_importances_, predictors).sort_values(ascending=False)
            importance_ax = feat_imp.plot(kind='bar', title='Feature Importances')
            importance_ax.set_ylabel('Feature Importance Score')
        else:
            print(f"Feature importances are not available for {type(alg).__name__}")

# Gradient Boosting Trees

In [None]:
# Baseline: gradient boosting with default hyper-parameters on all predictor columns.
gbc = GradientBoostingClassifier(random_state=10)
modelfit(alg=gbc, dtrain=X_train, predictors=predictors)

In [None]:
param_test = {'max_features':range(2,6,2)}
gsearch = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators = 60,max_depth = 9, min_samples_split = 15, subsample = 0.8, random_state=10),
param_grid = param_test, scoring='accuracy',n_jobs = -1,iid=False, cv=10)
gsearch.fit(X_train ,y_train)

In [None]:
modelfit(gsearch.best_estimator_, X_train, predictors)

In [None]:
param_test2 = {'min_samples_split':range(10,500,20),'learning_rate':np.arange(0.02, 0.1, 0.02),'max_features':range(2,6,2)}
gsearch2 = GridSearchCV(estimator = GradientBoostingClassifier(max_features=4,n_estimators=100,max_depth=10,min_samples_leaf=40, random_state=10),
                        param_grid = param_test2, 
                        scoring='accuracy',
                        iid=False,
                        cv=10)
gsearch2.fit(X_train ,y_train)

In [None]:
modelfit(gsearch2.best_estimator_, X_train, predictors)

# Random Forest Classifier

In [None]:
rf_param_test3 = {'min_samples_split':range(10,100,15),'min_samples_leaf':range(2,20,4),'bootstrap':[True,False],'max_depth':range(3,15,3)}
rf_gsearch = GridSearchCV(estimator = RandomForestClassifier(max_features=10,n_estimators=100, random_state=10),
                        param_grid = rf_param_test3 , 
                        scoring='accuracy',
                        iid=False,
                        cv=10)
rf_gsearch.fit(X_train,y_train)

In [None]:
modelfit(rf_gsearch.best_estimator_, X_train, predictors)

# Try the best tree model on test data

In [None]:
Y_pred = gsearch2.predict(X_test)

In [None]:
cm = confusion_matrix(Y_pred,y_test)
plt.figure()
sns.heatmap(cm, annot = True,fmt='d' ,cmap='RdBu')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.show()

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, Y_pred, target_names = ['Safe','Failure']))

# Non-Tree Algorithms

**One Hot Encoding**

In [None]:
## Creating one-hot encoded features for the non-tree based algorithms.
# Build the dummies from the untouched feature columns of df rather than from X:
# X has already been label-encoded for the tree models, so dummies derived from
# it would carry integer codes instead of readable category names (and would
# inherit any rank-encoding applied to the numeric columns).
# dtype=int gives 0/1 integer indicators on every pandas version.
X2 = pd.get_dummies(df[features], columns=object_columns, drop_first=False, dtype=int)
X2.head()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size=0.15, random_state=42,stratify = y)

In [None]:
X_train

**Robust Scaler**

In [None]:
scaler = preprocessing.RobustScaler()
scaler = scaler.fit(X_train)

In [None]:
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
X_train = pd.DataFrame(X_train, columns = X2.columns)
X_test = pd.DataFrame(X_test, columns = X2.columns)

In [None]:
X_train

In [None]:
predictors = X_train.columns

# K-NN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier



rf_param_test4 = {'n_neighbors':range(3,12,2)}
knn_gsearch = GridSearchCV(estimator = KNeighborsClassifier(),
                        param_grid = rf_param_test4 , 
                        scoring='accuracy',
                        iid=False,
                        cv=10)
knn_gsearch.fit(X_train,y_train)

In [None]:
modelfit(knn_gsearch.best_estimator_, X_train, predictors, printFeatureImportance = False)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression


rf_param_test5 = {'warm_start':[True,False]}
LR_gsearch = GridSearchCV(estimator = LogisticRegression(),
                        param_grid = rf_param_test5 , 
                        scoring='accuracy',
                        iid=False,
                        cv=10)
LR_gsearch.fit(X_train,y_train)

In [None]:
modelfit(LR_gsearch.best_estimator_, X_train, predictors, printFeatureImportance = False)

In [None]:
from sklearn.svm import SVC

rf_param_test6 = {'C':[1,0.5,2],'kernel':['linear', 'poly', 'rbf', 'sigmoid']}
svc_gsearch = GridSearchCV(estimator = SVC( probability= True),
                        param_grid = rf_param_test6 , 
                        scoring='accuracy',
                        iid=False,
                        cv=10)
svc_gsearch.fit(X_train,y_train)


In [None]:
modelfit(svc_gsearch.best_estimator_, X_train, predictors, printFeatureImportance = False)

# Deep Learning - Artificial Neural Networks

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation,Dropout

In [None]:
# Display the scaled training features that the network is fitted on.
X_train

In [None]:
# Display the scaled test features.
X_test

In [None]:
model = Sequential()


model.add(Dense(units=20,activation='relu'))

model.add(Dense(units=15,activation='relu'))
model.add(Dense(units=10,activation='relu'))
model.add(Dense(units=5,activation='relu'))


model.add(Dense(units=1,activation='sigmoid'))
callback = tf.keras.callbacks.EarlyStopping(monitor='accuracy', patience=4)

# For a binary classification problem
model.compile(loss='binary_crossentropy', optimizer='adam',metrics= 'accuracy')

In [None]:
model.fit(X_train, 
          y_train, 
          epochs=100,
          callbacks = [callback],
          validation_data=(X_test, y_test), verbose=1
          )

# It looks like the two best models for this dataset are gradient-boosted trees and the ANN :)