In [48]:
##Libraries to be imported 
import pandas as pd 
import numpy as np 
from sklearn.metrics import confusion_matrix,roc_auc_score, mean_squared_error,classification_report,roc_curve,accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, RandomizedSearchCV,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier

In [35]:
## Data to be used: parse the CSV a single time and derive both working frames from that read.
## clonedData keeps every original column; only its target is encoded below (Yes -> 1, No -> 0).
clonedData = pd.read_csv('forModel.csv')

## Modelling frame: the same rows without 'Checkup' and the two 'Unnamed' columns.
## drop() returns a new frame, so encoding clonedData afterwards leaves data's raw labels untouched.
data = clonedData.drop(columns=['Checkup','Unnamed: 0.1','Unnamed: 0'])

clonedData['Heart_Disease'] = clonedData['Heart_Disease'].map({'Yes':1,'No':0})
data.head(5)


Unnamed: 0,General_Health,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption,SexBinary
0,Very Good,Yes,No,No,No,No,No,Yes,Female,55-59,170.0,68.04,23.49,No,0.0,90.0,4.0,1.0,0
1,Excellent,No,No,No,No,No,No,No,Female,80+,168.0,77.11,27.44,Yes,30.0,2.0,30.0,0.0,0
2,Good,Yes,No,Yes,No,No,No,Yes,Female,80+,175.0,72.57,23.63,No,0.0,60.0,20.0,1.0,0
3,Very Good,Yes,No,No,No,No,No,No,Male,55-59,180.0,99.79,30.68,No,20.0,30.0,16.0,4.0,1
4,Good,Yes,No,No,No,No,No,Yes,Male,30-34,188.0,70.31,19.9,Yes,24.0,8.0,24.0,20.0,1


### Model Preparation ! 
---

In [36]:
## Start with a structural overview of the frame we will model on:
## its overall size first, then the dtype of every column.
row_count, column_count = data.shape
print(f"The data has {row_count} rows and {column_count} columns")
print(data.dtypes)

The data has 160000 rows and 19 columns
General_Health                   object
Exercise                         object
Heart_Disease                    object
Skin_Cancer                      object
Other_Cancer                     object
Depression                       object
Diabetes                         object
Arthritis                        object
Sex                              object
Age_Category                     object
Height_(cm)                     float64
Weight_(kg)                     float64
BMI                             float64
Smoking_History                  object
Alcohol_Consumption             float64
Fruit_Consumption               float64
Green_Vegetables_Consumption    float64
FriedPotato_Consumption         float64
SexBinary                         int64
dtype: object


---
### Feature Engineering 

In [37]:
## Before encoding anything, list the text (object) columns together with their number of
## distinct values, so we can tell binary categories apart from multi-level ones.
onlyCategorical = data.select_dtypes(include="object")
for column_name, distinct_count in onlyCategorical.nunique().items():
    print(f"The number of unique features that the columns {column_name} has is {distinct_count}")


The number of unique features that the columns General_Health has is 5
The number of unique features that the columns Exercise has is 2
The number of unique features that the columns Heart_Disease has is 2
The number of unique features that the columns Skin_Cancer has is 2
The number of unique features that the columns Other_Cancer has is 2
The number of unique features that the columns Depression has is 2
The number of unique features that the columns Diabetes has is 4
The number of unique features that the columns Arthritis has is 2
The number of unique features that the columns Sex has is 2
The number of unique features that the columns Age_Category has is 13
The number of unique features that the columns Smoking_History has is 2


In [38]:
## Binary categories are encoded with .map and the nominal Age_Category with one-hot encoding.
## Each conversion only runs while its column still holds the raw labels: re-running this cell on
## already-encoded values would otherwise map every entry to NaN (or to 0 for General_Health).
def _still_raw(column):
    """Return True while data[column] has a non-numeric dtype, i.e. it has not been encoded yet."""
    return not pd.api.types.is_numeric_dtype(data[column])

if _still_raw('General_Health'):
    ## Collapse the health levels into a single flag: 'Poor' -> 1, every other level -> 0
    data['General_Health'] = data['General_Health'].apply(lambda x: 'Poor' if x == 'Poor' else 'Good')
    data['General_Health'] = data['General_Health'].map({"Poor":1, "Good":0})
if _still_raw('Sex'):
    data['Sex'] = data['Sex'].map({'Male':1, 'Female':0})
if _still_raw('Diabetes'):
    ## Anything other than a plain 'No' is treated as 'Yes', which makes the column binary
    data['Diabetes'] = data['Diabetes'].apply(lambda x:'No' if x == 'No' else 'Yes')

## Since most of the binary columns hold Yes / No we convert all of them at once
cols = ['Exercise','Heart_Disease','Skin_Cancer','Other_Cancer','Depression','Arthritis','Smoking_History','Diabetes']
for each in cols:
    if _still_raw(each):
        data[each] = data[each].map({'Yes':1,'No':0}) ## Yes -> 1, No -> 0; any other value becomes NaN

## NOTE(review): the displayed rows suggest SexBinary already equals the encoded Sex column -
## confirm and consider dropping one of them to avoid a duplicated feature.
## One-hot encode the only multi-level nominal column; data itself keeps Age_Category.
dummied_data = pd.get_dummies(data, columns=['Age_Category'], dtype = 'int')


### The reason to convert the binary categories using the .map instead of using OneHotEncoding is to minimize the number of columns. 

---

In [39]:
## Separate the dependent variable (Heart_Disease) from the independent variables,
## then hold out 20% of the rows as a test set using a fixed seed.
SEED = 9
y = clonedData['Heart_Disease']
X = dummied_data.drop(columns=['Heart_Disease'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)


### We will select the best model among LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, AdaBoostClassifier, and GradientBoostingClassifier based on the ROC AUC score and other evaluation metrics.

---
### Model One : Logistic Regression

In [40]:
## Pipeline preparation: median imputation -> standardisation -> logistic regression.
## random_state is fixed (as in the decision-tree cell) because liblinear shuffles the data internally.
pipeline_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scalar', StandardScaler()),
    ('logit', LogisticRegression(solver='liblinear', max_iter=1000, random_state=SEED)),
]
logit_pipeline = Pipeline(pipeline_steps)

## Hyperparameter tuning: 2 values of C x 2 penalties = 4 candidates, so n_iter=4 covers them all.
## The search is seeded so the candidate order (and therefore tie-breaking) is reproducible.
params = {'logit__C':[0.01,0.1],'logit__penalty':['l1','l2']}
hyper_tuning = RandomizedSearchCV(
    logit_pipeline,
    param_distributions=params,
    cv=3,
    n_jobs=-1,
    n_iter=4,
    random_state=SEED,
)
hyper_tuning.fit(X_train,y_train)
hyper_tuned_model = hyper_tuning.best_estimator_ ## pipeline refit on the full training split with the best parameters

## Class predictions for both splits, plus positive-class probabilities for the ROC analysis
training_predict = hyper_tuned_model.predict(X_train)
testing_predict = hyper_tuned_model.predict(X_test)
probabilites_predict = hyper_tuned_model.predict_proba(X_test)[:,1]

## Accuracy on both splits (about 0.74 each in the recorded run)
logit_training_score = accuracy_score(y_train,training_predict)
logit_testing_score = accuracy_score(y_test,testing_predict)

## Classification report -> precision, recall and f1-score on the test split
class_report_logit = classification_report(y_test, testing_predict)

## ROC curve points and ROC AUC on the test split (about 0.81 in the recorded run)
logit_fpr, logit_tpr, logit_thresholds = roc_curve(y_test,probabilites_predict)
logit_roc_score = roc_auc_score(y_test,probabilites_predict)


### The logistic regression model has 76% precision, 71% recall, and a 73% f1-score. The training and testing accuracy are equivalent (~74%), which suggests the model has no problem with overfitting or underfitting. The ROC AUC score of the model is 81%, which is good.
---


### Model Two: Decision Tree Classifier 

In [44]:
## Pipeline preparation: median imputation -> standardisation -> class-weighted decision tree
decision_steps = [
    ('Imputer', SimpleImputer(strategy='median')),
    ('Scalar', StandardScaler()),
    ('DecisionTree', DecisionTreeClassifier(random_state=SEED, class_weight='balanced')),
]
decision_pipeline = Pipeline(decision_steps)

## Cross-validated ROC AUC of the untuned pipeline. It is computed on the training split only,
## so the held-out test rows never take part in any fitting or validation fold.
cross_validation = cross_val_score(decision_pipeline, X_train, y_train, cv=5, scoring='roc_auc')

## Hyperparameter tuning: 5 depths x 4 max_features settings = 20 candidates, so n_iter=20 covers them all.
## The search is seeded so the candidate order (and therefore tie-breaking) is reproducible.
params = {'DecisionTree__max_depth':[3,4,5,6,7], 'DecisionTree__max_features':['sqrt','log2',None, 0.5]}
decision_hyper_tuning = RandomizedSearchCV(
    decision_pipeline,
    param_distributions=params,
    cv=5,
    n_jobs=-1,
    n_iter=20,
    random_state=SEED,
)
decision_hyper_tuning.fit(X_train,y_train)
best_decision_model = decision_hyper_tuning.best_estimator_

## Class predictions for both splits, plus positive-class probabilities for the ROC analysis
decision_training_predict = best_decision_model.predict(X_train) # predictions on the training split
decision_testing_predict = best_decision_model.predict(X_test) # predictions on the test split
decision_predicited_proba = best_decision_model.predict_proba(X_test)[:,1]

## Accuracy on both splits (about 0.72 each in the recorded run)
decision_testing_accuracy_score = accuracy_score(y_test, decision_testing_predict)
decision_training_accuracy_score = accuracy_score(y_train, decision_training_predict)

## Classification report -> analysis of precision, recall and f1-score on the test split
decision_classification = classification_report(y_test, decision_testing_predict)

## ROC curve points and ROC AUC on the test split (about 0.78 in the recorded run)
decision_fpr, decision_tpr, decision_thresholds = roc_curve(y_test, decision_predicited_proba)
decision_roc_score = roc_auc_score(y_test, decision_predicited_proba)

### The decision tree model has 75% precision, 67% recall, and a 71% f1-score. The training and testing accuracy are equivalent (~72%), which suggests the model has no problem with overfitting or underfitting. The ROC AUC score of the model is 78%.
---


## Model Three: Bagging Classifier 

In [47]:
## Pipeline Preparation 
decision_steps = [("imputer",SimpleImputer()),("decision_tree",DecisionTreeClassifier())]