In [332]:
##Libraries to be imported 
import pandas as pd 
import numpy as np 
from sklearn.metrics import confusion_matrix,roc_auc_score, mean_squared_error,classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer


In [333]:
##Data to be used
data = pd.read_csv('forModel.csv')
data.drop(['Checkup','Unnamed: 0.1','Unnamed: 0'], axis = 1, inplace =True)

##Cloned Data 
clonedData = pd.read_csv('forModel.csv')
clonedData['Heart_Disease'] = clonedData['Heart_Disease'].map({'Yes':1,'No':0})


### Model Preparation ! 
---

In [334]:
## lets first evaluate the datas that we will be using ! 

print(f"The data has {data.shape[0]} rows and {data.shape[1]} columns")
print(data.dtypes)

The data has 160000 rows and 19 columns
General_Health                   object
Exercise                         object
Heart_Disease                    object
Skin_Cancer                      object
Other_Cancer                     object
Depression                       object
Diabetes                         object
Arthritis                        object
Sex                              object
Age_Category                     object
Height_(cm)                     float64
Weight_(kg)                     float64
BMI                             float64
Smoking_History                  object
Alcohol_Consumption             float64
Fruit_Consumption               float64
Green_Vegetables_Consumption    float64
FriedPotato_Consumption         float64
SexBinary                         int64
dtype: object


---
### Feature Engineering 

In [335]:
## Lets convert few columns in order to create few 
##Lets use the One Hot Encoder to change all the categorical data into the numerical datatypes 
onlyCategorical = data.select_dtypes("object")
for everything in onlyCategorical.columns:
    print(f"The number of unique features that the columns {everything} has is {data[everything].nunique()}")


The number of unique features that the columns General_Health has is 5
The number of unique features that the columns Exercise has is 2
The number of unique features that the columns Heart_Disease has is 2
The number of unique features that the columns Skin_Cancer has is 2
The number of unique features that the columns Other_Cancer has is 2
The number of unique features that the columns Depression has is 2
The number of unique features that the columns Diabetes has is 4
The number of unique features that the columns Arthritis has is 2
The number of unique features that the columns Sex has is 2
The number of unique features that the columns Age_Category has is 13
The number of unique features that the columns Smoking_History has is 2


In [336]:
## As seen lets convert the binary category using the .map and nominal category using the onehotencoding 
data['General_Health'] = data['General_Health'].apply(lambda x: 'Poor' if x == 'Poor' else 'Good')
data['General_Health'] = data['General_Health'].map({"Poor":1, "Good":0})
data['Sex'] = data['Sex'].map({'Male':1, 'Female':0})
data['Diabetes'] = data['Diabetes'].apply(lambda x:'No' if x == 'No' else 'Yes')
## Since most of the Binary have Yes and No we will convert all of them at once 
cols = ['Exercise','Heart_Disease','Skin_Cancer','Other_Cancer','Depression','Arthritis','Smoking_History','Diabetes']
for each in cols:
    data[each] = data[each].map({'Yes':1,'No':0}) ##This converts the whole data into the binary 

dummiedData = pd.get_dummies(data, columns=['Age_Category'], dtype = 'int')


### The reason to convert the binary categories using the .map instead of using OneHotEncoding is to minimize the number of columns. 

In [337]:
## Lets fit and transform the data to simple imputer in order to avoid any null values
imputer = SimpleImputer(strategy='median')
imputed = imputer.fit_transform(dummiedData)
imputedDataFrame = pd.DataFrame(imputed, columns =dummiedData.columns)


## Lets use the Standard Scalar 
standard = StandardScaler()
scaled = standard.fit_transform(imputedDataFrame)
scaledData = pd.DataFrame(scaled, columns=imputedDataFrame.columns)

### Dimension Reduction 

In [338]:
X = scaledData.drop('Heart_Disease', axis = 1).values 
y = clonedData['Heart_Disease'].values
X_train,X_test, y_train, y_test = train_test_split(X, y , test_size = 0.2, random_state=32)

dimensionReduction = PCA(n_components=0.95) ## Our goal is to achieve the 95% of the variance of the data
X_train_pca = dimensionReduction.fit_transform(X_train)
PC1 = X_train_pca[:,0]
PC2 = X_train_pca[:,1]
X_test_pca = dimensionReduction.transform(X_test)


---
### Model Preparation

In [344]:
logit = LogisticRegression(C= 10) ## Model we will be using 
logit.fit(X_train_pca,y_train)
y_pred = logit.predict(X_test_pca)

In [345]:
### Lets check the model's performance manually
confusionMatrix = confusion_matrix(y_pred, y_test)
TrueNegative = confusionMatrix[0][0]
FalsePositive = confusionMatrix[0][1]
FalseNegative = confusionMatrix[1][0]
TruePositive = confusionMatrix[1][1]
print(confusionMatrix)
accuracy = (TruePositive+TrueNegative)/(TruePositive+TrueNegative+FalsePositive+FalseNegative)
print(f"The accuracy of the model is {np.round(accuracy,2) * 100} %")

precision = (TruePositive)/(TruePositive+FalsePositive)
print(f"The precision of the model is {np.round(precision,2) * 100} %")
recall = (TruePositive)/(TruePositive+FalseNegative)
print(f"The recall of the model is {np.round(recall,2) * 100} %")

## alternative way 
print(classification_report(y_test, y_pred))

[[11533  4244]
 [ 4464 11759]]
The accuracy of the model is 73.0 %
The precision of the model is 73.0 %
The recall of the model is 72.0 %
              precision    recall  f1-score   support

           0       0.73      0.72      0.73     15997
           1       0.72      0.73      0.73     16003

    accuracy                           0.73     32000
   macro avg       0.73      0.73      0.73     32000
weighted avg       0.73      0.73      0.73     32000



---
### Hyperparameter Tuning 

In [348]:
## Lets do the hyperparameter tuning to figure out which parameter is perfect ?!
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100]
}
grid = GridSearchCV(logit,param_grid, cv = 5)
grid.fit(X_train_pca,y_train)
print(grid.best_params_) ## The best value for C has been updated 

{'C': 10}
