## Supervised Model Development Template


### Import Python libraries

In [1]:
import pandas as pd

import sweetviz as sv

from mitosheet import *;

from pycaret.classification import *
#from pycaret.regression import *

### Read CSV file and check dataframe

In [2]:
# Load the heart-disease dataset into a dataframe.
# The path is relative, so the CSV must sit in the notebook's working directory.
allData = pd.read_csv('HeartDiseaseEDA.csv')

In [3]:
# Summarise the dataframe: number of rows, column names, non-null counts and dtypes
allData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Age                 918 non-null    int64  
 1   Age-Classification  918 non-null    object 
 2   Sex                 918 non-null    object 
 3   ChestPainType       918 non-null    object 
 4   RestingBP           918 non-null    int64  
 5   Cholesterol         918 non-null    int64  
 6   MaxHR               918 non-null    int64  
 7   ExerciseAngina      918 non-null    object 
 8   Oldpeak             918 non-null    float64
 9   ST_Slope            918 non-null    object 
 10  HeartDisease        918 non-null    int64  
dtypes: float64(1), int64(5), object(5)
memory usage: 79.0+ KB


In [4]:
# Preview the dataframe (the notebook shows only the first and last rows of a long frame)
allData

Unnamed: 0,Age,Age-Classification,Sex,ChestPainType,RestingBP,Cholesterol,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,Youth,M,ATA,140,289,172,N,0.0,Up,0
1,49,Adults,F,NAP,160,180,156,N,1.0,Flat,1
2,37,Youth,M,ATA,130,283,98,N,0.0,Up,0
3,48,Adults,F,ASY,138,214,108,Y,1.5,Flat,1
4,54,Adults,M,NAP,150,195,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...
913,45,Youth,M,TA,110,264,132,N,1.2,Flat,1
914,68,Old,M,ASY,144,193,141,N,3.4,Flat,1
915,57,Adults,M,ASY,130,131,115,Y,1.2,Flat,1
916,57,Adults,F,ATA,130,236,174,N,0.0,Flat,1


### Split data into modelling and unseen prediction data

90% for modelling, 10% unseen prediction data

In [5]:
# Randomly pick 90% of the rows for modelling; the fixed random_state makes the
# split reproducible across runs.
modelling_rows = allData.sample(frac=0.9, random_state=42)

# The unseen set is every row NOT sampled above. The drop must use the original
# index labels, so it happens before either frame is renumbered from 0.
# reset_index returns new frames here instead of mutating in place.
data = modelling_rows.reset_index(drop=True)
evaluationData = allData.drop(modelling_rows.index).reset_index(drop=True)

# Sanity check: the two subsets must partition the original rows exactly.
assert len(data) + len(evaluationData) == len(allData), \
    'modelling/unseen split lost or duplicated rows'

print(f'Data for Modeling: {data.shape}')
print(f'Unseen Data For Predictions: {evaluationData.shape}')

Data for Modeling: (826, 11)
Unseen Data For Predictions: (92, 11)


## Setup Pycaret environment/data pre-processing
The following environment setup was used.

- `train_size`: 80% of the modelling data is used for training; the remaining 20% is the test set.
- `target`: `HeartDisease` is the target column.
- `session_id`: set to 42.

In [6]:
# Initialise the PyCaret classification experiment on the modelling data:
# HeartDisease as the target, an 80/20 train/test split, and a fixed session id.
exp_clf = setup(
    data=data,
    target='HeartDisease',
    train_size=0.80,
    session_id=42,
)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,HeartDisease
2,Target type,Binary
3,Original data shape,"(826, 11)"
4,Transformed data shape,"(826, 18)"
5,Transformed train set shape,"(660, 18)"
6,Transformed test set shape,"(166, 18)"
7,Ordinal features,2
8,Numeric features,5
9,Categorical features,5


### View new training data

In [7]:
# Fetch the transformed training features from the experiment and preview them.
# The output below shows the categorical columns expanded into indicator columns.
Xtrain = get_config('X_train_transformed')
Xtrain

Unnamed: 0,Age,Age-Classification_Adults,Age-Classification_Youth,Age-Classification_Old,Sex,ChestPainType_ASY,ChestPainType_TA,ChestPainType_NAP,ChestPainType_ATA,RestingBP,Cholesterol,MaxHR,ExerciseAngina,Oldpeak,ST_Slope_Flat,ST_Slope_Up,ST_Slope_Down
342,59.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,135.0,234.0,161.0,0.0,0.5,1.0,0.0,0.0
414,62.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,120.0,0.0,134.0,0.0,-0.8,1.0,0.0,0.0
109,40.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,106.0,240.0,80.0,1.0,0.0,0.0,1.0,0.0
178,39.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,160.0,147.0,160.0,0.0,0.0,0.0,1.0,0.0
371,45.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,130.0,219.0,130.0,1.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
328,31.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,120.0,270.0,153.0,1.0,1.5,1.0,0.0,0.0
140,59.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,140.0,169.0,140.0,0.0,0.0,0.0,1.0,0.0
213,50.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,145.0,0.0,139.0,1.0,0.7,1.0,0.0,0.0
550,43.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,120.0,201.0,165.0,0.0,0.0,0.0,1.0,0.0


### Compare models

In [None]:
# Compare the candidate classifiers (see the results table in the output) and keep
# the returned model as `best_model` for the tuning step further down.
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.8742,0.9281,0.9112,0.8736,0.8905,0.7426,0.7472,5.124
nb,Naive Bayes,0.8545,0.916,0.8709,0.873,0.8706,0.7043,0.7079,2.649
dt,Decision Tree Classifier,0.7924,0.7899,0.8087,0.822,0.8135,0.5789,0.5818,2.795
knn,K Neighbors Classifier,0.6955,0.7274,0.7497,0.7228,0.7336,0.3782,0.3819,2.701
svm,SVM - Linear Kernel,0.6242,0.0,0.7205,0.7322,0.6598,0.216,0.2844,3.109


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

Logistic Regression is the best-performing model, as we can see in the results above: it has the highest Accuracy, AUC, Recall, Precision and F1 of the models listed.

### Create using a specific algorithm

In [None]:
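# Optional: uncomment to train one specific algorithm instead of keeping the
# compare_models() winner. Replace <algorithm_id> with an ID from the comparison
# table, e.g. 'lr' for Logistic Regression.
#best_model = create_model('<algorithm_id>')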

In [None]:
# Print the selected model's text representation
print(best_model)

### Tune model

Tune model optimised on 'F1' score using 10 iterations

In [None]:
# Tune the selected model, optimising F1 over 10 search iterations.
# NOTE(review): choose_better=True is expected to keep the untuned model when
# tuning does not improve the score; confirm against the PyCaret docs for your version.
tuned_best = tune_model(
    best_model,
    optimize='F1',
    n_iter=10,
    choose_better=True,
)

In [None]:
# Print the tuned model's text representation
print(tuned_best)

### Evaluate model performance

In [None]:
# Confusion matrix for the tuned model; the 'percent' plot option is switched on
plot_model(
    tuned_best,
    plot='confusion_matrix',
    plot_kwargs={'percent': True},
)

The confusion matrix shows an accuracy of about 87% for both categories when predicting on the test data.

In [None]:
# Classification report for the tuned model
plot_model(tuned_best, plot='class_report')

The class report shows the various metrics and how well the model performs at predicting whether an individual will have heart disease. From the class report, we can see very strong values for precision, recall and F1, which implies that our model predicts heart disease with good accuracy.

In [None]:
# Learning curve for the tuned model
plot_model(tuned_best, plot='learning')

The learning curve looks good: the cross-validation and training scores converge. Because the two scores converge, no additional data is required.

In [None]:
# Feature-importance plot covering all features
plot_model(tuned_best, plot='feature_all')

All of the feature attributes are used by the algorithm. The most important are "Sex", "ST_Slope_Up", "ST_Slope_Flat", "ChestPainType_ASY" and "ExerciseAngina".

### Make predictions on test data

In [None]:
# Score the tuned model on the test data; no `data` argument is passed here,
# unlike the unseen-data prediction at the end of the notebook.
predict_model(tuned_best)

The metric values are high when predicting on the test data. This shows that our model is very effective at predicting whether an individual will have heart disease.

### Finalise and save model

In [None]:
# Finalise the tuned model before saving it.
# NOTE(review): finalize_model is expected to refit on the full modelling dataset
# (train + test); confirm against the PyCaret docs for your version.
final_model = finalize_model(tuned_best)

In [None]:
# Persist the finalised model under the name 'HeartDiseaseModel'
save_model(final_model, 'HeartDiseaseModel')

### Load model

In [None]:
# Reload the model that was saved under the name 'HeartDiseaseModel'
saved_final = load_model('HeartDiseaseModel')

### Make predictions on unseen data

In [None]:
# Apply the reloaded model to the 10% of rows held out before modelling
new_prediction = predict_model(saved_final, data=evaluationData)

In [None]:
# Preview the first rows of the predictions on the unseen data
new_prediction.head()