In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Importing Libraries and Loading the Dataset

In [2]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [3]:
heart_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/11_ML_HeartDisease/heart.csv")
heart_df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


# Data Preprocessing

In [4]:
cat_col = heart_df.select_dtypes(include='object').columns
cat_col

Index(['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'], dtype='object')

# Converting Categorical Variables to Numeric

In [5]:
for col in cat_col:
  print(col)
  print((heart_df[col].unique()),list (range(heart_df [col].nunique())))
  heart_df[col].replace((heart_df[col].unique()), range(heart_df[col].nunique()), inplace=True)
  print('*'*90)
  print()

Sex
['M' 'F'] [0, 1]
******************************************************************************************

ChestPainType
['ATA' 'NAP' 'ASY' 'TA'] [0, 1, 2, 3]
******************************************************************************************

RestingECG
['Normal' 'ST' 'LVH'] [0, 1, 2]
******************************************************************************************

ExerciseAngina
['N' 'Y'] [0, 1]
******************************************************************************************

ST_Slope
['Up' 'Flat' 'Down'] [0, 1, 2]
******************************************************************************************



In [6]:
heart_df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,0,0,140,289,0,0,172,0,0.0,0,0
1,49,1,1,160,180,0,0,156,0,1.0,1,1
2,37,0,0,130,283,0,1,98,0,0.0,0,0
3,48,1,2,138,214,0,0,108,1,1.5,1,1
4,54,0,1,150,195,0,0,122,0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,0,3,110,264,0,0,132,0,1.2,1,1
914,68,0,2,144,193,1,0,141,0,3.4,1,1
915,57,0,2,130,131,0,0,115,1,1.2,1,1
916,57,1,0,130,236,0,2,174,0,0.0,1,1


In [8]:
heart_df['Cholesterol'].value_counts()

Unnamed: 0_level_0,count
Cholesterol,Unnamed: 1_level_1
0,172
254,11
220,10
223,10
204,9
...,...
353,1
278,1
157,1
176,1


# Imputing the 0 values in cholestrol coulmn with KNN Imputer

In [9]:
np.nan

nan

In [10]:
heart_df['Cholesterol'].replace(0,np.nan, inplace=True)

In [11]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)
after_impute = imputer.fit_transform(heart_df)
heart_df = pd.DataFrame(after_impute, columns=heart_df.columns)

In [12]:
heart_df['Cholesterol'].isna().sum()

np.int64(0)

In [14]:
count = 0
for i in heart_df['Cholesterol']:
  if i==0:
    count+=1
print(count)

0


# Doing same for RestingBP

In [15]:
heart_df['RestingBP'][heart_df['RestingBP']==0]

Unnamed: 0,RestingBP
449,0.0


In [16]:
from sklearn.impute import KNNImputer
heart_df['RestingBP'].replace(0,np.nan, inplace=True)
imputer = KNNImputer(n_neighbors=3)
after_impute = imputer.fit_transform(heart_df)
heart_df = pd.DataFrame(after_impute, columns=heart_df.columns)

In [17]:
heart_df['RestingBP'].isnull().sum()

np.int64(0)

# Change column type to int

In [19]:
withoutOldPeak = heart_df.columns
withoutOldPeak = withoutOldPeak.drop('Oldpeak')
heart_df[withoutOldPeak] = heart_df[withoutOldPeak].astype('int32')

In [20]:
heart_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int32  
 1   Sex             918 non-null    int32  
 2   ChestPainType   918 non-null    int32  
 3   RestingBP       918 non-null    int32  
 4   Cholesterol     918 non-null    int32  
 5   FastingBS       918 non-null    int32  
 6   RestingECG      918 non-null    int32  
 7   MaxHR           918 non-null    int32  
 8   ExerciseAngina  918 non-null    int32  
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    int32  
 11  HeartDisease    918 non-null    int32  
dtypes: float64(1), int32(11)
memory usage: 46.7 KB


# Data Visualization

In [21]:
!pip install plotly



In [23]:
import plotly.express as px

In [24]:
px.line(heart_df.corr()['HeartDisease'][:-1].sort_values())

# Age and HeartDisease Distribution

In [25]:
px.sunburst(heart_df, path=['HeartDisease','Age'])

In [26]:
px.histogram(heart_df, x='Age', color='HeartDisease')

# Percentage of HeartDisease data distribution

In [27]:
px.pie(heart_df, names='HeartDisease', title='Percentage of HeartDisease class distribution')

# Sex vs Heart Disease

In [28]:
px.histogram(heart_df, x='Sex', color='HeartDisease')

# Chest Pain Type vs Heart Disease

In [29]:
px.histogram(heart_df, x='ChestPainType', color='HeartDisease')

# RestingBP vs Heart Disease

In [30]:
px.sunburst(heart_df,path=['HeartDisease','RestingBP'])

# Fasting BS vs Heart Disease

In [31]:
px.histogram(heart_df, x='FastingBS', color='HeartDisease')

# MaxHR vs Heart Disease

In [32]:
px.sunburst(heart_df, path=['HeartDisease', 'MaxHR'])

In [33]:
px.violin(heart_df,x='HeartDisease',y='MaxHR',color='HeartDisease')

# OldPeak vs Heart Disease

In [34]:
px.violin(heart_df,x='HeartDisease',y='Oldpeak',color='HeartDisease')

# ST_Slope vs Heart Disease

In [35]:
px.histogram(heart_df, x='ST_Slope', color='HeartDisease')

# Exercise Angina vs Heart Disease

In [36]:
px.histogram(heart_df, x='ExerciseAngina', color='HeartDisease')

# Train-Test-Split

In [38]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    heart_df.drop('HeartDisease', axis=1),
    heart_df['HeartDisease'],
    test_size=0.2,
    random_state=42,
    stratify=heart_df['HeartDisease']
)

# Model Training

## Logistics Regression

In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

solver = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
best_solver = ''
test_score = np.zeros(6)
for i, n in enumerate(solver):
  lr = LogisticRegression (solver=n).fit(X_train, y_train)
  test_score[i] = lr.score (X_test, y_test)
  if lr.score (X_test, y_test)==test_score.max():
    best_solver = n

lr = LogisticRegression(solver=best_solver)
lr.fit(X_train, y_train)
lr_pred=lr.predict(X_test)
print(f'LogisticRegression Score: {accuracy_score(y_test, lr_pred)}')

LogisticRegression Score: 0.8586956521739131


In [44]:
import pickle
file = open('LogisticRegression.pkl','wb')
pickle.dump(lr,file)

## Support Vendor Machine(SVM)

In [40]:
from sklearn.svm import SVC
from sklearn.metrics import f1_score

kernels = {'linear': 0, 'poly': 0, 'rbf': 0, 'sigmoid': 0}
best = ''

for i in kernels:
    svm = SVC(kernel=i)
    svm.fit(X_train, y_train)
    yhat = svm.predict(X_test)
    kernels[i] = f1_score(y_test, yhat, average="weighted")
    if kernels[i] == max(kernels.values()):
        best = i

svm = SVC(kernel=best)
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)
print(f'SVM f1_score kernel({best}): {f1_score(y_test, svm_pred, average="weighted")}')

SVM f1_score kernel(linear): 0.8422922535440344


In [45]:
file = open('SVM.pkl','wb')
pickle.dump(svm,file)

## Digital Tree Classifier

In [41]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

dtree = DecisionTreeClassifier(class_weight='balanced')
param_grid = {
    'max_depth': [3, 4, 5, 6, 7, 8],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3, 4],
    'random_state': [0, 42]
}

grid_search = GridSearchCV(dtree, param_grid, cv=5)
grid_search.fit(X_train, y_train)
Ctree = DecisionTreeClassifier(**grid_search.best_params_, class_weight='balanced')
Ctree.fit(X_train, y_train)
dtc_pred = Ctree.predict(X_test)
print("DecisionTree's Accuracy:", accuracy_score(y_test, dtc_pred))

DecisionTree's Accuracy: 0.8097826086956522


In [46]:
file = open('tree.pkl','wb')
pickle.dump(Ctree,file)

## Random Forest Classifier

In [42]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rfc = RandomForestClassifier()
param_grid = {
    'n_estimators': [50, 100, 150, 500],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9, 19],
    'max_leaf_nodes': [3, 6, 9]
}

grid_search = GridSearchCV(rfc, param_grid)
grid_search.fit(X_train, y_train)
rfctree = RandomForestClassifier(**grid_search.best_params_)
rfctree.fit(X_train, y_train)
rfc_pred = rfctree.predict(X_test)
print("RandomForestClassifier's Accuracy:", accuracy_score(y_test, rfc_pred))

RandomForestClassifier's Accuracy: 0.8532608695652174


In [47]:
file = open('RandomForest.pkl','wb')
pickle.dump(rfctree,file)