In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [3]:
data = pd.read_csv('heart.csv')
data.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [5]:
data.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [6]:
data.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [7]:
data.columns

Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope',
       'HeartDisease'],
      dtype='object')

In [8]:
X = data[['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS','RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope']]
y = data['HeartDisease']

In [9]:
categorical_features = [x for x in X.columns if X.dtypes[x] == 'object']
numerical_features = [x for x in X.columns if X.dtypes[x] != 'object']

In [13]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [17]:
numerical_pipeline = Pipeline(
    steps=[
        ('scaler',StandardScaler())
    ]
)

categorical_pipeline = Pipeline(
    steps=[
        ('encoder',OneHotEncoder(drop='first',sparse_output=False,dtype=np.int32))
    ]
)

preprocessor = ColumnTransformer([
    ('numerical_pipeline',numerical_pipeline,numerical_features),
    ('categorical_pipeline',categorical_pipeline,categorical_features)
])



In [18]:
preprocessor

In [20]:
scaled_X_train = preprocessor.fit_transform(X_train)
scaled_X_test = preprocessor.transform(X_test)

In [32]:
models = {
    'Logistic Regression' : LogisticRegression(),
    'Decision Tree' : DecisionTreeClassifier(),
    'Random Forest' : RandomForestClassifier()
}

for x in range(len(list(models))):
    model = list(models.values())[x]
    model.fit(scaled_X_train,y_train)

    y_pred = model.predict(scaled_X_test)

    print(model)
    print('Accuracy : ',accuracy_score(y_test,y_pred))
    print('-'*50)

LogisticRegression()
Accuracy :  0.8913043478260869
--------------------------------------------------
DecisionTreeClassifier()
Accuracy :  0.8315217391304348
--------------------------------------------------
RandomForestClassifier()
Accuracy :  0.8967391304347826
--------------------------------------------------


In [34]:
param_grid = {'n_estimators':[50,100,150,200,300],
              'criterion':['gini','entropy','log_loss'],
              'max_depth':[1,3,5,8,None],
              'min_samples_split':[0.1,0.3,0.5,0.7,0.9],
              'min_samples_leaf':[1,2,3,4,5]
}
grid = GridSearchCV(RandomForestClassifier(),param_grid=param_grid,cv=5,verbose=True,n_jobs=-1,scoring='accuracy')

grid.fit(scaled_X_train,y_train)

Fitting 5 folds for each of 1875 candidates, totalling 9375 fits


In [35]:
grid.best_params_

{'criterion': 'entropy',
 'max_depth': None,
 'min_samples_leaf': 4,
 'min_samples_split': 0.1,
 'n_estimators': 300}

In [36]:
best_model = RandomForestClassifier(criterion='entropy',max_depth=None,min_samples_leaf=4,min_samples_split=0.1,n_estimators=300)

In [47]:
best_model.fit(scaled_X_train,y_train)
y_pred = best_model.predict(scaled_X_test)
print('Accuracy : ',accuracy_score(y_test,y_pred)*100)

Accuracy :  87.5
