In [24]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score,classification_report,plot_confusion_matrix,f1_score

In [2]:
df = pd.read_csv("heart_failure_clinical_records_dataset.csv")

In [3]:
df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       299 non-null    float64
 1   anaemia                   299 non-null    int64  
 2   creatinine_phosphokinase  299 non-null    int64  
 3   diabetes                  299 non-null    int64  
 4   ejection_fraction         299 non-null    int64  
 5   high_blood_pressure       299 non-null    int64  
 6   platelets                 299 non-null    float64
 7   serum_creatinine          299 non-null    float64
 8   serum_sodium              299 non-null    int64  
 9   sex                       299 non-null    int64  
 10  smoking                   299 non-null    int64  
 11  time                      299 non-null    int64  
 12  DEATH_EVENT               299 non-null    int64  
dtypes: float64(3), int64(10)
memory usage: 30.5 KB


In [7]:
df.corr()['DEATH_EVENT'].sort_values()

time                       -0.526964
ejection_fraction          -0.268603
serum_sodium               -0.195204
platelets                  -0.049139
smoking                    -0.012623
sex                        -0.004316
diabetes                   -0.001943
creatinine_phosphokinase    0.062728
anaemia                     0.066270
high_blood_pressure         0.079351
age                         0.253729
serum_creatinine            0.294278
DEATH_EVENT                 1.000000
Name: DEATH_EVENT, dtype: float64

In [28]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import  train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [13]:
X = df.drop('DEATH_EVENT',axis=1)

In [14]:
y = df['DEATH_EVENT']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [67]:
scaler = StandardScaler()

In [68]:
scaled_X_train = scaler.fit_transform(X_train)

In [69]:
scaled_X_test = scaler.transform(X_test)

In [70]:
def result(model):
    model.fit(scaled_X_train,y_train)
    y_predictions = model.predict(scaled_X_test)
    print(f'accuracy: {accuracy_score(y_test,y_predictions)}')
    print(f'F1 Score : {f1_score(y_test,y_predictions)}')
    print(classification_report(y_test,y_predictions))

In [71]:
def gridSearch(model,param_grid):
    grid_model = GridSearchCV(estimator=model,param_grid=param_grid,cv=5,scoring='accuracy')
    grid_model.fit(scaled_X_train,y_train)
    y_predictions = grid_model.predict(scaled_X_test)
    print(f'Grid Model best Param: {grid_model.best_params_}')
    print(f'accuracy: {accuracy_score(y_test,y_predictions)}')
    print(f'F1 Score : {f1_score(y_test,y_predictions)}')
    print(classification_report(y_test,y_predictions))

In [72]:
rf_model = RandomForestClassifier()

In [73]:
result(rf_model)

accuracy: 0.8833333333333333
F1 Score : 0.7741935483870969
              precision    recall  f1-score   support

           0       0.93      0.91      0.92        45
           1       0.75      0.80      0.77        15

    accuracy                           0.88        60
   macro avg       0.84      0.86      0.85        60
weighted avg       0.89      0.88      0.88        60



In [74]:
rf_param_grid = {
    'n_estimators':[50,100,200,250],
    'criterion':["gini", "entropy"],
}

In [75]:
gridSearch(rf_model,rf_param_grid)

Grid Model best Param: {'criterion': 'entropy', 'n_estimators': 200}
accuracy: 0.9
F1 Score : 0.8125
              precision    recall  f1-score   support

           0       0.95      0.91      0.93        45
           1       0.76      0.87      0.81        15

    accuracy                           0.90        60
   macro avg       0.86      0.89      0.87        60
weighted avg       0.91      0.90      0.90        60



In [76]:
dt_model = DecisionTreeClassifier() 

In [77]:
result(dt_model)

accuracy: 0.8333333333333334
F1 Score : 0.6666666666666666
              precision    recall  f1-score   support

           0       0.89      0.89      0.89        45
           1       0.67      0.67      0.67        15

    accuracy                           0.83        60
   macro avg       0.78      0.78      0.78        60
weighted avg       0.83      0.83      0.83        60

