<a href="https://colab.research.google.com/github/thisisnitish/heart_failure_prediction/blob/main/Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Heart Failure Prediction using the KNN algorithm with hyperparameter tuning

In [1]:
import pandas as pd
import numpy as np
from warnings import filterwarnings

**Reading the dataset using pandas `read_csv`**

In [2]:
# Load the clinical records CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="./heart_failure_clinical_records_dataset.csv")

**Viewing the first 5 rows of the dataset to get an intuition for the data**

In [3]:
# Preview the first five rows (5 is also the default for head()).
df.head(n=5)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


**Checking whether there is any missing data. If there is none, we can proceed to the Exploratory Data Analysis (EDA) stage.**

In [4]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
df.isna().sum()

age                         0
anaemia                     0
creatinine_phosphokinase    0
diabetes                    0
ejection_fraction           0
high_blood_pressure         0
platelets                   0
serum_creatinine            0
serum_sodium                0
sex                         0
smoking                     0
time                        0
DEATH_EVENT                 0
dtype: int64

In [5]:
# Target vector: the DEATH_EVENT column of the loaded frame.
y = df.loc[:, "DEATH_EVENT"]

In [6]:
# Feature matrix: every column except the target. drop() returns a new frame, df is untouched.
x = df.drop(columns="DEATH_EVENT")

In [7]:
# Remove the `time` column from the features as well.
# The frame is rebuilt from `df` in a single non-mutating step so that this cell is
# idempotent: the previous in-place drop raised KeyError when the cell was run twice,
# because `time` had already been removed from `x`.
# NOTE(review): presumably `time` is excluded because the follow-up period is not
# known when a prediction has to be made - confirm with the author.
x = df.drop(columns=["DEATH_EVENT", "time"])

**After dropping the `DEATH_EVENT` and `time` columns, we look at the first rows of the feature matrix again.**

In [8]:
x.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0


In [9]:
from sklearn.model_selection import train_test_split

Now we split the dataset into a training set and a test set. After that we fit a scaler on the training data, apply the same transformation to both sets, and then use the `KNeighborsClassifier` model in order to tune its hyperparameter (`n_neighbors`).

In [10]:
# Hold out 20% of the rows as a test set; random_state=1 fixes the shuffle so the
# split is reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [11]:
from sklearn.preprocessing import MinMaxScaler

In [12]:
# Min-max scaler; (0, 1) is the default output range, spelled out here for readers.
scaler = MinMaxScaler(feature_range=(0, 1))

In [13]:
# Learn each feature's min/max from the training rows only, then rescale those rows.
x_train = scaler.fit(x_train).transform(x_train)

In [14]:
# Rescale the test rows with the already-fitted scaler (transform only, no re-fitting).
x_test = scaler.transform(X=x_test)

In [15]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

In [16]:
# Base K-NN estimator with library defaults; it is handed to the grid search below.
model = KNeighborsClassifier()

In [17]:
# Candidate neighbour counts 1..149 (the upper bound of range() is exclusive).
k_range = [*range(1, 150)]
# Search space for the grid search: a single hyperparameter, n_neighbors.
param_grid = {"n_neighbors": k_range}

In [18]:
# Exhaustive search over param_grid with 5-fold cross-validation; training-fold
# scores are kept as well so they can be compared with the validation scores.
hyperpara = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    return_train_score=True,
)

In [19]:
# Run the cross-validated search on the scaled training data.
hyperpara.fit(X=x_train, y=y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                         23, 24, 25, 26, 27, 28, 29, 30, ...]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring=None, verbose=0)

In [20]:
# Tabulate the per-candidate cross-validation results, best mean validation score first.
cv_results = pd.DataFrame(hyperpara.cv_results_)
cv_results.sort_values(by="mean_test_score", ascending=False)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
14,0.001027,0.000053,0.003166,0.000578,15,{'n_neighbors': 15},0.666667,0.708333,0.666667,0.666667,0.638298,0.669326,0.022385,1,0.691099,0.706806,0.696335,0.696335,0.697917,0.697699,0.005104
24,0.001096,0.000169,0.003680,0.000581,25,{'n_neighbors': 25},0.687500,0.687500,0.645833,0.645833,0.659574,0.665248,0.018849,2,0.659686,0.654450,0.670157,0.664921,0.666667,0.663176,0.005519
21,0.001047,0.000111,0.003243,0.000258,22,{'n_neighbors': 22},0.708333,0.666667,0.645833,0.645833,0.659574,0.665248,0.022997,2,0.659686,0.664921,0.659686,0.659686,0.661458,0.661087,0.002036
20,0.001073,0.000118,0.003207,0.000427,21,{'n_neighbors': 21},0.687500,0.666667,0.666667,0.645833,0.659574,0.665248,0.013478,4,0.675393,0.675393,0.670157,0.675393,0.671875,0.673642,0.002212
16,0.001097,0.000159,0.002985,0.000223,17,{'n_neighbors': 17},0.666667,0.708333,0.666667,0.645833,0.638298,0.665160,0.024346,5,0.691099,0.685864,0.675393,0.680628,0.677083,0.682014,0.005787
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12,0.001053,0.000140,0.002846,0.000060,13,{'n_neighbors': 13},0.666667,0.562500,0.625000,0.666667,0.638298,0.631826,0.038268,145,0.680628,0.696335,0.717277,0.696335,0.692708,0.696657,0.011815
6,0.000897,0.000030,0.002936,0.000270,7,{'n_neighbors': 7},0.687500,0.625000,0.583333,0.604167,0.638298,0.627660,0.035264,146,0.727749,0.727749,0.722513,0.706806,0.703125,0.717588,0.010547
4,0.000908,0.000037,0.002748,0.000218,5,{'n_neighbors': 5},0.625000,0.604167,0.583333,0.625000,0.638298,0.615160,0.019302,147,0.722513,0.738220,0.732984,0.743455,0.750000,0.737435,0.009351
0,0.001386,0.000922,0.003473,0.001777,1,{'n_neighbors': 1},0.562500,0.562500,0.562500,0.687500,0.595745,0.594149,0.048419,148,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.000000


**Fitting KNN classifier to the training data and finding the accuracy**



In [21]:
# Build the final K-NN classifier (KNeighborsClassifier is already imported from
# sklearn.neighbors) that will be fitted to the training data.
# The neighbour count is read from the grid search result instead of being typed in
# by hand, so it cannot go stale when the data, the split or the grid changes.
# For the run recorded in this notebook the selected value is 15.
best_k = hyperpara.best_params_["n_neighbors"]
model = KNeighborsClassifier(n_neighbors=best_k, metric='euclidean')

In [22]:
# Fit the final classifier on the scaled training data.
model.fit(X=x_train, y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=15, p=2,
                     weights='uniform')

In [23]:
# Mean accuracy of the fitted classifier on the training set.
model.score(X=x_train, y=y_train)

0.694560669456067

In [24]:
# Mean accuracy of the fitted classifier on the held-out test set.
# Interpretation: the value is the fraction of test patients whose DEATH_EVENT label
# was predicted correctly (about 78% in the recorded run). It does NOT mean that the
# risk of heart failure "increases by 78%", and it says nothing about causes such as
# high blood pressure, smoking or diabetes - accuracy only measures predictive quality.
model.score(x_test, y_test)

0.7833333333333333