In [1]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,mean_squared_error,mean_absolute_error

<h2>KNN Imputer</h2>

In [2]:
# Read the sample table (it contains missing feature values, see the null
# counts in the next cell) and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='sample.csv')
df.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1.0,,125,212,,1.0,168,,1.0,2.0,2.0,3.0,0.0
1,53,1.0,,140,203,1.0,,155,1.0,3.1,,,3.0,0.0
2,70,1.0,,145,174,,1.0,125,1.0,2.6,,,3.0,0.0
3,61,1.0,,148,203,,1.0,161,,,2.0,1.0,3.0,0.0
4,62,,,138,294,1.0,1.0,106,,1.9,1.0,3.0,2.0,0.0


In [3]:
# Count missing entries per column (`isna` is the alias of `isnull`).
df.isna().sum()

age           0
sex         312
cp          497
trestbps      0
chol          0
fbs         872
restecg     497
thalach       0
exang       680
oldpeak     329
slope        74
ca          578
thal          7
target        0
dtype: int64

In [4]:
# Feature columns = every column except the last one of `df`.
cols = df.iloc[:, :-1].columns
cols

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal'],
      dtype='object')

**Feature Scaling - Standardization**

In [5]:
# Standardize the feature columns of `df` in place (zero mean, unit variance).
# StandardScaler ignores NaNs when fitting and keeps them as NaN in the output,
# so the missing entries are still there for the imputer to fill.
#
# NOTE: this overwrites the raw values in `df`. Re-running this cell on its own
# refits `scaler` on already-standardized data, after which
# `scaler.inverse_transform` no longer restores the original units; re-run from
# the cell that loads the CSV instead.
scaler = StandardScaler()
df[cols] = scaler.fit_transform(df[cols])
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,-0.268437,0.0,,-0.377636,-0.659332,,-0.170996,0.821321,,-0.52072,1.013765,0.313039,1.115645,0.0
1,-0.158157,0.0,,0.479107,-0.833861,0.0,,0.255968,0.0,1.371114,,,1.115645,0.0
2,1.716595,0.0,,0.764688,-1.396233,,-0.170996,-1.048692,0.0,0.920677,,,1.115645,0.0
3,0.724079,0.0,,0.936037,-0.833861,,-0.170996,0.5169,,,1.013765,-0.843394,1.115645,0.0
4,0.834359,,,0.364875,0.930822,0.0,-0.170996,-1.874977,,0.290066,-0.986422,1.469472,-0.574424,0.0


**Impute missing values using KNN**

In [7]:
# Fill the missing entries of the standardized features with a 3-neighbour
# KNN imputer, then wrap the resulting array back into a labelled frame.
imp = KNNImputer(n_neighbors=3)
imputed_values = imp.fit_transform(df[cols])
df_imputed = pd.DataFrame(data=imputed_values, columns=cols)
df_imputed

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,-0.268437,0.0,-1.260558,-0.377636,-0.659332,0.0,-0.170996,0.821321,0.0,-0.520720,1.013765,0.313039,1.115645
1,-0.158157,0.0,-1.260558,0.479107,-0.833861,0.0,-0.170996,0.255968,0.0,1.371114,1.013765,-0.843394,1.115645
2,1.716595,0.0,0.259019,0.764688,-1.396233,0.0,-0.170996,-1.048692,0.0,0.920677,-0.986422,0.313039,1.115645
3,0.724079,0.0,0.259019,0.936037,-0.833861,0.0,-0.170996,0.516900,0.0,0.019804,1.013765,-0.843394,1.115645
4,0.834359,0.0,1.778596,0.364875,0.930822,0.0,-0.170996,-1.874977,0.0,0.290066,-0.986422,1.469472,-0.574424
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,0.503520,0.0,-1.260558,0.479107,-0.484803,0.0,-0.170996,0.647366,0.0,-0.340546,1.013765,-0.843394,-0.574424
1021,0.613800,0.0,0.259019,-0.377636,0.232705,0.0,-0.170996,-0.352873,0.0,1.100852,-0.986422,-0.843394,1.115645
1022,-0.819834,0.0,0.259019,-1.234378,0.562371,0.0,-0.170996,-1.353113,0.0,-0.520720,-0.986422,-0.843394,-0.574424
1023,-0.488996,0.0,0.259019,-1.234378,0.155137,0.0,-0.170996,0.429923,0.0,-0.430633,1.013765,-0.843394,-0.574424


**Reverse Standardization**

In [9]:
# Map the imputed, standardized features back to their original units using
# the scaler fitted in the standardization cell, keeping the feature labels.
df1 = pd.DataFrame(data=scaler.inverse_transform(df_imputed), columns=cols)
df1

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,52.0,1.0,1.0,125.0,212.0,1.0,1.0,168.0,1.0,1.0,2.0,2.0,3.0
1,53.0,1.0,1.0,140.0,203.0,1.0,1.0,155.0,1.0,3.1,2.0,1.0,3.0
2,70.0,1.0,2.0,145.0,174.0,1.0,1.0,125.0,1.0,2.6,1.0,2.0,3.0
3,61.0,1.0,2.0,148.0,203.0,1.0,1.0,161.0,1.0,1.6,2.0,1.0,3.0
4,62.0,1.0,3.0,138.0,294.0,1.0,1.0,106.0,1.0,1.9,1.0,3.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59.0,1.0,1.0,140.0,221.0,1.0,1.0,164.0,1.0,1.2,2.0,1.0,2.0
1021,60.0,1.0,2.0,125.0,258.0,1.0,1.0,141.0,1.0,2.8,1.0,1.0,3.0
1022,47.0,1.0,2.0,110.0,275.0,1.0,1.0,118.0,1.0,1.0,1.0,1.0,2.0
1023,50.0,1.0,2.0,110.0,254.0,1.0,1.0,159.0,1.0,1.1,2.0,1.0,2.0


<h2>KNN Classifier</h2>

In [10]:
# Read the heart.csv table used for the classifier and preview five rows.
df2 = pd.read_csv(filepath_or_buffer='heart.csv')
df2.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [11]:
# Count missing entries per column (`isna` is the alias of `isnull`).
df2.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [12]:
# Separate the label column from the feature columns.
y = df2['target']
X = df2.drop(columns=['target'])

**Feature Scaling - Standardization**

In [13]:
# Standardize the classifier features (zero mean, unit variance).
# Column labels and index are taken from `X` itself, so this section does not
# depend on the unrelated `df` frame from the imputation section.
#
# NOTE: this rebinds `scaler`, replacing the one fitted on the imputation
# sample; the "Reverse Standardization" cell must therefore run before this one.
# NOTE(review): the scaler is fit on the full feature matrix before the
# train/test split below, so test rows influence the mean/std. Consider
# fitting on the training split only.
feature_names = X.columns
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=feature_names, index=X.index)

In [14]:
# Display the standardized feature matrix (the notebook shows a truncated view).
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,-0.268437,0.661504,-0.915755,-0.377636,-0.659332,-0.418878,0.891255,0.821321,-0.712287,-0.060888,0.995433,1.209221,1.089852
1,-0.158157,0.661504,-0.915755,0.479107,-0.833861,2.387330,-1.004049,0.255968,1.403928,1.727137,-2.243675,-0.731971,1.089852
2,1.716595,0.661504,-0.915755,0.764688,-1.396233,-0.418878,0.891255,-1.048692,1.403928,1.301417,-2.243675,-0.731971,1.089852
3,0.724079,0.661504,-0.915755,0.936037,-0.833861,-0.418878,0.891255,0.516900,-0.712287,-0.912329,0.995433,0.238625,1.089852
4,0.834359,-1.511706,-0.915755,0.364875,0.930822,2.387330,0.891255,-1.874977,-0.712287,0.705408,-0.624121,2.179817,-0.522122
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,0.503520,0.661504,0.055931,0.479107,-0.484803,-0.418878,0.891255,0.647366,1.403928,-0.912329,0.995433,-0.731971,-0.522122
1021,0.613800,0.661504,-0.915755,-0.377636,0.232705,-0.418878,-1.004049,-0.352873,1.403928,1.471705,-0.624121,0.238625,1.089852
1022,-0.819834,0.661504,-0.915755,-1.234378,0.562371,-0.418878,-1.004049,-1.353113,1.403928,-0.060888,-0.624121,0.238625,-0.522122
1023,-0.488996,-1.511706,-0.915755,-1.234378,0.155137,-0.418878,-1.004049,0.429923,-0.712287,-0.912329,0.995433,-0.731971,-0.522122


**Splitting the dataset**

In [15]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=40,
)

**Model Construction**

In [16]:
# Build a 3-nearest-neighbour classifier and fit it on the training split.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X=X_train, y=y_train)

In [17]:
# Predicted class label for every row of the test split.
y_predict = knn.predict(X=X_test)
y_predict

array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,

**Probability Estimates**

In [18]:
# Estimated probability of class 0 for each test row
# (first column of the predict_proba output).
test_proba = knn.predict_proba(X_test)
test_proba[:, 0]

array([0.33333333, 0.        , 0.        , 0.33333333, 0.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 1.        , 1.        ,
       1.        , 1.        , 0.33333333, 0.66666667, 0.66666667,
       1.        , 0.        , 0.33333333, 0.33333333, 0.66666667,
       1.        , 0.        , 1.        , 1.        , 0.66666667,
       0.        , 0.        , 0.        , 1.        , 1.        ,
       0.        , 1.        , 1.        , 1.        , 0.        ,
       0.66666667, 0.66666667, 0.        , 1.        , 1.        ,
       1.        , 0.        , 0.        , 0.        , 1.        ,
       0.66666667, 0.        , 0.        , 1.        , 0.        ,
       0.33333333, 1.        , 1.        , 0.33333333, 0.        ,
       0.        , 0.        , 0.33333333, 1.        , 1.        ,
       1.        , 1.        , 1.        , 0.        , 1.     

In [19]:
# Estimated probability of class 1 for each test row
# (second column of the predict_proba output).
test_proba = knn.predict_proba(X_test)
test_proba[:, 1]

array([0.66666667, 1.        , 1.        , 0.66666667, 1.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 0.        , 0.        ,
       0.        , 0.        , 0.66666667, 0.33333333, 0.33333333,
       0.        , 1.        , 0.66666667, 0.66666667, 0.33333333,
       0.        , 1.        , 0.        , 0.        , 0.33333333,
       1.        , 1.        , 1.        , 0.        , 0.        ,
       1.        , 0.        , 0.        , 0.        , 1.        ,
       0.33333333, 0.33333333, 1.        , 0.        , 0.        ,
       0.        , 1.        , 1.        , 1.        , 0.        ,
       0.33333333, 1.        , 1.        , 0.        , 1.        ,
       0.66666667, 0.        , 0.        , 0.66666667, 1.        ,
       1.        , 1.        , 0.66666667, 0.        , 0.        ,
       0.        , 0.        , 0.        , 1.        , 0.     

**Model Evaluation**

In [20]:
# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_predict)
print('Accuracy Score : ', accuracy)

Accuracy Score :  0.922077922077922


In [21]:
# Confusion matrix: rows are true labels, columns are predicted labels.
conf_mat = confusion_matrix(y_test, y_predict)
print('Confusion matrix : \n', conf_mat)

Confusion matrix : 
 [[141  12]
 [ 12 143]]


In [22]:
# Per-class precision / recall / F1 summary for the test split.
report_text = classification_report(y_test, y_predict)
print('Classification Report : \n', report_text)

Classification Report : 
               precision    recall  f1-score   support

           0       0.92      0.92      0.92       153
           1       0.92      0.92      0.92       155

    accuracy                           0.92       308
   macro avg       0.92      0.92      0.92       308
weighted avg       0.92      0.92      0.92       308



<h2>MAE and MSE</h2>

**Original dataset**

In [24]:
# Recover the original-unit feature values by undoing the standardization of
# `X` with the scaler fitted in the classifier section.
# Column labels and index come from `X` itself rather than from the unrelated
# `df` frame of the imputation section.
#
# NOTE: this rebinds `df2`, replacing the raw frame loaded from heart.csv
# (the `target` column is no longer present afterwards), so the earlier cell
# that builds X / y from `df2` cannot be re-run without reloading the CSV.
df2 = pd.DataFrame(scaler.inverse_transform(X), columns=X.columns, index=X.index)
df2

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,52.0,1.0,0.0,125.0,212.0,0.0,1.0,168.0,0.0,1.0,2.0,2.0,3.0
1,53.0,1.0,0.0,140.0,203.0,1.0,0.0,155.0,1.0,3.1,0.0,0.0,3.0
2,70.0,1.0,0.0,145.0,174.0,0.0,1.0,125.0,1.0,2.6,0.0,0.0,3.0
3,61.0,1.0,0.0,148.0,203.0,0.0,1.0,161.0,0.0,0.0,2.0,1.0,3.0
4,62.0,0.0,0.0,138.0,294.0,1.0,1.0,106.0,0.0,1.9,1.0,3.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59.0,1.0,1.0,140.0,221.0,0.0,1.0,164.0,1.0,0.0,2.0,0.0,2.0
1021,60.0,1.0,0.0,125.0,258.0,0.0,0.0,141.0,1.0,2.8,1.0,1.0,3.0
1022,47.0,1.0,0.0,110.0,275.0,0.0,0.0,118.0,1.0,1.0,1.0,1.0,2.0
1023,50.0,0.0,0.0,110.0,254.0,0.0,0.0,159.0,0.0,0.0,2.0,0.0,2.0


**Imputed dataset**

In [25]:
# Imputed sample features in original units (built in the
# "Reverse Standardization" cell), shown again for comparison with df2.
df1

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,52.0,1.0,1.0,125.0,212.0,1.0,1.0,168.0,1.0,1.0,2.0,2.0,3.0
1,53.0,1.0,1.0,140.0,203.0,1.0,1.0,155.0,1.0,3.1,2.0,1.0,3.0
2,70.0,1.0,2.0,145.0,174.0,1.0,1.0,125.0,1.0,2.6,1.0,2.0,3.0
3,61.0,1.0,2.0,148.0,203.0,1.0,1.0,161.0,1.0,1.6,2.0,1.0,3.0
4,62.0,1.0,3.0,138.0,294.0,1.0,1.0,106.0,1.0,1.9,1.0,3.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59.0,1.0,1.0,140.0,221.0,1.0,1.0,164.0,1.0,1.2,2.0,1.0,2.0
1021,60.0,1.0,2.0,125.0,258.0,1.0,1.0,141.0,1.0,2.8,1.0,1.0,3.0
1022,47.0,1.0,2.0,110.0,275.0,1.0,1.0,118.0,1.0,1.0,1.0,1.0,2.0
1023,50.0,1.0,2.0,110.0,254.0,1.0,1.0,159.0,1.0,1.1,2.0,1.0,2.0


In [26]:
# Per-feature error between the complete data (df2) and the imputed data (df1),
# keyed by column name.
mse = {name: mean_squared_error(df2[name], df1[name]) for name in df2.columns}
mae = {name: mean_absolute_error(df2[name], df1[name]) for name in df2.columns}

# One row per feature, one column per metric.
errors = pd.DataFrame({"MSE": mse, "MAE": mae})

In [27]:
# Display the per-feature MSE / MAE table computed in the previous cell.
errors

Unnamed: 0,MSE,MAE
age,0.0,0.0
sex,0.30439,0.30439
cp,1.92,0.91122
trestbps,0.0,0.0
chol,0.0,0.0
fbs,0.850732,0.850732
restecg,0.517073,0.49561
thalach,0.0,0.0
exang,0.663415,0.663415
oldpeak,0.658078,0.352293
