### **Handling Missing Values with Multivariate (KNN) Imputation**

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.impute import KNNImputer,SimpleImputer
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

In [2]:
# Read train.csv and keep only the three feature columns plus the Survived target.
data = pd.read_csv('train.csv').loc[:, ['Age', 'Pclass', 'Fare', 'Survived']]

In [3]:
# Preview 10 randomly chosen rows. The fixed seed (same value as the train/test split)
# keeps the preview identical across Restart & Run All.
data.sample(10, random_state=2)

Unnamed: 0,Age,Pclass,Fare,Survived
628,26.0,3,7.8958,0
871,47.0,1,52.5542,1
73,26.0,3,14.4542,0
811,39.0,3,24.15,0
675,18.0,3,7.775,0
858,24.0,3,19.2583,1
309,30.0,1,56.9292,1
638,41.0,3,39.6875,0
89,24.0,3,8.05,0
249,54.0,2,26.0,0


In [4]:
# Percentage of missing entries per column: the mean of the null mask, scaled to 0-100.
data.isna().mean().mul(100)

Age         19.86532
Pclass       0.00000
Fare         0.00000
Survived     0.00000
dtype: float64

In [6]:
# Separate the Survived target from the predictor columns.
y = data['Survived']
X = data.drop('Survived', axis='columns')

In [7]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the
# split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [8]:
# Preview 6 random training rows (missing Age values show up as NaN). The fixed seed
# keeps the preview identical across Restart & Run All.
X_train.sample(6, random_state=2)

Unnamed: 0,Age,Pclass,Fare
823,27.0,3,12.475
73,26.0,3,14.4542
596,,2,33.0
120,21.0,2,73.5
229,,3,25.4667
406,51.0,3,7.75


#### **Applying KNN imputation**

In [9]:
# Fill each missing value from the 3 nearest rows. weights='distance' weights every
# neighbour by the inverse of its distance, so closer rows contribute more than they
# would under a plain (uniform) average.
# NOTE(review): the features are not scaled before imputation, so the neighbour search
# is presumably dominated by the larger-valued column (Fare) - consider scaling first.
knn = KNNImputer(n_neighbors=3, weights='distance')

# Fit on the training split only, then apply the same fitted imputer to both splits,
# so nothing learned from the test rows influences the imputation.
knn.fit(X_train)
X_train_trf = knn.transform(X_train)
X_test_trf = knn.transform(X_test)

In [10]:
# Train a default logistic regression on the KNN-imputed features and score it on the
# held-out split.
lr = LogisticRegression().fit(X_train_trf, y_train)
y_pred = lr.predict(X_test_trf)

accuracy_score(y_true=y_test, y_pred=y_pred)

0.7150837988826816

#### **Comparison with simple imputation (mean strategy)**

In [11]:
# Baseline: replace each missing value with the mean of its column, computed on the
# training split ('mean' is SimpleImputer's default strategy, spelled out for clarity).
si = SimpleImputer(strategy='mean')

si.fit(X_train)
X_train_trf2 = si.transform(X_train)
X_test_trf2 = si.transform(X_test)

In [12]:
# Same classifier, retrained on the mean-imputed features for a like-for-like comparison.
lr = LogisticRegression().fit(X_train_trf2, y_train)
y_pred2 = lr.predict(X_test_trf2)

# With the recorded outputs, KNN imputation scores about 2.2 percentage points higher
# than mean imputation (0.715 vs 0.693, roughly 3% relative) on this single split -
# a modest gain that should be confirmed with cross-validation before relying on it.
accuracy_score(y_true=y_test, y_pred=y_pred2)

0.6927374301675978