## KNN Imputer

https://medium.com/@kyawsawhtoon/a-guide-to-knn-imputation-95e2dc496e

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [2]:
df = pd.read_csv("titanic.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
## Drop unwanted columns
df = df.drop(["PassengerId", "Ticket", "Name", "Cabin", "Age"], axis=1)
df.head()

Unnamed: 0,Survived,Pclass,Gender,SibSp,Parch,Fare,Embarked
0,0,3,male,1,0,7.25,S
1,1,1,female,1,0,71.2833,C
2,1,3,female,0,0,7.925,S
3,1,1,female,1,0,53.1,S
4,0,3,male,0,0,8.05,S


In [4]:
df.isna().sum()
## Gender has 3 missing values
## Embarked has 2 missing values

Survived    0
Pclass      0
Gender      3
SibSp       0
Parch       0
Fare        0
Embarked    2
dtype: int64

In [5]:
## before impuatation we need to convert object data types to intigers

cat_columns = df[["Gender", "Embarked"]]

cat_dummies = pd.get_dummies(cat_columns, drop_first=True)
cat_dummies

Unnamed: 0,Gender_male,Embarked_Q,Embarked_S
0,1,0,1
1,0,0,0
2,0,0,1
3,0,0,1
4,1,0,1
...,...,...,...
886,1,0,1
887,0,0,1
888,0,0,1
889,1,0,0


In [6]:
df = df.drop(["Gender", "Embarked"], axis=1)

new_df = pd.concat([df,cat_dummies], axis=1)

In [7]:
new_df

Unnamed: 0,Survived,Pclass,SibSp,Parch,Fare,Gender_male,Embarked_Q,Embarked_S
0,0,3,1,0,7.2500,1,0,1
1,1,1,1,0,71.2833,0,0,0
2,1,3,0,0,7.9250,0,0,1
3,1,1,1,0,53.1000,0,0,1
4,0,3,0,0,8.0500,1,0,1
...,...,...,...,...,...,...,...,...
886,0,2,0,0,13.0000,1,0,1
887,1,1,0,0,30.0000,0,0,1
888,0,3,1,2,23.4500,0,0,1
889,1,1,0,0,30.0000,1,0,0


In [17]:
new_df.isna().sum()

Survived       0
Pclass         0
SibSp          0
Parch          0
Fare           0
Gender_male    0
Embarked_Q     0
Embarked_S     0
dtype: int64

In [8]:
new_df.drop("Survived", axis=1)

Unnamed: 0,Pclass,SibSp,Parch,Fare,Gender_male,Embarked_Q,Embarked_S
0,3,1,0,7.2500,1,0,1
1,1,1,0,71.2833,0,0,0
2,3,0,0,7.9250,0,0,1
3,1,1,0,53.1000,0,0,1
4,3,0,0,8.0500,1,0,1
...,...,...,...,...,...,...,...
886,2,0,0,13.0000,1,0,1
887,1,0,0,30.0000,0,0,1
888,3,1,2,23.4500,0,0,1
889,1,0,0,30.0000,1,0,0


In [9]:
new_df.columns

Index(['Survived', 'Pclass', 'SibSp', 'Parch', 'Fare', 'Gender_male',
       'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [10]:
## before KNNimputer we need to scale down the featues
scaler = StandardScaler()
scaler_a = scaler.fit_transform(new_df.drop("Survived", axis=1)) 

dataf = pd.DataFrame(scaler_a, columns=new_df.columns[1:])

In [11]:
dataf

Unnamed: 0,Pclass,SibSp,Parch,Fare,Gender_male,Embarked_Q,Embarked_S
0,0.827377,0.432793,-0.473674,-0.502445,0.741327,-0.307562,0.619306
1,-1.566107,0.432793,-0.473674,0.786845,-1.348933,-0.307562,-1.614710
2,0.827377,-0.474545,-0.473674,-0.488854,-1.348933,-0.307562,0.619306
3,-1.566107,0.432793,-0.473674,0.420730,-1.348933,-0.307562,0.619306
4,0.827377,-0.474545,-0.473674,-0.486337,0.741327,-0.307562,0.619306
...,...,...,...,...,...,...,...
886,-0.369365,-0.474545,-0.473674,-0.386671,0.741327,-0.307562,0.619306
887,-1.566107,-0.474545,-0.473674,-0.044381,-1.348933,-0.307562,0.619306
888,0.827377,0.432793,2.008933,-0.176263,-1.348933,-0.307562,0.619306
889,-1.566107,-0.474545,-0.473674,-0.044381,0.741327,-0.307562,-1.614710


In [12]:
dataf.isna().sum()

Pclass         0
SibSp          0
Parch          0
Fare           0
Gender_male    0
Embarked_Q     0
Embarked_S     0
dtype: int64

## KNN Impute

In [13]:
## now use KNN imputer
knn = KNNImputer()
no_missing_df = knn.fit_transform(dataf)

knn_df = pd.DataFrame(no_missing_df, columns=new_df.columns[1:])

In [14]:
knn_df["Survived"] = df["Survived"]

In [15]:
knn_df

Unnamed: 0,Pclass,SibSp,Parch,Fare,Gender_male,Embarked_Q,Embarked_S,Survived
0,0.827377,0.432793,-0.473674,-0.502445,0.741327,-0.307562,0.619306,0
1,-1.566107,0.432793,-0.473674,0.786845,-1.348933,-0.307562,-1.614710,1
2,0.827377,-0.474545,-0.473674,-0.488854,-1.348933,-0.307562,0.619306,1
3,-1.566107,0.432793,-0.473674,0.420730,-1.348933,-0.307562,0.619306,1
4,0.827377,-0.474545,-0.473674,-0.486337,0.741327,-0.307562,0.619306,0
...,...,...,...,...,...,...,...,...
886,-0.369365,-0.474545,-0.473674,-0.386671,0.741327,-0.307562,0.619306,0
887,-1.566107,-0.474545,-0.473674,-0.044381,-1.348933,-0.307562,0.619306,1
888,0.827377,0.432793,2.008933,-0.176263,-1.348933,-0.307562,0.619306,0
889,-1.566107,-0.474545,-0.473674,-0.044381,0.741327,-0.307562,-1.614710,1


In [16]:
knn_df.isna().sum()

Pclass         0
SibSp          0
Parch          0
Fare           0
Gender_male    0
Embarked_Q     0
Embarked_S     0
Survived       0
dtype: int64