In [1]:
import pandas as pd

from sklearn.impute import KNNImputer


In [3]:
# Load titanic csv dataset.
# NOTE(review): the CSV location is not shown in this notebook; "titanic.csv"
# is an assumed path -- point it at your copy of the dataset.
df = pd.read_csv("titanic.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Remove PassengerId, Name, Ticket & Cabin columns
# (none of them appear among the features used in the rest of the notebook).
df = df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])

In [9]:
# Check whether each column contains any NA values
# (True means the column has at least one missing entry).
df.isna().any()


Survived    False
Pclass      False
Sex         False
Age          True
SibSp       False
Parch       False
Fare        False
Embarked     True
dtype: bool

Which 2 columns have `NA` values?

***
<h2> KNN Imputer </h2>

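The KNN Imputer measures the distance between rows with a Euclidean metric by default (scikit-learn's NaN-aware `nan_euclidean`), so every feature must be numeric before we can impute.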

A good way to modify the text data is to perform one-hot encoding or create "dummy variables". The idea is to convert each category into a binary data column by assigning a 1 or 0. Other options would be to use LabelEncoder or OrdinalEncoder from Scikit-Learn’s preprocessing package.

We will use <u>One-hot encoding</u>. First, we select the categorical variables that contain text data, then generate dummy variables with `pd.get_dummies`.

In [11]:
# Select the text-valued categorical columns and one-hot encode them.
cat_variables = df[['Sex', 'Embarked']]

# drop_first=True removes one level per variable to avoid redundant columns.
# dtype=float lets the dummy columns hold NaN (see below).
cat_dummies = pd.get_dummies(cat_variables, drop_first=True, dtype=float)

# get_dummies encodes a missing category as all zeros, which is indistinguishable
# from the dropped first level. Put NaN back into the dummy columns of every row
# whose source value was missing, so the KNN imputer later in the notebook can
# actually impute them instead of silently treating them as the first level.
for source_col in cat_variables.columns:
    dummy_cols = [c for c in cat_dummies.columns if c.startswith(f"{source_col}_")]
    cat_dummies.loc[cat_variables[source_col].isna(), dummy_cols] = float("nan")

cat_dummies.head()

Unnamed: 0,Sex_male,Embarked_Q,Embarked_S
0,1,0,1
1,0,0,0
2,0,0,1
3,0,0,1
4,1,0,1


In [12]:
# Swap the original text columns for their one-hot encoded counterparts:
# remove 'Sex' and 'Embarked', then append the dummy columns side by side.
numeric_part = df.drop(columns=['Sex', 'Embarked'])
df = pd.concat([numeric_part, cat_dummies], axis=1)
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0
2,1,3,26.0,0,0,7.925,0,0,1
3,1,1,35.0,1,0,53.1,0,0,1
4,0,3,35.0,0,0,8.05,1,0,1


Another critical point here is that the KNN Imputer is a distance-based imputation method, so it <span style="color:red">requires us to normalize our data</span>. Otherwise, the different <span style="color:red">scales of our data will lead the KNN Imputer to generate biased replacements</span> for the missing values. For simplicity, we will use Scikit-Learn’s MinMaxScaler, which scales each variable to the range 0 to 1.

In [13]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every column so that distances are not dominated by wide-ranged
# features; fit_transform returns a plain array, so re-attach the column names.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df)
df = pd.DataFrame(scaled_values, columns=df.columns)
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0.0,1.0,0.271174,0.125,0.0,0.014151,1.0,0.0,1.0
1,1.0,0.0,0.472229,0.125,0.0,0.139136,0.0,0.0,0.0
2,1.0,1.0,0.321438,0.0,0.0,0.015469,0.0,0.0,1.0
3,1.0,0.0,0.434531,0.125,0.0,0.103644,0.0,0.0,1.0
4,0.0,1.0,0.434531,0.0,0.0,0.015713,1.0,0.0,1.0


Refer to the [KNNImputer documentation](https://scikit-learn.org/stable/modules/generated/sklearn.impute.KNNImputer.html) to complete the next section.

<i>Initialise a KNN Imputer with 5 neighbours.</i><br>
<i>Output the dataframe with the imputed values.</i>

In [14]:
# KNNImputer is already imported in the first cell of the notebook.
# Each missing value is filled in from the 5 nearest rows.
imputer = KNNImputer(n_neighbors=5)

# Imputed dataframe using the method.
# fit_transform returns a NumPy array, so rebuild the frame with the
# original column names and row index.
final_df = pd.DataFrame(
    imputer.fit_transform(df),
    columns=df.columns,
    index=df.index,
)

In [15]:
# Confirm the imputation worked: every column of final_df should report False
# (i.e. no missing values remain).
final_df.isnull().any()

Survived      False
Pclass        False
Age           False
SibSp         False
Parch         False
Fare          False
Sex_male      False
Embarked_Q    False
Embarked_S    False
dtype: bool

In [None]:
# Count the remaining missing values per column in the imputed frame.
# (df still holds the un-imputed data; final_df is the KNN-imputed result.)
final_df.isna().sum()

Survived      0
Pclass        0
Age           0
SibSp         0
Parch         0
Fare          0
Sex_male      0
Embarked_Q    0
Embarked_S    0
dtype: int64

Last thought:
<span style="color:#998ddd">How does the choice of K (the number of neighbours) affect the imputation?</span>

~ END