## KNN imputation

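Each missing value is estimated from the K nearest neighbours of the sample: it is replaced by the average of that feature's values among those neighbours (in this notebook the average is weighted by distance, see `weights='distance'` below).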

In [None]:
import io
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from sklearn.impute import KNNImputer

In [None]:
# Colab-only step: upload the dataset file from the local machine.
# The result is later indexed by file name ('houseprice.csv') and the value
# is wrapped in io.BytesIO, i.e. it is treated as a name -> bytes mapping.
from google.colab import files
uploaded = files.upload()

In [None]:
# Restrict the dataset to the columns named below (whitespace-separated,
# order preserved). 'SalePrice' comes last.
cols_to_use = """
    MSSubClass LotFrontage LotArea OverallQual OverallCond
    YearBuilt YearRemodAdd MasVnrArea
    BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
    1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea
    BsmtFullBath BsmtHalfBath FullBath HalfBath
    BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces
    GarageYrBlt GarageCars GarageArea
    WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch
    PoolArea MiscVal MoSold YrSold
    SalePrice
""".split()

In [None]:
# Read the uploaded House Prices CSV, keeping only the selected columns.
with io.BytesIO(uploaded['houseprice.csv']) as csv_buffer:
    data = pd.read_csv(csv_buffer, usecols=cols_to_use)

data.head()

In [None]:
# List every variable that has at least one missing value, with its count.
# The threshold is > 0 (not > 1): a column with a single NaN still needs
# imputing and must show up in this report.
for var in data.columns:
    n_missing = data[var].isnull().sum()  # computed once, reused below
    if n_missing > 0:
        print(var, n_missing)

In [None]:
# Remove the target variable from the feature list.
# Guarded so the cell can be re-run: an unconditional list.remove() raises
# ValueError once 'SalePrice' has already been removed.
if 'SalePrice' in cols_to_use:
    cols_to_use.remove('SalePrice')

# Separate into training and testing sets (30% held out for testing);
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data[cols_to_use],
    data['SalePrice'],
    test_size=0.3,
    random_state=42)

X_train.shape, X_test.shape

In [None]:
# Drop the row labels inherited from `data` and renumber both splits from 0.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

## KNN imputation
Further reading: [KNNImputer (scikit-learn documentation)](https://scikit-learn.org/stable/modules/generated/sklearn.impute.KNNImputer.html#sklearn.impute.KNNImputer)

In [None]:
# KNN imputer configuration:
#   n_neighbors -- number of neighbouring samples to use for imputation
#   weights     -- weight function used in prediction
#   metric      -- distance metric for searching neighbours
imputer = KNNImputer(n_neighbors=5, weights='distance', metric='nan_euclidean')

In [None]:
# Fit on the training split only; the same fitted imputer is then applied
# to both the training and the test split.
imputer.fit(X=X_train)

In [None]:
# Impute both splits with the imputer fitted on the training data.
# sklearn returns a NumPy array, so each result is wrapped back into a
# DataFrame. The source frame's index is passed along explicitly so that
# rows of train_t / test_t stay aligned with X_train / X_test: boolean masks
# built from X_train are applied to train_t further down, and that alignment
# should not depend on the index having been reset in an earlier cell.
train_t = pd.DataFrame(
    imputer.transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
test_t = pd.DataFrame(
    imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

train_t.head()

In [None]:
# Remaining null counts, after imputation, for the three variables checked here
train_t.isnull().sum()[['LotFrontage', 'MasVnrArea', 'GarageYrBlt']]

In [None]:
# Before imputation: MasVnrArea for the training rows where it is missing
X_train.loc[X_train['MasVnrArea'].isnull(), 'MasVnrArea']

In [None]:
# After imputation: the same rows (selected via the mask from X_train),
# now read from the imputed frame
train_t.loc[X_train['MasVnrArea'].isnull(), 'MasVnrArea']

In [None]:
# Mean of MasVnrArea in the (un-imputed) training set, as a reference value
X_train.loc[:, 'MasVnrArea'].mean()