# **Estimating missing data with K-nearest neighbors**

Replace missing values with the mean of the values shown by their k closest neighbors.

In [1]:
pip install feature-engine



In [2]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from feature_engine.wrappers import SklearnTransformerWrapper

## **Load data**

In [3]:
# Read only the columns used in this notebook: the numerical
# predictors plus the target.
variables = [
    "A2",
    "A3",
    "A8",
    "A11",
    "A14",
    "A15",
    "target",
]
data = pd.read_csv(
    "credit_approval_uci.csv",
    usecols=variables,
)

# Preview the first rows:
data.head()

Unnamed: 0,A2,A3,A8,A11,A14,A15,target
0,30.83,0.0,1.25,1,202.0,0,1
1,58.67,4.46,3.04,6,43.0,560,1
2,24.5,,,0,280.0,824,1
3,27.83,1.54,3.75,5,100.0,3,1
4,20.17,5.625,1.71,0,120.0,0,1


## **Split data into train and test sets**

In [4]:
# Hold out 30% of the rows as a test set; the fixed seed
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns="target"),
    data["target"],
    random_state=0,
    test_size=0.3,
)

# Shapes of the training and test predictor sets:
X_train.shape, X_test.shape

((483, 6), (207, 6))

In [5]:
# Proportion of missing values in each variable
# of the training set:
X_train.isna().mean()

A2     0.022774
A3     0.140787
A8     0.140787
A11    0.000000
A14    0.014493
A15    0.000000
dtype: float64

In [6]:
# Configure the imputer to use the 5 closest neighbors.
# weights="distance" weights the neighbors so that the
# farthest ones have a smaller influence on the estimate.
# No metric is passed, so the imputer's default distance
# metric is used:
imputer = KNNImputer(n_neighbors=5, weights="distance")

In [7]:
# Fit the imputer on the training set only, so that the
# test set plays no part in the imputation. fit() returns
# the imputer itself, which is why its repr is displayed:
imputer.fit(X_train)

KNNImputer(weights='distance')

In [8]:
# Replace the missing values by the weighted
# mean of the values shown by the neighbors.
# Note that X_train and X_test are overwritten here: the
# original DataFrames are no longer available under these
# names, and the split has to be re-run to recover them.
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

In [9]:
# transform() returned a NumPy array rather than a
# DataFrame, so the column names are no longer attached:
X_train

array([[4.608e+01, 3.000e+00, 2.375e+00, 8.000e+00, 3.960e+02, 4.159e+03],
       [1.592e+01, 2.875e+00, 8.500e-02, 0.000e+00, 1.200e+02, 0.000e+00],
       [3.633e+01, 2.125e+00, 8.500e-02, 1.000e+00, 5.000e+01, 1.187e+03],
       ...,
       [1.958e+01, 6.650e-01, 1.665e+00, 0.000e+00, 2.200e+02, 5.000e+00],
       [2.283e+01, 2.290e+00, 2.290e+00, 7.000e+00, 1.400e+02, 2.384e+03],
       [4.058e+01, 3.290e+00, 3.500e+00, 0.000e+00, 4.000e+02, 0.000e+00]])

In [10]:
# Count the missing values left in each column of the
# imputed training set (wrapped in a DataFrame, because
# X_train is now a NumPy array); every count should be 0:
pd.DataFrame(X_train).isna().sum()

0    0
1    0
2    0
3    0
4    0
5    0
dtype: int64

## **Find neighbors based on specific variables**

In [11]:
# Re-create the same train/test split from the original data:
# X_train and X_test were overwritten with imputed NumPy arrays
# above, and this section needs the DataFrames with their
# missing values and column names.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns="target"),
    data["target"],
    random_state=0,
    test_size=0.3,
)

In [12]:
# Wrap a KNNImputer (default parameters) so that it only
# works with 4 of the numerical variables when finding
# neighbors:
imputer = SklearnTransformerWrapper(
    transformer=KNNImputer(), variables=["A2", "A3", "A8", "A11"]
)

In [13]:
# Fit the wrapped imputer on the training set and impute it
# in one step, then impute the test set with the same fitted
# imputer (it is never fitted on the test set).
# NOTE(review): presumably only the variables passed to the
# wrapper are imputed, so A14 would keep its missing values —
# confirm with X_train.isnull().sum().
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)