# Imputation of missing values

### 1. SimpleImputer 

Univariate feature imputation


In [1]:
from sklearn.impute import SimpleImputer
import pandas as pd, numpy as np



In [2]:
# Toy frame built column by column: A has three missing entries, C has one,
# and B and D are fully observed.
df = pd.DataFrame({
    'A': [np.nan, 3, np.nan, np.nan],
    'B': [2, 4, 3, 0],
    'C': [7, 2, 6, np.nan],
    'D': [0, 1, 5, 7],
})

df


Unnamed: 0,A,B,C,D
0,,2,7.0,0
1,3.0,4,2.0,1
2,,3,6.0,5
3,,0,,7


In [3]:
# Mean imputation: each NaN is replaced by the mean of the observed values
# in its own column. The result is a NumPy array, kept in `fillna` for the
# next cell.
imp = SimpleImputer(strategy='mean', missing_values=np.nan)
fillna = imp.fit(df).transform(df)
fillna

array([[3., 2., 7., 0.],
       [3., 4., 2., 1.],
       [3., 3., 6., 5.],
       [3., 0., 5., 7.]])

In [4]:
# Treat the value 0 as the "missing" marker and overwrite every occurrence
# with the constant 999.
imp = SimpleImputer(
    missing_values=0,
    strategy='constant',
    fill_value=999,
)
imp.fit(fillna).transform(fillna)

array([[  3.,   2.,   7., 999.],
       [  3.,   4.,   2.,   1.],
       [  3.,   3.,   6.,   5.],
       [  3., 999.,   5.,   7.]])

### 2. IterativeImputer
Multivariate imputer that estimates each feature from all the others.

A strategy for imputing missing values by modeling each feature with missing values as a function of other features in a round-robin fashion.

We can choose which model the imputer uses to predict the missing values by passing it as the `estimator` parameter. See [Imputing missing values with variants of IterativeImputer.](https://scikit-learn.org/stable/auto_examples/impute/plot_iterative_imputer_variants_comparison.html#sphx-glr-auto-examples-impute-plot-iterative-imputer-variants-comparison-py)

In [5]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer


In [6]:
# Re-display the original frame: `df` was never reassigned, so it still
# contains its NaN entries.
df

Unnamed: 0,A,B,C,D
0,,2,7.0,0
1,3.0,4,2.0,1
2,,3,6.0,5
3,,0,,7


In [7]:
# Fit the iterative imputer on `df` and return the completed array.
# random_state=0 fixes the imputer's random seed.
imp = IterativeImputer(random_state=0)
imp.fit_transform(df)

array([[3.        , 2.        , 7.        , 0.        ],
       [3.        , 4.        , 2.        , 1.        ],
       [3.        , 3.        , 6.        , 5.        ],
       [3.        , 0.        , 5.00526148, 7.        ]])

### 3. KNNImputer

Imputation for completing missing values using k-Nearest Neighbors.

Each sample’s missing values are imputed using the mean value of that feature from the `n_neighbors` nearest neighbors found in the training set.

In [8]:
# Show the untouched input frame again as the starting point for KNN imputation.
df

Unnamed: 0,A,B,C,D
0,,2,7.0,0
1,3.0,4,2.0,1
2,,3,6.0,5
3,,0,,7


In [9]:
from sklearn.impute import KNNImputer

# Fill each NaN using the 2 nearest rows (n_neighbors=2).
imputer = KNNImputer(n_neighbors=2)
imputer.fit(df).transform(df)

array([[3., 2., 7., 0.],
       [3., 4., 2., 1.],
       [3., 3., 6., 5.],
       [3., 0., 4., 7.]])

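### 4. MissingIndicator

Binary indicators for missing values. By default the output has one column per feature that contained missing values during fit (here `A` and `C`), with `True` marking a missing entry.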

In [10]:
from sklearn.impute import MissingIndicator

# Boolean mask of missing entries; with default settings only the features
# that contained NaNs get a column in the output.
indicator = MissingIndicator()
indicator.fit(df).transform(df)

array([[ True, False],
       [False, False],
       [ True, False],
       [ True,  True]])

In [11]:
# Original frame for reference, to compare its NaN positions with the indicator mask.
df

Unnamed: 0,A,B,C,D
0,,2,7.0,0
1,3.0,4,2.0,1
2,,3,6.0,5
3,,0,,7
