<a href="https://colab.research.google.com/github/vinay10949/AnalyticsAndML/blob/master/FeatureEngineering/MissingDataImputation/1_23_Assembling_an_imputation_pipeline_with_Feature_Engine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Assembling an imputation pipeline with Feature-engine

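In this recipe, we will assemble a single pipeline that applies a different imputation technique to each group of variables: arbitrary number and median imputation for the numerical variables, and frequent category imputation and replacement by the string 'Missing' for the categorical variables. We will use pandas, scikit-learn and Feature-engine, all open source Python libraries.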

In [0]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import feature_engine.missing_data_imputers as mdi

In [0]:
# read the credit approval dataset from the working directory and preview it
data = pd.read_csv(filepath_or_buffer='creditApprovalUCI.csv')
data.head(n=5)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,,u,g,q,h,,,,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [0]:
# hold out 30% of the rows for testing: 'A16' goes to y, every other column to X
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='A16'),
    data['A16'],
    test_size=0.3,
    random_state=0,  # fixed seed so the split is the same on every run
)

# number of rows and columns in each partition
X_train.shape, X_test.shape

((483, 15), (207, 15))

In [0]:
# collect the object-typed (categorical) columns and show, in ascending
# order, the fraction of missing values in each of them
cat_cols = [name for name, dtype in data.dtypes.items() if dtype == 'O']
data[cat_cols].isna().mean().sort_values()

A12    0.000000
A13    0.000000
A4     0.008696
A5     0.008696
A6     0.013043
A7     0.013043
A1     0.017391
A9     0.133333
A10    0.133333
dtype: float64

In [0]:
# collect every column that is not object-typed (numerical) and show, in
# ascending order, the fraction of missing values in each of them
num_cols = [name for name, dtype in data.dtypes.items() if dtype != 'O']
data[num_cols].isna().mean().sort_values()

A11    0.000000
A15    0.000000
A16    0.000000
A2     0.017391
A14    0.018841
A3     0.133333
A8     0.133333
dtype: float64

In [0]:
# find the percentage of missing data per variable in the training set
# (read-only check: it displays the fractions and modifies nothing)
X_train.isnull().mean()

In [0]:
# group the variables by the imputation technique each group will receive

# numerical variables
features_num_arbitrary, features_num_median = ['A3', 'A8'], ['A2', 'A14']

# categorical variables
features_cat_frequent, features_cat_missing = (
    ['A4', 'A5', 'A6', 'A7'],
    ['A1', 'A9', 'A10'],
)

In [0]:
# we instantiate each imputer within a single scikit-learn pipeline;
# every imputer receives its own list of variables
pipe = Pipeline(steps=[
    # numerical variables: arbitrary number imputation. The number is stated
    # explicitly instead of being left to the library default, so the value
    # used is visible in the code (-999 is the value shown in the repr of
    # the fitted pipeline).
    ('imp_num_arbitrary', mdi.ArbitraryNumberImputer(
        arbitrary_number=-999, variables=features_num_arbitrary)),
    # numerical variables: median imputation
    ('imp_num_median', mdi.MeanMedianImputer(
        imputation_method='median', variables=features_num_median)),
    # categorical variables: frequent category imputation
    ('imp_cat_frequent', mdi.FrequentCategoryImputer(
        variables=features_cat_frequent)),
    # remaining categorical variables: CategoricalVariableImputer
    ('imp_cat_missing', mdi.CategoricalVariableImputer(
        variables=features_cat_missing)),
])

In [0]:
# fit every imputer in the pipeline on the training set only
pipe.fit(X_train, y=None)



Pipeline(memory=None,
         steps=[('imp_num_arbitrary',
                 ArbitraryNumberImputer(arbitrary_number=-999,
                                        variables=['A3', 'A8'])),
                ('imp_num_median',
                 MeanMedianImputer(imputation_method='median',
                                   variables=['A2', 'A14'])),
                ('imp_cat_frequent',
                 FrequentCategoryImputer(variables=['A4', 'A5', 'A6', 'A7'])),
                ('imp_cat_missing',
                 CategoricalVariableImputer(variables=['A1', 'A9', 'A10']))],
         verbose=False)

In [0]:
# apply the fitted pipeline to both partitions, replacing the original frames
X_train, X_test = map(pipe.transform, (X_train, X_test))

In [0]:
# count the missing values left in each column of the imputed training set
X_train.isna().sum()

A1     0
A2     0
A3     0
A4     0
A5     0
A6     0
A7     0
A8     0
A9     0
A10    0
A11    0
A12    0
A13    0
A14    0
A15    0
dtype: int64

In [0]:
# count the missing values left in each column of the imputed test set
X_test.isna().sum()

A1     0
A2     0
A3     0
A4     0
A5     0
A6     0
A7     0
A8     0
A9     0
A10    0
A11    0
A12    0
A13    0
A14    0
A15    0
dtype: int64