# **Marking Imputed Values**

Add binary indicator variables that flag which values were missing, using pandas, Feature-engine and Scikit-learn.

In [1]:
pip install feature-engine

Collecting feature-engine
  Downloading feature_engine-1.3.0-py2.py3-none-any.whl (260 kB)
[?25l[K     |█▎                              | 10 kB 15.9 MB/s eta 0:00:01[K     |██▌                             | 20 kB 19.5 MB/s eta 0:00:01[K     |███▊                            | 30 kB 13.4 MB/s eta 0:00:01[K     |█████                           | 40 kB 10.5 MB/s eta 0:00:01[K     |██████▎                         | 51 kB 4.9 MB/s eta 0:00:01[K     |███████▌                        | 61 kB 5.7 MB/s eta 0:00:01[K     |████████▉                       | 71 kB 5.7 MB/s eta 0:00:01[K     |██████████                      | 81 kB 4.3 MB/s eta 0:00:01[K     |███████████▎                    | 92 kB 4.8 MB/s eta 0:00:01[K     |████████████▋                   | 102 kB 5.2 MB/s eta 0:00:01[K     |█████████████▉                  | 112 kB 5.2 MB/s eta 0:00:01[K     |███████████████                 | 122 kB 5.2 MB/s eta 0:00:01[K     |████████████████▍               | 133 kB 5.2 M

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from feature_engine.imputation import (
    AddMissingIndicator, CategoricalImputer, MeanMedianImputer,
)

## **Load data**

In [3]:
# Read the credit approval dataset from a CSV file located in the working
# directory, then preview the first rows:
data = pd.read_csv("credit_approval_uci.csv")
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,target
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,,u,g,q,h,,,,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


## **Split data into train and test sets**

In [4]:
# Hold out 30% of the rows as a test set. The predictors are every column
# except "target"; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns="target"), data["target"], test_size=0.3, random_state=0
)

# Sizes of the resulting train and test predictor frames:
X_train.shape, X_test.shape

((483, 15), (207, 15))

## **Add missing indicator with pandas**

In [5]:
# Names of the variables that will receive a missing indicator:
varnames = [f"A{i}" for i in (1, 3, 4, 5, 6, 7, 8)]

In [6]:
# Derive one indicator column name per variable by appending "_na":
indicators = [name + "_na" for name in varnames]
indicators

['A1_na', 'A3_na', 'A4_na', 'A5_na', 'A6_na', 'A7_na', 'A8_na']

In [7]:
# Flag missing entries as 1 and observed entries as 0, writing one new
# "<variable>_na" column per variable into both the train and the test set:
X_train[indicators] = X_train[varnames].isnull().astype(int)
X_test[indicators] = X_test[varnames].isnull().astype(int)

In [8]:
# Check the new missing indicator variables, which were
# appended to the right of the original columns:
X_train.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,A13,A14,A15,A1_na,A3_na,A4_na,A5_na,A6_na,A7_na,A8_na
596,a,46.08,3.0,u,g,c,v,2.375,t,t,...,g,396.0,4159,0,0,0,0,0,0,0
303,a,15.92,2.875,u,g,q,v,0.085,f,f,...,g,120.0,0,0,0,0,0,0,0,0
204,b,36.33,2.125,y,p,w,v,0.085,t,t,...,g,50.0,1187,0,0,0,0,0,0,0
351,b,22.17,0.585,y,p,ff,ff,0.0,f,f,...,g,100.0,0,0,0,0,0,0,0,0
118,b,57.83,7.04,u,g,m,v,14.0,t,t,...,g,360.0,1332,0,0,0,0,0,0,0


In [9]:
# Sanity check: a 0/1 indicator averages to the share of rows it flags, so
# the mean of A3_na must equal the fraction of missing values in A3.
X_train["A3"].isna().mean(), X_train["A3_na"].mean()

(0.14078674948240166, 0.14078674948240166)

## **Adding missing indicator with Feature-engine**

In [10]:
# Start again from the raw data so this section does not inherit the
# indicator columns added earlier: same 70/30 split, same seed.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns="target"), data["target"], test_size=0.3, random_state=0
)

In [11]:
# Set up the transformer that adds missing indicators.
# With variables=None no variables are pre-selected, and
# missing_only=True limits the indicators to variables that
# show missing data in the frame passed to fit():
imputer = AddMissingIndicator(
    variables=None,
    missing_only=True,
)

# Fitting on the train set is where the transformer finds the
# variables with missing data:
imputer.fit(X_train)

AddMissingIndicator()

In [12]:
# The variables that had missing data in the train set;
# these are the ones that will get an indicator:
imputer.variables_

['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A14']

In [13]:
# Add missing indicators to both sets, using the variables
# learned from the train set:
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

In [14]:
# Check the new missing indicator variables (the "<variable>_na" columns):
X_train.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,A2_na,A3_na,A4_na,A5_na,A6_na,A7_na,A8_na,A9_na,A10_na,A14_na
596,a,46.08,3.0,u,g,c,v,2.375,t,t,...,0,0,0,0,0,0,0,0,0,0
303,a,15.92,2.875,u,g,q,v,0.085,f,f,...,0,0,0,0,0,0,0,0,0,0
204,b,36.33,2.125,y,p,w,v,0.085,t,t,...,0,0,0,0,0,0,0,0,0,0
351,b,22.17,0.585,y,p,ff,ff,0.0,f,f,...,0,0,0,0,0,0,0,0,0,0
118,b,57.83,7.04,u,g,m,v,14.0,t,t,...,0,0,0,0,0,0,0,0,0,0


### **Add missing indicators and impute data**

In [15]:
# Let's create a pipeline, where we add missing indicators
# first, and then impute categorical variables with the
# most frequent category, and numerical variables with the mean.
# MeanMedianImputer uses the median unless told otherwise, so the
# mean has to be requested explicitly to match this description.
# NOTE: the step names are kept exactly as they were so that any
# lookup of a step by name keeps working.
pipe = Pipeline(
    [
        ("indicators", AddMissingIndicator(missing_only=True)),
        ("categorical", CategoricalImputer(imputation_method="frequent")),
        ("nummerical", MeanMedianImputer(imputation_method="mean")),
    ]
)

In [16]:
# Add indicators and impute data. The pipeline is fitted on the train set
# only and then applied unchanged to the test set.
# NOTE(review): X_train / X_test are not re-split before this cell, so they
# already carry the "_na" columns added by `imputer` above — confirm that
# feeding the already-extended frames into the pipeline is intended.
X_train = pipe.fit_transform(X_train)
X_test = pipe.transform(X_test)

In [17]:
# Final dataset after adding the indicators and imputing:
X_train.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,A2_na,A3_na,A4_na,A5_na,A6_na,A7_na,A8_na,A9_na,A10_na,A14_na
596,a,46.08,3.0,u,g,c,v,2.375,t,t,...,0,0,0,0,0,0,0,0,0,0
303,a,15.92,2.875,u,g,q,v,0.085,f,f,...,0,0,0,0,0,0,0,0,0,0
204,b,36.33,2.125,y,p,w,v,0.085,t,t,...,0,0,0,0,0,0,0,0,0,0
351,b,22.17,0.585,y,p,ff,ff,0.0,f,f,...,0,0,0,0,0,0,0,0,0,0
118,b,57.83,7.04,u,g,m,v,14.0,t,t,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Confirm that no missing values remain: count the nulls in every column.
X_train.isna().sum()

A1        0
A2        0
A3        0
A4        0
A5        0
A6        0
A7        0
A8        0
A9        0
A10       0
A11       0
A12       0
A13       0
A14       0
A15       0
A1_na     0
A2_na     0
A3_na     0
A4_na     0
A5_na     0
A6_na     0
A7_na     0
A8_na     0
A9_na     0
A10_na    0
A14_na    0
dtype: int64

## **Adding missing indicator with Scikit-learn**

In [19]:
# Rebuild the train and test sets from the raw data (70/30, fixed seed) so
# the Scikit-learn examples start from frames without indicators or imputation.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns="target"), data["target"], test_size=0.3, random_state=0
)

In [20]:
# Set up the Scikit-learn transformer so that it only creates indicators for
# features that show missing data in the set passed to fit(), then fit it
# on the train set:
indicator = MissingIndicator(features="missing-only")
indicator.fit(X_train)

MissingIndicator()

In [21]:
# The features that had missing data in the train set.
# They are reported as column positions in a NumPy array,
# not as column names:
indicator.features_

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 13])

In [22]:
# MissingIndicator.transform() returns a plain array, so we need to name
# the indicators and join them to the original dataframes ourselves.

# Create a new variable name for each of the missing indicators
# (indicator.features_ holds the positions of the flagged columns).
# This must happen before X_train is reassigned below:
indicator_cols = [c + "_na" for c in X_train.columns[indicator.features_]]

# Concatenate the original dataset with the missing indicators.
# reset_index(drop=True) replaces the row labels with 0..n-1 so the rows
# line up with the freshly built indicator frame; note that y_train and
# y_test keep their original labels.
X_train = pd.concat(
    [
        X_train.reset_index(drop=True),
        pd.DataFrame(indicator.transform(X_train), columns=indicator_cols),
    ],
    axis=1,
)

# Apply the same fitted indicator to the test set, so that train and test
# end up with the same columns:
X_test = pd.concat(
    [
        X_test.reset_index(drop=True),
        pd.DataFrame(indicator.transform(X_test), columns=indicator_cols),
    ],
    axis=1,
)

X_train.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,A2_na,A3_na,A4_na,A5_na,A6_na,A7_na,A8_na,A9_na,A10_na,A14_na
0,a,46.08,3.0,u,g,c,v,2.375,t,t,...,False,False,False,False,False,False,False,False,False,False
1,a,15.92,2.875,u,g,q,v,0.085,f,f,...,False,False,False,False,False,False,False,False,False,False
2,b,36.33,2.125,y,p,w,v,0.085,t,t,...,False,False,False,False,False,False,False,False,False,False
3,b,22.17,0.585,y,p,ff,ff,0.0,f,f,...,False,False,False,False,False,False,False,False,False,False
4,b,57.83,7.04,u,g,m,v,14.0,t,t,...,False,False,False,False,False,False,False,False,False,False


### **Add missing indicators and impute data**

In [23]:
# Fresh 70/30 split of the raw data (fixed seed), so the imputation example
# below works on frames that do not yet contain any indicator columns.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns="target"), data["target"], test_size=0.3, random_state=0
)

In [24]:
# Split the column names by dtype: object columns are treated as
# categorical, every other column as numerical.
numvars = X_train.select_dtypes(exclude="object").columns.tolist()
catvars = X_train.select_dtypes(include="object").columns.tolist()

In [25]:
# Column transformer (not a Pipeline, despite the variable name): the
# numerical columns are imputed with the mean and the categorical columns
# with the most frequent value. add_indicator=True makes each imputer
# append missing indicators to its own output.
pipe = ColumnTransformer([
    ("num_imputer", SimpleImputer(strategy="mean", add_indicator=True), numvars),
    ("cat_imputer", SimpleImputer(strategy="most_frequent", add_indicator=True), catvars),
])

In [26]:
# Add indicators and impute data: the imputers are fitted on the
# train set only and then applied unchanged to the test set.
X_train = pipe.fit_transform(X_train)
X_test = pipe.transform(X_test)

In [27]:
# The result is a NumPy array rather than a dataframe, so the
# column names are gone (object dtype, as it holds numbers and strings):
X_train

array([[46.08, 3.0, 2.375, ..., False, False, False],
       [15.92, 2.875, 0.085, ..., False, False, False],
       [36.33, 2.125, 0.085, ..., False, False, False],
       ...,
       [19.58, 0.665, 1.665, ..., False, False, False],
       [22.83, 2.29, 2.29, ..., False, False, False],
       [40.58, 3.29, 3.5, ..., False, False, False]], dtype=object)

In [28]:
# Note that the numerical variables and their indicators
# are on the left of the array, and the categorical
# variables with their indicators are on the right:
pd.DataFrame(X_train).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,46.08,3.0,2.375,8.0,396.0,4159.0,0.0,0.0,0.0,0.0,...,t,t,g,False,False,False,False,False,False,False
1,15.92,2.875,0.085,0.0,120.0,0.0,0.0,0.0,0.0,0.0,...,f,f,g,False,False,False,False,False,False,False
2,36.33,2.125,0.085,1.0,50.0,1187.0,0.0,0.0,0.0,0.0,...,t,f,g,False,False,False,False,False,False,False
3,22.17,0.585,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,...,f,f,g,False,False,False,False,False,False,False
4,57.83,7.04,14.0,6.0,360.0,1332.0,0.0,0.0,0.0,0.0,...,t,t,g,False,False,False,False,False,False,False
