# Removing Observation with Missing Data 

In [1]:
import pandas as pd
import numpy as np

# to split the data sets
from sklearn.model_selection import train_test_split

# to impute missing data with sklearn
from sklearn.impute import SimpleImputer

# to impute missing data with feature-engine
from feature_engine.missing_data_imputers import MeanMedianImputer

# to impute missing data with sklearn
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# to impute missing data with feature-engine
from feature_engine.missing_data_imputers import CategoricalVariableImputer

# to impute missing data with feature-engine
from feature_engine.missing_data_imputers import ArbitraryNumberImputer

# to impute missing data with feature-engine
from feature_engine.missing_data_imputers import RandomSampleImputer



# to show all the columns of the dataframe in the notebeook
pd.set_option('display.max_columns', None)

In [2]:
# load data
data = pd.read_csv('creditApprovalUCI.csv')
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,,u,g,q,h,,,,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [3]:
# let's inspect the percentage of missing values in each variable

data.isnull().mean().sort_values(ascending=True)

A11    0.000000
A12    0.000000
A13    0.000000
A15    0.000000
A16    0.000000
A4     0.008696
A5     0.008696
A6     0.013043
A7     0.013043
A1     0.017391
A2     0.017391
A14    0.018841
A3     0.133333
A8     0.133333
A9     0.133333
A10    0.133333
dtype: float64

In [4]:
# create a complete case data set

data_cca = data.dropna()

In [5]:
print('Number of total observations: {}'.format(len(data)))
print('Number of observations with complete cases: {}'.format(len(data_cca)))

Number of total observations: 690
Number of observations with complete cases: 564


In [6]:
# we can also indicate for which variables we would like the complete
# cases

data_cca = data.dropna(subset=[
    'A1',
    'A2',
    'A6',
    'A7',
    'A14',
])

In [7]:
print('Number of total observations: {}'.format(len(data)))
print('Number of observations with complete cases: {}'.format(len(data_cca)))

Number of total observations: 690
Number of observations with complete cases: 653


# Performing Mean or Median Imputation

In [2]:
# load data
data = pd.read_csv('creditApprovalUCI.csv')
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,,u,g,q,h,,,,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [3]:
# let's separate into training and testing set

X_train, X_test, y_train, y_test = train_test_split(
    data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

X_train.shape, X_test.shape

((483, 15), (207, 15))

In [4]:
# find the percentage of missing data per variable

X_train.isnull().mean()

A1     0.008282
A2     0.022774
A3     0.140787
A4     0.008282
A5     0.008282
A6     0.008282
A7     0.008282
A8     0.140787
A9     0.140787
A10    0.140787
A11    0.000000
A12    0.000000
A13    0.000000
A14    0.014493
A15    0.000000
dtype: float64

## Mean / median imputation with pandas

In [5]:
# replace NA in indicated numerical variables

for var in ['A2', 'A3', 'A8', 'A11', 'A15']:

    value = X_train[var].median()

    X_train[var] = X_train[var].fillna(value)
    X_test[var] = X_test[var].fillna(value)

In [6]:
# check absence of missing values in imputed variables

X_train[['A2', 'A3', 'A8', 'A11', 'A15']].isnull().sum()

A2     0
A3     0
A8     0
A11    0
A15    0
dtype: int64

## Mean / median imputation with Scikit-learn

In [7]:
# let's separate into training and testing set

X_train, X_test, y_train, y_test = train_test_split(
    data[['A2', 'A3', 'A8', 'A11', 'A15']],
    data['A16'],
    test_size=0.3,
    random_state=0)

In [8]:
# create a median imputation object with SimpleImputer
imputer = SimpleImputer(strategy='median')

# let's fit the imputer to the train set
# the imputer will learn the median of all variables
imputer.fit(X_train)

# we can look at the learnt medians:
imputer.statistics_

array([28.835,  2.75 ,  1.   ,  0.   ,  6.   ])

In [9]:
# and now we impute the train and test sets
# NOTE: the data is returned as a numpy array!!!

X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

In [10]:
# check that missing values were removed

pd.DataFrame(X_train).isnull().sum()

0    0
1    0
2    0
3    0
4    0
dtype: int64

## Mean / Median imputation with Feature-engine

In [11]:
# let's separate into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(
    data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

In [12]:
# let's create a median imputer

median_imputer = MeanMedianImputer(imputation_method='median',
                                   variables=['A2', 'A3', 'A8', 'A11', 'A15'])

median_imputer.fit(X_train)

MeanMedianImputer(imputation_method='median',
                  variables=['A2', 'A3', 'A8', 'A11', 'A15'])

In [13]:
# let's inspect the dictionary with the mappings for each variable
median_imputer.imputer_dict_

{'A2': 28.835, 'A3': 2.75, 'A8': 1.0, 'A11': 0.0, 'A15': 6.0}

In [14]:
# transform the data
X_train = median_imputer.transform(X_train)
X_test = median_imputer.transform(X_test)



In [15]:
# check that null values were replaced
X_train[['A2', 'A3', 'A8', 'A11', 'A15']].isnull().mean()

A2     0.0
A3     0.0
A8     0.0
A11    0.0
A15    0.0
dtype: float64

## Mean / median imputation with Sklearn selecting features to impute

In [17]:
# load data
data = pd.read_csv('creditApprovalUCI.csv')

# let's separate into training and testing set
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

In [18]:
# first we need to make a list with the numerical vars
numeric_features_mean = ['A2', 'A3', 'A8', 'A11', 'A15']

# then we instantiate the imputer within a pipeline
numeric_mean_imputer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
])

# then we put the features list and the imputer in the column transformer
preprocessor = ColumnTransformer(transformers=[
    ('mean_imputer', numeric_mean_imputer, numeric_features_mean)
    ], remainder='passthrough')

In [19]:
# now we fit the preprocessor
preprocessor.fit(X_train)

ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('mean_imputer',
                                 Pipeline(memory=None,
                                          steps=[('imputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='mean',
                                                                verbose=0))],
                                          verbose=False),
                                 ['A2', 'A3', 'A8', 'A11', 'A15'])],
                  verbose=False)

In [20]:
# and now we impute the data
X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)

In [21]:
# Note that Scikit-Learn transformers return NumPy arrays!!
X_train

array([[46.08, 3.0, 2.375, ..., 't', 'g', 396.0],
       [15.92, 2.875, 0.085, ..., 'f', 'g', 120.0],
       [36.33, 2.125, 0.085, ..., 'f', 'g', 50.0],
       ...,
       [19.58, 0.665, 1.665, ..., 'f', 'g', 220.0],
       [22.83, 2.29, 2.29, ..., 't', 'g', 140.0],
       [40.58, 3.29, 3.5, ..., 't', 's', 400.0]], dtype=object)

# Implementing a Mode or Frequent Category Imputation

In [2]:
# load data
data = pd.read_csv('creditApprovalUCI.csv')
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,,u,g,q,h,,,,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [3]:
# let's separate into training and testing set

X_train, X_test, y_train, y_test = train_test_split(
    data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

X_train.shape, X_test.shape

((483, 15), (207, 15))

In [4]:
# find the percentage of missing data within those variables

X_train.isnull().mean()

A1     0.008282
A2     0.022774
A3     0.140787
A4     0.008282
A5     0.008282
A6     0.008282
A7     0.008282
A8     0.140787
A9     0.140787
A10    0.140787
A11    0.000000
A12    0.000000
A13    0.000000
A14    0.014493
A15    0.000000
dtype: float64

## Frequent category imputation with pandas

In [5]:
# replace NA in some categorical variables

for var in ['A4', 'A5', 'A6', 'A7']:

    value = X_train[var].mode()[0]

    X_train[var] = X_train[var].fillna(value)
    X_test[var] = X_test[var].fillna(value)

In [6]:
# check absence of missing values

X_train[['A4', 'A5', 'A6', 'A7']].isnull().sum()

A4    0
A5    0
A6    0
A7    0
dtype: int64

## Frequent category imputation with Scikit-learn

In [7]:
# let's separate into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(
    data[['A4', 'A5', 'A6', 'A7']], data['A16'], test_size=0.3, random_state=0)

In [8]:
# create a frequent category imputation object with SimpleImputer
imputer = SimpleImputer(strategy='most_frequent')

# we fit the imputer to the train set
# the imputer will learn the mode of all variables
imputer.fit(X_train)

# we can look at the learnt modes:
imputer.statistics_

array(['u', 'g', 'c', 'v'], dtype=object)

In [9]:
# and now we impute the train and test set
# NOTE: the data is returned as a numpy array!!!

X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

In [10]:
pd.DataFrame(X_train).isnull().sum()

0    0
1    0
2    0
3    0
dtype: int64

## Frequent category imputation with Feature-engine

In [11]:
# let's separate into training and testing set
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

In [12]:
# let's create a frequent imputation transformer

mode_imputer = CategoricalVariableImputer(variables=['A4', 'A5', 'A6', 'A7'], imputation_method='frequent')

mode_imputer.fit(X_train)

FrequentCategoryImputer(variables=['A4', 'A5', 'A6', 'A7'])

In [13]:
# dictionary with the mappings for each variable
mode_imputer.imputer_dict_

{'A4': 'u', 'A5': 'g', 'A6': 'c', 'A7': 'v'}

In [14]:
# transform the data
X_train = mode_imputer.transform(X_train)
X_test = mode_imputer.transform(X_test)



In [15]:
X_train[['A4', 'A5', 'A6', 'A7']].isnull().mean()

A4    0.0
A5    0.0
A6    0.0
A7    0.0
dtype: float64

## Frequent category imputation with Sklearn selecting features to impute

In [17]:
# load data
data = pd.read_csv('creditApprovalUCI.csv')

# let's separate into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

In [18]:
# first we make a lists with the features
# to be imputed

categoric_features = ['A4', 'A5', 'A6', 'A7']

# then we instantiate the imputer within a pipeline

categoric_imputer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))])

# then we put the features list and the imputer together
# using the column transformer

preprocessor = ColumnTransformer(transformers=[
    ('frequent_imputer', categoric_imputer, categoric_features)
    ], remainder='passthrough')

In [19]:
# now we fit the preprocessor
preprocessor.fit(X_train)

ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('frequent_imputer',
                                 Pipeline(memory=None,
                                          steps=[('imputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='most_frequent',
                                                                verbose=0))],
                                          verbose=False),
                                 ['A4', 'A5', 'A6', 'A7'])],
                  verbose=False)

In [20]:
# and now we can impute the data

X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)

In [21]:
# be carefutl that Scikit-Learn transformers return NumPy arrays!!
X_train

array([['u', 'g', 'c', ..., 'g', 396.0, 4159],
       ['u', 'g', 'q', ..., 'g', 120.0, 0],
       ['y', 'p', 'w', ..., 'g', 50.0, 1187],
       ...,
       ['u', 'g', 'w', ..., 'g', 220.0, 5],
       ['u', 'g', 'q', ..., 'g', 140.0, 2384],
       ['u', 'g', 'm', ..., 's', 400.0, 0]], dtype=object)

# Replacing Missing Values With an Arbitrary Number 

In [2]:
# load data
data = pd.read_csv('creditApprovalUCI.csv')
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,,u,g,q,h,,,,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [3]:
# let's separate into training and testing set

X_train, X_test, y_train, y_test = train_test_split(
    data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

X_train.shape, X_test.shape

((483, 15), (207, 15))

In [4]:
# find the percentage of missing data per variable

X_train.isnull().mean()

A1     0.008282
A2     0.022774
A3     0.140787
A4     0.008282
A5     0.008282
A6     0.008282
A7     0.008282
A8     0.140787
A9     0.140787
A10    0.140787
A11    0.000000
A12    0.000000
A13    0.000000
A14    0.014493
A15    0.000000
dtype: float64

## Arbitrary imputation with pandas

In [5]:
# find the maximum value per variable
X_train[['A2','A3', 'A8', 'A11']].max()

A2     76.750
A3     26.335
A8     20.000
A11    67.000
dtype: float64

In [6]:
# replace NA with 99 in indicated numerical variables

for var in ['A2','A3', 'A8', 'A11']:
    
    X_train[var].fillna(99, inplace=True)
    X_test[var].fillna(99, inplace=True)

In [7]:
# check absence of missing values
X_train[['A2','A3', 'A8', 'A11']].isnull().sum()

A2     0
A3     0
A8     0
A11    0
dtype: int64

## Arbitrary imputation with Scikit-learn

In [8]:
# let's separate into training and testing set

X_train, X_test, y_train, y_test = train_test_split(
    data[['A2', 'A3', 'A8', 'A11']],
    data['A16'],
    test_size=0.3,
    random_state=0)

In [9]:
# create an instance of the simple imputer
imputer = SimpleImputer(strategy='constant', fill_value=99)

# we fit the imputer to the train set
imputer.fit(X_train)

# we can look at the constant values:
imputer.statistics_

array([99., 99., 99., 99.])

In [10]:
# and now we impute the train and test set
# NOTE: the data is returned as a numpy array!!!

X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

In [11]:
# check that missing values were removed
pd.DataFrame(X_train).isnull().sum()

0    0
1    0
2    0
3    0
dtype: int64

## Arbitrary imputation imputation with feature engine

In [12]:
# let's separate into training and testing set

X_train, X_test, y_train, y_test = train_test_split(
    data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

In [13]:
# let's create an arbitrary value imputer

imputer = ArbitraryNumberImputer(
    arbitrary_number=99, variables=['A2','A3', 'A8', 'A11'])

imputer.fit(X_train)

ArbitraryNumberImputer(arbitrary_number=99, variables=['A2', 'A3', 'A8', 'A11'])

In [14]:
# dictionary with the mappings for each variable
imputer.arbitrary_number

99

In [15]:
# transform the data
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)



In [16]:
# check that null values were replaced
X_train[['A2','A3', 'A8', 'A11']].isnull().mean()

A2     0.0
A3     0.0
A8     0.0
A11    0.0
dtype: float64

## Arbitrary imputation imputation with Sklearn selecting features to impute

In [18]:
# load data
data = pd.read_csv('creditApprovalUCI.csv')

# let's separate into training and testing set
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('A16', axis=1),data['A16' ], test_size=0.3, random_state=0)

In [19]:
# first we need to make a list with the numerical vars
features_arbitrary = ['A2', 'A3', 'A8', 'A11']
features_mean = ['A15']

# then we instantiate the imputer within a pipeline
arbitrary_imputer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=99))])

mean_imputer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))])

# then we put the features list and the imputer in
# the column transformer
preprocessor = ColumnTransformer(transformers=[
    ('arbitrary_imputer', arbitrary_imputer, features_arbitrary),
    ('mean_imputer', mean_imputer, features_mean)
    ], remainder='passthrough')

In [20]:
# now we fit the preprocessor
preprocessor.fit(X_train)

ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('arbitrary_imputer',
                                 Pipeline(memory=None,
                                          steps=[('imputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=99,
                                                                missing_values=nan,
                                                                strategy='constant',
                                                                verbose=0))],
                                          verbose=False),
                                 ['A2', 'A3', 'A8', 'A11']),
                                ('mean_imputer',
                                 Pipeline(memory=None,

In [21]:
# and now we impute the data
X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)

In [22]:
# Note that Scikit-Learn transformers return NumPy arrays!!
X_train

array([[46.08, 3.0, 2.375, ..., 't', 'g', 396.0],
       [15.92, 2.875, 0.085, ..., 'f', 'g', 120.0],
       [36.33, 2.125, 0.085, ..., 'f', 'g', 50.0],
       ...,
       [19.58, 0.665, 1.665, ..., 'f', 'g', 220.0],
       [22.83, 2.29, 2.29, ..., 't', 'g', 140.0],
       [40.58, 3.29, 3.5, ..., 't', 's', 400.0]], dtype=object)

# Capturing Missing Values in a bespoke Category

In [2]:
# load data
data = pd.read_csv('creditApprovalUCI.csv')
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,,u,g,q,h,,,,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [3]:
# let's separate into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(
    data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

X_train.shape, X_test.shape

((483, 15), (207, 15))

In [4]:
# find the percentage of missing data per variable

X_train.isnull().mean()

A1     0.008282
A2     0.022774
A3     0.140787
A4     0.008282
A5     0.008282
A6     0.008282
A7     0.008282
A8     0.140787
A9     0.140787
A10    0.140787
A11    0.000000
A12    0.000000
A13    0.000000
A14    0.014493
A15    0.000000
dtype: float64

## Adding a bespoke category with pandas

In [5]:
# replace NA in some categorical variables

for var in ['A4', 'A5', 'A6', 'A7']:

    X_train[var].fillna('Missing', inplace=True)
    X_test[var].fillna('Missing', inplace=True)

In [6]:
# check absence of missing values
X_train[['A4', 'A5', 'A6', 'A7']].isnull().sum()

A4    0
A5    0
A6    0
A7    0
dtype: int64

## Adding a bespoke category with Scikit-learn

In [7]:
# let's separate into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(
    data[['A4', 'A5', 'A6', 'A7']], data['A16'], test_size=0.3, random_state=0)

In [8]:
# create an instance of the simple imputer
imputer = SimpleImputer(strategy='constant', fill_value='Missing')

# we fit the imputer to the train set
imputer.fit(X_train)

# we can look at the new category:
imputer.statistics_

array(['Missing', 'Missing', 'Missing', 'Missing'], dtype=object)

In [9]:
# and now we impute the train and test set
# NOTE: the data is returned as a numpy array!!!

X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

In [10]:
pd.DataFrame(X_train).isnull().sum()

0    0
1    0
2    0
3    0
dtype: int64

## Adding a bespoke category with feature engine

In [11]:
# let's separate into training and testing set
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

In [12]:
imputer = CategoricalVariableImputer(variables=['A4', 'A5', 'A6', 'A7'])

imputer.fit(X_train)

CategoricalVariableImputer(variables=['A4', 'A5', 'A6', 'A7'])

In [13]:
# transform the data
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)



In [14]:
X_train[['A4', 'A5', 'A6', 'A7']].isnull().mean()

A4    0.0
A5    0.0
A6    0.0
A7    0.0
dtype: float64

## Adding a bespoke category with Sklearn selecting features to impute

In [15]:
import pandas as pd

# to impute missing data with sklearn
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# to split the datasets
from sklearn.model_selection import train_test_split

In [16]:
# load data
data = pd.read_csv('creditApprovalUCI.csv')

# let's separate into training and testing set
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

In [17]:
# first we make a lists with the features to be imputed
features_arbitrary = ['A4', 'A5']
features_mode = ['A6', 'A7']

# then we instantiate the imputer within a pipeline
arbitrary_imputer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing'))])

mode_imputer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))])

# then we put the features list and the imputers in
# the column transformer
preprocessor = ColumnTransformer(transformers=[
    ('arbitrary_imputer', arbitrary_imputer, features_arbitrary),
    ('mean_imputer', mode_imputer, features_mode)
    ], remainder='passthrough')

In [18]:
# now we fit the preprocessor
preprocessor.fit(X_train)

ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('arbitrary_imputer',
                                 Pipeline(memory=None,
                                          steps=[('imputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value='Missing',
                                                                missing_values=nan,
                                                                strategy='constant',
                                                                verbose=0))],
                                          verbose=False),
                                 ['A4', 'A5']),
                                ('mean_imputer',
                                 Pipeline(memory=None,
     

In [19]:
# and now we can impute the data

X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)

In [20]:
# be carefutl that Scikit-Learn transformers return NumPy arrays!!
X_train

array([['u', 'g', 'c', ..., 'g', 396.0, 4159],
       ['u', 'g', 'q', ..., 'g', 120.0, 0],
       ['y', 'p', 'w', ..., 'g', 50.0, 1187],
       ...,
       ['u', 'g', 'w', ..., 'g', 220.0, 5],
       ['u', 'g', 'q', ..., 'g', 140.0, 2384],
       ['u', 'g', 'm', ..., 's', 400.0, 0]], dtype=object)

# Replacing Missing Values by a value at the end of the distribution

In [1]:
import pandas as pd

# to split the datasets
from sklearn.model_selection import train_test_split

# to impute missing data with feature-engine
from feature_engine.missing_data_imputers import EndTailImputer

## End tail imputation with pandas

In [2]:
# load data
data = pd.read_csv('creditApprovalUCI.csv')
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,,u,g,q,h,,,,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [3]:
# let's separate into training and testing set

X_train, X_test, y_train, y_test = train_test_split(
    data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

X_train.shape, X_test.shape

((483, 15), (207, 15))

In [4]:
# find the percentage of missing data per variable

X_train.isnull().mean()

A1     0.008282
A2     0.022774
A3     0.140787
A4     0.008282
A5     0.008282
A6     0.008282
A7     0.008282
A8     0.140787
A9     0.140787
A10    0.140787
A11    0.000000
A12    0.000000
A13    0.000000
A14    0.014493
A15    0.000000
dtype: float64

In [5]:
# replace NA in indicated numerical variables
# using inter-quantal range proximity rule 

for var in ['A2', 'A3', 'A8', 'A11', 'A15']:

    IQR = X_train[var].quantile(0.75) - X_train[var].quantile(0.25)
    value = X_train[var].quantile(0.75) + 1.5 * IQR

    X_train[var] = X_train[var].fillna(value)
    X_test[var] = X_test[var].fillna(value)

In [6]:
# check absence of missing values
X_train[['A2', 'A3', 'A8', 'A11', 'A15']].isnull().sum()

A2     0
A3     0
A8     0
A11    0
A15    0
dtype: int64

## End tail imputation with Feature Engine

In [7]:
# let's separate into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(
    data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

In [8]:
# let's create a median imputer

imputer = EndTailImputer(distribution='skewed', tail='right',
                         variables=['A2', 'A3', 'A8', 'A11', 'A15'])

imputer.fit(X_train)

EndTailImputer(distribution='skewed', fold=3, tail='right',
               variables=['A2', 'A3', 'A8', 'A11', 'A15'])

In [9]:
# dictionary with the mappings for each variable
imputer.imputer_dict_

{'A2': 88.18,
 'A3': 27.31,
 'A8': 11.504999999999999,
 'A11': 12.0,
 'A15': 1800.0}

In [10]:
# transform the data
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)



In [11]:
# check that null values were replaced
X_train[['A2', 'A3', 'A8', 'A11', 'A15']].isnull().mean()

A2     0.0
A3     0.0
A8     0.0
A11    0.0
A15    0.0
dtype: float64

# Implementing Random Sample Imputation

In [2]:
# load data
data = pd.read_csv('creditApprovalUCI.csv')
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,,u,g,q,h,,,,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [3]:
# let's separate into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(
    data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

X_train.shape, X_test.shape

((483, 15), (207, 15))

In [4]:
# find the percentage of missing data within those variables

X_train.isnull().mean()

A1     0.008282
A2     0.022774
A3     0.140787
A4     0.008282
A5     0.008282
A6     0.008282
A7     0.008282
A8     0.140787
A9     0.140787
A10    0.140787
A11    0.000000
A12    0.000000
A13    0.000000
A14    0.014493
A15    0.000000
dtype: float64

## Random Sample imputation with pandas

In [5]:
# extract a random sample (as many values as missing values in the variable)

number_missing_values = X_train['A2'].isnull().sum()
number_missing_values

11

In [6]:
# extract a random sample (as many values as missing values in the variable)

random_sample_train = X_train['A2'].dropna().sample(number_missing_values, random_state=0)

In [7]:
# re-index the random sample so that we can join it to our original data

random_sample_train.index = X_train[X_train['A2'].isnull()].index

random_sample_train.index

Int64Index([97, 500, 329, 83, 254, 608, 445, 450, 515, 286, 86], dtype='int64')

In [8]:
# replace the missing values
X_train.loc[X_train['A2'].isnull(), 'A2'] = random_sample_train

X_train['A2'].isnull().sum()

0

In [9]:
# repeat in a loop for the rest of the variables
# and for both train and test set

for var in ['A1', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8']:

    # extract a random sample
    random_sample_train = X_train[var].dropna().sample(
        X_train[var].isnull().sum(), random_state=0)

    random_sample_test = X_train[var].dropna().sample(
        X_test[var].isnull().sum(), random_state=0)

    # re index the random sample
    random_sample_train.index = X_train[X_train[var].isnull()].index
    random_sample_test.index = X_test[X_test[var].isnull()].index

    # replace the NA 
    X_train.loc[X_train[var].isnull(), var] = random_sample_train
    X_test.loc[X_test[var].isnull(), var] = random_sample_test
    
# check missing data
X_train[['A1', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8']].isnull().sum()

A1    0
A3    0
A4    0
A5    0
A6    0
A7    0
A8    0
dtype: int64

## Random Sample imputation with Feature Engine

In [10]:
# let's separate into training and testing set

X_train, X_test, y_train, y_test = train_test_split(
    data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

In [11]:
# let's create a random sample imputer

imputer = RandomSampleImputer()

imputer.fit(X_train)

RandomSampleImputer(random_state=None, seed='general', seeding_method='add',
                    variables=['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8',
                               'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15'])

In [12]:
# the imputer stores the train set

imputer.X.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
596,a,46.08,3.0,u,g,c,v,2.375,t,t,8,t,g,396.0,4159
303,a,15.92,2.875,u,g,q,v,0.085,f,f,0,f,g,120.0,0
204,b,36.33,2.125,y,p,w,v,0.085,t,t,1,f,g,50.0,1187
351,b,22.17,0.585,y,p,ff,ff,0.0,f,f,0,f,g,100.0,0
118,b,57.83,7.04,u,g,m,v,14.0,t,t,6,t,g,360.0,1332


In [13]:
# transform the data - replace the missing values

X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)



In [14]:
# check that null values were replaced
X_train.isnull().mean()

A1     0.0
A2     0.0
A3     0.0
A4     0.0
A5     0.0
A6     0.0
A7     0.0
A8     0.0
A9     0.0
A10    0.0
A11    0.0
A12    0.0
A13    0.0
A14    0.0
A15    0.0
dtype: float64

## Random Sampling seeding on variable values

In [15]:
# let's separate into training and testing set

X_train, X_test, y_train, y_test = train_test_split(
    data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

In [16]:
imputer_obs = RandomSampleImputer(random_state=['A8', 'A3'], seed='observation', seeding_method='add')

In [17]:
imputer_obs.fit(X_train)

RandomSampleImputer(random_state=['A8', 'A3'], seed='observation',
                    seeding_method='add',
                    variables=['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8',
                               'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15'])

In [18]:
X_train_tt = imputer_obs.transform(X_train)
X_test_tt = imputer_obs.transform(X_test)

# Adding a Missing Value Indicator Variable

In [2]:
# load data
data = pd.read_csv('creditApprovalUCI.csv')
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,,u,g,q,h,,,,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [3]:
# let's separate into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(
    data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

X_train.shape, X_test.shape

((483, 15), (207, 15))

In [4]:
# find the percentage of missing data within those variables

X_train.isnull().mean()

A1     0.008282
A2     0.022774
A3     0.140787
A4     0.008282
A5     0.008282
A6     0.008282
A7     0.008282
A8     0.140787
A9     0.140787
A10    0.140787
A11    0.000000
A12    0.000000
A13    0.000000
A14    0.014493
A15    0.000000
dtype: float64

## Add missing indicator with pandas

In [5]:
# add missing indicator

for var in ['A1', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8']:

    X_train[var+'_NA'] = np.where(X_train[var].isnull(), 1, 0)
    X_test[var+'_NA'] = np.where(X_test[var].isnull(), 1, 0)

    
# check the new missing indicator variables
X_train.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,A13,A14,A15,A1_NA,A3_NA,A4_NA,A5_NA,A6_NA,A7_NA,A8_NA
596,a,46.08,3.0,u,g,c,v,2.375,t,t,...,g,396.0,4159,0,0,0,0,0,0,0
303,a,15.92,2.875,u,g,q,v,0.085,f,f,...,g,120.0,0,0,0,0,0,0,0,0
204,b,36.33,2.125,y,p,w,v,0.085,t,t,...,g,50.0,1187,0,0,0,0,0,0,0
351,b,22.17,0.585,y,p,ff,ff,0.0,f,f,...,g,100.0,0,0,0,0,0,0,0,0
118,b,57.83,7.04,u,g,m,v,14.0,t,t,...,g,360.0,1332,0,0,0,0,0,0,0


In [6]:
# the mean of the missing indicator should be the same as the 
# percentage of missing values in the original variable

X_train['A3'].isnull().mean(), X_train['A3_NA'].mean()

(0.14078674948240166, 0.14078674948240166)

## Adding missing indicator with Feature Engine

In [7]:
# let's separate into training and testing set

X_train, X_test, y_train, y_test = train_test_split(
    data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

In [8]:
# let's create a random sample imputer

imputer = AddNaNBinaryImputer()

imputer.fit(X_train)

AddNaNBinaryImputer(variables=['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8',
                               'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15'])

In [9]:
# transform the data - replace the missing values

X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)



In [10]:
# check that null values were replaced
X_train.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,A6_na,A7_na,A8_na,A9_na,A10_na,A11_na,A12_na,A13_na,A14_na,A15_na
596,a,46.08,3.0,u,g,c,v,2.375,t,t,...,0,0,0,0,0,0,0,0,0,0
303,a,15.92,2.875,u,g,q,v,0.085,f,f,...,0,0,0,0,0,0,0,0,0,0
204,b,36.33,2.125,y,p,w,v,0.085,t,t,...,0,0,0,0,0,0,0,0,0,0
351,b,22.17,0.585,y,p,ff,ff,0.0,f,f,...,0,0,0,0,0,0,0,0,0,0
118,b,57.83,7.04,u,g,m,v,14.0,t,t,...,0,0,0,0,0,0,0,0,0,0


## Adding missing indicator with Scikit-learn

In [11]:
import pandas as pd
from sklearn.impute import MissingIndicator
from sklearn.model_selection import train_test_split

In [12]:
data = pd.read_csv('creditApprovalUCI.csv')

In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

In [14]:
indicator = MissingIndicator(error_on_new=True, features='missing-only')
indicator.fit(X_train)  

MissingIndicator(error_on_new=True, features='missing-only', missing_values=nan,
                 sparse='auto')

In [15]:
# we can see the features with na:
# the result shows the column index in the NumPy array

indicator.features_

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 13], dtype=int64)

In [16]:
# with Sklearn we need to join the missing indicators dataframe
# to the original X_train

# let's create a column name for each of the new MissingIndicators
indicator_cols = [c+'_NA' for c in X_train.columns[indicator.features_]]

# and now let's concatenate the original dataset with the missing indicators
X_train = pd.concat([
    X_train.reset_index(),
    pd.DataFrame(indicator.transform(X_train), columns = indicator_cols)],
    axis=1)

X_train.head()

Unnamed: 0,index,A1,A2,A3,A4,A5,A6,A7,A8,A9,...,A2_NA,A3_NA,A4_NA,A5_NA,A6_NA,A7_NA,A8_NA,A9_NA,A10_NA,A14_NA
0,596,a,46.08,3.0,u,g,c,v,2.375,t,...,False,False,False,False,False,False,False,False,False,False
1,303,a,15.92,2.875,u,g,q,v,0.085,f,...,False,False,False,False,False,False,False,False,False,False
2,204,b,36.33,2.125,y,p,w,v,0.085,t,...,False,False,False,False,False,False,False,False,False,False
3,351,b,22.17,0.585,y,p,ff,ff,0.0,f,...,False,False,False,False,False,False,False,False,False,False
4,118,b,57.83,7.04,u,g,m,v,14.0,t,...,False,False,False,False,False,False,False,False,False,False
