In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer 
from feature_engine.imputation import MeanMedianImputer, CategoricalImputer

Prepairing data


In [2]:
raw_data = pd.read_csv('raw_data/crx.data', header=None)

In [5]:
varnames = ['A'+str(s) for s in range(1, 17)]
raw_data.columns = varnames

In [9]:
raw_data = raw_data.replace('?', np.nan)

In [10]:
raw_data['A2'] = raw_data['A2'].astype('float')
raw_data['A14'] = raw_data['A14'].astype('float')

In [11]:
raw_data['A16'] = raw_data['A16'].map({'+': 1, '-': 0})

In [13]:
np.random.seed(9001)
values = set([np.random.randint(0, len(raw_data)) for p in range(0, 100)])
for var in ['A3', 'A8', 'A9', 'A10']:
    raw_data.loc[values, var] = np.nan

  raw_data.loc[values, var] = np.nan
  raw_data.loc[values, var] = np.nan
  raw_data.loc[values, var] = np.nan
  raw_data.loc[values, var] = np.nan


In [14]:
raw_data.to_csv('creditApprovalUCI.csv', index=False)

Task 1: Removing observations with missing data

In [22]:
df = pd.read_csv('data/creditApprovalUCI.csv')

In [20]:
df.isna().mean().sort_values(ascending=True)

A11    0.000000
A12    0.000000
A13    0.000000
A15    0.000000
A16    0.000000
A4     0.008696
A5     0.008696
A6     0.013043
A7     0.013043
A1     0.017391
A2     0.017391
A14    0.018841
A3     0.136232
A8     0.136232
A9     0.136232
A10    0.136232
dtype: float64

In [21]:
df = df.dropna()

Task 2: Performing mean or median imputation

In [23]:
df = pd.read_csv('data/creditApprovalUCI.csv')

In [29]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('A16', axis=1), df['A16'], test_size=0.3, random_state=42)

In [30]:
X_train.isnull().mean()

A1     0.018634
A2     0.012422
A3     0.122153
A4     0.012422
A5     0.012422
A6     0.014493
A7     0.014493
A8     0.122153
A9     0.122153
A10    0.122153
A11    0.000000
A12    0.000000
A13    0.000000
A14    0.024845
A15    0.000000
dtype: float64

Task 3: Capturing missing values in a bespoke category

In [33]:
df = pd.read_csv('data/creditApprovalUCI.csv')

In [34]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('A16', axis=1), df['A16'], test_size=0.3, random_state=42)

In [35]:
imp = SimpleImputer(strategy='constant', fill_value='Missing')

In [36]:
imp.fit(X_train)

In [37]:
imp.transform(X_test)

array([['a', 'Missing', 1.5, ..., 'g', 200.0, 105],
       ['a', 46.0, 'Missing', ..., 'g', 100.0, 960],
       ['b', 20.0, 0.0, ..., 'g', 144.0, 0],
       ...,
       ['b', 29.92, 1.835, ..., 'g', 260.0, 200],
       ['a', 37.33, 2.5, ..., 'g', 260.0, 246],
       ['a', 22.83, 2.29, ..., 'g', 140.0, 2384]], dtype=object)