In [7]:
!pip install feature-engine -q

In [8]:
import pandas as pd

# to split the data sets:
from sklearn.model_selection import train_test_split

# to impute missing data with sklearn:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# to impute missing data with Feature-engine:
from feature_engine.imputation import MeanMedianImputer

In [9]:
!git clone https://github.com/taipeihugo/Feature-Engineering.git -q

fatal: destination path 'Feature-Engineering' already exists and is not an empty directory.


In [10]:
# Read the credit-approval dataset from the cloned repository and preview
# its first rows.
csv_path = "Feature-Engineering/credit_approval_uci.csv"
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,target
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [11]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the
# partition reproducible.
predictors = data.drop(columns="target")
X_train, X_test, y_train, y_test = train_test_split(
    predictors, data["target"], test_size=0.3, random_state=0
)

# (rows, columns) of each partition:
X_train.shape, X_test.shape

((483, 15), (207, 15))

In [12]:
# Number of missing values in each variable of the train set:
X_train.isna().sum()

Unnamed: 0,0
A1,0
A2,0
A3,0
A4,0
A5,0
A6,0
A7,0
A8,0
A9,0
A10,0


In [13]:
# Fraction of missing values in each variable of the train set
# (mean of the boolean missing-value mask):

X_train.isna().mean()

Unnamed: 0,0
A1,0.0
A2,0.0
A3,0.0
A4,0.0
A5,0.0
A6,0.0
A7,0.0
A8,0.0
A9,0.0
A10,0.0


In [14]:
# Capture the names of the numerical variables.
# Selecting with include="number" (rather than excluding object columns)
# keeps boolean, datetime, categorical or string-dtype columns out of the
# list, so only variables for which a median can be computed are returned.
numeric_vars = X_train.select_dtypes(include="number").columns.to_list()
numeric_vars

['A2', 'A3', 'A8', 'A11', 'A14', 'A15']

In [15]:
# Compute the median of every numerical variable in the train set and keep
# the result as a {variable: median} dictionary:

numeric_medians = X_train[numeric_vars].median()
median_values = numeric_medians.to_dict()

median_values

{'A2': 28.58, 'A3': 3.0, 'A8': 1.0, 'A11': 0.0, 'A14': 152.0, 'A15': 6.0}

In [16]:
# Fill the gaps in both sets using the medians stored in `median_values`:

X_train, X_test = (
    split.fillna(value=median_values) for split in (X_train, X_test)
)

In [17]:
# Confirm that no numerical variable of the train set has missing values left:
X_train[numeric_vars].isna().sum()

Unnamed: 0,0
A2,0
A3,0
A8,0
A11,0
A14,0
A15,0


In [18]:
# Confirm that no numerical variable of the test set has missing values left:
X_test[numeric_vars].isna().sum()

Unnamed: 0,0
A2,0
A3,0
A8,0
A11,0
A14,0
A15,0


In [19]:
# Using Scikit-learn
# Split the data again (same 70/30 ratio and seed) to start from
# non-imputed train and test sets:

features, target = data.drop(columns="target"), data["target"]
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, random_state=0
)

In [20]:
# Display the list of numerical variables built earlier; it is reused below.
numeric_vars

['A2', 'A3', 'A8', 'A11', 'A14', 'A15']

In [21]:
# Collect, in their original column order, the variables that are not in
# `numeric_vars`:

remaining_vars = list(
    filter(lambda name: name not in numeric_vars, X_train.columns)
)

remaining_vars

['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']

In [22]:
# Median imputer from scikit-learn:
imputer = SimpleImputer(strategy="median")

# Apply the imputer to the numerical variables only; all other columns are
# passed through untouched:
ct = ColumnTransformer(
    transformers=[("imputer", imputer, numeric_vars)],
    remainder="passthrough",
)

# Learn the median of each numerical variable from the train set:
ct.fit(X_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [23]:
# Medians learned by the fitted imputer, one per numerical variable
# (same order as `numeric_vars`):

ct.named_transformers_["imputer"].statistics_

array([ 28.58,   3.  ,   1.  ,   0.  , 152.  ,   6.  ])

In [24]:
# Impute both sets with the fitted transformer; the result of each call is a
# NumPy array rather than a dataframe:

X_train, X_test = ct.transform(X_train), ct.transform(X_test)

X_train

array([[46.08, 3.0, 2.375, ..., 't', 't', 'g'],
       [15.92, 2.875, 0.085, ..., 'f', 'f', 'g'],
       [36.33, 2.125, 0.085, ..., 't', 'f', 'g'],
       ...,
       [19.58, 0.665, 1.665, ..., 'f', 'f', 'g'],
       [22.83, 2.29, 2.29, ..., 't', 't', 'g'],
       [40.58, 3.29, 3.5, ..., 'f', 't', 's']], dtype=object)

In [25]:
# Convert the returned array to a pandas dataframe.
# The transformer outputs the imputed columns first and the passthrough
# columns after them, hence the `numeric_vars + remaining_vars` order.
# Note: the rows are relabelled 0..n-1; the original row index is not restored.

X_train = pd.DataFrame(
    X_train,
    columns=numeric_vars + remaining_vars,
)

# The array mixes numbers and strings, so it has dtype=object and every
# column of the new dataframe would be stored as object. Cast the imputed
# variables back to float so they behave as numerical columns again:
X_train = X_train.astype({var: "float64" for var in numeric_vars})

X_train.head()

Unnamed: 0,A2,A3,A8,A11,A14,A15,A1,A4,A5,A6,A7,A9,A10,A12,A13
0,46.08,3.0,2.375,8.0,396.0,4159.0,a,u,g,c,v,t,t,t,g
1,15.92,2.875,0.085,0.0,120.0,0.0,a,u,g,q,v,f,f,f,g
2,36.33,2.125,0.085,1.0,50.0,1187.0,b,y,p,w,v,t,t,f,g
3,22.17,0.585,0.0,0.0,100.0,0.0,b,y,p,ff,ff,f,f,f,g
4,57.83,7.04,14.0,6.0,360.0,1332.0,b,u,g,m,v,t,t,t,g


In [26]:
# Confirm that the numerical variables of the train set have no missing
# values left:

X_train[numeric_vars].isna().sum()

Unnamed: 0,0
A2,0
A3,0
A8,0
A11,0
A14,0
A15,0


In [27]:
# Convert the returned array to a pandas dataframe.
# The transformer outputs the imputed columns first and the passthrough
# columns after them, hence the `numeric_vars + remaining_vars` order.
# Note: the rows are relabelled 0..n-1; the original row index is not restored.

X_test = pd.DataFrame(
    X_test,
    columns=numeric_vars + remaining_vars,
)

# The array mixes numbers and strings, so it has dtype=object and every
# column of the new dataframe would be stored as object. Cast the imputed
# variables back to float so they behave as numerical columns again:
X_test = X_test.astype({var: "float64" for var in numeric_vars})

X_test.head()

Unnamed: 0,A2,A3,A8,A11,A14,A15,A1,A4,A5,A6,A7,A9,A10,A12,A13
0,45.83,10.5,5.0,7.0,0.0,0.0,a,u,g,q,v,t,t,t,g
1,64.08,20.0,17.5,9.0,0.0,1000.0,b,u,g,x,h,t,t,t,g
2,31.25,3.75,0.625,9.0,181.0,0.0,a,u,g,cc,h,t,t,t,g
3,39.25,9.5,6.5,14.0,240.0,4607.0,b,u,g,m,v,t,t,f,g
4,26.17,2.0,0.0,0.0,276.0,1.0,a,u,g,j,j,f,f,t,g


In [28]:
# Confirm that the numerical variables of the test set have no missing
# values left:

X_test[numeric_vars].isna().sum()

Unnamed: 0,0
A2,0
A3,0
A8,0
A11,0
A14,0
A15,0


In [29]:
# Using Feature-engine
# Split the data once more (same 70/30 ratio and seed) to start from
# non-imputed train and test sets:

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=["target"]),
    data.loc[:, "target"],
    random_state=0,
    test_size=0.3,
)

In [30]:
# Feature-engine imputer that replaces missing data in the numerical
# variables with their median:

imputer = MeanMedianImputer(variables=numeric_vars, imputation_method="median")

# Learn the medians from the train set:
imputer.fit(X_train)

In [31]:
# The median learned for each variable, exposed by the fitted imputer
# as a {variable: median} dictionary:

imputer.imputer_dict_

{'A2': 28.58, 'A3': 3.0, 'A8': 1.0, 'A11': 0.0, 'A14': 152.0, 'A15': 6.0}

In [32]:
# Impute both sets with the fitted Feature-engine imputer:

X_train, X_test = imputer.transform(X_train), imputer.transform(X_test)

In [33]:
# Confirm that the numerical variables of the train set have no missing
# values left:

X_train[numeric_vars].isna().sum()

Unnamed: 0,0
A2,0
A3,0
A8,0
A11,0
A14,0
A15,0


In [34]:
# Confirm that the numerical variables of the test set have no missing
# values left:

X_test[numeric_vars].isna().sum()

Unnamed: 0,0
A2,0
A3,0
A8,0
A11,0
A14,0
A15,0
