# Tutorial: Data preprocessing

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

## Load UCI Data
UCI Adult data are going to be used in this tutorial. The data contain both numerical and categorical features, and the target to predict is whether a person makes over 50K a year. It is a very simple binary classification task.

In [6]:
train = pd.read_csv('../datasets/adult_train.csv')
test = pd.read_csv('../datasets/adult_test.csv')

In [7]:
train.head()

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education_Num,Martial_Status,Occupation,Relationship,Race,Sex,Capital_Gain,Capital_Loss,Hours_per_week,Country,Target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [8]:
test.head()

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education_Num,Martial_Status,Occupation,Relationship,Race,Sex,Capital_Gain,Capital_Loss,Hours_per_week,Country,Target
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [6]:
target_dict = {
    ' <=50K': 0,
    ' >50K': 1
}

## Encode categorical and numerical features
For categorical features, label encoding will be applied. For numerical features, rank-gauss transform will be applied.

In [7]:
from xfeat import SelectCategorical, SelectNumerical
from kuma_utils.preprocessing.xfeat import Pipeline, TargetEncoder
from kuma_utils.preprocessing import DistTransformer
from category_encoders import OrdinalEncoder

In [8]:
num_enc = Pipeline([
    SelectNumerical(),
    DistTransformer('rankgauss')
])

cat_enc = Pipeline([
    SelectCategorical(exclude_cols=['Target']),
    OrdinalEncoder(handle_missing='return_nan')
], target_col='Target')

In [9]:
train_cat, train_num = cat_enc.fit_transform(train).astype(int, errors='ignore'), num_enc.fit_transform(train).astype(float)
categorical_index = list(range(train_cat.shape[1]))
train_encoded = pd.concat([train_cat, train_num], axis=1) 
train_encoded.head()

Unnamed: 0,Workclass,Education,Martial_Status,Occupation,Relationship,Race,Sex,Country,Age,fnlwgt,Education_Num,Capital_Gain,Capital_Loss,Hours_per_week
0,1.0,1,1,1.0,1,1,1,1.0,0.126243,-1.173177,0.972124,1.423976,-5.199338,-0.070036
1,2.0,1,2,2.0,2,1,1,1.0,0.814719,-1.116616,0.972124,-5.199338,-5.199338,-1.901646
2,3.0,2,3,3.0,1,1,1,1.0,0.062707,0.485916,-0.547956,-5.199338,-5.199338,-0.070036
3,3.0,3,2,3.0,2,2,1,1.0,1.001437,0.65271,-1.285763,-5.199338,-5.199338,-0.070036
4,3.0,1,2,4.0,3,2,2,2.0,-0.643497,1.346146,0.972124,-5.199338,-5.199338,-0.070036


In [10]:
test_encoded = pd.concat([
    cat_enc.transform(test).astype(int, errors='ignore'), 
    num_enc.transform(test).astype(float)], axis=1)
test_encoded.head()

Unnamed: 0,Workclass,Education,Martial_Status,Occupation,Relationship,Race,Sex,Country,Age,fnlwgt,Education_Num,Capital_Gain,Capital_Loss,Hours_per_week
0,3.0,3,1,10.0,4,2,1,1.0,-0.900346,0.587223,-1.285763,-5.199338,-5.199338,-0.070036
1,3.0,2,2,9.0,2,1,1,1.0,0.062707,-1.052221,-0.547956,-5.199338,-5.199338,1.014616
2,5.0,7,2,13.0,2,1,1,1.0,-0.643497,1.334709,0.630671,-5.199338,-5.199338,-0.070036
3,3.0,6,2,10.0,2,2,1,1.0,0.441763,-0.233917,0.163861,1.871975,-5.199338,-0.070036
4,,6,1,,4,1,2,1.0,-2.042277,-0.88379,0.163861,-5.199338,-5.199338,-1.065046


## Drop data points 

In [11]:
def random_drop(df, ratio=0.1):
    for col in df.columns:
        if col in 'Target':
            continue
        drop_num = int(df.shape[0] * ratio)
        drop_idx = np.random.choice(np.arange(df.shape[0]), drop_num)
        df[col].iloc[drop_idx] = np.nan

In [12]:
random_drop(train_encoded, 0.2)
random_drop(test_encoded, 0.2)

## Regression imputer and Simple imputer

In [13]:
from kuma_utils.preprocessing.imputer import LGBMImputer
from sklearn.impute import SimpleImputer

In [14]:
imputer = LGBMImputer(cat_features=categorical_index, n_iter=200, verbose=True)
train_encoded2 = imputer.fit_transform(train_encoded)
test_encoded2 = imputer.transform(test_encoded)

Workclass:	multiclass...iter43/200
Education:	multiclass...iter200/200
Martial_Status:	multiclass...iter194/200
Occupation:	multiclass...iter153/200
Relationship:	multiclass...iter200/200
Race:	multiclass...iter200/200
Sex:	binary...iter200/200
Country:	multiclass...iter4/200
Age:	regression...iter200/200
fnlwgt:	regression...iter200/200
Education_Num:	regression...iter200/200
Capital_Gain:	regression...iter200/200
Capital_Loss:	regression...iter200/200
Hours_per_week:	regression...iter200/200


In [15]:
simple_imputer = SimpleImputer(strategy='most_frequent')
simple_imputer.fit(train_encoded)
train_encoded3 = pd.DataFrame(simple_imputer.transform(train_encoded), columns=train_encoded.columns)
test_encoded3 = pd.DataFrame(simple_imputer.transform(test_encoded), columns=test_encoded.columns)

In [16]:
train_encoded2.info(), test_encoded2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Workclass       32561 non-null  int64  
 1   Education       32561 non-null  int64  
 2   Martial_Status  32561 non-null  int64  
 3   Occupation      32561 non-null  int64  
 4   Relationship    32561 non-null  int64  
 5   Race            32561 non-null  int64  
 6   Sex             32561 non-null  int64  
 7   Country         32561 non-null  int64  
 8   Age             32561 non-null  float64
 9   fnlwgt          32561 non-null  float64
 10  Education_Num   32561 non-null  float64
 11  Capital_Gain    32561 non-null  float64
 12  Capital_Loss    32561 non-null  float64
 13  Hours_per_week  32561 non-null  float64
dtypes: float64(6), int64(8)
memory usage: 3.5 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16281 entries, 0 to 16280
Data columns (total 14 columns):
 #   Column 

(None, None)

## Compare performance

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from kuma_utils.training import CrossValidator
from kuma_utils.metrics import AUC

In [18]:
labels = train.Target.replace(target_dict)
test_labels = test.Target.replace(target_dict)

In [19]:
cv0 = CrossValidator(LogisticRegression)
cv0.train(
    data=(train_encoded3, labels),
    folds=StratifiedKFold(n_splits=5, random_state=0)
)
AUC()(test_labels, np.stack(cv0.smart_predict(test_encoded3)).mean(0))

Logger created at 20/11/26:04:41:09
04:41:09 [cv0] Starting fold 0
eval_metric automatically selected.
04:41:09 [None]	best score is 0.803895
04:41:09 [cv0] Fold 0: eval=0.803895 (iter=None)
04:41:09 [cv0] Starting fold 1
eval_metric automatically selected.
04:41:09 [None]	best score is 0.811358
04:41:09 [cv0] Fold 1: eval=0.811358 (iter=None)
04:41:09 [cv0] Starting fold 2
eval_metric automatically selected.
04:41:09 [None]	best score is 0.819390
04:41:09 [cv0] Fold 2: eval=0.819390 (iter=None)
04:41:09 [cv0] Starting fold 3
eval_metric automatically selected.
04:41:09 [None]	best score is 0.814046
04:41:09 [cv0] Fold 3: eval=0.814046 (iter=None)
04:41:09 [cv0] Starting fold 4
eval_metric automatically selected.
04:41:10 [None]	best score is 0.823891
04:41:10 [cv0] Fold 4: eval=0.823891 (iter=None)
04:41:10 [cv0] Overall metric: 0.814516 + 0.006851


0.8111444304977667

In [20]:
cv1 = CrossValidator(LogisticRegression)
cv1.train(
    data=(train_encoded2, labels),
    folds=StratifiedKFold(n_splits=5, random_state=0)
)
AUC()(test_labels, np.stack(cv1.smart_predict(test_encoded2)).mean(0))

Logger created at 20/11/26:04:41:10
04:41:10 [cv0] Starting fold 0
eval_metric automatically selected.
04:41:10 [None]	best score is 0.822656
04:41:10 [cv0] Fold 0: eval=0.822656 (iter=None)
04:41:10 [cv0] Starting fold 1
eval_metric automatically selected.
04:41:10 [None]	best score is 0.828035
04:41:10 [cv0] Fold 1: eval=0.828035 (iter=None)
04:41:10 [cv0] Starting fold 2
eval_metric automatically selected.
04:41:10 [None]	best score is 0.834526
04:41:10 [cv0] Fold 2: eval=0.834526 (iter=None)
04:41:10 [cv0] Starting fold 3
eval_metric automatically selected.
04:41:10 [None]	best score is 0.837332
04:41:10 [cv0] Fold 3: eval=0.837332 (iter=None)
04:41:10 [cv0] Starting fold 4
eval_metric automatically selected.
04:41:10 [None]	best score is 0.841207
04:41:10 [cv0] Fold 4: eval=0.841207 (iter=None)
04:41:10 [cv0] Overall metric: 0.832751 + 0.006630


0.8319408297039562

**LGBMImputer is better than SimpleImputer**