# Tutorial: Data preprocessing

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

In [3]:
import sys
sys.path.append('../pyscript/') #('')の中に探索元のpathを書く

## Load UCI Data
UCI Adult data are going to be used in this tutorial. The data contain both numerical and categorical features, and the target to predict is whether a person makes over 50K a year. It is a very simple binary classification task.

In [4]:
train = pd.read_csv('../datasets/adult_train.csv')
test = pd.read_csv('../datasets/adult_test.csv')

In [5]:
# columnのspace削除
train.columns = list(map(str.strip, train.columns.tolist()))
test.columns = list(map(str.strip, test.columns.tolist()))

# check
train.columns

Index(['Age', 'Workclass', 'fnlwgt', 'Education', 'Education_Num',
       'Martial_Status', 'Occupation', 'Relationship', 'Race', 'Sex',
       'Capital_Gain', 'Capital_Loss', 'Hours_per_week', 'Country', 'Target'],
      dtype='object')

In [6]:
train.head()

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education_Num,Martial_Status,Occupation,Relationship,Race,Sex,Capital_Gain,Capital_Loss,Hours_per_week,Country,Target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [7]:
test.head()

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education_Num,Martial_Status,Occupation,Relationship,Race,Sex,Capital_Gain,Capital_Loss,Hours_per_week,Country,Target
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


## Encode categorical and numerical features
For categorical features, label encoding will be applied. For numerical features, rank-gauss transform will be applied.

In [8]:
from xfeat import SelectCategorical, SelectNumerical
from preprocessing.xfeat import Pipeline, TargetEncoder
from preprocessing import DistTransformer
from category_encoders import OrdinalEncoder

In [9]:
num_enc = Pipeline([
    SelectNumerical(),
    DistTransformer('rankgauss')
])

cat_enc = Pipeline([
    SelectCategorical(exclude_cols=['Target']),
    OrdinalEncoder(handle_missing='return_nan')
], target_col='Target')

In [10]:
train_cat, train_num = cat_enc.fit_transform(train).astype(int, errors='ignore'), num_enc.fit_transform(train).astype(float)
train_encoded = pd.concat([train_cat, train_num], axis=1) 
train_encoded.head()

Unnamed: 0,Workclass,Education,Martial_Status,Occupation,Relationship,Race,Sex,Country,Age,fnlwgt,Education_Num,Capital_Gain,Capital_Loss,Hours_per_week
0,1,1,1,1,1,1,1,1,0.126243,-1.173177,0.972124,1.423976,-5.199338,-0.070036
1,2,1,2,2,2,1,1,1,0.814719,-1.116616,0.972124,-5.199338,-5.199338,-1.901646
2,3,2,3,3,1,1,1,1,0.062707,0.485916,-0.547956,-5.199338,-5.199338,-0.070036
3,3,3,2,3,2,2,1,1,1.001437,0.65271,-1.285763,-5.199338,-5.199338,-0.070036
4,3,1,2,4,3,2,2,2,-0.643497,1.346146,0.972124,-5.199338,-5.199338,-0.070036


In [11]:
test_encoded = pd.concat([
    cat_enc.transform(test).astype(int, errors='ignore'), 
    num_enc.transform(test).astype(float)], axis=1)
test_encoded.head()

Unnamed: 0,Workclass,Education,Martial_Status,Occupation,Relationship,Race,Sex,Country,Age,fnlwgt,Education_Num,Capital_Gain,Capital_Loss,Hours_per_week
0,3,3,1,10,4,2,1,1,-0.900346,0.587223,-1.285763,-5.199338,-5.199338,-0.070036
1,3,2,2,9,2,1,1,1,0.062707,-1.052221,-0.547956,-5.199338,-5.199338,1.014616
2,5,7,2,13,2,1,1,1,-0.643497,1.334709,0.630671,-5.199338,-5.199338,-0.070036
3,3,6,2,10,2,2,1,1,0.441763,-0.233917,0.163861,1.871975,-5.199338,-0.070036
4,6,6,1,12,4,1,2,1,-2.042277,-0.88379,0.163861,-5.199338,-5.199338,-1.065046


## Drop data points 

In [12]:
# extract a part data 
def random_drop(df, ratio=0.1):
    for col in df.columns:
        if col in 'Target':
            continue
        drop_num = int(df.shape[0] * ratio)
        drop_idx = np.random.choice(np.arange(df.shape[0]), drop_num)
        df[col].iloc[drop_idx] = np.nan

In [13]:
random_drop(train_encoded, 0.2)
random_drop(test_encoded, 0.2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [14]:
train_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Workclass       26661 non-null  float64
 1   Education       26667 non-null  float64
 2   Martial_Status  26651 non-null  float64
 3   Occupation      26656 non-null  float64
 4   Relationship    26638 non-null  float64
 5   Race            26634 non-null  float64
 6   Sex             26672 non-null  float64
 7   Country         26639 non-null  float64
 8   Age             26680 non-null  float64
 9   fnlwgt          26642 non-null  float64
 10  Education_Num   26679 non-null  float64
 11  Capital_Gain    26678 non-null  float64
 12  Capital_Loss    26665 non-null  float64
 13  Hours_per_week  26652 non-null  float64
dtypes: float64(14)
memory usage: 3.5 MB


## Regression imputer and Simple imputer

In [15]:
# category column index
categorical_index = list(range(train_cat.shape[1])) 
# target columnのboolean化
target_dict = {
    ' <=50K': 0,
    ' >50K': 1
}

In [16]:
categorical_index

[0, 1, 2, 3, 4, 5, 6, 7]

In [17]:
from preprocessing.imputer import LGBMImputer
from sklearn.impute import SimpleImputer

In [18]:
imputer = LGBMImputer(cat_features=categorical_index, n_iter=200, verbose=True)
train_encoded2 = imputer.fit_transform(train_encoded)
test_encoded2 = imputer.transform(test_encoded)

Workclass:	multiclass...iter30/200
Education:	multiclass...iter200/200
Martial_Status:	multiclass...iter200/200
Occupation:	multiclass...iter153/200
Relationship:	multiclass...iter200/200
Race:	multiclass...iter200/200
Sex:	binary...iter200/200
Country:	multiclass...iter2/200
Age:	regression...iter200/200
fnlwgt:	regression...iter200/200
Education_Num:	regression...iter200/200
Capital_Gain:	regression...iter200/200
Capital_Loss:	regression...iter200/200
Hours_per_week:	regression...iter200/200


In [19]:
train_encoded2.info(), test_encoded2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Workclass       32561 non-null  int64  
 1   Education       32561 non-null  int64  
 2   Martial_Status  32561 non-null  int64  
 3   Occupation      32561 non-null  int64  
 4   Relationship    32561 non-null  int64  
 5   Race            32561 non-null  int64  
 6   Sex             32561 non-null  int64  
 7   Country         32561 non-null  int64  
 8   Age             32561 non-null  float64
 9   fnlwgt          32561 non-null  float64
 10  Education_Num   32561 non-null  float64
 11  Capital_Gain    32561 non-null  float64
 12  Capital_Loss    32561 non-null  float64
 13  Hours_per_week  32561 non-null  float64
dtypes: float64(6), int64(8)
memory usage: 3.5 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16281 entries, 0 to 16280
Data columns (total 14 columns):
 #   Column 

(None, None)

In [20]:
train_encoded2.head()

Unnamed: 0,Workclass,Education,Martial_Status,Occupation,Relationship,Race,Sex,Country,Age,fnlwgt,Education_Num,Capital_Gain,Capital_Loss,Hours_per_week
0,1,1,1,1,1,1,2,1,0.126243,-1.173177,0.972124,1.423976,-5.199338,-0.072752
1,3,1,2,2,2,1,1,1,0.814719,-1.116616,0.002084,-3.437141,-5.199338,0.417627
2,3,2,3,3,1,1,1,1,0.062707,0.006254,-0.018733,-5.199338,-5.199338,0.306572
3,3,3,2,8,2,2,1,1,1.001437,0.144287,-1.285763,-5.199338,-4.655553,-0.070036
4,3,1,2,1,3,2,2,2,-0.643497,1.346146,0.003114,-5.199338,-5.199338,-0.070036


In [21]:
simple_imputer = SimpleImputer(strategy='most_frequent')
simple_imputer.fit(train_encoded)
train_encoded3 = pd.DataFrame(simple_imputer.transform(train_encoded), columns=train_encoded.columns)
test_encoded3 = pd.DataFrame(simple_imputer.transform(test_encoded), columns=test_encoded.columns)

In [22]:
train_encoded3.info(), test_encoded3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Workclass       32561 non-null  float64
 1   Education       32561 non-null  float64
 2   Martial_Status  32561 non-null  float64
 3   Occupation      32561 non-null  float64
 4   Relationship    32561 non-null  float64
 5   Race            32561 non-null  float64
 6   Sex             32561 non-null  float64
 7   Country         32561 non-null  float64
 8   Age             32561 non-null  float64
 9   fnlwgt          32561 non-null  float64
 10  Education_Num   32561 non-null  float64
 11  Capital_Gain    32561 non-null  float64
 12  Capital_Loss    32561 non-null  float64
 13  Hours_per_week  32561 non-null  float64
dtypes: float64(14)
memory usage: 3.5 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16281 entries, 0 to 16280
Data columns (total 14 columns):
 #   Column          

(None, None)

In [23]:
train_encoded3.head()

Unnamed: 0,Workclass,Education,Martial_Status,Occupation,Relationship,Race,Sex,Country,Age,fnlwgt,Education_Num,Capital_Gain,Capital_Loss,Hours_per_week
0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,0.126243,-1.173177,0.972124,1.423976,-5.199338,-0.070036
1,3.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,0.814719,-1.116616,-0.547956,-5.199338,-5.199338,-0.070036
2,3.0,2.0,3.0,3.0,1.0,1.0,1.0,1.0,0.062707,-0.565258,-0.547956,-5.199338,-5.199338,-0.070036
3,3.0,3.0,2.0,4.0,2.0,2.0,1.0,1.0,1.001437,-0.565258,-1.285763,-5.199338,-5.199338,-0.070036
4,3.0,1.0,2.0,4.0,3.0,2.0,2.0,2.0,-0.643497,1.346146,-0.547956,-5.199338,-5.199338,-0.070036


## Compare performance

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from training import CrossValidator
from metrics import AUC

In [25]:
labels = train.Target.replace(target_dict)
test_labels = test.Target.replace(target_dict)

In [26]:
cv0 = CrossValidator(LogisticRegression)
cv0.train(
    data=(train_encoded3, labels),
    folds=StratifiedKFold(n_splits=5, random_state=0)
)
AUC()(test_labels, np.stack(cv0.smart_predict(test_encoded3)).mean(0))



Logger created at 21/03/07:06:12:50
06:12:50 [cv0] Starting fold 0
eval_metric automatically selected.
06:12:50 [None]	best score is 0.808428
06:12:50 [cv0] Fold 0: eval=0.808428 (iter=None)
06:12:50 [cv0] Starting fold 1
eval_metric automatically selected.
06:12:50 [None]	best score is 0.811852
06:12:50 [cv0] Fold 1: eval=0.811852 (iter=None)
06:12:50 [cv0] Starting fold 2
eval_metric automatically selected.
06:12:51 [None]	best score is 0.817931
06:12:51 [cv0] Fold 2: eval=0.817931 (iter=None)
06:12:51 [cv0] Starting fold 3
eval_metric automatically selected.
06:12:51 [None]	best score is 0.817840
06:12:51 [cv0] Fold 3: eval=0.817840 (iter=None)
06:12:51 [cv0] Starting fold 4
eval_metric automatically selected.
06:12:51 [None]	best score is 0.825798
06:12:51 [cv0] Fold 4: eval=0.825798 (iter=None)
06:12:51 [cv0] Overall metric: 0.816370 + 0.005949


0.8139691345595118

In [27]:
cv1 = CrossValidator(LogisticRegression)
cv1.train(
    data=(train_encoded2, labels),
    folds=StratifiedKFold(n_splits=5, random_state=0)
)
AUC()(test_labels, np.stack(cv1.smart_predict(test_encoded2)).mean(0))

Logger created at 21/03/07:06:12:51
06:12:51 [cv0] Starting fold 0
eval_metric automatically selected.




06:12:52 [None]	best score is 0.825338
06:12:52 [cv0] Fold 0: eval=0.825338 (iter=None)
06:12:52 [cv0] Starting fold 1
eval_metric automatically selected.
06:12:52 [None]	best score is 0.828176
06:12:52 [cv0] Fold 1: eval=0.828176 (iter=None)
06:12:52 [cv0] Starting fold 2
eval_metric automatically selected.
06:12:52 [None]	best score is 0.837741
06:12:52 [cv0] Fold 2: eval=0.837741 (iter=None)
06:12:52 [cv0] Starting fold 3
eval_metric automatically selected.
06:12:52 [None]	best score is 0.835686
06:12:52 [cv0] Fold 3: eval=0.835686 (iter=None)
06:12:52 [cv0] Starting fold 4
eval_metric automatically selected.
06:12:53 [None]	best score is 0.841676
06:12:53 [cv0] Fold 4: eval=0.841676 (iter=None)
06:12:53 [cv0] Overall metric: 0.833723 + 0.006072


0.8326943371261187

**LGBMImputer is better than SimpleImputer**

# [WIP] Light gbm imputation のエラーに関する調査

In [28]:
cat_features=categorical_index
n_iter=200
verbose=True
X = train_encoded.copy()
early_stopping_rounds = int(n_iter/10)
output_X = X.copy()
n_features = X.shape[1]
if isinstance(X, pd.DataFrame):
    feature_names = X.columns.tolist()
else:
    feature_names = [f'f{i}' for i in range(n_features)]
    X = pd.DataFrame(X, columns=feature_names)
feature_with_missing = [col for col in feature_names if X[col].isnull().sum() > 0]
feature_with_missing
import lightgbm as lgb
from preprocessing.utils import analyze_column

In [143]:
for icol, col in enumerate(feature_with_missing):
    if icol in cat_features:
        nuni = X[col].dropna().nunique()
        if nuni == 2:
            params = {
                'objective': 'binary'
            }
        elif nuni > 2:
            params = {
                'objective': 'multiclass',
                'num_class': nuni + 1
            }
    else:  # automatic analyze column
        if analyze_column(X[col]) == 'numeric':
            params = {
                'objective': 'regression'
            }
        else:
            nuni = X[col].dropna().nunique()
            if nuni == 2:
                params = {
                    'objective': 'binary'
                }
            elif nuni > 2:
                params = {
                    'objective': 'multiclass',
                    'num_class': nuni + 1
                }
            else:
                print(f'column {col} has only one unique value.')
                continue

    params['verbosity'] = -1

    print(icol,col,params)

0 Workclass {'objective': 'multiclass', 'num_class': 10, 'verbosity': -1}
1 Education {'objective': 'multiclass', 'num_class': 17, 'verbosity': -1}
2 Martial_Status {'objective': 'multiclass', 'num_class': 8, 'verbosity': -1}
3 Occupation {'objective': 'multiclass', 'num_class': 16, 'verbosity': -1}
4 Relationship {'objective': 'multiclass', 'num_class': 7, 'verbosity': -1}
5 Race {'objective': 'multiclass', 'num_class': 6, 'verbosity': -1}
6 Sex {'objective': 'binary', 'verbosity': -1}
7 Country {'objective': 'multiclass', 'num_class': 43, 'verbosity': -1}
8 Age {'objective': 'regression', 'verbosity': -1}
9 fnlwgt {'objective': 'regression', 'verbosity': -1}
10 Education_Num {'objective': 'regression', 'verbosity': -1}
11 Capital_Gain {'objective': 'regression', 'verbosity': -1}
12 Capital_Loss {'objective': 'regression', 'verbosity': -1}
13 Hours_per_week {'objective': 'regression', 'verbosity': -1}


In [149]:
col = 'Country'
nuni = X[col].dropna().nunique()
if nuni == 2:
    params = {
        'objective': 'binary'
    }
elif nuni > 2:
    params = {
        'objective': 'multiclass',
        'num_class': nuni + 1,
        'deterministic': True,
        'use_missing': False ##これはなんぞ
    }
null_idx = X[col].isnull()
x_train = X.loc[~null_idx].drop(col, axis=1)
x_test = X.loc[null_idx].drop(col, axis=1)
y_offset = X[col].min()
y_train = X.loc[~null_idx, col].astype(int) - y_offset
dtrain = lgb.Dataset(
    data=x_train,
    label=y_train
)
model = lgb.train(
            params, dtrain, valid_sets=[dtrain],
            num_boost_round=n_iter,
            early_stopping_rounds=early_stopping_rounds,
            # verbose_eval=0,
        )

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 643
[LightGBM] [Info] Number of data points in the train set: 26662, number of used features: 13
[LightGBM] [Info] Start training from score -0.109570
[LightGBM] [Info] Start training from score -5.860261
[LightGBM] [Info] Start training from score -5.986302
[LightGBM] [Info] Start training from score -5.760178
[LightGBM] [Info] Start training from score -4.025577
[LightGBM] [Info] Start training from score -3.892045
[LightGBM] [Info] Start training from score -6.047860
[LightGBM] [Info] Start training from score -5.691185
[LightGBM] [Info] Start training from score -7.888410
[LightGBM] [Info] Start training from score -5.821547
[LightGBM] [Info] Start training from score -5.606027
[LightGBM] [Info] Start training from score -5.472496
[LightGBM] [Info] Start training from score -6.789797
[LightGBM] [Info] Start training from score -5.128