In [1]:
import numpy as np 
import pandas as pd 
import lightgbm as lgb
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, r2_score
%matplotlib inline 
from sklearn.model_selection import train_test_split
import warnings
from category_encoders import HashingEncoder, BinaryEncoder, WOEEncoder
warnings.filterwarnings(action='ignore')

In [21]:
def one_hot_encoder(data, nan_as_category = True):
    data = data.copy()
    original_columns = list(data.columns)
    categorical_columns = [col for col in data.columns \
                           if not pd.api.types.is_numeric_dtype(data[col].dtype)]
    for c in categorical_columns:
        if nan_as_category:
            data[c].fillna('NaN', inplace = True)
        values = list(data[c].unique())
        for v in values:
            data[str(c) + '_' + str(v)] = (data[c] == v).astype(np.uint8)
    data.drop(categorical_columns, axis = 1, inplace = True)
    data = data.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
    return data, [c for c in data.columns if c not in original_columns]

def lgbm_encoder(data):
    for c in data.columns:
        col_type = data[c].dtype
        if col_type == 'object' or col_type.name == 'category':
            data[c] = data[c].astype('category')
    return data

def target_encoder(data, y):
    data = data.copy()
    original_columns = list(data.columns)
    categorical_columns = [col for col in data.columns \
                           if not pd.api.types.is_numeric_dtype(data[col].dtype)]
    data['TARGET'] = y
    for c in categorical_columns:
        data[c] = data.groupby(c)["TARGET"].transform("mean")
    data = data.drop(['TARGET'], axis=1)
    return data

def hash_encoder(data, y):
    n_components = 100
    data = data.copy()
    original_columns = list(data.columns)
    categorical_columns = [col for col in data.columns \
                           if not pd.api.types.is_numeric_dtype(data[col].dtype)]
    hashing_enc = HashingEncoder(cols=categorical_columns, n_components=n_components).fit(data, y)
    X_train_hashing = hashing_enc.transform(data.reset_index(drop=True))
    return X_train_hashing


def binary_encoder(data):
    data = data.copy()
    original_columns = list(data.columns)
    categorical_columns = [col for col in data.columns \
                           if not pd.api.types.is_numeric_dtype(data[col].dtype)]
    be = BinaryEncoder(cols=categorical_columns).fit_transform(data)
    return be

def woe_encoder(data, y):
    data = data.copy()    
    original_columns = list(data.columns)
    categorical_columns = [col for col in data.columns \
                           if not pd.api.types.is_numeric_dtype(data[col].dtype)]
    woe_enc = WOEEncoder(cols=categorical_columns, random_state=17).fit(data, y)
    X_train_woe = woe_enc.transform(data.reset_index(drop=True))
    return X_train_woe

def label_encoder(data):
    data = data.copy()    
    original_columns = list(data.columns)
    categorical_columns = [col for col in data.columns \
                           if not pd.api.types.is_numeric_dtype(data[col].dtype)]
    for c in categorical_columns:
        data[c] = LabelEncoder().fit_transform(data[c])
    return data

def cat_data_load():
#    https://www.kaggle.com/competitions/cat-in-the-dat/code
    credit_path = './data/cat_data//'
    train = pd.read_csv(credit_path+'train.csv')
    y = train['target']
    x = train.drop(['id', 'target', 'nom_7','nom_8','nom_9'], axis=1)
    return x, y

def cat_data_small_load(n=10000):
#    https://www.kaggle.com/competitions/cat-in-the-dat/code
    credit_path = './data/cat_data//'
    train = pd.read_csv(credit_path+'train.csv').sample(n=n)
    y = train['target']
    x = train.drop(['id', 'target', 'nom_7','nom_8','nom_9'], axis=1)
    return x, y

In [3]:
def feature_importance(model, x):
    feat_imp = pd.Series(model.feature_importances_, index=x.columns)
    feat_imp.nlargest(30).plot(kind='barh', figsize=(8,10))
    
def credit_data_load():
#     https://www.kaggle.com/code/mlisovyi/beware-of-categorical-features-in-lgbm
    credit_path = './data/home-credit/'
    application_train = pd.read_csv(credit_path+'application_train.csv')

    y = application_train['TARGET']
    x = application_train.drop(['TARGET', 'SK_ID_CURR'], axis=1)
    return x, y

def aws_employee_access_data_load():
#    https://www.kaggle.com/datasets/winjia/amazoncom-employee-access-challenge
    credit_path = './data/amazon_employee_access/'
    train = pd.read_csv(credit_path+'train.csv')
    y = train['ACTION']
    x = train[['RESOURCE', 'MGR_ID', 'ROLE_FAMILY_DESC', 'ROLE_FAMILY', 'ROLE_CODE']]
    return x, y

def data_info(data):
    print(f'data shape : {data.shape}')
    category_info = data.select_dtypes('object').apply(pd.Series.nunique, axis = 0) 
    print('category counts')
    print(f'category total count : {sum(category_info)}')
    print(category_info)

def get_score(x, y):
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 40)
    fit_params={"early_stopping_rounds":10, 
            "eval_metric" : 'auc', 
            "eval_set" : [(X_test,y_test)],
            'eval_names': ['valid'],
            'verbose': 1000,
            'feature_name': 'auto', # that's actually the default
            'categorical_feature': 'auto' # that's actually the default
           }
    clf = lgb.LGBMClassifier(
                n_jobs = -1,
                n_estimators=10000,
                learning_rate=0.02,
                num_leaves=34,
                colsample_bytree=0.9497036,
                subsample=0.8715623,
                max_depth=8,
                reg_alpha=0.041545473,
                reg_lambda=0.0735294,
                min_split_gain=0.0222415,
                min_child_weight=39.3259775,
                silent=-1,
                verbose=-1, )
    clf.fit(X_train, y_train, **fit_params)
    pred = clf.predict(X_test)
    pred_p = clf.predict_proba(X_test)
    f1 = f1_score(y_test, pred)
    r2 = r2_score(y_test, pred_p[:,1])
    roc_auc = roc_auc_score(y_test, pred_p[:,1])
    return f1, r2, roc_auc

In [24]:
def categorical_score(x,y):
    encoder_results = []
    x = x.copy()
    data_info(x)
#     print('----------------------------------------')
    print('--------------- lgbm encoder ----------------')
    lgbm_x = lgbm_encoder(x.copy())
    result = get_score(lgbm_x, y)
    encoder_results.append(['lgbm encoder'] + list(result))
#     print(f' f1: {result[0]} , r2_score : {result[1]}, roc_auc : {result[2]}')
    print('------------- one hot encoder ----------------')
    one_hot_x, columns = one_hot_encoder(x)
    result =  get_score(one_hot_x, y)
    encoder_results.append(['one hot encoder'] +list(result))
#     print(f' f1: {result[0]} , r2_score : {result[1]}, roc_auc : {result[2]}')
    print('------------- label encoder ----------------')
    label_x = label_encoder(x)
    result =  get_score(label_x, y)
    encoder_results.append(['label encoder'] +list(result))
#     print(f' f1: {result[0]} , r2_score : {result[1]}, roc_auc : {result[2]}')
    print('------------- target encoder ----------------')
    target_x = target_encoder(x, y)
    result =  get_score(target_x, y)
    encoder_results.append(['target encoder'] + list(result))
#     print(f' f1: {result[0]} , r2_score : {result[1]}, roc_auc : {result[2]}')
    print('------------- hash encoder ----------------')
    target_x = hash_encoder(x, y)
    result =  get_score(target_x, y)
    encoder_results.append(['hash encoder'] + list(result))
#     print(f' f1: {result[0]} , r2_score : {result[1]}, roc_auc : {result[2]}')
    print('------------- binary encoder ----------------')
    target_x = binary_encoder(x)
    result =  get_score(target_x, y)
    encoder_results.append(['binary encoder'] + list(result))
#     print(f' f1: {result[0]} , r2_score : {result[1]}, roc_auc : {result[2]}')
    print('------------- woe encoder ----------------')
    target_x = woe_encoder(x, y)
    result =  get_score(target_x, y)
    encoder_results.append(['WOE encoder'] + list(result))
#     print(f' f1: {result[0]} , r2_score : {result[1]}, roc_auc : {result[2]}')
#     print('----------------------------------------')
#     print('----------------------------------------')
    score_df = pd.DataFrame(encoder_results, columns=['encoder','f1','r2_score','roc_auc'])
    return score_df


In [25]:
x, y = credit_data_load()
credit_score_df = categorical_score(x, y)
credit_score_df

data shape : (307511, 120)
category counts
category total count : 140
NAME_CONTRACT_TYPE             2
CODE_GENDER                    3
FLAG_OWN_CAR                   2
FLAG_OWN_REALTY                2
NAME_TYPE_SUITE                7
NAME_INCOME_TYPE               8
NAME_EDUCATION_TYPE            5
NAME_FAMILY_STATUS             6
NAME_HOUSING_TYPE              6
OCCUPATION_TYPE               18
WEEKDAY_APPR_PROCESS_START     7
ORGANIZATION_TYPE             58
FONDKAPREMONT_MODE             4
HOUSETYPE_MODE                 3
WALLSMATERIAL_MODE             7
EMERGENCYSTATE_MODE            2
dtype: int64
--------------- lgbm encoder ----------------
------------- one hot encoder ----------------
------------- label encoder ----------------
------------- target encoder ----------------
------------- hash encoder ----------------
------------- binary encoder ----------------
------------- woe encoder ----------------


Unnamed: 0,encoder,f1,r2_score,roc_auc
0,lgbm encoder,0.04088,0.092139,0.76372
1,one hot encoder,0.039399,0.092917,0.764628
2,label encoder,0.036435,0.092433,0.763903
3,target encoder,0.043124,0.093418,0.765467
4,hash encoder,0.039047,0.092487,0.764188
5,binary encoder,0.041181,0.092754,0.76505
6,WOE encoder,0.041991,0.093227,0.765465


In [26]:
x, y = aws_employee_access_data_load()
credit_score_df = categorical_score(x, y)
credit_score_df

data shape : (32769, 5)
category counts
category total count : 0
Series([], dtype: float64)
--------------- lgbm encoder ----------------
------------- one hot encoder ----------------
------------- label encoder ----------------
------------- target encoder ----------------
------------- hash encoder ----------------
------------- binary encoder ----------------
------------- woe encoder ----------------


Unnamed: 0,encoder,f1,r2_score,roc_auc
0,lgbm encoder,0.969487,0.072176,0.749605
1,one hot encoder,0.969487,0.072176,0.749605
2,label encoder,0.969487,0.072176,0.749605
3,target encoder,0.969487,0.072176,0.749605
4,hash encoder,0.969487,0.072176,0.749605
5,binary encoder,0.969487,0.072176,0.749605
6,WOE encoder,0.969487,0.072176,0.749605


In [27]:
x, y = cat_data_load()
credit_score_df = categorical_score(x, y)
credit_score_df

data shape : (300000, 20)
category counts
category total count : 1017
bin_3      2
bin_4      2
nom_0      3
nom_1      6
nom_2      6
nom_3      6
nom_4      4
nom_5    222
nom_6    522
ord_1      5
ord_2      6
ord_3     15
ord_4     26
ord_5    192
dtype: int64
--------------- lgbm encoder ----------------
------------- one hot encoder ----------------
[1000]	valid's auc: 0.760701	valid's binary_logloss: 0.523206
[2000]	valid's auc: 0.769949	valid's binary_logloss: 0.51417
[3000]	valid's auc: 0.773535	valid's binary_logloss: 0.510384
------------- label encoder ----------------
[1000]	valid's auc: 0.76989	valid's binary_logloss: 0.513061
------------- target encoder ----------------
------------- hash encoder ----------------
------------- binary encoder ----------------
------------- woe encoder ----------------


Unnamed: 0,encoder,f1,r2_score,roc_auc
0,lgbm encoder,0.466213,0.199609,0.773002
1,one hot encoder,0.486477,0.2047,0.774729
2,label encoder,0.493491,0.200314,0.771879
3,target encoder,0.512758,0.219085,0.784098
4,hash encoder,0.358714,0.123268,0.713925
5,binary encoder,0.412795,0.157305,0.742454
6,WOE encoder,0.512333,0.218744,0.783912


In [28]:
x, y = cat_data_small_load()
credit_score_df = categorical_score(x, y)
credit_score_df

data shape : (10000, 20)
category counts
category total count : 999
bin_3      2
bin_4      2
nom_0      3
nom_1      6
nom_2      6
nom_3      6
nom_4      4
nom_5    220
nom_6    509
ord_1      5
ord_2      6
ord_3     15
ord_4     26
ord_5    189
dtype: int64
--------------- lgbm encoder ----------------
------------- one hot encoder ----------------
------------- label encoder ----------------
------------- target encoder ----------------
------------- hash encoder ----------------
------------- binary encoder ----------------
------------- woe encoder ----------------


Unnamed: 0,encoder,f1,r2_score,roc_auc
0,lgbm encoder,0.170604,0.09539,0.701471
1,one hot encoder,0.288018,0.123628,0.722018
2,label encoder,0.344298,0.141456,0.733003
3,target encoder,0.568421,0.256687,0.805635
4,hash encoder,0.233293,0.091348,0.693086
5,binary encoder,0.139842,0.08404,0.690799
6,WOE encoder,0.551601,0.253005,0.802534


In [31]:
x, y = cat_data_small_load(n=8000)
credit_score_df = categorical_score(x, y)
credit_score_df

data shape : (8000, 20)
category counts
category total count : 994
bin_3      2
bin_4      2
nom_0      3
nom_1      6
nom_2      6
nom_3      6
nom_4      4
nom_5    220
nom_6    505
ord_1      5
ord_2      6
ord_3     15
ord_4     26
ord_5    188
dtype: int64
--------------- lgbm encoder ----------------
------------- one hot encoder ----------------
------------- label encoder ----------------
------------- target encoder ----------------
------------- hash encoder ----------------
------------- binary encoder ----------------
------------- woe encoder ----------------


Unnamed: 0,encoder,f1,r2_score,roc_auc
0,lgbm encoder,0.361153,0.102164,0.698436
1,one hot encoder,0.346213,0.131645,0.728436
2,label encoder,0.455526,0.146718,0.73994
3,target encoder,0.582915,0.270338,0.814421
4,hash encoder,0.275748,0.093933,0.695387
5,binary encoder,0.0,-0.001692,0.655974
6,WOE encoder,0.567394,0.263771,0.811798


In [29]:
x, y = cat_data_small_load(n=7000)
credit_score_df = categorical_score(x, y)
credit_score_df

data shape : (7000, 20)
category counts
category total count : 997
bin_3      2
bin_4      2
nom_0      3
nom_1      6
nom_2      6
nom_3      6
nom_4      4
nom_5    220
nom_6    508
ord_1      5
ord_2      6
ord_3     15
ord_4     26
ord_5    188
dtype: int64
--------------- lgbm encoder ----------------
------------- one hot encoder ----------------
------------- label encoder ----------------
------------- target encoder ----------------
------------- hash encoder ----------------
------------- binary encoder ----------------
------------- woe encoder ----------------


Unnamed: 0,encoder,f1,r2_score,roc_auc
0,lgbm encoder,0.01432,0.064332,0.679694
1,one hot encoder,0.0,0.005669,0.639399
2,label encoder,0.369384,0.13222,0.72469
3,target encoder,0.54242,0.259658,0.813966
4,hash encoder,0.0,0.024887,0.625695
5,binary encoder,0.0,0.028203,0.636904
6,WOE encoder,0.543662,0.247959,0.805309


In [32]:
x, y = cat_data_small_load(n=6000)
credit_score_df = categorical_score(x, y)
credit_score_df

data shape : (6000, 20)
category counts
category total count : 996
bin_3      2
bin_4      2
nom_0      3
nom_1      6
nom_2      6
nom_3      6
nom_4      4
nom_5    219
nom_6    506
ord_1      5
ord_2      6
ord_3     15
ord_4     26
ord_5    190
dtype: int64
--------------- lgbm encoder ----------------
------------- one hot encoder ----------------
------------- label encoder ----------------
------------- target encoder ----------------
------------- hash encoder ----------------
------------- binary encoder ----------------
------------- woe encoder ----------------


Unnamed: 0,encoder,f1,r2_score,roc_auc
0,lgbm encoder,0.333333,0.091295,0.687836
1,one hot encoder,0.272,0.092477,0.691777
2,label encoder,0.412587,0.149168,0.737951
3,target encoder,0.596439,0.276176,0.819375
4,hash encoder,0.231092,0.080568,0.681271
5,binary encoder,0.308285,0.082317,0.678059
6,WOE encoder,0.582583,0.270786,0.817784


In [30]:
x, y = cat_data_small_load(n=5000)
credit_score_df = categorical_score(x, y)
credit_score_df

data shape : (5000, 20)
category counts
category total count : 984
bin_3      2
bin_4      2
nom_0      3
nom_1      6
nom_2      6
nom_3      6
nom_4      4
nom_5    218
nom_6    500
ord_1      5
ord_2      6
ord_3     15
ord_4     26
ord_5    185
dtype: int64
--------------- lgbm encoder ----------------
------------- one hot encoder ----------------
------------- label encoder ----------------
------------- target encoder ----------------
------------- hash encoder ----------------
------------- binary encoder ----------------
------------- woe encoder ----------------


Unnamed: 0,encoder,f1,r2_score,roc_auc
0,lgbm encoder,0.323529,0.069267,0.665818
1,one hot encoder,0.0,0.050601,0.663176
2,label encoder,0.0,0.01115,0.65949
3,target encoder,0.588235,0.277859,0.822807
4,hash encoder,0.253886,0.080012,0.68205
5,binary encoder,0.0,0.049961,0.658184
6,WOE encoder,0.575701,0.263452,0.815503
