# Categorical Encodings
advanced encodings to encode the categorical variables to improve the classifier model:
- Count Encoding
- Target Encoding
- CatBoost Encoding


In [5]:
import numpy as np
import pandas as pd
from sklearn import preprocessing, metrics
import lightgbm as lgb

clicks = pd.read_parquet('dataset/baseline_data.pqt')

In [49]:
def get_data_splits(dataframe, valid_fraction=0.1):
    """Splits a dataframe into train, validation, and test sets.

    First, orders by the column 'click_time'. Set the size of the 
    validation and test sets with the valid_fraction keyword argument.
    """

    dataframe = dataframe.sort_values('click_time')
    valid_rows = int(len(dataframe) * valid_fraction)
    train = dataframe[:-valid_rows * 2]
    # valid size == test size, last two sections of the data
    valid = dataframe[-valid_rows * 2:-valid_rows]
    test = dataframe[-valid_rows:]
    
    return train, valid, test

def train_model(train, valid, test=None, feature_cols=None):
    if feature_cols is None:
        feature_cols = train.columns.drop(['click_time', 'attributed_time',
                                           'is_attributed', 'day', 'hour', 'minute', 'second'])
    dtrain = lgb.Dataset(train[feature_cols], label=train['is_attributed'])
    dvalid = lgb.Dataset(valid[feature_cols], label=valid['is_attributed'])
    
    param = {'num_leaves': 64, 'objective': 'binary', 
             'metric': 'auc', 'seed': 7}
    num_round = 1000
    bst = lgb.train(param, dtrain, num_round, valid_sets=[dvalid], 
                    callbacks=[
                    lgb.early_stopping(stopping_rounds=20),   
                    lgb.log_evaluation(period=0)])
    
    
    valid_pred = bst.predict(valid[feature_cols])
    valid_score = metrics.roc_auc_score(valid['is_attributed'], valid_pred)
    print(f"Validation AUC score: {valid_score}")
    
    if test is not None: 
        test_pred = bst.predict(test[feature_cols])
        test_score = metrics.roc_auc_score(test['is_attributed'], test_pred)
        return bst, valid_score, test_score
    else:
        return bst, valid_score

In [50]:
print("Baseline model")
train, valid, test = get_data_splits(clicks)
_ = train_model(train, valid)

Baseline model
[LightGBM] [Info] Number of positive: 363974, number of negative: 1476475
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006015 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1840449, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.197764 -> initscore=-1.400330
[LightGBM] [Info] Start training from score -1.400330
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[370]	valid_0's auc: 0.961874
Validation AUC score: 0.9618737977262175


### 1) Count encodings


In [51]:
import category_encoders as ce

cat_features = ['ip', 'app', 'device', 'os', 'channel']
train, valid, test = get_data_splits(clicks)

In [52]:
train

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,day,hour,minute,second
0,27226,3,1,13,120,2017-11-06 15:13:23,,0,6,15,13,23
1,110007,35,1,13,10,2017-11-06 15:41:07,2017-11-07 08:17:19,1,6,15,41,7
2,1047,6,1,13,157,2017-11-06 15:42:32,,0,6,15,42,32
3,76270,3,1,13,120,2017-11-06 15:56:17,,0,6,15,56,17
4,57862,3,1,13,120,2017-11-06 15:57:01,,0,6,15,57,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1840445,11718,18,1,17,26,2017-11-09 04:50:14,,0,9,4,50,14
1840449,32849,15,1,19,75,2017-11-09 04:50:14,,0,9,4,50,14
1840446,249422,48,1,19,103,2017-11-09 04:50:14,2017-11-09 04:53:15,1,9,4,50,14
1840443,232256,19,6,21,59,2017-11-09 04:50:14,2017-11-09 04:50:59,1,9,4,50,14


In [53]:
# Create the count encoder
count_enc = ce.CountEncoder(cols=cat_features)

# Learn encoding from the training set
count_enc.fit(train[cat_features])

# Apply encoding to the train and validation sets as new columns
train_encoded = train.join(count_enc.transform(train[cat_features]).add_suffix('_counts'))
valid_encoded = valid.join(count_enc.transform(valid[cat_features]).add_suffix('_counts'))


In [54]:
# Train the model on the encoded datasets
_ = train_model(train_encoded, valid_encoded)

[LightGBM] [Info] Number of positive: 363974, number of negative: 1476475
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012832 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1729
[LightGBM] [Info] Number of data points in the train set: 1840449, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.197764 -> initscore=-1.400330
[LightGBM] [Info] Start training from score -1.400330
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[454]	valid_0's auc: 0.964757
Validation AUC score: 0.9647570268628647


### 2) Target encoding


In [55]:
target_enc = ce.TargetEncoder(cols=cat_features)
    
# Learn encoding from the training set. Use the 'is_attributed' column as the target.
target_enc.fit(train[cat_features], train['is_attributed'])

# Apply encoding to the train and validation sets as new columns
train_encoded = train.join(target_enc.transform(train[cat_features]).add_suffix('_target'))
valid_encoded = valid.join(target_enc.transform(valid[cat_features]).add_suffix('_target'))


In [56]:
_ = train_model(train_encoded, valid_encoded)

[LightGBM] [Info] Number of positive: 363974, number of negative: 1476475
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014078 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1830
[LightGBM] [Info] Number of data points in the train set: 1840449, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.197764 -> initscore=-1.400330
[LightGBM] [Info] Start training from score -1.400330
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[39]	valid_0's auc: 0.946925
Validation AUC score: 0.9469248136931894


### 3) CatBoost Encoding

In [57]:
# Remove IP from the encoded features
cat_features = ['app', 'device', 'os', 'channel']

train, valid, test = get_data_splits(clicks)

# Create the CatBoost encoder
cb_enc = ce.CatBoostEncoder(cols=cat_features, random_state=7)

# Learn encoding from the training set
cb_enc.fit(train[cat_features], train['is_attributed'])

# Apply encoding to the train and validation sets as new columns
train_encoded = train.join(cb_enc.transform(train[cat_features]).add_suffix('_cb'))
valid_encoded = valid.join(cb_enc.transform(valid[cat_features]).add_suffix('_cb'))

In [58]:
_ = train_model(train_encoded, valid_encoded)

[LightGBM] [Info] Number of positive: 363974, number of negative: 1476475
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013326 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1582
[LightGBM] [Info] Number of data points in the train set: 1840449, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.197764 -> initscore=-1.400330
[LightGBM] [Info] Start training from score -1.400330
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[209]	valid_0's auc: 0.962447
Validation AUC score: 0.9624474042209968
