<i>Copyright (c) Microsoft Corporation. All rights reserved.</i>

<i>Licensed under the MIT License.</i>

# Tiny Criteo Benchmark

## Global Settings and Imports

In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
%load_ext blackcellmagic

In [91]:
import sys, os
sys.path.append("../../")
import subprocess as sp
from tempfile import TemporaryDirectory
from time import process_time

import category_encoders as ce
import lightgbm as lgb
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import roc_auc_score, log_loss
import tensorflow as tf
from tqdm import tqdm

from reco_utils.common import tf_utils, gpu_utils, plot
import reco_utils.dataset.criteo as criteo
import reco_utils.recommender.lightgbm.lightgbm_utils as lgb_utils

print("System version: {}".format(sys.version))
print("GPUs: {}".format(gpu_utils.get_gpu_info()))
print("Sklearn version: {}".format(sklearn.__version__))
print()

print("LightGBM version: {}".format(lgb.__version__))
print("Tensorflow version: {}".format(tf.VERSION))
process = sp.run(['vw', '--version'], stdout=sp.PIPE, universal_newlines=True)
print("Vowpal Wabbit version: {}".format(process.stdout.rstrip()))

System version: 3.6.8 |Anaconda, Inc.| (default, Dec 30 2018, 01:22:34) 
[GCC 7.3.0]
GPUs: [{'device_name': 'Tesla K80', 'total_memory': 11441.1875, 'free_memory': 11373.9375}]
Sklearn version: 0.20.1

LightGBM version: 2.2.1
Tensorflow version: 1.12.0
Vowpal Wabbit version: 8.1.1


In [10]:
SIZE = "sample"

tmpdir = TemporaryDirectory()

## 0. Data Preparation
Here we use CSV format as the example data input. Our example data is a sample (about 100 thousand samples) from [Criteo dataset](https://www.kaggle.com/c/criteo-display-ad-challenge). The Criteo dataset is a well-known industry benchmarking dataset for developing CTR prediction models, and it's frequently adopted as evaluation dataset by research papers. The original dataset is too large for a lightweight demo, so we sample a small portion from it as a demo dataset.

Specifically, there are 39 columns of features in Criteo, where 13 columns are numerical features (I1-I13) and the other 26 columns are categorical features (C1-C26).

In [12]:
nume_cols = ["I" + str(i) for i in range(1, 14)]
cate_cols = ["C" + str(i) for i in range(1, 27)]
label_col = "Label"

header = [label_col] + nume_cols + cate_cols

all_data = criteo.load_pandas_df(size=SIZE, local_cache_path=tmpdir.name, header=header)
display(all_data.head())

100%|██████████| 8.58k/8.58k [00:01<00:00, 7.07kKB/s]


Unnamed: 0,Label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,1.0,1,5.0,0.0,1382.0,4.0,15.0,2.0,181.0,...,e5ba7672,f54016b9,21ddcdc9,b1252a9d,07b5194c,,3a171ecb,c5c50484,e8b83407,9727dd16
1,0,2.0,0,44.0,1.0,102.0,8.0,2.0,2.0,4.0,...,07c540c4,b04e4670,21ddcdc9,5840adea,60f6221e,,3a171ecb,43f13e8b,e8b83407,731c3655
2,0,2.0,0,1.0,14.0,767.0,89.0,4.0,2.0,245.0,...,8efede7f,3412118d,,,e587c466,ad3062eb,3a171ecb,3b183c5c,,
3,0,,893,,,4392.0,,0.0,0.0,0.0,...,1e88c74f,74ef3502,,,6b3a5ca6,,3a171ecb,9117a34a,,
4,0,3.0,-1,,0.0,2.0,0.0,3.0,0.0,0.0,...,1e88c74f,26b3c7a7,,,21c9516a,,32c7478e,b34f3128,,


First, we cut three sets (train_data (first 80%), valid_data (middle 10%) and test_data (last 10%)), cut from the original all data. <br>
Notably, considering the Criteo is a kind of time-series streaming data, which is also very common in recommendation scenario, we split the data by its order.

In [16]:
# split data to 3 sets    
length = len(all_data)
train_data = all_data.loc[:0.8*length-1]
valid_data = all_data.loc[0.8*length:0.9*length-1]
test_data = all_data.loc[0.9*length:]

del all_data

In [17]:
# missing value handling: mean for numeric, 'unk' for categorical features
# for simplicity, treat categorical features as ordinal feature

def fill_na(df, c_cols, n_cols, n_col_means=None):
    if not n_col_means:
        n_col_means = {}
        
    for item in tqdm(c_cols):
        df[item].fillna("UNK", inplace=True)

    for item in tqdm(n_cols):
        if item not in n_col_means:
            n_col_means[item] = df[item].mean()
        df[item].fillna(n_col_means[item], inplace=True)
    
    return n_col_means

encoder = ce.ordinal.OrdinalEncoder(cols=cate_cols)

In [18]:
nume_means = fill_na(train_data, cate_cols, nume_cols)

print("Numeric column mean values in the training set: {}".format(nume_means))

train_data = encoder.fit_transform(train_data)
display(train_data.head())

100%|██████████| 26/26 [00:01<00:00, 13.44it/s]
100%|██████████| 13/13 [00:00<00:00, 14.14it/s]


Numeric column mean values in the training set: {'I1': 3.70451525729157, 'I2': 112.293575, 'I3': 39.578949807667705, 'I4': 8.309818756880512, 'I5': 17539.79811756683, 'I6': 140.47713106394087, 'I7': 14.953125205044286, 'I8': 13.48726740330108, 'I9': 124.50148944267035, 'I10': 0.612250872649441, 'I11': 2.3841056126399383, 'I12': 0.9390330900502435, 'I13': 11.522395696055991}


Unnamed: 0,Label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,1.0,1,5.0,0.0,1382.0,4.0,15.0,2.0,181.0,...,1,1,1,1,1,1,1,1,1,1
1,0,2.0,0,44.0,1.0,102.0,8.0,2.0,2.0,4.0,...,2,2,1,2,2,1,1,2,1,2
2,0,2.0,0,1.0,14.0,767.0,89.0,4.0,2.0,245.0,...,3,3,2,3,3,2,1,3,2,3
3,0,3.704515,893,39.57895,8.309819,4392.0,140.477131,0.0,0.0,0.0,...,4,4,2,3,4,1,1,4,2,3
4,0,3.0,-1,39.57895,0.0,2.0,0.0,3.0,0.0,0.0,...,4,5,2,3,5,1,2,5,2,3


In [19]:
# Pass train set's n_col_means
fill_na(valid_data, cate_cols, nume_cols, nume_means)
valid_data = encoder.transform(valid_data)

fill_na(test_data, cate_cols, nume_cols, nume_means)
test_data = encoder.transform(test_data)

100%|██████████| 26/26 [00:01<00:00, 13.90it/s]
100%|██████████| 13/13 [00:00<00:00, 13.59it/s]
100%|██████████| 26/26 [00:01<00:00, 13.89it/s]
100%|██████████| 13/13 [00:00<00:00, 13.93it/s]


## 1. LightGBM

### 1.1 LightGBM - Parameter Setting
Let's set the main related parameters for LightGBM now. Basically, the task is a binary classification (predicting click or no click), so the objective function is set to binary logloss, and 'AUC' metric, is used as a metric which is less effected by imbalance in the classes of the dataset.

Generally, we can adjust the number of leaves (MAX_LEAF), the minimum number of data in each leaf (MIN_DATA), maximum number of trees (NUM_OF_TREES), the learning rate of trees (TREE_LEARNING_RATE) and EARLY_STOPPING_ROUNDS (to avoid overfitting) in the model to get better performance.

Besides, we can also adjust some other listed parameters to optimize the results. [In this link](https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst), a list of all the parameters is shown. Also, some advice on how to tune these parameters can be found [in this url](https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters-Tuning.rst). 

In [95]:
MAX_LEAF = 64
MIN_DATA = 20
NUM_OF_TREES = 100
TREE_LEARNING_RATE = 0.15
EARLY_STOPPING_ROUNDS = 20
METRIC = "auc"

In [96]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': "binary",
    'metric': METRIC,
    'num_leaves': MAX_LEAF,
    'min_data': MIN_DATA,
    'boost_from_average': True,
    'num_threads': 20,           # set it according to your cpu cores
    'feature_fraction': 0.8,     # select 80% of features before training each tree
    'learning_rate': TREE_LEARNING_RATE,
}

### 1.2 LightGBM - Data Formatting

In [97]:
train_x = train_data.copy()
train_y = train_x.pop(label_col).values
valid_x = valid_data.copy()
valid_y = valid_x.pop(label_col).values
test_x = test_data.copy()
test_y = test_x.pop(label_col).values

print(
    "Train Data Shape: X: {trn_x_shape}, Y: {trn_y_shape}\n"
    "Valid Data Shape: X: {vld_x_shape}; Y: {vld_y_shape}\n"
    "Test Data Shape: X: {tst_x_shape}; Y: {tst_y_shape}".format(
        trn_x_shape=train_x.shape,
        trn_y_shape=train_y.shape,
        vld_x_shape=valid_x.shape,
        vld_y_shape=valid_y.shape,
        tst_x_shape=test_x.shape,
        tst_y_shape=test_y.shape,
    )
)

Train Data Shape: X: (80000, 39), Y: (80000,)
Valid Data Shape: X: (10000, 39); Y: (10000,)
Test Data Shape: X: (10000, 39); Y: (10000,)


### 1.3 LightGBM - Create model
When both hyper-parameters and data are ready, we can create a model:

In [39]:
lgb_train = lgb.Dataset(
    train_x, train_y.reshape(-1), params=params, categorical_feature=cate_cols
)
lgb_valid = lgb.Dataset(
    valid_x, valid_y.reshape(-1), reference=lgb_train, categorical_feature=cate_cols
)
lgb_test = lgb.Dataset(
    test_x, test_y.reshape(-1), reference=lgb_train, categorical_feature=cate_cols
)
lgb_model = lgb.train(
    params,
    lgb_train,
    num_boost_round=NUM_OF_TREES,
    early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    valid_sets=lgb_valid,
    categorical_feature=cate_cols,
)

[1]	valid_0's auc: 0.726354
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's auc: 0.739955
[3]	valid_0's auc: 0.745388
[4]	valid_0's auc: 0.749606
[5]	valid_0's auc: 0.75325
[6]	valid_0's auc: 0.755573
[7]	valid_0's auc: 0.756963
[8]	valid_0's auc: 0.759945
[9]	valid_0's auc: 0.760396
[10]	valid_0's auc: 0.760184
[11]	valid_0's auc: 0.759827
[12]	valid_0's auc: 0.760955
[13]	valid_0's auc: 0.762486
[14]	valid_0's auc: 0.763425
[15]	valid_0's auc: 0.763039
[16]	valid_0's auc: 0.763152
[17]	valid_0's auc: 0.763261
[18]	valid_0's auc: 0.763119
[19]	valid_0's auc: 0.76278
[20]	valid_0's auc: 0.762748
[21]	valid_0's auc: 0.762998
[22]	valid_0's auc: 0.762733
[23]	valid_0's auc: 0.76194
[24]	valid_0's auc: 0.761843
[25]	valid_0's auc: 0.761785
[26]	valid_0's auc: 0.761555
[27]	valid_0's auc: 0.761103
[28]	valid_0's auc: 0.761141
[29]	valid_0's auc: 0.760587
[30]	valid_0's auc: 0.759864
[31]	valid_0's auc: 0.759586
[32]	valid_0's auc: 0.759272
[33]	valid_0's auc: 0.

Now let's see what is the model's performance:

In [42]:
test_preds = lgb_model.predict(test_x)
auc = roc_auc_score(np.asarray(test_y.reshape(-1)), np.asarray(test_preds))
logloss = log_loss(np.asarray(test_y.reshape(-1)), np.asarray(test_preds), eps=1e-12)
res_basic = {"auc": auc, "logloss": logloss}
print(res_basic)

{'auc': 0.7649360450087227, 'logloss': 0.4703181701876848}


## 2. Vowpal Wabbit

In [20]:
model_path = os.path.join(tmpdir.name, 'vw.model')
saved_model_path = os.path.join(tmpdir.name, 'vw_saved.model')
train_path = os.path.join(tmpdir.name, 'train.dat')
test_path = os.path.join(tmpdir.name, 'test.dat')
prediction_path = os.path.join(tmpdir.name, 'prediction.dat')

In [111]:
def to_vw(df, l_col, n_cols, c_cols, output_path):
    """Convert Pandas DataFrame to vw input format
    """
    with open(output_path, "w") as f:
        tmp = df.reset_index()

        # When using logistic or hinge loss, the labels need to be from the set {+1,-1}
        tmp[l_col] = tmp[l_col].apply(lambda x: -1 if x == 0 else 1)

        # convert each row to VW input format (https://github.com/VowpalWabbit/vowpal_wabbit/wiki/Input-format)
        # [label] [tag]|[user namespace] [user id feature] |[item namespace] [movie id feature]
        # label is the true rating, tag is a unique id for the example just used to link predictions to truth
        # user and item namespaces separate the features to support interaction features through command line options
        # Note, space around `|` is very significant and should follow exact rules.
        for row in tqdm(tmp.itertuples()):
            n_feats = []
            for col in n_cols:
                n_feats.append("|{} {} ".format(col, getattr(row, col)))
            c_feats = []
            for col in c_cols:
                c_feats.append("|{} {} ".format(col, getattr(row, col)))
            f.write(
                "{:d} {:d}{} {}".format(
                    getattr(row, l_col),
                    row.index,
                    "".join(n_feats).rstrip(),
                    "".join(c_feats).rstrip(),
                )
            )
            f.write("\n")

In [112]:
def run_vw(train_params, test_params, test_df, l_col, prediction_path):
    """Convenience function to train, test, and show metrics of interest
    Args:
        train_params (str): vw training parameters
        test_params (str): vw testing parameters
        test_data (pd.DataFrame):
        l_col (str): label column name
        prediction_path (str): path to vw prediction output
    Returns:
        (dict): metrics and timing information
    """
    # train model
    train_start = process_time()
    sp.run(train_params.split(' '), check=True)
    train_stop = process_time()
    
    # test model
    test_start = process_time()
    sp.run(test_params.split(' '), check=True)
    test_stop = process_time()
    
    # read in predictions
    pred_df = pd.read_csv(prediction_path, delim_whitespace=True, names=['prediction'], index_col=1).join(test_df)
    test_y = pred_df.pop(l_col).values
    test_preds = pred_df['prediction'].values
    
    # calculate metrics
    result = dict()
    result['auc'] = roc_auc_score(np.asarray(test_y.reshape(-1)), np.asarray(test_preds))
    result['logloss'] = log_loss(np.asarray(test_y.reshape(-1)), np.asarray(test_preds), eps=1e-12)
    result['Train Time (ms)'] = (train_stop - train_start) * 1000
    result['Test Time (ms)'] = (test_stop - test_start) * 1000
    
    return result

Transform data

In [113]:
# save train and test data in vw format
to_vw(
    # TODO shouldn't I use item id as index????
    df=pd.concat([train_data, valid_data]).reset_index(drop=True),
    l_col=label_col,
    n_cols=nume_cols,
    c_cols=cate_cols,
    output_path=train_path
)
to_vw(
    df=test_data,
    l_col=label_col,
    n_cols=nume_cols,
    c_cols=cate_cols,    
    output_path=test_path
)

90000it [00:03, 29619.85it/s]
10000it [00:00, 28960.41it/s]


In [114]:
train_params = "vw --loss_function logistic -f {model} -d {data} --quiet".format(
    model=model_path, data=train_path
)
test_params = "vw --link logistic -i {model} -d {data} -t -p {pred} --quiet".format(
    model=model_path, data=test_path, pred=prediction_path
)

# train model
train_start = process_time()
sp.run(train_params.split(' '), check=True)
train_stop = process_time()

# test model
test_start = process_time()
sp.run(test_params.split(' '), check=True)
test_stop = process_time()

In [115]:
result = run_vw(
    train_params=train_params,
    test_params=test_params,
    test_df=test_data,
    l_col=label_col,
    prediction_path=prediction_path
)
result

{'auc': 0.7705353921278044,
 'logloss': 0.46926060131459496,
 'Train Time (ms)': 7.8346579999930555,
 'Test Time (ms)': 6.585210000011443}

In [116]:
prediction_path

'/tmp/tmpy1v7u5kv/prediction.dat'