In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing, metrics
import lightgbm as lgb

clicks = pd.read_csv('talkingdata-adtracking-fraud-detection/train_sample.csv',
                        parse_dates=['click_time'])
clicks.head(10), clicks.tail(10), clicks.shape

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


(       ip  app  device  os  channel          click_time attributed_time  \
 0   87540   12       1  13      497 2017-11-07 09:30:38             NaN   
 1  105560   25       1  17      259 2017-11-07 13:40:27             NaN   
 2  101424   12       1  19      212 2017-11-07 18:05:24             NaN   
 3   94584   13       1  13      477 2017-11-07 04:58:08             NaN   
 4   68413   12       1   1      178 2017-11-09 09:00:09             NaN   
 5   93663    3       1  17      115 2017-11-09 01:22:13             NaN   
 6   17059    1       1  17      135 2017-11-09 01:17:58             NaN   
 7  121505    9       1  25      442 2017-11-07 10:01:53             NaN   
 8  192967    2       2  22      364 2017-11-08 09:35:17             NaN   
 9  143636    3       1  19      135 2017-11-08 12:35:26             NaN   
 
    is_attributed  
 0              0  
 1              0  
 2              0  
 3              0  
 4              0  
 5              0  
 6              0  
 7

In [2]:
click_times = clicks['click_time']
clicks['day'] = click_times.dt.day.astype('uint8')
clicks['hour'] = click_times.dt.hour.astype('uint8')
clicks['minute'] = click_times.dt.minute.astype('uint8')
clicks['second'] = click_times.dt.second.astype('uint8')
clicks.head(10), clicks.tail(10), clicks.shape

(       ip  app  device  os  channel          click_time attributed_time  \
 0   87540   12       1  13      497 2017-11-07 09:30:38             NaN   
 1  105560   25       1  17      259 2017-11-07 13:40:27             NaN   
 2  101424   12       1  19      212 2017-11-07 18:05:24             NaN   
 3   94584   13       1  13      477 2017-11-07 04:58:08             NaN   
 4   68413   12       1   1      178 2017-11-09 09:00:09             NaN   
 5   93663    3       1  17      115 2017-11-09 01:22:13             NaN   
 6   17059    1       1  17      135 2017-11-09 01:17:58             NaN   
 7  121505    9       1  25      442 2017-11-07 10:01:53             NaN   
 8  192967    2       2  22      364 2017-11-08 09:35:17             NaN   
 9  143636    3       1  19      135 2017-11-08 12:35:26             NaN   
 
    is_attributed  day  hour  minute  second  
 0              0    7     9      30      38  
 1              0    7    13      40      27  
 2              0    

In [3]:
def get_data_splits(dataframe, valid_fraction=0.1):
    dataframe = dataframe.sort_values('click_time')
    valid_rows = int(len(dataframe) * valid_fraction)
    
    train = dataframe[:-2 * valid_rows]
    valid = dataframe[-2 * valid_rows:-valid_rows]
    test = dataframe[-valid_rows:]
    
    return train, valid, test

In [14]:
def train_model(train, valid, test=None, feature_cols=None):
    if feature_cols is None:
        feature_cols = train.columns.drop(['click_time', 'attributed_time', 'is_attributed'])
    dtrain = lgb.Dataset(train[feature_cols], label=train['is_attributed'])
    dvalid = lgb.Dataset(valid[feature_cols], label=valid['is_attributed'])
    
    param = {'num_leaves': 64, 'objective': 'binary', 
             'metric': 'auc', 'seed': 7}
    num_round = 1000
    print("Training model!")
    bst = lgb.train(param, dtrain, num_round, valid_sets=[dvalid], 
                    early_stopping_rounds=20, verbose_eval=False)
    
    train_pred = bst.predict(train[feature_cols])
    train_score = metrics.roc_auc_score(train['is_attributed'], train_pred)
    print(f"Train AUC score: {train_score}")
    
    valid_pred = bst.predict(valid[feature_cols])
    valid_score = metrics.roc_auc_score(valid['is_attributed'], valid_pred)
    print(f"Validation AUC score: {valid_score}")
    
    if test is not None: 
        test_pred = bst.predict(test[feature_cols])
        test_score = metrics.roc_auc_score(test['is_attributed'], test_pred)
        print(f"Test AUC score: {test_score}")
        return bst, valid_score, test_score
    else:
        return bst, valid_score

In [15]:
print('Baseline model')
train, valid, test = get_data_splits(clicks)
train_model(train, valid, test)

Baseline model
Training model!
Train AUC score: 0.9640588589040066
Validation AUC score: 0.9324771929824561
Test AUC score: 0.85750250501002


(<lightgbm.basic.Booster at 0x1a1ee968d0>,
 0.9324771929824561,
 0.85750250501002)

In [16]:
import category_encoders as ce

cat_features = ['ip', 'app', 'device', 'os', 'channel']
train, valid, test = get_data_splits(clicks)

count_encoder = ce.CountEncoder(cols=cat_features)
count_encoder.fit(train[cat_features])

train_encoded = train.join(count_encoder.transform(train[cat_features]).add_suffix('_count'))
valid_encoded = valid.join(count_encoder.transform(valid[cat_features]).add_suffix('_count'))
test_encoded = test.join(count_encoder.transform(test[cat_features]).add_suffix('_count'))

train_model(train_encoded, valid_encoded, test_encoded)

Training model!
Train AUC score: 0.9736499093129177
Validation AUC score: 0.848635588972431
Test AUC score: 0.7454709418837675


(<lightgbm.basic.Booster at 0x1a1ce2f710>,
 0.848635588972431,
 0.7454709418837675)

In [19]:
cat_features = ['ip', 'app', 'device', 'os', 'channel']
train, valid, test = get_data_splits(clicks)

target_enc = ce.TargetEncoder(cols=cat_features)
target_enc.fit(train[cat_features], train['is_attributed'])

train_encoded = train.join(target_enc.transform(train[cat_features]).add_suffix('_target'))
valid_encoded = valid.join(target_enc.transform(valid[cat_features]).add_suffix('_target'))
test_encoded = test.join(target_enc.transform(test[cat_features]).add_suffix('_target'))

train_model(train_encoded, valid_encoded, test_encoded)

Training model!
Train AUC score: 0.9990704470802945
Validation AUC score: 0.8783538847117796
Test AUC score: 0.9599799599198396


(<lightgbm.basic.Booster at 0x1a1ee96358>,
 0.8783538847117796,
 0.9599799599198396)

In [20]:
cat_features = ['ip', 'app', 'device', 'os', 'channel']
train, valid, test = get_data_splits(clicks)

cb_enc = ce.CatBoostEncoder(cols=cat_features, random_state=7)
cb_enc.fit(train[cat_features], train['is_attributed'])

train_encoded = train.join(cb_enc.transform(train[cat_features]).add_suffix('_cb'))
valid_encoded = valid.join(cb_enc.transform(valid[cat_features]).add_suffix('_cb'))
test_encoded = test.join(cb_enc.transform(test[cat_features]).add_suffix('_cb'))

train_model(train_encoded, valid_encoded, test_encoded)

Training model!
Train AUC score: 0.9990704470802945
Validation AUC score: 0.8773754385964911
Test AUC score: 0.9592785571142284


(<lightgbm.basic.Booster at 0x1a218a4cf8>,
 0.8773754385964911,
 0.9592785571142284)

In [21]:
cat_features = ['app', 'device', 'os', 'channel']
train, valid, test = get_data_splits(clicks)

cb_enc = ce.CatBoostEncoder(cols=cat_features, random_state=7)
cb_enc.fit(train[cat_features], train['is_attributed'])

train_encoded = train.join(cb_enc.transform(train[cat_features]).add_suffix('_cb'))
valid_encoded = valid.join(cb_enc.transform(valid[cat_features]).add_suffix('_cb'))
test_encoded = test.join(cb_enc.transform(test[cat_features]).add_suffix('_cb'))

train_model(train_encoded, valid_encoded, test_encoded)

Training model!
Train AUC score: 0.9960245409956002
Validation AUC score: 0.8980190476190476
Test AUC score: 0.8995090180360721


(<lightgbm.basic.Booster at 0x1a1f34b710>,
 0.8980190476190476,
 0.8995090180360721)

In [22]:
clicks.columns

Index(['ip', 'app', 'device', 'os', 'channel', 'click_time', 'attributed_time',
       'is_attributed', 'day', 'hour', 'minute', 'second'],
      dtype='object')