In [70]:
import numpy as np
import pandas as pd
from sklearn import preprocessing, metrics
import lightgbm as lgb

clicks = pd.read_csv('talkingdata-adtracking-fraud-detection/train_sample.csv',
                     parse_dates=['click_time'])
click_times = clicks['click_time']
clicks = clicks.assign(day=click_times.dt.day.astype('uint8'),
                       hour=click_times.dt.day.astype('uint8'),
                       minute=click_times.dt.minute.astype('uint8'),
                       second=click_times.dt.second.astype('uint8'))
cat_features = ['ip', 'app', 'device', 'os', 'channel']
for feature in cat_features:
    label_encoder = preprocessing.LabelEncoder()
    clicks[feature] = label_encoder.fit_transform(clicks[feature])
    
def get_data_splits(dataframe, valid_fraction=0.1):
    dataframe = dataframe.sort_values('click_time')
    valid_rows = int(len(dataframe) * valid_fraction)
    train = dataframe[:-2 * valid_rows]
    valid = dataframe[-2 * valid_rows:-valid_rows]
    test = dataframe[-valid_rows:]
    return train, valid, test

def train_model(train, valid, test=None, feature_cols=None):
    if feature_cols is None:
        feature_cols = train.columns.drop(['click_time', 'attributed_time',
                                           'is_attributed'])
        dtrain = lgb.Dataset(train[feature_cols], label=train['is_attributed'])
        dvalid = lgb.Dataset(valid[feature_cols], label=valid['is_attributed'])
        
        param = {'num_leaves': 64, 'objective': 'binary',
                 'metric': 'auc', 'seed': 7}
        num_round = 1000
        print('Training model. Hold on a minute to see the validation score')
        bst = lgb.train(param, dtrain, num_round, valid_sets=[dtrain, dvalid],
                        early_stopping_rounds=20, verbose_eval=False)
        
        train_pred = bst.predict(train[feature_cols])
        train_score = metrics.roc_auc_score(train['is_attributed'], train_pred)
        print('Train AUC score {}'.format(train_score))
        
        valid_pred = bst.predict(valid[feature_cols])
        valid_score = metrics.roc_auc_score(valid['is_attributed'], valid_pred)
        print('Validation AUC score {}'.format(valid_score))
        
        if test is not None:
            test_pred = bst.predict(test[feature_cols])
            test_score = metrics.roc_auc_score(test['is_attributed'], test_pred)
            print('Test AUC score {}'.format(test_score))
            
            return bst, valid_score, test_score
        else:
            return bst, valid_score

        
print('Baseline model score')
train, valid, test = get_data_splits(clicks)
train_model(train, valid, test)

Baseline model score
Training model. Hold on a minute to see the validation score
Train AUC score 0.9653580714807505
Validation AUC score 0.9345604010025064
Test AUC score 0.8603907815631263


(<lightgbm.basic.Booster at 0x1a23aab828>,
 0.9345604010025064,
 0.8603907815631263)

In [71]:
import itertools

cat_features = ['ip', 'app', 'device', 'os', 'channel']
interactions = pd.DataFrame(index=clicks.index)

for col1, col2 in itertools.combinations(cat_features, 2):
    new_col_name = '_'.join([col1, col2])
    new_values = clicks[col1].map(str) + '_' + clicks[col2].map(str)
    encoder = preprocessing.LabelEncoder()
    interactions[new_col_name] = encoder.fit_transform(new_values)

In [72]:
clicks = clicks.join(interactions)
train, valid, test = get_data_splits(clicks)
train_model(train, valid, test)

Training model. Hold on a minute to see the validation score
Train AUC score 0.9668823496531531
Validation AUC score 0.9437533834586467
Test AUC score 0.8727129258517035


(<lightgbm.basic.Booster at 0x1a271c6ba8>,
 0.9437533834586467,
 0.8727129258517035)

In [73]:
def count_past_events(series, time_window='6H'):
    series = pd.Series(series.index, index=series, name='ip_past_6hr_counts').sort_index()
    past_events = series.rolling(time_window).count() - 1
    return past_events

In [74]:
clicks.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,day,hour,...,ip_app,ip_device,ip_os,ip_channel,app_device,app_os,app_channel,device_os,device_channel,os_channel
0,15220,11,1,13,159,2017-11-07 09:30:38,,0,7,7,...,15234,6356,15276,17939,34,183,66,55,91,479
1,18448,24,1,17,67,2017-11-07 13:40:27,,0,7,7,...,24800,10281,24825,29300,199,954,248,59,149,1003
2,17663,11,1,19,52,2017-11-07 18:05:24,,0,7,7,...,22485,9302,22504,26559,34,189,77,61,133,1248
3,16496,12,1,13,146,2017-11-07 04:58:08,,0,7,7,...,19001,7903,19074,22383,55,272,102,55,77,466
4,11852,11,1,1,45,2017-11-09 09:00:09,,0,9,9,...,5364,2269,5366,6266,34,175,74,24,125,1365


In [75]:
ip_past_6hr_counts = count_past_events(clicks.click_time)
ip_past_6hr_counts.head(), ip_past_6hr_counts.shape

(click_time
 2017-11-06 16:00:00    0.0
 2017-11-06 16:00:09    1.0
 2017-11-06 16:00:09    2.0
 2017-11-06 16:00:11    3.0
 2017-11-06 16:00:11    4.0
 Name: ip_past_6hr_counts, dtype: float64, (100000,))

In [76]:
ip_past_6hr_counts.index = clicks.index
ip_past_6hr_counts.head()

0    0.0
1    1.0
2    2.0
3    3.0
4    4.0
Name: ip_past_6hr_counts, dtype: float64

In [80]:
clicks['ip_past_6hr_counts'] = ip_past_6hr_counts
train, valid, test = get_data_splits(clicks)
train_model(train, valid, test)

Training model. Hold on a minute to see the validation score
Train AUC score 0.9663706773569211
Validation AUC score 0.9412511278195489
Test AUC score 0.8141533066132265


(<lightgbm.basic.Booster at 0x1a2717e898>,
 0.9412511278195489,
 0.8141533066132265)

In [68]:
clicks.columns

Index(['ip', 'app', 'device', 'os', 'channel', 'click_time', 'attributed_time',
       'is_attributed', 'day', 'hour', 'minute', 'second', 'ip_app',
       'ip_device', 'ip_os', 'ip_channel', 'app_device', 'app_os',
       'app_channel', 'device_os', 'device_channel', 'os_channel'],
      dtype='object')

In [69]:
clicks.shape

(100000, 22)

In [78]:
def time_diff(series):
    return series.diff().dt.total_seconds()

In [82]:
clicks['past_events_6hr'] = clicks['click_time'].transform(time_diff)

In [83]:
clicks.head(20)

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,day,hour,...,ip_os,ip_channel,app_device,app_os,app_channel,device_os,device_channel,os_channel,ip_past_6hr_counts,past_events_6hr
0,15220,11,1,13,159,2017-11-07 09:30:38,,0,7,7,...,15276,17939,34,183,66,55,91,479,0.0,
1,18448,24,1,17,67,2017-11-07 13:40:27,,0,7,7,...,24825,29300,199,954,248,59,149,1003,1.0,14989.0
2,17663,11,1,19,52,2017-11-07 18:05:24,,0,7,7,...,22504,26559,34,189,77,61,133,1248,2.0,15897.0
3,16496,12,1,13,146,2017-11-07 04:58:08,,0,7,7,...,19074,22383,55,272,102,55,77,466,3.0,-47236.0
4,11852,11,1,1,45,2017-11-09 09:00:09,,0,9,9,...,5366,6266,34,175,74,24,125,1365,4.0,187321.0
5,16300,2,1,17,20,2017-11-09 01:22:13,,0,9,9,...,18529,21755,211,1183,284,59,98,955,5.0,-27476.0
6,2973,0,1,17,34,2017-11-09 01:17:58,,0,9,9,...,49535,56987,0,10,16,59,113,970,6.0,-255.0
7,21303,8,1,25,126,2017-11-07 10:01:53,,0,7,7,...,33226,39167,347,1912,395,66,58,1777,7.0,-141365.0
8,28059,1,2,22,100,2017-11-08 09:35:17,,0,8,8,...,46551,53758,189,699,217,165,206,1523,8.0,84804.0
9,23644,2,1,19,34,2017-11-08 12:35:26,,0,8,8,...,38308,44848,211,1185,286,61,113,1230,9.0,10809.0
