In [18]:
import pandas as pd

clicks = pd.read_csv('talkingdata-adtracking-fraud-detection/train_sample.csv',
                        parse_dates=['click_time'])
clicks.head(10), clicks.tail(10), clicks.shape

(       ip  app  device  os  channel          click_time attributed_time  \
 0   87540   12       1  13      497 2017-11-07 09:30:38             NaN   
 1  105560   25       1  17      259 2017-11-07 13:40:27             NaN   
 2  101424   12       1  19      212 2017-11-07 18:05:24             NaN   
 3   94584   13       1  13      477 2017-11-07 04:58:08             NaN   
 4   68413   12       1   1      178 2017-11-09 09:00:09             NaN   
 5   93663    3       1  17      115 2017-11-09 01:22:13             NaN   
 6   17059    1       1  17      135 2017-11-09 01:17:58             NaN   
 7  121505    9       1  25      442 2017-11-07 10:01:53             NaN   
 8  192967    2       2  22      364 2017-11-08 09:35:17             NaN   
 9  143636    3       1  19      135 2017-11-08 12:35:26             NaN   
 
    is_attributed  
 0              0  
 1              0  
 2              0  
 3              0  
 4              0  
 5              0  
 6              0  
 7

In [19]:
click_times = clicks['click_time']
clicks['day'] = click_times.dt.day.astype('uint8')
clicks['hour'] = click_times.dt.hour.astype('uint8')
clicks['minute'] = click_times.dt.minute.astype('uint8')
clicks['second'] = click_times.dt.second.astype('uint8')
clicks.head(10), clicks.tail(10), clicks.shape

(       ip  app  device  os  channel          click_time attributed_time  \
 0   87540   12       1  13      497 2017-11-07 09:30:38             NaN   
 1  105560   25       1  17      259 2017-11-07 13:40:27             NaN   
 2  101424   12       1  19      212 2017-11-07 18:05:24             NaN   
 3   94584   13       1  13      477 2017-11-07 04:58:08             NaN   
 4   68413   12       1   1      178 2017-11-09 09:00:09             NaN   
 5   93663    3       1  17      115 2017-11-09 01:22:13             NaN   
 6   17059    1       1  17      135 2017-11-09 01:17:58             NaN   
 7  121505    9       1  25      442 2017-11-07 10:01:53             NaN   
 8  192967    2       2  22      364 2017-11-08 09:35:17             NaN   
 9  143636    3       1  19      135 2017-11-08 12:35:26             NaN   
 
    is_attributed  day  hour  minute  second  
 0              0    7     9      30      38  
 1              0    7    13      40      27  
 2              0    

In [20]:
from sklearn import preprocessing

cat_features = ['ip', 'app', 'device', 'os', 'channel']

for feature in cat_features:
    clicks[feature+'_labels'] = preprocessing.LabelEncoder().fit_transform(clicks[feature])
    
clicks.head(10), clicks.tail(10), clicks.shape

(       ip  app  device  os  channel          click_time attributed_time  \
 0   87540   12       1  13      497 2017-11-07 09:30:38             NaN   
 1  105560   25       1  17      259 2017-11-07 13:40:27             NaN   
 2  101424   12       1  19      212 2017-11-07 18:05:24             NaN   
 3   94584   13       1  13      477 2017-11-07 04:58:08             NaN   
 4   68413   12       1   1      178 2017-11-09 09:00:09             NaN   
 5   93663    3       1  17      115 2017-11-09 01:22:13             NaN   
 6   17059    1       1  17      135 2017-11-09 01:17:58             NaN   
 7  121505    9       1  25      442 2017-11-07 10:01:53             NaN   
 8  192967    2       2  22      364 2017-11-08 09:35:17             NaN   
 9  143636    3       1  19      135 2017-11-08 12:35:26             NaN   
 
    is_attributed  day  hour  minute  second  ip_labels  app_labels  \
 0              0    7     9      30      38      15220          11   
 1              0   

In [21]:
feature_cols = ['day', 'hour', 'minute', 'second', 
                'ip_labels', 'app_labels', 'device_labels',
                'os_labels', 'channel_labels']

valid_fraction = 0.1
clicks_srt = clicks.sort_values('click_time')
valid_rows = int(len(clicks_srt) * valid_fraction)
train = clicks_srt[:-2 * valid_rows]
valid = clicks_srt[-2 * valid_rows:-valid_rows]
test = clicks_srt[-valid_rows:]

In [22]:
import lightgbm as lgb

dtrain = lgb.Dataset(train[feature_cols], label=train['is_attributed'])
dvalid = lgb.Dataset(valid[feature_cols], label=valid['is_attributed'])
dtest = lgb.Dataset(test[feature_cols], label=test['is_attributed'])

param = {'num_leaves': 64, 'objective': 'binary', 'metric': 'auc'}
num_round = 1000
bst = lgb.train(param, dtrain, num_round, valid_sets=[dtrain, dvalid, dtest], early_stopping_rounds=10)

[1]	training's auc: 0.964059	valid_1's auc: 0.932413	valid_2's auc: 0.78119
Training until validation scores don't improve for 10 rounds.
[2]	training's auc: 0.769949	valid_1's auc: 0.530091	valid_2's auc: 0.683875
[3]	training's auc: 0.76744	valid_1's auc: 0.446663	valid_2's auc: 0.725859
[4]	training's auc: 0.855111	valid_1's auc: 0.608956	valid_2's auc: 0.776926
[5]	training's auc: 0.857454	valid_1's auc: 0.608624	valid_2's auc: 0.725837
[6]	training's auc: 0.906923	valid_1's auc: 0.809223	valid_2's auc: 0.776385
[7]	training's auc: 0.882895	valid_1's auc: 0.810089	valid_2's auc: 0.774101
[8]	training's auc: 0.883502	valid_1's auc: 0.810314	valid_2's auc: 0.776075
[9]	training's auc: 0.926062	valid_1's auc: 0.80959	valid_2's auc: 0.777204
[10]	training's auc: 0.916218	valid_1's auc: 0.809999	valid_2's auc: 0.778026
[11]	training's auc: 0.911424	valid_1's auc: 0.809321	valid_2's auc: 0.775982
Early stopping, best iteration is:
[1]	training's auc: 0.964059	valid_1's auc: 0.932413	vali

In [23]:
from sklearn import metrics

ypred = bst.predict(test[feature_cols])
score = metrics.roc_auc_score(test['is_attributed'], ypred)
print(f'Test score: {score}')

Test score: 0.7811898797595189


In [24]:
clicks.columns

Index(['ip', 'app', 'device', 'os', 'channel', 'click_time', 'attributed_time',
       'is_attributed', 'day', 'hour', 'minute', 'second', 'ip_labels',
       'app_labels', 'device_labels', 'os_labels', 'channel_labels'],
      dtype='object')