In [1]:
import numpy as np
import pandas as pd

from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [2]:
train = pd.read_csv('data/train_v1.csv')
test = pd.read_csv('data/test_v1.csv')

In [3]:
train['online_hours'] = np.round(train['online_hours'])

In [4]:
train['adversarial_label'] = 1
test['adversarial_label'] = 0

In [5]:
adversarial_data = pd.concat([train, test])
adversarial_label = adversarial_data['adversarial_label']
adversarial_train = adversarial_data.drop(['driver_id', 'date', 'day', 'adversarial_label'], axis=1)

In [6]:
x_train, x_test, y_train, y_test = train_test_split(adversarial_train, adversarial_label, 
                                                    test_size=0.2, random_state=42)

In [7]:
lgbm = LGBMClassifier(importance_type= 'gain',
                      min_child_samples=10, random_state=0)
lgbm.fit(x_train,y_train, 
         eval_set=(x_test, y_test), 
         early_stopping_rounds=20, 
         verbose=0,eval_metric='auc')
y_predicted = lgbm.predict(x_test)
roc_auc_score(y_test,y_predicted)

0.5048909187349743

### Drift Features 

In [8]:
{x_train.columns[i]:lgbm.feature_importances_[i] for i in np.argsort(lgbm.feature_importances_)[::-1]}

{'online_hours': 103.00528025627136,
 'age': 86.17276906967163,
 'prev_7': 78.8018388748169,
 'prev_14': 76.75278997421265,
 'dayofweek': 35.424999952316284,
 'number_of_kids': 23.642159938812256,
 'c2': 21.891130447387695,
 'prev_holiday': 12.20329999923706,
 'age_kids': 5.1564202308654785,
 'c3': 0.0,
 'c1': 0.0,
 'age3': 0.0,
 'gender_kids': 0.0,
 'next_holiday': 0.0,
 'holiday': 0.0,
 'gender': 0.0,
 'age2': 0.0}

In [18]:
adversarial_data = pd.concat([train, test])
adversarial_label = adversarial_data['adversarial_label']
adversarial_train = adversarial_data.drop(['driver_id', 'date', 'day', 'adversarial_label',
                                           'holiday', 'prev_holiday', 'next_holiday'], axis=1)
# adversarial_train = adversarial_data.drop(['driver_id', 'date', 'day', 'adversarial_label',
#                                            'dayofweek'], axis=1)

In [36]:
x_train, x_test, y_train, y_test = train_test_split(adversarial_train, adversarial_label, 
                                                    test_size=0.2, random_state=42)

In [10]:
lgbm = LGBMClassifier(importance_type= 'gain',
                      min_child_samples=10, random_state=0)
lgbm.fit(x_train,y_train, 
         eval_set=(x_test, y_test), 
         early_stopping_rounds=20, 
         verbose=0,eval_metric='auc')
y_predicted = lgbm.predict(x_test)
roc_auc_score(y_test,y_predicted)

0.5048909187349743

In [11]:
lebaran = test['day'].isin([25, 26])
off = test['online_hours'] == 0
anomaly_index = (lebaran)&(off)
np.sum(anomaly_index)

1515

In [15]:
adversarial_data = pd.concat([train, test[~anomaly_index]])
adversarial_label = adversarial_data['adversarial_label']
# adversarial_train = adversarial_data.drop(['driver_id', 'date', 'day', 'adversarial_label', 
#                                            'holiday', 'prev_holiday', 'next_holiday'], axis=1)
adversarial_train = adversarial_data.drop(['driver_id', 'date', 'day', 'adversarial_label'], axis=1)

In [16]:
x_train, x_test, y_train, y_test = train_test_split(adversarial_train, adversarial_label, 
                                                    test_size=0.2, random_state=42)

In [17]:
lgbm = LGBMClassifier(importance_type= 'gain',
                      min_child_samples=10)
lgbm.fit(x_train,y_train, 
         eval_set=(x_test, y_test), 
         early_stopping_rounds=20, 
         verbose=0,eval_metric='auc')
y_predicted = lgbm.predict(x_test)
roc_auc_score(y_test,y_predicted)

0.49771195870344775

In [37]:
y_train

17005    0
6438     1
10255    0
3147     0
953      0
2565     0
8054     1
4204     0
11178    1
11903    1
6925     1
12469    0
9809     1
79       0
11512    1
2613     0
7804     0
10853    0
12600    1
6493     1
14382    0
3892     0
16981    1
3509     1
11655    0
14742    1
6627     0
8324     0
7893     1
5096     1
        ..
14502    1
13282    0
3890     1
3556     1
11394    1
14093    0
1267     1
1899     1
1660     0
189      1
2747     1
973      0
1484     0
8200     0
10022    0
6396     1
11235    0
2311     0
110      0
5311     1
2433     1
769      1
1685     1
16023    1
4504     0
16850    1
6265     1
11284    1
860      1
15795    1
Name: adversarial_label, Length: 27932, dtype: int64

In [46]:
test.loc[(test['dayofweek']==6)]

Unnamed: 0,driver_id,date,online_hours,gender,age,number_of_kids,day,dayofweek,holiday,prev_holiday,...,age2,age3,c1,c2,c3,gender_kids,age_kids,prev_7,prev_14,adversarial_label
3,111556,2017-06-25,0.0,0,49,4,25,6,1,0,...,1.0,4,0,0,0,4,9,0.000000,0.000000,0
10,111575,2017-06-25,0.0,1,49,0,25,6,1,0,...,1.0,4,0,2,0,5,5,0.000000,0.000000,0
17,111779,2017-06-25,0.0,1,26,0,25,6,1,0,...,0.0,2,0,0,0,5,0,0.000000,0.000000,0
24,111839,2017-06-25,10.0,1,25,0,25,6,1,0,...,0.0,2,1,3,1,5,0,6.766667,6.800000,0
31,112486,2017-06-25,0.0,0,44,1,25,6,1,0,...,1.0,4,0,1,0,1,6,0.000000,0.000000,0
38,112609,2017-06-25,8.0,1,55,0,25,6,1,0,...,1.0,5,1,2,1,5,5,3.866667,5.550000,0
45,112698,2017-06-25,6.0,1,43,3,25,6,1,0,...,1.0,4,0,1,1,8,8,7.183333,6.066667,0
52,112730,2017-06-25,0.0,1,31,3,25,6,1,0,...,0.0,3,0,0,0,8,3,0.000000,0.000000,0
59,113043,2017-06-25,0.0,1,28,0,25,6,1,0,...,0.0,2,0,2,0,5,0,0.000000,0.000000,0
66,113140,2017-06-25,9.0,1,18,0,25,6,1,0,...,0.0,1,1,3,1,5,0,9.858333,8.200000,0


In [47]:
train.loc[(train['dayofweek']==6)]

Unnamed: 0,driver_id,date,online_hours,gender,age,number_of_kids,day,dayofweek,holiday,prev_holiday,...,age2,age3,c1,c2,c3,gender_kids,age_kids,prev_7,prev_14,adversarial_label
3,111556,2017-06-18,0.0,0,49,4,18,6,1,1,...,1.0,4,0,0,0,4,9,0.000000,0.000000,1
10,111575,2017-06-18,0.0,1,49,0,18,6,1,1,...,1.0,4,0,2,0,5,5,0.000000,0.000000,1
17,111779,2017-06-18,0.0,1,26,0,18,6,1,1,...,0.0,2,0,0,0,5,0,0.000000,0.000000,1
24,111839,2017-06-18,7.0,1,25,0,18,6,1,1,...,0.0,2,1,3,1,5,0,6.800000,8.229167,1
31,112486,2017-06-18,0.0,0,44,1,18,6,1,1,...,1.0,4,0,1,0,1,6,0.000000,0.000000,1
38,112609,2017-06-18,4.0,1,55,0,18,6,1,1,...,1.0,5,1,2,1,5,5,5.550000,6.733333,1
45,112698,2017-06-18,7.0,1,43,3,18,6,1,1,...,1.0,4,0,1,1,8,8,6.066667,6.595833,1
52,112730,2017-06-18,0.0,1,31,3,18,6,1,1,...,0.0,3,0,0,0,8,3,0.000000,0.000000,1
59,113043,2017-06-18,0.0,1,28,0,18,6,1,1,...,0.0,2,0,2,0,5,0,0.000000,0.000000,1
66,113140,2017-06-18,10.0,1,18,0,18,6,1,1,...,0.0,1,1,3,1,5,0,8.200000,8.545833,1
