In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import holidays
import pytz
import eli5

from datetime import datetime
from scipy import stats

from catboost import *
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import ward, fcluster
from scipy.spatial.distance import pdist
from eli5.sklearn import PermutationImportance

In [31]:
drivers = pd.read_csv('data/drivers.csv')
train = pd.read_csv('data/train_80.csv', parse_dates=['date'])
test = pd.read_csv('data/test.csv', parse_dates=['date'])

anomaly_driver_idx = [320227, 799250, 800600]

In [32]:
train = train.drop('day', axis=1)

In [33]:
drivers2 = drivers[~drivers['driver_id'].isin(anomaly_driver_idx)].reset_index(drop=True)
train2 = train[~train['driver_id'].isin(anomaly_driver_idx)].reset_index(drop=True)
test2 = test[~test['driver_id'].isin(anomaly_driver_idx)].reset_index(drop=True)

In [34]:
date = pd.date_range(start='6/1/2017', end='6/21/2017')
len_driver = len(test2['driver_id'].unique())
len_date = len(date)

df = pd.DataFrame()
df['driver_id'] = np.tile(test2['driver_id'].unique(), len_date)
df['date'] = date.repeat(len_driver)
df = df.sort_values(['driver_id', 'date'])

train_merge = pd.merge(df, train2, on=['driver_id', 'date'], how='left', validate = 'm:1')
train_merge.fillna(0, inplace=True)

train_merge = pd.merge(train_merge, drivers2, on='driver_id', how='left', validate = 'm:1')

In [35]:
test_merge = pd.merge(test2, drivers2, on='driver_id', how='left', validate = 'm:1')

In [36]:
train_merge.head()

Unnamed: 0,driver_id,date,online_hours,gender,age,number_of_kids
0,111556,2017-06-01,2.216667,FEMALE,49,4
1,111556,2017-06-02,2.5,FEMALE,49,4
2,111556,2017-06-03,0.0,FEMALE,49,4
3,111556,2017-06-04,0.0,FEMALE,49,4
4,111556,2017-06-05,4.666667,FEMALE,49,4


In [37]:
test_merge.head()

Unnamed: 0,driver_id,date,online_hours,gender,age,number_of_kids
0,979863,2017-06-28,7,MALE,26,2
1,979863,2017-06-27,9,MALE,26,2
2,979863,2017-06-26,9,MALE,26,2
3,979863,2017-06-25,10,MALE,26,2
4,979863,2017-06-24,9,MALE,26,2


# Known Driver 

In [38]:
def additional_features(df):
    df['gender'] = (df['gender']=='MALE').astype(int)
    
    df['day'] = df['date'].dt.day
    df['dayofweek'] = df['date'].dt.dayofweek
    
    df['holiday'] = (df['dayofweek'] > 4).astype(int)
#     df.sort_values(['driver_id', 'date'], ascending=False, inplace=True)
    df['prev_holiday'] = df.groupby(['driver_id'])['holiday'].shift(1).fillna(0).astype(int)
    df['next_holiday'] = df.groupby(['driver_id'])['holiday'].shift(-1).fillna(0).astype(int)

In [39]:
additional_features(train_merge)
additional_features(test_merge)

In [40]:
test_merge[test_merge.driver_id==979863]

Unnamed: 0,driver_id,date,online_hours,gender,age,number_of_kids,day,dayofweek,holiday,prev_holiday,next_holiday
0,979863,2017-06-28,7,1,26,2,28,2,0,0,0
1,979863,2017-06-27,9,1,26,2,27,1,0,0,0
2,979863,2017-06-26,9,1,26,2,26,0,0,0,1
3,979863,2017-06-25,10,1,26,2,25,6,1,0,1
4,979863,2017-06-24,9,1,26,2,24,5,1,1,0
5,979863,2017-06-23,8,1,26,2,23,4,0,1,0
6,979863,2017-06-22,7,1,26,2,22,3,0,0,0


In [41]:
train_merge.loc[(train_merge['dayofweek']==6)]

Unnamed: 0,driver_id,date,online_hours,gender,age,number_of_kids,day,dayofweek,holiday,prev_holiday,next_holiday
3,111556,2017-06-04,0.000000,0,49,4,4,6,1,1,0
10,111556,2017-06-11,0.000000,0,49,4,11,6,1,1,0
17,111556,2017-06-18,0.000000,0,49,4,18,6,1,1,0
24,111575,2017-06-04,0.000000,1,49,0,4,6,1,1,0
31,111575,2017-06-11,0.000000,1,49,0,11,6,1,1,0
38,111575,2017-06-18,0.000000,1,49,0,18,6,1,1,0
45,111779,2017-06-04,0.000000,1,26,0,4,6,1,1,0
52,111779,2017-06-11,0.000000,1,26,0,11,6,1,1,0
59,111779,2017-06-18,0.000000,1,26,0,18,6,1,1,0
66,111839,2017-06-04,8.229167,1,25,0,4,6,1,1,0


In [42]:
driver_group = train_merge.groupby('driver_id')['online_hours'].agg(['mean', 'std']).reset_index()
driver_group_2 = train_merge.groupby(['driver_id','holiday'])['online_hours'].agg(['mean', 'std']).reset_index()
driver_group_3 = driver_group_2[driver_group_2.holiday ==1]

In [43]:
def additional_features_2(df):    
    df.loc[(df['age']<=40), 'age2'] = 0
    df.loc[(df['age']>40), 'age2'] = 1
    df['age3'] = df['age'] // 10
    
    driver_0 = driver_group[driver_group['mean']<=5]['driver_id'].unique()
    driver_1 = driver_group[driver_group['mean']>5]['driver_id'].unique()
    df['c1'] = 0
    df.loc[df['driver_id'].isin(driver_1), 'c1'] = 1
    
    driver_0 = driver_group[driver_group['mean']<=2]['driver_id'].unique()
    driver_1 = driver_group[(driver_group['mean']>2)&(driver_group['mean']<=4)]['driver_id'].unique()
    driver_2 = driver_group[(driver_group['mean']>4)&(driver_group['mean']<=7)]['driver_id'].unique()
    driver_3 = driver_group[driver_group['mean']>7]['driver_id'].unique()
    df['c2'] = 0
    df.loc[df['driver_id'].isin(driver_1), 'c2'] = 1
    df.loc[df['driver_id'].isin(driver_2), 'c2'] = 2
    df.loc[df['driver_id'].isin(driver_3), 'c2'] = 3
    
    driver_0 = driver_group_3[driver_group_3['mean']<=3]['driver_id'].unique()
    driver_1 = driver_group_3[driver_group_3['mean']>3]['driver_id'].unique()
    df['c3'] = 0
    df.loc[df['driver_id'].isin(driver_1), 'c3'] = 1
    
    df['gender_kids'] = None
    df.loc[(df['gender'] == 0)&(df['number_of_kids'] == 0), ['gender_kids']] = 0
    df.loc[(df['gender'] == 0)&(df['number_of_kids'] == 1), ['gender_kids']] = 1
    df.loc[(df['gender'] == 0)&(df['number_of_kids'] == 2), ['gender_kids']] = 2
    df.loc[(df['gender'] == 0)&(df['number_of_kids'] == 3), ['gender_kids']] = 3
    df.loc[(df['gender'] == 0)&(df['number_of_kids'] == 4), ['gender_kids']] = 4
    df.loc[(df['gender'] == 1)&(df['number_of_kids'] == 0), ['gender_kids']] = 5
    df.loc[(df['gender'] == 1)&(df['number_of_kids'] == 1), ['gender_kids']] = 6
    df.loc[(df['gender'] == 1)&(df['number_of_kids'] == 2), ['gender_kids']] = 7
    df.loc[(df['gender'] == 1)&(df['number_of_kids'] == 3), ['gender_kids']] = 8
    df.loc[(df['gender'] == 1)&(df['number_of_kids'] == 4), ['gender_kids']] = 9
    
    df['age_kids'] = None
    df.loc[(df['age2'] == 0)&(df['number_of_kids'] == 0), ['age_kids']] = 0
    df.loc[(df['age2'] == 0)&(df['number_of_kids'] == 1), ['age_kids']] = 1
    df.loc[(df['age2'] == 0)&(df['number_of_kids'] == 2), ['age_kids']] = 2
    df.loc[(df['age2'] == 0)&(df['number_of_kids'] == 3), ['age_kids']] = 3
    df.loc[(df['age2'] == 0)&(df['number_of_kids'] == 4), ['age_kids']] = 4
    df.loc[(df['age2'] == 1)&(df['number_of_kids'] == 0), ['age_kids']] = 5
    df.loc[(df['age2'] == 1)&(df['number_of_kids'] == 1), ['age_kids']] = 6
    df.loc[(df['age2'] == 1)&(df['number_of_kids'] == 2), ['age_kids']] = 7
    df.loc[(df['age2'] == 1)&(df['number_of_kids'] == 3), ['age_kids']] = 8
    df.loc[(df['age2'] == 1)&(df['number_of_kids'] == 4), ['age_kids']] = 9
    return df

In [44]:
train_merge_2 = additional_features_2(train_merge)
test_merge_2 = additional_features_2(test_merge)

In [45]:
def get_features_3(train, test):
    start = test.day.min() - 15
    combined = pd.concat([train[train.day>start], test])
    df_train = additional_features_3(train)
    df_test = additional_features_3(combined)
    return df_train, df_test

def additional_features_3(merge):
    df_merge = merge.copy().sort_values(['driver_id', 'date'])
    df_merge['prev_7'] = df_merge.groupby('driver_id')['online_hours'].shift(7)
    df_merge['prev_14'] = df_merge.groupby('driver_id')['online_hours'].shift(14)
    df_merge.dropna(inplace=True)
    df_merge = df_merge.reset_index(drop=True)
    return df_merge

In [46]:
train_merge_3, test_merge_3 = get_features_3(train_merge_2, test_merge_2)

In [47]:
active_drivers = train_merge_3.groupby(['driver_id']).filter(lambda x: x['online_hours'].mean() > 1).driver_id.unique()
lebaran = test_merge_3['day'].isin([25, 26])
off = test_merge_3['online_hours'] == 0
active = test_merge_2['driver_id'].isin(active_drivers)
anomaly_index = (active)&(lebaran)&(off)
np.sum(anomaly_index)

1483

In [48]:
train_merge_3.to_csv('data/train_v1.csv', index=False)
test_merge_3.to_csv('data/test_v1.csv', index=False)

In [53]:
anomaly_index = (test_merge_3['day'].isin([25, 26]))&(test_merge_3['online_hours'] == 0)
test_merge_after_anomaly = test_merge_3[~anomaly_index]

In [None]:
test_merge_after_anomaly.to_csv('data/test_v1.csv', index=False)

In [50]:
drop_cols = ['driver_id', 'day', 'date', 'online_hours', 'holiday', 'next_holiday', 'prev_holiday']
x_train = train_merge_3.drop(drop_cols, axis=1)
y_train = train_merge_3['online_hours']

x_test = test_merge_3.drop(drop_cols, axis=1)
y_test = test_merge_3['online_hours']

In [22]:
x_train.head(2)

Unnamed: 0,gender,age,number_of_kids,dayofweek,age2,age3,c1,c2,c3,gender_kids,age_kids,prev_7,prev_14
0,0,49,4,3,1.0,4,0,0,0,4,9,3.179167,2.216667
1,0,49,4,4,1.0,4,0,0,0,4,9,4.433333,2.5


In [23]:
x_train.columns

Index(['gender', 'age', 'number_of_kids', 'dayofweek', 'age2', 'age3', 'c1',
       'c2', 'c3', 'gender_kids', 'age_kids', 'prev_7', 'prev_14'],
      dtype='object')

# Regressor 

In [24]:
import warnings
warnings.filterwarnings('ignore')

In [51]:
def init_model():
    models = {}
    models['ext'] = ExtraTreesRegressor(random_state=0, verbose=0)
    models['ada'] = AdaBoostRegressor(random_state=0)
    models['catboost'] = CatBoostRegressor(random_state=0, verbose=0)
    models['lgbm'] = LGBMRegressor(random_state=0, verbose=0)
    models['xgb'] = XGBRegressor(random_state=0, obj='reg:squarederror')
    models['rf'] = RandomForestRegressor(random_state=0)
    models['dt'] = DecisionTreeRegressor(random_state=0)
    return models

def experiment(x_train, y_train, 
               x_test, y_test, 
               df_test, base_name = ''):
    models = init_model()
    for m in models:
        if m in ['rf', 'dt', 'ext', 'ada']:
            models[m].fit(x_train,y_train)
        else :
            models[m].fit(x_train,y_train, 
                          eval_set=[(x_test, y_test)], 
                          early_stopping_rounds=10, verbose=0)

        if m not in ['dt']:
            importance = models[m].feature_importances_
            for i in np.argsort(importance)[::-1][:3]:
                print(x_train.columns[i], "%.2f"% importance[i])

        y_predicted = models[m].predict(x_test).clip(0, 11)
        rmse = np.sqrt(mean_squared_error(y_test, y_predicted))
        r2 = r2_score(y_test, y_predicted)
        print(m, "================ RMSE: %.5f ===========R2: %.5f"% (rmse, r2))
        print('')
        
        df_test['pred_'+m] = y_predicted
        df_test.to_csv('preds/pred'+base_name+'_v1.csv', index=False)
        # 1.78

In [52]:
df_test = test_merge_3[['driver_id', 'date', 'online_hours']]

In [25]:
experiment(x_train, y_train, x_test, y_test, df_test)

prev_7 0.28
c1 0.25
prev_14 0.23

prev_14 0.38
prev_7 0.31
c2 0.22

prev_7 34.87
prev_14 29.39
c2 11.35

prev_14 304.00
prev_7 303.00
age 207.00

prev_14 0.33
c1 0.30
prev_7 0.21

prev_14 0.51
prev_7 0.30
age 0.05




In [26]:
experiment(x_train, y_train,
           x_test[~anomaly_index], y_test[~anomaly_index], 
           df_test[~anomaly_index], '_anomaly')

prev_7 0.28
c1 0.25
prev_14 0.23

prev_14 0.38
prev_7 0.31
c2 0.22

prev_7 34.87
prev_14 29.39
c2 11.35

prev_14 304.00
prev_7 303.00
age 207.00

prev_14 0.33
c1 0.30
prev_7 0.21

prev_14 0.51
prev_7 0.30
age 0.05




# Manually Old version 

In [27]:
# dt = DecisionTreeRegressor() 
# dt.fit(x_train,y_train)
# y_predicted = dt.predict(x_test).clip(0, 11)
# rmse = np.sqrt(mean_squared_error(y_test, y_predicted))
# print("anomaly RMSE: %.5f"% rmse)
# rmse = np.sqrt(mean_squared_error(y_test[~anomaly_index], 
#                                   y_predicted[~anomaly_index]))
# print("RMSE: %.5f"% rmse)
# # 2.45 without prev_2

In [28]:
# rf = RandomForestRegressor() 
# rf.fit(x_train,y_train)
# y_predicted = rf.predict(x_test).clip(0, 11)
# rmse = np.sqrt(mean_squared_error(y_test, y_predicted))
# print("anomaly RMSE: %.5f"% rmse)
# rmse = np.sqrt(mean_squared_error(y_test[~anomaly_index], 
#                                   y_predicted[~anomaly_index]))
# print("RMSE: %.5f"% rmse)
# # 1.92

In [29]:
# {x_train.columns[i]:rf.feature_importances_[i] for i in np.argsort(rf.feature_importances_)}

In [30]:
# xgb = XGBRegressor(learning_rate=0.01, n_estimators=1000)
# xgb.fit(x_train,y_train, 
#          eval_set=[(x_test, y_test)], 
#          early_stopping_rounds=10, 
#          verbose=0)
# y_predicted = xgb.predict(x_test).clip(0, 11)
# rmse = np.sqrt(mean_squared_error(y_test, y_predicted))
# print("anomaly RMSE: %.5f"% rmse)
# rmse = np.sqrt(mean_squared_error(y_test[~anomaly_index], 
#                                   y_predicted[~anomaly_index]))
# print("RMSE: %.5f"% rmse)
# # 1.79

In [31]:
# {x_train.columns[i]:xgb.feature_importances_[i] for i in np.argsort(xgb.feature_importances_)}

In [32]:
# lgbm = LGBMRegressor()
# lgbm.fit(x_train,y_train, 
#          eval_set=(x_test, y_test), 
#          early_stopping_rounds=10, 
#          verbose=0)
# y_predicted = lgbm.predict(x_test).clip(0, 11)
# rmse = np.sqrt(mean_squared_error(y_test, y_predicted))
# print("anomaly RMSE: %.5f"% rmse)
# rmse = np.sqrt(mean_squared_error(y_test[~anomaly_index], 
#                                   y_predicted[~anomaly_index]))
# print("RMSE: %.5f"% rmse)
# # 1.78

In [33]:
# {x_train.columns[i]:lgbm.feature_importances_[i] for i in np.argsort(lgbm.feature_importances_)}

In [34]:
# cb = CatBoostRegressor(verbose=0)
# cb.fit(x_train,y_train, 
#        eval_set=(x_test, y_test), 
#        early_stopping_rounds=10, 
#        verbose=0)
# y_predicted = cb.predict(x_test).clip(0, 11)
# rmse = np.sqrt(mean_squared_error(y_test, y_predicted))
# print("anomaly RMSE: %.5f"% rmse)

# cb = CatBoostRegressor(verbose=0)
# cb.fit(x_train,y_train, 
#        eval_set=(x_test[~anomaly_index], y_test[~anomaly_index]), 
#        early_stopping_rounds=10, 
#        verbose=0)
# y_predicted = cb.predict(x_test).clip(0, 11)
# rmse = np.sqrt(mean_squared_error(y_test[~anomaly_index], 
#                                   y_predicted[~anomaly_index]))
# print("RMSE: %.5f"% rmse)
# # 1.78

In [35]:
# {x_train.columns[i]:cb.feature_importances_[i] for i in np.argsort(cb.feature_importances_)}

# Cluster 

In [27]:
train_cluster = train_merge_3.copy()

In [28]:
n_cluster = [3, 5, 10]
vectors = train_cluster[['prev_7', 'prev_14']].values
kmeans = {c:KMeans(n_clusters=c, random_state=0).fit(vectors) for c in n_cluster}

for c in n_cluster:
    train_cluster['cr_'+str(c)] = kmeans[c].predict(vectors)
    
cluster_group = train_cluster[['driver_id', 'dayofweek']+['cr_'+str(c) for c in n_cluster]]
train_merge_cluster = pd.merge(train_merge_3, cluster_group, on=['driver_id', 'dayofweek'], how='left')
test_merge_cluster = pd.merge(test_merge_3, cluster_group, on=['driver_id', 'dayofweek'], how='left')
test_merge_cluster.isnull().any().any(), train_merge_cluster.isnull().any().any()

(False, False)

In [29]:
train_merge_cluster.to_csv('data/train_cluster_v1.csv', index=False)
test_merge_cluster.to_csv('data/test_cluster_v1.csv', index=False)

In [30]:
x_train_0 = train_merge_cluster.drop(drop_cols, axis=1)
y_train_0 = train_merge_cluster['online_hours']

x_test_0 = test_merge_cluster.drop(drop_cols, axis=1)
y_test_0 = test_merge_cluster['online_hours']

In [31]:
experiment(x_train_0, y_train_0, x_test_0, y_test_0, df_test, '_cluster')

c1 0.26
prev_14 0.21
prev_7 0.18

prev_14 0.38
prev_7 0.29
c2 0.23

prev_7 27.45
prev_14 26.42
c2 10.89

prev_14 256.00
prev_7 254.00
age 191.00

prev_14 0.32
c1 0.24
prev_7 0.22

prev_14 0.50
prev_7 0.30
age 0.05




In [32]:
experiment(x_train_0, y_train_0, 
           x_test_0[~anomaly_index], y_test_0[~anomaly_index], 
           df_test[~anomaly_index], '_cluster_no_anomaly')

NameError: name 'experiment' is not defined

In [None]:
lgbm = LGBMRegressor()
lgbm = lgbm.fit(x_train_0, y_train_0)
perm = PermutationImportance(lgbm, random_state=0).fit(x_test_0, y_test_0)
eli5.show_weights(perm)

In [33]:
negative_features_2 = x_train_0.columns[perm.feature_importances_<=0]
negative_features_2

NameError: name 'x_train_0' is not defined

In [45]:
x_train_2 = train_merge_cluster.drop(drop_cols, axis=1)
x_train_2 = x_train_2.drop(negative_features_2, axis=1)
y_train_2 = train_merge_cluster['online_hours']

x_test_2 = test_merge_cluster.drop(drop_cols, axis=1)
x_test_2 = x_test_2.drop(negative_features_2, axis=1)
y_test_2 = test_merge_cluster['online_hours']

In [46]:
experiment(x_train_2, y_train_2, x_test_2, y_test_2, df_test, '_perm')

c1 0.29
prev_14 0.20
prev_7 0.14

prev_14 0.38
prev_7 0.29
c2 0.23

prev_7 27.28
prev_14 26.10
c2 11.51

prev_14 256.00
prev_7 254.00
age 191.00

prev_14 0.32
c1 0.24
prev_7 0.22

prev_14 0.50
prev_7 0.30
age 0.06




In [47]:
experiment(x_train_2, y_train_2, 
           x_test_2[~anomaly_index], y_test_2[~anomaly_index], 
           df_test[~anomaly_index], '_perm_no_anomaly')

c1 0.29
prev_14 0.20
prev_7 0.14

prev_14 0.38
prev_7 0.29
c2 0.23

prev_7 27.28
prev_14 26.10
c2 11.51

prev_14 256.00
prev_7 254.00
age 191.00

prev_14 0.32
c1 0.24
prev_7 0.22

prev_14 0.50
prev_7 0.30
age 0.06




In [48]:
lgbm = LGBMRegressor()
selector = RFE(lgbm, len(x_train.columns)-len(negative_features_2), step=1)
selector = selector.fit(x_train, y_train)
negative_features_1 = x_train.columns[~selector.support_]
negative_features_1

Index(['age2', 'age3'], dtype='object')

In [49]:
x_train_1 = train_merge_cluster.drop(drop_cols, axis=1)
x_train_1 = x_train_1.drop(negative_features_1, axis=1)
y_train_1 = train_merge_cluster['online_hours']

x_test_1 = test_merge_cluster.drop(drop_cols, axis=1)
x_test_1 = x_test_1.drop(negative_features_1, axis=1)
y_test_1 = test_merge_cluster['online_hours']

In [50]:
experiment(x_train_1, y_train_1, x_test_1, y_test_1, df_test, '_recursive')

c1 0.29
prev_14 0.20
prev_7 0.14

prev_14 0.38
prev_7 0.29
c2 0.23

prev_7 27.28
prev_14 26.10
c2 11.51

prev_14 256.00
prev_7 254.00
age 191.00

prev_14 0.32
c1 0.24
prev_7 0.22

prev_14 0.50
prev_7 0.30
age 0.06




In [51]:
experiment(x_train_1, y_train_1, 
           x_test_1[~anomaly_index], y_test_1[~anomaly_index], 
           df_test[~anomaly_index], '_recursive_no_anomaly')

c1 0.29
prev_14 0.20
prev_7 0.14

prev_14 0.38
prev_7 0.29
c2 0.23

prev_7 27.28
prev_14 26.10
c2 11.51

prev_14 256.00
prev_7 254.00
age 191.00

prev_14 0.32
c1 0.24
prev_7 0.22

prev_14 0.50
prev_7 0.30
age 0.06


