In [138]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
%matplotlib inline
# Seed NumPy's global RNG so code that draws from np.random repeats on a fresh-kernel run.
np.random.seed(42)
def f1_score(l_true,l_pred):
    """Set-based F1 score between a true and a predicted collection of items.

    Both arguments are reduced to sets, so duplicates and ordering are ignored.

    Parameters
    ----------
    l_true : iterable of hashable
        Items that are actually present.
    l_pred : iterable of hashable
        Items that were predicted.

    Returns
    -------
    float
        Harmonic mean of precision and recall. ``0.0`` when the two sets have
        no element in common (which includes either of them being empty).
    """
    # Materialise each input exactly once: a one-shot iterator would be
    # exhausted by a second ``set(...)`` call, which would silently empty the
    # false-positive / false-negative sets and inflate the score.
    true_set = set(l_true)
    pred_set = set(l_pred)

    tp = true_set & pred_set
    if not tp:
        # No overlap: precision and recall are both zero; avoid 0/0 below.
        return 0.0

    fp = pred_set - tp
    fn = true_set - tp
    p = len(tp) / (len(tp) + len(fp))
    r = len(tp) / (len(tp) + len(fn))
    return 2 * (p * r) / (p + r)
def avg_f1_score(df,pred,order_products_compact=None,top_k=15,threshold=0.05):
    """Mean per-order F1 of a thresholded top-k product selection.

    Within every order the candidate rows are ranked by ``pred`` (highest
    first). The best-ranked candidate is always selected; the following ones
    are selected only if they are among the ``top_k`` best and their score is
    above ``threshold``. The selected product ids (converted to strings) are
    compared with that order's ``product_id`` entry of
    ``order_products_compact`` through ``f1_score``, and the per-order scores
    are averaged.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate rows; must provide ``order_id`` and ``product_id`` columns.
    pred : array-like
        One score per row of ``df``.
    order_products_compact : pandas.DataFrame, optional
        Table with ``order_id`` and ``product_id`` columns, where
        ``product_id`` holds the collection of true ids for the order
        (presumably a list of id strings -- TODO confirm against the data
        preparation step). When omitted, the notebook-level variable of the
        same name is looked up at call time.
    top_k : int, default 15
        Maximum number of candidates considered per order.
    threshold : float, default 0.05
        Minimum score for any candidate other than the best-ranked one.

    Returns
    -------
    float
        Mean F1 over orders; NaN when ``df`` contains no rows.

    Raises
    ------
    NameError
        If ``order_products_compact`` is neither passed nor defined globally.
    """
    if order_products_compact is None:
        # Resolve the table when the function is called, not when it is
        # defined: the defining cell sits above the cell that loads the data,
        # so a definition-time default fails on a fresh kernel.
        order_products_compact = globals().get('order_products_compact')
        if order_products_compact is None:
            raise NameError(
                "order_products_compact is not defined; load it first or pass it explicitly"
            )

    ranked = (
        pd.DataFrame({'order_id': df.order_id, 'pred': pred, 'product_id': df.product_id})
        .sort_values(['order_id', 'pred'], ascending=[True, False])
        .reset_index(drop=True)
    )
    ranked['pred_rank'] = ranked.groupby('order_id').cumcount()

    is_best = ranked.pred_rank == 0
    keep = is_best | ((ranked.pred_rank < top_k) & (ranked.pred > threshold))
    selected = ranked[keep]

    # order_id -> selected product ids as strings, best score first.
    # ``str(...).split()`` reproduces the whitespace tokenisation of the
    # previous join-then-split implementation exactly.
    picks = {}
    for order_id, product_id in zip(selected.order_id, selected.product_id):
        picks.setdefault(order_id, []).extend(str(product_id).split())

    if not picks:
        return float('nan')

    df_pred_compact = pd.DataFrame({'order_id': list(picks.keys()),
                                    'y_pred': list(picks.values())})
    # NOTE(review): with a left join, an order missing from
    # order_products_compact gets a NaN product_id, which f1_score cannot
    # iterate -- confirm every scored order has a ground-truth row.
    df_pred_compact = df_pred_compact.merge(order_products_compact[['order_id', 'product_id']],
                                            how='left', on='order_id')

    scores = [f1_score(y_true, y_pred)
              for y_pred, y_true in zip(df_pred_compact.y_pred, df_pred_compact.product_id)]
    return np.mean(scores)
def auc_score(df,pred):
    """Return the ROC AUC of ``pred`` measured against ``df['labels']``."""
    y_true = df['labels'].values
    return roc_auc_score(y_true, pred)



In [16]:
# Model-ready rows (one per candidate) and the per-order ground-truth table.
df = pd.read_hdf('../data/online_retail_transformed.h5', key='train')
order_products_compact = pd.read_hdf('../data/online_retail.h5', key='order_products_compact')

# Full feature list.
f_to_use1 = [
    'user_total_orders',
    'user_total_items',
    'total_distinct_items',
    'user_average_days_between_orders',
    'user_average_basket',
    'order_hour_of_day',
    'days_since_prior_order',
    'days_since_ratio',
    'product_orders',
    'product_reorders',
    'product_reorder_rate',
    'UP_orders',
    'UP_orders_ratio',
    'UP_reorder_rate',
]

# Reduced feature list; this is the one the models below are trained on.
f_to_use2 = [
    'user_total_orders',
    'user_average_days_between_orders',
    'user_average_basket',
    'order_hour_of_day',
    'days_since_ratio',
    'product_orders',
    'UP_orders',
]

In [17]:
# Correlation test: pairwise correlation matrix of the f_to_use2 feature columns.
df[f_to_use2].corr()

Unnamed: 0,user_total_orders,user_average_days_between_orders,user_average_basket,order_hour_of_day,days_since_ratio,product_orders,UP_orders
user_total_orders,1.0,-0.4123759,0.007886892,-0.004870814,-0.1020318,4.4352270000000006e-17,0.200472
user_average_days_between_orders,-0.4123759,1.0,0.02639173,0.04120785,0.06802189,-1.2493350000000001e-17,-0.067077
user_average_basket,0.007886892,0.02639173,1.0,0.02930158,-0.02165999,-4.4022570000000004e-17,0.090046
order_hour_of_day,-0.004870814,0.04120785,0.02930158,1.0,-0.02046899,-1.762072e-17,0.00341
days_since_ratio,-0.1020318,0.06802189,-0.02165999,-0.02046899,1.0,1.310377e-17,-0.016812
product_orders,4.4352270000000006e-17,-1.2493350000000001e-17,-4.4022570000000004e-17,-1.762072e-17,1.310377e-17,1.0,0.193062
UP_orders,0.2004725,-0.06707733,0.09004567,0.003410053,-0.01681185,0.1930615,1.0


In [18]:
# train / val split
# Split by user rather than by row, so all rows of one user fall on the same side.
# The shuffle uses its own seeded RandomState instead of the global NumPy RNG:
# the partition then no longer depends on which cells ran before this one, and
# re-running the cell reproduces it. RandomState(42) yields the same permutation
# as np.random.seed(42) immediately followed by np.random.shuffle.
unique_users = df.user_id.unique()
np.random.RandomState(42).shuffle(unique_users)
sp = int(len(unique_users)*0.8)  # first 80% of the shuffled users train, the rest validate
train_users = unique_users[:sp]
val_users = unique_users[sp:]
df_train = df[df.user_id.isin(train_users)]
df_val = df[df.user_id.isin(val_users)]

### basic logistic regression model

In [136]:
%%time
# Logistic-regression baseline on the reduced feature set. The validation
# prediction is column 1 of predict_proba; it is scored in the cells below.
lgr = LogisticRegression(random_state=42, n_jobs=-1)
lgr.fit(df_train[f_to_use2], df_train['labels'].values)
val_pred_lgr = lgr.predict_proba(df_val[f_to_use2])[:, 1]

CPU times: user 16.1 s, sys: 220 ms, total: 16.3 s
Wall time: 16.1 s


In [137]:
# Mean per-order F1 of the logistic-regression predictions on the validation users.
avg_f1_score(df_val,val_pred_lgr)

0.13574313519047465

In [139]:
# Row-level ROC AUC of the logistic-regression predictions on the validation users.
auc_score(df_val,val_pred_lgr)

0.84075227988515633

### basic lgb model

In [120]:
%%time
# LightGBM baseline on the reduced feature set.
ROUNDS = 100  # number of boosting iterations

params = dict(
    task='train',
    boosting_type='gbdt',
    objective='binary',
    metric={'binary_logloss'},
    num_leaves=96,
    max_depth=10,
    feature_fraction=0.9,
    bagging_fraction=0.95,
    bagging_freq=5,
)

d_train = lgb.Dataset(df_train[f_to_use2], label=df_train['labels'].values)
bst = lgb.train(params, d_train, num_boost_round=ROUNDS)
# Optional diagnostic: lgb.plot_importance(bst, figsize=(9,20))

# Predictions for the validation rows, scored in the cells below.
val_pred_lgb = bst.predict(df_val[f_to_use2])

CPU times: user 1min 29s, sys: 272 ms, total: 1min 29s
Wall time: 11.6 s


In [123]:
# Mean per-order F1 of the LightGBM predictions on the validation users.
avg_f1_score(df_val,val_pred_lgb)

0.17464065942417362

In [133]:
# Row-level ROC AUC of the LightGBM predictions on the validation users.
auc_score(df_val,val_pred_lgb)

0.87639669316758861

### basic random forest model

In [129]:
%%time
# Random-forest baseline on the reduced feature set. The validation prediction
# is column 1 of predict_proba; it is scored in the cells below.
rfc = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=5, n_jobs=-1)
rfc.fit(df_train[f_to_use2], df_train['labels'].values)
val_pred_rf = rfc.predict_proba(df_val[f_to_use2])[:, 1]

CPU times: user 4min 10s, sys: 1.52 s, total: 4min 12s
Wall time: 34.4 s


In [131]:
# Mean per-order F1 of the random-forest predictions on the validation users.
avg_f1_score(df_val,val_pred_rf)

0.17774172524686477

In [134]:
# Row-level ROC AUC of the random-forest predictions on the validation users.
auc_score(df_val,val_pred_rf)

0.86569471006487864

### basic xgb model

In [144]:
%%time
# XGBoost baseline on the reduced feature set.
# Note: d_train, ROUNDS and bst rebind the names used by the LightGBM cell.
ROUNDS = 100  # number of boosting iterations

xgb_params = {
    "objective": "reg:logistic",
    "eval_metric": "logloss",
    "eta": 0.1,
    "max_depth": 6,
    "min_child_weight": 10,
    "gamma": 0.70,
    "subsample": 0.76,
    "colsample_bytree": 0.95,
    "alpha": 2e-05,
    "lambda": 10,
}

d_train = xgb.DMatrix(df_train[f_to_use2], label=df_train['labels'].values)

# Only the training set is monitored; its log-loss is printed every 10 rounds.
watchlist = [(d_train, "train")]
bst = xgb.train(
    params=xgb_params,
    dtrain=d_train,
    num_boost_round=ROUNDS,
    evals=watchlist,
    verbose_eval=10,
)
# Validation predictions are produced in the next cell, wrapped in an xgb.DMatrix.

[0]	train-logloss:0.600244
[10]	train-logloss:0.189612
[20]	train-logloss:0.079399
[30]	train-logloss:0.043941
[40]	train-logloss:0.032301
[50]	train-logloss:0.028549
[60]	train-logloss:0.027333
[70]	train-logloss:0.026877
[80]	train-logloss:0.026668
[90]	train-logloss:0.026548
CPU times: user 7min 30s, sys: 2.01 s, total: 7min 32s
Wall time: 58.2 s


In [146]:
# Score the validation rows with the XGBoost booster (input wrapped in a DMatrix).
# NOTE(review): `bst` is also the name bound by the LightGBM cell, so this cell
# relies on the XGBoost training cell being the last one to have assigned `bst`.
val_pred_xgb = bst.predict(xgb.DMatrix(df_val[f_to_use2]))

In [147]:
# Mean per-order F1 of the XGBoost predictions on the validation users.
avg_f1_score(df_val,val_pred_xgb)

0.183218766311179

In [148]:
# Row-level ROC AUC of the XGBoost predictions on the validation users.
auc_score(df_val,val_pred_xgb)

0.87874221350447901