In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
%matplotlib inline
# Seed NumPy's global RNG so the np.random.shuffle-based user split further down is repeatable.
np.random.seed(42)

# NOTE(review): read_pickle can execute arbitrary code while unpickling - only load a trusted file.
df = pd.read_pickle('../data/train_transformed.p')
# Reads the 'order_products_compact' key of the HDF5 store; avg_f1_score joins it on
# order_id to look up each order's product_id entry.
order_products_compact = pd.read_hdf('../data/online_retail.h5','order_products_compact')

def _as_id_set(ids):
    """Normalise one basket to a set of product-id strings.

    Accepts an iterable of ids, a whitespace-separated string of ids, or a
    missing value (None / NaN, e.g. produced by an unmatched left join),
    which is treated as an empty basket.
    """
    if ids is None or (isinstance(ids, float) and ids != ids):
        return set()
    if isinstance(ids, str):
        ids = ids.split()
    return {str(i) for i in ids}


def _basket_f1(l_true, l_pred):
    """Set-based F1 between the true and the predicted products of one order.

    Returns 0.0 when the two baskets share no product.
    """
    true_ids = _as_id_set(l_true)
    pred_ids = _as_id_set(l_pred)
    n_hit = len(true_ids & pred_ids)
    if not n_hit:
        return 0.0
    p = n_hit / float(len(pred_ids))
    r = n_hit / float(len(true_ids))
    return 2 * (p * r) / (p + r)


def avg_f1_score(df,pred,order_products_compact=order_products_compact,top_k=15,min_prob=0.05):
    """Mean per-order F1 of the baskets implied by `pred`.

    Within each order the rows are ranked by descending `pred`. The top-ranked
    row is always kept; other rows are kept only if they are among the first
    `top_k` and their score exceeds `min_prob`. The kept product ids form the
    predicted basket, which is compared with the order's `product_id` entry in
    `order_products_compact` using a set-based F1.

    Parameters
    ----------
    df : DataFrame with `order_id` and `product_id` columns, one row per candidate.
    pred : scores aligned row-for-row with `df`.
    order_products_compact : DataFrame with `order_id` and `product_id` columns.
        NOTE(review): `product_id` is assumed to hold one basket per order (a list
        of ids or a space-separated string) - confirm against the data file.
    top_k : maximum number of products predicted per order.
    min_prob : score a product other than the top-ranked one must exceed.

    Returns
    -------
    float : mean F1 over the orders present in `df`; NaN when `df` has no rows.
    Orders without a match in `order_products_compact` score 0.
    """
    # sklearn.metrics.f1_score is deliberately NOT used here: it scores aligned
    # label vectors of equal length, not two baskets of different sizes.
    ranked = pd.DataFrame({'order_id':df.order_id,'pred':pred,'product_id':df.product_id}).\
                sort_values(['order_id','pred'],ascending = [True,False]).reset_index(drop=True)
    ranked['pred_rank'] = ranked.groupby('order_id').cumcount()
    keep = (ranked.pred_rank < top_k) & ((ranked.pred_rank == 0) | (ranked.pred > min_prob))
    selected = ranked[keep]

    # Collect the predicted basket of every order as a list of id strings.
    baskets = {}
    for order_id, product_id in zip(selected.order_id, selected.product_id):
        baskets.setdefault(order_id, []).append(str(product_id))
    if not baskets:
        return float('nan')

    df_pred_compact = pd.DataFrame({'order_id': list(baskets.keys()),
                                    'y_pred': list(baskets.values())})
    df_pred_compact = df_pred_compact.merge(order_products_compact[['order_id','product_id']],
                                            how='left', on='order_id')
    scores = [_basket_f1(y_true, y_pred)
              for y_true, y_pred in zip(df_pred_compact.product_id, df_pred_compact.y_pred)]
    return np.mean(scores)
def auc_score(df,pred):
    """ROC AUC of the scores `pred` against the `labels` column of `df`."""
    y_true = df['labels'].values
    return roc_auc_score(y_true, pred)



In [6]:
# Candidate feature sets, one column name per line so that diffs stay readable.
f_to_use1 = [
    'user_total_orders',
    'user_total_items',
    'total_distinct_items',
    'user_average_days_between_orders',
    'user_average_basket',
    'order_hour_of_day',
    'days_since_prior_order',
    'days_since_ratio',
    'product_orders',
    'product_reorders',
    'product_reorder_rate',
    'UP_orders',
    'UP_orders_ratio',
    'UP_reorder_rate',
]
f_to_use2 = [
    'user_total_orders',
    'user_average_days_between_orders',
    'user_average_basket',
    'order_hour_of_day',
    'days_since_ratio',
    'product_orders',
    'UP_orders',
]
# Feature set used by every model cell and by the CV helpers' default argument.
f_to_use3 = [
    'user_total_orders',
    'user_average_days_between_orders',
    'user_average_basket',
    'user_total_item_quantity',
    'order_hour_of_day',
    'days_since_ratio',
    'product_orders',
    'product_avg_price',
    'UP_orders',
    'UP_total_quantity',
]

In [7]:
# Pairwise correlation matrix of the f_to_use3 features (a quick collinearity check).
df[f_to_use3].corr()

Unnamed: 0,user_total_orders,user_average_days_between_orders,user_average_basket,user_total_item_quantity,order_hour_of_day,days_since_ratio,product_orders,product_avg_price,UP_orders,UP_total_quantity
user_total_orders,1.0,-0.4123759,0.007886892,0.5628045,-0.004870814,-0.1020318,4.4352270000000006e-17,-2.934025e-18,0.200472,0.060087
user_average_days_between_orders,-0.4123759,1.0,0.02639173,-0.2358832,0.04120785,0.06802189,-1.2493350000000001e-17,1.715587e-18,-0.067077,-0.025184
user_average_basket,0.007886892,0.02639173,1.0,0.05461234,0.02930158,-0.02165999,-4.4022570000000004e-17,1.990025e-18,0.090046,0.005831
user_total_item_quantity,0.5628045,-0.2358832,0.05461234,1.0,-0.02984752,-0.07512556,2.02016e-18,8.547502999999999e-19,0.115486,0.106763
order_hour_of_day,-0.004870814,0.04120785,0.02930158,-0.02984752,1.0,-0.02046899,-1.762072e-17,2.779258e-18,0.00341,-0.003187
days_since_ratio,-0.1020318,0.06802189,-0.02165999,-0.07512556,-0.02046899,1.0,1.310377e-17,-2.5624760000000002e-18,-0.016812,-0.008021
product_orders,4.4352270000000006e-17,-1.2493350000000001e-17,-4.4022570000000004e-17,2.02016e-18,-1.762072e-17,1.310377e-17,1.0,-0.04631732,0.193062,0.057101
product_avg_price,-2.934025e-18,1.715587e-18,1.990025e-18,8.547502999999999e-19,2.779258e-18,-2.5624760000000002e-18,-0.04631732,1.0,-0.008766,-0.006523
UP_orders,0.2004725,-0.06707733,0.09004567,0.1154858,0.003410053,-0.01681185,0.1930615,-0.008766088,1.0,0.324171
UP_total_quantity,0.06008667,-0.02518358,0.005830574,0.1067629,-0.003186609,-0.008020625,0.05710089,-0.006522737,0.324171,1.0


In [8]:
# Train / validation split by user: shuffle the distinct user ids (in place, using
# the globally seeded NumPy RNG) and hold out the last 20% of them, so that no
# user contributes rows to both sides.
unique_users = df.user_id.unique()
np.random.shuffle(unique_users)
sp = int(len(unique_users)*0.8)
train_users, val_users = unique_users[:sp], unique_users[sp:]

is_train_row = df.user_id.isin(train_users)
is_val_row = df.user_id.isin(val_users)
df_train = df[is_train_row]
df_val = df[is_val_row]

### basic logistic regression model

In [9]:
%%time
# Fit the baseline on the training users, then keep the second predict_proba
# column (the positive class, assuming 0/1 labels) for the validation rows.
lgr = LogisticRegression(random_state=42, n_jobs=-1)
lgr.fit(df_train[f_to_use3], df_train['labels'].values)
val_pred_lgr = lgr.predict_proba(df_val[f_to_use3])[:, 1]

CPU times: user 47.1 s, sys: 304 ms, total: 47.4 s
Wall time: 47.3 s


In [10]:
# Mean per-order F1 of the logistic-regression scores on the validation users.
avg_f1_score(df_val,val_pred_lgr)

0.13563705810789795

In [11]:
# Row-level ROC AUC of the logistic-regression scores on the validation users.
auc_score(df_val,val_pred_lgr)

0.83611407123463255

### basic lgb model

In [12]:
%%time
# LightGBM baseline: binary objective, 100 boosting rounds on the training users.
d_train = lgb.Dataset(df_train[f_to_use3], label=df_train['labels'].values)

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    # tree shape
    'num_leaves': 96,
    'max_depth': 10,
    # column / row subsampling (rows are re-bagged every 5 iterations)
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5,
}
ROUNDS = 100
bst = lgb.train(params, d_train, num_boost_round=ROUNDS)

# Scores for the validation rows.
val_pred_lgb = bst.predict(df_val[f_to_use3])

CPU times: user 1min 53s, sys: 328 ms, total: 1min 54s
Wall time: 14.7 s


In [13]:
# Mean per-order F1 of the LightGBM scores on the validation users.
avg_f1_score(df_val,val_pred_lgb)

0.1778790000995055

In [14]:
# Row-level ROC AUC of the LightGBM scores on the validation users.
auc_score(df_val,val_pred_lgb)

0.88430461526599202

### basic random forest model

In [15]:
%%time
# Shallow random forest (100 trees, depth 5) fitted on the training users;
# the second predict_proba column is kept as the validation score.
rfc = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=5, n_jobs=-1)
rfc.fit(df_train[f_to_use3], df_train['labels'].values)
val_pred_rf = rfc.predict_proba(df_val[f_to_use3])[:, 1]

CPU times: user 5min 46s, sys: 1.41 s, total: 5min 48s
Wall time: 45.3 s


In [16]:
# Mean per-order F1 of the random-forest scores on the validation users.
avg_f1_score(df_val,val_pred_rf)

0.17841304459328036

In [17]:
# Row-level ROC AUC of the random-forest scores on the validation users.
auc_score(df_val,val_pred_rf)

0.87439835410977806

### basic xgb model

In [18]:
%%time
# XGBoost baseline: logistic objective, 100 boosting rounds on the training users.
d_train = xgb.DMatrix(df_train[f_to_use3], label=df_train['labels'].values)

xgb_params = {
    "objective": "reg:logistic",
    "eval_metric": "logloss",
    "eta": 0.1,
    "max_depth": 6,
    "min_child_weight": 10,
    "gamma": 0.70,
    "subsample": 0.76,
    "colsample_bytree": 0.95,
    "alpha": 2e-05,
    "lambda": 10,
}
ROUNDS = 100

# Only the training set is watched, so the log-loss printed every 10 rounds is a
# training loss, not a validation loss.
watchlist = [(d_train, "train")]
bst = xgb.train(
    params=xgb_params,
    dtrain=d_train,
    num_boost_round=ROUNDS,
    evals=watchlist,
    verbose_eval=10,
)

# Scores for the validation rows.
val_pred_xgb = bst.predict(xgb.DMatrix(df_val[f_to_use3]))

[0]	train-logloss:0.600282
[10]	train-logloss:0.189791
[20]	train-logloss:0.079525
[30]	train-logloss:0.044026
[40]	train-logloss:0.032438
[50]	train-logloss:0.028758
[60]	train-logloss:0.027551
[70]	train-logloss:0.027082
[80]	train-logloss:0.026827
[90]	train-logloss:0.026633
CPU times: user 9min 32s, sys: 1.85 s, total: 9min 34s
Wall time: 1min 13s


In [19]:
# Mean per-order F1 of the XGBoost scores on the validation users.
avg_f1_score(df_val,val_pred_xgb)

0.18548316412881449

In [20]:
# Row-level ROC AUC of the XGBoost scores on the validation users.
auc_score(df_val,val_pred_xgb)

0.88363361464251644

# Group K Fold CV

In [22]:
def _group_kfold_scores(fit_predict, n_splits=4, shuffle_seed=43):
    """Run grouped K-fold CV over the notebook-level `df` and average the metrics.

    The rows of `df` are shuffled with a fixed seed and split with GroupKFold on
    `order_id`, so all rows of an order land in the same fold. For every fold,
    `fit_predict(df_train, df_val)` must return one score per row of `df_val`.

    Returns
    -------
    (mean avg_f1_score, mean auc_score) over the folds.
    """
    df_shuffle = df.sample(frac=1,random_state=shuffle_seed).reset_index(drop=True)
    group_kfold = GroupKFold(n_splits=n_splits)
    # GroupKFold only needs the number of rows from X; the groups drive the split.
    folds = group_kfold.split(X=df_shuffle, groups=df_shuffle['order_id'].values)

    avg_f1_score_list = []
    auc_score_list = []
    for fold_no, (train_index, val_index) in enumerate(folds, start=1):
        print ('Fold {}...'.format(fold_no))
        # Indices are positional; the index was reset above, so iloc selects the
        # same rows as a label lookup would.
        df_train = df_shuffle.iloc[train_index]
        df_val = df_shuffle.iloc[val_index]
        val_pred = fit_predict(df_train, df_val)
        avg_f1_score_list.append(avg_f1_score(df_val,val_pred))
        auc_score_list.append(auc_score(df_val,val_pred))
    return np.mean(avg_f1_score_list),np.mean(auc_score_list)


def group_kfold_cv(model = None,f_to_use = f_to_use3):
    """Grouped 4-fold CV of a scikit-learn style classifier.

    Parameters
    ----------
    model : estimator exposing get_params/fit/predict_proba. Defaults to
        LogisticRegression(random_state=42, n_jobs=-1), built per call rather
        than once at definition time.
    f_to_use : list of feature column names.

    Returns
    -------
    (mean per-order F1, mean ROC AUC) over the folds.
    """
    from sklearn.base import clone

    if model is None:
        model = LogisticRegression(random_state=42,n_jobs=-1)

    def fit_predict(df_train, df_val):
        # Fit a fresh unfitted copy per fold so that neither the caller's
        # estimator nor a previous fold's state is carried over.
        m = clone(model).fit(df_train[f_to_use],df_train['labels'].values)
        return m.predict_proba(df_val[f_to_use])[:,1]

    return _group_kfold_scores(fit_predict)


def group_kfold_lgb(f_to_use = f_to_use3, params = None, rounds = 100):
    """Grouped 4-fold CV of a LightGBM booster.

    Parameters
    ----------
    f_to_use : list of feature column names.
    params : LightGBM parameter dict; defaults to the notebook's baseline settings.
    rounds : number of boosting rounds.

    Returns
    -------
    (mean per-order F1, mean ROC AUC) over the folds.
    """
    if params is None:
        params = {
            'task': 'train',
            'boosting_type': 'gbdt',
            'objective': 'binary',
            'metric': {'binary_logloss'},
            'num_leaves': 96,
            'max_depth': 10,
            'feature_fraction': 0.9,
            'bagging_fraction': 0.95,
            'bagging_freq': 5
        }

    def fit_predict(df_train, df_val):
        d_train = lgb.Dataset(df_train[f_to_use],label=df_train['labels'].values)
        # Pass a copy so that one fold cannot alter the settings of the next.
        bst = lgb.train(dict(params), d_train, rounds)
        return bst.predict(df_val[f_to_use])

    return _group_kfold_scores(fit_predict)


def group_kfold_xgb(f_to_use = f_to_use3, params = None, rounds = 100):
    """Grouped 4-fold CV of an XGBoost booster.

    Parameters
    ----------
    f_to_use : list of feature column names.
    params : XGBoost parameter dict; defaults to the notebook's baseline settings.
    rounds : number of boosting rounds.

    Returns
    -------
    (mean per-order F1, mean ROC AUC) over the folds.
    """
    if params is None:
        params = {
            "objective"         : "reg:logistic"
            ,"eval_metric"      : "logloss"
            ,"eta"              : 0.1
            ,"max_depth"        : 6
            ,"min_child_weight" :10
            ,"gamma"            :0.70
            ,"subsample"        :0.76
            ,"colsample_bytree" :0.95
            ,"alpha"            :2e-05
            ,"lambda"           :10
        }

    def fit_predict(df_train, df_val):
        d_train = xgb.DMatrix(df_train[f_to_use],label=df_train['labels'].values)
        # Pass a copy so that one fold cannot alter the settings of the next.
        bst = xgb.train(params=dict(params), dtrain=d_train, num_boost_round=rounds)
        return bst.predict(xgb.DMatrix(df_val[f_to_use]))

    return _group_kfold_scores(fit_predict)

In [60]:
%%time
# Grouped 4-fold CV of the default logistic-regression model -> (mean F1, mean AUC).
group_kfold_cv()

Fold 1...
Fold 2...
Fold 3...
Fold 4...
CPU times: user 1min 10s, sys: 2.32 s, total: 1min 12s
Wall time: 1min 10s


(0.13208674109339155, 0.83524917996384851)

In [61]:
%%time
# Grouped 4-fold CV of the shallow random forest -> (mean F1, mean AUC).
group_kfold_cv(RandomForestClassifier(random_state = 42, n_estimators=100, max_depth = 5, n_jobs=-1))

Fold 1...
Fold 2...
Fold 3...
Fold 4...
CPU times: user 26min 12s, sys: 10.2 s, total: 26min 22s
Wall time: 3min 32s


(0.17959987400227559, 0.8660472661686186)

In [64]:
%%time
# Grouped 4-fold CV of the LightGBM baseline -> (mean F1, mean AUC).
group_kfold_lgb()

Fold 1...
Fold 2...
Fold 3...
Fold 4...
CPU times: user 7min 59s, sys: 1.3 s, total: 8min
Wall time: 1min 4s


(0.17928740680159366, 0.87701008867151198)

In [23]:
%%time
# Grouped 4-fold CV of the XGBoost baseline -> (mean F1, mean AUC).
group_kfold_xgb()

Fold 1...
Fold 2...
Fold 3...
Fold 4...
CPU times: user 1h 4min 1s, sys: 10.4 s, total: 1h 4min 12s
Wall time: 8min 24s


(0.18824797592072021, 0.8803443391638035)