In [11]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# Load the train/test frames (presumably already feature-engineered, going by the file names — TODO confirm).
# NOTE(review): unpickling can execute arbitrary code; only load pickles produced by a trusted pipeline.
df_train = pd.read_pickle('../data/train_transformed.p')
df_test = pd.read_pickle('../data/test_transformed.p')
# Ground-truth table: avg_f1_score binds it as a default argument and reads its
# order_id and product_id columns.
order_products_compact = pd.read_hdf('../data/online_retail.h5','order_products_compact')

def f1_score(l_true,l_pred):
    """Return the F1 score between two collections of items, compared as sets.

    Duplicates inside either argument are ignored. When the two collections
    share no item (including when either one is empty) the result is 0.
    """
    truth = set(l_true)
    guess = set(l_pred)
    hits = len(truth & guess)
    if hits == 0:
        return 0
    # true positives + false positives is the size of the predicted set,
    # true positives + false negatives is the size of the true set.
    precision = hits / len(guess)
    recall = hits / len(truth)
    return 2 * (precision * recall) / (precision + recall)
def avg_f1_score(df,pred,order_products_compact=order_products_compact,thres=0.09):
    """Mean per-order F1 of the products selected from `pred` against the true orders.

    Within each order the candidate rows are ranked by descending score. Only
    ranks below the row's `user_order_size_max` are considered; among those,
    the top-ranked product is always kept and any other product is kept when
    its score is strictly greater than `thres`.

    Parameters
    ----------
    df : DataFrame
        One row per candidate product; must provide the columns `order_id`,
        `product_id` and `user_order_size_max`.
    pred : array-like
        One score per row of `df`, in the same row order.
    order_products_compact : DataFrame
        Ground truth with `order_id` and `product_id` columns. Each
        `product_id` cell is passed to `set()` by `f1_score`, so it must be an
        iterable of product ids.
    thres : float
        Score threshold for keeping a product that is not the top-ranked one.

    Returns
    -------
    (float, DataFrame)
        The mean of the per-order F1 scores, and a frame with the columns
        `order_id`, `y_pred`, `product_id` (ground truth) and `f1_score`.

    Notes
    -----
    Predicted ids are stored as `str(product_id)` tokens.
    NOTE(review): the ground-truth ids must therefore compare equal to those
    strings for a hit to be counted — confirm how they are stored.
    NOTE(review): the merge is a left join, so an order missing from
    `order_products_compact` gets NaN as ground truth, which `f1_score`
    cannot iterate — confirm every scored order has a ground-truth row.
    """
    df_pred = pd.DataFrame({'order_id':df.order_id,'pred':pred,'product_id':df.product_id,
                            'prior_size_max':df.user_order_size_max})\
                .sort_values(['order_id','pred'],ascending = [True,False]).reset_index(drop=True)
    # Rank 0 is the highest-scored product of each order.
    df_pred['pred_rank'] = df_pred.groupby('order_id').cumcount()

    # Cap by the largest prior order size, then keep the top product
    # unconditionally and the others only above the threshold.
    keep = (df_pred.pred_rank < df_pred.prior_size_max) & \
           ((df_pred.pred_rank == 0) | (df_pred.pred > thres))
    selected = df_pred[keep]

    # Collect the kept product ids per order, preserving the ranking order.
    # Explicit grouping replaces the former bare `except:` used as control flow.
    picked = {}
    for order_id, product_id in zip(selected.order_id, selected.product_id):
        picked.setdefault(order_id, []).append(str(product_id))

    # Explicit column names keep the frame well-formed even if nothing was selected.
    df_pred_compact = pd.DataFrame(list(picked.items()), columns=['order_id', 'y_pred'])
    df_pred_compact = df_pred_compact.merge(order_products_compact[['order_id','product_id']],how='left',
                                            on='order_id')

    scores = [f1_score(y_true, y_pred)
              for y_true, y_pred in zip(df_pred_compact['product_id'], df_pred_compact['y_pred'])]
    df_pred_compact['f1_score'] = scores
    return np.mean(scores),df_pred_compact

# Feature columns fed to the tree-based models (LightGBM, random forest, XGBoost).
# The order of the names is the column order handed to the models.
f_to_use_tree = [
    'user_total_orders',
    'user_total_items',
    'user_total_distinct_items',
    'user_average_days_between_orders',
    'user_order_size_mean',
    'user_order_size_max',
    'user_order_size_std',
    'user_total_item_quantity',
    'user_total_spent',
    'user_sum_days_between_orders',
    'user_reorder_ratio',
    'order_hour_of_day',
    'days_since_prior_order',
    'days_since_ratio',
    'product_reorder_rate',
    'product_total_quantity_sold',
    'product_avg_price',
    'prod_first_buy',
    'prod_1reorder_ratio',
    'UP_orders',
    'UP_orders_ratio',
    'UP_total_quantity',
    'UP_order_rate_since_first_order',
]
# Feature columns fed to the logistic regression.
f_to_use_lgr = [
    'user_total_orders',
    'user_average_days_between_orders',
    'user_order_size_mean',
    'user_total_item_quantity',
    'order_hour_of_day',
    'order_dow',
    'days_since_ratio',
    'product_orders',
    'product_avg_price',
    'UP_orders',
    'UP_total_quantity',
    'user_sum_days_between_orders',
    'user_reorder_ratio',
    'prod_1reorder_ratio',
]

In [1]:
# Train four classifiers on df_train, score df_test with each, and blend the scores.
# NOTE(review): the saved execution count of this cell (In [1]) is out of sequence with
# its neighbours, and the stored output prints a "test score is ..." DataFrame that no
# statement in this cell produces — re-run from a fresh kernel to refresh it.

# Logistic regression, trained on the f_to_use_lgr feature subset.
print ('training logistic regression...')
# C is scikit-learn's inverse regularization strength, so C=100 is a weak penalty.
lgr = LogisticRegression(random_state=42,n_jobs=-1,C=100).fit(df_train[f_to_use_lgr],df_train['labels'].values)
# Column 1 of predict_proba is the probability of the second entry of lgr.classes_
# (the positive class, assuming labels are 0/1 — TODO confirm).
test_pred_lgr = lgr.predict_proba(df_test[f_to_use_lgr])[:,1]

# LightGBM, trained on the f_to_use_tree feature list (shared by all tree models below).
print ('training lightGBM...')
d_train = lgb.Dataset(df_train[f_to_use_tree],label=df_train['labels'].values)
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 128,
    'max_depth': 8,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5,
    'learning_rate': 0.053,
}
# The third positional argument is the number of boosting rounds.
bst = lgb.train(params, d_train, 100)
# Predict before moving on: `bst` and `d_train` are rebound by the XGBoost section below.
test_pred_lgb = bst.predict(df_test[f_to_use_tree])

# Random forest, same tree feature list.
print ('training random forest..')
rfc = RandomForestClassifier(random_state = 42, n_estimators=100, max_depth = 7, n_jobs=-1,min_samples_split=100).\
        fit(df_train[f_to_use_tree],df_train['labels'].values)
# Same column-1 convention as for the logistic regression above.
test_pred_rf = rfc.predict_proba(df_test[f_to_use_tree])[:,1]

# XGBoost, same tree feature list. Rebinds `d_train` (now a DMatrix) and, below, `bst`.
print ('training XGBoost...')
d_train = xgb.DMatrix(df_train[f_to_use_tree],label=df_train['labels'].values)
# NOTE(review): 'reg:logistic' is used here; 'binary:logistic' is the objective XGBoost
# documents for binary classification with probability output — confirm this is intentional.
xgb_params = {
    "objective"         : "reg:logistic"
    ,"eval_metric"      : "logloss"
    ,"eta"              : 0.15
    ,"max_depth"        : 8
    ,"min_child_weight" :10
    ,"gamma"            :0.70
    ,"subsample"        :0.76
    ,"colsample_bytree" :0.95
    ,"alpha"            :2e-05
    ,"lambda"           :10
}
bst = xgb.train(params=xgb_params, dtrain=d_train, num_boost_round=100)
test_pred_xgb = bst.predict(xgb.DMatrix(df_test[f_to_use_tree]))

# Model combination: fixed-weight blend of the four score vectors (weights sum to 1.0).
test_pred = test_pred_lgb * 0.2 + test_pred_xgb * 0.5 + test_pred_rf * 0.1 + test_pred_lgr * 0.2




training logistic regression...
training lightGBM...
training random forest..
training XGBoost...
test score is          order_id  product_id  user_id  user_total_orders  user_total_items  \
0          568172           0    12348                  4                21   
1          568172           1    12348                  4                21   
2          568172           2    12348                  4                21   
3          568172           3    12348                  4                21   
4          568172           4    12348                  4                21   
5          568172           5    12348                  4                21   
6          568172           6    12348                  4                21   
7          568172           7    12348                  4                21   
8          568172           8    12348                  4                21   
9          568172           9    12348                  4                21   
10         568172  

In [12]:
# Score the blended predictions on the test frame with the default threshold and
# ground-truth table; `result` is the per-order frame returned by avg_f1_score.
score,result = avg_f1_score(df_test,test_pred)
print ('test score is {}'.format(score))

test score is 0.21605161488977054


In [14]:
# Product table from the same HDF5 store; the cell that filters it displays product_id and description columns.
products = pd.read_hdf('../data/online_retail.h5','products')

In [17]:
# Look up the descriptions of the products predicted for order 580717
# (the ids are the y_pred list shown for that order in the sorted results table).
products[products.product_id.isin([1009, 1011, 1008, 2171, 2789, 1094, 599])]

Unnamed: 0,product_id,description
592,599,FAWN BLUE HOT WATER BOTTLE
997,1008,SCOTTIE DOG HOT WATER BOTTLE
998,1009,CHOCOLATE HOT WATER BOTTLE
1000,1011,HOT WATER BOTTLE TEA AND SYMPATHY
1082,1094,FRYING PAN UNION FLAG
2158,2171,HOT WATER BOTTLE KEEP CALM
2762,2789,RED WOOLLY HOTTIE WHITE HEART.


In [13]:
# Per-order predictions vs. ground truth, best-scoring orders first.
result.sort_values('f1_score',ascending = False)

Unnamed: 0,order_id,y_pred,product_id,f1_score
122,575946,[1547],[1547],1.000000
260,580717,"[1009, 1011, 1008, 2171, 2789, 1094, 599]","[1008, 1009, 1011, 1094, 2171, 2789]",0.923077
164,577858,"[177, 983, 2136]","[177, 983]",0.800000
262,580727,"[2053, 507, 983, 1472, 1282, 1285, 3185, 854, ...","[8, 9, 14, 18, 65, 69, 73, 76, 120, 125, 139, ...",0.686090
113,575687,"[2589, 974, 1040, 275, 663, 276, 2601, 1421, 2...","[275, 276, 277, 278, 663, 962, 974, 1035, 1036...",0.666667
261,580718,"[331, 267]",[331],0.666667
116,575732,"[19, 18, 1062, 1708, 1011, 1063, 2767, 20, 170...","[18, 19, 20, 599, 1011, 1706, 1708]",0.636364
207,579499,"[169, 172, 1253, 3115, 170, 1510, 3053, 171, 1...","[169, 170, 171, 172, 1251, 1252, 1253, 1319, 1...",0.625000
265,580873,"[1327, 1411, 1480, 2618, 1507, 1286, 146, 1203...","[68, 1203, 1286, 1327, 1411, 1480, 1481, 1781,...",0.615385
181,578631,"[1058, 2100, 2614, 2674, 619, 2171, 2618, 620,...","[1009, 1011, 1238, 2100, 2171, 2172, 2614, 2617]",0.600000
