In [3]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.sparse
# Show up to 101 columns when a DataFrame is displayed.
pd.set_option("display.max_columns",101)
# Seed constant. NOTE(review): not referenced by any of the cells visible here.
RANDOM_STATE = 42
# Directory (relative to the notebook) used as the prefix for every CSV path below.
DATA_PATH = "../data/instacart/"

In [6]:
# Column dtypes shared by the two order_products__*.csv files; the narrow
# integer types keep the large frames small in memory.
order_products_dtypes = {
    'order_id': np.int32,
    'product_id': np.uint16,
    'add_to_cart_order': np.int16,
    'reordered': np.int8}

print('loading prior')
priors = pd.read_csv(DATA_PATH + 'order_products__prior.csv',
                     dtype=order_products_dtypes)

print('loading orders')
orders = pd.read_csv(DATA_PATH + 'orders.csv', dtype={
        'order_id': np.int32,
        'user_id': np.int32,
        'eval_set': 'category',
        'order_number': np.int16,
        'order_dow': np.int8,
        'order_hour_of_day': np.int8,
        'days_since_prior_order': np.float32})
# Index by order_id so that Series such as orders.user_id can be used as
# order_id -> value lookups; drop=False keeps order_id available as a column.
orders.set_index('order_id', inplace=True, drop=False)

print('loading train')
train = pd.read_csv(DATA_PATH + 'order_products__train.csv',
                    dtype=order_products_dtypes)

print('split orders: train, test')
# Nothing in this cell creates an 'order_id_' column (presumably a leftover
# from a join that used an '_' suffix), so an unconditional drop raises on a
# fresh kernel. errors='ignore' still drops the column when it is present and
# is a no-op otherwise.
priors.drop('order_id_', inplace=True, axis=1, errors='ignore')
test_orders = orders[orders.eval_set == 'test']
train_orders = orders[orders.eval_set == 'train']

# (order_id, product_id) index: features() tests `(order_id, product) in train.index`
# to build labels. drop=False keeps both columns in the frame as well.
train.set_index(['order_id', 'product_id'], inplace=True, drop=False)

loading prior
loading orders
loading train
split orders: train, test


In [8]:
# Per-user table read from users_match.csv and keyed by user_id.
# The listed columns are read with compact dtypes; any other column
# (e.g. all_products, parsed further down) keeps pandas' inferred dtype.
users = pd.read_csv(
    DATA_PATH + "users_match.csv",
    dtype={
        'total_items': np.int32,
        'total_distinct_items': np.int32,
        'average_days_between_orders': np.float32,
        'nb_orders': np.int32,
        'average_basket': np.float32,
    },
).set_index('user_id')
def xx(a):
    """Parse a stringified collection of integers into a list of ints.

    The input is stripped of surrounding whitespace, its first five and last
    two characters are discarded, and the remainder is split on commas.
    Empty pieces are skipped; every other piece is converted with ``int``.
    Presumably the input looks like ``"set([1, 2, 3])"`` -- confirm against
    the CSV that feeds this function.
    """
    payload = a.strip()[5:-2]
    parsed = []
    for token in payload.split(","):
        if token:
            parsed.append(int(token))
    return parsed
# Replace each stringified all_products cell with the list of ints returned by xx().
users['all_products'] = users['all_products'].apply(xx)

# Per-product table read from products_match.csv.
# NOTE(review): the index column is 'product_id.1'; presumably the CSV holds
# two columns named product_id and pandas labelled the second one with a
# '.1' suffix -- confirm against the file.
# NOTE(review): the dtype map names 'reorders', while features() reads
# products.reordered; if the CSV column is actually 'reordered', the
# 'reorders' entry has no effect -- confirm.
products = pd.read_csv(
    DATA_PATH + "products_match.csv",
    dtype={
        'product_id': np.int32,
        'aisle_id': np.int32,
        'department_id': np.int32,
        'orders': np.float32,
        'reorders': np.float32,
        'reorder_rate': np.float64,
    },
).set_index('product_id.1')

# Per (user, product) table; the first CSV column becomes the index, which
# features() looks up with the key user_id * 100000 + product_id.
# The dtype key was previously spelled 'np_orders', which does not match the
# 'nb_orders' column that features() reads, so the int32 dtype was never
# requested for that column.
userXproduct = pd.read_csv(DATA_PATH + "user_product_match.csv", index_col=0, dtype={
    'nb_orders': np.int32,
    'last_order_id': np.int32,
    'sum_pos_in_cart': np.int32,
})

In [9]:
def features(selected_orders, labels_given=False):
    """Build one feature row per (order, candidate product) pair.

    For every row of ``selected_orders`` whose ``user_id`` is present in the
    index of ``users``, one candidate row is emitted for each product in that
    user's ``all_products`` list. User, order, product and user-x-product
    columns are then attached by mapping ids through the notebook-level frames
    ``users``, ``orders``, ``products`` and ``userXproduct``.

    Parameters
    ----------
    selected_orders : DataFrame
        Must expose ``order_id`` and ``user_id`` columns.
    labels_given : bool, default False
        When True, a label is produced per candidate row: 1 if
        ``(order_id, product_id)`` is in ``train.index``, else 0.

    Returns
    -------
    (df, labels) : tuple
        ``df`` is the feature DataFrame (``user_id`` and the helper columns
        ``z`` / ``UP_last_order_id`` are dropped before returning);
        ``labels`` is an ``np.int8`` array, empty when ``labels_given`` is False.
    """
    print('build candidate list')
    order_list = []
    product_list = []
    labels = []
    # Counting starts at 1 and includes orders that are skipped below.
    for i, row in enumerate(selected_orders.itertuples(), 1):
        if i%10000 == 0: print('order row',i)
        order_id = row.order_id
        user_id = row.user_id
        # `in` on a Series tests its index, i.e. whether this user has a row in `users`.
        if user_id not in users.all_products:
            continue

        user_products = users.all_products[user_id]
        product_list += user_products
        order_list += [order_id] * len(user_products)
        if labels_given:
            labels += [(order_id, product) in train.index for product in user_products]

    df = pd.DataFrame({'order_id':order_list, 'product_id':product_list}, dtype=np.int32)
    labels = np.array(labels, dtype=np.int8)
    # Release the Python lists now that their contents live in df.
    del order_list
    del product_list

    print('user related features')
    df['user_id'] = df.order_id.map(orders.user_id)
    df['user_total_orders'] = df.user_id.map(users.nb_orders)
    df['user_total_items'] = df.user_id.map(users.total_items)
    df['total_distinct_items'] = df.user_id.map(users.total_distinct_items)
    df['user_average_days_between_orders'] = df.user_id.map(users.average_days_between_orders)
    df['user_average_basket'] =  df.user_id.map(users.average_basket)

    print('order related features')
    df['dow'] = df.order_id.map(orders.order_dow)
    df['order_hour_of_day'] = df.order_id.map(orders.order_hour_of_day)
    df['days_since_prior_order'] = df.order_id.map(orders.days_since_prior_order)
    df['days_since_ratio'] = df.days_since_prior_order / df.user_average_days_between_orders

    print('product related features')
    df['aisle_id'] = df.product_id.map(products.aisle_id)
    df['department_id'] = df.product_id.map(products.department_id)
    df['product_orders'] = df.product_id.map(products.orders)
    df['product_reorders'] = df.product_id.map(products.reordered)
    df['product_reorder_rate'] = df.product_id.map(products.reorder_rate)

    print('user_X_product related features')
    # Composite lookup key into userXproduct's index.
    # NOTE(review): if user_id is int32 here, user_id * 100000 exceeds the
    # int32 range for larger ids; the lookup presumably only matches when
    # userXproduct's index was produced by the same arithmetic -- confirm.
    df['z'] = df.user_id * 100000 + df.product_id
    df.drop(['user_id'], axis=1, inplace=True)
    df['UP_orders'] = df.z.map(userXproduct.nb_orders)
    df['UP_orders_ratio'] = (df.UP_orders / df.user_total_orders)
    df['UP_last_order_id'] = df.z.map(userXproduct.last_order_id)
    df['UP_average_pos_in_cart'] = (df.z.map(userXproduct.sum_pos_in_cart) / df.UP_orders)
    # Same expression as UP_orders_ratio; kept so the returned columns are unchanged.
    df['UP_reorder_rate'] = (df.UP_orders / df.user_total_orders)
    df['UP_orders_since_last'] = df.user_total_orders - df.UP_last_order_id.map(orders.order_number)
    # Hour difference taken around the clock: min(x, 24 - x).
    df['UP_delta_hour_vs_last'] = abs(df.order_hour_of_day - df.UP_last_order_id.map(orders.order_hour_of_day)).map(lambda x: min(x, 24-x))
    #df['UP_same_dow_as_last_order'] = df.UP_last_order_id.map(orders.order_dow) == \
    #                                              df.order_id.map(orders.order_dow)

    df.drop(['UP_last_order_id', 'z'], axis=1, inplace=True)
    print(df.dtypes)
    print(df.memory_usage())
    return (df, labels)

In [11]:
# Candidate rows and 0/1 labels for the orders whose eval_set is 'train'.
df_train, labels = features(train_orders, labels_given=True)

# Columns fed to the model, in this order.
f_to_use = [
    'user_total_orders',
    'user_total_items',
    'total_distinct_items',
    'user_average_days_between_orders',
    'user_average_basket',
    'order_hour_of_day',
    'days_since_prior_order',
    'days_since_ratio',
    'aisle_id',
    'department_id',
    'product_orders',
    'product_reorders',
    'product_reorder_rate',
    'UP_orders',
    'UP_orders_ratio',
    'UP_average_pos_in_cart',
    'UP_reorder_rate',
    'UP_orders_since_last',
    'UP_delta_hour_vs_last',
]  # 'dow', 'UP_same_dow_as_last_order'

build candidate list
('order row', 10000)
('order row', 20000)
('order row', 30000)
('order row', 40000)
('order row', 50000)
('order row', 60000)
('order row', 70000)
('order row', 80000)
('order row', 90000)
('order row', 100000)
('order row', 110000)
('order row', 120000)
('order row', 130000)
user related features
order related features
product related features
user_X_product related features
order_id                              int32
product_id                            int32
user_total_orders                     int32
user_total_items                      int32
total_distinct_items                  int32
user_average_days_between_orders    float32
user_average_basket                 float32
dow                                    int8
order_hour_of_day                      int8
days_since_prior_order              float32
days_since_ratio                    float32
aisle_id                              int32
department_id                         int32
product_orders              

In [76]:
# Positional 80/20 cut: the first m rows are used for fitting.
# NOTE(review): the cut is by row position, so the candidate rows of a single
# order may end up on both sides of the boundary.
m = int(0.8 * len(labels))
label_train = labels[:m]
df_train_data = df_train.iloc[:m]

In [77]:
# Remaining rows (from position m onwards) form the held-out slice.
label_test = labels[m:]
df_test_dat = df_train.iloc[m:]

In [78]:
import lightgbm as lgb

# LightGBM dataset for the fitting slice; the two id columns are declared categorical.
categorical_cols = ['aisle_id', 'department_id']  # , 'order_hour_of_day', 'dow'
d_train = lgb.Dataset(
    df_train_data[f_to_use],
    label=label_train,
    categorical_feature=categorical_cols,
)

In [79]:
# LightGBM dataset for the held-out slice, with the same categorical columns.
categorical_cols = ['aisle_id', 'department_id']  # , 'order_hour_of_day', 'dow'
d_test = lgb.Dataset(
    df_test_dat[f_to_use],
    label=label_test,
    categorical_feature=categorical_cols,
)

In [93]:
# Booster configuration: binary objective, AUC reported on the validation set.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='binary',
    metric={'auc'},
    num_leaves=96,
    max_depth=10,
    feature_fraction=0.9,
    bagging_fraction=0.95,
    bagging_freq=5,
    learning_rate=0.5,
)
# Upper bound on boosting rounds; training stops earlier once the validation
# metric has not improved for 10 consecutive rounds.
ROUNDS = 200

print('light GBM train :-)')
bst = lgb.train(
    params,
    d_train,
    num_boost_round=ROUNDS,
    valid_sets=[d_test],
    early_stopping_rounds=10,
)
# lgb.plot_importance(bst, figsize=(9,20))

light GBM train :-)
[1]	valid_0's auc: 0.818838
Train until valid scores didn't improve in 10 rounds.
[2]	valid_0's auc: 0.823068
[3]	valid_0's auc: 0.824455
[4]	valid_0's auc: 0.825735
[5]	valid_0's auc: 0.826267
[6]	valid_0's auc: 0.827207
[7]	valid_0's auc: 0.828055
[8]	valid_0's auc: 0.828616
[9]	valid_0's auc: 0.829163
[10]	valid_0's auc: 0.829462
[11]	valid_0's auc: 0.829751
[12]	valid_0's auc: 0.829862
[13]	valid_0's auc: 0.830007
[14]	valid_0's auc: 0.830154
[15]	valid_0's auc: 0.830204
[16]	valid_0's auc: 0.830172
[17]	valid_0's auc: 0.830361
[18]	valid_0's auc: 0.830462
[19]	valid_0's auc: 0.830513
[20]	valid_0's auc: 0.830583
[21]	valid_0's auc: 0.830652
[22]	valid_0's auc: 0.830646
[23]	valid_0's auc: 0.830626
[24]	valid_0's auc: 0.830595
[25]	valid_0's auc: 0.830605
[26]	valid_0's auc: 0.830612
[27]	valid_0's auc: 0.830591
[28]	valid_0's auc: 0.830536
[29]	valid_0's auc: 0.830476
[30]	valid_0's auc: 0.830483
[31]	valid_0's auc: 0.830441
Early stopping, best iteration is:
[

In [65]:
from collections import Counter
# Count how many candidate rows carry each label value (0 / 1).
Counter(labels)

Counter({0: 8407170, 1: 59878})

In [193]:
# labels holds 0/1 values, so the mean is the share of positive rows.
np.mean(labels)

0.007071886211109232

In [160]:
# Iteration selected by early stopping; passed to predict() in the cells below.
bst.best_iteration

20

In [94]:
# Score the held-out slice using only the trees up to the best iteration.
preduct_result =bst.predict(df_test_dat[f_to_use], num_iteration=bst.best_iteration)

In [82]:
# Average predicted score on the held-out slice.
np.mean(preduct_result)

0.098191938607873591

In [95]:
# Number of held-out rows whose score exceeds an exploratory cut-off of 0.7.
result = preduct_result > 0.7
# np.sum counts the True entries in one vectorised pass; the builtin sum()
# iterates over the array element by element in Python.
np.sum(result)

12478

In [106]:
from sklearn.metrics import f1_score
for i in range(1, 100, 1):
    result = preduct_result > i / 100.0
    xx = f1_score(label_test, result)
    print i, xx

1 0.197559831908
2 0.228464302586
3 0.255728414259
4 0.279347535266
5 0.300647833671
6 0.319372188804
7 0.336055010674
8 0.351269695223
9 0.364781983159
10 0.377058937222
11 0.388025854012
12 0.397716371383
13 0.405644417807
14 0.412497539626
15 0.418289170332
16 0.423647614471
17 0.427063104383
18 0.42997396049
19 0.432684705944
20 0.434376029302
21 0.435220280518
22 0.435663557346
23 0.43526482819
24 0.434455301391
25 0.432199423889
26 0.430058370706
27 0.427060595813
28 0.423765988267
29 0.420363768243
30 0.416103551278
31 0.411654550583
32 0.406631301579
33 0.401466909256
34 0.395768878818
35 0.389827304369
36 0.383479510056
37 0.37676647814
38 0.370395942113
39 0.363065943409
40 0.356561218155
41 0.349540111363
42 0.341884639073
43 0.33407799834
44 0.326348520838
45 0.318085996906
46 0.309920257452
47 0.302276350992
48 0.293943708247
49 0.285378732534
50 0.277454807575
51 0.269509526201
52 0.260718257468
53 0.251868656022
54 0.24329304751
55 0.2344624652
56 0.225423745783
57 0.216

In [88]:
# Candidate rows for the orders whose eval_set is 'test'; no labels are requested,
# so the second return value (an empty array) is discarded.
df_test, _ = features(test_orders)

build candidate list
('order row', 10000)
('order row', 20000)
('order row', 30000)
('order row', 40000)
('order row', 50000)
('order row', 60000)
('order row', 70000)
user related features
order related features
product related features
user_X_product related features
order_id                              int32
product_id                            int32
user_total_orders                     int32
user_total_items                      int32
total_distinct_items                  int32
user_average_days_between_orders    float32
user_average_basket                 float32
dow                                    int8
order_hour_of_day                      int8
days_since_prior_order              float32
days_since_ratio                    float32
aisle_id                              int32
department_id                         int32
product_orders                      float32
product_reorders                    float64
product_reorder_rate                float64
UP_orders                 

In [96]:
print('light GBM predict')
# Score every test candidate row using only the trees up to the best iteration.
preds = bst.predict(df_test[f_to_use], num_iteration=bst.best_iteration)

light GBM predict


In [108]:
# Number of test candidate rows whose score exceeds 0.22.
result_pred = preds > 0.22
# np.sum counts the True entries in one vectorised pass; the builtin sum()
# iterates over the array element by element in Python.
np.sum(result_pred)

604423

In [107]:
df_test['pred'] = preds

# 0.22 gave the highest F1 in the threshold sweep on the held-out slice of
# the training data.
TRESHOLD = 0.22

# Collect the selected product ids per order. Only rows above the threshold
# are iterated, and the ids are gathered in lists and joined once, instead of
# growing a string inside a try / bare-except.
picked = {}
above = df_test.loc[df_test.pred > TRESHOLD, ['order_id', 'product_id']]
for row in above.itertuples(index=False):
    picked.setdefault(row.order_id, []).append(str(row.product_id))

# order_id -> space-separated product ids.
d = dict((order_id, ' '.join(items)) for order_id, items in picked.items())

# Every test order must appear in the output; orders with no selected product get 'None'.
for order in test_orders.order_id:
    if order not in d:
        d[order] = 'None'

sub = pd.DataFrame.from_dict(d, orient='index')

sub.reset_index(inplace=True)
sub.columns = ['order_id', 'products']
sub.to_csv(DATA_PATH + 'sub.csv', index=False)