In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
IDIR = '../data/'

In [3]:
print('loading prior')
priors = pd.read_csv(IDIR + 'order_products__prior.csv', dtype={
            'order_id': np.int32,
            'product_id': np.uint16,
            'add_to_cart_order': np.int16,
            'reordered': np.int8})

print('loading train')
train = pd.read_csv(IDIR + 'order_products__train.csv', dtype={
            'order_id': np.int32,
            'product_id': np.uint16,
            'add_to_cart_order': np.int16,
            'reordered': np.int8})

print('loading orders')
orders = pd.read_csv(IDIR + 'orders.csv', dtype={
        'order_id': np.int32,
        'user_id': np.int32,
        'eval_set': 'category',
        'order_number': np.int16,
        'order_dow': np.int8,
        'order_hour_of_day': np.int8,
        'days_since_prior_order': np.float32})

print('loading products')
products = pd.read_csv(IDIR + 'products.csv', dtype={
        'product_id': np.uint16,
        'order_id': np.int32,
        'aisle_id': np.uint8,
        'department_id': np.uint8},
        usecols=['product_id', 'aisle_id', 'department_id'])

print('priors {}: {}'.format(priors.shape, ', '.join(priors.columns)))
print('orders {}: {}'.format(orders.shape, ', '.join(orders.columns)))
print('train {}: {}'.format(train.shape, ', '.join(train.columns)))

loading prior
loading train
loading orders
loading products
priors (32434489, 4): order_id, product_id, add_to_cart_order, reordered
orders (3421083, 7): order_id, user_id, eval_set, order_number, order_dow, order_hour_of_day, days_since_prior_order
train (1384617, 4): order_id, product_id, add_to_cart_order, reordered


In [19]:
print('computing product f')
prods = pd.DataFrame()
prods['orders'] = priors.groupby(priors.product_id).size().astype(np.int32)
prods['reorders'] = priors['reordered'].groupby(priors.product_id).sum().astype(np.float32)
prods['reorder_rate'] = (prods.reorders / prods.orders).astype(np.float32)
products = products.join(prods, on='product_id')
products.set_index('product_id', drop=False, inplace=True)
del prods

computing product f


In [26]:
print('add order info to priors')
orders.set_index('order_id', inplace=True, drop=False)
priors = priors.join(orders, on='order_id', rsuffix='_')
priors.drop('order_id_', inplace=True, axis=1)
priors.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,order_id_,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,1,2,202279,prior,3,5,9,8.0
1,2,28985,2,1,2,202279,prior,3,5,9,8.0
2,2,9327,3,0,2,202279,prior,3,5,9,8.0
3,2,45918,4,1,2,202279,prior,3,5,9,8.0
4,2,30035,5,0,2,202279,prior,3,5,9,8.0


In [50]:
orders.groupby('user_id').size().head(10), orders.groupby('user_id').size().tail(10)

(user_id
 1     11
 2     15
 3     13
 4      6
 5      5
 6      4
 7     21
 8      4
 9      4
 10     6
 dtype: int64, user_id
 206200    24
 206201    33
 206202    23
 206203     6
 206204     5
 206205     4
 206206    68
 206207    17
 206208    50
 206209    14
 dtype: int64)

In [49]:
priors.groupby('user_id').size().head(10), priors.groupby('user_id').size().tail(10)

(user_id
 1      59
 2     195
 3      88
 4      18
 5      37
 6      14
 7     206
 8      49
 9      76
 10    143
 dtype: int64, user_id
 206200    279
 206201    404
 206202    198
 206203    119
 206204     54
 206205     32
 206206    285
 206207    223
 206208    677
 206209    129
 dtype: int64)

In [58]:
### user features

print('computing user f')
users = pd.DataFrame()
users['average_days_between_orders'] = orders.groupby('user_id')['days_since_prior_order'].mean().astype(np.float32)
users['nb_orders'] = orders.groupby('user_id').size().astype(np.int16) # orders data gives user_id, order_id
users['total_items'] = priors.groupby('user_id').size().astype(np.int16) # priors data gives order_id, product_id
users['all_products'] = priors.groupby('user_id')['product_id'].apply(set)
users['num_distinct_items'] = (users.all_products.map(len)).astype(np.int16)
users['average_basket'] = (users.total_items / users.nb_orders).astype(np.float32)
print('user f', users.shape)

computing user f
user f (206209, 6)


In [68]:
### userXproduct features

print('compute userXproduct f - this is long...')
priors['user_product'] = priors.product_id + priors.user_id * 100000

compute userXproduct f - this is long...


In [70]:
# For each (user_id , product_id) pair, 
## count number of occurences of this pair
## latest, max(order_number, order_id)
## accumulated add_to_cart_order

d = dict()
for row in priors.itertuples():
    z = row.user_product
    if z not in d:
        d[z] = (1, \
                (row.order_number, row.order_id), \
                row.add_to_cart_order)
    else:
        d[z] = (d[z][0] + 1, \
                max(d[z][1], (row.order_number, row.order_id)), \
                d[z][2] + row.add_to_cart_order)

print('to dataframe (less memory)')
userXproduct = pd.DataFrame.from_dict(d, orient='index')

In [77]:
print('to dataframe (less memory)')
userXproduct = pd.DataFrame.from_dict(d, orient='index')
del d
userXproduct.columns = ['nb_orders', 'last_order_id', 'sum_pos_in_cart']
userXproduct.nb_orders = userXproduct.nb_orders.astype(np.int16)
userXproduct.last_order_id = userXproduct.last_order_id.map(lambda x: x[1]).astype(np.int32)
userXproduct.sum_pos_in_cart = userXproduct.sum_pos_in_cart.astype(np.int16)
print('user X product f', len(userXproduct))
userXproduct.head()

to dataframe (less memory)
user X product f 13293564


Unnamed: 0,nb_orders,last_order_id,sum_pos_in_cart
-1246903360,5,104690,10
-1246907495,5,132412,16
-1246927153,1,2,3
-1246890562,5,2382766,24
-1246906445,3,2382766,14


In [119]:
### train / test orders ###
print('split orders : train, test')
test_orders = orders[orders.eval_set == 'test']
train_orders = orders[orders.eval_set == 'train']
prior_orders = orders[orders.eval_set == 'prior']

train.set_index(['order_id', 'product_id'], inplace=True, drop=False)
priors.set_index(['order_id', 'product_id'], inplace=True, drop=False)

split orders : train, test


In [126]:
### Build list of candidate products to reorder, with features ###

def features(selected_orders, labels_given=False):
    print('build candidate list')
    order_list = []
    product_list = []
    labels = []
    i=0
    for row in selected_orders.itertuples():
        i+=1
        if i%10000 == 0: print('order row',i)
        order_id = row.order_id
        user_id = row.user_id
        user_products = users.all_products[user_id]
        product_list += user_products
        order_list += [order_id] * len(user_products)
        if labels_given:
            labels += [(order_id, product) in train.index for product in user_products]
        
    df = pd.DataFrame({'order_id':order_list, 'product_id':product_list}, dtype=np.int32)
    labels = np.array(labels, dtype=np.int8)
    del order_list
    del product_list
    
    print('user related features')
    df['user_id'] = df.order_id.map(orders.user_id)
    df['user_total_orders'] = df.user_id.map(users.nb_orders)
    df['user_total_items'] = df.user_id.map(users.total_items)
    df['total_distinct_items'] = df.user_id.map(users.total_distinct_items)
    df['user_average_days_between_orders'] = df.user_id.map(users.average_days_between_orders)
    df['user_average_basket'] =  df.user_id.map(users.average_basket)
    
    print('order related features')
    # df['dow'] = df.order_id.map(orders.order_dow)
    df['order_hour_of_day'] = df.order_id.map(orders.order_hour_of_day)
    df['days_since_prior_order'] = df.order_id.map(orders.days_since_prior_order)
    df['days_since_ratio'] = df.days_since_prior_order / df.user_average_days_between_orders
    
    print('product related features')
    df['aisle_id'] = df.product_id.map(products.aisle_id)
    df['department_id'] = df.product_id.map(products.department_id)
    df['product_orders'] = df.product_id.map(products.orders).astype(np.int32)
    df['product_reorders'] = df.product_id.map(products.reorders)
    df['product_reorder_rate'] = df.product_id.map(products.reorder_rate)

    print('user_X_product related features')
    df['z'] = df.user_id * 100000 + df.product_id
    df.drop(['user_id'], axis=1, inplace=True)
    df['UP_orders'] = df.z.map(userXproduct.nb_orders)
    df['UP_orders_ratio'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
    df['UP_last_order_id'] = df.z.map(userXproduct.last_order_id)
    df['UP_average_pos_in_cart'] = (df.z.map(userXproduct.sum_pos_in_cart) / df.UP_orders).astype(np.float32)
    df['UP_reorder_rate'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
    df['UP_orders_since_last'] = df.user_total_orders - df.UP_last_order_id.map(orders.order_number)
    df['UP_delta_hour_vs_last'] = abs(df.order_hour_of_day - df.UP_last_order_id.map(orders.order_hour_of_day)).map(lambda x: min(x, 24-x)).astype(np.int8)
    #df['UP_same_dow_as_last_order'] = df.UP_last_order_id.map(orders.order_dow) == \
    #                                              df.order_id.map(orders.order_dow)

    df.drop(['UP_last_order_id', 'z'], axis=1, inplace=True)
    print(df.dtypes)
    print(df.memory_usage())
    return (df, labels)

In [127]:
%%time

df_train, labels = features(train_orders, labels_given=True)
f_to_use = ['user_total_orders', 'user_total_items', 'total_distinct_items', \
            'user_average_days_between_orders', 'user_average_basket', \
            'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio', \
            'aisle_id', 'department_id', 'product_orders', 'product_reorders', \
            'product_reorder_rate', 'UP_orders', 'UP_orders_ratio', \
            'UP_average_pos_in_cart', 'UP_reorder_rate', 'UP_orders_since_last', \
            'UP_delta_hour_vs_last']
len(f_to_use)

build candidate list


KeyboardInterrupt: 

In [None]:
df_train.head()

In [None]:
labels.head()

In [None]:
print('formating for xgb')
d_train = xgb.DMatrix(df_train[f_to_use], \
                      label = labels)
params = {
  "objective"           : "reg:logistic",
  "eval_metric"         : "logloss",
  "eta"                 : 0.1,
  "max_depth"           : 6,
  "min_child_weight"    : 10,
  "gamma"               : 0.70,
  "subsample"           : 0.76,
  "colsample_bytree"    : 0.95,
  "alpha"               : 2e-05,
  "lambda"              : 10
}
ROUNDS = 100

In [None]:
print('XGB train...')
bst = xgb.train(params, d_train, ROUNDS)