In [2]:
import numpy as np
import pandas as pd
import lightgbm as lgb

In [4]:
# Directory that holds the InstaCart CSV files. The trailing slash is required
# because file names are appended to it with plain string concatenation.
base_path = "../Data/InstaCart/"

In [19]:
# Column dtypes shared by the two order_products__*.csv files, which are read
# with the same four columns. Narrow integer types are chosen to keep the
# in-memory size of these large tables down.
_order_products_dtypes = {'order_id': np.int32,
                          'product_id': np.uint16,
                          'add_to_cart_order': np.int16,
                          'reordered': np.int8}

# Line items of the "prior" orders.
priors = pd.read_csv(base_path + 'order_products__prior.csv',
                     dtype=_order_products_dtypes)

# Line items of the "train" orders (same column layout as priors).
train = pd.read_csv(base_path + 'order_products__train.csv',
                    dtype=_order_products_dtypes)

# Order-level attributes. days_since_prior_order is read as a float because
# it can be missing (it is empty for a user's first order).
orders = pd.read_csv(base_path + 'orders.csv',
                     dtype={'order_id': np.int32,
                            'user_id': np.int32,
                            'eval_set': 'category',
                            'order_number': np.int16,
                            'order_dow': np.int8,
                            'order_hour_of_day': np.int8,
                            'days_since_prior_order': np.float32})

# Only the three id columns listed in usecols are loaded, so the dtype map
# names exactly those columns (an entry for any other column would be dead).
products = pd.read_csv(base_path + 'products.csv',
                       usecols=['product_id', 'aisle_id', 'department_id'],
                       dtype={'product_id': np.uint16,
                              'aisle_id': np.uint8,
                              'department_id': np.uint8})

In [20]:
# Per-product statistics computed from the prior order lines.
by_product = priors.groupby(['product_id'])

prods = pd.DataFrame()
# Number of prior order lines that contain the product.
prods['orders'] = by_product.size().astype(np.int32)
# Number of those lines flagged as a reorder.
prods['reorders'] = by_product['reordered'].sum().astype(np.float32)
# Share of the product's order lines that were reorders.
prods['reorder_rate'] = (prods['reorders'] / prods['orders']).astype(np.float32)

# Attach the statistics to the product table (join defaults to a left join,
# so every product row is kept), then index it by product_id while also
# keeping product_id as a regular column.
products = (products
            .join(prods, on='product_id')
            .set_index('product_id', drop=False))

# Release the temporaries; only `products` is needed from here on.
del prods, by_product

In [21]:
# Index orders by order_id (the column itself is kept too) so the table can be
# used as the right-hand side of the join below.
orders = orders.set_index('order_id', drop=False)

# Add each order's attributes to its line items. Because orders still carries
# an order_id column, the join produces a duplicate named 'order_id_' (from
# rsuffix), which is dropped right away.
priors = (priors
          .join(orders, on='order_id', rsuffix='_')
          .drop('order_id_', axis=1))

In [22]:
# Show the first five orders; order_id now appears both as index and column.
orders.head(n=5)

Unnamed: 0_level_0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2539329,2539329,1,prior,1,2,8,
2398795,2398795,1,prior,2,3,7,15.0
473747,473747,1,prior,3,3,12,21.0
2254736,2254736,1,prior,4,4,7,29.0
431534,431534,1,prior,5,4,15,28.0


In [24]:
# Show the first five prior order lines with the joined order attributes.
priors.head(n=5)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,1,202279,prior,3,5,9,8.0
1,2,28985,2,1,202279,prior,3,5,9,8.0
2,2,9327,3,0,202279,prior,3,5,9,8.0
3,2,45918,4,1,202279,prior,3,5,9,8.0
4,2,30035,5,0,202279,prior,3,5,9,8.0


In [26]:
# Per-user statistics derived from the orders table.
orders_by_user = orders.groupby('user_id')

usr = pd.DataFrame()
# Mean of days_since_prior_order over the user's orders (the default mean
# skips missing values).
usr['average_days_between_orders'] = (
    orders_by_user['days_since_prior_order'].mean().astype(np.float32)
)
# Number of rows the user has in orders.
usr['nb_orders'] = orders_by_user.size().astype(np.int16)

del orders_by_user

In [28]:
# Per-user statistics derived from the prior order lines.
priors_by_user = priors.groupby(['user_id'])

users = pd.DataFrame()
# Total number of prior order lines for the user.
users['total_items'] = priors_by_user.size().astype(np.int16)
# Set of every product_id the user has in the prior order lines.
users['all_products'] = priors_by_user['product_id'].apply(set)
# Size of that set, i.e. how many different products the user bought.
users['total_distinct_items'] = users['all_products'].map(len).astype(np.int16)

# Bring in the order-level statistics (both frames are indexed by user_id),
# then drop the intermediate frames.
users = users.join(usr)
del usr, priors_by_user

In [33]:
# Average number of items per order for each user.
# NOTE(review): total_items is counted from priors only, whereas nb_orders is
# counted over every row of orders. If orders also contains non-prior rows for
# a user, this ratio is lower than the true mean prior basket size - confirm
# that this is intended.
users['average_basket'] = (users['total_items'] / users['nb_orders']).astype(np.float32)

# Composite (user, product) key: user_id * 100000 + product_id.
# user_id is loaded as int32 and product_id as uint16, and user_id * 100000
# exceeds the int32 range for any user id above 21474 (e.g. user 202279 gives
# 20227900000). Widen both operands to int64 explicitly so the key never wraps
# around, instead of relying on implicit type promotion.
priors['user_product'] = (priors['user_id'].astype(np.int64) * 100000
                          + priors['product_id'].astype(np.int64))

In [36]:
# Show the first five prior order lines, now including the user_product key.
priors.head(n=5)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,user_product
0,2,33120,1,1,202279,prior,3,5,9,8.0,20227933120
1,2,28985,2,1,202279,prior,3,5,9,8.0,20227928985
2,2,9327,3,0,202279,prior,3,5,9,8.0,20227909327
3,2,45918,4,1,202279,prior,3,5,9,8.0,20227945918
4,2,30035,5,0,202279,prior,3,5,9,8.0,20227930035


In [37]:
# Show the first five rows of the per-user feature table.
users.head(n=5)

Unnamed: 0_level_0,total_items,all_products,total_distinct_items,average_days_between_orders,nb_orders,average_basket
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,59,"{17122, 196, 26405, 46149, 14084, 13032, 26088...",18,19.0,11,5.363636
2,195,"{45066, 2573, 18961, 23, 32792, 1559, 22559, 1...",102,16.285715,15,13.0
3,88,"{17668, 44683, 48523, 21903, 14992, 21137, 324...",33,12.0,13,6.769231
4,18,"{21573, 42329, 17769, 35469, 37646, 1200, 1905...",17,17.0,6,3.0
5,37,"{11777, 40706, 28289, 48775, 20754, 6808, 1398...",23,11.5,5,7.4
