In [3]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.sparse
# Raise pandas' "display.max_columns" option to 101 so wide frames are shown
# with all of their columns in cell outputs.
pd.set_option("display.max_columns",101)
# Seed constant; it is not referenced by any of the cells visible in this section.
RANDOM_STATE = 42

In [4]:
# Directory prefix for every CSV this notebook reads or writes. File names are
# appended with plain string concatenation, so the trailing slash is required.
DATA_PATH = "../data/instacart/"

In [5]:
# Load the Instacart CSV files with explicit, narrow dtypes instead of letting
# pandas infer the column types.

# The prior and train order_products files have the same four columns, so a
# single dtype map is shared between the two reads.
order_products_dtypes = {
    'order_id': np.int32,
    'product_id': np.uint16,
    'add_to_cart_order': np.int16,
    'reordered': np.int8}

print('loading prior')
priors = pd.read_csv(DATA_PATH + 'order_products__prior.csv',
                     dtype=order_products_dtypes)

print('loading train')
train = pd.read_csv(DATA_PATH + 'order_products__train.csv',
                    dtype=order_products_dtypes)

print('loading orders')
orders = pd.read_csv(DATA_PATH + 'orders.csv', dtype={
        'order_id': np.int32,
        'user_id': np.int32,
        'eval_set': 'category',
        'order_number': np.int16,
        'order_dow': np.int8,
        'order_hour_of_day': np.int8,
        # float rather than int: the column is allowed to hold missing values
        'days_since_prior_order': np.float32})

print('loading products')
# Only the three id columns are read (usecols), and the dtype map lists exactly
# those columns; there is no 'order_id' column among them.
products = pd.read_csv(DATA_PATH + 'products.csv', dtype={
        'product_id': np.uint16,
        'aisle_id': np.uint8,
        'department_id': np.uint8},
        usecols=['product_id', 'aisle_id', 'department_id'])

loading prior
loading train
loading orders
loading products


In [6]:
# One summary line per loaded table: its shape followed by its column names.
for template, frame in (
        ('priors {}: {}', priors),
        ('orders {}: {}', orders),
        ('train {}: {}', train),
        ('products {}: {}', products)):
    print(template.format(frame.shape, ', '.join(frame.columns)))

priors (32434489, 4): order_id, product_id, add_to_cart_order, reordered
orders (3421083, 7): order_id, user_id, eval_set, order_number, order_dow, order_hour_of_day, days_since_prior_order
train (1384617, 4): order_id, product_id, add_to_cart_order, reordered
products (49688, 3): product_id, aisle_id, department_id


In [7]:
print("computing product f")
# Number of prior order lines per product, indexed by product_id.
product_order_counts = priors.groupby('product_id').size()
prods = pd.DataFrame()
prods['orders'] = product_order_counts.astype(np.int32)
del product_order_counts

computing product f


In [8]:
# Per-product sum of the `reordered` flag over the prior order lines.
prods['reordered'] = (
    priors.groupby('product_id')['reordered']
    .sum()
    .astype(np.float32)
)
# Share of a product's prior order lines that were reorders.
prods['reorder_rate'] = (prods['reordered'] / prods['orders']).astype(np.float32)

# Attach the aggregates to the product table, then index it by product_id
# while keeping product_id available as a regular column too.
products = products.join(prods, on='product_id').set_index('product_id', drop=False)
del prods

In [28]:
print('add order info to priors')
# Index orders by order_id (the column itself is kept as well) so the join
# below can look rows up by that key.
orders = orders.set_index('order_id', drop=False)
# Every orders column except its own order_id copy is attached to priors;
# priors already carries order_id, so the duplicate is left out up front.
priors = priors.join(orders.drop('order_id', axis=1), on='order_id')

add order info to priors


In [29]:
# Preview the first rows to check that the order-level columns (user_id,
# eval_set, order_number, ...) now sit next to each prior order line.
priors.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,1,202279,prior,3,5,9,8.0
1,2,28985,2,1,202279,prior,3,5,9,8.0
2,2,9327,3,0,202279,prior,3,5,9,8.0
3,2,45918,4,1,202279,prior,3,5,9,8.0
4,2,30035,5,0,202279,prior,3,5,9,8.0


In [33]:
print('computing user f')
# Both features are aggregates of the orders table grouped by user.
orders_by_user = orders.groupby('user_id')
usr = pd.DataFrame()
# Mean of days_since_prior_order over each user's orders.
usr['average_days_between_orders'] = (
    orders_by_user['days_since_prior_order'].mean().astype(np.float32)
)
# Number of rows each user has in the orders table.
usr['nb_orders'] = orders_by_user.size().astype(np.int16)
del orders_by_user

computing user f


In [34]:
# Per-user aggregates over the prior order lines.
prior_lines_by_user = priors.groupby('user_id')
users = pd.DataFrame()
# Count of prior order lines for the user.
users['total_items'] = prior_lines_by_user.size().astype(np.int16)
# Set of every product_id found in the user's prior order lines.
users['all_products'] = prior_lines_by_user['product_id'].apply(set)
users['total_distinct_items'] = users['all_products'].map(len).astype(np.int16)
# Release the groupby so it does not keep a reference to `priors` alive.
del prior_lines_by_user

In [35]:
# Add the order-level user features (average_days_between_orders, nb_orders)
# to the basket-level ones; both frames are indexed by user_id.
users = users.join(usr)
# usr is no longer needed once its columns live in `users`.
del usr

In [36]:
# Prior order lines per order row of the user. total_items is counted on
# `priors`, while nb_orders is counted on the full `orders` table.
users['average_basket'] = (
    users['total_items'] / users['nb_orders']
).astype(np.float32)
print('user f', users.shape)

('user f', (206209, 6))


In [38]:
# Progress message marking the start of the user x product feature section.
print("compute userXproduct f")

compute userXproduct f


In [39]:
# Composite key identifying a (user, product) pair: user_id * 100000 + product_id.
# product_id is loaded as uint16, i.e. always below 100000, so the two parts of
# the key never overlap.
# Both operands are widened to int64 first: user_id is loaded as int32, and
# user_id * 100000 no longer fits in 32 bits once user_id reaches 21475 (the
# keys shown in the userXproduct preview are well above 2**31), so int32
# arithmetic would silently wrap around and produce colliding keys.
priors['user_product'] = (priors.user_id.astype(np.int64) * 100000
                          + priors.product_id.astype(np.int64))

In [40]:
# Single pass over the prior order lines, keeping one running record per
# user_product key:
#   (number of lines seen,
#    largest (order_number, order_id) pair seen,
#    running sum of add_to_cart_order)
d = dict()
for row in priors.itertuples():
    key = row.user_product
    stamp = (row.order_number, row.order_id)
    seen = d.get(key)
    if seen is None:
        # First line for this user/product pair.
        d[key] = (1, stamp, row.add_to_cart_order)
    else:
        count, latest, pos_total = seen
        # Tuples compare element-wise, so max() keeps the pair with the
        # highest order_number (order_id only breaks ties).
        d[key] = (count + 1,
                  max(latest, stamp),
                  pos_total + row.add_to_cart_order)

# One row per key; the three tuple slots become columns 0, 1 and 2.
userXproduct = pd.DataFrame.from_dict(d, orient='index')

In [41]:
# Preview the raw accumulator frame: the index is the user_product key and the
# columns are still the unnamed tuple slots 0, 1, 2.
userXproduct.head()

Unnamed: 0,0,1,2
14126415872,5,"(20, 843810)",63
20535312385,1,"(2, 2699553)",9
1375731717,7,"(32, 2844957)",51
8959033352,4,"(42, 1924807)",57
7342828204,3,"(6, 723315)",13


In [42]:
# Name the three tuple slots. 'last_order_id' still holds
# (order_number, order_id) pairs at this point, not a bare order id.
userXproduct.columns = ['nb_orders', 'last_order_id', 'sum_pos_in_cart']

In [43]:
# Store the per-pair line count as int16.
userXproduct['nb_orders'] = userXproduct['nb_orders'].astype(np.int16)

In [45]:
# Each entry is an (order_number, order_id) pair; keep only the order_id part.
# This cell expects the tuple form, so it cannot be run a second time.
userXproduct['last_order_id'] = (
    userXproduct['last_order_id'].map(lambda pair: pair[1]).astype(np.int32)
)

In [46]:
# Store the summed cart positions as int16.
userXproduct['sum_pos_in_cart'] = userXproduct['sum_pos_in_cart'].astype(np.int16)

In [47]:
# Report how many distinct user_product keys were accumulated.
print ('user X product f', len(userXproduct))

('user X product f', 13307953)


In [48]:
# Drop the notebook's reference to the joined prior order lines; the remaining
# cells only write out the aggregated frames.
del priors

In [50]:
# Write the per-user features. The all_products column holds Python set
# objects, so it is presumably written as their text representation (verify
# before reading the file back).
users.to_csv(DATA_PATH + "users_match.csv")

In [10]:
# Write the per-product features. products is indexed by product_id and also
# keeps product_id as a column (set_index with drop=False), and the index is
# written as well, so the id appears twice in the file.
products.to_csv(DATA_PATH + "products_match.csv")

In [54]:
# Write the user x product features. The frame's index is the composite
# user_product key (user_id * 100000 + product_id).
userXproduct.to_csv(DATA_PATH + "user_product_match.csv")