In [1]:
import os

import numpy as np
import pandas as pd

# Directory that holds the Instacart input files.
# Set the INSTACART_DATA_DIR environment variable to point somewhere else; the
# default is the original location. os.path.join(..., "") guarantees a trailing
# separator, which matters because file names are appended with `data + 'name'`.
data = os.path.join(os.environ.get("INSTACART_DATA_DIR", "/mnt/d/Data/Instacart/"), "")

In [2]:
# dtypes for the prior-order line items. Only the columns named in `usecols`
# are parsed; the extra dtype entries are kept as in the original.
prior_dtypes = {
    'order_id': np.int32,
    'product_id': np.uint16,
    'add_to_cart_order': np.int16,
    'reordered': np.int8,
}
priors = pd.read_csv(
    data + 'order_products__prior.csv',
    dtype=prior_dtypes,
    usecols=['order_id', 'product_id', 'reordered'],
    engine='c',
)

# dtypes for the order header table; again only `usecols` is actually loaded.
order_dtypes = {
    'order_id': np.int32,
    'user_id': np.int32,
    'eval_set': 'category',
    'order_number': np.int16,
    'order_dow': np.int8,
    'order_hour_of_day': np.int8,
    'days_since_prior_order': np.float32,
}
orders = pd.read_csv(
    data + 'orders.csv',
    dtype=order_dtypes,
    usecols=['order_id', 'user_id', 'order_number'],
    engine='c',
)

# Attach user_id / order_number to every prior line item (default inner join
# on order_id), then order the rows by user, order sequence and product.
product = pd.merge(priors, orders, on='order_id')
product = product.sort_values(by=['user_id', 'order_number', 'product_id'])
product = product.reset_index(drop=True)

Try to generate product features, which include:

* probability that a product is reordered 
* average number of times a product is reordered

In [3]:
# 0-based running counter within each (product, user) pair: 0 marks the first
# row of the pair, 1 the second, and so on, in the current row order of `product`.
user_product_groups = product.groupby(['product_id', 'user_id'])
product['order_time'] = user_product_groups.cumcount()

In [4]:
def second_order_ratio(x):
    """Return (# values equal to 1) / (# values equal to 0) in the Series ``x``.

    Applied to ``order_time`` per product, this is the number of second
    purchases divided by the number of first purchases.
    """
    return x[x == 1].count() / x[x == 0].count()


# Nested-dict renaming in `agg` ({'col': {'name': func}}) was deprecated in
# pandas 0.20 and raises SpecificationError from 1.0 on. Aggregate the single
# column and name the result explicitly instead; this works on old and new
# pandas and yields a one-column frame indexed by product_id.
p1 = (
    product.groupby(by='product_id')['order_time']
    .agg(second_order_ratio)
    .to_frame('second_order_ratio')
)

In [5]:
def avg_reorder_ratio(x):
    """Return (# non-null values) / (# values equal to 0) in the Series ``x``.

    Applied to ``reordered`` per product, this is the total number of rows for
    the product divided by the number of rows flagged as not reordered.
    """
    return x.count() / x[x == 0].count()


# Nested-dict renaming in `agg` ({'col': {'name': func}}) was deprecated in
# pandas 0.20 and raises SpecificationError from 1.0 on. Aggregate the single
# column and name the result explicitly instead; this works on old and new
# pandas and yields a one-column frame indexed by product_id.
p2 = (
    product.groupby(by='product_id')['reordered']
    .agg(avg_reorder_ratio)
    .to_frame('avg_reorder_ratio')
)

In [6]:
# Place the two per-product features side by side, aligned on product_id.
# `names=` in pd.concat only labels hierarchical levels created through `keys`;
# it does not name the resulting columns. Set the column labels explicitly so
# the frame has the intended flat names whatever p1/p2 carried.
product_feature = pd.concat([p1, p2], axis=1)
product_feature.columns = ['second_order_ratio', 'avg_reorder_ratio']

<hr/>

User-product feature:

* up_order_rate <- up_orders / user_orders
* up_orders_since_last_order <- user_orders - up_last_order
* up_order_rate_since_first_order <- up_orders / (user_orders - up_first_order + 1)
* up_orders = n()


In [32]:
# Preview the first rows of `product`.
# NOTE(review): the saved output below lists user_order_count and
# user_prod_order_count columns that no visible cell adds to `product`, and the
# cell is numbered In [32]; the output looks stale. Re-run after a kernel restart.
product.head()

Unnamed: 0,order_id,product_id,reordered,user_id,order_number,user_order_count,user_prod_order_count
0,2539329,196,0,1,1,10,10
1,2398795,196,1,1,2,10,10
2,473747,196,1,1,3,10,10
3,2254736,196,1,1,4,10,10
4,431534,196,1,1,5,10,10


In [7]:
# Highest order_number seen per user in `product`, used as that user's total
# order count.
user_order_cnt = (
    product
    .groupby(by='user_id')['order_number']
    .max()
    .reset_index()
    .rename(columns={'order_number': 'user_order_count'})
)

In [8]:
# Per (user, product): first order_number, last order_number, and the number of
# rows in which the user ordered the product.
up0 = (
    product
    .groupby(by=['user_id', 'product_id'])['order_number']
    .agg(['min', 'max', 'count'])
    .reset_index()
)

In [9]:
# Bring each user's total order count onto the user-product rows.
up0 = pd.merge(up0, user_order_cnt, on='user_id')

In [14]:
# Preview the first rows of `up0`.
# NOTE(review): the saved output below already shows the derived columns
# (user_prod_order_rate, user_prod_last_vacancy, user_prod_order_freq) that are
# only created by the cells that follow; it was captured after those cells ran.
up0.head()

Unnamed: 0,user_id,product_id,user_product_order_cnt,user_prod_order_rate,user_prod_last_vacancy,user_prod_order_freq
0,1,196,10,1.0,1,1.0
1,1,10258,9,0.9,1,1.0
2,1,10326,1,0.1,6,1.0
3,1,12427,10,1.0,1,1.0
4,1,13032,3,0.3,1,0.333333


In [10]:
# Share of the user's orders that contained this product.
up0['user_prod_order_rate'] = up0['count'].div(up0['user_order_count'])

In [11]:
orders_total = up0['user_order_count']
first_seen, last_seen = up0['min'], up0['max']

# Orders elapsed since the product was last bought, counted so that a product
# present in the user's latest order gets 1.
# NOTE(review): the notes above give `user_orders - up_last_order` (no +1) for
# up_orders_since_last_order; confirm the offset is intended.
up0['user_prod_last_vacancy'] = orders_total - last_seen + 1

# Purchases divided by the span of orders between the first and last purchase.
# NOTE(review): the notes above give `user_orders - up_first_order + 1` as the
# denominator; here the span ends at the last purchase instead. Confirm this is
# the intended feature.
up0['user_prod_order_freq'] = up0['count'] / (last_seen - first_seen + 1)

In [12]:
# Keep only the model features: give the raw count its final name and discard
# the helper columns used to derive the ratios.
up0 = up0.rename(columns={'count': 'user_product_order_cnt'})
up0 = up0.drop(['min', 'max', 'user_order_count'], axis=1)

<hr/>

user related feature

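* user_reorder_ratio: share of the items a user bought after their first order that were reorders, i.e. whether the user tends to stick with known products or try new ones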

In [13]:
# Per-user reorder ratio: (# reordered rows) / (# rows), ignoring every row that
# belongs to the user's first order (order_number == 1).
# The original selected two columns with groupby(...)['reordered', 'order_number'];
# that tuple-style selection was deprecated in pandas 1.0 and removed in 2.0.
# sum(reordered) / count is simply the mean of the 0/1 flag, so aggregate that
# one column directly.
later_orders = product[product['order_number'] != 1]
u1 = (
    later_orders
    .groupby('user_id')['reordered']
    .mean()
    .rename('user_reorder_ratio')
    .reset_index()
)

<hr/>

order related feature

* order_vacancy

In [14]:
# Re-read orders.csv for the order-level feature; only order_id and
# days_since_prior_order are loaded (the other dtype entries are unused here).
order_feature_dtypes = {
    'order_id': np.int32,
    'user_id': np.int32,
    'eval_set': 'category',
    'order_number': np.int16,
    'order_dow': np.int8,
    'order_hour_of_day': np.int8,
    'days_since_prior_order': np.float32,
}
o1 = pd.read_csv(
    data + 'orders.csv',
    dtype=order_feature_dtypes,
    usecols=['order_id', 'days_since_prior_order'],
    engine='c',
)

<hr/>

join with training set

In [15]:
# Load the tab-separated training candidates with compact dtypes for the key
# and label columns; any other column in the file keeps its inferred dtype.
train_dtypes = {
    'order_id': np.int32,
    'user_id': np.int32,
    'product_id': np.uint16,
    'label': np.int8,
}
train = pd.read_csv(data + "train.tsv", sep='\t', dtype=train_dtypes)

In [16]:
# Turn the product_id index of the product features into a regular column so
# the frame can be merged on it.
p0 = product_feature.reset_index(drop=False)

In [17]:
# Force flat, final column labels on the product feature table.
product_feature_columns = ['product_id', 'second_order_ratio', 'avg_reorder_ratio']
p0.columns = product_feature_columns

In [18]:
# Attach product, user, order and user-product features to the training rows.
# Every merge uses the default inner join, so rows without a match in any
# feature table are dropped.
train = (
    train
    .merge(p0, on='product_id')
    .merge(u1, on='user_id')
    .merge(o1, on='order_id')
    .merge(up0, on=['user_id', 'product_id'])
)

In [45]:
# Split on the `seed` column: seed >= 3 goes to validation, seed < 3 stays in
# training; the column itself is removed from both parts.
# NOTE(review): `seed` is not among the dtype-declared columns of train.tsv;
# presumably it is present in the file. This cell also overwrites `train`, so
# it cannot be re-run without reloading.
in_valid = train['seed'] >= 3
in_train = train['seed'] < 3
valid = train[in_valid].drop('seed', axis=1)
train = train[in_train].drop('seed', axis=1)

In [43]:
# Size of the training split. The original referenced `train_set`, which no
# visible cell defines (NameError on a fresh kernel). The 12-column output
# matches `train` after the split above, so inspect that frame instead.
train.shape

(3394627, 12)

In [44]:
# Size (rows, columns) of the validation split.
valid.shape

(5080034, 12)

In [19]:
# Persist the training features under the "train" key. The path is built from
# `data` rather than repeating the absolute directory, and `key` is passed by
# keyword because positional use is deprecated in recent pandas.
# NOTE(review): `valid` is not written anywhere in the visible notebook —
# confirm whether it should be saved as well.
train.to_hdf(data + "dataset.hdf", key="train")

join with test set

In [18]:
# Load the tab-separated test candidates with compact key dtypes.
test_dtypes = {
    'order_id': np.int32,
    'user_id': np.int32,
    'product_id': np.uint16,
}
test = pd.read_csv(data + "test.tsv", sep='\t', dtype=test_dtypes)

# Attach the same product, user, order and user-product features as for the
# training set. The merges use the default inner join, so rows without a match
# in any feature table are dropped.
test = (
    test
    .merge(p0, on='product_id')
    .merge(u1, on='user_id')
    .merge(o1, on='order_id')
    .merge(up0, on=['user_id', 'product_id'])
)

In [20]:
# Persist the test features under the "test" key. The path is built from `data`
# rather than repeating the absolute directory, and `key` is passed by keyword
# because positional use is deprecated in recent pandas.
test.to_hdf(data + "dataset.hdf", key="test")