In [3]:
import numpy as np
import pandas as pd

data = "/mnt/d/Data/Instacart/"
hdffile = "/mnt/d/Data/Instacart/dataset.hdf"

second_order_ratio = lambda x: x[x==1].count()/x[x==0].count()
avginterval = lambda x: np.inf if x.shape[0]==1 else (x.max()-x.min())/(x.shape[0]-1)

In [17]:
priors = pd.read_csv(data + 'order_products__prior.csv', dtype={
        'order_id': np.int32,
        'product_id': np.uint16,
        'add_to_cart_order': np.int16,
        'reordered': np.int8}, usecols = ['order_id', 'product_id', 'reordered'], engine='c')

orders = pd.read_csv(data + 'orders.csv', dtype={
        'order_id': np.int32,
        'user_id': np.int32,
        'eval_set': 'category',
        'order_number': np.int16,
        'order_dow': np.int8,
        'order_hour_of_day': np.int8,
        'days_since_prior_order': np.float32}, usecols = ['order_id', 'user_id', 'eval_set', 'order_number', 'days_since_prior_order'], engine='c')
orders['interval_accu'] = orders.groupby(by='user_id')['days_since_prior_order'].cumsum().fillna(0)

product = pd.merge(priors, orders, on='order_id').sort_values(by=['user_id', 'order_number', 'product_id']).reset_index(drop=True)
product['order_time']=product.groupby(by=['product_id', 'user_id']).cumcount()

p1 = product[['product_id', 'user_id', 'order_id']].groupby(by='product_id').agg({'user_id': pd.Series.nunique, 'order_id': pd.Series.nunique}).\
rename(columns={'user_id': 'prod_nuser', 'order_id': 'prod_norder'})
p2 = product[['product_id', 'user_id', 'order_id']][product.reordered==1].groupby(by='product_id').agg({'user_id': pd.Series.nunique, 'order_id': pd.Series.nunique}).\
rename(columns={'user_id': 'prod_nruser', 'order_id': 'prod_nrorder'})
up1 = product[['product_id', 'user_id', 'interval_accu']].sort_values(by=['user_id', 'product_id', 'interval_accu'])
#up1 = up1.groupby(by=['user_id', 'product_id'])['interval_accu'].agg({'user_prod_days_interval': avginterval})
up1 = up1.groupby(by=['user_id', 'product_id'])['interval_accu'].agg({'user_prod_days_interval': 'count'})
up1 = up1[up1.user_prod_days_interval!=np.inf].reset_index()
p3 = up1[['product_id', 'user_prod_days_interval']].groupby(by='product_id')['user_prod_days_interval'].agg({'prod_days_interval_avg': 'mean'})
up2 = product[['product_id', 'user_id', 'order_number']].sort_values(by=['user_id', 'product_id', 'order_number'])
#up2 = up2.groupby(by=['product_id', 'user_id'])['order_number'].agg({'user_prod_order_interval': avginterval}).reset_index()
up2 = up2.groupby(by=['product_id', 'user_id'])['order_number'].agg({'user_prod_order_interval': 'count'}).reset_index()
up2 = up2[up2.user_prod_order_interval!=np.inf].reset_index()
p4 = up2[['product_id', 'user_prod_order_interval']].groupby(by='product_id')['user_prod_order_interval'].agg({'prod_order_interval_avg': 'mean'})
p5 = product[['product_id', 'order_time']].groupby(by='product_id')['order_time'].agg({"prod_second_order_ratio": second_order_ratio})

product_feature = p1.merge(p2, how='left', left_index=True, right_index=True).\
merge(p3, how='left', left_index=True, right_index=True).\
merge(p4, how='left', left_index=True, right_index=True).\
merge(p5, left_index=True, right_index=True)

product_feature.prod_nruser = product_feature.prod_nruser.fillna(0)
product_feature.prod_nrorder = product_feature.prod_nrorder.fillna(0)
product_feature.prod_days_interval_avg = product_feature.prod_days_interval_avg.fillna(366)
product_feature.prod_order_interval_avg = product_feature.prod_order_interval_avg.fillna(100)
product_feature['prod_ruser_ratio'] = product_feature.prod_nruser/product_feature.prod_nuser
product_feature['prod_rorder_ratio'] = product_feature.prod_nrorder/product_feature.prod_norder
product_feature['prod_rorder_per_ruser'] = product_feature.prod_nrorder/product_feature.prod_nruser
product_feature['prod_order_per_user'] = product_feature.prod_norder/product_feature.prod_nuser
product_feature.prod_rorder_per_ruser = product_feature.prod_rorder_per_ruser.fillna(0)
del p1, p2, p3, p4, p5

up3 = product[['user_id', 'product_id', 'reordered']].groupby(by=['product_id', 'user_id'])['reordered'].agg({
    'user_prod_norder': 'count',
    'user_prod_reordered': 'max'
})
ut = product[['user_id', 'order_number']].groupby(by='user_id').agg({'order_number': 'max'})
up4 = product[['user_id', 'product_id', 'order_number']][product.order_time==0].rename(columns={'order_number': 'first_order_number'})
up4 = pd.merge(up4, ut, left_on='user_id', right_index=True)
up4['user_prod_recentlydiscovered'] = pd.Series(up4.first_order_number==up4.order_number)
up4.drop(['first_order_number', 'order_number'], axis=1, inplace=True)
user_product_feature = up3.reset_index().merge(up1, how='left', on=['product_id', 'user_id']).\
merge(up2, how='left', on=['product_id', 'user_id']).\
merge(up4.astype(np.int), on=['product_id', 'user_id'])
user_product_feature.user_prod_days_interval = user_product_feature.user_prod_days_interval.fillna(366)
user_product_feature.user_prod_order_interval = user_product_feature.user_prod_order_interval.fillna(100)
#del up1, up2, up3, up4

user_feature = pd.read_hdf("/mnt/d/Data/Instacart/dataset.hdf", "user_feature")
user_product_feature = user_product_feature.merge(user_feature[['user_interval', 'user_norder']], left_on='user_id',
                                             right_index=True)
user_product_feature['user_prod_norder_rate'] = user_product_feature.user_prod_norder/user_product_feature.user_norder
user_product_feature['user_prod_days_interval_rate'] = user_product_feature.user_prod_days_interval/user_product_feature.user_interval
user_product_feature.drop(['user_interval', 'user_norder'], axis=1, inplace=True)

In [6]:
upsp = product[['user_id', 'product_id', 'order_time']].groupby(by=['user_id', 'product_id']).agg({'order_time': 'max'}).reset_index()
upsp = pd.merge(upsp, product[['user_id', 'product_id', 'order_number', 'interval_accu', 'order_time']], on=['user_id', 'product_id', 'order_time'])
upsp = upsp.rename(columns={'order_number': 'last_order_number'}).drop('order_time', axis=1)
orders.rename(columns={'interval_accu': 'accu_interval'}, inplace=True)

train = pd.read_csv(data+"train.tsv", sep='\t', dtype={
    'order_id': np.int32,
    'user_id': np.int32,
    'product_id': np.uint16,
    'label': np.int8
}, usecols=['order_id', 'user_id', 'product_id'], engine='c')
train = train.merge(orders[['order_id', 'user_id', 'order_number', 'accu_interval', 'days_since_prior_order']][orders.eval_set=='train'], on=['order_id', 'user_id'])
train = train.merge(upsp, on=['user_id', 'product_id'])
train['user_prod_lastorder_interval'] = train.order_number-train.last_order_number
train['user_prod_lastdays_interval'] = train.accu_interval-train.interval_accu
train.drop(['order_number', 'accu_interval', 'last_order_number', 'interval_accu'], axis=1, inplace=True)
train.rename(columns={'days_since_prior_order': 'user_lastorder_interval'}, inplace=True)
train = train.merge(product_feature, left_on='product_id', right_index=True).merge(user_product_feature, on=['user_id', 'product_id'])
train['user_prod_lastorder_interval_rate'] = train.user_prod_lastorder_interval / train.user_prod_order_interval
train['user_prod_lastdays_interval_rate']  = train.user_prod_lastdays_interval / train.user_prod_days_interval
train.user_prod_days_interval_rate = train.user_prod_days_interval_rate.fillna(1)
train.user_prod_days_interval_rate = train.user_prod_days_interval_rate.replace(np.inf, np.nan).fillna(1600)
train.user_prod_lastdays_interval_rate = train.user_prod_lastdays_interval_rate.replace(np.inf, np.nan).fillna(1600)

test = pd.read_csv(data+"test.tsv", sep='\t', dtype={
    'order_id': np.int32,
    'user_id': np.int32,
    'product_id': np.uint16
})
test = test.merge(orders[['order_id', 'user_id', 'order_number', 'accu_interval', 'days_since_prior_order']][orders.eval_set=='test'], on=['order_id', 'user_id'])
test = test.merge(upsp, on=['user_id', 'product_id'])
test['user_prod_lastorder_interval'] = test.order_number-test.last_order_number
test['user_prod_lastdays_interval'] = test.accu_interval-test.interval_accu
test.drop(['order_number', 'accu_interval', 'last_order_number', 'interval_accu'], axis=1, inplace=True)
test.rename(columns={'days_since_prior_order': 'user_lastorder_interval'}, inplace=True)
test = test.merge(product_feature, left_on='product_id', right_index=True).merge(user_product_feature, on=['user_id', 'product_id'])
test['user_prod_lastorder_interval_rate'] = test.user_prod_lastorder_interval / test.user_prod_order_interval
test['user_prod_lastdays_interval_rate']  = test.user_prod_lastdays_interval / test.user_prod_days_interval
test.user_prod_days_interval_rate = test.user_prod_days_interval_rate.fillna(1)
test.user_prod_days_interval_rate = test.user_prod_days_interval_rate.replace(np.inf, np.nan).fillna(1600)
test.user_prod_lastdays_interval_rate = test.user_prod_lastdays_interval_rate.replace(np.inf, np.nan).fillna(1600)

In [7]:
train.head()

Unnamed: 0,order_id,user_id,product_id,user_lastorder_interval,user_prod_lastorder_interval,user_prod_lastdays_interval,prod_nuser,prod_norder,prod_nruser,prod_nrorder,...,user_prod_norder,user_prod_reordered,user_prod_days_interval,index,user_prod_order_interval,user_prod_recentlydiscovered,user_prod_norder_rate,user_prod_days_interval_rate,user_prod_lastorder_interval_rate,user_prod_lastdays_interval_rate
0,1187899,1,196,14.0,1,14.0,8000,35791,4660.0,27791.0,...,10,1,19.555555,45698.0,1.0,0,1.0,1.0,1.0,0.715909
1,1854765,21,196,28.0,24,252.0,8000,35791,4660.0,27791.0,...,1,0,366.0,,100.0,0,0.030303,36.946373,0.24,0.688525
2,1864787,43,196,26.0,3,50.0,8000,35791,4660.0,27791.0,...,2,1,28.0,45703.0,3.0,0,0.181818,2.692308,1.0,1.785714
3,1647290,52,196,3.0,7,69.0,8000,35791,4660.0,27791.0,...,14,1,13.076923,45704.0,1.538462,0,0.518519,1.440678,4.55,5.276471
4,2757217,67,196,5.0,2,6.0,8000,35791,4660.0,27791.0,...,19,1,9.611111,45705.0,1.222222,0,0.791667,1.270434,1.636364,0.624277


In [15]:
up3.reset_index().merge(up1, how='left', on=['product_id', 'user_id']).\
merge(up2, how='left', on=['product_id', 'user_id']).\
merge(up4.astype(np.int), on=['product_id', 'user_id'])

Unnamed: 0,order_id,user_id,product_id,user_nitems,user_norder,user_ndistinctitems,user_nrdistinctitems,user_nritems,user_interval,user_second_order_rate,user_nritem_ratio,user_nrdistinctitem_ratio,user_nitem_per_order,user_nritem_per_order,user_nritem_per_order_ratio,user_lastorder_interval,user_lastorder_interval_ratio
0,2774568,3,9387,88,12,33,19.0,55.0,12.090909,0.575758,0.625,0.575758,7.333333,5.0,0.681818,11.0,0.909774
1,2774568,3,17668,88,12,33,19.0,55.0,12.090909,0.575758,0.625,0.575758,7.333333,5.0,0.681818,11.0,0.909774
2,2774568,3,15143,88,12,33,19.0,55.0,12.090909,0.575758,0.625,0.575758,7.333333,5.0,0.681818,11.0,0.909774
3,2774568,3,16797,88,12,33,19.0,55.0,12.090909,0.575758,0.625,0.575758,7.333333,5.0,0.681818,11.0,0.909774
4,2774568,3,39190,88,12,33,19.0,55.0,12.090909,0.575758,0.625,0.575758,7.333333,5.0,0.681818,11.0,0.909774


In [26]:
up2 = product[['product_id', 'user_id', 'order_number']].sort_values(by=['user_id', 'product_id', 'order_number'])
#up2 = up2.groupby(by=['product_id', 'user_id'])['order_number'].agg({'user_prod_order_interval': avginterval}).reset_index()
up2 = up2.groupby(by=['product_id', 'user_id'])['order_number'].agg({'user_prod_order_interval': 'count'})
up2 = up2[up2.user_prod_order_interval!=np.inf]

In [27]:
up2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,user_prod_order_interval
product_id,user_id,Unnamed: 2_level_1
1,138,2
1,709,1
1,764,2
1,777,1
1,825,1


In [9]:
train.shape

(8474661, 27)

In [10]:
train.head()

Unnamed: 0,order_id,user_id,product_id,user_lastorder_interval,user_prod_lastorder_interval,user_prod_lastdays_interval,prod_nuser,prod_norder,prod_nruser,prod_nrorder,...,user_prod_norder,user_prod_reordered,user_prod_days_interval,index,user_prod_order_interval,user_prod_recentlydiscovered,user_prod_norder_rate,user_prod_days_interval_rate,user_prod_lastorder_interval_rate,user_prod_lastdays_interval_rate
0,1187899,1,196,14.0,1,14.0,8000,35791,4660.0,27791.0,...,10,1,19.555555,45698.0,1.0,0,1.0,1.0,1.0,0.715909
1,1854765,21,196,28.0,24,252.0,8000,35791,4660.0,27791.0,...,1,0,366.0,,100.0,0,0.030303,36.946373,0.24,0.688525
2,1864787,43,196,26.0,3,50.0,8000,35791,4660.0,27791.0,...,2,1,28.0,45703.0,3.0,0,0.181818,2.692308,1.0,1.785714
3,1647290,52,196,3.0,7,69.0,8000,35791,4660.0,27791.0,...,14,1,13.076923,45704.0,1.538462,0,0.518519,1.440678,4.55,5.276471
4,2757217,67,196,5.0,2,6.0,8000,35791,4660.0,27791.0,...,19,1,9.611111,45705.0,1.222222,0,0.791667,1.270434,1.636364,0.624277
