In [1]:
import numpy as np
import pandas as pd

# Directory that holds the Instacart CSV files.
data = "/mnt/d/Data/Instacart/"

# Prior-order line items; only the ids and the reorder flag are loaded.
priors = pd.read_csv(
    data + 'order_products__prior.csv',
    usecols=['order_id', 'product_id', 'reordered'],
    dtype={
        'order_id': np.int32,
        'product_id': np.uint16,
        'add_to_cart_order': np.int16,
        'reordered': np.int8,
    },
    engine='c',
)

# Order headers; only the columns needed for the interval features are loaded.
orders = pd.read_csv(
    data + 'orders.csv',
    usecols=['order_id', 'user_id', 'order_number', 'days_since_prior_order'],
    dtype={
        'order_id': np.int32,
        'user_id': np.int32,
        'eval_set': 'category',
        'order_number': np.int16,
        'order_dow': np.int8,
        'order_hour_of_day': np.int8,
        'days_since_prior_order': np.float32,
    },
    engine='c',
)

# Running per-user sum of days_since_prior_order; NaN results become 0.
orders['interval_accu'] = (
    orders.groupby(by='user_id')['days_since_prior_order']
    .cumsum()
    .fillna(0)
)

# One row per (order, product), ordered by user, then order number, then product.
product = pd.merge(priors, orders, on='order_id')
product = product.sort_values(by=['user_id', 'order_number', 'product_id']).reset_index(drop=True)

# 0-based position of each row within its (product, user) group.
product['order_time'] = product.groupby(by=['product_id', 'user_id']).cumcount()

In [22]:
# Preview the first rows of the merged table.
product.head()

Unnamed: 0,order_id,product_id,reordered,user_id,order_number,days_since_prior_order,interval_accu,order_time
0,2539329,196,0,1,1,,0.0,0
1,2539329,12427,0,1,1,,0.0,0
2,2539329,14084,0,1,1,,0.0,0
3,2539329,26088,0,1,1,,0.0,0
4,2539329,26405,0,1,1,,0.0,0


In [3]:
# user features
# Per user: number of distinct orders and number of purchased line items.
# Named aggregation is used because the dict-renaming form of
# SeriesGroupBy.agg raises SpecificationError on pandas >= 1.0.
u1 = product.groupby(by='user_id')['order_id'].agg(
    user_norder='nunique',
    user_nitems='count',
)

In [9]:
# Per user: number of distinct products ever bought.
# Named aggregation replaces the dict-renaming agg removed in pandas 1.0.
u2 = product.groupby(by='user_id')['product_id'].agg(
    user_ndistinctitems='nunique',
)

# Per user, restricted to rows flagged reordered == 1: number of such rows and
# number of distinct products among them. Users without any such row are
# absent from u3.
u3 = (
    product.loc[product.reordered == 1, ['user_id', 'product_id']]
    .groupby(by='user_id')['product_id']
    .agg(user_nritems='count', user_nrdistinctitems='nunique')
)

In [16]:
# Per user: mean of days_since_prior_order, counting each order once.
# Rows are de-duplicated on (user_id, order_id, days_since_prior_order) so an
# order with many line items contributes a single value; orders with a missing
# days_since_prior_order are excluded.
# Named aggregation replaces the dict-renaming agg removed in pandas 1.0.
u4 = (
    product.loc[
        product.days_since_prior_order.notnull(),
        ['user_id', 'order_id', 'days_since_prior_order'],
    ]
    .drop_duplicates()
    .groupby(by='user_id')['days_since_prior_order']
    .agg(user_interval='mean')
)

In [22]:
# Combine the per-user tables on their user_id index. u3 is attached with a
# left merge, so users missing from u3 get NaN in its columns.
user_feature = (
    u1.merge(u2, left_index=True, right_index=True)
    .merge(u3, how='left', left_index=True, right_index=True)
    .merge(u4, left_index=True, right_index=True)
)

In [24]:
# The intermediate per-user tables are no longer needed once merged; drop the
# names and preview the combined frame.
del u1, u2, u3, u4
user_feature.head()

Unnamed: 0_level_0,user_nitems,user_norder,user_ndistinctitems,user_nrdistinctitems,user_nritems,user_interval
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,59,10,18,10.0,41.0,19.555555
2,195,14,102,37.0,93.0,15.230769
3,88,12,33,19.0,55.0,12.090909
4,18,5,17,1.0,1.0,13.75
5,37,4,23,8.0,14.0,13.333333


In [26]:
# Replace every remaining NaN with 0 (e.g. the reorder counts of users that
# were absent from u3 in the left merge).
user_feature = user_feature.fillna(0)

In [35]:
# Share of a user's line items that are reorders.
user_feature['user_nritem_ratio'] = user_feature['user_nritems'].div(user_feature['user_nitems'])
# Share of a user's distinct products that were reordered.
user_feature['user_nrdistinctitem_ratio'] = user_feature['user_nrdistinctitems'].div(
    user_feature['user_ndistinctitems']
)
# Average basket size.
user_feature['user_nitem_per_order'] = user_feature['user_nitems'].div(user_feature['user_norder'])
# Reordered items per order, with one order subtracted from the denominator.
user_feature['user_nritem_per_order'] = user_feature['user_nritems'].div(
    user_feature['user_norder'] - 1
)
# Ratio of the two per-order averages above.
user_feature['user_nritem_per_order_ratio'] = user_feature['user_nritem_per_order'].div(
    user_feature['user_nitem_per_order']
)

In [36]:
user_feature.head()

Unnamed: 0_level_0,user_nitems,user_norder,user_ndistinctitems,user_nrdistinctitems,user_nritems,user_interval,user_nritem_ratio,user_nrdistinctitem_ratio,user_nitem_per_order,user_nritem_per_order,user_nritem_per_order_ratio
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,59,10,18,10.0,41.0,19.555555,0.694915,0.555556,5.9,4.555556,0.772128
2,195,14,102,37.0,93.0,15.230769,0.476923,0.362745,13.928571,7.153846,0.513609
3,88,12,33,19.0,55.0,12.090909,0.625,0.575758,7.333333,5.0,0.681818
4,18,5,17,1.0,1.0,13.75,0.055556,0.058824,3.6,0.25,0.069444
5,37,4,23,8.0,14.0,13.333333,0.378378,0.347826,9.25,4.666667,0.504505


In [37]:
# 0-based position of each row within its (product, user) group. This repeats
# the assignment already made when `product` was built in the first cell.
product['order_time']=product.groupby(by=['product_id', 'user_id']).cumcount()
def second_order_ratio(x):
    """Return (number of values equal to 1) / (number of values equal to 0).

    `x` is a pandas Series; values other than 0 and 1 are ignored.
    """
    n_first = x[x == 0].count()
    n_second = x[x == 1].count()
    return n_second / n_first

# Per product: ratio of order_time == 1 rows to order_time == 0 rows.
# Named aggregation replaces the dict-renaming agg removed in pandas 1.0.
# NOTE(review): `p1` is reassigned in the "product features" cell before this
# result is read anywhere in the visible notebook, so this looks like leftover
# work; the same ratio is recomputed later as `p5`.
p1 = product.groupby(by='product_id')['order_time'].agg(
    prod_norder_second_order_rate=second_order_ratio,
)

In [39]:
# Per user: ratio of order_time == 1 rows to order_time == 0 rows.
# Named aggregation replaces the dict-renaming agg removed in pandas 1.0.
u5 = product.groupby(by='user_id')['order_time'].agg(
    user_second_order_rate=second_order_ratio,
)

In [42]:
# Attach the second-order rate to the user table (index-on-index inner merge).
user_feature = pd.merge(user_feature, u5, left_index=True, right_index=True)

In [43]:
# Persist the user features. `key` is passed by keyword because positional
# arguments other than the path are deprecated in DataFrame.to_hdf
# (pandas >= 2.1).
user_feature.to_hdf("/mnt/d/Data/Instacart/dataset.hdf", key="user_feature")

In [6]:
# product features
# Per product: distinct users and distinct orders over all rows.
p1 = (
    product[['product_id', 'user_id', 'order_id']]
    .groupby(by='product_id')
    .agg({'user_id': pd.Series.nunique, 'order_id': pd.Series.nunique})
    .rename(columns={'user_id': 'prod_nuser', 'order_id': 'prod_norder'})
)
# Same counts restricted to rows flagged reordered == 1.
p2 = (
    product[['product_id', 'user_id', 'order_id']][product.reordered == 1]
    .groupby(by='product_id')
    .agg({'user_id': pd.Series.nunique, 'order_id': pd.Series.nunique})
    .rename(columns={'user_id': 'prod_nruser', 'order_id': 'prod_nrorder'})
)

In [9]:
# (product, user, accumulated days) rows, ordered by user, product, then time.
p3 = (
    product[['product_id', 'user_id', 'interval_accu']]
    .sort_values(by=['user_id', 'product_id', 'interval_accu'])
)

In [12]:
def avginterval(x):
    """Return the mean gap between consecutive values of `x`.

    Computed as (max - min) / (n - 1). A single-element input has no gap and
    yields np.inf, which callers use as a "never repeated" marker.
    """
    n = x.shape[0]
    if n == 1:
        return np.inf
    return (x.max() - x.min()) / (n - 1)
# Per (user, product): average interval_accu gap between purchases
# (np.inf when the pair occurs only once).
# Named aggregation replaces the dict-renaming agg removed in pandas 1.0.
up1 = p3.groupby(by=['user_id', 'product_id'])['interval_accu'].agg(
    user_prod_days_interval=avginterval,
)

In [14]:
# Drop the (user, product) pairs marked np.inf and turn the group keys back
# into ordinary columns.
up1 = up1.loc[up1['user_prod_days_interval'] != np.inf].reset_index()

In [20]:
# Per product: mean of the per-(user, product) day intervals.
# Named aggregation replaces the dict-renaming agg removed in pandas 1.0.
p3 = up1.groupby(by='product_id')['user_prod_days_interval'].agg(
    prod_days_interval_avg='mean',
)

In [23]:
# Per (product, user): average gap, measured in order numbers, between
# purchases (np.inf when the pair occurs only once).
# Named aggregation replaces the dict-renaming agg removed in pandas 1.0.
up2 = (
    product[['product_id', 'user_id', 'order_number']]
    .sort_values(by=['user_id', 'product_id', 'order_number'])
    .groupby(by=['product_id', 'user_id'])['order_number']
    .agg(user_prod_order_interval=avginterval)
    .reset_index()
)

In [26]:
# Keep only the pairs whose interval is not the np.inf marker.
up2 = up2.loc[up2['user_prod_order_interval'] != np.inf]

In [27]:
# Per product: mean of the per-(user, product) order-number intervals.
# Named aggregation replaces the dict-renaming agg removed in pandas 1.0.
p4 = up2.groupby(by='product_id')['user_prod_order_interval'].agg(
    prod_order_interval_avg='mean',
)

In [28]:
# Combine the per-product tables on their product_id index. Left merges keep
# every product from p1; products missing from p2/p3/p4 get NaN.
product_feature = (
    p1.merge(p2, how='left', left_index=True, right_index=True)
    .merge(p3, how='left', left_index=True, right_index=True)
    .merge(p4, how='left', left_index=True, right_index=True)
)

In [37]:
# Largest observed average order interval; the recorded value (94.0) is below
# the 100 used as the missing-value fill for this column further down.
product_feature.prod_order_interval_avg.max()

94.0

In [40]:
# Fill the gaps left by the left merges: zero reorder counts, and sentinel
# values for the two interval averages.
product_feature = product_feature.fillna({
    'prod_nruser': 0,
    'prod_nrorder': 0,
    'prod_days_interval_avg': 365,
    'prod_order_interval_avg': 100,
})

In [44]:
# Share of a product's users who reordered it.
product_feature['prod_ruser_ratio'] = product_feature['prod_nruser'].div(product_feature['prod_nuser'])
# Share of a product's orders that are reorders.
product_feature['prod_rorder_ratio'] = product_feature['prod_nrorder'].div(product_feature['prod_norder'])
# Reorders per reordering user; NaN results are replaced by 0.
product_feature['prod_rorder_per_ruser'] = (
    product_feature['prod_nrorder'].div(product_feature['prod_nruser']).fillna(0)
)
# Orders per user.
product_feature['prod_order_per_user'] = product_feature['prod_norder'].div(product_feature['prod_nuser'])

In [50]:
# Preview the product features.
# NOTE(review): the recorded output already contains prod_second_order_ratio,
# which is only merged in by a later cell - the cells were run out of order.
product_feature.head()

Unnamed: 0_level_0,prod_nuser,prod_norder,prod_nruser,prod_nrorder,prod_days_interval_avg,prod_order_interval_avg,prod_ruser_ratio,prod_rorder_ratio,prod_rorder_per_ruser,prod_order_per_user,prod_second_order_ratio
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,716,1852,276.0,1136.0,46.261837,5.244687,0.385475,0.613391,4.115942,2.586592,0.385475
2,78,90,8.0,12.0,53.849998,8.75,0.102564,0.133333,1.5,1.153846,0.102564
3,74,277,36.0,203.0,26.942787,3.561093,0.486486,0.732852,5.638889,3.743243,0.486486
4,182,329,64.0,147.0,37.590836,3.236756,0.351648,0.446809,2.296875,1.807692,0.351648
5,6,15,4.0,9.0,55.583332,3.541667,0.666667,0.6,2.25,2.5,0.666667


In [46]:
def second_order_ratio(x):
    """Return (number of values equal to 1) / (number of values equal to 0).

    `x` is a pandas Series; values other than 0 and 1 are ignored.
    """
    zeros = x[x == 0].count()
    ones = x[x == 1].count()
    return ones / zeros
# Per product: ratio of order_time == 1 rows to order_time == 0 rows.
# Named aggregation replaces the dict-renaming agg removed in pandas 1.0.
p5 = product.groupby(by='product_id')['order_time'].agg(
    prod_second_order_ratio=second_order_ratio,
)

In [49]:
# Attach the second-order ratio to the product table (index-on-index inner merge).
product_feature = pd.merge(product_feature, p5, left_index=True, right_index=True)

In [51]:
# Release the intermediate per-product tables, then persist the result.
# `key` is passed by keyword because positional arguments other than the path
# are deprecated in DataFrame.to_hdf (pandas >= 2.1).
del p1, p2, p3, p4, p5
product_feature.to_hdf("/mnt/d/Data/Instacart/dataset.hdf", key="product_feature")

In [56]:
# prod-user feature
# Per (product, user): number of rows for the pair and whether any of them is
# flagged as a reorder (max of the 0/1 `reordered` column).
# Named aggregation replaces the dict-renaming agg removed in pandas 1.0.
up3 = product.groupby(by=['product_id', 'user_id'])['reordered'].agg(
    user_prod_norder='count',
    user_prod_reordered='max',
)

In [57]:
# Preview the (product, user) aggregates. The original cell referenced `up`,
# a name that is never assigned in this notebook; the recorded output (five
# rows indexed by product_id/user_id with the two up3 columns) is the head of
# `up3`.
up3.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,user_prod_reordered,user_prod_norder
product_id,user_id,Unnamed: 2_level_1,Unnamed: 3_level_1
1,138,1,2
1,709,0,1
1,764,1,2
1,777,0,1
1,825,0,1


In [101]:
# Highest order_number per user.
ut = product.groupby(by='user_id')[['order_number']].max()
# Rows with order_time == 0 for each (user, product) pair, with order_number
# relabelled as first_order_number.
up4 = (
    product.loc[product.order_time == 0, ['user_id', 'product_id', 'order_number']]
    .rename(columns={'order_number': 'first_order_number'})
)

In [102]:
# Bring each user's highest order_number next to the first-purchase rows.
up4 = up4.merge(ut, left_on='user_id', right_index=True)

In [103]:
# True when the first purchase of the product falls in the order carrying the
# user's highest order_number.
up4['user_prod_recentlydiscovered'] = up4['first_order_number'].eq(up4['order_number'])

In [107]:
# The two helper columns were only needed to derive the flag.
up4 = up4.drop(columns=['first_order_number', 'order_number'])

In [109]:
# Assemble the (product, user) feature table. up1/up2 are left-merged, so
# pairs without an interval get NaN there; up4 is inner-merged.
# `np.int` was only an alias of the builtin `int`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so the builtin is used for the cast.
user_product_feature = (
    up3.reset_index()
    .merge(up1, how='left', on=['product_id', 'user_id'])
    .merge(up2, how='left', on=['product_id', 'user_id'])
    .merge(up4.astype(int), on=['product_id', 'user_id'])
)

In [114]:
# Sentinel values for pairs that have no interval after the left merges.
user_product_feature = user_product_feature.fillna({
    'user_prod_days_interval': 366,
    'user_prod_order_interval': 100,
})

In [115]:
# The intermediate (user, product) tables are merged into
# user_product_feature; drop the names.
del up1, up2, up3, up4

In [116]:
# NOTE(review): the recorded run of this cell raised NameError because
# `user_feature` was not in the kernel at that point; the next cell reloads it
# from the HDF store.
user_feature.head()

NameError: name 'user_feature' is not defined

In [117]:
# Load the user features saved earlier back from the HDF store and preview them.
user_feature = pd.read_hdf("/mnt/d/Data/Instacart/dataset.hdf", key="user_feature")
user_feature.head()

Unnamed: 0_level_0,user_nitems,user_norder,user_ndistinctitems,user_nrdistinctitems,user_nritems,user_interval,user_nritem_ratio,user_nrdistinctitem_ratio,user_nitem_per_order,user_nritem_per_order,user_nritem_per_order_ratio,user_second_order_rate
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,59,10,18,10.0,41.0,19.555555,0.694915,0.555556,5.9,4.555556,0.772128,0.555556
2,195,14,102,37.0,93.0,15.230769,0.476923,0.362745,13.928571,7.153846,0.513609,0.362745
3,88,12,33,19.0,55.0,12.090909,0.625,0.575758,7.333333,5.0,0.681818,0.575758
4,18,5,17,1.0,1.0,13.75,0.055556,0.058824,3.6,0.25,0.069444,0.058824
5,37,4,23,8.0,14.0,13.333333,0.378378,0.347826,9.25,4.666667,0.504505,0.347826


In [118]:
# Temporarily attach two per-user columns needed for the rate features.
user_product_feature = pd.merge(
    user_product_feature,
    user_feature[['user_interval', 'user_norder']],
    left_on='user_id',
    right_index=True,
)

In [119]:
# Pair-level order count relative to the user's total number of orders.
user_product_feature['user_prod_norder_rate'] = user_product_feature['user_prod_norder'].div(
    user_product_feature['user_norder']
)
# Pair-level day interval relative to the user's mean interval between orders.
user_product_feature['user_prod_days_interval_rate'] = user_product_feature['user_prod_days_interval'].div(
    user_product_feature['user_interval']
)

In [120]:
# Remove the per-user helper columns again.
user_product_feature = user_product_feature.drop(columns=['user_interval', 'user_norder'])

In [121]:
# Persist the (user, product) features. `key` is passed by keyword because
# positional arguments other than the path are deprecated in
# DataFrame.to_hdf (pandas >= 2.1).
user_product_feature.to_hdf("/mnt/d/Data/Instacart/dataset.hdf", key="user_product_feature")

In [3]:
# hour and week
# Reload the raw tables, this time keeping day-of-week and hour-of-day instead
# of days_since_prior_order.
priors = pd.read_csv(
    data + 'order_products__prior.csv',
    usecols=['order_id', 'product_id', 'reordered'],
    dtype={
        'order_id': np.int32,
        'product_id': np.uint16,
        'add_to_cart_order': np.int16,
        'reordered': np.int8,
    },
    engine='c',
)

orders = pd.read_csv(
    data + 'orders.csv',
    usecols=['order_id', 'user_id', 'order_number', 'order_dow', 'order_hour_of_day'],
    dtype={
        'order_id': np.int32,
        'user_id': np.int32,
        'eval_set': 'category',
        'order_number': np.int16,
        'order_dow': np.int8,
        'order_hour_of_day': np.int8,
        'days_since_prior_order': np.float32,
    },
    engine='c',
)

# One row per (order, product), ordered by user, then order number, then product.
product = pd.merge(priors, orders, on='order_id')
product = product.sort_values(by=['user_id', 'order_number', 'product_id']).reset_index(drop=True)

In [4]:
# Preview the reloaded table with the day-of-week / hour-of-day columns.
product.head()

Unnamed: 0,order_id,product_id,reordered,user_id,order_number,order_dow,order_hour_of_day
0,2539329,196,0,1,1,2,8
1,2539329,12427,0,1,1,2,8
2,2539329,14084,0,1,1,2,8
3,2539329,26088,0,1,1,2,8
4,2539329,26405,0,1,1,2,8


In [5]:
# Row counts per (product, hour of day).
t1 = (
    product.groupby(by=['product_id', 'order_hour_of_day'])['order_id']
    .count()
    .reset_index(name='hour_cnt')
)
# Marginal totals per product and per hour. Named aggregation replaces the
# dict-renaming agg removed in pandas 1.0.
t11 = t1.groupby(by='product_id')['hour_cnt'].agg(prod_hour_cnt='sum').reset_index()
t12 = t1.groupby(by='order_hour_of_day')['hour_cnt'].agg(hour_prod_cnt='sum').reset_index()
t1 = t1.merge(t11, on='product_id').merge(t12, on='order_hour_of_day')
# Share of the product's rows that fall in this hour.
t1['prod_hour_prob'] = t1.hour_cnt / t1.prod_hour_cnt
# Share of this hour's rows that belong to the product.
t1['hour_prod_prob'] = t1.hour_cnt / t1.hour_prod_cnt
t1 = t1.drop(columns=['hour_cnt', 'prod_hour_cnt', 'hour_prod_cnt'])
del t11, t12

In [6]:
# Row counts per (product, day of week).
t2 = (
    product.groupby(by=['product_id', 'order_dow'])['order_id']
    .count()
    .reset_index(name='week_cnt')
)
# Marginal totals per product and per day of week. Named aggregation replaces
# the dict-renaming agg removed in pandas 1.0.
t21 = t2.groupby(by='product_id')['week_cnt'].agg(prod_week_cnt='sum').reset_index()
t22 = t2.groupby(by='order_dow')['week_cnt'].agg(week_prod_cnt='sum').reset_index()
t2 = t2.merge(t21, on='product_id').merge(t22, on='order_dow')
# Share of the product's rows that fall on this day of week.
t2['prod_week_prob'] = t2.week_cnt / t2.prod_week_cnt
# Share of this day's rows that belong to the product.
t2['week_prod_prob'] = t2.week_cnt / t2.week_prod_cnt
# The original dropped 'prob_week_cnt' (typo), a column that does not exist,
# which makes DataFrame.drop raise KeyError; the helper column is
# 'prod_week_cnt'.
t2 = t2.drop(columns=['week_cnt', 'prod_week_cnt', 'week_prod_cnt'])
del t22, t21

In [7]:
# Reordered rows per (user, hour of day). Named aggregation replaces the
# dict-renaming agg removed in pandas 1.0.
t3 = (
    product.loc[product.reordered == 1, ['user_id', 'order_hour_of_day', 'order_id']]
    .groupby(by=['user_id', 'order_hour_of_day'])['order_id']
    .agg(user_hour_count='count')
    .reset_index()
)
# Reordered rows per user (the denominator).
t31 = (
    product.loc[product.reordered == 1, ['user_id', 'order_hour_of_day', 'order_id']]
    .groupby(by=['user_id'])['order_id']
    .agg(user_count='count')
    .reset_index()
)
t3 = t3.merge(t31, on='user_id')
# Share of the user's reordered rows that fall in this hour.
t3['hour_user_reorder_prob'] = t3.user_hour_count / t3.user_count
t3 = t3.drop(columns=['user_hour_count', 'user_count'])

In [8]:
# Reordered rows per (user, day of week). Named aggregation replaces the
# dict-renaming agg removed in pandas 1.0.
t4 = (
    product.loc[product.reordered == 1, ['user_id', 'order_dow', 'order_id']]
    .groupby(by=['user_id', 'order_dow'])['order_id']
    .agg(user_week_count='count')
    .reset_index()
)
# Reordered rows per user (the denominator).
t41 = (
    product.loc[product.reordered == 1, ['user_id', 'order_dow', 'order_id']]
    .groupby(by=['user_id'])['order_id']
    .agg(user_count='count')
    .reset_index()
)
t4 = t4.merge(t41, on='user_id')
# Share of the user's reordered rows that fall on this day of week.
t4['week_user_reorder_prob'] = t4.user_week_count / t4.user_count
t4 = t4.drop(columns=['user_week_count', 'user_count'])