In [12]:
import numpy as np
import pandas as pd

data = "/mnt/d/Data/Instacart/"

priors = pd.read_csv(data + 'order_products__prior.csv', dtype={
            'order_id': np.int32,
            'product_id': np.uint16,
            'add_to_cart_order': np.int16,
            'reordered': np.int8}, usecols = ['order_id', 'product_id', 'reordered'], engine='c')

orders = pd.read_csv(data + 'orders.csv', dtype={
        'order_id': np.int32,
        'user_id': np.int32,
        'eval_set': 'category',
        'order_number': np.int16,
        'order_dow': np.int8,
        'order_hour_of_day': np.int8,
        'days_since_prior_order': np.float32}, usecols = ['order_id', 'user_id', 'order_number', 'days_since_prior_order'], engine='c')
orders['interval_accu'] = orders.groupby(by='user_id')['days_since_prior_order'].cumsum().fillna(0)

product = pd.merge(priors, orders, on='order_id').sort_values(by=['user_id', 'order_number', 'product_id']).reset_index(drop=True)
product['order_time']=product.groupby(by=['product_id', 'user_id']).cumcount()

In [22]:
product.head()

Unnamed: 0,order_id,product_id,reordered,user_id,order_number,days_since_prior_order,interval_accu,order_time
0,2539329,196,0,1,1,,0.0,0
1,2539329,12427,0,1,1,,0.0,0
2,2539329,14084,0,1,1,,0.0,0
3,2539329,26088,0,1,1,,0.0,0
4,2539329,26405,0,1,1,,0.0,0


In [3]:
# user features
u1 = product[['user_id', 'order_id']].groupby(by='user_id')['order_id'].agg({'user_norder': pd.Series.nunique, 'user_nitems': 'count'})

In [9]:
u2 = product[['user_id', 'product_id']].groupby(by='user_id')['product_id'].agg({
    'user_ndistinctitems': pd.Series.nunique
})
u3 = product[['user_id', 'product_id']][product.reordered==1].groupby(by='user_id')['product_id'].agg({
    'user_nritems': 'count',
    'user_nrdistinctitems': pd.Series.nunique
})

In [16]:
u4 = product[['user_id', 'order_id', 'days_since_prior_order']][~product.days_since_prior_order.isnull()].drop_duplicates().\
drop(['order_id'], axis=1).groupby(by='user_id')['days_since_prior_order'].agg({
    'user_interval': 'mean'
})

In [22]:
user_feature = u1.merge(u2, left_index=True, right_index=True).\
merge(u3, how='left', left_index=True, right_index=True).\
merge(u4, left_index=True, right_index=True)

In [24]:
del u1, u2, u3, u4
user_feature.head()

Unnamed: 0_level_0,user_nitems,user_norder,user_ndistinctitems,user_nrdistinctitems,user_nritems,user_interval
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,59,10,18,10.0,41.0,19.555555
2,195,14,102,37.0,93.0,15.230769
3,88,12,33,19.0,55.0,12.090909
4,18,5,17,1.0,1.0,13.75
5,37,4,23,8.0,14.0,13.333333


In [26]:
user_feature = user_feature.fillna(0)

In [35]:
user_feature['user_nritem_ratio'] = user_feature.user_nritems/user_feature.user_nitems
user_feature['user_nrdistinctitem_ratio'] = user_feature.user_nrdistinctitems/user_feature.user_ndistinctitems
user_feature['user_nitem_per_order'] = user_feature.user_nitems/user_feature.user_norder
user_feature['user_nritem_per_order'] = user_feature.user_nritems/(user_feature.user_norder-1)
user_feature['user_nritem_per_order_ratio'] = user_feature.user_nritem_per_order/user_feature.user_nitem_per_order

In [36]:
user_feature.head()

Unnamed: 0_level_0,user_nitems,user_norder,user_ndistinctitems,user_nrdistinctitems,user_nritems,user_interval,user_nritem_ratio,user_nrdistinctitem_ratio,user_nitem_per_order,user_nritem_per_order,user_nritem_per_order_ratio
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,59,10,18,10.0,41.0,19.555555,0.694915,0.555556,5.9,4.555556,0.772128
2,195,14,102,37.0,93.0,15.230769,0.476923,0.362745,13.928571,7.153846,0.513609
3,88,12,33,19.0,55.0,12.090909,0.625,0.575758,7.333333,5.0,0.681818
4,18,5,17,1.0,1.0,13.75,0.055556,0.058824,3.6,0.25,0.069444
5,37,4,23,8.0,14.0,13.333333,0.378378,0.347826,9.25,4.666667,0.504505


In [37]:
product['order_time']=product.groupby(by=['product_id', 'user_id']).cumcount()
second_order_ratio = lambda x: x[x==1].count()/x[x==0].count()

p1 = product[['product_id', 'order_time']].groupby(by='product_id')['order_time'].agg({"prod_norder_second_order_rate": second_order_ratio})

In [39]:
u5 = product[['user_id', 'order_time']].groupby(by='user_id')['order_time'].agg({"user_second_order_rate": second_order_ratio})

In [42]:
user_feature = user_feature.merge(u5, left_index=True, right_index=True)

In [43]:
user_feature.to_hdf("/mnt/d/Data/Instacart/dataset.hdf", "user_feature")

In [6]:
# product features
p1 = product[['product_id', 'user_id', 'order_id']].groupby(by='product_id').agg({'user_id': pd.Series.nunique, 'order_id': pd.Series.nunique}).\
rename(columns={'user_id': 'prod_nuser', 'order_id': 'prod_norder'})
p2 = product[['product_id', 'user_id', 'order_id']][product.reordered==1].groupby(by='product_id').agg({'user_id': pd.Series.nunique, 'order_id': pd.Series.nunique}).\
rename(columns={'user_id': 'prod_nruser', 'order_id': 'prod_nrorder'})

In [9]:
p3 = product[['product_id', 'user_id', 'interval_accu']].sort_values(by=['user_id', 'product_id', 'interval_accu'])

In [12]:
avginterval = lambda x: np.inf if x.shape[0]==1 else (x.max()-x.min())/(x.shape[0]-1)
up1 = p3.groupby(by=['user_id', 'product_id'])['interval_accu'].agg({'user_prod_days_interval': avginterval})

In [14]:
up1 = up1[up1.user_prod_days_interval!=np.inf].reset_index()

In [20]:
p3 = up1[['product_id', 'user_prod_days_interval']].groupby(by='product_id')['user_prod_days_interval'].agg({'prod_days_interval_avg': 'mean'})

In [23]:
up2 = product[['product_id', 'user_id', 'order_number']].sort_values(by=['user_id', 'product_id', 'order_number'])
up2 = up2.groupby(by=['product_id', 'user_id'])['order_number'].agg({'user_prod_order_interval': avginterval}).reset_index()

In [26]:
up2 = up2[up2.user_prod_order_interval!=np.inf]

In [27]:
p4 = up2[['product_id', 'user_prod_order_interval']].groupby(by='product_id')['user_prod_order_interval'].agg({'prod_order_interval_avg': 'mean'})

In [28]:
product_feature = p1.merge(p2, how='left', left_index=True, right_index=True).\
merge(p3, how='left', left_index=True, right_index=True).\
merge(p4, how='left', left_index=True, right_index=True)

In [37]:
product_feature.prod_order_interval_avg.max()

94.0

In [40]:
product_feature.prod_nruser = product_feature.prod_nruser.fillna(0)
product_feature.prod_nrorder = product_feature.prod_nrorder.fillna(0)
product_feature.prod_days_interval_avg = product_feature.prod_days_interval_avg.fillna(365)
product_feature.prod_order_interval_avg = product_feature.prod_order_interval_avg.fillna(100)

In [44]:
product_feature['prod_ruser_ratio'] = product_feature.prod_nruser/product_feature.prod_nuser
product_feature['prod_rorder_ratio'] = product_feature.prod_nrorder/product_feature.prod_norder
product_feature['prod_rorder_per_ruser'] = product_feature.prod_nrorder/product_feature.prod_nruser
product_feature['prod_order_per_user'] = product_feature.prod_norder/product_feature.prod_nuser
product_feature.prod_rorder_per_ruser = product_feature.prod_rorder_per_ruser.fillna(0)

In [50]:
product_feature.head()

Unnamed: 0_level_0,prod_nuser,prod_norder,prod_nruser,prod_nrorder,prod_days_interval_avg,prod_order_interval_avg,prod_ruser_ratio,prod_rorder_ratio,prod_rorder_per_ruser,prod_order_per_user,prod_second_order_ratio
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,716,1852,276.0,1136.0,46.261837,5.244687,0.385475,0.613391,4.115942,2.586592,0.385475
2,78,90,8.0,12.0,53.849998,8.75,0.102564,0.133333,1.5,1.153846,0.102564
3,74,277,36.0,203.0,26.942787,3.561093,0.486486,0.732852,5.638889,3.743243,0.486486
4,182,329,64.0,147.0,37.590836,3.236756,0.351648,0.446809,2.296875,1.807692,0.351648
5,6,15,4.0,9.0,55.583332,3.541667,0.666667,0.6,2.25,2.5,0.666667


In [46]:
second_order_ratio = lambda x: x[x==1].count()/x[x==0].count()
p5 = product[['product_id', 'order_time']].groupby(by='product_id')['order_time'].agg({"prod_second_order_ratio": second_order_ratio})

In [49]:
product_feature = product_feature.merge(p5, left_index=True, right_index=True)

In [51]:
del p1, p2, p3, p4, p5
product_feature.to_hdf("/mnt/d/Data/Instacart/dataset.hdf", "product_feature")

In [56]:
# prod-user feature
up3 = product[['user_id', 'product_id', 'reordered']].groupby(by=['product_id', 'user_id'])['reordered'].agg({
    'user_prod_norder': 'count',
    'user_prod_reordered': 'max'
})

In [57]:
up

Unnamed: 0_level_0,Unnamed: 1_level_0,user_prod_reordered,user_prod_norder
product_id,user_id,Unnamed: 2_level_1,Unnamed: 3_level_1
1,138,1,2
1,709,0,1
1,764,1,2
1,777,0,1
1,825,0,1


In [101]:
ut = product[['user_id', 'order_number']].groupby(by='user_id').agg({'order_number': 'max'})
up4 = product[['user_id', 'product_id', 'order_number']][product.order_time==0].rename(columns={'order_number': 'first_order_number'})

In [102]:
up4 = pd.merge(up4, ut, left_on='user_id', right_index=True)

In [103]:
up4['user_prod_recentlydiscovered'] = pd.Series(up4.first_order_number==up4.order_number)

In [107]:
up4.drop(['first_order_number', 'order_number'], axis=1, inplace=True)

In [109]:
user_product_feature = up3.reset_index().merge(up1, how='left', on=['product_id', 'user_id']).\
merge(up2, how='left', on=['product_id', 'user_id']).\
merge(up4.astype(np.int), on=['product_id', 'user_id'])

In [114]:
user_product_feature.user_prod_days_interval = user_product_feature.user_prod_days_interval.fillna(366)
user_product_feature.user_prod_order_interval = user_product_feature.user_prod_order_interval.fillna(100)

In [115]:
del up1, up2, up3, up4

In [116]:
user_feature.head()

NameError: name 'user_feature' is not defined

In [117]:
user_feature = pd.read_hdf("/mnt/d/Data/Instacart/dataset.hdf", "user_feature")
user_feature.head()

Unnamed: 0_level_0,user_nitems,user_norder,user_ndistinctitems,user_nrdistinctitems,user_nritems,user_interval,user_nritem_ratio,user_nrdistinctitem_ratio,user_nitem_per_order,user_nritem_per_order,user_nritem_per_order_ratio,user_second_order_rate
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,59,10,18,10.0,41.0,19.555555,0.694915,0.555556,5.9,4.555556,0.772128,0.555556
2,195,14,102,37.0,93.0,15.230769,0.476923,0.362745,13.928571,7.153846,0.513609,0.362745
3,88,12,33,19.0,55.0,12.090909,0.625,0.575758,7.333333,5.0,0.681818,0.575758
4,18,5,17,1.0,1.0,13.75,0.055556,0.058824,3.6,0.25,0.069444,0.058824
5,37,4,23,8.0,14.0,13.333333,0.378378,0.347826,9.25,4.666667,0.504505,0.347826


In [118]:
user_product_feature = user_product_feature.merge(user_feature[['user_interval', 'user_norder']], left_on='user_id',
                                                 right_index=True)

In [119]:
user_product_feature['user_prod_norder_rate'] = user_product_feature.user_prod_norder/user_product_feature.user_norder
user_product_feature['user_prod_days_interval_rate'] = user_product_feature.user_prod_days_interval/user_product_feature.user_interval

In [120]:
user_product_feature.drop(['user_interval', 'user_norder'], axis=1, inplace=True)

In [121]:
user_product_feature.to_hdf("/mnt/d/Data/Instacart/dataset.hdf", "user_product_feature")

In [41]:
# hour and week
priors = pd.read_csv(data + 'order_products__prior.csv', dtype={
            'order_id': np.int32,
            'product_id': np.uint16,
            'add_to_cart_order': np.int16,
            'reordered': np.int8}, usecols = ['order_id', 'product_id', 'reordered'], engine='c')

orders = pd.read_csv(data + 'orders.csv', dtype={
        'order_id': np.int32,
        'user_id': np.int32,
        'eval_set': 'category',
        'order_number': np.int16,
        'order_dow': np.int8,
        'order_hour_of_day': np.int8,
        'days_since_prior_order': np.float32}, usecols = ['order_id', 'user_id', 'order_number', 'order_dow', 'order_hour_of_day'], engine='c')

product = pd.merge(priors, orders, on='order_id').sort_values(by=['user_id', 'order_number', 'product_id']).reset_index(drop=True)

In [4]:
product.head()

Unnamed: 0,order_id,product_id,reordered,user_id,order_number,order_dow,order_hour_of_day
0,2539329,196,0,1,1,2,8
1,2539329,12427,0,1,1,2,8
2,2539329,14084,0,1,1,2,8
3,2539329,26088,0,1,1,2,8
4,2539329,26405,0,1,1,2,8


In [42]:
t1 = product[['product_id', 'order_hour_of_day', 'order_id']].groupby(by=['product_id', 'order_hour_of_day']).agg('count').reset_index()
t1 = t1.rename(columns={'order_id': 'hour_cnt'})
t11 = t1.groupby(by='product_id')['hour_cnt'].agg({'prod_hour_cnt': 'sum'}).reset_index()
t12 = t1.groupby(by='order_hour_of_day')['hour_cnt'].agg({'hour_prod_cnt': 'sum'}).reset_index()
t1 = t1.merge(t11, on='product_id').merge(t12, on='order_hour_of_day')
t1['prod_hour_prob']=t1.hour_cnt/t1.prod_hour_cnt
t1['hour_prod_prob']=t1.hour_cnt/t1.hour_prod_cnt
t1.drop(['hour_cnt', 'prod_hour_cnt', 'hour_prod_cnt'], axis=1, inplace=True)
del t11, t12

In [43]:
t2 = product[['product_id', 'order_dow', 'order_id']].groupby(by=['product_id', 'order_dow']).agg('count').reset_index()
t2 = t2.rename(columns={'order_id': 'week_cnt'})
t21 = t2.groupby(by='product_id')['week_cnt'].agg({'prod_week_cnt': 'sum'}).reset_index()
t22 = t2.groupby(by='order_dow')['week_cnt'].agg({'week_prod_cnt': 'sum'}).reset_index()
t2 = t2.merge(t21, on='product_id').merge(t22, on='order_dow')
t2['prod_week_prob']=t2.week_cnt/t2.prod_week_cnt
t2['week_prod_prob']=t2.week_cnt/t2.week_prod_cnt
t2.drop(['week_cnt', 'prod_week_cnt', 'week_prod_cnt'], axis=1, inplace=True)
del t22, t21

In [44]:
t3 = product[['user_id', 'order_hour_of_day', 'order_id']][product.reordered==1].\
groupby(by=['user_id', 'order_hour_of_day'])['order_id'].\
agg({'user_hour_count': 'count'}).reset_index()
t31 = product[['user_id', 'order_hour_of_day', 'order_id']][product.reordered==1].\
groupby(by=['user_id'])['order_id'].\
agg({'user_count': 'count'}).reset_index()
t3 = t3.merge(t31, on='user_id')
t3['hour_user_reorder_prob'] = t3.user_hour_count/t3.user_count
t3.drop(['user_hour_count', 'user_count'], axis=1, inplace=True)

In [45]:
t4 = product[['user_id', 'order_dow', 'order_id']][product.reordered==1].\
groupby(by=['user_id', 'order_dow'])['order_id'].\
agg({'user_week_count': 'count'}).reset_index()
t41 = product[['user_id', 'order_dow', 'order_id']][product.reordered==1].\
groupby(by=['user_id'])['order_id'].\
agg({'user_count': 'count'}).reset_index()
t4 = t4.merge(t41, on='user_id')
t4['week_user_reorder_prob'] = t4.user_week_count/t4.user_count
t4.drop(['user_week_count', 'user_count'], axis=1, inplace=True)

In [46]:
t5 = product[['product_id', 'order_hour_of_day', 'order_id']][product.reordered==1].\
groupby(by=['product_id', 'order_hour_of_day'])['order_id'].\
agg({'prod_hour_count': 'count'}).reset_index()
t51 = product[['product_id', 'order_hour_of_day', 'order_id']][product.reordered==1].\
groupby(by=['product_id'])['order_id'].\
agg({'prod_count': 'count'}).reset_index()
t5 = t5.merge(t51, on='product_id')
t5['hour_prod_reorder_prob'] = t5.prod_hour_count/t5.prod_count
t5.drop(['prod_hour_count', 'prod_count'], axis=1, inplace=True)

In [47]:
t6 = product[['product_id', 'order_dow', 'order_id']][product.reordered==1].\
groupby(by=['product_id', 'order_dow'])['order_id'].\
agg({'prod_week_count': 'count'}).reset_index()
t61 = product[['product_id', 'order_dow', 'order_id']][product.reordered==1].\
groupby(by=['product_id'])['order_id'].\
agg({'prod_count': 'count'}).reset_index()
t6 = t6.merge(t61, on='product_id')
t6['week_prod_reorder_prob'] = t6.prod_week_count/t6.prod_count
t6.drop(['prod_week_count', 'prod_count'], axis=1, inplace=True)

In [48]:
t7 = product[['user_id', 'product_id', 'order_hour_of_day', 'order_id']][product.reordered==1].\
groupby(by=['user_id', 'product_id', 'order_hour_of_day'])['order_id'].\
agg({'user_prod_hour_count': 'count'}).reset_index()
t71 = product[['user_id', 'product_id', 'order_hour_of_day', 'order_id']][product.reordered==1].\
groupby(by=['user_id', 'product_id'])['order_id'].\
agg({'user_prod_count': 'count'}).reset_index()
t7 = t7.merge(t71, on=['user_id', 'product_id'])
t7['hour_user_prod_reorder_prob'] = t7.user_prod_hour_count/t7.user_prod_count
t7.drop(['user_prod_hour_count', 'user_prod_count'], axis=1, inplace=True)

In [49]:
t8 = product[['user_id', 'product_id', 'order_dow', 'order_id']][product.reordered==1].\
groupby(by=['user_id', 'product_id', 'order_dow'])['order_id'].\
agg({'user_prod_week_count': 'count'}).reset_index()
t81 = product[['user_id', 'product_id', 'order_dow', 'order_id']][product.reordered==1].\
groupby(by=['user_id', 'product_id'])['order_id'].\
agg({'user_prod_count': 'count'}).reset_index()
t8 = t8.merge(t81, on=['user_id', 'product_id'])
t8['week_user_prod_reorder_prob'] = t8.user_prod_week_count/t8.user_prod_count
t8.drop(['user_prod_week_count', 'user_prod_count'], axis=1, inplace=True)

In [16]:
t8.head()

Unnamed: 0,user_id,product_id,order_dow,week_user_prod_reorder_prob
0,1,196,1,0.333333
1,1,196,2,0.111111
2,1,196,3,0.222222
3,1,196,4,0.333333
4,1,10258,1,0.375


In [2]:
# asile and department
product_detail = pd.read_csv(data+"products.csv", dtype = {
    'product_id': np.uint16,
    'product_name': str,
    'aisle_id': np.uint8,
    'department_id': np.uint8
}, usecols = ['product_id', 'aisle_id', 'department_id'])

In [22]:
product_detail.head()

Unnamed: 0,product_id,aisle_id,department_id
0,1,61,19
1,2,104,13
2,3,94,7
3,4,38,1
4,5,5,13


In [23]:
product.head()

Unnamed: 0,order_id,product_id,reordered,user_id,order_number,order_dow,order_hour_of_day
0,2539329,196,0,1,1,2,8
1,2539329,12427,0,1,1,2,8
2,2539329,14084,0,1,1,2,8
3,2539329,26088,0,1,1,2,8
4,2539329,26405,0,1,1,2,8


In [23]:
d1 = pd.merge(product_detail, product[['product_id', 'order_id']][product.reordered==1], on='product_id').groupby(by=['product_id', 'aisle_id', 'department_id']).\
agg({'order_id': 'count'}).reset_index()

In [24]:
d11 = d1[['aisle_id', 'order_id']].groupby(by='aisle_id')['order_id'].agg({'aisle_count': 'sum'})
d12 = d1[['department_id', 'order_id']].groupby(by='department_id')['order_id'].agg({'department_count': 'sum'})

In [25]:
d1 = d1.merge(d11, left_on='aisle_id', right_index=True).merge(d12, left_on='department_id', right_index=True)
d1['prod_aisle_reorder_prob'] = d1.order_id/d1.aisle_count
d1['prod_department_reorder_prob'] = d1.order_id/d1.department_count

In [26]:
d1.drop(['order_id', 'aisle_count', 'department_count'], axis=1, inplace=True)

In [10]:
d = pd.merge(product_detail, product[['product_id', 'user_id', 'order_id']][product.reordered==1], on='product_id')

In [33]:
d1.head()

Unnamed: 0,product_id,aisle_id,department_id,prod_aisle_reorder_prob,prod_department_reorder_prob
0,1,61,19,0.008845,0.000685
74,78,61,19,2.3e-05,2e-06
95,102,61,19,0.000273,2.1e-05
159,172,61,19,0.000553,4.3e-05
265,285,61,19,0.000436,3.4e-05


In [17]:
d2 = d[['user_id', 'order_id']].groupby(by='user_id')['order_id'].agg({'user_rorder_cnt': 'count'}).reset_index()
d21 = d[['user_id', 'aisle_id', 'order_id']].groupby(by=['user_id', 'aisle_id'])['order_id'].agg({'aisle_count': 'count'}).reset_index()
d22 = d[['user_id', 'department_id', 'order_id']].groupby(by=['user_id', 'department_id'])['order_id'].agg({'department_count': 'count'}).reset_index()
d2 = d2.merge(d21, on='user_id').merge(d22, on='user_id')
d2['aisle_user_reorder_prob'] = d2.aisle_count/d2.user_rorder_cnt
d2['department_user_reorder_prob'] = d2.department_count/d2.user_rorder_cnt
d2.drop(['user_rorder_cnt', 'aisle_count', 'department_count'], axis=1, inplace=True)

In [18]:
d2

Unnamed: 0,user_id,aisle_id,department_id,aisle_user_reorder_prob,department_user_reorder_prob
0,1,21,4,0.170732,0.024390
1,1,21,7,0.170732,0.268293
2,1,21,14,0.170732,0.048780
3,1,21,16,0.170732,0.195122
4,1,21,17,0.170732,0.024390
5,1,21,19,0.170732,0.439024
6,1,23,4,0.243902,0.024390
7,1,23,7,0.243902,0.268293
8,1,23,14,0.243902,0.048780
9,1,23,16,0.243902,0.195122


In [27]:
# instant feature generation

In [37]:
train = pd.read_csv(data+"train.tsv", sep='\t', dtype={
    'order_id': np.int32,
    'user_id': np.int32,
    'product_id': np.uint16,
    'label': np.int8
})

In [32]:
train.shape

(8474661, 5)

In [38]:
train = train.merge(product_detail, on='product_id').merge(d1, how='left', on=['product_id', 'aisle_id', 'department_id']).fillna(0).\
merge(d2, how='left', on=['user_id', 'aisle_id', 'department_id']).fillna(0)

In [39]:
train.head()

Unnamed: 0,order_id,user_id,product_id,label,seed,aisle_id,department_id,prod_aisle_reorder_prob,prod_department_reorder_prob,aisle_user_reorder_prob,department_user_reorder_prob
0,1187899,1,196,1,1,77,7,0.121674,0.015809,0.268293,0.268293
1,1854765,21,196,0,1,77,7,0.121674,0.015809,0.019417,0.320388
2,1864787,43,196,0,1,77,7,0.121674,0.015809,0.016949,0.016949
3,1647290,52,196,0,4,77,7,0.121674,0.015809,0.211864,0.211864
4,2757217,67,196,1,4,77,7,0.121674,0.015809,0.310345,0.396552


In [53]:
orders = pd.read_csv(data + 'orders.csv', dtype={
        'order_id': np.int32,
        'user_id': np.int32,
        'eval_set': 'category',
        'order_number': np.int16,
        'order_dow': np.int8,
        'order_hour_of_day': np.int8,
        'days_since_prior_order': np.float32}, usecols = ['order_id', 'eval_set', 'order_dow', 'order_hour_of_day'], engine='c')
orders = orders[['order_id', 'order_dow', 'order_hour_of_day']][orders.eval_set=='train']

In [62]:
t8.head()

Unnamed: 0,user_id,product_id,order_dow,week_user_prod_reorder_prob
0,1,196,1,0.333333
1,1,196,2,0.111111
2,1,196,3,0.222222
3,1,196,4,0.333333
4,1,10258,1,0.375


In [63]:
train = train.merge(orders, on='order_id').merge(t1, how='left', on=['product_id', 'order_hour_of_day']).\
merge(t2, how='left', on=['product_id', 'order_dow']).\
merge(t3, how='left', on=['user_id', 'order_hour_of_day']).\
merge(t4, how='left', on=['user_id', 'order_dow']).\
merge(t5, how='left', on=['product_id', 'order_hour_of_day']).\
merge(t6, how='left', on=['product_id', 'order_dow']).fillna(0).\
merge(t7, how='left', on=['user_id', 'product_id', 'order_hour_of_day']).\
merge(t8, how='left', on=['user_id', 'product_id', 'order_dow']).fillna(0)

In [66]:
train.to_hdf("/mnt/d/Data/Instacart/dataset.hdf", "train")

In [67]:
#test

test = pd.read_csv(data+"test.tsv", sep='\t', dtype={
    'order_id': np.int32,
    'user_id': np.int32,
    'product_id': np.uint16
})
test = test.merge(product_detail, on='product_id').merge(d1, how='left', on=['product_id', 'aisle_id', 'department_id']).fillna(0).\
merge(d2, how='left', on=['user_id', 'aisle_id', 'department_id']).fillna(0)
orders = pd.read_csv(data + 'orders.csv', dtype={
        'order_id': np.int32,
        'user_id': np.int32,
        'eval_set': 'category',
        'order_number': np.int16,
        'order_dow': np.int8,
        'order_hour_of_day': np.int8,
        'days_since_prior_order': np.float32}, usecols = ['order_id', 'eval_set', 'order_dow', 'order_hour_of_day'], engine='c')
orders = orders[['order_id', 'order_dow', 'order_hour_of_day']][orders.eval_set=='test']
test = test.merge(orders, on='order_id').merge(t1, how='left', on=['product_id', 'order_hour_of_day']).\
merge(t2, how='left', on=['product_id', 'order_dow']).\
merge(t3, how='left', on=['user_id', 'order_hour_of_day']).\
merge(t4, how='left', on=['user_id', 'order_dow']).\
merge(t5, how='left', on=['product_id', 'order_hour_of_day']).\
merge(t6, how='left', on=['product_id', 'order_dow']).fillna(0).\
merge(t7, how='left', on=['user_id', 'product_id', 'order_hour_of_day']).\
merge(t8, how='left', on=['user_id', 'product_id', 'order_dow']).fillna(0)

In [69]:
test.to_hdf("/mnt/d/Data/Instacart/dataset.hdf", "test")

In [1]:
# prepare user prod feature
import numpy as np
import pandas as pd

data = "/mnt/d/Data/Instacart/"

priors = pd.read_csv(data + 'order_products__prior.csv', dtype={
            'order_id': np.int32,
            'product_id': np.uint16,
            'add_to_cart_order': np.int16,
            'reordered': np.int8}, usecols = ['order_id', 'product_id', 'reordered'], engine='c')

orders = pd.read_csv(data + 'orders.csv', dtype={
        'order_id': np.int32,
        'user_id': np.int32,
        'eval_set': 'category',
        'order_number': np.int16,
        'order_dow': np.int8,
        'order_hour_of_day': np.int8,
        'days_since_prior_order': np.float32}, usecols = ['order_id', 'user_id', 'order_number', 'days_since_prior_order'], engine='c')
orders['interval_accu'] = orders.groupby(by='user_id')['days_since_prior_order'].cumsum().fillna(0)

product = pd.merge(priors, orders, on='order_id').sort_values(by=['user_id', 'order_number', 'product_id']).reset_index(drop=True)
product['order_time']=product.groupby(by=['product_id', 'user_id']).cumcount()

In [12]:
product.head()

Unnamed: 0,order_id,product_id,reordered,user_id,order_number,days_since_prior_order,interval_accu,order_time
0,2539329,196,0,1,1,,0.0,0
1,2539329,12427,0,1,1,,0.0,0
2,2539329,14084,0,1,1,,0.0,0
3,2539329,26088,0,1,1,,0.0,0
4,2539329,26405,0,1,1,,0.0,0


In [2]:
upsp = product[['user_id', 'product_id', 'order_time']].groupby(by=['user_id', 'product_id']).agg({'order_time': 'max'}).reset_index()

In [3]:
upsp = pd.merge(upsp, product[['user_id', 'product_id', 'order_number', 'interval_accu', 'order_time']], on=['user_id', 'product_id', 'order_time'])

In [4]:
upsp = upsp.rename(columns={'order_number': 'last_order_number'}).drop('order_time', axis=1)

In [38]:
upsp.head()

Unnamed: 0,user_id,product_id,last_order_number,interval_accu
0,1,196,10,176.0
1,1,10258,10,176.0
2,1,10326,5,93.0
3,1,12427,10,176.0
4,1,13032,10,176.0


In [5]:
train = pd.read_hdf("/mnt/d/Data/Instacart/dataset.hdf", "train")

In [29]:
train.head()

Unnamed: 0,order_id,user_id,product_id,label,seed,aisle_id,department_id,prod_aisle_reorder_prob,prod_department_reorder_prob,aisle_user_reorder_prob,...,prod_hour_prob,hour_prod_prob,prod_week_prob,week_prod_prob,hour_user_reorder_prob,week_user_reorder_prob,hour_prod_reorder_prob,week_prod_reorder_prob,hour_user_prod_reorder_prob,week_user_prod_reorder_prob
0,1187899,1,196,1,1,77,7,0.121674,0.015809,0.268293,...,0.059652,0.001241,0.15258,0.001442,0.146341,0.390244,0.061531,0.151092,0.111111,0.333333
1,1187899,1,14084,0,1,91,16,0.029236,0.003563,0.0,...,0.055538,0.000515,0.117917,0.000496,0.146341,0.390244,0.05734,0.117697,0.0,0.0
2,1187899,1,12427,0,1,23,19,0.049554,0.002893,0.243902,...,0.065318,0.000246,0.130173,0.000223,0.146341,0.390244,0.071295,0.131957,0.111111,0.333333
3,1187899,1,26088,1,1,23,19,0.014049,0.00082,0.243902,...,0.049544,7.3e-05,0.129608,8.6e-05,0.146341,0.390244,0.054412,0.129412,0.0,0.0
4,1187899,1,26405,1,1,54,17,0.004178,0.001804,0.02439,...,0.066722,4.7e-05,0.147446,4.7e-05,0.146341,0.390244,0.067164,0.16791,0.0,1.0


In [6]:
orders = pd.read_csv(data + 'orders.csv', dtype={
        'order_id': np.int32,
        'user_id': np.int32,
        'eval_set': 'category',
        'order_number': np.int16,
        'order_dow': np.int8,
        'order_hour_of_day': np.int8,
        'days_since_prior_order': np.float32}, usecols = ['order_id', 'user_id', 'eval_set', 'order_number', 'days_since_prior_order'], engine='c')
orders['accu_interval'] = orders.groupby(by='user_id')['days_since_prior_order'].cumsum().fillna(0)

In [7]:
train = train.merge(orders[['order_id', 'user_id', 'order_number', 'accu_interval', 'days_since_prior_order']][orders.eval_set=='train'], on=['order_id', 'user_id'])

In [8]:
train = train.merge(upsp, on=['user_id', 'product_id'])
train['user_prod_lastorder_interval'] = train.order_number-train.last_order_number
train['user_prod_lastdays_interval'] = train.accu_interval-train.interval_accu

In [10]:
train.drop(['order_number', 'accu_interval', 'last_order_number', 'interval_accu'], axis=1, inplace=True)
train.rename(columns={'days_since_prior_order': 'user_lastorder_interval'}, inplace=True)

In [11]:
train.to_hdf("/mnt/d/Data/Instacart/dataset.hdf", "train")

In [12]:
test = pd.read_hdf("/mnt/d/Data/Instacart/dataset.hdf", "test")
test = test.merge(orders[['order_id', 'user_id', 'order_number', 'accu_interval', 'days_since_prior_order']][orders.eval_set=='test'], on=['order_id', 'user_id'])
test = test.merge(upsp, on=['user_id', 'product_id'])
test['user_prod_lastorder_interval'] = test.order_number-test.last_order_number
test['user_prod_lastdays_interval'] = test.accu_interval-test.interval_accu
test.drop(['order_number', 'accu_interval', 'last_order_number', 'interval_accu'], axis=1, inplace=True)
test.rename(columns={'days_since_prior_order': 'user_lastorder_interval'}, inplace=True)
test.to_hdf("/mnt/d/Data/Instacart/dataset.hdf", "test")

In [47]:
test.shape

(4833292, 23)

In [48]:
train.shape

(8474661, 25)

In [1]:
# generate some other features

import numpy as np
import pandas as pd

data = "/mnt/d/Data/Instacart/"

train = pd.read_hdf("/mnt/d/Data/Instacart/dataset.hdf", "train")
user_feature = pd.read_hdf("/mnt/d/Data/Instacart/dataset.hdf", "user_feature")
product_feature = pd.read_hdf("/mnt/d/Data/Instacart/dataset.hdf", "product_feature")
user_product_feature = pd.read_hdf("/mnt/d/Data/Instacart/dataset.hdf", "user_product_feature").reset_index()
train = train.merge(user_feature, left_on='user_id', right_index=True).merge(product_feature, left_on='product_id', right_index=True).merge(user_product_feature, on=['user_id', 'product_id'])

In [5]:
user_feature.head()

Unnamed: 0_level_0,user_nitems,user_norder,user_ndistinctitems,user_nrdistinctitems,user_nritems,user_interval,user_nritem_ratio,user_nrdistinctitem_ratio,user_nitem_per_order,user_nritem_per_order,user_nritem_per_order_ratio,user_second_order_rate
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,59,10,18,10.0,41.0,19.555555,0.694915,0.555556,5.9,4.555556,0.772128,0.555556
2,195,14,102,37.0,93.0,15.230769,0.476923,0.362745,13.928571,7.153846,0.513609,0.362745
3,88,12,33,19.0,55.0,12.090909,0.625,0.575758,7.333333,5.0,0.681818,0.575758
4,18,5,17,1.0,1.0,13.75,0.055556,0.058824,3.6,0.25,0.069444,0.058824
5,37,4,23,8.0,14.0,13.333333,0.378378,0.347826,9.25,4.666667,0.504505,0.347826


In [4]:
train.head()

Unnamed: 0,order_id,user_id,product_id,label,seed,aisle_id,department_id,prod_aisle_reorder_prob,prod_department_reorder_prob,aisle_user_reorder_prob,...,prod_order_per_user,prod_second_order_ratio,index,user_prod_reordered,user_prod_norder,user_prod_days_interval,user_prod_order_interval,user_prod_recentlydiscovered,user_prod_norder_rate,user_prod_days_interval_rate
0,1187899,1,196,1,1,77,7,0.121674,0.015809,0.268293,...,4.473875,0.5825,45698,1,10,19.555555,1.0,0,1.0,1.0
1,1854765,21,196,0,1,77,7,0.121674,0.015809,0.019417,...,4.473875,0.5825,45701,0,1,366.0,100.0,0,0.030303,36.946373
2,1864787,43,196,0,1,77,7,0.121674,0.015809,0.016949,...,4.473875,0.5825,45703,1,2,28.0,3.0,0,0.181818,2.692308
3,1647290,52,196,0,4,77,7,0.121674,0.015809,0.211864,...,4.473875,0.5825,45704,1,14,13.076923,1.538462,0,0.518519,1.440678
4,2757217,67,196,1,4,77,7,0.121674,0.015809,0.310345,...,4.473875,0.5825,45705,1,19,9.611111,1.222222,0,0.791667,1.270434


In [6]:
train['user_prod_lastorder_interval_rate'] = train.user_prod_lastorder_interval / train.user_prod_order_interval
train['user_prod_lastdays_interval_rate']  = train.user_prod_lastdays_interval / train.user_prod_days_interval
train['user_lastorder_interval_ratio'] = train.user_lastorder_interval / train.user_interval

In [31]:
train.user_prod_days_interval_rate = train.user_prod_days_interval_rate.fillna(1)
train.user_prod_days_interval_rate = train.user_prod_days_interval_rate.replace(np.inf, np.nan).fillna(1600)
train.user_prod_lastdays_interval_rate = train.user_prod_lastdays_interval_rate.replace(np.inf, np.nan).fillna(1600)
train.user_lastorder_interval_ratio = train.user_lastorder_interval_ratio.replace(np.inf, np.nan).fillna(1600)

In [29]:
train.loc[train.isnull().any(axis=1), ['user_lastorder_interval', 'user_interval', 'user_lastorder_interval_ratio']]

Unnamed: 0,user_lastorder_interval,user_interval,user_lastorder_interval_ratio
344,0.0,0.000000,
8342,0.0,19.299999,0.000000
9661,0.0,0.000000,
38756,0.0,5.833333,0.000000
49163,0.0,0.000000,
59064,0.0,0.000000,
108878,0.0,20.000000,0.000000
130460,30.0,0.000000,inf
132532,0.0,13.416667,0.000000
177413,0.0,12.500000,0.000000


In [30]:
train.user_lastorder_interval_ratio.replace(np.inf, np.nan).max()

90.0

In [15]:
train.user_prod_days_interval_rate.replace(np.inf, np.nan).max()

1559.4783

In [33]:
train.to_hdf("/mnt/d/Data/Instacart/dataset.hdf", "train")

In [1]:
# generate some other features

import numpy as np
import pandas as pd

data = "/mnt/d/Data/Instacart/"

test = pd.read_hdf("/mnt/d/Data/Instacart/dataset.hdf", "test")
user_feature = pd.read_hdf("/mnt/d/Data/Instacart/dataset.hdf", "user_feature")
product_feature = pd.read_hdf("/mnt/d/Data/Instacart/dataset.hdf", "product_feature")
user_product_feature = pd.read_hdf("/mnt/d/Data/Instacart/dataset.hdf", "user_product_feature").reset_index()
test = test.merge(user_feature, left_on='user_id', right_index=True).merge(product_feature, left_on='product_id', right_index=True).merge(user_product_feature, on=['user_id', 'product_id'])

test['user_prod_lastorder_interval_rate'] = test.user_prod_lastorder_interval / test.user_prod_order_interval
test['user_prod_lastdays_interval_rate']  = test.user_prod_lastdays_interval / test.user_prod_days_interval
test['user_lastorder_interval_ratio'] = test.user_lastorder_interval / test.user_interval
test.user_prod_days_interval_rate = test.user_prod_days_interval_rate.fillna(1)
test.user_prod_days_interval_rate = test.user_prod_days_interval_rate.replace(np.inf, np.nan).fillna(1600)
test.user_prod_lastdays_interval_rate = test.user_prod_lastdays_interval_rate.replace(np.inf, np.nan).fillna(1600)
test.user_lastorder_interval_ratio = test.user_lastorder_interval_ratio.replace(np.inf, np.nan).fillna(1600)

AttributeError: 'DataFrame' object has no attribute 'user_lastorder_interval'

In [3]:
test.isnull().any(axis=0)

order_id                             False
user_id                              False
product_id                           False
aisle_id                             False
department_id                        False
prod_aisle_reorder_prob              False
prod_department_reorder_prob         False
aisle_user_reorder_prob              False
department_user_reorder_prob         False
order_dow                            False
order_hour_of_day                    False
prod_hour_prob                       False
hour_prod_prob                       False
prod_week_prob                       False
week_prod_prob                       False
hour_user_reorder_prob               False
week_user_reorder_prob               False
hour_prod_reorder_prob               False
week_prod_reorder_prob               False
hour_user_prod_reorder_prob          False
week_user_prod_reorder_prob          False
user_prod_lastorder_interval         False
user_prod_lastdays_interval          False
days_since_