In [1]:
import numpy as np
import pandas as pd
from setting import *
import gc

# Make sure automatic garbage collection is switched on for this kernel.
gc.enable()


def second_order_ratio(x):
    """Return (number of entries equal to 1) / (number of entries equal to 0).

    `x` is indexed with boolean masks and must expose `.count()`, e.g. a
    pandas Series of 0/1 flags.
    """
    ones = x[x == 1].count()
    zeros = x[x == 0].count()
    return ones / zeros

In [36]:
# Column dtypes for each CSV; only the columns listed in `usecols` are read.
product_detail_dtypes = {
    'product_id': np.uint16,
    'product_name': str,
    'aisle_id': np.uint8,
    'department_id': np.uint8,
}
priors_dtypes = {
    'order_id': np.int32,
    'product_id': np.uint16,
    'add_to_cart_order': np.int16,
    'reordered': np.int8,
}
orders_dtypes = {
    'order_id': np.int32,
    'user_id': np.int32,
    'eval_set': 'category',
    'order_number': np.int16,
    'order_dow': np.int8,
    'order_hour_of_day': np.int8,
    'days_since_prior_order': np.float32,
}

# Product -> aisle / department lookup.
product_detail = pd.read_csv(
    data + "products.csv",
    dtype=product_detail_dtypes,
    usecols=['product_id', 'aisle_id', 'department_id'],
)

# One row per (order, product) with the reorder flag.
priors = pd.read_csv(
    data + 'order_products__prior.csv',
    dtype=priors_dtypes,
    usecols=['order_id', 'product_id', 'reordered'],
    engine='c',
)

# One row per order with its user and sequence number.
orders = pd.read_csv(
    data + 'orders.csv',
    dtype=orders_dtypes,
    usecols=['order_id', 'user_id', 'eval_set', 'order_number'],
    engine='c',
)

# Attach user / order_number to every prior line, order the rows, drop the
# eval_set column, then attach aisle and department ids.
product = (
    pd.merge(priors, orders, on='order_id')
    .sort_values(by=['user_id', 'order_number', 'product_id'])
    .reset_index(drop=True)
    .drop('eval_set', axis=1)
    .merge(product_detail, on='product_id')
)

In [37]:
# Display the first rows of the merged `product` table as a quick visual check.
product.head()

Unnamed: 0,order_id,product_id,reordered,user_id,order_number,aisle_id,department_id
0,2539329,196,0,1,1,77,7
1,2398795,196,1,1,2,77,7
2,473747,196,1,1,3,77,7
3,2254736,196,1,1,4,77,7
4,431534,196,1,1,5,77,7


In [39]:
# Reorder counts at several granularities, all computed from the rows of
# `product` flagged as reorders (reordered == 1). Each result is a one-column
# DataFrame indexed by its grouping key(s), so it can be joined later with
# merge(..., right_index=True).
#
# The counts are built with .count().to_frame(name) rather than
# .agg({name: 'count'}): passing a renaming dict to SeriesGroupBy.agg was
# deprecated in pandas 0.20 and raises SpecificationError from pandas 1.0 on.

# Filter once and reuse instead of re-filtering `product` for every aggregate.
reordered_rows = product[product.reordered == 1]

# Per-user counts: by aisle, by department, and overall.
user_aisle_rcnt = reordered_rows.groupby(by=['user_id', 'aisle_id'])['order_id'].count().to_frame('user_aisle_rcnt')
user_dep_rcnt = reordered_rows.groupby(by=['user_id', 'department_id'])['order_id'].count().to_frame('user_dep_rcnt')
user_rcnt = reordered_rows.groupby(by='user_id')['order_id'].count().to_frame('user_rcnt')

# Per-product counts within its aisle / department.
prod_aisle_rcnt = reordered_rows.groupby(by=['product_id', 'aisle_id'])['order_id'].count().to_frame('prod_aisle_rcnt')
prod_dep_rcnt = reordered_rows.groupby(by=['product_id', 'department_id'])['order_id'].count().to_frame('prod_dep_rcnt')

# Totals per aisle and per department.
aisle_cnt = reordered_rows.groupby(by='aisle_id')['order_id'].count().to_frame('aisle_cnt')
dep_cnt = reordered_rows.groupby(by='department_id')['order_id'].count().to_frame('dep_cnt')

# The filtered copy is only needed for the aggregations above.
del reordered_rows

In [47]:
# Load the (order, user, product) keys of the training rows.
train_dtypes = {
    'order_id': np.int32,
    'user_id': np.int32,
    'product_id': np.uint16,
    'label': np.int8,
}
train = pd.read_csv(
    data + "train.tsv",
    sep='\t',
    dtype=train_dtypes,
    usecols=['order_id', 'user_id', 'product_id'],
    engine='c',
)

# Attach aisle / department ids so the count tables can be joined on them.
train = train.merge(product_detail, on='product_id')

# Each count table is indexed by its grouping key(s); left-join them in turn
# and replace the missing values left by unmatched keys with 0 after every step.
count_tables = [
    (user_aisle_rcnt, ['user_id', 'aisle_id']),
    (user_dep_rcnt, ['user_id', 'department_id']),
    (user_rcnt, 'user_id'),
    (prod_aisle_rcnt, ['product_id', 'aisle_id']),
    (prod_dep_rcnt, ['product_id', 'department_id']),
    (aisle_cnt, 'aisle_id'),
    (dep_cnt, 'department_id'),
]
for counts, keys in count_tables:
    train = train.merge(counts, how='left', left_on=keys, right_index=True).fillna(0)

In [49]:
# Additive-smoothing denominators for the per-user ratios.
# NOTE(review): 134 and 21 presumably are the number of aisles and departments
# in the data set -- confirm against the source tables.
aisle_smoothing = 134
dep_smoothing = 21

# Share of a user's reorders that fall in the row's aisle / department,
# with +1 in the numerator and the smoothing term in the denominator.
train['aisle_user_reorder_prob'] = (train['user_aisle_rcnt'] + 1) / (train['user_rcnt'] + aisle_smoothing)
train['dep_user_reorder_prob'] = (train['user_dep_rcnt'] + 1) / (train['user_rcnt'] + dep_smoothing)

# Share of the aisle's / department's reorders that belong to the row's product.
train['prod_aisle_reorder_prob'] = train['prod_aisle_rcnt'] / train['aisle_cnt']
train['prod_dep_reorder_prob'] = train['prod_dep_rcnt'] / train['dep_cnt']

In [52]:
# Raw count columns are only inputs to the ratio features; remove them in place.
count_columns = [
    'user_aisle_rcnt',
    'user_dep_rcnt',
    'user_rcnt',
    'prod_aisle_rcnt',
    'prod_dep_rcnt',
    'aisle_cnt',
    'dep_cnt',
]
train.drop(labels=count_columns, axis=1, inplace=True)

In [51]:
# Display the first rows of `train`.
# NOTE(review): the saved output below still lists the raw count columns
# (user_aisle_rcnt ... dep_cnt) that the preceding cell drops; this cell's
# execution count (51) is lower than that cell's (52), so the output is from
# before the drop and will differ on a fresh top-to-bottom run.
train.head()


Unnamed: 0,order_id,user_id,product_id,aisle_id,department_id,user_aisle_rcnt,user_dep_rcnt,user_rcnt,prod_aisle_rcnt,prod_dep_rcnt,aisle_cnt,dep_cnt,aisle_user_reorder_prob,dep_user_reorder_prob,prod_aisle_reorder_prob,prod_dep_reorder_prob
0,1187899,1,196,77,7,11.0,11.0,41.0,27791.0,27791.0,228406,1757892,0.068571,0.193548,0.121674,0.015809
1,1854765,21,196,77,7,2.0,33.0,103.0,27791.0,27791.0,228406,1757892,0.012658,0.274194,0.121674,0.015809
2,1864787,43,196,77,7,1.0,1.0,59.0,27791.0,27791.0,228406,1757892,0.010363,0.025,0.121674,0.015809
3,1647290,52,196,77,7,25.0,25.0,118.0,27791.0,27791.0,228406,1757892,0.103175,0.18705,0.121674,0.015809
4,2757217,67,196,77,7,18.0,23.0,58.0,27791.0,27791.0,228406,1757892,0.098958,0.303797,0.121674,0.015809


In [6]:
# Column dtypes for each CSV; only the columns listed in `usecols` are read.
product_detail_dtypes = {
    'product_id': np.uint16,
    'product_name': str,
    'aisle_id': np.uint8,
    'department_id': np.uint8,
}
priors_dtypes = {
    'order_id': np.int32,
    'product_id': np.uint16,
    'add_to_cart_order': np.int16,
    'reordered': np.int8,
}
orders_dtypes = {
    'order_id': np.int32,
    'user_id': np.int32,
    'eval_set': 'category',
    'order_number': np.int16,
    'order_dow': np.int8,
    'order_hour_of_day': np.int8,
    'days_since_prior_order': np.float32,
}

# Product -> aisle / department lookup.
product_detail = pd.read_csv(
    data + "products.csv",
    dtype=product_detail_dtypes,
    usecols=['product_id', 'aisle_id', 'department_id'],
)

# One row per (order, product) with the reorder flag.
priors = pd.read_csv(
    data + 'order_products__prior.csv',
    dtype=priors_dtypes,
    usecols=['order_id', 'product_id', 'reordered'],
    engine='c',
)

# One row per order with its user and sequence number.
orders = pd.read_csv(
    data + 'orders.csv',
    dtype=orders_dtypes,
    usecols=['order_id', 'user_id', 'eval_set', 'order_number'],
    engine='c',
)

# Attach user / order_number to every prior line, order the rows and drop the
# eval_set column. Unlike the other load cell, product_detail is not merged here.
product = (
    pd.merge(priors, orders, on='order_id')
    .sort_values(by=['user_id', 'order_number', 'product_id'])
    .reset_index(drop=True)
    .drop('eval_set', axis=1)
)

In [7]:
# Reordered (product, user, order) rows joined with each product's aisle and
# department. A single .loc selects rows and columns together instead of the
# chained [[columns]][mask] lookup.
d = pd.merge(
    product_detail,
    product.loc[product.reordered == 1, ['product_id', 'user_id', 'order_id']],
    on='product_id',
)

# Reorder counts per user, per (user, aisle) and per (user, department).
# .count().reset_index(name=...) replaces .agg({name: 'count'}).reset_index():
# the renaming-dict form of SeriesGroupBy.agg was deprecated in pandas 0.20 and
# raises SpecificationError from pandas 1.0 on.
d2 = d.groupby(by='user_id')['order_id'].count().reset_index(name='user_rorder_cnt')
d21 = d.groupby(by=['user_id', 'aisle_id'])['order_id'].count().reset_index(name='aisle_count')
d22 = d.groupby(by=['user_id', 'department_id'])['order_id'].count().reset_index(name='department_count')

# NOTE(review): both joins use user_id only, so every aisle row of a user is
# paired with every department row of that user (rows per user = aisles x
# departments). Behaviour kept as in the original -- confirm this is intended.
d2 = d2.merge(d21, on='user_id').merge(d22, on='user_id')

# Fraction of the user's reorders that fall in the aisle / department.
d2['aisle_user_reorder_prob'] = d2.aisle_count / d2.user_rorder_cnt
d2['department_user_reorder_prob'] = d2.department_count / d2.user_rorder_cnt

# Keep only the keys and the two ratio columns.
d2 = d2.drop(['user_rorder_cnt', 'aisle_count', 'department_count'], axis=1)