In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (10, 8)
plt.rcParams['font.size'] = 14

pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [2]:
aisles = pd.read_csv('../../data/aisles.csv')
departments = pd.read_csv('../../data/departments.csv')
order_products_prior = pd.read_csv('../../data/order_products__prior.csv')
order_products_train = pd.read_csv('../../data/order_products__train.csv')
orders = pd.read_csv('../../data/orders.csv')
products = pd.read_csv('../../data/products.csv')
order_test = pd.read_csv('../../data/sample_submission.csv')

order_products_prior_df = pd.read_csv('../../data/order_products_prior_df.csv')
prior_orders_with_product = pd.read_csv('../../data/prior_orders_with_product.csv') 

Creating base table by joining prior and training data. For the purpose of this project I'm removing the test dataset, and instead portioning out some of the training set to use as test data, to be able to measure accuracy. 

In [3]:
orders = orders
train = order_products_train
prior = order_products_prior

train = train.merge(orders, on='order_id', how='left')
prior = prior.merge(orders, on='order_id', how='left')

new = train.append(prior, ignore_index=True)
new['target'] = 0
new.loc[new['eval_set'] == 'train', 'target'] = 1
new.loc[new['eval_set'] == 'test', 'target'] = 2

In [4]:
user_products = new[(new.target==0)|(new.target==1)]
user_products_test = new[new.eval_set=='test']

In [5]:
# # order_id, product_id, add_to_cart_order, reorered, product_name, aisle_id, department_id, aisle, department 
# order_products_prior_df.head()
# order_products_prior_df.drop('Unnamed: 0',axis=1,inplace=True)

# #order_id, user_id, eval_set, order_number, order_dow, order_hour_of_day, days_since_prior_order, num_orders, num_products
# prior_orders_with_product.head()
# prior_orders_with_product.drop('Unnamed: 0',axis=1,inplace=True)

# user_products = new



In [6]:
user_products
# user_products.groupby(['user_id','product_id']).agg({'order_number':['count','min','max']})

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,target
0,1,49302,1,1,112108,train,4,4,10,9.000,1
1,1,11109,2,1,112108,train,4,4,10,9.000,1
2,1,10246,3,0,112108,train,4,4,10,9.000,1
3,1,49683,4,0,112108,train,4,4,10,9.000,1
4,1,43633,5,1,112108,train,4,4,10,9.000,1
...,...,...,...,...,...,...,...,...,...,...,...
33819101,3421083,39678,6,1,25247,prior,24,2,6,21.000,0
33819102,3421083,11352,7,0,25247,prior,24,2,6,21.000,0
33819103,3421083,4600,8,0,25247,prior,24,2,6,21.000,0
33819104,3421083,24852,9,1,25247,prior,24,2,6,21.000,0


## Product Stats

The purpose of this section is to pull order metrics aggregated by product. First we do a running count of products ordered group by the user and product. The max(row_num) will output how many times a user ordered a product across all orders. 

In [7]:
user_products['row_num'] = user_products.groupby(['user_id','product_id']).cumcount()+1

In [42]:
user_products.shape

(33819106, 12)

I'm curious to see reorder rates between first and second orders of a product. The underlying assumption here is that something that is ordered more than once indicates some interest in the product to be ordered again. The prod_first_orders and prod_second_orders dfs will output how many times a product was ordered once or twice by the same user respectively.

In [9]:
first_order = user_products[user_products.row_num ==1]
second_order = user_products[user_products.row_num ==2]

In [10]:
prod_orders = user_products.groupby(['product_id']).agg({'order_id':'count'})
prod_orders

Unnamed: 0_level_0,order_id
product_id,Unnamed: 1_level_1
1,1928
2,94
3,283
4,351
5,16
...,...
49684,9
49685,49
49686,127
49687,14


In [11]:
prod_reorders = user_products.groupby(['product_id']).agg({'reordered':'sum'})
prod_reorders

Unnamed: 0_level_0,reordered
product_id,Unnamed: 1_level_1
1,1185
2,13
3,209
4,161
5,10
...,...
49684,1
49685,6
49686,89
49687,6


In [12]:
prod_first_orders = first_order.groupby('product_id').agg({'row_num':'sum'})
prod_first_orders

Unnamed: 0_level_0,row_num
product_id,Unnamed: 1_level_1
1,743
2,81
3,74
4,190
5,6
...,...
49684,8
49685,43
49686,38
49687,8


In [13]:
prod_second_orders = second_order.groupby('product_id').agg({'row_num':'sum'})
prod_second_orders

Unnamed: 0_level_0,row_num
product_id,Unnamed: 1_level_1
1,578
2,18
3,74
4,134
5,8
...,...
49684,2
49685,12
49686,34
49687,8


In [14]:
prod_df = pd.merge(prod_orders, prod_reorders, how='left', on='product_id')
prod_df = pd.merge(prod_df, prod_first_orders, how='left', on='product_id')
prod_df = pd.merge(prod_df, prod_second_orders, how='left', on='product_id')

prod_df.rename(columns={'order_id':'ordered','row_num_x':'num_users_order_product_once','row_num_y':'num_users_order_product_twice'},inplace=True)

prod_df

Unnamed: 0_level_0,ordered,reordered,num_users_order_product_once,num_users_order_product_twice
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1928,1185,743,578.000
2,94,13,81,18.000
3,283,209,74,74.000
4,351,161,190,134.000
5,16,10,6,8.000
...,...,...,...,...
49684,9,1,8,2.000
49685,49,6,43,12.000
49686,127,89,38,34.000
49687,14,6,8,8.000


In [15]:
prod_df['reorder_ratio'] = prod_df.num_users_order_product_twice/prod_df.num_users_order_product_once
prod_df['times_reordered'] = (prod_df.reordered / prod_df.num_users_order_product_once)+1
prod_df['reorders_prob'] = prod_df.reordered / prod_df.ordered

prod_df

Unnamed: 0_level_0,ordered,reordered,num_users_order_product_once,num_users_order_product_twice,reorder_ratio,times_reordered,reorders_prob
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1928,1185,743,578.000,0.778,2.595,0.615
2,94,13,81,18.000,0.222,1.160,0.138
3,283,209,74,74.000,1.000,3.824,0.739
4,351,161,190,134.000,0.705,1.847,0.459
5,16,10,6,8.000,1.333,2.667,0.625
...,...,...,...,...,...,...,...
49684,9,1,8,2.000,0.250,1.125,0.111
49685,49,6,43,12.000,0.279,1.140,0.122
49686,127,89,38,34.000,0.895,3.342,0.701
49687,14,6,8,8.000,1.000,1.750,0.429


In [16]:
# prod_df.drop(['reorders_to_orders','reorder_probability'],axis=1,inplace=True)
# prod_df.rename(columns={'reorder_ratio':'order_twice_ratio','reorders_prob':'overall_reorder_prob'})

### User-Product

In [17]:
user_product_df = user_products.groupby(['user_id','product_id']).agg(user_product_count=("order_number","count"),
                                                                          user_product_first_order=("order_number","min"),
                                                                          user_product_last_order=("order_number","max"),
                                                                          user_product_avg_basket_placement=("add_to_cart_order","mean"))
                                                      

In [57]:
user_product_df

Unnamed: 0,user_id,product_id,user_product_count,user_product_first_order,user_product_last_order,user_product_avg_basket_placement
0,1,196,11,1,11,1.364
1,1,10258,10,2,11,3.600
2,1,10326,1,5,5,5.000
3,1,12427,10,1,10,3.300
4,1,13032,4,2,11,6.500
...,...,...,...,...,...,...
13863741,206209,43961,3,4,12,8.000
13863742,206209,44325,1,7,7,8.000
13863743,206209,48370,1,11,11,8.000
13863744,206209,48697,1,7,7,6.000


## User Stats

### Product

In [53]:
print(user_products.user_id.nunique())
user_products

206209


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,target,row_num
0,1,49302,1,1,112108,train,4,4,10,9.000,1,1
1,1,11109,2,1,112108,train,4,4,10,9.000,1,1
2,1,10246,3,0,112108,train,4,4,10,9.000,1,1
3,1,49683,4,0,112108,train,4,4,10,9.000,1,1
4,1,43633,5,1,112108,train,4,4,10,9.000,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
33819101,3421083,39678,6,1,25247,prior,24,2,6,21.000,0,3
33819102,3421083,11352,7,0,25247,prior,24,2,6,21.000,0,1
33819103,3421083,4600,8,0,25247,prior,24,2,6,21.000,0,1
33819104,3421083,24852,9,1,25247,prior,24,2,6,21.000,0,13


In [19]:
prod_counts= user_products.groupby('user_id').agg(total_products=('product_id','count'),
                                                  reordered_count=('reordered','sum'))

In [20]:
prod_unique = user_products.groupby('user_id').product_id.nunique().reset_index()
prod_unique.columns = ['user_id', 'unique_product_count']

user_prod_counts = pd.merge(prod_counts, prod_unique, how='left',on='user_id')

In [21]:
reordered = user_products[user_products.reordered ==1]
reordered_unique = reordered.groupby('user_id').product_id.nunique().reset_index()
reordered_unique.columns=['user_id','unique_reordered_count']

user_prod_counts= pd.merge(user_prod_counts, reordered_unique, how='left',on='user_id')

Getting count of total products purchased over customer lifetime vs unique products \
Also number of total reordered products ever purchased, and number of unique reordered products

In [22]:
user_prod_counts.head()
user_prod_counts['prob_unique'] = user_prod_counts.unique_product_count/user_prod_counts.total_products
user_prod_counts['prob_reordered_unique'] = user_prod_counts.unique_reordered_count/user_prod_counts.reordered_count
user_prod_counts.head()

Unnamed: 0,user_id,total_products,reordered_count,unique_product_count,unique_reordered_count,prob_unique,prob_reordered_unique
0,1,70,51,19,12.0,0.271,0.235
1,2,226,105,121,42.0,0.535,0.4
2,3,88,55,33,19.0,0.375,0.345
3,4,18,1,17,1.0,0.944,1.0
4,5,46,18,28,10.0,0.609,0.556


Calculating the reorder ratio per user\
reorder count is how many times user reordered any product \
multi_order_count is count of all products where there are multiple orders (total products -first order)

In [23]:
multi_order_user = user_products[user_products.order_number > 1]
sum_multi_orders = multi_order_user.groupby('user_id').agg(multi_order_count=('order_number','count'))
sum_reordered = reordered.groupby('user_id').agg(reordered_count=('reordered','sum'))

user_reorder_ratio = pd.merge(sum_reordered, sum_multi_orders,how='left',on='user_id')

In [24]:
user_reorder_ratio['reorder_ratio'] = user_reorder_ratio.reordered_count/user_reorder_ratio.multi_order_count
user_reorder_ratio

Unnamed: 0_level_0,reordered_count,multi_order_count,reorder_ratio
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,51,65,0.785
2,105,213,0.493
3,55,78,0.705
4,1,14,0.071
5,18,35,0.514
...,...,...,...
206205,14,34,0.412
206206,135,281,0.480
206207,131,199,0.658
206208,479,665,0.720


In [25]:
user_stats = pd.merge(user_prod_counts, user_reorder_ratio, how='left',on='user_id')
user_stats.drop('reordered_count_y',axis=1, inplace=True)
user_stats.rename(columns={'reordered_count_x':'total_products_reordered','prob_unique':'perc_unique','prob_reordered_unique':'perc_reordered_unique'},inplace=True)

### Orders

In [26]:
total_orders = user_products.groupby('user_id').agg(total_orders=('order_number','max'))

In [27]:
user_stats = pd.merge(user_stats,total_orders,how='left',on='user_id')
user_stats

Unnamed: 0,user_id,total_products,total_products_reordered,unique_product_count,unique_reordered_count,perc_unique,perc_reordered_unique,multi_order_count,reorder_ratio,total_orders
0,1,70,51,19,12.000,0.271,0.235,65.000,0.785,11
1,2,226,105,121,42.000,0.535,0.400,213.000,0.493,15
2,3,88,55,33,19.000,0.375,0.345,78.000,0.705,12
3,4,18,1,17,1.000,0.944,1.000,14.000,0.071,5
4,5,46,18,28,10.000,0.609,0.556,35.000,0.514,5
...,...,...,...,...,...,...,...,...,...,...
206204,206205,51,14,37,10.000,0.725,0.714,34.000,0.412,4
206205,206206,285,135,150,44.000,0.526,0.326,281.000,0.480,67
206206,206207,223,131,92,48.000,0.413,0.366,199.000,0.658,16
206207,206208,677,479,198,95.000,0.292,0.198,665.000,0.720,49


In [28]:
user_time_dim = user_products.groupby('user_id').agg(customer_lifetime_days = ('days_since_prior_order','sum'),
                                     avg_days_between_orders= ('days_since_prior_order','mean'),
                                     max_time_between_orders= ('days_since_prior_order', 'max'),
                                     min_time_between_orders= ('days_since_prior_order','min'))


In [29]:
user_stats = pd.merge(user_stats, user_time_dim,how='left',on='user_id')
user_stats


Unnamed: 0,user_id,total_products,total_products_reordered,unique_product_count,unique_reordered_count,perc_unique,perc_reordered_unique,multi_order_count,reorder_ratio,total_orders,customer_lifetime_days,avg_days_between_orders,max_time_between_orders,min_time_between_orders
0,1,70,51,19,12.000,0.271,0.235,65.000,0.785,11,1248.000,19.200,30.000,0.000
1,2,226,105,121,42.000,0.535,0.400,213.000,0.493,15,3836.000,18.009,30.000,3.000
2,3,88,55,33,19.000,0.375,0.345,78.000,0.705,12,896.000,11.487,21.000,7.000
3,4,18,1,17,1.000,0.944,1.000,14.000,0.071,5,215.000,15.357,21.000,0.000
4,5,46,18,28,10.000,0.609,0.556,35.000,0.514,5,431.000,12.314,19.000,6.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206204,206205,51,14,37,10.000,0.725,0.714,34.000,0.412,4,500.000,14.706,30.000,10.000
206205,206206,285,135,150,44.000,0.526,0.326,281.000,0.480,67,1136.000,4.043,15.000,0.000
206206,206207,223,131,92,48.000,0.413,0.366,199.000,0.658,16,2961.000,14.879,30.000,1.000
206207,206208,677,479,198,95.000,0.292,0.198,665.000,0.720,49,4949.000,7.442,20.000,0.000


In [30]:
user_stats['avg_cart_size'] = user_stats.total_products/user_stats.total_orders

user_stats

Unnamed: 0,user_id,total_products,total_products_reordered,unique_product_count,unique_reordered_count,perc_unique,perc_reordered_unique,multi_order_count,reorder_ratio,total_orders,customer_lifetime_days,avg_days_between_orders,max_time_between_orders,min_time_between_orders,avg_cart_size
0,1,70,51,19,12.000,0.271,0.235,65.000,0.785,11,1248.000,19.200,30.000,0.000,6.364
1,2,226,105,121,42.000,0.535,0.400,213.000,0.493,15,3836.000,18.009,30.000,3.000,15.067
2,3,88,55,33,19.000,0.375,0.345,78.000,0.705,12,896.000,11.487,21.000,7.000,7.333
3,4,18,1,17,1.000,0.944,1.000,14.000,0.071,5,215.000,15.357,21.000,0.000,3.600
4,5,46,18,28,10.000,0.609,0.556,35.000,0.514,5,431.000,12.314,19.000,6.000,9.200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206204,206205,51,14,37,10.000,0.725,0.714,34.000,0.412,4,500.000,14.706,30.000,10.000,12.750
206205,206206,285,135,150,44.000,0.526,0.326,281.000,0.480,67,1136.000,4.043,15.000,0.000,4.254
206206,206207,223,131,92,48.000,0.413,0.366,199.000,0.658,16,2961.000,14.879,30.000,1.000,13.938
206207,206208,677,479,198,95.000,0.292,0.198,665.000,0.720,49,4949.000,7.442,20.000,0.000,13.816


In [54]:
# orders.drop('num_orders',axis=1,inplace=True)
orders_train = user_products[user_products.eval_set == 'train']
orders_train

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,target,row_num
0,1,49302,1,1,112108,train,4,4,10,9.000,1,1
1,1,11109,2,1,112108,train,4,4,10,9.000,1,1
2,1,10246,3,0,112108,train,4,4,10,9.000,1,1
3,1,49683,4,0,112108,train,4,4,10,9.000,1,1
4,1,43633,5,1,112108,train,4,4,10,9.000,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1384612,3421063,14233,3,1,169679,train,30,0,10,4.000,1,1
1384613,3421063,35548,4,1,169679,train,30,0,10,4.000,1,1
1384614,3421070,35951,1,1,139822,train,15,6,10,8.000,1,1
1384615,3421070,16953,2,1,139822,train,15,6,10,8.000,1,1


In [32]:
order_products_train

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1
...,...,...,...,...
1384612,3421063,14233,3,1
1384613,3421063,35548,4,1
1384614,3421070,35951,1,1
1384615,3421070,16953,2,1


In [33]:
# prod_df=prod_df.reset_index()
prod_df
prod_df.rename(columns={'ordered':'product_ordered_vol', 
                        'reordered':'product_reordered_vol',
                        'num_users_order_product_once': 'product_ordered_once_vol',
                        'num_users_order_product_twice': 'product_ordered_twice_vol',
                        'reorder_ratio':'product_order_twice_ratio',
                        'times_reordered':'product_avg_reorders',
                        'reorders_prob':'product_overall_reorder_prob'},inplace=True)

In [34]:
user_stats
user_stats.rename(columns={'total_products':'user_total_products',
                           'total_products_reordered':'user_total_products_reordered',
                           'unique_product_count':'user_unique_product_count',
                           'unique_reordered_count':'user_unique_reorder_count',
                           'perc_unique':'user_unique_product_perc',
                           'perc_reordered_unique': 'user_unique_reorder_perc',
                           'multi_order_count':'user_total_items_after_first_order',
                           'reorder_ratio':'user_reorder_ratio',
                           'total_orders':'user_total_orders',
                           'customer_lifetime_days':'user_lifetime_days',
                           'avg_days_between_orders':'user_avg_days_between_orders',
                           'max_time_between_orders':'user_max_time_between_orders',
                           'min_time_between_orders':'user_min_time_between_orders',
                           'avg_cart_size':'user_avg_cart_size'},inplace=True)
                           

In [35]:
# user_product_df= user_product_df.reset_index()
user_product_full = user_product_df.merge(prod_df, how='left',on='product_id').merge(user_stats, how='left', on='user_id')

In [56]:
user_product_full.shape

(13863746, 6)

In [36]:
user_product_full['user_product_order_rate'] = user_product_full.user_product_count/user_product_full.user_total_orders
user_product_full['user_product_reorder_rate'] = (user_product_full.user_product_count / 
                                                  (user_product_full.user_total_orders-
                                                   user_product_full.user_product_first_order +1))
user_product_full['user_product_last_time_product_ordered'] = user_product_full.user_total_orders - user_product_full.user_product_last_order


In [37]:
user_product_full

Unnamed: 0,user_id,product_id,user_product_count,user_product_first_order,user_product_last_order,user_product_avg_basket_placement,product_ordered_vol,product_reordered_vol,product_ordered_once_vol,product_ordered_twice_vol,...,user_reorder_ratio,user_total_orders,user_lifetime_days,user_avg_days_between_orders,user_max_time_between_orders,user_min_time_between_orders,user_avg_cart_size,user_product_order_rate,user_product_reorder_rate,user_product_last_time_product_ordered
0,1,196,11,1,11,1.364,37298,29012,8286,9614.000,...,0.785,11,1248.000,19.200,30.000,0.000,6.364,1.000,1.000,0
1,1,10258,10,2,11,3.600,2050,1467,583,660.000,...,0.785,11,1248.000,19.200,30.000,0.000,6.364,0.909,1.000,0
2,1,10326,1,5,5,5.000,5947,3886,2061,2178.000,...,0.785,11,1248.000,19.200,30.000,0.000,6.364,0.091,0.143,6
3,1,12427,10,1,10,3.300,6697,4957,1740,1828.000,...,0.785,11,1248.000,19.200,30.000,0.000,6.364,0.909,0.909,1
4,1,13032,4,2,11,6.500,3904,2581,1323,1302.000,...,0.785,11,1248.000,19.200,30.000,0.000,6.364,0.364,0.400,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13863741,206209,43961,3,4,12,8.000,57831,36582,21249,22112.000,...,0.524,14,2592.000,20.903,30.000,3.000,9.786,0.214,0.273,2
13863742,206209,44325,1,7,7,8.000,3643,1471,2172,1278.000,...,0.524,14,2592.000,20.903,30.000,3.000,9.786,0.071,0.125,7
13863743,206209,48370,1,11,11,8.000,4118,2887,1231,1472.000,...,0.524,14,2592.000,20.903,30.000,3.000,9.786,0.071,0.250,3
13863744,206209,48697,1,7,7,6.000,10151,3645,6506,3416.000,...,0.524,14,2592.000,20.903,30.000,3.000,9.786,0.071,0.125,7


In [68]:
# orders_train_df = user_products[user_products.eval_set=='train']
prior_train_df = user_products[['order_id','user_id','product_id','reordered','target']]

In [71]:
prior_train_df.target.value_counts()
prior_train_df

Unnamed: 0,order_id,user_id,product_id,reordered,target
0,1,112108,49302,1,1
1,1,112108,11109,1,1
2,1,112108,10246,0,1
3,1,112108,49683,0,1
4,1,112108,43633,1,1
...,...,...,...,...,...
33819101,3421083,25247,39678,1,0
33819102,3421083,25247,11352,0,0
33819103,3421083,25247,4600,0,0
33819104,3421083,25247,24852,1,0


In [72]:
full_df = prior_train_df.merge(user_product_full, how='right',on=['user_id','product_id'])
full_df

Unnamed: 0,order_id,user_id,product_id,reordered,target,user_product_count,user_product_first_order,user_product_last_order,user_product_avg_basket_placement,product_ordered_vol,...,user_reorder_ratio,user_total_orders,user_lifetime_days,user_avg_days_between_orders,user_max_time_between_orders,user_min_time_between_orders,user_avg_cart_size,user_product_order_rate,user_product_reorder_rate,user_product_last_time_product_ordered
0,1187899,1,196,1,1,11,1,11,1.364,37298,...,0.785,11,1248.000,19.200,30.000,0.000,6.364,1.000,1.000,0
1,431534,1,196,1,0,11,1,11,1.364,37298,...,0.785,11,1248.000,19.200,30.000,0.000,6.364,1.000,1.000,0
2,473747,1,196,1,0,11,1,11,1.364,37298,...,0.785,11,1248.000,19.200,30.000,0.000,6.364,1.000,1.000,0
3,550135,1,196,1,0,11,1,11,1.364,37298,...,0.785,11,1248.000,19.200,30.000,0.000,6.364,1.000,1.000,0
4,2254736,1,196,1,0,11,1,11,1.364,37298,...,0.785,11,1248.000,19.200,30.000,0.000,6.364,1.000,1.000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33819101,550836,206209,44325,0,0,1,7,7,8.000,3643,...,0.524,14,2592.000,20.903,30.000,3.000,9.786,0.071,0.125,7
33819102,1854736,206209,48370,0,0,1,11,11,8.000,4118,...,0.524,14,2592.000,20.903,30.000,3.000,9.786,0.071,0.250,3
33819103,550836,206209,48697,0,0,1,7,7,6.000,10151,...,0.524,14,2592.000,20.903,30.000,3.000,9.786,0.071,0.125,7
33819104,550836,206209,48742,0,0,2,7,12,9.000,1798,...,0.524,14,2592.000,20.903,30.000,3.000,9.786,0.143,0.250,2


In [74]:
# train_df.fillna(0, inplace=True)
# train_df.isnull().sum()

In [75]:
#checking for duplicates
full_df[full_df.duplicated(subset=['user_id','product_id','order_id'])==True]

Unnamed: 0,order_id,user_id,product_id,reordered,target,user_product_count,user_product_first_order,user_product_last_order,user_product_avg_basket_placement,product_ordered_vol,...,user_reorder_ratio,user_total_orders,user_lifetime_days,user_avg_days_between_orders,user_max_time_between_orders,user_min_time_between_orders,user_avg_cart_size,user_product_order_rate,user_product_reorder_rate,user_product_last_time_product_ordered


In [76]:
train_df = full_df[full_df.target == 1]
prior_df = full_df[full_df.target == 0]
print(train_df.shape, prior_df.shape)

(1384617, 33) (32434489, 33)


#### Data Dict
- user_id: unique user
- product_id: unique product per user 
- reordered: Boolean, 1 if product was reordered by user
- user_product_count: Total times product was ordered by user
- user_product_first_order: First order placed containing product by user
- user_product_last_order: Last order placed containing product by user
- user_product_avg_basket_placement: Average 'add_to_cart_order' for product by user
- product_ordered_vol: Total amount of this product ordered
- product_reordered_vol: 
- product_ordered_once_vol: 
- product_ordered_twice_vol: 
- product_order_twice_ratio: 
- product_avg_reorders: 
- product_overall_reorder_prob: 
- user_total_products: 
- user_total_products_reordered: 
- user_unique_product_count: 
- user_unique_reorder_count: 
- user_unique_product_perc: 
- user_unique_reorder_perc: 
- user_total_items_after_first_order: 
- user_reorder_ratio: reorder_count/total_items_after_first_order
- user_total_orders: 
- user_lifetime_days: 
- user_avg_days_between_orders: 
- user_max_time_between_orders: 
- user_min_time_between_orders: 
- user_avg_cart_size: 
- user_product_order_rate: 
- user_product_reorder_rate: 
- user_product_last_time_product_ordered: number of orders since user last ordered a product

## Save to CSV

In [77]:
full_df.to_csv('../../data/full_df.csv')
train_df.to_csv('../../data/train_df.csv')
prior_df.to_csv('../../data/prior_df.csv')
user_products_test.to_csv('../../data/test_df.csv')

In [78]:
user_products.to_csv('../../data/user_products.csv')