# Machine Learning Engineer Nanodegree
## Capstone Project - InstaCart Market Basket Analysis

### Baseline Benchmarks

This notebook includes code and results for the three baseline models we propose to use as benchmarks:
- Each test order is filled with the most popular products limited to average basket size.
- Each test order duplicates the user's prior order.
- Each test order includes the top M products from the user's prior orders, where M is the average number of products purchased by the user.

In [2]:
%%time
# Import libraries necessary for this project
import numpy as np
import pandas as pd
import os

print "Loaded libraries"

# Working directory for the relative store names below; './' keeps whatever
# directory the notebook kernel was started in.
_datapath = './'
os.chdir(_datapath)

# HDF5 store file names (relative to the working directory set above).
RAW_STORE = 'instacart_raw.hdf5'    # not referenced in the visible part of this notebook
HDF_STORE = 'instacart.hdf5'        # read by the loading cell for the "priors" and "orders" tables
FVARS_STORE = 'features.hdf5'       # not referenced in the visible part of this notebook
STATS_STORE = 'stats.hdf5'          # not referenced in the visible part of this notebook

Loaded libraries
Wall time: 18.5 s


In [5]:
%%time

# Load the two tables used by the benchmarks from the project HDF5 store:
#   priors -- read from the "priors" key (order/product rows)
#   orders -- read from the "orders" key (one row per order)
# The "departments" and "products" tables are not loaded here.
print("Loading datasets ...")
try:
    priors = pd.read_hdf(HDF_STORE, "priors")
    orders = pd.read_hdf(HDF_STORE, "orders")
except Exception as e:
    print(e)
    print("Dataset could not be loaded. Is the hdf store missing?")
    # Re-raise instead of swallowing: every later cell needs `priors` and
    # `orders`, so continuing would only fail further down with a NameError
    # that hides the real cause.
    raise

Loading datasets ...
Wall time: 1.81 s


In [2]:
%%time
# Summarize the train and test splits of the orders table.
# eval_set is integer-coded in this notebook: rows with eval_set == 1 are
# treated as "train" and rows with eval_set == 2 as "test" (values below 2
# are used as the history set in the benchmark cells).

# Define the test split here so the cell runs on a fresh kernel; previously
# `test_orders` had to be left over from an earlier session.
test_orders = orders[orders.eval_set == 2]
ttrain = orders[orders.eval_set == 1]

print("Number of unique users in test {}".format(test_orders.user_id.unique().shape))
print("Number of unique orders ={} and users ={} in train".format(
    ttrain.order_id.unique().shape[0], ttrain.user_id.unique().shape[0]))

# Number of test orders whose user also has a train order (overlap check).
print(test_orders[test_orders.user_id.isin(ttrain.user_id)].shape[0])

Number of unique users in test (75000L,)
Number of unique orders =131209 and users =131209 in train
0
Wall time: 281 ms


### Benchmark 1
#### Each test order is filled with the most popular products limited to average basket size

This benchmark does not consider differences between users. We follow these steps:
- Calculate the number of times each product has been ordered
- Calculate average basket size
- Sort the list of products in decreasing order of popularity, then truncate it to the average basket (order) size


In [6]:
# Work on a reproducible 5% sample of the history rows.
# The full frame is kept under `priors_full` so this cell is idempotent:
# re-running it draws the same sample again instead of sampling the
# previous sample (which shrank `priors` on every execution).
# NOTE(review): every cell below that reads `priors` therefore runs on the
# sample, not on the full history.
if 'priors_full' not in globals():
    priors_full = priors
priors = priors_full.sample(frac=0.05, random_state=46)

In [46]:
%%time

# Benchmark 1: give every test order the same globally most popular products,
# truncated to the average basket size.

# Tag every history row with the eval_set of its order.
bench = priors.merge(orders[['order_id', 'eval_set']], how='left', on='order_id')
test = orders[orders.eval_set == 2][['order_id', 'user_id', 'order_number']]

# Find the history set -- prior and train (eval_set below 2)
bench1 = bench[bench.eval_set < 2]

# Calculate product popularity: rows per product, counting reorders only
# (reordered == 1), most frequent first.
popular = (bench1[bench1.reordered == 1]
           .groupby('product_id')['order_id'].count()
           .reset_index()
           .rename(columns={'order_id': 'frequency'})
           .sort_values(by='frequency', ascending=False))

# Calculate average basket size (products per order), rounded to an integer.
basket_size = (bench1.groupby('order_id')['product_id'].count()
               .reset_index()
               .rename(columns={'product_id': 'Average Size'}))
# Select the size column by name; the former positional `basket_size[[1]]`
# lookup depends on pandas falling back from labels to positions.
topN = int(round(float(basket_size['Average Size'].mean()), 0))

# Space-separated string of the top-N most popular product ids.
topN_products = ' '.join([str(e) for e in popular.product_id[:topN].values])

# Create submission frame: one row per test order, all with the same products.
bench1_submit = (test.groupby('order_id')['user_id'].count()
                 .apply(lambda x: topN_products)
                 .reset_index()
                 .rename(columns={'user_id': 'products'}))
# Writing the file is intentionally left disabled:
#bench1_submit.to_csv('bench1_submission.csv', encoding='utf-8', index=False)

print(bench1_submit.head())
# The score reported by Kaggle on submission  - 0.0638739 (scored on 30% of test set)

   order_id products
0        17    24852
1        34    24852
2       137    24852
3       182    24852
4       257    24852
Wall time: 10.1 s


### Benchmark 2 
#### Each test order duplicates the user's prior order. 
We are guaranteed to have a prior order as the orders table includes at least 4 orders per user. 

In [81]:
%%time
# Benchmark 2: each test order repeats the products of the user's most
# recent non-test order.

# Latest history order (highest order_number with eval_set below 2) per user.
# Built as one chain so nothing is sorted in place on a filtered slice of
# `orders` (which triggers pandas' SettingWithCopyWarning).
last_order_id = (orders[orders.eval_set < 2]
                 .sort_values(by=['user_id', 'order_number'], ascending=[True, False])
                 .groupby('user_id')['order_id'].first()
                 .reset_index()
                 .rename(columns={'order_id': 'last_order_id'}))

# Attach that order id to every test order, then pull in its products.
bench2 = test.merge(last_order_id[['user_id', 'last_order_id']], how='left', on='user_id')
# NOTE(review): this is an inner join, so a test order whose last order has
# no rows in `priors` is dropped and will be missing from bench2_submit.
bench2 = (bench2.merge(priors[['order_id', 'product_id', 'reordered']], how='inner',
                       left_on='last_order_id', right_on='order_id')
          .rename(columns={'order_id_x': 'order_id', 'order_id_y': 'last_order'}))

# Create submission frame: one space-separated product string per test order.
bench2_submit = (bench2.groupby('order_id')['product_id']
                 .apply(lambda x: " ".join([str(s) for s in x.values]))
                 .reset_index()
                 .rename(columns={'product_id': 'products'})
                 .sort_values(by='order_id'))

print(bench2_submit.head())

# Writing the file is intentionally left disabled:
#bench2_submit.to_csv('bench2_submission.csv', encoding='utf-8', index=False)

# The score reported by Kaggle on submission - 0.3118026 (on 30% of samples)

   order_id products
0        34    47792
1       182    11198
2       414    31215
3       418     8382
4       452    36606
Wall time: 8.77 s


In [76]:
# Spot check: all orders of user 6, then the history rows of order 998866.
print(orders.loc[orders.user_id == 6])
print(priors.loc[priors.order_id == 998866])

    order_id  user_id  eval_set  order_number  order_dow  order_hour_of_day  \
50   2086598        6         0             1          5                 18   
51    298250        6         0             2          4                 16   
52    998866        6         0             3          2                 18   
53   1528013        6         2             4          3                 16   

    days_since_prior_order  
50                     NaN  
51                     6.0  
52                    12.0  
53                    22.0  
         order_id  product_id  add_to_cart_order  reordered
9462737    998866        8424                  3          0


### Benchmark 3 
#### Each test order includes top M products from the user's prior orders
This considers the user's average preferences (M is the average number of products purchased by the user).
- compute the number of times a product has been purchased by this user
- compute the average size of each user's orders
- keep each user's M most frequently purchased products, where M is that user's average order size

In [86]:
# Preview the first rows of the benchmark-3 history frame.
# NOTE(review): in the visible notebook `bench3` is first assigned in a later
# cell, so on a fresh kernel that cell has to run before this one.
bench3.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,eval_set,user_id
0,1427675,26096,20,0,0,103035
1,42191,38273,10,0,0,121983
2,1266959,2067,3,1,0,188407
3,368331,15203,5,0,0,101224
4,503520,42443,1,1,0,77597


In [94]:
%%time
# Benchmark 3 inputs: per-user product frequencies joined with the user's
# average basket size.

# Find the history set -- prior and train (eval_set below 2), with user_id attached
bench3 = bench[bench.eval_set < 2].merge(orders[['order_id', 'user_id']], how='left', on='order_id')

# Calculate per-user product popularity: rows per (user, product), most
# frequent first within each user.
u_popular = (bench3.groupby(['user_id', 'product_id'])['order_id'].count()
             .reset_index()
             .rename(columns={'order_id': 'frequency'})
             .sort_values(by=['user_id', 'frequency'], ascending=[True, False]))

# Calculate average basket size per user (products per order, averaged).
u_baskets = (bench3.groupby(['user_id', 'order_id'])['product_id'].count()
             .reset_index()
             .rename(columns={'product_id': 'basket_size'}))
u_baskets = u_baskets.groupby('user_id')['basket_size'].mean().reset_index()
# Round before casting: a bare astype('int16') truncates (1.9 -> 1), whereas
# the other benchmark cells round the average basket size.
u_baskets['basket_size'] = u_baskets['basket_size'].round().astype('int16')

u_extra = u_popular.merge(u_baskets, how='left', on='user_id')
print(u_extra.head())


   user_id  product_id  frequency  basket_size
0        1         196          1            1
1        1       12427          1            1
2        2       32792          2            1
3        2        2002          1            1
4        2        5907          1            1
Wall time: 1.24 s


In [63]:
%%time
# Benchmark 3  submission
# Each test order includes top M products from the user's prior orders.
# M is the average number of products per order for the user.
#
# NOTE(review): `test_orders` and `op_prior` are not assigned in this cell;
# they must already exist in the kernel. Presumably `test_orders` is the
# eval_set == 2 slice of `orders` and `op_prior` the order-products history
# table -- confirm before relying on a fresh-kernel run.

# All orders belonging to users that have a test order.
prev3_o = orders[orders.user_id.isin(test_orders.user_id)]

# Product rows of those orders (inner join on order_id).
prev3_p = pd.merge(left=prev3_o[['order_id', 'user_id', 'order_number']],
                   right=op_prior[['order_id', 'product_id', 'reordered']],
                   on='order_id')
print("{} {}".format(prev3_o.shape, prev3_p.shape))
prev3_os = prev3_p

# Average order size per user: products per order, averaged and rounded.
user_order_size = prev3_os.groupby(['user_id', 'order_id'])['product_id'].count().reset_index()
user_order_size = (user_order_size.groupby('user_id')['product_id']
                   .apply(lambda x: int(np.round(x.mean())))
                   .reset_index())
user_order_size.columns = ['user_id', 'avg_size']

# How often each user bought each product, most frequent first per user.
user_product_freq = prev3_os.groupby(['user_id', 'product_id'])['order_id'].count().reset_index()
user_product_freq.columns = ['user_id', 'product_id', 'order_freq']
user_product_freq = user_product_freq.sort_values(by=['user_id', 'order_freq'],
                                                  ascending=[True, False])

# One row per (user, product) carrying both the frequency and the user's avg_size.
user_int = pd.merge(left=user_product_freq, right=user_order_size, on='user_id')
print(user_order_size.head())
print(user_product_freq.head())
print(user_int.head())

def agg(x):
    """Collapse one user's rows into ``[user_id, "p1 p2 ..."]``.

    ``x`` is a ``(key, frame)`` pair as yielded by iterating a pandas
    groupby. The frame must have ``avg_size`` and ``product_id`` columns.
    The first ``int(mean(avg_size))`` rows are kept (so the caller's row
    order decides which products survive) and their distinct product ids
    are joined with single spaces.
    """
    user_key = x[0]
    rows = x[1]
    keep = int(rows.avg_size.mean())
    kept_rows = rows[:keep]
    # A set removes duplicate ids; the order of ids in the string is the
    # set's iteration order, not the frame's row order.
    distinct_ids = set(kept_rows.product_id)
    joined = " ".join(str(pid) for pid in distinct_ids)
    return [user_key, joined]

# Build the benchmark-3 submission: one products string per user (via agg),
# joined onto the test orders and written to bench3_submission.csv.
# An unreachable `if False:` branch (it referenced the undefined
# `prev_order`) and a block of commented-out benchmark-2 experiments that
# used to live here have been removed; they never executed.
#
# NOTE(review): `test_orders` is not assigned in this cell and must already
# exist in the kernel -- presumably the eval_set == 2 slice of `orders`.

# One [user_id, products] row per user; rows within a user are expected to
# be ordered most-frequent-first, as agg keeps the leading rows.
groups = pd.DataFrame([agg(x) for x in user_int.groupby('user_id')])
groups.columns = ['user_id', 'products']

# Inner join: test orders whose user has no row in `groups` are dropped.
bench3 = pd.merge(left=test_orders, right=groups, on='user_id')
print(bench3.head())

bench3 = bench3[['order_id', 'products']].sort_values(by='order_id')
bench3.to_csv('bench3_submission.csv', encoding='utf-8', index=False)

# The score reported by Kaggle on submission - 0.3284533 (on 30% of samples)


(1242497, 7) (11792498, 5)
   user_id  avg_size
0        3         7
1        4         4
2        6         5
3       11        13
4       12        15
    user_id  product_id  order_freq
23        3       39190          10
30        3       47766           9
15        3       21903           8
5         3        9387           5
11        3       17668           5
   user_id  product_id  order_freq  avg_size
0        3       39190          10         7
1        3       47766           9         7
2        3       21903           8         7
3        3        9387           5         7
4        3       17668           5         7
   order_id  user_id  eval_set  order_number  order_dow  order_hour_of_day  \
0   2774568        3         2            13          5                 15   
1    329954        4         2             6          3                 12   
2   1528013        6         2             4          3                 16   
3   1376945       11         2             8     