### Instacart released a public dataset, “The Instacart Online Grocery Shopping Dataset 2017”. The dataset contains over 3 million anonymized grocery orders from more than 200,000 Instacart users. This analysis will make use of this datasets.

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss, precision_recall_fscore_support

# Set data directory and uploading files

In [3]:
dir = './'

# Loading datasets
print('loading aisles')
aislesDF = pd.read_csv(dir + 'aisles.csv')
aislesDF

loading aisles


Unnamed: 0,aisle_id,aisle
0,1,prepared soups salads
1,2,specialty cheeses
2,3,energy granola bars
3,4,instant foods
4,5,marinades meat preparation
...,...,...
129,130,hot cereal pancake mixes
130,131,dry pasta
131,132,beauty
132,133,muscles joints pain relief


In [4]:
print('loading departments')
departmentsDF = pd.read_csv(dir + 'departments.csv')
departmentsDF

loading departments


Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol
5,6,international
6,7,beverages
7,8,pets
8,9,dry goods pasta
9,10,bulk


In [5]:
print('loading order_products_prior')
order_products_priorDF = pd.read_csv(dir + 'order_products__prior.csv')
order_products_priorDF

loading order_products_prior


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0
...,...,...,...,...
32434484,3421083,39678,6,1
32434485,3421083,11352,7,0
32434486,3421083,4600,8,0
32434487,3421083,24852,9,1


In [6]:
print('loading order_products_train')
order_products_trainDF = pd.read_csv(dir + 'order_products__train.csv')
order_products_trainDF

loading order_products_train


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1
...,...,...,...,...
1384612,3421063,14233,3,1
1384613,3421063,35548,4,1
1384614,3421070,35951,1,1
1384615,3421070,16953,2,1


In [7]:
print('loading orders')
ordersDF = pd.read_csv(dir + 'orders.csv')
ordersDF

loading orders


Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
...,...,...,...,...,...,...,...
3421078,2266710,206209,prior,10,5,18,29.0
3421079,1854736,206209,prior,11,4,10,30.0
3421080,626363,206209,prior,12,1,12,18.0
3421081,2977660,206209,prior,13,1,12,7.0


In [8]:
print('loading products')
productsDF = pd.read_csv(dir + 'products.csv')
productsDF

loading products


Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13
...,...,...,...,...
49683,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5
49684,49685,En Croute Roast Hazelnut Cranberry,42,1
49685,49686,Artisan Baguette,112,3
49686,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8


# Create a DataFrame for products with reorder statistics

In [10]:
products_raw = pd.DataFrame()
products_raw['ordersTotal'] = order_products_priorDF.groupby(order_products_priorDF.product_id).size()

products_raw

Unnamed: 0_level_0,ordersTotal
product_id,Unnamed: 1_level_1
1,1852
2,90
3,277
4,329
5,15
...,...
49684,9
49685,49
49686,120
49687,13


In [11]:
products_raw['reordersTotal'] = order_products_priorDF['reordered'].groupby(order_products_priorDF.product_id).sum()
products_raw

Unnamed: 0_level_0,ordersTotal,reordersTotal
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1852,1136
2,90,12
3,277,203
4,329,147
5,15,9
...,...,...
49684,9,1
49685,49,6
49686,120,84
49687,13,6


In [12]:
products_raw['reorder_rate'] = (products_raw.reordersTotal / products_raw.ordersTotal)
products_raw

Unnamed: 0_level_0,ordersTotal,reordersTotal,reorder_rate
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1852,1136,0.613391
2,90,12,0.133333
3,277,203,0.732852
4,329,147,0.446809
5,15,9,0.600000
...,...,...,...
49684,9,1,0.111111
49685,49,6,0.122449
49686,120,84,0.700000
49687,13,6,0.461538


# Merge product details with reorder statistics

In [14]:
products = productsDF.join(products_raw, on = 'product_id')
products

Unnamed: 0,product_id,product_name,aisle_id,department_id,ordersTotal,reordersTotal,reorder_rate
0,1,Chocolate Sandwich Cookies,61,19,1852.0,1136.0,0.613391
1,2,All-Seasons Salt,104,13,90.0,12.0,0.133333
2,3,Robust Golden Unsweetened Oolong Tea,94,7,277.0,203.0,0.732852
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,329.0,147.0,0.446809
4,5,Green Chile Anytime Sauce,5,13,15.0,9.0,0.600000
...,...,...,...,...,...,...,...
49683,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,9.0,1.0,0.111111
49684,49685,En Croute Roast Hazelnut Cranberry,42,1,49.0,6.0,0.122449
49685,49686,Artisan Baguette,112,3,120.0,84.0,0.700000
49686,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,13.0,6.0,0.461538


In [15]:
del products_raw

# Merge order details with prior order products

In [17]:
ordersDF.reset_index(drop=True, inplace=True)
ordersDF

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
...,...,...,...,...,...,...,...
3421078,2266710,206209,prior,10,5,18,29.0
3421079,1854736,206209,prior,11,4,10,30.0
3421080,626363,206209,prior,12,1,12,18.0
3421081,2977660,206209,prior,13,1,12,7.0


In [18]:
priors = pd.merge(order_products_priorDF, ordersDF, how = 'left', on = 'order_id')
priors

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,1,202279,prior,3,5,9,8.0
1,2,28985,2,1,202279,prior,3,5,9,8.0
2,2,9327,3,0,202279,prior,3,5,9,8.0
3,2,45918,4,1,202279,prior,3,5,9,8.0
4,2,30035,5,0,202279,prior,3,5,9,8.0
...,...,...,...,...,...,...,...,...,...,...
32434484,3421083,39678,6,1,25247,prior,24,2,6,21.0
32434485,3421083,11352,7,0,25247,prior,24,2,6,21.0
32434486,3421083,4600,8,0,25247,prior,24,2,6,21.0
32434487,3421083,24852,9,1,25247,prior,24,2,6,21.0


# Create a DataFrame for users with product-related statistics

In [20]:
users = pd.DataFrame()
users['total_user'] = priors.groupby('product_id').size()
users

Unnamed: 0_level_0,total_user
product_id,Unnamed: 1_level_1
1,1852
2,90
3,277
4,329
5,15
...,...
49684,9
49685,49
49686,120
49687,13


In [21]:
users['all_users'] = priors.groupby('product_id')['user_id'].apply(set)
users

Unnamed: 0_level_0,total_user,all_users
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1852,"{65537, 186370, 86020, 83973, 94215, 71688, 14..."
2,90,"{45059, 10252, 113679, 153105, 22035, 5652, 35..."
3,277,"{128640, 180481, 96386, 202359, 197255, 14984,..."
4,329,"{175106, 140804, 123909, 195589, 30213, 137222..."
5,15,"{184864, 179876, 58825, 151922, 102105, 160220}"
...,...,...
49684,9,"{188130, 55524, 79791, 154576, 15858, 33465, 1..."
49685,49,"{170882, 38530, 121479, 87434, 7564, 143891, 1..."
49686,120,"{44672, 101897, 36495, 28819, 17687, 149790, 1..."
49687,13,"{163907, 62395, 116871, 110824, 84777, 6252, 5..."


In [22]:
users['total_distinct_users_perProduct'] = users.all_users.map(len)
users

Unnamed: 0_level_0,total_user,all_users,total_distinct_users_perProduct
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1852,"{65537, 186370, 86020, 83973, 94215, 71688, 14...",716
2,90,"{45059, 10252, 113679, 153105, 22035, 5652, 35...",78
3,277,"{128640, 180481, 96386, 202359, 197255, 14984,...",74
4,329,"{175106, 140804, 123909, 195589, 30213, 137222...",182
5,15,"{184864, 179876, 58825, 151922, 102105, 160220}",6
...,...,...,...
49684,9,"{188130, 55524, 79791, 154576, 15858, 33465, 1...",8
49685,49,"{170882, 38530, 121479, 87434, 7564, 143891, 1...",43
49686,120,"{44672, 101897, 36495, 28819, 17687, 149790, 1...",36
49687,13,"{163907, 62395, 116871, 110824, 84777, 6252, 5...",7


# Create a DataFrame for customers with order-related statistics

In [24]:
customers_raw = pd.DataFrame()
customers_raw['avgDaysBetwOrders'] = ordersDF.groupby('user_id')['days_since_prior_order'].mean()
customers_raw

Unnamed: 0_level_0,avgDaysBetwOrders
user_id,Unnamed: 1_level_1
1,19.000000
2,16.285714
3,12.000000
4,17.000000
5,11.500000
...,...
206205,16.666667
206206,3.716418
206207,14.312500
206208,7.367347


In [25]:
customers_raw['NumberOfOrders'] = ordersDF.groupby('user_id').size()
customers_raw

Unnamed: 0_level_0,avgDaysBetwOrders,NumberOfOrders
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,19.000000,11
2,16.285714,15
3,12.000000,13
4,17.000000,6
5,11.500000,5
...,...,...
206205,16.666667,4
206206,3.716418,68
206207,14.312500,17
206208,7.367347,50


# Create a DataFrame for customers with product-related statistics

In [27]:
customers = pd.DataFrame()
customers['total_items'] = priors.groupby('user_id').size()
customers

Unnamed: 0_level_0,total_items
user_id,Unnamed: 1_level_1
1,59
2,195
3,88
4,18
5,37
...,...
206205,32
206206,285
206207,223
206208,677


In [28]:
customers['all_products'] = priors.groupby('user_id')['product_id'].apply(set)
customers

Unnamed: 0_level_0,total_items,all_products
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,59,"{17122, 196, 26405, 46149, 14084, 13032, 26088..."
2,195,"{45066, 2573, 18961, 23, 32792, 1559, 22559, 1..."
3,88,"{17668, 44683, 48523, 21903, 14992, 21137, 324..."
4,18,"{21573, 42329, 17769, 35469, 37646, 1200, 1905..."
5,37,"{11777, 40706, 28289, 48775, 20754, 6808, 1398..."
...,...,...
206205,32,"{20995, 21137, 22035, 21910, 17691, 31404, 210..."
206206,285,"{16896, 44033, 18434, 16387, 21508, 45573, 102..."
206207,223,"{20995, 18441, 45578, 47626, 33806, 22035, 235..."
206208,677,"{1025, 20995, 47626, 8203, 5133, 38419, 27156,..."


In [29]:
customers['total_unique_items'] = customers.all_products.map(len)
customers

Unnamed: 0_level_0,total_items,all_products,total_unique_items
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,59,"{17122, 196, 26405, 46149, 14084, 13032, 26088...",18
2,195,"{45066, 2573, 18961, 23, 32792, 1559, 22559, 1...",102
3,88,"{17668, 44683, 48523, 21903, 14992, 21137, 324...",33
4,18,"{21573, 42329, 17769, 35469, 37646, 1200, 1905...",17
5,37,"{11777, 40706, 28289, 48775, 20754, 6808, 1398...",23
...,...,...,...
206205,32,"{20995, 21137, 22035, 21910, 17691, 31404, 210...",24
206206,285,"{16896, 44033, 18434, 16387, 21508, 45573, 102...",150
206207,223,"{20995, 18441, 45578, 47626, 33806, 22035, 235...",92
206208,677,"{1025, 20995, 47626, 8203, 5133, 38419, 27156,...",198


# Combine order-related and product-related statistics

In [31]:
customers = customers.join(customers_raw)
customers

Unnamed: 0_level_0,total_items,all_products,total_unique_items,avgDaysBetwOrders,NumberOfOrders
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,59,"{17122, 196, 26405, 46149, 14084, 13032, 26088...",18,19.000000,11
2,195,"{45066, 2573, 18961, 23, 32792, 1559, 22559, 1...",102,16.285714,15
3,88,"{17668, 44683, 48523, 21903, 14992, 21137, 324...",33,12.000000,13
4,18,"{21573, 42329, 17769, 35469, 37646, 1200, 1905...",17,17.000000,6
5,37,"{11777, 40706, 28289, 48775, 20754, 6808, 1398...",23,11.500000,5
...,...,...,...,...,...
206205,32,"{20995, 21137, 22035, 21910, 17691, 31404, 210...",24,16.666667,4
206206,285,"{16896, 44033, 18434, 16387, 21508, 45573, 102...",150,3.716418,68
206207,223,"{20995, 18441, 45578, 47626, 33806, 22035, 235...",92,14.312500,17
206208,677,"{1025, 20995, 47626, 8203, 5133, 38419, 27156,...",198,7.367347,50


In [32]:
customers['avg_per_cart'] = (customers.total_items / customers.NumberOfOrders)
customers

Unnamed: 0_level_0,total_items,all_products,total_unique_items,avgDaysBetwOrders,NumberOfOrders,avg_per_cart
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,59,"{17122, 196, 26405, 46149, 14084, 13032, 26088...",18,19.000000,11,5.363636
2,195,"{45066, 2573, 18961, 23, 32792, 1559, 22559, 1...",102,16.285714,15,13.000000
3,88,"{17668, 44683, 48523, 21903, 14992, 21137, 324...",33,12.000000,13,6.769231
4,18,"{21573, 42329, 17769, 35469, 37646, 1200, 1905...",17,17.000000,6,3.000000
5,37,"{11777, 40706, 28289, 48775, 20754, 6808, 1398...",23,11.500000,5,7.400000
...,...,...,...,...,...,...
206205,32,"{20995, 21137, 22035, 21910, 17691, 31404, 210...",24,16.666667,4,8.000000
206206,285,"{16896, 44033, 18434, 16387, 21508, 45573, 102...",150,3.716418,68,4.191176
206207,223,"{20995, 18441, 45578, 47626, 33806, 22035, 235...",92,14.312500,17,13.117647
206208,677,"{1025, 20995, 47626, 8203, 5133, 38419, 27156,...",198,7.367347,50,13.540000


In [33]:
del customers_raw

# Create a DataFrame for customer-product interactions

In [35]:
customerXproduct = priors.copy()
customerXproduct['user_product'] = (customerXproduct.product_id + 
                                    customerXproduct.user_id * 100000)
customerXproduct

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,user_product
0,2,33120,1,1,202279,prior,3,5,9,8.0,20227933120
1,2,28985,2,1,202279,prior,3,5,9,8.0,20227928985
2,2,9327,3,0,202279,prior,3,5,9,8.0,20227909327
3,2,45918,4,1,202279,prior,3,5,9,8.0,20227945918
4,2,30035,5,0,202279,prior,3,5,9,8.0,20227930035
...,...,...,...,...,...,...,...,...,...,...,...
32434484,3421083,39678,6,1,25247,prior,24,2,6,21.0,2524739678
32434485,3421083,11352,7,0,25247,prior,24,2,6,21.0,2524711352
32434486,3421083,4600,8,0,25247,prior,24,2,6,21.0,2524704600
32434487,3421083,24852,9,1,25247,prior,24,2,6,21.0,2524724852


In [36]:
customerXproduct = customerXproduct.sort_values('order_number')
customerXproduct

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,user_product
14408558,1520399,6075,14,0,86239,prior,1,0,14,,8623906075
19427551,2049062,41329,3,0,154293,prior,1,6,11,,15429341329
19427552,2049062,23081,4,0,154293,prior,1,6,11,,15429323081
19427553,2049062,21527,5,0,154293,prior,1,6,11,,15429321527
19427554,2049062,35050,6,0,154293,prior,1,6,11,,15429335050
...,...,...,...,...,...,...,...,...,...,...,...
2940453,310464,33897,17,1,81625,prior,99,6,20,4.0,8162533897
2940452,310464,33647,16,0,81625,prior,99,6,20,4.0,8162533647
2940451,310464,46906,15,1,81625,prior,99,6,20,4.0,8162546906
32016740,3377086,18531,1,1,61275,prior,99,3,6,4.0,6127518531


In [37]:
customerXproduct = customerXproduct.groupby('user_product', sort = False).agg(
{'order_id': ['size', 'last'], 'add_to_cart_order': 'sum'})
customerXproduct

Unnamed: 0_level_0,order_id,order_id,add_to_cart_order
Unnamed: 0_level_1,size,last,sum
user_product,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
8623906075,1,1520399,14
15429341329,1,2049062,3
15429323081,3,1489630,8
15429321527,3,2251505,14
15429335050,4,2251505,18
...,...,...,...
16999119311,1,51086,6
8998913176,1,2768756,3
8162545368,1,310464,13
8162535690,1,310464,14


In [38]:
customerXproduct.columns = ['numbOfOrders', 'last_order_id', 'sum_add_to_cart_order']
customerXproduct

Unnamed: 0_level_0,numbOfOrders,last_order_id,sum_add_to_cart_order
user_product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8623906075,1,1520399,14
15429341329,1,2049062,3
15429323081,3,1489630,8
15429321527,3,2251505,14
15429335050,4,2251505,18
...,...,...,...
16999119311,1,51086,6
8998913176,1,2768756,3
8162545368,1,310464,13
8162535690,1,310464,14


# Function to extract features from specified orders

In [40]:
def get_features(specified_orders, given_labels = False):
    print('create initial empty list')
    orders_list = []
    products_list = []
    labels = []
    
    training_index = set(order_products_trainDF.index)
    
    for row in specified_orders.itertuples():
        user_id = row.user_id
        order_id = row.order_id
        
        user_products = customers['all_products'][user_id]
        products_list += user_products
        orders_list += [order_id] * len(user_products)
        
        if given_labels:
            labels += [(order_id, product) in training_index for product in user_products]
        
    DF = pd.DataFrame({'order_id': orders_list, 'product_id': products_list}, dtype = np.int32)
    labels = np.array(labels, dtype = np.int8)
        
    print('get features for user part')
    DF['user_id'] = DF.order_id.map(ordersDF.user_id)
    DF['user_total_orders'] = DF.user_id.map(customers.NumberOfOrders)
    DF['user_total_items'] = DF.user_id.map(customers.total_items)
    DF['total_unique_items'] = DF.user_id.map(customers.total_unique_items)
    DF['user_avgDaysBetwOrders'] = DF.user_id.map(customers.avgDaysBetwOrders)
    DF['user_avg_per_cart'] = DF.user_id.map(customers.avg_per_cart) 
        
    print('get features for order part')
    DF['order_hour_of_day'] = DF.order_id.map(ordersDF.order_hour_of_day)
    DF['days_since_prior_order'] = DF.order_id.map(ordersDF.days_since_prior_order)
    DF['daysSincePrior_avgDaysBetw_ratio'] = DF.days_since_prior_order / DF.user_avgDaysBetwOrders
        
    print('get features for product part')
    DF['aisle_id'] = DF.product_id.map(products.aisle_id)
    DF['department_id'] = DF.product_id.map(products.department_id)
    DF['product_order'] = DF.product_id.map(products.ordersTotal)
    DF['product_reorder'] = DF.product_id.map(products.reordersTotal)
    DF['product_reorder_rate'] = DF.product_id.map(products.reorder_rate)
    DF['product_distinct_user'] = DF.product_id.map(users.total_distinct_users_perProduct)
    
    print('get features for customerXproduct')
    DF['user_product_id']  = (DF.product_id + DF.user_id * 100000).astype(np.int64)
    DF.drop(['user_id'], axis = 1, inplace = True)
    DF['CP_numOrders'] = DF.user_product_id.map(customerXproduct.numbOfOrders)
    DF['CP_orders_ratio'] = DF.CP_numOrders / DF.user_total_orders
    DF['CP_last_order_id'] = DF.user_product_id.map(customerXproduct.last_order_id)
    DF['CP_avg_pos_inCart'] = DF.user_product_id.map(customerXproduct.sum_add_to_cart_order) / DF.CP_numOrders
    DF['CP_order_since_last'] = DF.user_total_orders - DF.CP_last_order_id.map(ordersDF.order_number)
    DF['CP_hour_vs_last'] = abs(DF.order_hour_of_day - DF.CP_last_order_id.map(
    ordersDF.order_hour_of_day)).map(lambda x: min(x, 24 - x)).replace([np.inf, -np.inf], np.nan).fillna(0).astype(np.int8)
    
    DF.drop(['CP_last_order_id', 'user_product_id'], axis=1, inplace=True)
    return DF, labels

# Split data into training and test sets

In [42]:
test = ordersDF[ordersDF.eval_set == 'test']
test

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
38,2774568,3,test,13,5,15,11.0
44,329954,4,test,6,3,12,30.0
53,1528013,6,test,4,3,16,22.0
96,1376945,11,test,8,6,11,8.0
102,1356845,12,test,6,1,20,30.0
...,...,...,...,...,...,...,...
3420918,2728930,206202,test,23,2,17,6.0
3420929,350108,206204,test,5,4,14,14.0
3421001,1043943,206206,test,68,0,20,0.0
3421018,2821651,206207,test,17,2,13,14.0


In [43]:
train = ordersDF[ordersDF.eval_set == 'train']
train

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
10,1187899,1,train,11,4,8,14.0
25,1492625,2,train,15,1,11,30.0
49,2196797,5,train,5,0,11,6.0
74,525192,7,train,21,2,11,6.0
78,880375,8,train,4,1,14,10.0
...,...,...,...,...,...,...,...
3420838,2585586,206199,train,20,2,16,30.0
3420862,943915,206200,train,24,6,19,6.0
3420924,2371631,206203,train,6,4,19,30.0
3420933,1716008,206205,train,4,1,16,10.0


# Set index for training data

In [45]:
order_products_trainDF.set_index(['order_id', 'product_id'], inplace = True, drop = False)
order_products_trainDF

Unnamed: 0_level_0,Unnamed: 1_level_0,order_id,product_id,add_to_cart_order,reordered
order_id,product_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,49302,1,49302,1,1
1,11109,1,11109,2,1
1,10246,1,10246,3,0
1,49683,1,49683,4,0
1,43633,1,43633,5,1
...,...,...,...,...,...
3421063,14233,3421063,14233,3,1
3421063,35548,3421063,35548,4,1
3421070,35951,3421070,35951,1,1
3421070,16953,3421070,16953,2,1


# Extract features and labels for training data

In [127]:
df_to_train, train_labels = get_features(train, given_labels=True)
df_to_train['Labels'] = train_labels
df_to_train

create initial empty list
get features for user part
get features for order part
get features for product part
get features for customerXproduct


Unnamed: 0,order_id,product_id,user_total_orders,user_total_items,total_unique_items,user_avgDaysBetwOrders,user_avg_per_cart,order_hour_of_day,days_since_prior_order,daysSincePrior_avgDaysBetw_ratio,...,product_order,product_reorder,product_reorder_rate,product_distinct_user,CP_numOrders,CP_orders_ratio,CP_avg_pos_inCart,CP_order_since_last,CP_hour_vs_last,Labels
0,1187899,17122,9,170,87,5.00,18.888889,21,4.0,0.800000,...,1795.0,752.0,0.418942,4503,,,,,0,0
1,1187899,196,9,170,87,5.00,18.888889,21,4.0,0.800000,...,15.0,5.0,0.333333,8000,,,,,0,1
2,1187899,26405,9,170,87,5.00,18.888889,21,4.0,0.800000,...,5.0,1.0,0.200000,678,,,,,0,1
3,1187899,46149,9,170,87,5.00,18.888889,21,4.0,0.800000,...,172.0,76.0,0.441860,1605,,,,,0,1
4,1187899,14084,9,170,87,5.00,18.888889,21,4.0,0.800000,...,37.0,15.0,0.405405,3012,,,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8474656,272231,31477,26,388,149,12.24,14.923077,17,18.0,1.470588,...,932.0,514.0,0.551502,1040,,,,,0,0
8474657,272231,28156,26,388,149,12.24,14.923077,17,18.0,1.470588,...,2016.0,1180.0,0.585317,6407,1.0,0.038462,8.0,23.0,4,0
8474658,272231,41213,26,388,149,12.24,14.923077,17,18.0,1.470588,...,59.0,19.0,0.322034,1160,,,,,0,0
8474659,272231,890,26,388,149,12.24,14.923077,17,18.0,1.470588,...,21.0,7.0,0.333333,11529,,,,,0,0


# Extract features for test data

In [49]:
df_to_test, _ = get_features(test)

create initial empty list
get features for user part
get features for order part
get features for product part
get features for customerXproduct


# Select features to use for the model

In [51]:
features_to_use = ['user_total_orders', 'user_total_items', 'total_unique_items', 
                  'user_avgDaysBetwOrders', 'user_avg_per_cart', 'order_hour_of_day',
                  'days_since_prior_order', 'daysSincePrior_avgDaysBetw_ratio',
                  'aisle_id', 'department_id', 'product_order', 'product_reorder',
                  'product_reorder_rate', 'CP_numOrders', 'CP_orders_ratio', 
                  'CP_avg_pos_inCart', 'CP_order_since_last', 'CP_hour_vs_last',
                  'product_distinct_user']
features_to_use

['user_total_orders',
 'user_total_items',
 'total_unique_items',
 'user_avgDaysBetwOrders',
 'user_avg_per_cart',
 'order_hour_of_day',
 'days_since_prior_order',
 'daysSincePrior_avgDaysBetw_ratio',
 'aisle_id',
 'department_id',
 'product_order',
 'product_reorder',
 'product_reorder_rate',
 'CP_numOrders',
 'CP_orders_ratio',
 'CP_avg_pos_inCart',
 'CP_order_since_last',
 'CP_hour_vs_last',
 'product_distinct_user']

# Applying Random Forest

In [53]:
rf_model = RandomForestClassifier(
    n_estimators=100,    # Number of trees in the forest
    max_depth=10,        # Maximum depth of the tree
    random_state=42,     # For reproducibility
    n_jobs=-1            # Use all available cores
)

In [54]:
rf_model.fit(df_to_train[features_to_use], train_labels)

# Generate predictions for the training set

In [56]:
train_preds_rf = rf_model.predict_proba(df_to_train[features_to_use])[:, 1]
train_preds_rf

array([0.09135543, 0.13838119, 0.08285749, ..., 0.08776905, 0.09257609,
       0.07831286])

In [57]:
train_log_loss_rf = log_loss(train_labels, train_preds_rf)
print(f'Random Forest Log Loss: {train_log_loss_rf}')

Random Forest Log Loss: 0.3118261694166173


# Apply the threshold to get binary predictions

In [59]:
threshold = 0.10  # Can adjust the threshold for better results
train_pred_labels_rf = (train_preds_rf > threshold).astype(int)
train_pred_labels_rf

array([0, 1, 0, ..., 0, 0, 0])

# Performance metrics

In [61]:
precision_rf, recall_rf, f1_rf, _ = precision_recall_fscore_support(train_labels, train_pred_labels_rf, average='binary')
print(f'Random Forest Precision: {precision_rf}')
print(f'Random Forest Recall: {recall_rf}')
print(f'Random Forest F1 Score: {f1_rf}')

Random Forest Precision: 0.16416736565262557
Random Forest Recall: 0.34854685675125235
Random Forest F1 Score: 0.22320433792953182


In [62]:
# Display a few predictions
train_results_rf = pd.DataFrame({'order_id': df_to_train['order_id'], 'product_id': df_to_train['product_id'], 'actual': train_labels, 'predicted': train_pred_labels_rf})
print(train_results_rf.head())

   order_id  product_id  actual  predicted
0   1187899       17122       0          0
1   1187899         196       1          1
2   1187899       26405       1          0
3   1187899       46149       1          0
4   1187899       14084       0          0


In [63]:
# Generate predictions for the test set
test_preds_rf = rf_model.predict_proba(df_to_test[features_to_use])[:, 1]
test_preds_rf

array([0.08856684, 0.09642599, 0.09165604, ..., 0.08303128, 0.08345278,
       0.12891782])

In [64]:
# Add predictions to test DataFrame
df_to_test['pred'] = test_preds_rf
df_to_test

Unnamed: 0,order_id,product_id,user_total_orders,user_total_items,total_unique_items,user_avgDaysBetwOrders,user_avg_per_cart,order_hour_of_day,days_since_prior_order,daysSincePrior_avgDaysBetw_ratio,...,product_order,product_reorder,product_reorder_rate,product_distinct_user,CP_numOrders,CP_orders_ratio,CP_avg_pos_inCart,CP_order_since_last,CP_hour_vs_last,pred
0,2774568,17668,27,219,146,12.576923,8.111111,17,30.0,2.385321,...,90.0,36.0,0.400000,890,,,,,0,0.088567
1,2774568,44683,27,219,146,12.576923,8.111111,17,30.0,2.385321,...,269.0,113.0,0.420074,10294,,,,,0,0.096426
2,2774568,48523,27,219,146,12.576923,8.111111,17,30.0,2.385321,...,509.0,280.0,0.550098,2753,,,,,0,0.091656
3,2774568,21903,27,219,146,12.576923,8.111111,17,30.0,2.385321,...,25.0,6.0,0.240000,55037,,,,,0,0.218743
4,2774568,14992,27,219,146,12.576923,8.111111,17,30.0,2.385321,...,16.0,8.0,0.500000,12127,,,,,0,0.099496
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4833287,803273,44532,11,36,22,21.500000,3.272727,16,20.0,0.930233,...,3613.0,2236.0,0.618876,187,,,,,0,0.077767
4833288,803273,46069,11,36,22,21.500000,3.272727,16,20.0,0.930233,...,48.0,15.0,0.312500,3820,,,,,0,0.088656
4833289,803273,12791,11,36,22,21.500000,3.272727,16,20.0,0.930233,...,459.0,248.0,0.540305,638,,,,,0,0.083031
4833290,803273,14332,11,36,22,21.500000,3.272727,16,20.0,0.930233,...,1039.0,351.0,0.337825,4622,,,,,0,0.083453


# Prepare submission DataFrame

In [66]:
d_rf = dict()
for row in df_to_test.itertuples():
    if row.pred > threshold:
        try:
            d_rf[row.order_id] += ' ' + str(row.product_id)
        except:
            d_rf[row.order_id] = str(row.product_id)

for order in test.order_id:
    if order not in d_rf:
        d_rf[order] = 'None'

In [67]:
sub_rf = pd.DataFrame.from_dict(d_rf, orient='index')
sub_rf

Unnamed: 0,0
2774568,21903 21137 22035 47766 42265 40604 16797 9387...
329954,37646 19057 25146
1528013,27521 48679 8424 45007 21903 25659
1376945,17794 26209 18465 4799 33731 44632 43352 12384...
1356845,11520 17794 44683 37646 14992 31506 49683 2293...
...,...
1011941,
1801443,
1517566,
333209,


In [68]:
sub_rf.reset_index(inplace=True)
sub_rf.columns = ['order_id', 'products']
sub_rf

Unnamed: 0,order_id,products
0,2774568,21903 21137 22035 47766 42265 40604 16797 9387...
1,329954,37646 19057 25146
2,1528013,27521 48679 8424 45007 21903 25659
3,1376945,17794 26209 18465 4799 33731 44632 43352 12384...
4,1356845,11520 17794 44683 37646 14992 31506 49683 2293...
...,...,...
74995,1011941,
74996,1801443,
74997,1517566,
74998,333209,


In [69]:
sub_rf.to_csv('sub_rf.csv', index=False)
sub_rf

Unnamed: 0,order_id,products
0,2774568,21903 21137 22035 47766 42265 40604 16797 9387...
1,329954,37646 19057 25146
2,1528013,27521 48679 8424 45007 21903 25659
3,1376945,17794 26209 18465 4799 33731 44632 43352 12384...
4,1356845,11520 17794 44683 37646 14992 31506 49683 2293...
...,...,...
74995,1011941,
74996,1801443,
74997,1517566,
74998,333209,


# Applying XGBoost

In [71]:
import xgboost as xgb


xgb_model = xgb.XGBClassifier(
    n_estimators=100,    # Number of trees
    max_depth=10,        # Maximum depth of each tree
    learning_rate=0.1,   # Learning rate
    random_state=42      # For reproducibility
)


# Train the model
xgb_model.fit(df_to_train[features_to_use], train_labels)

# Generate predictions for the training set
train_preds_xgb = xgb_model.predict_proba(df_to_train[features_to_use])[:, 1]

# Calculate log loss for the training set
train_log_loss_xgb = log_loss(train_labels, train_preds_xgb)
print(f'XGBoost Log Loss: {train_log_loss_xgb}')

# Apply the threshold to get binary predictions
threshold = 0.10  # Adjust the threshold for better results
train_pred_labels_xgb = (train_preds_xgb > threshold).astype(int)

# Calculate precision, recall, and F1 score
precision_xgb, recall_xgb, f1_xgb, _ = precision_recall_fscore_support(train_labels, train_pred_labels_xgb, average='binary')
print(f'XGBoost Precision: {precision_xgb}')
print(f'XGBoost Recall: {recall_xgb}')
print(f'XGBoost F1 Score: {f1_xgb}')

# Display a few predictions
train_results_xgb = pd.DataFrame({'order_id': df_to_train['order_id'], 'product_id': df_to_train['product_id'], 'actual': train_labels, 'predicted': train_pred_labels_xgb})
print(train_results_xgb.head())

# Generate predictions for the test set
test_preds_xgb = xgb_model.predict_proba(df_to_test[features_to_use])[:, 1]

# Add predictions to test DataFrame
df_to_test['pred'] = test_preds_xgb

# Prepare submission DataFrame
d_xgb = dict()
for row in df_to_test.itertuples():
    if row.pred > threshold:
        try:
            d_xgb[row.order_id] += ' ' + str(row.product_id)
        except:
            d_xgb[row.order_id] = str(row.product_id)

for order in test.order_id:
    if order not in d_xgb:
        d_xgb[order] = 'None'

sub_xgb = pd.DataFrame.from_dict(d_xgb, orient='index')

sub_xgb.reset_index(inplace=True)
sub_xgb.columns = ['order_id', 'products']
sub_xgb.to_csv('./submission_xgb.csv', index=False)

XGBoost Log Loss: 0.3010011465808673
XGBoost Precision: 0.1740687744578137
XGBoost Recall: 0.5512847118326689
XGBoost F1 Score: 0.2645922463454238
   order_id  product_id  actual  predicted
0   1187899       17122       0          1
1   1187899         196       1          1
2   1187899       26405       1          1
3   1187899       46149       1          1
4   1187899       14084       0          1
