# Project Description

The goal of the project is, given a dataset describing users' purchases in an online shop, to predict whether or not a user will purchase a beverage in their next order.

Our task is therefore to perform binary classification (since we are talking about 2 classes) and make predictions  by extracting features from the dataset and choosing the appropriate model.

# Data Analysis and Feature Extraction 

In order to extract features for our model we first need to explore the dataset and consider habits of the users described.

In [None]:
from __future__ import division

from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import randint as sp_randint
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

In [None]:
# load the training features and the labels; the label frame is kept
# without its 'order_id' column
X_train = pd.read_csv("data/X_train.csv")
y_train = pd.read_csv("data/y_train.csv").drop('order_id', axis=1)


In [None]:
orders = pd.read_csv('data/orders.csv')
products = pd.read_csv('data/products.csv')
orders_priors = pd.read_csv('data/order_products__prior.csv')
departments = pd.read_csv('data/departments.csv')
aisles = pd.read_csv('data/aisles.csv')

In [None]:
# number of aisles and departments
num_aisles= aisles.aisle_id.nunique()
num_deps = departments.department_id.nunique()


# Feature Extraction

In [None]:
# convert categorical data to dummy values
X_train = X_train.drop(['order_dow', 'order_hour_of_day'], axis=1)
#X_train= pd.get_dummies(X_train, prefix=["d", "h"], columns=['order_dow', 'order_hour_of_day'])
X_train.head(2)

## Beverages

In [None]:
# restrict the product catalogue to department 7 and record its size
is_dep_7 = products['department_id'] == 7
products_bev = products[is_dep_7]
products_bev_id = list(products_bev['product_id'].values)
# number of distinct aisles / products inside department 7
num_aisles_7 = products_bev['aisle_id'].nunique()
num_bev = products_bev['product_id'].nunique()

In [None]:
#  prior orders of department 7
orders_priors_bev = orders_priors[orders_priors['product_id'].isin(products_bev_id)]
orders_prior_id_bev = list(orders_priors_bev.order_id.values)

In [None]:
# orders of department 7
orders_bev = orders[orders['order_id'].isin(orders_prior_id_bev)]
orders_bev.head(2)

## Orders per User

A user with many orders as well as many orders that involve beverages is more likely to purchase a beverage in the next order. Also a user with a big ratio of beverages orders over orders seems to order beverages frequently and therefore this seems to be a good feature for our model.

In [None]:
# per user: number of orders that contain at least one department-7 product
count_ord_bev = (
    orders_bev.groupby('user_id')['order_id']
    .count()
    .reset_index()
    .rename(columns={'order_id': 'order_count'})
)

In [None]:
# per user: number of orders of any kind
count_ord = (
    orders.groupby('user_id')['order_id']
    .count()
    .reset_index()
    .rename(columns={'order_id': 'orders_count'})
)

In [None]:
count_ord_bev = count_ord_bev.merge(count_ord, on='user_id', how='left')

In [None]:
# calculate ratio of orders with beverages to orders of all deps
count_ord_bev['orders_bev_ratio']= count_ord_bev['order_count']/count_ord_bev['orders_count']
count_ord_bev.head(2)

In [None]:
# feature 2 bev_orders/orders per user
X_train = X_train.merge(count_ord_bev[['user_id', 'orders_bev_ratio']], on='user_id', how='left')
X_train = X_train.fillna(0.0)

In [None]:
# feature 3 beverages orders per user
X_train = X_train.merge(count_ord_bev[['user_id', 'order_count']], on='user_id', how='left')
X_train = X_train.fillna(0.0)

In [None]:
# feature 4 orders per user
# take the totals from count_ord (all users): count_ord_bev only holds users
# with at least one beverage order, so merging from it would leave every
# other user with an order count of 0 after the fill below
X_train = X_train.merge(count_ord[['user_id', 'orders_count']], on='user_id', how='left')
X_train = X_train.fillna(0.0)

## Beverages per basket per user

If a user has a big ratio of beverages per basket this seems to be a good indicator that he is more likely to purchase a beverage in his next order. The same applies for a user with a high average of beverages. 

In [None]:
# calculate size of each order
orders_priors['size_of_order']=orders_priors.groupby('order_id')['add_to_cart_order'].transform('max')

In [None]:
# add size of basket 
orders_priors_bev = orders_priors_bev.merge(orders_priors[['order_id', 'size_of_order']].drop_duplicates(subset=['order_id']), on='order_id', how='left')
orders_bev = orders_bev.merge(orders_priors_bev[['order_id', 'size_of_order']].drop_duplicates(subset=['order_id']), on='order_id', how='left')
orders_bev.head(3)

In [None]:
# calculate total beverages count per order
beverages_count= orders_priors_bev.groupby('order_id')['product_id'].count().reset_index()
beverages_count = beverages_count.rename(columns={'product_id':'beverages_count'})

In [None]:
orders_priors_bev = orders_priors_bev.merge(beverages_count, on='order_id', how='left')
orders_bev = orders_bev.merge(beverages_count, on='order_id', how='left')

In [None]:
# share of each basket that consists of beverages; rows with a beverage
# count below 1 keep that count instead of being divided
bev_count = orders_priors_bev['beverages_count']
basket_size = orders_priors_bev['size_of_order']
orders_priors_bev['bev_per_basket'] = np.where(bev_count < 1, bev_count, bev_count / basket_size)


In [None]:
orders_bev = orders_bev.merge(orders_priors_bev[['order_id', 'bev_per_basket']].drop_duplicates(subset=['order_id']), on='order_id', how='left')

In [None]:
# average beverage per basket per user
bev_per_basket = orders_bev.groupby('user_id')['bev_per_basket'].mean().reset_index()

In [None]:
# feature 5 average beverage per basket per user
X_train = X_train.merge(bev_per_basket, on='user_id', how='left')
X_train  = X_train.fillna(0.0)

In [None]:
# mean beverages count per user
beverages = orders_bev.groupby('user_id')['beverages_count'].mean().reset_index()
beverages = beverages.rename(columns={'beverages_count':'mean_beverage_count'})

In [None]:
# feature 6 mean beverages count per user
X_train = X_train.merge(beverages, on='user_id', how='left')
X_train  = X_train.fillna(0.0)

In [None]:
orders = orders.merge(count_ord, on='user_id', how='left')

In [None]:
orders_bev = orders_bev.merge(count_ord_bev, on='user_id', how='left')

## Size of Order

In [None]:
# feature 7 size of order
X_train = X_train.merge(orders_bev[['user_id','size_of_order' ]].drop_duplicates(subset=['user_id']), on='user_id', how='left')
X_train  = X_train.fillna(0.0)

## Reordered beverages per user

The ratio of reordered beverages per user shows how frequently the user prefers the same beverages and tends to rebuy them which is a good predictor. In this basis we can consider certain features to add to our model. Other than the average of reordered beverages per user, we can consider the reorder beverages per basket meaning how many of the products in an order are reordered beverages at a time. 

In [None]:
# count how many of the beverages are reordered by order_id
reordered_beverages = orders_priors_bev.groupby('order_id')['reordered'].sum().reset_index()
reordered_beverages = reordered_beverages.rename(columns={'reordered':'reordered_bev_count'})

In [None]:
orders_bev = orders_bev.merge(reordered_beverages, on='order_id', how='left')

In [None]:
# calculate beverage reorder ratio for every order
orders_bev['reorder_bev_ratio']= orders_bev['reordered_bev_count']/orders_bev['beverages_count']
orders_bev['reorder_bev_basket_rt']=orders_bev['reordered_bev_count']/orders_bev['size_of_order']
orders_bev.head(2)

In [None]:
# calculate user average ratio of beverages that are reordered
# average the per-order ratio (reordered beverages / beverages in the order)
# instead of the raw reordered count, so the feature really is a ratio
user_reorder_bev_ratio = orders_bev.groupby('user_id')['reorder_bev_ratio'].mean().reset_index()
user_reorder_bev_ratio = user_reorder_bev_ratio.rename(columns={'reorder_bev_ratio':'user_bev_reorder_rt'})

In [None]:
# calculate user average ratio of bev per basket that are reordered
user_reorder_bev_basket_rt = orders_bev.groupby('user_id')['reorder_bev_basket_rt'].mean().reset_index()
user_reorder_bev_basket_rt = user_reorder_bev_basket_rt.rename(columns={'reorder_bev_basket_rt':'user_bev_basket_reorder_rt'})

In [None]:
orders_bev = orders_bev.merge(user_reorder_bev_ratio[['user_id', 'user_bev_reorder_rt']], on='user_id', how='left')
orders_bev = orders_bev.merge(user_reorder_bev_basket_rt[['user_id', 'user_bev_basket_reorder_rt']], on='user_id', how='left')

In [None]:
# feature 8 user average ratio of beverages that are reordered
X_train = X_train.merge(user_reorder_bev_ratio, on='user_id', how='left')
X_train = X_train.fillna(0.0)

In [None]:
# feature 9 user average ratio of bev per basket that are reordered
X_train = X_train.merge(user_reorder_bev_basket_rt, on='user_id', how='left')
X_train = X_train.fillna(0.0)

## Product Reorder per user

Analogous to the ratio of beverages that a user reorders, the ratio of all products that a user reorders is also a measure of habit.

In [None]:
# count how many of the products are reordered by order_id
reordered_prod = orders_priors.groupby('order_id')['reordered'].sum().reset_index()
reordered_prod = reordered_prod.rename(columns={'reordered':'reordered_prod_count'})

In [None]:
product_count= orders_priors.groupby('order_id')['product_id'].count().reset_index()
product_count = product_count.rename(columns={'product_id':'product_count'})
orders = orders.merge(product_count, on='order_id', how='left')

In [None]:
orders = orders.merge(reordered_prod, on='order_id', how='left')

In [None]:
orders['reorder_ratio_all']= np.where(orders['reordered_prod_count']<1, orders['reordered_prod_count'], orders['reordered_prod_count']/orders['product_count'])

In [None]:
orders_bev = orders_bev.merge(orders[['order_id', 'reordered_prod_count', 'reorder_ratio_all']].drop_duplicates(subset=['order_id']), on='order_id', how='left')

In [None]:
# per-user mean of the per-order product reorder ratio; the raw
# 'reordered_prod_count' is a count, not the ratio this feature is named after
user_reorder_ratio = orders_bev.groupby('user_id')['reorder_ratio_all'].mean().reset_index()
user_reorder_ratio = user_reorder_ratio.rename(columns={'reorder_ratio_all':'user_prod_reorder_rt'})

In [None]:
orders_bev = orders_bev.merge(user_reorder_ratio[['user_id', 'user_prod_reorder_rt']], on='user_id', how='left')

In [None]:
# feature 10 average user product reorder ratio 
X_train = X_train.merge(user_reorder_ratio, on='user_id', how='left')
X_train = X_train.fillna(0.0)

## Average interval between days 

The frequency in days the user purchases beverages and products as well as the ratio between those seems to provide information about the user's behaviour

In [None]:
# average interval between buying beverages per user
average_order_days_bev = orders_bev.groupby('user_id')['days_since_prior_order'].mean().reset_index()
average_order_days_bev = average_order_days_bev.rename(columns={'days_since_prior_order':'avg_bev_days_since_prior'})

In [None]:
# average interval between buying products per user
avg_order_days = orders.groupby('user_id')['days_since_prior_order'].mean().reset_index()
avg_order_days = avg_order_days.rename(columns={'days_since_prior_order':'avg_days_since_prior'})

In [None]:
average_order_days_bev = average_order_days_bev.merge(avg_order_days, on='user_id', how='left')

In [None]:
# average interval ratio bev/prod
average_order_days_bev['avg_days_bev']=average_order_days_bev['avg_bev_days_since_prior']/average_order_days_bev['avg_days_since_prior']

In [None]:
# feature 11 average interval ratio bev/prod
X_train = X_train.merge(average_order_days_bev[['user_id', 'avg_days_bev']], on='user_id', how='left')
X_train = X_train.fillna(0.0)

In [None]:
# feature 12 average interval between buying products per user
X_train = X_train.merge(average_order_days_bev[['user_id', 'avg_days_since_prior']], on='user_id', how='left')
X_train = X_train.fillna(0.0)

## Reorder mean per days since prior

Relative to the previous feature, this feature aims to give us information on whether there is a relationship between the days that have passed since the prior order and the reordered ratio of products.

In [None]:
# calculate average mean reorder per days since prior order
orders_prod_days = orders_bev.merge(orders_priors_bev, on='order_id', how='left')
grouped_days_df = orders_prod_days.groupby('days_since_prior_order')['reordered'].mean().reset_index()
grouped_days_df = grouped_days_df.rename(columns={'reordered':'days_reorder'})

In [None]:
orders_bev = orders_bev.merge(grouped_days_df, on='days_since_prior_order', how='left')

In [None]:
# user average mean reorder per days since prior order
user_days_reorder = orders_bev.groupby('user_id')['days_reorder'].mean().reset_index()
user_days_reorder = user_days_reorder.rename(columns={'days_reorder':'user_days_reorder'})

In [None]:
# feature 13 user average mean reorder per days since prior order
X_train = X_train.merge(user_days_reorder, on='user_id', how='left')
X_train = X_train.fillna(0.0)

## Department Ratio


The department ratio indicates whether the user shops from many departments in each order he places. If he does that makes him more likely to include beverages in his next order. 

In [None]:
# create department columns in prior orders frame
orders_priors = orders_priors.merge(products[['product_id', 'department_id']], on = 'product_id', how='left')
unique_deps = orders_priors.groupby('order_id')['department_id'].nunique().reset_index()
unique_deps = unique_deps.rename(columns={'department_id':'unique_deps'})
orders_priors = orders_priors.merge(unique_deps, on='order_id', how='left')

In [None]:
# calculate the ratio of how many departments did the user shop from in relation to total departments per order
orders_priors['department_ratio']= orders_priors['unique_deps']/num_deps


In [None]:
orders_priors_bev = orders_priors_bev.merge(orders_priors[['order_id', 'department_ratio']].drop_duplicates(subset=['order_id']), on='order_id', how='left')
orders_bev = orders_bev.merge(orders_priors_bev[['order_id','department_ratio']].drop_duplicates(subset=['order_id']), on='order_id', how='left')

In [None]:
# calculate average user department ratio
user_department_ratio = orders_bev.groupby('user_id')['department_ratio'].mean().reset_index()
user_department_ratio = user_department_ratio.rename(columns={'department_ratio':'user_department_ratio'})

In [None]:
# feature14 average user department ratio
X_train = X_train.merge(user_department_ratio, on='user_id', how='left')
X_train = X_train.fillna(0.0)

## Aisles Ratio

Aisles ratio is also a measure that indicates variety since the more aisles the user roams in every order the more likely he is to have a wide range of products in his basket.

In [None]:
# create aisles columns in prior orders frame
orders_priors = orders_priors.merge(products[['product_id', 'aisle_id']], on='product_id', how='left')
unique_aisles = orders_priors.groupby('order_id')['aisle_id'].nunique().reset_index()
unique_aisles = unique_aisles.rename(columns={'aisle_id':'unique_aisles'})
orders_priors = orders_priors.merge(unique_aisles, on='order_id', how='left')

In [None]:
# attach the aisle id of every purchased beverage to the beverage order lines
orders_priors_bev = orders_priors_bev.merge(products[['product_id', 'aisle_id']], on='product_id', how='left')

In [None]:
# calculate the ratio of how many aisles did the user shop from in relation to total aisles per order
orders_priors['aisles_ratio']= orders_priors['unique_aisles']/num_aisles

In [None]:
# calculate average user aisle ratio
orders_priors_bev = orders_priors_bev.merge(orders_priors[['order_id','aisles_ratio']].drop_duplicates(subset=['order_id']), on='order_id', how='left')
orders_bev = orders_bev.merge(orders_priors_bev[['order_id','aisles_ratio']].drop_duplicates(subset=['order_id']), on='order_id', how='left')
user_aisles_ratio = orders_bev.groupby('user_id')['aisles_ratio'].mean().reset_index()
user_aisles_ratio = user_aisles_ratio.rename(columns={'aisles_ratio':'user_aisles_ratio'})

In [None]:
# feature 15 average user aisle ratio
X_train = X_train.merge(user_aisles_ratio, on='user_id', how='left')
X_train = X_train.fillna(0.0)

## Add to cart early per user

We now consider the relation between the order in which the user places a product in the cart with the possibility that this product is reordered. To do so, we can plot the average reorder ratio per add to cart order.

In [None]:
# work on a copy and pool every cart position above 60 into the value 60
reorder_add = orders_priors_bev.copy()
reorder_add['add_to_cart_order'] = np.where(reorder_add['add_to_cart_order']>60, 60, reorder_add['add_to_cart_order'])

# mean of the 'reordered' flag for every (capped) cart position
grouped_df = reorder_add.groupby(["add_to_cart_order"])["reordered"].aggregate("mean").reset_index()
plt.figure(figsize=(12,8))
# x and y are passed by keyword: recent seaborn releases reject positional data vectors
sns.pointplot(x=grouped_df['add_to_cart_order'].values, y=grouped_df['reordered'].values, alpha=0.5, color='red')
plt.ylabel('Reorder ratio', fontsize=12)
plt.xlabel('Add to cart order', fontsize=12)

plt.title("Add to cart reorder ratio", fontsize=15)
plt.xticks(rotation='vertical')
plt.savefig('Add to cart reorder ratio.png')
plt.show()

According to the diagram, there is a relation between the position at which the customer adds a beverage to the cart and the probability that it is a reorder. We can assume that this reasonably holds for all products: in an e-shop we tend to add the products we usually buy first, whereas in a physical store the order in which products go into the cart depends on how the aisles are laid out.

In [None]:
# calculate average reorder per add to cart order
grouped_df = orders_priors_bev.groupby(["add_to_cart_order"])["reordered"].aggregate("mean").reset_index()
grouped_df = grouped_df.rename(columns={'reordered':'prob_reorder_add_to_cart'})
orders_priors_bev = orders_priors_bev.merge(grouped_df, on='add_to_cart_order', how='left')

In [None]:
orders_prod_df = orders_bev.merge(orders_priors_bev, on='order_id', how='left')
prob_add = orders_prod_df.groupby('user_id')['prob_reorder_add_to_cart'].mean().reset_index()
orders_bev = orders_bev.merge(prob_add.drop_duplicates(subset=['user_id']), on='user_id', how='left')

In [None]:
user_prob_reorder_add_to_cart = orders_bev.groupby('user_id')['prob_reorder_add_to_cart'].mean().reset_index()
user_prob_reorder_add_to_cart = user_prob_reorder_add_to_cart.rename(columns={'prob_reorder_add_to_cart':'user_prob_reorder_add_to_cart'})

In [None]:
# feature 16 user average reorder per add to cart order
X_train = X_train.merge(user_prob_reorder_add_to_cart, on='user_id', how='left')
X_train = X_train.fillna(0.0)

## Unique Beverages 

How many of the beverages has the user purchased before?

In [None]:
# calculate how many of the unique beverages code the user has shopped from
orders_products_df = orders_bev.merge(orders_priors_bev, on='order_id', how='left')
bev_rt = orders_products_df.groupby('user_id')['product_id'].nunique().reset_index()
bev_rt = bev_rt.rename(columns={'product_id':'bev_unique'})
orders_bev = orders_bev.merge(bev_rt, on='user_id', how='left')
orders_bev['bev_rt']= orders_bev['bev_unique']/num_bev

In [None]:
# feature 18 unique beverages ratio
X_train = X_train.merge(orders_bev[['user_id', 'bev_rt']].drop_duplicates(subset=['user_id']), on='user_id', how='left')
X_train = X_train.fillna(0.0)

## Reorder Bev vs Reorder Prod

The ratio of reordered beverages to reordered products can indicate whether the user's tendency to reorder beverages is stronger than their tendency to reorder other products.

In [None]:
orders_bev['bev_prod_reorder_rt']= orders_bev['reordered_bev_count']/orders_bev['reordered_prod_count']

In [None]:
# feature 19 beverages reordered versus product reordered ratio
X_train = X_train.merge(orders_bev[['user_id', 'bev_prod_reorder_rt']].drop_duplicates(subset=['user_id']), on='user_id', how='left')
X_train = X_train.fillna(0.0)

## Days 

In [None]:
# count on how many distinct weekdays each user has placed beverage orders
# and express it as a share of the 7 days of the week
days_rt = orders_bev.groupby('user_id')['order_dow'].nunique().reset_index()
days_rt = days_rt.rename(columns={'order_dow':'shopping_days'})
orders_bev = orders_bev.merge(days_rt, on='user_id', how='left')
orders_bev['days_rt']= orders_bev['shopping_days']/7

In [None]:
# feature 20 days ratio per user
X_train = X_train.merge(orders_bev[['user_id','days_rt']].drop_duplicates(subset=['user_id']), on='user_id', how='left')
X_train = X_train.fillna(0.0)

## Hours

In [None]:
# count in how many distinct hours of the day each user has placed beverage
# orders and express it as a share of the 24 hours
hours_rt = orders_bev.groupby('user_id')['order_hour_of_day'].nunique().reset_index()
hours_rt = hours_rt.rename(columns={'order_hour_of_day':'shopping_hours'})
orders_bev = orders_bev.merge(hours_rt, on='user_id', how='left')
orders_bev['hours_rt']= orders_bev['shopping_hours']/24

In [None]:
# feature 21 hours ratio per user
X_train = X_train.merge(orders_bev[['user_id','hours_rt']].drop_duplicates(subset=['user_id']), on='user_id', how='left')
X_train = X_train.fillna(0.0)

## Aisles of department 7

Calculate the share of the aisles of department 7 that the user has shopped from.

In [None]:
orders_aisles = orders_bev.merge(orders_priors_bev[['order_id', 'aisle_id']], on='order_id', how='left')

In [None]:
aisles_7 = orders_aisles.groupby('user_id')['aisle_id'].nunique().reset_index()
aisles_7 = aisles_7.rename(columns={'aisle_id':'unique_aisles_7'})
num_aisles_7 = products_bev.aisle_id.nunique()

In [None]:
orders_bev = orders_bev.merge(aisles_7, on='user_id', how='left')
orders_bev['aisles_7_ratio']=orders_bev['unique_aisles_7']/num_aisles_7

In [None]:
# feature 22 unique aisles of dep 7 per user
X_train = X_train.merge(orders_bev[['user_id','aisles_7_ratio']].drop_duplicates(subset=['user_id']), on='user_id', how='left')
X_train = X_train.fillna(0.0)

## Departments of User

In total how many of the departments has the user shopped from.

In [None]:
orders_deps = orders.merge(orders_priors[['order_id', 'department_id']], on='order_id', how='left')

In [None]:
# calculate from how many unique departments has the user shopped from 
deps = orders_deps.groupby('user_id')['department_id'].nunique().reset_index()
deps = deps.rename(columns={'department_id':'unique_dpts'})

In [None]:
orders_bev = orders_bev.merge(deps, on='user_id', how='left')
orders_bev['dpts_ratio']=orders_bev['unique_dpts']/num_deps

In [None]:
# feature 23 unique departments per user
X_train = X_train.merge(orders_bev[['user_id','dpts_ratio']].drop_duplicates(subset=['user_id']), on='user_id', how='left')
X_train = X_train.fillna(0.0)

In [None]:
X_train = X_train.drop(['order_id','user_id'], axis=1)

In [None]:
X_train.shape

## X_test 

In [None]:
X_test=pd.read_csv("data/X_test.csv")

In [None]:
X_test = X_test.drop(['order_dow', 'order_hour_of_day'], axis=1)

In [None]:
# Attach the per-user features to X_test, in the same order in which they
# were added to X_train. Features 3, 4, 7, 16, 18 and 19 are left out of the
# test set (their merges were commented out).
test_feature_frames = [
    # feature 2 bev_orders/orders per user
    count_ord_bev[['user_id', 'orders_bev_ratio']],
    # feature 5 average beverage per basket per user
    bev_per_basket,
    # feature 6 mean beverages count per user
    beverages,
    # feature 8 user average ratio of beverages that are reordered
    user_reorder_bev_ratio,
    # feature 9 user average ratio of bev per basket that are reordered
    user_reorder_bev_basket_rt,
    # feature 10 average user product reorder ratio
    user_reorder_ratio,
    # feature 11 average interval ratio bev/prod
    average_order_days_bev[['user_id', 'avg_days_bev']],
    # feature 12 average interval between buying products per user
    average_order_days_bev[['user_id', 'avg_days_since_prior']],
    # feature 13 user average mean reorder per days since prior order
    user_days_reorder,
    # feature 14 average user department ratio
    user_department_ratio,
    # feature 15 average user aisle ratio
    user_aisles_ratio,
    # feature 20 days ratio per user
    orders_bev[['user_id','days_rt']].drop_duplicates(subset=['user_id']),
    # feature 21 hours ratio per user
    orders_bev[['user_id','hours_rt']].drop_duplicates(subset=['user_id']),
    # feature 22 unique aisles of dep 7 per user
    orders_bev[['user_id','aisles_7_ratio']].drop_duplicates(subset=['user_id']),
    # feature 23 unique departments per user
    orders_bev[['user_id','dpts_ratio']].drop_duplicates(subset=['user_id']),
]

for feature_frame in test_feature_frames:
    # left merge keeps every test row; missing values are filled with 0.0
    X_test = X_test.merge(feature_frame, on='user_id', how='left')
    X_test = X_test.fillna(0.0)

In [None]:
X_test.shape

In [None]:
X_test = X_test.drop(['order_id','user_id'], axis=1)

# Feature Selection 

We now need to establish how many of the features we extracted previously are useful. This can be achieved with, among others, two methods: decision trees and the correlation between the features.

In [None]:
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesClassifier
import seaborn as sns


## Decision Tree

In [None]:
# Fit an extra-trees ensemble on a hold-out split and rank the features by importance
X_tr, X_tst, y_tr, y_tst = train_test_split(X_train, y_train, test_size=0.33)
forest = ExtraTreesClassifier(n_estimators=100, random_state=0)
forest.fit(X_tr, y_tr["category"])
importances = forest.feature_importances_
# spread of every feature's importance across the individual trees
per_tree_importances = [tree.feature_importances_ for tree in forest.estimators_]
std = np.std(per_tree_importances, axis=0)
# feature positions sorted from most to least important
indices = np.argsort(importances)[::-1]
n_features = X_tr.shape[1]

# Print the feature ranking
print("Feature ranking:")

for rank, feature_idx in enumerate(indices[:n_features], start=1):
    print("%d. feature %d (%f)" % (rank, feature_idx, importances[feature_idx]))

# Plot the feature importances of the forest (error bars: std across trees)
positions = range(n_features)
plt.figure(figsize=(12,8))
plt.title("Feature importances")
plt.bar(positions, importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(positions, indices)
plt.xlim([-1, n_features])
plt.savefig('Feature_importances 23 features.png')
plt.show()

# accuracy of the same forest on the held-out part
y_pred = forest.predict(X_tst)
print("extratrees",accuracy_score(y_tst["category"], y_pred))

We can determine that feature 18 (unique beverages per user) is worthless and therefore remove it. We can also remove feature 3 (beverage orders per user) and feature 19 (beverages reordered versus products reordered ratio), as they don't really contribute to the model.

# Correlated Features

In [None]:
# pairwise correlation between all remaining training features
features_corr = X_train.corr()
column_labels = features_corr.columns
fig, ax = plt.subplots(figsize=(12,12))         # figure size in inches
# annotated heatmap of the correlation matrix, saved to disk
sns.heatmap(features_corr,
            xticklabels=column_labels,
            yticklabels=column_labels,
            annot=True,
            linewidths=.5)
plt.savefig("corrrelation.png")


From the correlation matrix we can determine that feature 16 (user average of beverages added early in the cart) is highly correlated with feature 5 (beverages per basket), and that feature 7 (size of order) is highly correlated with feature 15 (aisles ratio). We can therefore remove features 16 and 7, since they are also lower in the feature importance rank. Another highly correlated pair is feature 4 (orders count) and feature 1 (order number). Since order number is higher in the feature importance rank, we drop feature 4 as well.

In [None]:
# drop the features judged redundant above; 'user_prob_reorder_add_to_cart'
# (feature 16) has to go as well because it is never merged into X_test,
# otherwise the training and test matrices end up with different columns
X_train = X_train.drop(['order_count', 'bev_rt', 'bev_prod_reorder_rt', 'size_of_order', 'user_prob_reorder_add_to_cart'], axis=1)
X_train.shape

In [None]:
X_train = X_train.drop(['orders_count'], axis=1)

The features that are finally selected are the following:
- order number
- days since prior order
- orders beverages ratio
- beverages per basket ratio
- mean beverages count per user
- user beverage per basket reorder ratio
- user beverage reorder ratio
- user product reorder ratio
- average days between beverages orders
- average days between orders
- average reorder per days since prior order
- user department ratio
- user aisles ratio
- days ratio
- hours ratio
- aisles of department 7 ratio
- unique departments ratio

We consider 4 different classifiers for the task

- Logistic Regression Classifier
- Xgboost Classifier
- Random Forests Classifier
- Neural Network

## Logistic Regression Classifier

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing

In [None]:
# Train-test evaluate
X_tr, X_tst, y_tr, y_tst = train_test_split(X_train, y_train, test_size=0.33)

# learn mean/variance on the training part only and apply the same transform
# to the hold-out part; scaling each part on its own would standardise them
# with different statistics
scaler = preprocessing.StandardScaler().fit(X_tr)
X_tr_scaled = scaler.transform(X_tr)
X_tst_scaled = scaler.transform(X_tst)
logreg = LogisticRegression()
logreg.fit(X_tr_scaled, y_tr["category"])
y_pred = logreg.predict(X_tst_scaled)
print("logreg",accuracy_score(y_tst["category"], y_pred))

In [None]:
# fit the scaler on the full training set and reuse it for the test set so
# that both are standardised with the same mean/variance
full_scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = full_scaler.transform(X_train)
X_test_scaled = full_scaler.transform(X_test)
model_logreg = LogisticRegression()
model_logreg.fit(X_train_scaled, y_train["category"])

In [None]:
# predict with the model fitted on the full training set (model_logreg), not
# the one fitted on the evaluation split; the predictions are kept in their
# own Series so that X_test stays a pure feature matrix for the later models
logreg_pred = pd.Series(model_logreg.predict(X_test_scaled), name='category').astype(int)
X_tmp = pd.read_csv("data/X_test.csv")
submission = pd.concat([X_tmp['order_id'], logreg_pred], axis=1)
submission.to_csv("logreg_submission.csv",index=False)

## XgBoost Classifier

In [None]:
import xgboost

In [None]:
# hold-out evaluation of an XGBoost classifier with default settings
X_tr, X_tst, y_tr, y_tst = train_test_split(X_train, y_train, test_size=0.33)
xgb = xgboost.XGBClassifier()
xgb.fit(X_tr, y_tr["category"])
y_pred_xgb = xgb.predict(X_tst)
xgb_accuracy = accuracy_score(y_tst["category"], y_pred_xgb)
print("xgboost",xgb_accuracy)

In [None]:
# refit on the full training set for the submission
xgb = xgboost.XGBClassifier()
xgb.fit(X_train, y_train["category"])
# predict on the feature columns only: another submission cell may have left
# a 'category' column in X_test, which the model was not trained on
xgb_features = X_test.drop(['category'], axis=1, errors='ignore')
xgb_pred = pd.Series(xgb.predict(xgb_features), name='category').astype(int)
X_tmp = pd.read_csv("data/X_test.csv")
submission = pd.concat([X_tmp['order_id'], xgb_pred], axis=1)
submission.to_csv("xgboost_submission.csv",index=False)

In [None]:
submission_score_xgb = 0.617448652

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Train-test evaluate
# hold out 33% of the rows for evaluation
X_tr, X_tst, y_tr, y_tst = train_test_split(X_train, y_train, test_size=0.33)
# base estimator (20 trees) handed to the hyper-parameter searches below
clf = RandomForestClassifier(n_estimators=20)
def report(results, n_top=3):
    """Print the best-ranked candidates of a hyper-parameter search.

    ``results`` is a ``cv_results_``-style mapping providing
    'rank_test_score', 'mean_test_score', 'std_test_score' and 'params'.
    For every rank from 1 to ``n_top`` each candidate holding that rank is
    printed with its mean score, score std and parameters.
    """
    ranks = results['rank_test_score']
    for rank in range(1, n_top + 1):
        # ties share a rank, so several candidates may be printed per rank
        for idx in np.flatnonzero(ranks == rank):
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(mean_score, std_score))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")


# specify parameters and distributions to sample from
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(2, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)

start = time()
# search on the training split created at the top of this cell; the names
# X and y that were used here are not defined anywhere in the notebook
random_search.fit(X_tr, y_tr['category'])
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)

# use a full grid over all parameters
# every combination of the values below is tried
param_grid = {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid)
# wall-clock timing of the whole search
start = time()
grid_search.fit(X_tr, y_tr['category'])

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
# print the top-ranked parameter settings
report(grid_search.cv_results_)

In [None]:
# fit the final forest on the full training set
random_forest = RandomForestClassifier(n_estimators=250, n_jobs=8)
random_forest.fit(X_train, y_train["category"])
# predict on the feature columns only: another submission cell may have left
# a 'category' column in X_test, which the model was not trained on
rf_features = X_test.drop(['category'], axis=1, errors='ignore')
rf_pred = pd.Series(random_forest.predict(rf_features), name='category').astype(int)
X_tmp = pd.read_csv("data/X_test.csv")
submission = pd.concat([X_tmp['order_id'], rf_pred], axis=1)
submission.to_csv("rand_for_submission.csv",index=False)

# Neural Network

We consider a simple neural network since our number of features is not that high to justify a deep learning approach. The neural network is comprised of the input layer, a middle layer with 17 units (same as the number of features) and tanh as activation and the output layer with a sigmoid function as activation. 

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout

In [None]:
X_tr, X_tst, y_tr, y_tst = train_test_split(X_train, y_train, test_size=0.33)

In [None]:
# one hidden tanh layer as wide as the input, followed by a sigmoid output unit
# the width follows the data instead of a hard-coded 17
n_features = X_tr.shape[1]
model = Sequential()
# `kernel_initializer` is the Keras 2 name of the legacy `init` argument
model.add(Dense(n_features, input_dim=n_features, kernel_initializer=keras.initializers.RandomNormal(mean=0.0, stddev=0.05, seed=None), activation='tanh'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer= 'adam', metrics=['accuracy'])
# fit on the training split only (full-batch), so that the evaluation on
# X_tst / y_tst measures performance on rows the network has not seen
model.fit(X_tr, y_tr["category"], epochs=5000, batch_size=X_tr.shape[0])

In [None]:
scores = model.evaluate(X_tst, y_tst)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

In [None]:
# feed only the feature columns to the network: a 'category' column left in
# X_test by an earlier submission cell would change the input width
nn_features = X_test.drop(['category'], axis=1, errors='ignore')
# predict returns one sigmoid output per row as a column vector; flatten it
# and round to obtain the 0/1 class
X_test['category'] = np.round(model.predict(nn_features).ravel()).astype(int)

In [None]:
X_tmp = pd.read_csv("data/X_test.csv")

In [None]:
submission = pd.concat([X_tmp['order_id'], X_test['category']], axis=1)

In [None]:
submission.to_csv("neuralnet_submission.csv",index=False)

In [None]:
submission_score_nn = 0.618340827681

The best accuracy score among the submission files was achieved by the neural network: 61.83%.