In [1]:
from __future__ import division
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import numpy as np


# Load the train features, train labels and test features (in that order),
# then preview the first three training rows.
X_train, y_train, X_test = map(
    pd.read_csv,
    ("data/X_train.csv", "data/y_train.csv", "data/X_test.csv"),
)
X_train.head(3)

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,3110915,94104,58,3,22,2.0
1,2277131,18463,4,6,21,24.0
2,2251215,104676,56,4,12,9.0


In [2]:
# Load the raw tables; the targets on the left line up one-to-one with the
# CSV paths in the tuple below.
orders, products, orders_priors, departments, aisles = map(
    pd.read_csv,
    (
        'data/orders.csv',
        'data/products.csv',
        'data/order_products__prior.csv',
        'data/departments.csv',
        'data/aisles.csv',
    ),
)

In [3]:
# One-hot encode the day-of-week and hour-of-day columns.
# Each source column is replaced by indicator columns named "<prefix>_<value>".
dummy_prefix = {'order_dow': "d", 'order_hour_of_day': "h"}
X_train = pd.get_dummies(
    X_train,
    columns=['order_dow', 'order_hour_of_day'],
    prefix=dummy_prefix,
)
X_train.head(3)

Unnamed: 0,order_id,user_id,order_number,days_since_prior_order,d_0,d_1,d_2,d_3,d_4,d_5,...,h_14,h_15,h_16,h_17,h_18,h_19,h_20,h_21,h_22,h_23
0,3110915,94104,58,2.0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1,2277131,18463,4,24.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,2251215,104676,56,9.0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Preview the first two rows of the raw orders table.
orders.head(n=2)

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0


In [5]:
# Keep only the products whose department_id is 7 and collect their product ids
# for the membership tests further down.
in_department_7 = products['department_id'] == 7
products_bev = products.loc[in_department_7]
products_bev_id = list(products_bev['product_id'].values)
products_bev.head(2)

Unnamed: 0,product_id,product_name,aisle_id,department_id
2,3,Robust Golden Unsweetened Oolong Tea,94,7
6,7,Pure Coconut Water With Orange,98,7


In [6]:
# Prior-order lines whose product belongs to department 7, plus the order ids
# those lines come from (one entry per matching line, so ids can repeat).
bev_line_mask = orders_priors['product_id'].isin(products_bev_id)
orders_priors_bev = orders_priors.loc[bev_line_mask]
orders_prior_id_bev = list(orders_priors_bev['order_id'].values)
orders_priors_bev.head(3)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
27,4,25146,11,1
28,4,32645,12,1
29,4,41276,13,1


In [7]:
# Orders that contain at least one department-7 product line.
has_bev_line = orders['order_id'].isin(orders_prior_id_bev)
orders_bev = orders.loc[has_bev_line]
# Spot-check a single order id.
orders_bev[orders_bev['order_id'] == 3421081]

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
1907047,3421081,117076,1,3,11,


In [8]:
# Number of department-7 orders per user, as a two-column frame
# (user_id, order_count).
count_ord_bev = (
    orders_bev
    .groupby('user_id')['order_id']
    .count()
    .reset_index()
    .rename(columns={'order_id': 'order_count'})
)
count_ord_bev.head(2)

Unnamed: 0,user_id,order_count
0,1,10
1,2,6


In [9]:
# Attach each user's department-7 order count to the training rows.
# The left join keeps every training row; users absent from count_ord_bev get NaN.
X_train = pd.merge(X_train, count_ord_bev, how='left', on='user_id')

In [10]:
# Replace every remaining NaN in the feature frame with 0.0.
# NOTE(review): this applies to all columns, not only order_count; confirm
# that 0 is the intended fill for any other column that may hold NaN.
X_train = X_train.fillna(value=0.0)

In [11]:
# Baseline: hold out 33% of the training rows, fit a logistic regression on the
# rest, and report accuracy on the held-out part.
# random_state pins the shuffle so the printed score is the same on every
# Restart & Run All instead of drifting with each split.
# NOTE(review): X_train still carries 'Unnamed: 0', order_id and user_id as
# features; confirm that feeding identifiers to the model is intended.
X_example_train, X_example_test, y_example_train, y_example_test = train_test_split(
    X_train, y_train, test_size=0.33, random_state=42)
logreg = LogisticRegression()
logreg.fit(X_example_train, y_example_train["category"])
y_pred = logreg.predict(X_example_test)
print("logreg",accuracy_score(y_example_test["category"], y_pred))

('logreg', 0.51578137028483451)


In [None]:
# Per-order basket features, broadcast back onto every product line of the order.

# size_of_order: the largest add_to_cart_order value within the order.
# NOTE(review): treated downstream as the basket size, which assumes cart
# positions run 1..n without gaps; confirm against the data.
orders_priors['size_of_order'] = (
    orders_priors.groupby('order_id')['add_to_cart_order'].transform('max')
)

# total_beverages: number of lines in the order whose product_id is in
# products_bev_id. The membership test runs once over the whole column and the
# 0/1 flags are summed per order with the built-in 'sum' aggregation, rather
# than calling a Python lambda (and re-running isin) for every order_id group.
is_bev_line = orders_priors['product_id'].isin(products_bev_id).astype('int64')
orders_priors['total_beverages'] = (
    is_bev_line.groupby(orders_priors['order_id']).transform('sum')
)

In [None]:
# Share of each order's lines that are beverages, rounded to two decimals.
# Orders with no beverage lines keep their total_beverages value (i.e. 0)
# instead of the computed ratio.
bev_lines = orders_priors['total_beverages']
basket_lines = orders_priors['size_of_order']
bev_ratio = np.around(bev_lines / basket_lines, decimals=2)
orders_priors['bev_per_basket'] = np.where(bev_lines < 1, bev_lines, bev_ratio)
orders_priors.head(20)

In [None]:
# Attach the per-order beverage share to `orders`.
#
# orders_priors holds one row per product line, so merging it directly would
# repeat every order once per product and drag in line-level columns.
# bev_per_basket is derived purely from per-order aggregates, so reducing to
# one row per order_id before the merge loses no information.
# (The previous version also called orders.drop(...) without assigning the
# result, so the unwanted columns were never actually removed.)
order_bev_share = (
    orders_priors[['order_id', 'bev_per_basket']]
    .drop_duplicates(subset='order_id')
)

# Dropping any existing bev_per_basket first keeps this cell re-runnable:
# without it a second run would produce bev_per_basket_x / bev_per_basket_y.
orders = (
    orders
    .drop(columns=['bev_per_basket'], errors='ignore')
    .merge(order_bev_share, how='left', on='order_id')
)
orders.head()