In [1]:
from __future__ import division
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import numpy as np
import cPickle as pickle

In [2]:
#read train test data
X_train=pd.read_csv("data/X_train.csv")
y_train=pd.read_csv("data/y_train.csv")
X_test=pd.read_csv("data/X_test.csv")

In [3]:
orders = pd.read_csv('data/orders.csv')
products = pd.read_csv('data/products.csv')
orders_priors = pd.read_csv('data/order_products__prior.csv')
departments = pd.read_csv('data/departments.csv')
aisles = pd.read_csv('data/aisles.csv')

In [4]:
# convert categorical data to dummy values
X_train= pd.get_dummies(X_train, prefix=["d", "h"], columns=['order_dow', 'order_hour_of_day'])

In [5]:
# products in department 7
products_bev = products[products['department_id']==7]
products_bev_id = list(products_bev.product_id.values)
products_bev.head(2)

Unnamed: 0,product_id,product_name,aisle_id,department_id
2,3,Robust Golden Unsweetened Oolong Tea,94,7
6,7,Pure Coconut Water With Orange,98,7


In [6]:
# order priors of department 7
orders_priors_bev = orders_priors[orders_priors['product_id'].isin(products_bev_id)]
orders_prior_id_bev = list(orders_priors_bev.order_id.values)

In [7]:
# orders of department 7
orders_bev = orders[orders['order_id'].isin(orders_prior_id_bev)]

In [8]:
# count orders of department 7 for every user
count_ord_bev = orders_bev.groupby('user_id').count()
count_ord_bev = count_ord_bev['order_id']
count_ord_bev= count_ord_bev.reset_index()
count_ord_bev.rename(columns={'order_id':'order_count'}, inplace=True)

In [9]:
X_train = X_train.merge(count_ord_bev, on='user_id', how='left')
X_train = X_train.fillna(0.0)

In [10]:
# beverages per basket per user
#orders_priors['size_of_order']=orders_priors.groupby('order_id')['add_to_cart_order'].transform('max')
#orders_priors.shape
#orders_priors['total_beverages']=orders_priors.groupby('order_id')['product_id'].transform(lambda x: (x.isin(products_bev_id)).sum())

In [11]:
#orders_priors['bev_per_basket']= np.where(orders_priors['total_beverages'] < 1, orders_priors['total_beverages'], np.around(orders_priors['total_beverages']/orders_priors['size_of_order'], decimals=2))


In [12]:
#pickle.dump(orders_priors, open("orders_priors.pkl", 'wb'))

In [13]:
orders_priors = pd.read_pickle("orders_priors.pkl")

MemoryError: 

In [None]:
orders_priors_bev = orders_priors[orders_priors['product_id'].isin(products_bev_id)]
orders_prior_id_bev = list(orders_priors_bev.order_id.values)
orders_bev = orders[orders['order_id'].isin(orders_prior_id_bev)]

orders_bev = orders_bev.merge(orders_priors_bev[['order_id','bev_per_basket']].drop_duplicates(subset=['order_id']), on='order_id', how='left')


In [None]:
bev_per_basket = orders_bev.groupby('user_id')['bev_per_basket'].mean().reset_index()

In [None]:
X_train = X_train.merge(bev_per_basket, on='user_id', how='left')
X_train  = X_train.fillna(0.0)

In [None]:
orders_bev = orders_bev.merge(count_ord_bev, on='user_id', how='left')
orders_bev.head(3)

In [None]:
# count how many of the beverages are reordered by order_id
#orders_priors_bev['reorder_count'] = orders_priors_bev.groupby('order_id')['reordered'].transform(lambda x: (x==1).count())
#orders_priors_bev.to_pickle('orders_priors_bev.pkl')

In [None]:
orders_priors_bev = pd.read_pickle('orders_priors_bev.pkl')

In [None]:
orders_bev = orders_bev.merge(orders_priors_bev[['order_id', 'reorder_count']].drop_duplicates(subset=['order_id']), on='order_id', how='left')
orders_bev['reorder_ratio']= np.around(orders_bev['reorder_count']/orders_bev['order_count'], decimals=3)

In [None]:
user_reorder_ratio = orders_bev.groupby('user_id')['reorder_count'].mean().reset_index()
user_reorder_ratio = user_reorder_ratio.rename(columns={'reorder_count':'user_reorder_rt'})
user_reorder_ratio.head(10)
user_reorder_ratio['user_reorder_rt'] = np.around(user_reorder_ratio['user_reorder_rt'], decimals=3)

In [None]:
X_train = X_train.merge(user_reorder_ratio, on='user_id', how='left')
X_train = X_train.fillna(0.0)

In [None]:
#average interval days per user
average_order_days = orders_bev.groupby('user_id')['days_since_prior_order'].mean().reset_index()
average_order_days.rename(columns={'days_since_prior_order':'avg_days_since_prior'}, inplace=True)

In [None]:
X_train = X_train.merge(average_order_days, on='user_id', how='left')
X_train = X_train.fillna(0.0)

In [None]:
orders_priors = orders_priors.merge(products[['product_id', 'department_id']], on = 'product_id', how='left')
orders_priors.head(10)

In [None]:
unique_deps = orders_priors.groupby('order_id')['department_id'].nunique().reset_index()
unique_deps = unique_deps.rename(columns={'department_id':'unique_deps'})

In [None]:
orders_priors = orders_priors.merge(unique_deps, on='order_id', how='left')

In [None]:
orders_priors['department_ratio']= np.where(orders_priors['total_beverages']<1, orders_priors['total_beverages'], np.around(orders_priors['total_beverages']/orders_priors['unique_deps'], decimals=3))

In [None]:
orders_priors_bev = orders_priors[orders_priors['product_id'].isin(products_bev_id)]
orders_bev = orders_bev.merge(orders_priors_bev[['order_id','department_ratio']].drop_duplicates(subset=['order_id']), on='order_id', how='left')
orders_bev.head(10)

In [None]:
user_department_ratio = orders_bev.groupby('user_id')['department_ratio'].mean().reset_index()
user_department_ratio = user_department_ratio.rename(columns={'department_ratio':'user_department_ratio'})

In [None]:
user_department_ratio['user_department_ratio']= np.around(user_department_ratio['user_department_ratio'], decimals=3)
user_department_ratio.head(10)

In [None]:
X_train = X_train.merge(user_department_ratio, on='user_id', how='left')
X_train = X_train.fillna(0.0)

In [None]:
orders_priors_bev['high_add_to_cart']  = orders_priors_bev.groupby('order_id')['add_to_cart_order'].transform(lambda x: (x<7).sum())

In [None]:
orders_priors_bev['high_add_reorder_ratio']= np.where(orders_priors_bev['high_add_to_cart']<1, orders_priors_bev['high_add_to_cart'], np.around(orders_priors_bev['high_add_to_cart']/orders_priors_bev['size_of_order'], decimals=3))

In [None]:
orders_bev = orders_bev.merge(orders_priors_bev[['order_id', 'high_add_reorder_ratio']].drop_duplicates(subset=['order_id']), on='order_id', how='left')
orders_bev['add_to_cart_reorder_ratio'] = orders_bev['reorder_ratio']*orders_bev['high_add_reorder_ratio']
orders_bev.head(10)

In [None]:
user_high_add_cart = orders_bev.groupby('user_id')['add_to_cart_reorder_ratio'].mean().reset_index()
user_high_add_cart = user_high_add_cart.rename(columns={'add_to_cart_reorder_ratio':'user_high_add_cart'})
user_high_add_cart['user_high_add_cart'] = np.around(user_high_add_cart['user_high_add_cart'], decimals=3)

In [None]:
X_train = X_train.merge(user_high_add_cart, on='user_id', how='left')
X_train = X_train.fillna(0.0)

In [None]:
X_train = X_train.drop(['order_id','user_id'], axis=1)

In [None]:
X_example_train, X_example_test, y_example_train, y_example_test = train_test_split(X_train, y_train, test_size=0.33)
logreg = LogisticRegression()
logreg.fit(X_example_train, y_example_train["category"])
y_pred = logreg.predict(X_example_test)
print("logreg",accuracy_score(y_example_test["category"], y_pred))