In [None]:
import numpy as np 
import pandas as pd 

import gc
gc.enable() 

In [None]:
# Read the four source CSVs into memory and normalise two `orders` columns.
# NOTE(review): PATH is not defined in any visible cell — confirm it is set
# (e.g. in a config cell) before this cell runs.
print('loading files ...')
order_products_prior = pd.read_csv(PATH + 'order_products__prior.csv')
order_products_train = pd.read_csv(PATH + 'order_products__train.csv')
products = pd.read_csv(
    PATH + 'products.csv',
    usecols=['product_id', 'aisle_id', 'department_id'],
)
orders = pd.read_csv(PATH + 'orders.csv')

# Encode the split label as an integer: prior -> 0, train -> 1, test -> 2.
orders['eval_set'] = orders['eval_set'].replace({'prior': 0, 'train': 1, 'test':2})
# Missing gaps between orders are filled with 30.
orders['days_since_prior_order'] = orders['days_since_prior_order'].fillna(30)
print('done loading')

In [None]:
# Attach the order-level columns to every prior order line (inner join on order_id).
prior_orders = orders.merge(order_products_prior, how='inner', on='order_id')

# The raw prior table is fully contained in the join result; release it.
del order_products_prior
gc.collect()

In [None]:
# Per-user feature: the highest order_number seen in the prior orders,
# used as the number of orders the user has placed.
users = (
    prior_orders
    .groupby('user_id')['order_number']
    .max()
    .to_frame('u_num_of_orders')
    .reset_index()
)
users.head()

In [None]:
# Per-user feature: average number of products per order.

# Step 1: count the product rows in each (user, order) pair.
total_prd_per_order = (
    prior_orders
    .groupby(['user_id', 'order_id'])['product_id']
    .count()
    .to_frame('total_products_per_order')
    .reset_index()
)

# Step 2: average those per-order counts for each user.
avg_products = (
    total_prd_per_order
    .groupby('user_id')['total_products_per_order']
    .aggregate('mean')
    .to_frame('u_avg_prd')
    .reset_index()
)

# The per-order counts are only an intermediate; free them.
del total_prd_per_order
gc.collect()

avg_products.head()

In [None]:
# Per-user feature: the order_dow value that occurs most often in the user's
# prior rows.
# NOTE(review): prior_orders holds one row per (order, product) pair, so this
# mode is weighted by basket size rather than counting each order once.

# Kept so that any other cell relying on `stats` keeps working.
from scipy import stats

# pandas' own `Series.mode()` is used instead of `stats.mode(x)[0]`, whose
# return shape (array vs. scalar) depends on the installed SciPy version.
# `Series.mode()` returns the modes sorted ascending, so `.iloc[0]` picks the
# smallest value on ties — the same tie-break scipy applies.
dow = (
    prior_orders
    .groupby('user_id')['order_dow']
    .agg(lambda s: s.mode().iloc[0])
    .to_frame('dow_u_most_orders')
    .reset_index()
)
dow.head()

In [None]:
# Per-user feature: the order_hour_of_day value that occurs most often in the
# user's prior rows.
# NOTE(review): prior_orders holds one row per (order, product) pair, so this
# mode is weighted by basket size rather than counting each order once.

# `Series.mode()` replaces `stats.mode(x)[0]`: it does not depend on the SciPy
# version's return shape and needs no import from another cell. Modes come
# back sorted ascending, so `.iloc[0]` keeps the smallest value on ties.
hod = (
    prior_orders
    .groupby('user_id')['order_hour_of_day']
    .agg(lambda s: s.mode().iloc[0])
    .to_frame('hod_u_most_orders')
    .reset_index()
)
hod.head()

In [None]:
# Per-user feature: mean of the `reordered` flag over the user's prior rows,
# stored as float16.
reorder_u = (
    prior_orders
    .groupby('user_id')['reordered']
    .mean()
    .astype(np.float16)
    .to_frame('u_reorder_ratio')
    .reset_index()
)
reorder_u.head()

In [None]:
# Left-join every per-user feature frame onto `users`, in the order:
# average basket size, favourite day of week, favourite hour, reorder ratio.
for _user_feature in (avg_products, dow, hod, reorder_u):
    users = users.merge(_user_feature, on='user_id', how='left')

# Drop the loop variable too, otherwise it keeps the last frame alive.
del _user_feature

# The individual feature frames now live inside `users`; free them.
del reorder_u, dow, hod, avg_products
gc.collect()

In [None]:
# Per-product features, both computed from a single grouping of prior_orders.
_by_product = prior_orders.groupby('product_id')

# How many prior rows (purchases) each product appears in.
prd = _by_product['order_id'].count().to_frame('p_num_of_times').reset_index()

# Mean of the `reordered` flag for each product.
reorder_p = _by_product['reordered'].mean().to_frame('p_reorder_ratio').reset_index()

del _by_product

In [None]:
# Per-product feature: average add_to_cart_order position.
add_to_cart = (
    prior_orders
    .groupby('product_id')['add_to_cart_order']
    .mean()
    .to_frame('p_avg_cart_position')
    .reset_index()
)

# Collect all per-product features in `prd` (left joins on product_id).
prd = (
    prd
    .merge(reorder_p, how='left', on='product_id')
    .merge(add_to_cart, how='left', on='product_id')
)

# The standalone feature frames are no longer needed.
del reorder_p, add_to_cart
gc.collect()

In [None]:
# User x product features. Leaves two frames for the next cell:
#   uxp       -> user_id, product_id, uxp_times_bought
#   uxp_ratio -> user_id, product_id, uxp_reorder_ratio

# Number of prior rows for each (user, product) pair, i.e. how many times the
# user has bought the product.
uxp = (
    prior_orders
    .groupby(['user_id', 'product_id'])['order_id']
    .count()
    .to_frame('uxp_times_bought')
    .reset_index()
)

# Total orders per user: the highest order_number in the prior rows.
total_orders = (
    prior_orders
    .groupby('user_id')['order_number']
    .max()
    .to_frame('total_orders')
    .reset_index()
)

# Order number in which the user bought the product for the first time.
first_order_num = (
    prior_orders
    .groupby(['user_id', 'product_id'])['order_number']
    .min()
    .to_frame('first_order_num')
    .reset_index()
)

# One row per (user, product) with the user's total order count attached.
span = pd.merge(total_orders, first_order_num, on='user_id', how='right')

# Number of orders from the first purchase onwards.
# The +1 makes the range include the order where the product was first bought.
span['Order_Range_D'] = span.total_orders - span.first_order_num + 1

# Reuse the counts already held in `uxp` rather than repeating the identical
# (user, product) count groupby a second time.
uxp_ratio = pd.merge(uxp, span, on=['user_id', 'product_id'], how='left')

# Share of the orders since first purchase that contained the product.
uxp_ratio['uxp_reorder_ratio'] = uxp_ratio.uxp_times_bought / uxp_ratio.Order_Range_D

# Keep only the keys and the ratio; the other columns were intermediates.
uxp_ratio = uxp_ratio[['user_id', 'product_id', 'uxp_reorder_ratio']]

# Free the intermediates.
del span, first_order_num, total_orders
gc.collect()

In [None]:
# Merging all the created features into the uxp dataset.
# (This sentence was bare prose inside the code cell, which is a SyntaxError.)

# Attach the reorder ratio to the user x product frame.
uxp = uxp.merge(uxp_ratio, on=['user_id', 'product_id'], how='left')

# uxp_ratio is now part of uxp; free it.
del uxp_ratio
gc.collect()

In [None]:
# Keep only the train (1) and test (2) orders: these are the "future" orders
# whose contents are to be predicted.
orders_future = orders.loc[
    orders.eval_set.isin([1, 2]),
    ['user_id', 'eval_set', 'order_id'],
]

# `data` was previously used here without being created by any visible cell
# (NameError on Restart & Run All). Assemble it from the feature frames built
# above: user x product rows, plus per-user and per-product features.
# Building it fresh also makes this cell safe to re-run.
# NOTE(review): `products` (aisle_id / department_id) is loaded but never
# merged in any visible cell — confirm whether it was meant to be a feature.
data = (
    uxp
    .merge(users, on='user_id', how='left')
    .merge(prd, on='product_id', how='left')
    # Tag every row with the user's future order and its split.
    .merge(orders_future, on='user_id', how='left')
)
data.head()

In [None]:
# Training rows: users whose future order belongs to the train split (eval_set 1).
is_train_row = data.eval_set == 1
data_train = data.loc[is_train_row]
del is_train_row

In [None]:
# Bring in the `reordered` label from order_products_train, matched on
# (product_id, order_id). Pairs absent from the train order get NaN.
data_train = pd.merge(
    data_train,
    order_products_train.loc[:, ['product_id', 'order_id', 'reordered']],
    how='left',
    on=['product_id', 'order_id'],
)

In [None]:
# Rows with no match in the train order have a NaN label; treat them as 0.
# Assign the filled column back explicitly: `df.col.fillna(..., inplace=True)`
# is a chained in-place update that does not reach the DataFrame under pandas
# Copy-on-Write.
data_train['reordered'] = data_train['reordered'].fillna(0)

# Index by (user_id, product_id) and drop eval_set / order_id, which are not
# needed for training.
data_train = (
    data_train
    .set_index(['user_id', 'product_id'])
    .drop(columns=['eval_set', 'order_id'])
)

In [None]:
# Test rows: users whose future order belongs to the test split (eval_set 2),
# indexed by (user_id, product_id) and without eval_set / order_id, which are
# not model features.
data_test = (
    data.loc[data.eval_set == 2]
    .set_index(['user_id', 'product_id'])
    .drop(columns=['eval_set', 'order_id'])
)

# Everything needed is now in data_train / data_test; free the source frames.
del data, orders_future, products, order_products_train
gc.collect()

In [None]:

from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import f1_score, classification_report
from scikitplot.metrics import plot_confusion_matrix
from scikitplot.classifiers import plot_feature_importances

import xgboost as xgb
import lightgbm as lgb

In [None]:
# Target is the `reordered` column; every other column is a feature.
y = data_train['reordered']
X = data_train.drop(columns='reordered')

# Hold out 30% of the labelled rows for validation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=10,
)

In [None]:
# Train a LightGBM binary classifier on the training split and evaluate it on
# the held-out split.
lgbm = lgb.LGBMClassifier(objective='binary', num_leaves=96, max_depth=10)
lgbm.fit(X_train, y_train)

# Turn the positive-class probability into a 0/1 label with a 0.21 cut-off
# (the same cut-off is applied to the test predictions in a later cell).
y_pred = (lgbm.predict_proba(X_test)[:, 1] >= 0.21).astype('int')

# Evaluation. The metric functions take (y_true, y_pred) in that order;
# passing them swapped exchanges precision and recall in the report and
# transposes the confusion matrix.
print('F1 Score: {}'.format(f1_score(y_test, y_pred)))
print(classification_report(y_test, y_pred))
plot_confusion_matrix(y_test, y_pred)

In [None]:
#Fitting on entire data.
# Calls fit on the `lgbm` estimator with the full labelled set (X, y) rather
# than only the training split, before it is used to score the test rows.
lgbm.fit(X, y)

In [None]:
# Making predictions on the test dataset.
# Drop any 'prediction' column left behind by an earlier run of this cell, so
# the model is always given the feature columns only and the cell can be
# re-run safely.
y_pred_test = (
    lgbm.predict_proba(data_test.drop(columns='prediction', errors='ignore'))[:, 1] >= 0.21
).astype('int')  # 0.21 is the probability cut-off for predicting a reorder.

# Save the prediction as a new column in data_test.
data_test['prediction'] = y_pred_test

# Move (user_id, product_id) back to columns and keep only what the
# submission needs.
final = data_test.reset_index()[['product_id', 'user_id', 'prediction']]

gc.collect()

# Creating a submission file.
# orders.csv is read again because the in-memory `orders.eval_set` was
# converted to integer codes when the files were first loaded.
orders = pd.read_csv(PATH + 'orders.csv')
orders_test = orders.loc[orders.eval_set == 'test', ['user_id', 'order_id']]

# Attach each test user's order_id to the predictions, then drop user_id,
# which is no longer needed.
final = (
    final
    .merge(orders_test, on='user_id', how='left')
    .drop('user_id', axis=1)
)

In [None]:
# Convert product_id to integer so it is written without a decimal part.
final['product_id'] = final['product_id'].astype(int)

# Remove objects that are no longer needed.
del orders, orders_test
gc.collect()

# Build {order_id: 'pid pid ...'} for every order with at least one positive
# prediction. A grouped join replaces the per-row loop with its bare `except:`,
# which would have hidden any error, not just the expected missing key.
# sort=False keeps orders in order of first appearance, matching the insertion
# order the row loop produced.
predicted = final.loc[final['prediction'] == 1]
d = (
    predicted
    .groupby('order_id', sort=False)['product_id']
    .agg(lambda ids: ' '.join(map(str, ids)))
    .to_dict()
)
del predicted

# Orders with no positive prediction must still appear, with the literal 'None'.
for order in final['order_id'].unique().tolist():
    d.setdefault(order, 'None')

gc.collect()

# To check how the dictionary was populated, inspect `d` in a separate cell.

In [None]:
# Turn the {order_id: products} dictionary into a two-column frame:
# the keys become the index, which is then moved into a regular column.
sub = pd.DataFrame.from_dict(d, orient='index').reset_index()

# Name the columns as the submission file expects.
sub.columns = ['order_id', 'products']

In [None]:
# Write the submission file with a header row and without the row index.
sub.to_csv('sub.csv', index=False)