# Setup

In [1]:
import os

from Db import DataBase
from Category import Category
from Feature import Feature

import pandas as pd
import numpy as np

from Trainset import Trainset
from Testset import Testset
from Trainer import Trainer

# Load Data

In [2]:
# Load Data
file_path = os.getcwd() + "/../data/"
dbase = DataBase(file_path)

# Data Preprocessing

In [3]:
baskets_data, coupons_data = dbase.load_basket_coupon_data()

In [4]:
# orignal_price
original_price = dbase.original_price(baskets_data)

# Product Category Table
catClass = Category(baskets_data)
prods_cat_table = catClass.generate_product_category_table()

baskets_train, baskets_test, coupons_train, coupons_test = dbase.split_data(baskets_data, coupons_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  baskets_train[column] = baskets_train[column].cat.remove_unused_categories()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  baskets_test[column] = baskets_test[column].cat.remove_unused_categories()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  coupons_train[column] = coupons_train[column].cat.rem

In [5]:
baskets_train, baskets_test, coupons_train, coupons_test = dbase.generate_split_data_with_category(prods_cat_table, baskets_train, baskets_test, coupons_train, coupons_test)

In [6]:
print(prods_cat_table.shape)

(250, 2)


In [7]:
baskets_test.head(2)

Unnamed: 0,week,shopper,product,price,target,category
0,89,0,67,637,1,19
1,89,0,71,629,1,24


In [8]:
coupons_train.head(2)

Unnamed: 0,week,shopper,product,discount,category
0,0,0,35,35,10
1,0,0,193,40,13


In [9]:
coupons_test.head(2)

Unnamed: 0,week,shopper,product,discount,category
0,89,0,131,30,6
1,89,0,16,25,2


# Feature Engineering

In [10]:
# Define feat
print("feature engineering begin")
feat = Feature(baskets_train, coupons_train)

feature engineering begin


In [11]:
total_count_of_product = feat.total_count_of_product()
reordered_category = (baskets_train.groupby(['shopper']).category.value_counts()>1).astype(int)
reordered_product = feat.reordered_product()
category_count = feat.category_count()
reordered_category = feat.reordered_category()
coupon_in_same_category = feat.coupon_in_same_category()
average_price_per_shopper = feat.average_price_per_shopper()
average_basket_size = feat.average_basket_size()
unique_products_per_shopper = feat.unique_products_per_shopper()
unique_categories_per_shopper = feat.unique_categories_per_shopper()
ratio_of_reordered_products_per_shopper = feat.ratio_of_reordered_products_per_shopper()
ratio_of_reordered_categories_per_shopper = feat.ratio_of_reordered_categories_per_shopper()

In [12]:
reordered_product = reordered_product.to_frame('reordered_product').reset_index()
reordered_category = reordered_category.to_frame('reordered_category').reset_index()
unique_products_per_shopper = unique_products_per_shopper.to_frame('unique_products_per_shopper').reset_index()
unique_categories_per_shopper = unique_categories_per_shopper.to_frame('unique_categories_per_shopper').reset_index()

In [13]:
print("feature engineering finished")

feature engineering finished


# Training Set

In [14]:
# Train Table
train_t = Trainset(baskets_train, coupons_train, original_price)

In [15]:
full_df_train = train_t.generate_full_df_train()

In [16]:
## featureless_training_set
num_weeks = 88
num_shoppers = 100

df1 = pd.DataFrame({'key': np.ones(num_weeks), 'week': list(range(num_weeks))})
df2 = pd.DataFrame({'key': np.ones(num_shoppers), 'shopper': list(range(num_shoppers))})
df3 = pd.DataFrame({'key': np.ones(250), 'product': list(range(250))})

featureless_training_set = (pd
    .merge(df1, df2, on='key')
    .merge(df3, on='key')
    .merge(prods_cat_table, on='product')
    .merge(full_df_train, on=['week', 'shopper', 'product', 'category'], how='left')[full_df_train.columns]
    )

print("----- featureless training set -----")


----- featureless training set -----


In [17]:
featureless_training_set.head(2)

Unnamed: 0,week,shopper,product,price,target,category,discount,coupon
0,0,0,0,,,11,,
1,0,1,0,,,11,,


In [18]:
## Generate training set
training_set = train_t.populate_features(
    featureless_training_set,
    original_price,
    total_count_of_product,
    reordered_product,
    category_count,
    reordered_category,
    coupon_in_same_category,
    average_price_per_shopper,
    average_basket_size,
    unique_products_per_shopper,
    unique_categories_per_shopper
#     ratio_of_reordered_products_per_shopper,
#     ratio_of_reordered_categories_per_shopper,
)

In [19]:
print("===== training_set =====")
training_set.head(2)

===== training_set =====


Unnamed: 0,week,shopper,product,price,target,category,discount,coupon,original_price,total_count_of_product,reordered_product,category_count,reordered_category,coupon_in_same_category,average_price_per_shopper,average_basket_size,unique_products_per_shopper,unique_categories_per_shopper,weeks_since_prior_product_order,weeks_since_prior_category_order
0,0,0,0,688.0,0.0,11,0.0,No,688,0,0.0,0,0.0,Yes,587.203947,8.539326,54,24,1,1
1,0,1,0,688.0,0.0,11,0.0,No,688,6,1.0,19,1.0,No,584.06535,7.393258,71,24,1,1


In [20]:
# X_train, y_train
X_train, y_train = train_t.split_trainingset_to_X_train_and_y_train(training_set)

In [21]:
X_train.head(2)

Unnamed: 0,shopper,product,price,category,discount,coupon,original_price,total_count_of_product,reordered_product,category_count,reordered_category,coupon_in_same_category,average_price_per_shopper,average_basket_size,unique_products_per_shopper,unique_categories_per_shopper,weeks_since_prior_product_order,weeks_since_prior_category_order
0,0,0,688.0,11,0.0,No,688,0,0.0,0,0.0,Yes,587.203947,8.539326,54,24,1,1
1,1,0,688.0,11,0.0,No,688,6,1.0,19,1.0,No,584.06535,7.393258,71,24,1,1


In [22]:
y_train.head(2)

0    0.0
1    0.0
Name: target, dtype: float64

# Testing Set

In [23]:
# testing table
test_t = Testset(baskets_test, coupons_test)

In [24]:
# full_df_test
full_df_test = test_t.generate_full_df_test()

In [25]:
# Generate testing set
testing_set = test_t.generate_training_set(
    prods_cat_table,
    original_price,
    total_count_of_product,
    reordered_product,
    category_count,
    reordered_category,
    coupon_in_same_category,
    average_price_per_shopper,
    average_basket_size,
    unique_products_per_shopper,
    unique_categories_per_shopper,
    training_set
#     ratio_of_reordered_products_per_shopper,
#     ratio_of_reordered_categories_per_shopper,
)

In [26]:
X_test, y_test = test_t.split_testingset_to_X_test_and_y_test(testing_set)

In [27]:
print(X_test.shape)
X_test.head(2)

(25000, 18)


Unnamed: 0,shopper,product,price,category,discount,coupon,original_price,total_count_of_product,reordered_product,category_count,reordered_category,coupon_in_same_category,average_price_per_shopper,average_basket_size,unique_products_per_shopper,unique_categories_per_shopper,weeks_since_prior_product_order,weeks_since_prior_category_order
0,0,0,688.0,11,0.0,No,688,0,0.0,0,0.0,No,587.203947,8.539326,54,24,89,441
1,0,1,560.0,11,0.0,No,560,0,0.0,0,0.0,No,587.203947,8.539326,54,24,89,441


In [28]:
y_test.head(2)

0    0.0
1    0.0
Name: target, dtype: float64

In [29]:
# Duplicate all for X_test entries (each product for each shoppers for each week)
# => updated values: 
# 1) discount 30, 
# 2) coupon: Yes
# 3) price: original_price * (1 - (discount / 100))
# 4) coupon in same category (?)


# 25000 entries

In [30]:
X_test_w_coupons = X_test.copy()

In [31]:
X_test_w_coupons.head(2)

Unnamed: 0,shopper,product,price,category,discount,coupon,original_price,total_count_of_product,reordered_product,category_count,reordered_category,coupon_in_same_category,average_price_per_shopper,average_basket_size,unique_products_per_shopper,unique_categories_per_shopper,weeks_since_prior_product_order,weeks_since_prior_category_order
0,0,0,688.0,11,0.0,No,688,0,0.0,0,0.0,No,587.203947,8.539326,54,24,89,441
1,0,1,560.0,11,0.0,No,560,0,0.0,0,0.0,No,587.203947,8.539326,54,24,89,441


In [32]:
# 1) discount 30%
X_test_w_coupons.loc[:,"discount"] = 30
X_test_w_coupons.head(2)

Unnamed: 0,shopper,product,price,category,discount,coupon,original_price,total_count_of_product,reordered_product,category_count,reordered_category,coupon_in_same_category,average_price_per_shopper,average_basket_size,unique_products_per_shopper,unique_categories_per_shopper,weeks_since_prior_product_order,weeks_since_prior_category_order
0,0,0,688.0,11,30,No,688,0,0.0,0,0.0,No,587.203947,8.539326,54,24,89,441
1,0,1,560.0,11,30,No,560,0,0.0,0,0.0,No,587.203947,8.539326,54,24,89,441


In [33]:
# 2) coupons = Yes
X_test_w_coupons.loc[:, "coupon"] = "Yes"
X_test_w_coupons.head(2)

Unnamed: 0,shopper,product,price,category,discount,coupon,original_price,total_count_of_product,reordered_product,category_count,reordered_category,coupon_in_same_category,average_price_per_shopper,average_basket_size,unique_products_per_shopper,unique_categories_per_shopper,weeks_since_prior_product_order,weeks_since_prior_category_order
0,0,0,688.0,11,30,Yes,688,0,0.0,0,0.0,No,587.203947,8.539326,54,24,89,441
1,0,1,560.0,11,30,Yes,560,0,0.0,0,0.0,No,587.203947,8.539326,54,24,89,441


In [34]:
# 3) price = original_price * (1 - (discount / 100))
X_test_w_coupons.loc[:, "price"] = X_test_w_coupons["original_price"] * (1 - (X_test_w_coupons["discount"] / 100))
X_test_w_coupons.head(2)

Unnamed: 0,shopper,product,price,category,discount,coupon,original_price,total_count_of_product,reordered_product,category_count,reordered_category,coupon_in_same_category,average_price_per_shopper,average_basket_size,unique_products_per_shopper,unique_categories_per_shopper,weeks_since_prior_product_order,weeks_since_prior_category_order
0,0,0,481.6,11,30,Yes,688,0,0.0,0,0.0,No,587.203947,8.539326,54,24,89,441
1,0,1,392.0,11,30,Yes,560,0,0.0,0,0.0,No,587.203947,8.539326,54,24,89,441


In [35]:
# Combine X_test AND X_test_w_coupons
new_X_test = pd.concat([X_test, X_test_w_coupons])

In [36]:
print(new_X_test.shape)
new_X_test.sample(10)

(50000, 18)


Unnamed: 0,shopper,product,price,category,discount,coupon,original_price,total_count_of_product,reordered_product,category_count,reordered_category,coupon_in_same_category,average_price_per_shopper,average_basket_size,unique_products_per_shopper,unique_categories_per_shopper,weeks_since_prior_product_order,weeks_since_prior_category_order
19290,29,180,551.0,21,0.0,No,551,0,0.0,11,1.0,No,562.891419,6.41573,61,23,89,445
9521,52,81,513.0,16,0.0,No,513,1,0.0,16,1.0,No,573.565619,6.078652,74,24,10,99
3201,20,31,548.8,10,30.0,Yes,784,4,1.0,4,1.0,No,581.853704,6.067416,51,23,34,738
15224,22,144,562.0,9,0.0,No,562,0,0.0,25,1.0,No,585.832037,7.224719,64,22,89,142
21921,92,201,514.0,3,0.0,No,514,0,0.0,40,1.0,No,598.45848,9.606742,71,25,89,112
15206,20,146,596.0,9,0.0,No,596,19,1.0,26,1.0,No,581.853704,6.067416,51,23,9,113
16254,25,154,569.0,4,0.0,No,569,1,0.0,38,1.0,No,582.983508,7.494382,61,24,51,92
10489,48,99,519.0,22,0.0,No,519,11,1.0,28,1.0,No,554.706052,7.797753,82,23,7,7
519,3,7,540.4,14,30.0,Yes,772,6,1.0,37,1.0,No,561.846361,8.337079,91,23,3,3
8353,35,73,837.0,24,0.0,No,837,0,0.0,14,1.0,No,601.825419,8.044944,68,25,89,190


# Trainer

In [37]:
trainer = Trainer(X_train, y_train, new_X_test, y_test)

In [38]:
model = trainer.fit_model()



In [39]:
predictions = trainer.predict()

NameError: name 'lgb' is not defined

In [None]:
predict_prob = trainer.predict_proba()