# Setup

In [1]:
import os

from Db import DataBase
from Category import Category
from Feature import Feature

import pandas as pd
import numpy as np

from Trainset import Trainset
from Testset import Testset
from Trainer import Trainer

# Load Data

In [2]:
# Point the DataBase helper at ../data/ relative to the current working directory.
# The trailing slash is kept exactly as before (DataBase's path handling is not visible here).
file_path = f"{os.getcwd()}/../data/"
dbase = DataBase(file_path)

# Data Preprocessing

In [3]:
# Unpack the two objects returned by the loader (presumably the raw baskets and
# coupons tables, per the method name — confirm in Db.py).
baskets_data, coupons_data = dbase.load_basket_coupon_data()

In [4]:
# original_price
# Derived from the full (unsplit) basket data; exact semantics live in Db.py — TODO confirm.
original_price = dbase.original_price(baskets_data)

# Product Category Table
# Built from the full basket data; its shape is printed in a later cell.
catClass = Category(baskets_data)
prods_cat_table = catClass.generate_product_category_table()

# Split baskets and coupons into train and test parts.
# NOTE(review): the recorded output of this cell shows pandas SettingWithCopyWarning
# for `<frame>[column] = <frame>[column].cat.remove_unused_categories()` on the split
# frames; presumably raised inside split_data — confirm and take explicit copies there.
baskets_train, baskets_test, coupons_train, coupons_test = dbase.split_data(baskets_data, coupons_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  baskets_train[column] = baskets_train[column].cat.remove_unused_categories()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  baskets_test[column] = baskets_test[column].cat.remove_unused_categories()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  coupons_train[column] = coupons_train[column].cat.rem

In [5]:
# Pass the four split frames together with the product-category table and rebind
# the same four names to the returned frames (the previews below show a `category` column).
# NOTE(review): because the inputs are overwritten, re-running this cell feeds it its
# own output — confirm the helper tolerates frames that already carry `category`.
baskets_train, baskets_test, coupons_train, coupons_test = dbase.generate_split_data_with_category(prods_cat_table, baskets_train, baskets_test, coupons_train, coupons_test)

In [6]:
# Sanity check on the product-category table size (recorded output: (250, 2)).
print(prods_cat_table.shape)
# For a closer look at the last rows, run: prods_cat_table.tail(60)

(250, 2)


In [7]:
# Preview the first two rows of the test baskets and of both coupon splits.
# A single print with a newline separator emits the same text as three separate prints.
print(
    baskets_test.head(2),
    coupons_train.head(2),
    coupons_test.head(2),
    sep="\n",
)

   week shopper  product  price  target category
0    89       0       67    637       1        8
1    89       0       71    629       1       20
   week shopper product  discount category
0     0       0      35        35        1
1     0       0     193        40        4
   week shopper  product  discount category
0    89       0      131        30        0
1    89       0       16        25       16


# Feature Engineering

In [8]:
# Define feat
# The Feature helper is constructed from the training split only; the test frames
# are not passed in.
print("feature engineering begin")
feat = Feature(baskets_train, coupons_train)

feature engineering begin


In [9]:
# Compute every engineered feature through the Feature helper, in a fixed order.
# reordered_category is taken from Feature.reordered_category() only: a value assigned
# to that name before this call would be overwritten without ever being read.
total_count_of_product = feat.total_count_of_product()
reordered_product = feat.reordered_product()
category_count = feat.category_count()
reordered_category = feat.reordered_category()
coupon_in_same_category = feat.coupon_in_same_category()

# Shopper-level features (per their names).
average_price_per_shopper = feat.average_price_per_shopper()
average_basket_size = feat.average_basket_size()
unique_products_per_shopper = feat.unique_products_per_shopper()
unique_categories_per_shopper = feat.unique_categories_per_shopper()

# NOTE(review): the two ratio features are computed here but appear only as
# commented-out arguments in the train/test set builders — confirm whether they
# are still needed.
ratio_of_reordered_products_per_shopper = feat.ratio_of_reordered_products_per_shopper()
ratio_of_reordered_categories_per_shopper = feat.ratio_of_reordered_categories_per_shopper()

In [10]:
# Turn each feature Series into a DataFrame: the index levels become ordinary
# columns and the values go into a column named after the feature.
# NOTE(review): each name is rebound from a Series to a DataFrame, so this cell
# presumably fails if re-run without first re-running the cell that builds the Series.
reordered_product = reordered_product.reset_index(name='reordered_product')
reordered_category = reordered_category.reset_index(name='reordered_category')
unique_products_per_shopper = unique_products_per_shopper.reset_index(
    name='unique_products_per_shopper'
)
unique_categories_per_shopper = unique_categories_per_shopper.reset_index(
    name='unique_categories_per_shopper'
)

In [11]:
# Progress marker: closes the feature-engineering section opened by the "begin" message.
print("feature engineering finished")

feature engineering finished


# Training Set

In [12]:
# Train Table
# Trainset is built from the training baskets/coupons plus original_price.
train_t = Trainset(baskets_train, coupons_train, original_price)

In [13]:
# Frame produced by Trainset; its columns define the key and column layout of the
# featureless training grid built in the next cell.
full_df_train = train_t.generate_full_df_train()

In [14]:
## featureless_training_set
# Dimensions of the week x shopper x product grid.
# NOTE(review): range(num_weeks) yields weeks 0..87 while the test previews start
# at week 89 — confirm that week 88 is intentionally left out of the grid.
num_weeks = 88
num_shoppers = 100
num_products = 250  # same as the row count printed for prods_cat_table

# A constant 'key' column on every frame makes the merges on 'key' behave as a cross join.
df1 = pd.DataFrame({'key': np.ones(num_weeks), 'week': list(range(num_weeks))})
df2 = pd.DataFrame({'key': np.ones(num_shoppers), 'shopper': list(range(num_shoppers))})
df3 = pd.DataFrame({'key': np.ones(num_products), 'product': list(range(num_products))})

# Build the full grid, attach each product's category, then left-join the observed
# rows so unobserved combinations stay NaN. Selecting full_df_train.columns drops
# the helper 'key' column and fixes the column order.
featureless_training_set = (pd
    .merge(df1, df2, on='key')
    .merge(df3, on='key')
    .merge(prods_cat_table, on='product')
    .merge(full_df_train, on=['week', 'shopper', 'product', 'category'], how='left')[full_df_train.columns]
    )

print("----- featureless training set -----")
print(featureless_training_set.head(2))

----- featureless training set -----
   week shopper product  price  target category  discount coupon
0     0       0       0    NaN     NaN       15       NaN    NaN
1     0       1       0    NaN     NaN       15       NaN    NaN


In [15]:
## Generate training set
# Fill the featureless grid with the engineered features. All arguments are
# positional, so their order must match the signature of Trainset.populate_features.
training_set = train_t.populate_features(
    featureless_training_set,
    original_price,
    total_count_of_product,
    reordered_product,
    category_count,
    reordered_category,
    coupon_in_same_category,
    average_price_per_shopper,
    average_basket_size,
    unique_products_per_shopper,
    unique_categories_per_shopper
# Disabled arguments — the ratio features are currently not passed:
#     ratio_of_reordered_products_per_shopper,
#     ratio_of_reordered_categories_per_shopper,
)
# Preview the first two populated rows.
print("===== training_set =====")
print(training_set.head(2))

===== training_set =====
   week shopper product  price  target category  discount coupon  \
0     0       0       0  688.0     0.0       15       0.0     No   
1     0       1       0  688.0     0.0       15       0.0     No   

   original_price  total_count_of_product  reordered_product  category_count  \
0             688                       0                0.0               8   
1             688                       6                1.0              28   

   reordered_category coupon_in_same_category  average_price_per_shopper  \
0                 1.0                     Yes                 587.203947   
1                 1.0                     Yes                 584.065350   

   average_basket_size  unique_products_per_shopper  \
0             8.539326                           54   
1             7.393258                           71   

   unique_categories_per_shopper  weeks_since_prior_product_order  \
0                             24                                1

In [16]:
# Split the populated training set into model inputs (X_train) and the matching
# y_train object, then preview two rows of each in one print call.
X_train, y_train = train_t.split_trainingset_to_X_train_and_y_train(training_set)
print(X_train.head(2), y_train.head(2), sep="\n")

  shopper product  price category  discount coupon  original_price  \
0       0       0  688.0       15       0.0     No             688   
1       1       0  688.0       15       0.0     No             688   

   total_count_of_product  reordered_product  category_count  \
0                       0                0.0               8   
1                       6                1.0              28   

   reordered_category coupon_in_same_category  average_price_per_shopper  \
0                 1.0                     Yes                 587.203947   
1                 1.0                     Yes                 584.065350   

   average_basket_size  unique_products_per_shopper  \
0             8.539326                           54   
1             7.393258                           71   

   unique_categories_per_shopper  weeks_since_prior_product_order  \
0                             24                                1   
1                             23                               

# Testing Set

In [17]:
# testing table
# Testset is built from the test baskets/coupons only (no original_price argument,
# unlike Trainset).
test_t = Testset(baskets_test, coupons_test)

In [18]:
# full_df_test
# Test-side counterpart of full_df_train, produced by Testset.
full_df_test = test_t.generate_full_df_test()

In [19]:
# Generate testing set
# NOTE(review): the Testset method is named generate_training_set even though the
# result is used as the testing set — consider renaming it in Testset.py.
# The features passed here were computed from the training split, and the populated
# training_set is passed as the last positional argument.
testing_set = test_t.generate_training_set(
    prods_cat_table,
    original_price,
    total_count_of_product,
    reordered_product,
    category_count,
    reordered_category,
    coupon_in_same_category,
    average_price_per_shopper,
    average_basket_size,
    unique_products_per_shopper,
    unique_categories_per_shopper,
    training_set
# Disabled arguments — the ratio features are currently not passed:
#     ratio_of_reordered_products_per_shopper,
#     ratio_of_reordered_categories_per_shopper,
)

In [20]:
# Split the testing set into model inputs (X_test) and the matching y_test object,
# then preview two rows of each in one print call.
X_test, y_test = test_t.split_testingset_to_X_test_and_y_test(testing_set)
print(X_test.head(2), y_test.head(2), sep="\n")

  shopper product  price category  discount coupon  original_price  \
0       0       0  688.0       15       0.0     No             688   
1       0       1  560.0       15       0.0     No             560   

   total_count_of_product  reordered_product  category_count  \
0                       0                0.0               8   
1                       0                0.0               8   

   reordered_category coupon_in_same_category  average_price_per_shopper  \
0                 1.0                      No                 587.203947   
1                 1.0                      No                 587.203947   

   average_basket_size  unique_products_per_shopper  \
0             8.539326                           54   
1             8.539326                           54   

   unique_categories_per_shopper  weeks_since_prior_product_order  \
0                             24                               89   
1                             24                               

# Trainer

In [21]:
# Bundle the train and test splits into the Trainer helper.
trainer = Trainer(X_train, y_train, X_test, y_test)

In [22]:
# Fit the model on the data held by the trainer.
# NOTE(review): no random seed is set anywhere in this notebook; if fit_model is
# stochastic, results may differ between runs — confirm in Trainer.py.
model = trainer.fit_model()

