## Setup

In [19]:
import os

from Db import DataBase
from Category import Category
from Feature import Feature

import pandas as pd
import numpy as np

from Trainset import Trainset
from Testset import Testset
from Trainer import Trainer

## Load Data

In [20]:
# Load Data
# The data directory is the "data" folder one level above the current working directory.
# The trailing slash is kept because the path is handed to DataBase as a plain string.
file_path = f"{os.getcwd()}/../data/"
dbase = DataBase(file_path)

## Data Preprocessing

In [21]:
# Raw basket and coupon tables, loaded through the project DataBase helper.
baskets_data, coupons_data = dbase.load_basket_coupon_data()

# original_price (derived from the basket data by the DataBase helper)
original_price = dbase.original_price(baskets_data)

# Product Category Table (later merged on 'product' to attach a 'category' column)
catClass = Category(baskets_data)
prods_cat_table = catClass.generate_product_category_table()

# Train/test split of both tables.
# NOTE(review): the recorded run of this cell emitted a pandas SettingWithCopyWarning;
# it presumably originates inside one of the project helpers called here -- confirm.
split_frames = dbase.split_data(baskets_data, coupons_data)
baskets_train, baskets_test, coupons_train, coupons_test = split_frames

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [22]:
# Attach category information to each of the four split frames.
# The inputs are rebound to the returned frames, so re-running this cell alone
# passes the already-processed frames back into the helper.
frames_with_category = dbase.generate_split_data_with_category(
    prods_cat_table,
    baskets_train,
    baskets_test,
    coupons_train,
    coupons_test,
)
baskets_train, baskets_test, coupons_train, coupons_test = frames_with_category

In [23]:
# Show the (rows, columns) dimensions of the product/category table.
table_shape = prods_cat_table.shape
print(table_shape)
# Optional inspection, left disabled: prods_cat_table.tail(60)

(250, 2)


In [24]:
# Preview the first two rows of the test baskets and of both coupon splits.
for frame in (baskets_test, coupons_train, coupons_test):
    print(frame.head(2))

   week shopper  product  price  target category
0    89       0       67    637       1       21
1    89       0       71    629       1        7
   week shopper product  discount category
0     0       0      35        35        9
1     0       0     193        40       22
   week shopper  product  discount category
0    89       0      131        30        0
1    89       0       16        25       18


## Feature Engineering

In [25]:
# Define feat
# Log the start of the feature-engineering stage, then build the project Feature
# helper from the training baskets and training coupons only (no test data is passed).
print("feature engineering begin")
feat = Feature(baskets_train, coupons_train)

feature engineering begin


In [26]:
# Compute each feature table from the training split via the project Feature helper.
# Every result is later passed positionally to the train/test set builders, so the
# variable names here must stay as they are.
total_count_of_product = feat.total_count_of_product()
reordered_product = feat.reordered_product()
category_count = feat.category_count()
# reordered_category comes from the Feature helper only. An earlier inline
# groupby(['shopper']).category.value_counts() assignment to the same name was
# overwritten by this call before it was ever read, so it has been removed.
reordered_category = feat.reordered_category()
coupon_in_same_category = feat.coupon_in_same_category()
average_price_per_shopper = feat.average_price_per_shopper()
average_basket_size = feat.average_basket_size()
unique_products_per_shopper = feat.unique_products_per_shopper()
unique_categories_per_shopper = feat.unique_categories_per_shopper()

In [27]:
# Disabled features: these two ratio features are not computed, and the matching
# arguments are likewise commented out in the calls that build the training and
# testing sets.
# ratio_of_reordered_products_per_shopper = feat.ratio_of_reordered_products_per_shopper()
# ratio_of_reordered_categories_per_shopper = feat.ratio_of_reordered_categories_per_shopper()


In [28]:
# Progress marker: all feature tables above have been computed.
print("feature engineering finished")

feature engineering finished


## Training Set

In [29]:
# Train Table
# Build the project Trainset helper from the training baskets, training coupons
# and the original_price data computed during preprocessing.
train_t = Trainset(baskets_train, coupons_train, original_price)

In [30]:
# Build full_df_train with the Trainset helper; it is later left-joined onto the
# week x shopper x product grid and its columns define the grid's column order.
full_df_train = train_t.generate_full_df_train()

In [31]:
## featureless_training_set
# Sizes of the three grid dimensions (weeks, shoppers, products).
num_weeks = 88
num_shoppers = 100
num_products = 250

# One frame per dimension. The constant 'key' column makes the merges on 'key'
# pair every row of one frame with every row of the other (a cross join).
df1 = pd.DataFrame({'key': np.ones(num_weeks), 'week': list(range(num_weeks))})
df2 = pd.DataFrame({'key': np.ones(num_shoppers), 'shopper': list(range(num_shoppers))})
df3 = pd.DataFrame({'key': np.ones(num_products), 'product': list(range(num_products))})

# Build the full grid, attach each product's category, then left-join the observed
# rows so that combinations absent from full_df_train remain as NaN. The final
# column selection drops 'key' and restores full_df_train's column order.
featureless_training_set = (
    df1
    .merge(df2, on='key')
    .merge(df3, on='key')
    .merge(prods_cat_table, on='product')
    .merge(full_df_train, on=['week', 'shopper', 'product', 'category'], how='left')
    [full_df_train.columns]
)

print("----- featureless training set -----")
print(featureless_training_set.head(2))

----- featureless training set -----
   week shopper product  price  target category  discount coupon
0     0       0       0    NaN     NaN       16       NaN    NaN
1     0       1       0    NaN     NaN       16       NaN    NaN


In [32]:
## Generate training set
# Feature tables in the positional order expected by populate_features.
feature_inputs = [
    original_price,
    total_count_of_product,
    reordered_product,
    category_count,
    reordered_category,
    coupon_in_same_category,
    average_price_per_shopper,
    average_basket_size,
    unique_products_per_shopper,
    unique_categories_per_shopper,
    # ratio_of_reordered_products_per_shopper,    (disabled)
    # ratio_of_reordered_categories_per_shopper,  (disabled)
]

# NOTE(review): the recorded output of this cell contains the columns
# 'unique_products_per_shopper_x' and 'unique_products_per_shopper_y' and no
# 'unique_categories_per_shopper' column. That looks like a column-name collision
# in a merge -- check the column name returned by feat.unique_categories_per_shopper().
training_set = train_t.populate_features(featureless_training_set, *feature_inputs)

print("===== training_set =====")
print(training_set.head(2))

===== training_set =====
   week shopper product  price  target category  discount coupon  \
0     0       0       0  688.0     0.0       16       0.0     No   
1     0       1       0  688.0     0.0       16       0.0     No   

   original_price  total_count_of_product  reordered_product  category_count  \
0             688                       0                0.0               8   
1             688                       6                1.0              28   

   reordered_category coupon_in_same_category  average_price_per_shopper  \
0                 1.0                     Yes                 587.203947   
1                 1.0                     Yes                 584.065350   

   average_basket_size  unique_products_per_shopper_x  \
0             8.539326                             54   
1             7.393258                             71   

   unique_products_per_shopper_y  weeks_since_prior_product_order  \
0                             24                           

In [33]:
# List the columns of the populated training set to check which features were attached.
training_set.columns

Index(['week', 'shopper', 'product', 'price', 'target', 'category', 'discount',
       'coupon', 'original_price', 'total_count_of_product',
       'reordered_product', 'category_count', 'reordered_category',
       'coupon_in_same_category', 'average_price_per_shopper',
       'average_basket_size', 'unique_products_per_shopper_x',
       'unique_products_per_shopper_y', 'weeks_since_prior_product_order',
       'weeks_since_prior_category_order'],
      dtype='object')

In [34]:
# Rich display of the first two rows of the populated training set.
training_set.head(2)

Unnamed: 0,week,shopper,product,price,target,category,discount,coupon,original_price,total_count_of_product,reordered_product,category_count,reordered_category,coupon_in_same_category,average_price_per_shopper,average_basket_size,unique_products_per_shopper_x,unique_products_per_shopper_y,weeks_since_prior_product_order,weeks_since_prior_category_order
0,0,0,0,688.0,0.0,16,0.0,No,688,0,0.0,8,1.0,Yes,587.203947,8.539326,54,24,1,1
1,0,1,0,688.0,0.0,16,0.0,No,688,6,1.0,28,1.0,Yes,584.06535,7.393258,71,23,1,1


In [35]:
# X_train, y_train
# Split the populated training set into two objects using the Trainset helper
# (presumably the feature matrix and the 'target' labels -- confirm in Trainset).
X_train, y_train = train_t.split_trainingset_to_X_train_and_y_train(training_set)

## Testing Set

In [36]:
# testing table
# Build the project Testset helper from the test baskets and test coupons.
# Unlike Trainset, original_price is not passed to the constructor here.
test_t = Testset(baskets_test, coupons_test)

In [37]:
# full_df_test
# Built by the Testset helper; it is later left-joined onto the test grid and its
# columns define that grid's column order.
full_df_test = test_t.generate_full_df_test()

In [41]:
## featureless_testing_set
num_shoppers = 100
num_products = 250

# The test grid uses the weeks actually present in the test baskets.
test_weeks = baskets_test["week"].unique()

# One frame per dimension. The constant 'key' column makes the merges on 'key'
# pair every row of one frame with every row of the other (a cross join).
df1 = pd.DataFrame({'key': np.ones(len(test_weeks)), 'week': test_weeks})
df2 = pd.DataFrame({'key': np.ones(num_shoppers), 'shopper': list(range(num_shoppers))})
df3 = pd.DataFrame({'key': np.ones(num_products), 'product': list(range(num_products))})

# Build the full grid, attach each product's category, then left-join the observed
# test rows so that combinations absent from full_df_test remain as NaN.
featureless_testing_set = (
    df1
    .merge(df2, on='key')
    .merge(df3, on='key')
    .merge(prods_cat_table, on='product')
    .merge(full_df_test, on=['week', 'shopper', 'product', 'category'], how='left')
    [full_df_test.columns]
)

print("----- featureless testing set -----")
print(featureless_testing_set.head(2))

----- featureless testing set -----
   week  shopper product  price  target category  discount coupon
0    89        0       0    NaN     NaN       16       NaN    NaN
1    89        1       0    NaN     NaN       16       NaN    NaN


In [43]:
# Generate testing set
# Builds the testing set with the Testset helper from the product/category table,
# the feature tables computed on the training split, and the populated training_set.
# NOTE(review): the recorded run of this cell raised KeyError: 'shopper', so
# testing_set was never created in that session. The training path passes
# featureless_training_set as the first argument to populate_features, whereas this
# call passes prods_cat_table and never uses featureless_testing_set -- confirm the
# expected first argument against Testset.generate_training_set.
# NOTE(review): the method is named generate_training_set although it is called on
# the Testset and produces testing_set -- confirm this is the intended method.
testing_set = test_t.generate_training_set(
    prods_cat_table,
    original_price,
    total_count_of_product,
    reordered_product,
    category_count,
    reordered_category,
    coupon_in_same_category,
    average_price_per_shopper,
    average_basket_size,
    unique_products_per_shopper,
    unique_categories_per_shopper,
    training_set
#     ratio_of_reordered_products_per_shopper,
#     ratio_of_reordered_categories_per_shopper,
)

KeyError: 'shopper'

In [40]:
# Split the testing set into X_test and y_test with the Testset helper, then preview
# two rows of each.
# NOTE(review): this cell's recorded execution count (40) is lower than that of the
# cell that creates testing_set (43), and its saved output is a NameError for
# testing_set. It only works once the previous cell has run successfully; re-run
# the notebook top to bottom to refresh the output.
X_test, y_test = test_t.split_testingset_to_X_test_and_y_test(testing_set)
print(X_test.head(2))
print(y_test.head(2))

NameError: name 'testing_set' is not defined

## Trainer

In [None]:
# Build the project Trainer from the train and test splits. This cell has no recorded
# execution; it needs X_test and y_test, which the saved run failed to produce.
trainer = Trainer(X_train, y_train, X_test, y_test)

In [None]:
# Fit the model through the Trainer helper and keep the returned object as `model`.
model = trainer.fit_model()