## Setup

In [19]:
import os

from Db import DataBase
from Category import Category
from Feature import Feature

import pandas as pd
import numpy as np

from Trainset import Trainset
from Testset import Testset
from Trainer import Trainer

## Load Data

In [20]:
# Point the DataBase helper at the data directory that sits one level above
# the current working directory (trailing slash kept, as in the original path).
file_path = f"{os.getcwd()}/../data/"
dbase = DataBase(file_path)

## Data Preprocessing

In [22]:
# Load the basket and coupon tables through the DataBase helper.
baskets_data, coupons_data = dbase.load_basket_coupon_data()

# original_price
# NOTE(review): the recorded output of this cell is
# "TypeError: original_price() takes 1 positional argument but 2 were given",
# so DataBase.original_price presumably does not accept `baskets_data` in its
# current form -- confirm the signature in Db.py and adjust this call.
original_price = dbase.original_price(baskets_data)

# Product Category Table
catClass = Category(baskets_data)
prods_cat_table = catClass.generate_product_category_table()

# Split baskets and coupons into their train and test parts.
baskets_train, baskets_test, coupons_train, coupons_test = dbase.split_data(baskets_data, coupons_data)

TypeError: original_price() takes 1 positional argument but 2 were given

In [None]:
# Pass the four split frames, together with the product-category table, through
# DataBase.generate_split_data_with_category and rebind the same four names to
# whatever it returns.
(
    baskets_train,
    baskets_test,
    coupons_train,
    coupons_test,
) = dbase.generate_split_data_with_category(
    prods_cat_table,
    baskets_train,
    baskets_test,
    coupons_train,
    coupons_test,
)

In [None]:
# Sanity check: report the shape of the product/category table.
print(prods_cat_table.shape)
# Optional manual inspection of the last rows (left disabled):
# prods_cat_table.tail(60)

In [None]:
# Preview the first two rows of the test baskets and of both coupon splits,
# one frame after another.
print(
    baskets_test.head(2),
    coupons_train.head(2),
    coupons_test.head(2),
    sep="\n",
)

## Feature Engineering

In [7]:
# Build the Feature helper from the training baskets and training coupons;
# the feature columns in the next cell are all produced by its methods.
print("feature engineering begin")
feat = Feature(baskets_train, coupons_train)

feature engineering begin


In [8]:
# Compute every feature through the Feature helper. Each result is kept under
# its own name because the training-set and testing-set builders in later
# cells receive them as separate arguments.
total_count_of_product = feat.total_count_of_product()
reordered_product = feat.reordered_product()
category_count = feat.category_count()
# An inline groupby/value_counts computation used to be assigned to
# `reordered_category` before this line; it was overwritten here without ever
# being read, so that dead assignment has been dropped.
reordered_category = feat.reordered_category()
coupon_in_same_category = feat.coupon_in_same_category()
average_price_per_shopper = feat.average_price_per_shopper()
average_basket_size = feat.average_basket_size()
unique_products_per_shopper = feat.unique_products_per_shopper()
unique_categories_per_shopper = feat.unique_categories_per_shopper()

In [9]:
# ratio_of_reordered_products_per_shopper = feat.ratio_of_reordered_products_per_shopper()
# ratio_of_reordered_categories_per_shopper = feat.ratio_of_reordered_categories_per_shopper()


In [10]:
# Progress marker: closes the section opened by "feature engineering begin".
print("feature engineering finished")

feature engineering finished


## Training Set

In [11]:
# Train Table
# Trainset is constructed from the training baskets, the training coupons and
# the original-price data computed during preprocessing.
train_t = Trainset(baskets_train, coupons_train, original_price)

In [13]:
# Frame produced by Trainset.generate_full_df_train; its columns define the
# column set (and order) of the featureless training set built in the next cell.
full_df_train = train_t.generate_full_df_train()

In [17]:
## featureless_training_set
# Dimensions of the full (week, shopper, product) grid. The product count was
# previously an unnamed literal repeated inline; it is named here so that all
# three dimensions can be changed in one place.
num_weeks = 88
num_shoppers = 100
num_products = 250

# One single-column frame per dimension, each carrying the same constant 'key'
# so that merging on 'key' pairs every row of one frame with every row of the
# other (a cross join).
df1 = pd.DataFrame({'key': np.ones(num_weeks), 'week': list(range(num_weeks))})
df2 = pd.DataFrame({'key': np.ones(num_shoppers), 'shopper': list(range(num_shoppers))})
df3 = pd.DataFrame({'key': np.ones(num_products), 'product': list(range(num_products))})

# 1-2. Cross join weeks x shoppers x products via the constant key.
# 3.   Attach the product-category table's columns by product.
# 4.   Left-join the training frame so every grid row is kept; combinations
#      missing from full_df_train end up with NaN in its columns.
# Finally, keep only full_df_train's columns, in full_df_train's order.
featureless_training_set = (pd
    .merge(df1, df2, on='key')
    .merge(df3, on='key')
    .merge(prods_cat_table, on='product')
    .merge(full_df_train, on=['week', 'shopper', 'product', 'category'], how='left')[full_df_train.columns]
    )

print("----- featureless training set -----")
print(featureless_training_set.head(2))

----- featureless training set -----
   week shopper product  price  target category  discount coupon
0     0       0       0    NaN     NaN        2       NaN    NaN
1     0       1       0    NaN     NaN        2       NaN    NaN


In [18]:
## Generate training set
# Hand the featureless grid, the original-price data and every computed feature
# to Trainset.populate_features.
# NOTE(review): the recorded output of this cell is
# "ValueError: 'product' is both an index level and a column label, which is
# ambiguous." -- presumably one of the frames passed below carries 'product'
# both in its index and in its columns; verify inside populate_features and
# the Feature methods that produce these inputs.
training_set = train_t.populate_features(
    featureless_training_set,
    original_price,
    total_count_of_product,
    reordered_product,
    category_count,
    reordered_category,
    coupon_in_same_category,
    average_price_per_shopper,
    average_basket_size,
    unique_products_per_shopper,
    unique_categories_per_shopper
# Disabled arguments; the features they name are commented out as well:
#     ratio_of_reordered_products_per_shopper,
#     ratio_of_reordered_categories_per_shopper,
)
print("===== training_set =====")
print(training_set.head(2))

ValueError: 'product' is both an index level and a column label, which is ambiguous.

In [None]:
# Separate the populated training set into X_train and y_train, then preview
# the first two rows of each.
X_train, y_train = train_t.split_trainingset_to_X_train_and_y_train(training_set)
print(X_train.head(2), y_train.head(2), sep="\n")

## Testing Set

In [None]:
# testing table
# Testset is constructed from the test baskets and the test coupons.
test_t = Testset(baskets_test, coupons_test)

In [None]:
# full_df_test
# Frame produced by Testset.generate_full_df_test.
full_df_test = test_t.generate_full_df_test()

In [None]:
# Generate testing set
# The same feature objects computed from the training data are passed here,
# together with the product-category table and the populated training_set.
# NOTE(review): the Testset method is named generate_training_set although its
# result is stored as testing_set -- confirm this is the intended method.
# NOTE(review): this cell depends on `training_set`, which only exists once the
# populate_features cell has run successfully.
testing_set = test_t.generate_training_set(
    prods_cat_table,
    original_price,
    total_count_of_product,
    reordered_product,
    category_count,
    reordered_category,
    coupon_in_same_category,
    average_price_per_shopper,
    average_basket_size,
    unique_products_per_shopper,
    unique_categories_per_shopper,
    training_set
# Disabled arguments; the features they name are commented out as well:
#     ratio_of_reordered_products_per_shopper,
#     ratio_of_reordered_categories_per_shopper,
)

In [None]:
# Separate the testing set into X_test and y_test, then preview the first two
# rows of each.
X_test, y_test = test_t.split_testingset_to_X_test_and_y_test(testing_set)
print(X_test.head(2), y_test.head(2), sep="\n")

## Trainer

In [None]:
# Bundle the train and test splits into a Trainer instance.
trainer = Trainer(X_train, y_train, X_test, y_test)

In [None]:
# Run Trainer.fit_model and keep whatever it returns as `model`.
model = trainer.fit_model()