In [15]:
import numpy as np
import os
import pandas as pd
from pathlib import Path
import yaml
import sys
from src.utils import create_folder


In [13]:
# Project root is the current working directory; both YAML config files are
# expected to sit directly inside it.
proj_path = Path.cwd()

# Data catalog: only the 'breakfast' section is kept.
with (proj_path / 'catalog.yml').open("r") as catalog_file:
    catalog = yaml.safe_load(catalog_file)['breakfast']

# Experiment parameters: the whole document is kept.
with (proj_path / 'params.yml').open("r") as params_file:
    params = yaml.safe_load(params_file)

In [12]:
# Read the four sheets of the source workbook. The workbook path and the sheet
# names come from the catalog; every read skips leading rows and keeps only
# the first N columns of the sheet.
main_fname = os.path.join(proj_path, catalog['xlsx_fname'])

transactions = pd.read_excel(
    main_fname,
    sheet_name=catalog['sheet_names']['transactions'],
    skiprows=1,
    usecols=np.arange(12),
)

product_lookup = pd.read_excel(
    main_fname,
    sheet_name=catalog['sheet_names']['products'],
    skiprows=1,
    usecols=np.arange(6),
)

store_lookup = (
    pd.read_excel(
        main_fname,
        sheet_name=catalog['sheet_names']['store'],
        skiprows=1,
        usecols=np.arange(9),
    )
    # NOTE(review): rows labelled 22 and 39 are removed by hard-coded index with
    # no explanation in the notebook - presumably duplicate or invalid store
    # entries; confirm against the workbook.
    .drop(index=[22, 39])
    # Renumber 0..n-1 so the removed rows leave no gaps in the index.
    .reset_index(drop=True)
)

# The glossary sheet skips three rows and gets explicit column names.
glossary = pd.read_excel(
    main_fname,
    sheet_name=catalog['sheet_names']['glossary'],
    skiprows=3,
    usecols=np.arange(3),
    names=['VARIABLE NAME', 'TABLE', 'DESCRIPTION'],
)

In [17]:
# Persist each raw table as a CSV inside the catalog's output directory.
# File names are looked up in the catalog under the given key. to_csv is called
# with its defaults, so the DataFrame index is written as the first column.
output_root = os.path.join(proj_path, catalog['output_dir']['dir'])
create_folder(output_root)

for _frame, _catalog_key in (
    (transactions, 'transactions'),
    (product_lookup, 'products'),
    (store_lookup, 'store'),
    (glossary, 'glossary'),
):
    _frame.to_csv(os.path.join(output_root, catalog['output_dir'][_catalog_key]))

In [18]:
# Enrich every transaction with its product attributes (joined on UPC) and its
# store attributes (STORE_NUM in transactions matches STORE_ID in the store
# table). Left joins keep all transaction rows even when a lookup has no match.
merged_data = (
    transactions
    .merge(product_lookup, on='UPC', how='left')
    .merge(store_lookup, left_on='STORE_NUM', right_on='STORE_ID', how='left')
)

# DataFrame.to_csv returns None when it is given a path, so its result must not
# be assigned back to merged_data (the original code did, leaving the variable
# bound to None after this cell).
merged_data.to_csv(os.path.join(proj_path, catalog['output_dir']['dir'], catalog['output_dir']['merged']))

In [21]:
from datetime import timedelta
from src.utils import make_dates

# Reload the merged table from disk and parse the week-ending column as dates.
# NOTE(review): this path is hard-coded, while the merged CSV is written to a
# catalog-driven location earlier in the notebook - confirm both point at the
# same file.
merged_data = pd.read_csv('data/processed/merged_data.csv')
merged_data['WEEK_END_DATE'] = pd.to_datetime(merged_data['WEEK_END_DATE'])

# Keep an untouched snapshot (parsed dates, no shift) before modifying the column.
original_data = merged_data.copy()

# Move every week-ending date forward by three days.
# NOTE(review): presumably this aligns the data with the weekly windows built
# by make_dates below - confirm.
week_shift = timedelta(days=3)
merged_data['WEEK_END_DATE'] = merged_data['WEEK_END_DATE'] + week_shift

# Train / validation / test windows derived from the experiment settings.
data_ranges = make_dates(params['breakfast']['experiment_dates'])


In [24]:
# Sanity check: show the experiment windows next to the latest and earliest
# (shifted) week-ending dates in the data, one item per line.
week_end_dates = merged_data['WEEK_END_DATE']
print(data_ranges, week_end_dates.max(), week_end_dates.min(), sep='\n')

   train_start  train_end valid_start  valid_end test_start   test_end
0   2009-01-17 2010-12-04  2010-12-11 2011-01-01 2011-01-08 2011-01-29
1   2009-02-14 2011-01-01  2011-01-08 2011-01-29 2011-02-05 2011-02-26
2   2009-03-14 2011-01-29  2011-02-05 2011-02-26 2011-03-05 2011-03-26
3   2009-04-11 2011-02-26  2011-03-05 2011-03-26 2011-04-02 2011-04-23
4   2009-05-09 2011-03-26  2011-04-02 2011-04-23 2011-04-30 2011-05-21
5   2009-06-06 2011-04-23  2011-04-30 2011-05-21 2011-05-28 2011-06-18
6   2009-07-04 2011-05-21  2011-05-28 2011-06-18 2011-06-25 2011-07-16
7   2009-08-01 2011-06-18  2011-06-25 2011-07-16 2011-07-23 2011-08-13
8   2009-08-29 2011-07-16  2011-07-23 2011-08-13 2011-08-20 2011-09-10
9   2009-09-26 2011-08-13  2011-08-20 2011-09-10 2011-09-17 2011-10-08
10  2009-10-24 2011-09-10  2011-09-17 2011-10-08 2011-10-15 2011-11-05
11  2009-11-21 2011-10-08  2011-10-15 2011-11-05 2011-11-12 2011-12-03
12  2009-12-19 2011-11-05  2011-11-12 2011-12-03 2011-12-10 2011-12-31
2012-0

In [6]:
import xgboost as xgb
from xgboost import XGBClassifier
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK, plotting, space_eval
import numpy as np

In [7]:
# Hyperparameter search space handed to hyperopt's fmin.
#   - quniform(label, low, high, q): value drawn between low and high and
#     quantised to a multiple of q.
#   - choice(label, options): one of the listed options. Per the hyperopt docs,
#     fmin reports the *index* of the chosen option for these dimensions, so
#     use space_eval(space, best) to recover the actual values.
space = dict(
    # learning rate, 0.02 .. 0.5 in steps of 0.01
    eta=hp.quniform('eta', 0.02, 0.5, 0.01),
    # tree depth, one of the integers 2 .. 9
    max_depth=hp.choice('max_depth', np.arange(2, 10, dtype=int)),
    # 1 .. 3 in steps of 1
    min_child_weight=hp.quniform('min_child_weight', 1, 3, 1),
    # row sampling fraction, 0.2 .. 1 in steps of 0.1
    subsample=hp.quniform('subsample', 0.2, 1, 0.1),
    # column sampling fraction, 0.2 .. 1 in steps of 0.1
    colsample_bytree=hp.quniform('colsample_bytree', 0.2, 1, 0.1),
    # number of boosting rounds, one of the integers 5 .. 149
    n_estimators=hp.choice('n_estimators', np.arange(5, 150, dtype=int)),
)

def optimize(max_evals=100, rstate=None):
    """Run a TPE hyperparameter search over ``space`` and return the best point.

    Depends on three notebook-level names that must exist before the call:
    ``_score`` (the objective passed to ``fmin``), ``space`` (the search space)
    and ``trials`` (the object that accumulates every evaluation).

    Parameters
    ----------
    max_evals : int, default 100
        Number of evaluations to run. The default matches the
        previously hard-coded value, so ``optimize()`` behaves as before.
    rstate : optional, default None
        Random state forwarded to ``fmin`` to make the search reproducible.
        ``None`` keeps hyperopt's default (unseeded) behaviour. The accepted
        type depends on the installed hyperopt version - check its docs.

    Returns
    -------
    dict
        The result of ``fmin``. NOTE: per the hyperopt docs, ``hp.choice``
        dimensions are reported as the index of the selected option, not its
        value; call ``space_eval(space, best)`` to get the real parameters.
    """
    return fmin(
        _score,
        space,
        algo=tpe.suggest,
        trials=trials,
        max_evals=max_evals,
        rstate=rstate,
        verbose=0,
    )