In [3]:
import itertools
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import xgboost as xgb
import yaml
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK, plotting, space_eval
from xgboost import XGBClassifier

from src.utils import create_folder
# Wildcard import: deliberately kept after the imports above and before the
# datetime import below, so name-shadowing order is the same as before.
from src.utils import *
from datetime import timedelta, datetime

# Project root is wherever the notebook kernel was started.
proj_path = Path.cwd()

# Data catalog: only the 'breakfast' section of catalog.yml is kept.
with (proj_path / 'catalog.yml').open("r") as f:
    catalog = yaml.safe_load(f)['breakfast']

# Experiment parameters: the whole params.yml document.
with (proj_path / 'params.yml').open("r") as f:
    params = yaml.safe_load(f)

In [4]:
# Hyperparameter search
# Search space handed to hyperopt. Keys are the names the scoring function
# looks up in each sampled configuration.
space = {
    'eta': hp.quniform('eta', 0.02, 0.5, 0.01),
    'max_depth': hp.choice('max_depth', np.arange(2, 10, dtype=int)),
    'min_child_weight': hp.quniform('min_child_weight', 1, 3, 1),
    'subsample': hp.quniform('subsample', 0.2, 1, 0.1),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.2, 1, 0.1),
    'n_estimators': hp.choice('n_estimators', np.arange(5, 150, dtype=int))
}


def optimize(score_fn=None, trials_obj=None, max_evals=100):
    """Run a TPE search over `space` and return hyperopt's best assignment.

    Parameters
    ----------
    score_fn : callable, optional
        Objective passed to `fmin`; it receives one sampled configuration and
        returns the loss to minimise. When omitted, the notebook-level
        `_score` is used, so `_score` must already be defined at call time.
    trials_obj : hyperopt.Trials, optional
        Object in which `fmin` records the evaluations. When omitted, the
        notebook-level `trials` is used and must already exist.
    max_evals : int, optional
        Number of evaluations `fmin` is allowed to run (default 100).

    Returns
    -------
    The value returned by `fmin`. Convert it with `space_eval(space, best)`
    before reading hyperparameter values from it.
    """
    # Fall back to the notebook globals so a bare `optimize()` keeps working.
    objective = _score if score_fn is None else score_fn
    history = trials if trials_obj is None else trials_obj
    return fmin(objective, space, algo=tpe.suggest, trials=history,
                max_evals=max_evals, verbose=0)

In [5]:
# Load the processed, merged dataset and parse the week-end dates.
merged_data = pd.read_csv('data/processed/merged_data.csv')
merged_data['WEEK_END_DATE'] = pd.to_datetime(merged_data['WEEK_END_DATE'])

# Snapshot taken before the dates are shifted below.
original_data = merged_data.copy()

# Move every week-end date three days forward.
merged_data['WEEK_END_DATE'] = merged_data['WEEK_END_DATE'] + timedelta(days=3)

# Experiment windows built from params.yml. The table is bound under both
# spellings so code referring to either `date_ranges` or `data_ranges` works.
date_ranges = make_dates(params['breakfast']['experiment_dates'])
data_ranges = date_ranges

In [6]:
# Store and UPC identifiers are the keys of the corresponding params.yml maps.
stores = list(params['breakfast']['dataset']['store_ids'].keys())
upcs = list(params['breakfast']['dataset']['upc_ids'].keys())

# Every (store, upc) combination; itertools is imported in the top import cell.
store_upc_pairs = list(itertools.product(stores, upcs))
print(store_upc_pairs)

[(2277, 1600027527), (2277, 3800031838), (2277, 1111009477), (2277, 7192100339), (389, 1600027527), (389, 3800031838), (389, 1111009477), (389, 7192100339), (25229, 1600027527), (25229, 3800031838), (25229, 1111009477), (25229, 7192100339)]


In [None]:



    
#for store_id, upc_id in store_upc_pairs:
#    create_folder(os.path.join(proj_path, 'runs'))
# Feature-engineering settings and the engineered table do not depend on the
# experiment window, so they are built once, ahead of the loop.
lag_units = params['xgb']['window_size']
avg_units = params['xgb']['avg_units']

# Texas rows only, restricted to the target and the columns used as features.
filtered_data = merged_data[merged_data['ADDRESS_STATE_PROV_CODE']=='TX'][['WEEK_END_DATE', 'STORE_NUM', 'UPC', 'UNITS', 'PRICE', 'BASE_PRICE', 'STORE_ID', 'MSA_CODE', 'PARKING_SPACE_QTY', 'SALES_AREA_SIZE_NUM', 'AVG_WEEKLY_BASKETS']].copy()

# Lagged units, historical averages of the first lag, and date-part columns.
make_lag_features(filtered_data, lag_units, col_name='UNITS', prefix_name='lag-units', inplace=True)
make_historical_avg(filtered_data, r_list=avg_units, col_n='lag-units-1', google_trends=True)
add_datepart(filtered_data, fldname='WEEK_END_DATE', drop=False)


def _window(frame, start, end):
    """Return a copy of the rows with start <= WEEK_END_DATE <= end, indexed by that date."""
    in_range = (frame['WEEK_END_DATE'] >= start) & (frame['WEEK_END_DATE'] <= end)
    return frame[in_range].copy().set_index('WEEK_END_DATE')


def _build_regressor(hp_params):
    """Create an unfitted XGBRegressor from one configuration drawn from `space`.

    `hp_params` must provide the keys 'eta', 'max_depth', 'min_child_weight',
    'subsample', 'colsample_bytree' and 'n_estimators'. The objective, seed
    and tree method are fixed so that the model scored during the search and
    the final model are configured the same way.
    """
    return xgb.XGBRegressor(objective='reg:squarederror',
                            colsample_bytree=hp_params['colsample_bytree'],
                            learning_rate=hp_params['eta'],
                            max_depth=int(hp_params['max_depth']),
                            min_child_weight=hp_params['min_child_weight'],
                            n_estimators=int(hp_params['n_estimators']),
                            random_state=2020,
                            subsample=hp_params['subsample'],
                            tree_method='hist')


def _score(hp_params):
    """Objective for hyperopt: fit on the training window, return MAPE on the validation window.

    Reads the notebook-level X_train, y_train, X_valid and y_valid at call
    time, i.e. whatever the current loop iteration has assigned.
    """
    candidate = _build_regressor(hp_params)
    candidate.fit(X_train, y_train)
    preds = candidate.predict(X_valid)
    return mean_absolute_percentage_error(y_valid, preds)


fname = './results/' + 'xgb_TX.csv'
# Make sure the output directory exists before the first write.
os.makedirs(os.path.dirname(fname), exist_ok=True)
fold_results = []

# `data_ranges` is the name the experiment date table is bound to where
# make_dates() is called. Each row gives one train/valid/test window.
for _, train_start, train_end, valid_start, valid_end, test_start, test_end in data_ranges.itertuples():
    training_df = _window(filtered_data, train_start, train_end)
    valid_df = _window(filtered_data, valid_start, valid_end)
    test_df = _window(filtered_data, test_start, test_end)

    # pop() removes the target column, leaving only features in each frame.
    X_train = training_df
    y_train = X_train.pop('UNITS')
    X_valid = valid_df
    y_valid = X_valid.pop('UNITS')
    X_test = test_df
    y_test = X_test.pop('UNITS')

    # Fresh trial history per window; optimize() picks up `trials` and `_score`.
    trials = Trials()
    best_hyperparams = optimize()
    # Translate hyperopt's result back into actual hyperparameter values.
    hyperparameters = space_eval(space, best_hyperparams)

    # Refit the best configuration on train + validation, then predict the test window.
    xgb_model = _build_regressor(hyperparameters)
    xgb_model.fit(pd.concat([X_train, X_valid]), pd.concat([y_train, y_valid]))

    test_preds = xgb_model.predict(X_test)
    test_df['test_predictions'] = test_preds
    test_df['y_true'] = y_test.values

    # Rewrite the file with every window processed so far, so earlier windows
    # are not lost and partial results survive an interrupted run.
    fold_results.append(test_df)
    pd.concat(fold_results).to_csv(fname)
        
    