In [None]:
#!pip install optuna hyperopt

### Import libraries

In [None]:
import numpy as np
import pandas as pd
from pandas.plotting import autocorrelation_plot
import re
import os
import random
import math
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import OrdinalEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV, RepeatedKFold
from datetime import datetime
import pickle
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from optuna import create_study
from optuna.samplers import TPESampler
from optuna.integration import XGBoostPruningCallback
import warnings
warnings.filterwarnings('ignore')

### Import data

In [None]:
# load the train and test dataframes from the pickle files in ../asset
# NOTE(review): unpickling can execute arbitrary code, so only read files you produced yourself
train_df = pd.read_pickle('../asset/train_df.pkl')
test_df = pd.read_pickle('../asset/test_df.pkl')

### Helper

In [None]:
def convert_all_features(data):

    """Convert boolean and categorical columns to numerical values.

    The frame passed in is not modified; all conversions happen on a copy.

    Input:
    data -> dataframe (the original dataframe)

    Output:
    dummy_df -> dataframe (identifier / calendar / datetime columns dropped,
                'transferred' mapped to 0/1, the binned columns ordinal encoded
                and every remaining object / categorical column one hot encoded)
    """

    # identifier and calendar columns that are dropped before encoding
    skip_cols = {'date', 'unique_key', 'date_year', 'date_quarter', 'date_month',
                 'date_day', 'date_week', 'year_month'}

    # binned columns that are ordinal encoded
    ordinal_cols = {'state_sales_cut', 'store_sales_bins', 'store_type_sales',
                    'family_sales_bins', 'onpromo_avg_bins', 'cluster_sales_indicator'}

    # work on a copy: callers pass filtered slices of a bigger frame, and writing into
    # those would both alter the caller's data and trigger SettingWithCopyWarning
    data = data.copy()

    no_need_to_transform = []

    # based on the features, use different methods for encoders.
    for col, dtype in data.dtypes.items():
        is_time_like = (pd.api.types.is_datetime64_any_dtype(dtype)
                        or pd.api.types.is_timedelta64_dtype(dtype))
        if col in skip_cols or is_time_like:
            no_need_to_transform.append(col)
        elif col == 'transferred':
            # plain column assignment replaces the column (and its dtype); `.loc[:, col] = ...`
            # writes into the existing column and can leave an object dtype behind, which
            # get_dummies would then one hot encode
            data[col] = data[col].eq(True).astype(int)
        elif col in ordinal_cols:
            # fit_transform refits on every call, so a fresh encoder per column is equivalent
            oe = OrdinalEncoder()
            encoded = oe.fit_transform(data[col].to_numpy().reshape(-1, 1))
            data[col] = encoded.ravel()

    # one hot encode whatever object / categorical columns are left
    return pd.get_dummies(data.drop(columns=no_need_to_transform))

In [None]:
def time_split(data, train_size):

    """Cut the data into a leading train part and a trailing test part.

    No sorting or shuffling happens here, so the rows have to be in time order
    already for this to be a time series split.

    Input:
    data -> dataframe (the original dataframe)
    train_size -> float (fraction of the rows that goes into the train part)

    Output:
    train -> the first int(len(data) * train_size) rows
    test -> all the rows after that
    """

    # position of the first test row; int() truncates the fractional part
    cutoff = int(len(data) * train_size)

    return data[:cutoff], data[cutoff:]

In [None]:
def calculate_total_errors(pred_dict, actual, yhat):

    """Sum the mean absolute errors of every entry in a prediction dictionary.

    Input:
    pred_dict -> dict (key: unique_key, value: dict holding the actual and predicted values)
    actual -> key under which each inner dict stores the actual y values
    yhat -> key under which each inner dict stores the predicted values
            (an array, or a single number that is compared against every actual value)

    Output:
    error_total -> float (sum over all keys of the mean absolute error; 0 for an empty dict)
    """

    error_total = 0
    # looping through the dictionary and summing the error.
    for entry in pred_dict.values():
        y_true = np.asarray(entry[actual], dtype=float)
        y_pred = np.asarray(entry[yhat], dtype=float)
        # the mean absolute error is computed with numpy: it needs no extra import and
        # broadcasting lets a scalar prediction (e.g. a constant 0) be scored as well
        error_total += float(np.mean(np.abs(y_true - y_pred)))

    return error_total

### Hyperparameter tuning

- Store–family combinations whose total sales are zero are filtered out and not tuned.

In [None]:
# create dictionaries to save the results
xgb_params_dict = {}

# create unique keys based on the store number and departments
train_df['unique_str_dep_key'] = train_df['store_nbr'].astype(str) + '-' + train_df['family'].astype(str)

# get the unique list of keys
unique_key_list = list(train_df.unique_str_dep_key.unique())

# total sales per key; only 'sales' is aggregated so that date and string columns
# never reach sum()
total_sales_df = train_df.groupby('unique_str_dep_key')[['sales']].sum()
zero_sales = total_sales_df[total_sales_df.sales == 0]

# find stores that have zero sales total (the set gives fast membership tests in the loop)
zero_list = list(zero_sales.index)
zero_keys = set(zero_list)

# search space; it is the same for every key, so it is built once outside the loop
params = {'max_depth': [3, 6, 10, 15],
          'learning_rate': [0.01, 0.1, 0.2, 0.3, 0.4],
          'subsample': np.arange(0.5, 1.0, 0.1),
          'colsample_bytree': np.arange(0.5, 1.0, 0.1),
          'colsample_bylevel': np.arange(0.5, 1.0, 0.1),
          'n_estimators': [100, 250, 500, 750]
          }

# make sure the output folder exists before the first parameter file is written
os.makedirs('../asset/best_params', exist_ok=True)

# we want to filter out data based on the unique keys
for key in unique_key_list:

    if key in zero_keys:
        # meaning these store + dep combinations have 0 sales
        xgb_params_dict[key] = 'zero_coef'
        continue

    # filter out data; the result of reset_index/drop has to be assigned to take effect
    data = train_df[train_df.unique_str_dep_key == key]
    data = data.reset_index(drop=True).drop(columns=['date', 'store_nbr', 'family'])

    # perform feature engineering -> one hot coding
    new_data = convert_all_features(data)

    # train test split (70:30); only the train part is used for tuning
    train, _ = time_split(new_data, 0.7)
    # not in place: `train` is a slice of new_data
    train = train.drop_duplicates()

    # assign predictors and target variables
    train_X, train_y = train.drop(columns=['sales']), train.sales.values

    # start training
    xgbclf = XGBRegressor(tree_method='hist')
    clf = RandomizedSearchCV(estimator=xgbclf,
                             param_distributions=params,
                             n_iter=25,
                             # ordered folds: every validation fold lies after its training rows
                             cv=TimeSeriesSplit(n_splits=5),
                             # fixed seed so a re-run samples the same candidates
                             random_state=42,
                             n_jobs=4,
                             verbose=1)
    # fit the model
    clf.fit(train_X, train_y)

    # find the best params
    best_hyperparams = clf.best_params_

    print(f"{key} found best params.")

    # save the best results ('/' in a key would otherwise be read as a path separator)
    xgb_params_dict[key] = best_hyperparams
    key_change = str(key).replace('/', '_')
    with open(f'../asset/best_params/best_params_{key_change}.pkl', 'wb') as f:
        pickle.dump(xgb_params_dict[key], f)

### Use the tuned hyperparameters in the model

In [None]:
# create dictionaries to save the results
xgb_pred_dict_train = {}
xgb_pred_dict_test = {}

# create unique keys based on the store number and departments
train_df['unique_str_dep_key'] = train_df['store_nbr'].astype(str) + '-' + train_df['family'].astype(str)

# we want to filter out data based on the unique keys
for key, val in xgb_params_dict.items():

    # filter dataframe so that we can only get the unique df;
    # the result of reset_index/drop has to be assigned to take effect
    data = train_df[train_df.unique_str_dep_key == key]
    data = data.reset_index(drop=True).drop(columns=['date', 'store_nbr', 'family'])

    # perform feature engineering -> one hot coding
    new_data = convert_all_features(data)

    # train test split (70:30)
    train, test = time_split(new_data, 0.7)
    # not in place: both parts are slices of new_data
    train = train.drop_duplicates()
    test = test.drop_duplicates()

    # assign predictors and target variables
    train_X, train_y = train.drop(columns=['sales']), train.sales.values
    test_X, test_y = test.drop(columns=['sales']), test.sales.values

    # if coef 0, then predict 0 for every row (an array with the same length as the actuals)
    if val == 'zero_coef':
        xgb_pred_dict_train[key] = {'actual': train_y,
                                    'yhat': np.zeros_like(train_y, dtype=float)}
        xgb_pred_dict_test[key] = {'actual': test_y,
                                   'yhat': np.zeros_like(test_y, dtype=float)}
        continue

    # if not, use the hyper parameters that we created
    # load the hyper parameters
    # NOTE(review): pickle.load can execute arbitrary code; these files are expected to be
    # the ones written by the tuning step, do not point this at untrusted files
    key_change = str(key).replace('/', '_')
    with open(f'../asset/best_params/best_params_{key_change}.pkl', 'rb') as f:
        best_params = pickle.load(f)

    # same tree method as in the search; a value stored in the file would take precedence
    xgbr = XGBRegressor(**{'tree_method': 'hist', **best_params})
    xgbr.fit(train_X, train_y)

    train_y_hat = xgbr.predict(train_X)
    test_y_hat = xgbr.predict(test_X)

    xgb_pred_dict_train[key] = {'actual': train_y,
                                'yhat': train_y_hat}
    xgb_pred_dict_test[key] = {'actual': test_y,
                               'yhat': test_y_hat}
        

### Combine the results

In [None]:
# one frame per key is collected in a list and concatenated once at the end;
# DataFrame.append no longer exists in pandas 2.x and copied the whole frame on every call
xgb_frames = []
for key in xgb_pred_dict_test:
    # label every row with the split it belongs to
    xgb_train_df = pd.DataFrame(xgb_pred_dict_train[key]).assign(ind='train')
    xgb_test_df = pd.DataFrame(xgb_pred_dict_test[key]).assign(ind='test')

    xgb_df = pd.concat([xgb_train_df, xgb_test_df])
    # absolute error per row
    xgb_df['error'] = np.abs(xgb_df['actual'] - xgb_df['yhat'])
    xgb_df['unique_key'] = key

    xgb_frames.append(xgb_df)

# pd.concat raises on an empty list, so fall back to an empty frame when there are no keys
total_df = pd.concat(xgb_frames, ignore_index=True) if xgb_frames else pd.DataFrame()

In [None]:
# summed absolute error for every (unique_key, train/test) pair
total_error_by_groups = total_df.groupby(['unique_key', 'ind'])[['error']].sum()

In [None]:
# move the group labels out of the index into regular columns
# NOTE(review): this reassigns the same name, so running the cell a second time adds an extra 'index' column
total_error_by_groups = total_error_by_groups.reset_index()

In [None]:
# total absolute error over all train rows
total_error_by_groups.loc[total_error_by_groups['ind'] == 'train', 'error'].sum()

In [None]:
# total absolute error over all test rows
total_error_by_groups.loc[total_error_by_groups['ind'] == 'test', 'error'].sum()