### Import Libraries

In [11]:
import numpy as np
import pandas as pd
import re
import os
import warnings
warnings.filterwarnings('ignore')
import pickle
from sklearn.preprocessing import OrdinalEncoder
from xgboost import XGBRegressor

### Import data

In [3]:
# Load the pickled train and test frames (in that order) from the asset folder.
train_df, test_df = (
    pd.read_pickle(pkl_path)
    for pkl_path in ('../asset/train_df.pkl', '../asset/test_df.pkl')
)

### Helper

In [12]:
def convert_all_features(data):
    
    """Convert boolean and categorical columns of a dataframe to numerical values.
    
    Calendar / identifier columns and every datetime- or timedelta-typed column
    are dropped, ``transferred`` becomes a 0/1 flag, the binned sales and
    promotion columns are ordinal encoded, and all remaining non-numeric
    columns are one hot encoded by ``pd.get_dummies``.
    
    Input:
    data -> dataframe (the original dataframe; it is left unmodified)
    
    Output:
    final_df -> dataframe (after feature conversions including one hot encoding and ordinal encoding)
    
    Note:
    The ordinal codes and the dummy columns are derived from ``data`` alone, so
    two frames converted separately only share the same encoding when they
    contain the same category values.
    """
    
    # identifier / calendar helper columns that are removed from the output
    drop_cols = {'date', 'unique_key', 'date_year', 'date_quarter', 'date_month',
                 'date_day', 'date_week', 'year_month'}
    # binned columns that are ordinal encoded instead of one hot encoded
    ordinal_cols = {'state_sales_cut', 'store_sales_bins', 'store_type_sales',
                    'family_sales_bins', 'onpromo_avg_bins', 'cluster_sales_indicator'}
    
    # work on a copy: callers usually pass a slice of a larger frame, and
    # writing into it would silently modify (or fail to modify) the original
    result = data.copy()
    
    no_need_to_transform = []
    
    # based on the features, use different methods for encoders.
    for col, dtype in result.dtypes.items():
        is_time_like = (pd.api.types.is_datetime64_any_dtype(dtype)
                        or pd.api.types.is_timedelta64_dtype(dtype))
        if col in drop_cols or is_time_like:
            no_need_to_transform.append(col)
        elif col == 'transferred':
            # replace the whole column so that the result has an integer dtype
            result[col] = result[col].eq(True).astype(int)
        elif col in ordinal_cols:
            # the encoder expects a 2D array; flatten the result back to a column.
            # Assigning with result[col] (not .loc) guarantees a numeric dtype, so
            # get_dummies below does not one hot encode the encoded values again.
            encoded = OrdinalEncoder().fit_transform(result[col].to_numpy().reshape(-1, 1))
            result[col] = encoded.ravel()
    
    # every remaining object / categorical column is one hot encoded here
    dummy_df = pd.get_dummies(result.drop(columns = no_need_to_transform))
    return dummy_df

In [29]:
def time_split(data, train_size):
    
    """Split ordered data into a leading train part and a trailing test part.
    
    The rows are not shuffled: the first ``train_size`` share of ``data``
    becomes the train set and everything after it becomes the test set.
    
    Input:
    data -> dataframe (the original dataframe, already in time order)
    train_size -> float (the fraction of rows that goes into the train set)
    
    Output:
    train -> dataframe (the first int(len(data) * train_size) rows)
    test -> dataframe (all remaining rows)"""
    
    # position of the first test row; the fraction is truncated towards zero
    cut = int(len(data) * train_size)
    
    return data[:cut], data[cut:]

### Prediction

For store–family combinations whose total sales in the training data are 0, we predict 0. For all others, we fit an XGBoost regressor using the hyperparameters previously tuned with Optuna.

In [41]:
# define a dictionary to save the results (unique key -> predictions for that key)
pred_dict = {}

# change special characters ('/' is replaced by '&'; the keys built below are
# used inside the parameter file names)
train_df['family'] = train_df['family'].astype(str).str.replace('/', '&', regex=False)
test_df['family'] = test_df['family'].astype(str).str.replace('/', '&', regex=False)

# create unique keys based on the store number and departments
test_df['unique_str_dep_key'] = test_df['store_nbr'].astype(str) + '-' + test_df['family'].astype(str)
train_df['unique_str_dep_key'] = train_df['store_nbr'].astype(str) + '-' + train_df['family'].astype(str)

# filter out zero sales (only the sales column is aggregated)
total_sales = train_df.groupby('unique_str_dep_key')[['sales']].sum()
zero_sales = total_sales[total_sales.sales == 0]
zero_sales_list = list(zero_sales.index)
zero_sales_keys = set(zero_sales_list)  # O(1) membership test inside the loop

unique_key_list = list(train_df.unique_str_dep_key.unique())

# group once instead of scanning the full frames for every key
train_groups = train_df.groupby('unique_str_dep_key', sort=False)
test_groups = test_df.groupby('unique_str_dep_key', sort=False)
test_keys = set(test_groups.groups)
empty_pred_df = test_df.iloc[0:0]  # used for keys that have no test rows

# hyperparameters that XGBoost requires as integers
int_params = ('max_depth', 'min_child_weight', 'n_estimators')

# loop through the unique key
for u_key in unique_key_list:
    
    # rows of the test set that belong to this store-family pair
    pred_df = test_groups.get_group(u_key) if u_key in test_keys else empty_pred_df
    
    # pairs that never sold anything are predicted as 0 without fitting a model
    if u_key in zero_sales_keys:
        pred_dict[u_key] = [0] * len(pred_df)
        continue
    
    data = train_groups.get_group(u_key)
    
    # perform feature engineering -> one hot coding
    new_data = convert_all_features(data)
    new_pred_data = convert_all_features(pred_df.drop(columns = ['date']))
    new_pred_data = new_pred_data.set_index('id')
    
    # train test split (70:30)
    # NOTE(review): the model is fitted on the first 70% of each series only and
    # the held-out tail is not used in this cell - confirm this is intended
    train, _ = time_split(new_data, 0.7)
    train = train.drop_duplicates()
    
    # assign predictors and target variables; 'id' is the index of the
    # prediction frame, so it must not be a training feature either
    train_X = train.drop(columns = ['sales']).drop(columns = ['id'], errors = 'ignore')
    train_y = train['sales'].to_numpy()
    
    # both frames are encoded separately, so align the prediction columns with
    # the training columns (missing columns are filled with 0)
    new_pred_data = new_pred_data.reindex(columns = train_X.columns, fill_value = 0)
    
    # NOTE: pickle can execute arbitrary code - only load parameter files that
    # were produced by this project
    with open(f'../asset/best_params_optuna/best_params_{u_key}.pkl', 'rb') as f:
        best_params = pickle.load(f)
    
    best_params = {k: int(v) if k in int_params else v for k, v in best_params.items()}
    
    xgbr = XGBRegressor(**best_params)
    xgbr.fit(train_X, train_y)
    pred_dict[u_key] = xgbr.predict(new_pred_data)