In [None]:
#!pip install fbprophet

### Load libraries

In [None]:
import numpy as np
import pandas as pd
import re
import os
import pickle
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from xgboost import XGBRegressor
from fbprophet import Prophet

### Load data

In [None]:
# Load the pickled train and test dataframes from the shared asset directory.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
train_df = pd.read_pickle('../asset/train_df.pkl')
test_df = pd.read_pickle('../asset/test_df.pkl')

In [None]:
# number of distinct stores in the training data
train_df['store_nbr'].nunique()

In [None]:
# number of distinct product families (departments) in the training data
train_df['family'].nunique()

### Data Manipulation

The original dataset has 54 unique stores and 33 unique departments, for a total of 1,782 unique store–department combinations. Modelling each combination separately gives detailed information about every department, but it takes a long time to train. What if we aggregate the sales to a total per store and measure the errors at that level instead? Would there be a huge difference?

In [None]:
# Total sales per (date, store). Only the `sales` column is aggregated: summing every
# column would also try to "sum" string and datetime columns, which is slow and fails
# (or concatenates strings) on recent pandas versions.
train_df_group_by = train_df.groupby(['date', 'store_nbr'], as_index=False)['sales'].sum()

In [None]:
# keep only the grouping keys and the aggregated sales
train_df_group_by = train_df_group_by.loc[:, ['date', 'store_nbr', 'sales']]

In [None]:
# the aggregated column is the store-level total, so name it accordingly
train_df_group_by = train_df_group_by.rename(columns={'sales': 'total_sales'})

In [None]:
# Attach the store-level total to every original row.
# validate='many_to_one' makes pandas raise if (date, store_nbr) is not unique on the
# right-hand side, so an accidental row fan-out cannot go unnoticed.
train_df_combined = pd.merge(train_df, train_df_group_by,
                             on=['date', 'store_nbr'],
                             how='left',
                             validate='many_to_one')

In [None]:
# A left join on unique keys must not change the number of rows; fail loudly if it does
# so that "Run All" stops here instead of continuing with duplicated data.
assert len(train_df_combined) == len(train_df), "merge changed the number of rows"

The join worked as expected: the row count is unchanged, so no duplicate rows were introduced.

In [None]:
# Remove the per-family columns, then keep the first row for each
# (date, store_nbr, total_sales) combination.
dedup_keys = ['date', 'store_nbr', 'total_sales']
final_train_df = (
    train_df_combined
    .drop(columns=['family', 'sales'])
    .drop_duplicates(subset=dedup_keys)
)

In [None]:
# the aggregated frame is expected to hold one row per (date, store) pair
n_dates = train_df['date'].nunique()
n_stores = train_df['store_nbr'].nunique()
n_dates * n_stores == len(final_train_df)

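After dropping the duplicate rows, the aggregated dataframe should contain exactly one row per date–store pair, which is what the check above verifies.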

### Helper

In [None]:
def convert_features(data):
    """Convert boolean and categorical feature columns to numerical values.
    
    Input:
    data -> dataframe (the original dataframe; it must contain every column listed in `features` below,
            and may additionally contain `total_sales`)
    
    Output:
    final_df -> dataframe (a new dataframe: boolean columns cast to int, the bin / indicator columns
                ordinal-encoded, and any remaining categorical columns one-hot encoded).
                The input dataframe is not modified.
    """
    
    # selecting features that we will use for the Prophet model
    features = ['date', 'is_weekend', 'holiday_counts', 'is_multiple', 'is_above_median','store_sales_bins', 
                'family_sales_bins', 'onpromo_avg_bins', 'price_indicator', 'is_higher_than_avg_oil_price',
                'is_delta_-4', 'is_delta_-3', 'is_delta_3', 'is_delta_4', 'is_after_2014-12', 
                'state_sales_cut','store_type_sales', 'cluster_sales_indicator', 'christmas_sales_season']
    
    # only the training data carries the target column, so it is added when present
    if 'total_sales' in data.columns:
        features.append('total_sales')
    
    # Work on an explicit copy: writing into a slice of `data` triggers
    # SettingWithCopyWarning and may or may not touch the caller's dataframe.
    new_data = data[features].copy()
    
    ordinal_cols = ['store_sales_bins', 'family_sales_bins', 'onpromo_avg_bins', 'cluster_sales_indicator']
    oe = OrdinalEncoder()
    
    # based on the features, use different methods for encoders. 
    for col, dtype in new_data.dtypes.items():
        if col == 'date':
            continue
        if dtype == 'bool':
            # Replace the column instead of writing through .loc[:, col]: recent pandas
            # writes .loc[:, col] in place and may keep a non-numeric dtype.
            new_data[col] = new_data[col].astype(int)
        elif col in ordinal_cols:
            # the encoder is re-fitted for every column (and therefore for every call)
            new_data[col] = oe.fit_transform(new_data[[col]]).ravel()
    
    # every column that is still categorical / object is expanded into dummy variables
    return pd.get_dummies(new_data)

In [None]:
def time_split(data, train_size):
    """Split an ordered dataset into a leading train part and a trailing test part.
    
    No shuffling is performed: the first `train_size` share of the rows becomes the
    train set and the remaining rows become the test set.
    
    Input:
    data -> dataframe (the original dataframe, already in the desired order)
    train_size -> float (the share of rows that goes to the train set)
    
    Output:
    train -> dataframe (the first int(len(data) * train_size) rows)
    test -> dataframe (all remaining rows)"""
    
    cutoff = int(len(data) * train_size)
    
    return data[:cutoff], data[cutoff:]

In [None]:
def create_prophet(prophet_df, ci):
    
    """Fit one Prophet model per store and return the forecasts for each store.
    
    For every store the rows are converted with convert_features, split 70:30 in row order
    with time_split, and a Prophet model with one extra regressor per feature column is
    fitted on the first part. Predictions are then made for the training dates plus
    `len(test) - 1` following days.
    
    Input:
    prophet_df -> dataframe (must contain `store_nbr` and every column convert_features requires,
                  including `total_sales`)
    ci -> float (passed to Prophet as interval_width)
    
    Output:
    prophet_dict -> dict (key: store number, value: dataframe with ds, yhat_lower, yhat_upper and yhat)"""
    
    # create a dictionary to save information
    prophet_dict = {}
    
    for key in prophet_df.store_nbr.unique():

        # rows of the current store with a fresh 0..n-1 index
        data = prophet_df[prophet_df.store_nbr == key].reset_index(drop=True)

        # Prophet expects the date column to be called `ds` and the target `y`
        data_converted = convert_features(data).rename(columns={'date': 'ds', 'total_sales': 'y'})

        # time series split (70:30); reassign instead of dropping in place on a slice
        train, test = time_split(data_converted, 0.7)
        train = train.drop_duplicates()
        test = test.drop_duplicates()
        test_len = len(test)

        model = Prophet(interval_width=ci)

        # every column except the date, the target and the key is used as a regressor
        for feature in train.columns:
            if feature not in ['ds', 'y', 'unique_store_dep_key']:
                model.add_regressor(feature, standardize=True)

        model.fit(train)

        # create a future dataframe so that we can compare the y and yhat
        # NOTE(review): the horizon is one day shorter than the hold-out set; the reason is
        # not visible here - confirm this is intended.
        future = model.make_future_dataframe(periods=test_len - 1)

        # The original code called `future.append(christmas_df, ...)` here but discarded the
        # result, so the Christmas dates were never part of `future`. The call is removed
        # (DataFrame.append no longer exists in pandas 2.x); predictions are unchanged.

        # combine two data sources together
        original = pd.concat([train, test])

        # add the regressor values for every date in `future`
        new_future = pd.merge(future, original, on='ds', how='left')

        # create a forecast for the training and validation dates
        forecast = model.predict(new_future)

        # copy so that callers can add columns without SettingWithCopyWarning
        prophet_dict[key] = forecast[['ds', 'yhat_lower', 'yhat_upper', 'yhat']].copy()
        
    return prophet_dict

In [None]:
def combine_y_yhat(original_df, pred_df, train_size=0.7):
    
    """Combine the yhat from the Prophet model with the actual y from the original dataset.
       
       Input:
       
       original_df -> dataframe (must contain the columns store_nbr, date and total_sales;
                      the rows of each store are assumed to be in chronological order)
       pred_df -> dict (key: store number, value: dataframe with a `ds` column and the forecasts)
       train_size -> float (share of each store's rows labelled 'train'; the rest is labelled 'test'.
                     Defaults to 0.7.)
       
       Output:
       
       combined_dict -> dictionary (key: store number, value: dataframe with ds, y, indicator and the
                        forecast columns). Stores without an entry in pred_df are skipped."""
    
    # create a dictionary to save the results
    combined_dict = {}
    
    # the stores come from the dataframe that was passed in, not from a global variable
    for key in original_df.store_nbr.unique():
        
        # a store may have been left out on purpose when the forecasts were built
        if key not in pred_df:
            continue
        
        # actual values of this store; .rename returns a new frame, so nothing is written into a slice
        is_store = original_df.store_nbr == key
        filtered_true = original_df.loc[is_store, ['date', 'total_sales']].rename(columns={'date': 'ds'})
        
        # the first `train_size` share of the rows is train, the rest is test
        total_len = len(filtered_true)
        train_idx = int(total_len * train_size)
        filtered_true['indicator'] = ['train'] * train_idx + ['test'] * (total_len - train_idx)
        
        # inner join on the date, so only dates present on both sides are kept
        merged_df = pd.merge(filtered_true, pred_df[key], on='ds')
        
        # save information in the dictionary
        combined_dict[key] = merged_df.rename(columns={'total_sales': 'y'})
        
    return combined_dict

In [None]:
def find_mae(pred_dict):
    
    """Compute the mean absolute error of the train and test partitions for every key.
    
    Input:
    pred_dict -> dict (key: unique_key, value: dataframe with the columns y, yhat and indicator,
                 where indicator is 'train' or 'test')
    
    Output:
    error_dict -> dict (key: unique_key, value: {'train_error': float, 'test_error': float}).
                  A partition without rows is reported as NaN. The input dataframes are not modified.
    """
    
    error_dict = {}
    missing = float('nan')
    
    for key, data in pred_dict.items():
        
        # absolute error per row, kept separate so that the caller's dataframe is not changed
        abs_error = (data['yhat'] - data['y']).abs()
        
        # Average (not sum) per partition: train and test differ in size, so only the
        # mean is comparable between them. Aggregating this single series also avoids
        # summing unrelated columns such as the dates.
        errors_by_group = abs_error.groupby(data['indicator']).mean()
        
        # save the results in the dictionary
        error_dict[key] = {'train_error': float(errors_by_group.get('train', missing)),
                           'test_error': float(errors_by_group.get('test', missing))}
        
    return error_dict

In [None]:
def create_prophet_remove_zero(prophet_df, exception, ci):
    
    """Fit one Prophet model per store, skipping the stores listed in `exception`.
    
    Apart from the skipped stores this follows the same steps as create_prophet: convert the
    features, split 70:30 in row order, fit Prophet with one regressor per feature column and
    predict the training dates plus `len(test) - 1` following days.
    
    Input:
    prophet_df -> dataframe (must contain `store_nbr` and every column convert_features requires,
                  including `total_sales`)
    exception -> collection (store numbers that must not be modelled, e.g. stores with 0 sales)
    ci -> float (passed to Prophet as interval_width)
    
    Output:
    prophet_dict -> dict (key: store number, value: dataframe with ds, yhat_lower, yhat_upper and yhat)"""
    
    # create a dictionary to save information
    prophet_dict = {}
    
    for key in prophet_df.store_nbr.unique():
        
        # stores in the exception list are not taken into consideration
        if key in exception:
            continue
        
        # The keys iterated over are store numbers, so the rows must be selected by
        # `store_nbr` as well (the previous filter on `unique_store_dep_key` compared
        # store numbers against a different key column).
        data = prophet_df[prophet_df.store_nbr == key].reset_index(drop=True)

        # Prophet expects the date column to be called `ds` and the target `y`
        data_converted = convert_features(data).rename(columns={'date': 'ds', 'total_sales': 'y'})

        # time series split (70:30); reassign instead of dropping in place on a slice
        train, test = time_split(data_converted, 0.7)
        train = train.drop_duplicates()
        test = test.drop_duplicates()
        test_len = len(test)

        model = Prophet(interval_width=ci)

        # every column except the date, the target and the key is used as a regressor
        for feature in train.columns:
            if feature not in ['ds', 'y', 'unique_store_dep_key']:
                model.add_regressor(feature, standardize=True)

        model.fit(train)

        # create a future dataframe so that we can compare the y and yhat
        # NOTE(review): the horizon is one day shorter than the hold-out set; the reason is
        # not visible here - confirm this is intended.
        future = model.make_future_dataframe(periods=test_len - 1)

        # The original code called `future.append(christmas_df, ...)` here but discarded the
        # result, so the Christmas dates were never part of `future`. The call is removed
        # (DataFrame.append no longer exists in pandas 2.x); predictions are unchanged.

        # combine two data sources together
        original = pd.concat([train, test])

        # add the regressor values for every date in `future`
        new_future = pd.merge(future, original, on='ds', how='left')

        # create a forecast for the training and validation dates
        forecast = model.predict(new_future)

        # copy so that callers can add columns without SettingWithCopyWarning
        prophet_dict[key] = forecast[['ds', 'yhat_lower', 'yhat_upper', 'yhat']].copy()
        
    return prophet_dict

In [None]:
def calculate_total_errors(pred_dict, actual, yhat):
    
    """Add up the mean absolute error of every entry in the dictionary.
    
    Input:
    pred_dict -> dict (key: unique_key, values: dataframe holding y and yhat)
    actual -> (name of the column with the actual y value)
    yhat -> (name of the column with the predicted value from the model)
    
    Output:
    error_total -> number (sum over all keys of the per-key mean absolute error; 0 for an empty dict)"""
    
    # one MAE per dataframe, accumulated in dictionary order
    return sum(mean_absolute_error(frame[actual], frame[yhat]) for frame in pred_dict.values())

### Model Fitting

- Prophet

In [None]:
# one Prophet model per store, with a 95% uncertainty interval
prophet_pred = create_prophet(prophet_df=final_train_df, ci=0.95)

In [None]:
# Join every store's forecast with its actual sales and label each row as train or test.
# The per-store frames are collected in a list and concatenated once: growing a dataframe
# with DataFrame.append inside a loop is quadratic and no longer available in pandas 2.x.
combined_frames = []

for key, forecast in prophet_pred.items():
    # tag the forecast with its store on a new frame so that prophet_pred is left untouched
    temp = forecast.assign(store_nbr=key)

    combined = pd.merge(temp, final_train_df[['date', 'total_sales', 'store_nbr']],
                        left_on=['ds', 'store_nbr'],
                        right_on=['date', 'store_nbr'],
                        how='inner')

    # Build the train/test labels from the merged frame: the inner join may drop forecast
    # dates without actual sales, and the label list must match the final row count.
    # Rows are assumed to be in chronological order, as produced by the forecast.
    total_len = len(combined)
    train_idx = int(total_len * 0.7)
    combined['indicator'] = ['train'] * train_idx + ['test'] * (total_len - train_idx)

    combined_frames.append(combined)

total_df = pd.concat(combined_frames, ignore_index=True) if combined_frames else pd.DataFrame()

In [None]:
# absolute error of every single forecast row
total_df['mae'] = (total_df['yhat'] - total_df['total_sales']).abs()

In [None]:
# preview the first rows of the combined forecast / actual frame
total_df.head(n=5)

In [None]:
# Summed absolute error per store and partition. Only `mae` is aggregated: summing every
# column would also hit the datetime columns (`ds`, `date`), which fails on recent pandas.
errors_by_stores = total_df.groupby(['store_nbr', 'indicator'], as_index=False)['mae'].sum()

In [None]:
# split the per-store errors by partition
is_train = errors_by_stores['indicator'] == 'train'
is_test = errors_by_stores['indicator'] == 'test'
train_error = errors_by_stores[is_train]
test_error = errors_by_stores[is_test]

In [None]:
# total error over all stores, rounded to four decimals
train_error_sum = round(train_error['mae'].sum(), 4)
test_error_sum = round(test_error['mae'].sum(), 4)

In [None]:
# The reported figures are absolute errors summed over all rows and stores, not averages,
# so the message says so explicitly.
print(f"Using Prophet, the summed absolute error over all stores in the training dataset is {train_error_sum}")
print(f"Using Prophet, the summed absolute error over all stores in the validation dataset is {test_error_sum}")

Training one model per store is much quicker than training the individual store–department models, but the errors are dramatically larger. We have to stick with the previous model.

### Export the aggregated df

In [None]:
# persist the store-level training frame for later use
output_path = '../asset/aggregated_train_df.pkl'
with open(output_path, 'wb') as handle:
    pickle.dump(final_train_df, handle)

Next, let's try the same aggregated dataframe with XGBoost.