Note: `pip install tsfresh` does not succeed; tf is not downloading correctly. I assume this is related to the M1 chip.

In [None]:
#!pip install prophet

In [None]:
#!pip install tsmoothie

### Load libraries

In [None]:
import numpy as np
import pandas as pd
import re
import os
import pickle
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder
from xgboost import XGBRegressor
from prophet import Prophet

### Load data

In [None]:
# Load the aggregated training frame that every modelling cell below reads from.
# NOTE(review): pickle.load executes arbitrary code from the file; only load project-generated assets.
with open('../asset/aggregated_train_df.pkl', 'rb') as f:
    final_train_df = pickle.load(f)

### Helper

In [None]:
def convert_all_features(data):

    """Convert boolean and categorical columns to numerical values.

    The input frame is not modified; every conversion is applied to a copy.

    Input:
    data -> dataframe (the original dataframe)

    Output:
    dummy_df -> dataframe (date-like / identifier columns removed, 'transferred'
                mapped to 0/1, the binned columns ordinal encoded, and the
                remaining object/category columns one hot encoded)
    """

    # calendar and identifier columns that are dropped before modelling
    passthrough_cols = {'date', 'unique_key', 'date_year', 'date_quarter',
                        'date_month', 'date_day', 'date_week', 'year_month'}

    # binned columns that receive an ordinal code instead of dummy columns
    ordinal_cols = {'state_sales_cut', 'store_sales_bins', 'store_type_sales',
                    'family_sales_bins', 'onpromo_avg_bins', 'cluster_sales_indicator'}

    # work on a copy: callers frequently pass a filtered slice of the training frame
    converted = data.copy()
    no_need_to_transform = []

    # based on the features, use different methods for encoders.
    for col, dtype in data.dtypes.items():
        if col in passthrough_cols or dtype in ['datetime64[ns]', 'timedelta64[ns]']:
            no_need_to_transform.append(col)
        elif col == 'transferred':
            # anything that does not compare equal to True (including missing values) becomes 0
            converted[col] = data[col].eq(True).astype(int)
        elif col in ordinal_cols:
            # a fresh encoder per column; each column is fitted independently anyway
            encoded = OrdinalEncoder().fit_transform(data[col].values.reshape(-1, 1))
            # Replace the whole column rather than using .loc[:, col] = ...: on pandas 2.x a
            # full-column .loc set can keep the previous object dtype, and get_dummies
            # would then one hot encode the ordinal codes again.
            converted[col] = np.ravel(encoded)

    # every column that is still object/category typed is one hot encoded here
    return pd.get_dummies(converted.drop(columns = no_need_to_transform))

In [None]:
def time_split(data, train_size):

    """Cut a dataset into a leading train part and a trailing test part.

    Rows are taken in their existing order; nothing is sorted or shuffled.

    Input:
    data -> dataframe (the original dataframe)
    train_size -> float (the fraction of rows that goes to the train part)

    Output:
    train_df -> dataframe (the first int(len(data) * train_size) rows)
    test_df -> dataframe (all remaining rows)"""

    # position of the first test row
    cut = int(len(data) * train_size)

    return data[:cut], data[cut:]

In [None]:
def calculate_total_errors(pred_dict, actual, yhat):

    """Add up the mean absolute error of every entry in a prediction dictionary.

    Input:
    pred_dict -> dict (key: unique_key, value: dict holding the actual and predicted values)
    actual -> (the name under which each entry stores the actual y values)
    yhat -> (the name under which each entry stores the predicted values)

    Output:
    error_total -> number (sum over all keys of the per-key mean absolute error;
                   0 when pred_dict is empty)"""

    # one mean absolute error per key, summed over all keys
    return sum(
        mean_absolute_error(entry[actual], entry[yhat])
        for entry in pred_dict.values()
    )

### Model Fitting

- Linear Regression

Linear regression is unlikely to be the best model, but it is simple to build and gives us a baseline to compare the other models against.

In [None]:
# create dictionaries to save the results
lr_train_dict = {}
lr_test_dict = {}

# get the unique list of keys (one model is fitted per store)
unique_key_list = list(final_train_df.store_nbr.unique())

# we want to filter out data based on the unique keys
for key in unique_key_list:

    # keep only this store's rows and drop the columns that are not predictors
    data = final_train_df[final_train_df.store_nbr == key]
    data = data.reset_index().drop(columns = ['index', 'date', 'store_nbr'])

    # perform feature engineering -> one hot coding
    new_data = convert_all_features(data)

    # train test split (70:30)
    train, test = time_split(new_data, 0.7)
    # reassign instead of inplace=True: train and test are slices of new_data
    train = train.drop_duplicates()
    test = test.drop_duplicates()

    # assign predictors and target variables
    train_X, train_y = train.drop(columns = ['total_sales']), train.total_sales.values
    test_X, test_y = test.drop(columns = ['total_sales']), test.total_sales.values

    # initiate the linear regression model
    lr = LinearRegression()

    # start training LinearRegression
    lr.fit(train_X, train_y)

    # train y hat and test y hat
    lr_train_pred = lr.predict(train_X)
    lr_test_pred = lr.predict(test_X)

    # save the results in the dictionaries (keys are unique, so no membership check is needed)
    lr_train_dict[key] = {
        'train_y':train_y,
        'train_yhat':lr_train_pred
    }

    lr_test_dict[key] = {
        'test_y':test_y,
        'test_yhat':lr_test_pred}
        

In [None]:
# create dictionaries to save the results
# NOTE(review): this cell fits XGBRegressor but rebinds lr_train_dict / lr_test_dict, so the
# LinearRegression results from the previous cell are replaced. The names are kept because
# later cells read lr_test_dict — confirm which model those cells are meant to evaluate.
lr_train_dict = {}
lr_test_dict = {}

# get the unique list of keys (one model is fitted per store)
unique_key_list = list(final_train_df.store_nbr.unique())

# we want to filter out data based on the unique keys
for key in unique_key_list:

    # keep only this store's rows and drop the columns that are not predictors
    data = final_train_df[final_train_df.store_nbr == key]
    data = data.reset_index().drop(columns = ['index', 'date', 'store_nbr'])

    # perform feature engineering -> one hot coding
    new_data = convert_all_features(data)

    # train test split (70:30)
    train, test = time_split(new_data, 0.7)
    # reassign instead of inplace=True: train and test are slices of new_data
    train = train.drop_duplicates()
    test = test.drop_duplicates()

    # assign predictors and target variables
    train_X, train_y = train.drop(columns = ['total_sales']), train.total_sales.values
    test_X, test_y = test.drop(columns = ['total_sales']), test.total_sales.values

    # initiate the XGBoost regressor (the variable keeps the name `lr` from the cell it was copied from)
    lr = XGBRegressor()

    # start training XGBRegressor
    lr.fit(train_X, train_y)

    # train y hat and test y hat
    lr_train_pred = lr.predict(train_X)
    lr_test_pred = lr.predict(test_X)

    # save the results in the dictionaries (keys are unique, so no membership check is needed)
    lr_train_dict[key] = {
        'train_y':train_y,
        'train_yhat':lr_train_pred
    }

    lr_test_dict[key] = {
        'test_y':test_y,
        'test_yhat':lr_test_pred}
        

In [None]:
# Show the held-out actual values and predictions stored for one store.
# NOTE(review): the lookup uses the string '1'; the dictionary keys come from
# final_train_df.store_nbr.unique(), so this only resolves if store_nbr holds strings — confirm.
pd.DataFrame(lr_test_dict['1'])

In [None]:
# Collect one frame per store and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0 and re-copies the whole frame on every call.
store_frames = []

for key in lr_test_dict:
    temp = pd.DataFrame(lr_test_dict[key])
    temp['store_nbr'] = key

    # create lists to distinguish the difference between train and test
    total_len = len(temp)
    train_idx = int(total_len * 0.7)
    test_idx = total_len - train_idx

    # create indicators based on the length of data -> train and test
    # NOTE(review): temp is built from lr_test_dict only, so every row is already a test row;
    # labelling the first 70% as 'train' looks like a leftover — confirm the intent.
    temp['indicator'] = ['train'] * train_idx + ['test'] * test_idx
    store_frames.append(temp)

# pd.concat raises on an empty list, so fall back to the empty frame the loop used to start from
total_df = pd.concat(store_frames, ignore_index = True) if store_frames else pd.DataFrame()

In [None]:
# Per-row absolute error between the actual and the predicted value.
# The column is called 'mae', but nothing is averaged at this point.
total_df.loc[:, 'mae'] = np.abs(total_df['test_y'] - total_df['test_yhat'])

In [None]:
# Total absolute error: summed within each store, then across stores.
# Only the 'mae' column is aggregated (the string columns are left out), and DataFrame.sum()
# is called directly because np.sum(DataFrame) goes through the deprecated axis=None reduction.
# The result is still a one-element array, as before.
total_df.groupby('store_nbr')[['mae']].sum().sum().values

In [None]:
# NOTE(review): presumably the total absolute error copied from an earlier run of the cell
# above — confirm which model produced it, or move the number into a markdown note.
1.21845254e+08

- Lasso

We can use Lasso to find out which features are more important in terms of building the model.

In [None]:
# create dictionaries to save the results
lasso_coef_dict = {}
lasso_train_dict = {}
lasso_test_dict = {}

# NOTE(review): train_df is not created anywhere in the visible notebook (only final_train_df
# is loaded) — confirm it is defined before this cell runs on a fresh kernel.

# create unique keys based on the store number and departments
train_df.loc[:, 'unique_str_dep_key'] = train_df.store_nbr.astype(str) + '-' + train_df.family.astype(str)

# get the unique list of keys
unique_key_list = list(train_df.unique_str_dep_key.unique())

# we want to filter out data based on the unique keys
for key in unique_key_list:

    # keep only this store/family pair and drop the columns that are not predictors
    data = train_df[train_df.unique_str_dep_key == key]
    data = data.reset_index().drop(columns = ['index', 'date', 'unique_str_dep_key', 'store_nbr', 'family'])

    # perform feature engineering -> one hot coding
    new_data = convert_all_features(data)

    # train test split (70:30)
    train, test = time_split(new_data, 0.7)
    # reassign instead of inplace=True: train and test are slices of new_data
    train = train.drop_duplicates()
    test = test.drop_duplicates()

    # assign predictors and target variables
    train_X, train_y = train.drop(columns = ['sales']), train.sales.values
    test_X, test_y = test.drop(columns = ['sales']), test.sales.values

    # initiate the Lasso model
    lasso = Lasso(alpha = 0.01)

    # start training
    lasso.fit(train_X, train_y)

    # train y hat and test y hat
    lasso_train_pred = lasso.predict(train_X)
    lasso_test_pred = lasso.predict(test_X)

    # save the results in the dictionaries (keys are unique, so no membership check is needed)
    lasso_train_dict[key] = {
        'train_y':train_y,
        'train_yhat':lasso_train_pred
    }

    lasso_test_dict[key] = {
        'test_y':test_y,
        'test_yhat':lasso_test_pred}

    # keep the fitted coefficients next to their feature names for later feature selection
    lasso_coef_dict[key] = {'features':lasso.feature_names_in_,
                            'coef':lasso.coef_}
        

- XGB

Unlike linear regression, XGBoost is non-parametric: it does not need any assumptions about the distribution of the data, and it often performs well.

In [None]:
# create dictionaries to save the results

xgb_train_dict = {}
xgb_test_dict = {}

# NOTE(review): train_df is not created anywhere in the visible notebook (only final_train_df
# is loaded) — confirm it is defined before this cell runs on a fresh kernel.

# create unique keys based on the store number and departments
train_df.loc[:, 'unique_str_dep_key'] = train_df.store_nbr.astype(str) + '-' + train_df.family.astype(str)

# get the unique list of keys
unique_key_list = list(train_df.unique_str_dep_key.unique())

# we want to filter out data based on the unique keys
for key in unique_key_list:

    # keep only this store/family pair and drop the columns that are not predictors
    data = train_df[train_df.unique_str_dep_key == key]
    data = data.reset_index().drop(columns = ['index', 'date', 'unique_str_dep_key', 'store_nbr', 'family'])

    # perform feature engineering
    new_data = convert_all_features(data)

    # train test split (70:30)
    train, test = time_split(new_data, 0.7)
    # reassign instead of inplace=True: train and test are slices of new_data
    train = train.drop_duplicates()
    test = test.drop_duplicates()

    # assign predictors and target variables
    train_X, train_y = train.drop(columns = ['sales']), train.sales.values
    test_X, test_y = test.drop(columns = ['sales']), test.sales.values

    # initiate the xgboost
    xgbr = XGBRegressor()

    # start training
    xgbr.fit(train_X, train_y)

    # train y hat and test y hat
    xgb_train_pred = xgbr.predict(train_X)
    xgb_test_pred = xgbr.predict(test_X)

    # save the results in the dictionaries (keys are unique, so no membership check is needed)
    xgb_train_dict[key] = {
        'train_y':train_y,
        'train_yhat':xgb_train_pred
    }

    xgb_test_dict[key] = {
        'test_y':test_y,
        'test_yhat':xgb_test_pred}

- combine two (Lasso + XGBoost)

Let's use Lasso for feature selection, then fit XGBoost on the selected features for better predictions.

1. Lasso for feature selection

In [None]:
# create dictionaries to save the results
# NOTE(review): the original version of this cell is identical to the earlier Lasso cell;
# re-running it only rebuilds the same three dictionaries.
lasso_coef_dict = {}
lasso_train_dict = {}
lasso_test_dict = {}

# NOTE(review): train_df is not created anywhere in the visible notebook (only final_train_df
# is loaded) — confirm it is defined before this cell runs on a fresh kernel.

# create unique keys based on the store number and departments
train_df.loc[:, 'unique_str_dep_key'] = train_df.store_nbr.astype(str) + '-' + train_df.family.astype(str)

# get the unique list of keys
unique_key_list = list(train_df.unique_str_dep_key.unique())

# we want to filter out data based on the unique keys
for key in unique_key_list:

    # keep only this store/family pair and drop the columns that are not predictors
    data = train_df[train_df.unique_str_dep_key == key]
    data = data.reset_index().drop(columns = ['index', 'date', 'unique_str_dep_key', 'store_nbr', 'family'])

    # perform feature engineering -> one hot coding
    new_data = convert_all_features(data)

    # train test split (70:30)
    train, test = time_split(new_data, 0.7)
    # reassign instead of inplace=True: train and test are slices of new_data
    train = train.drop_duplicates()
    test = test.drop_duplicates()

    # assign predictors and target variables
    train_X, train_y = train.drop(columns = ['sales']), train.sales.values
    test_X, test_y = test.drop(columns = ['sales']), test.sales.values

    # initiate the Lasso model
    lasso = Lasso(alpha = 0.01)

    # start training
    lasso.fit(train_X, train_y)

    # train y hat and test y hat
    lasso_train_pred = lasso.predict(train_X)
    lasso_test_pred = lasso.predict(test_X)

    # save the results in the dictionaries (keys are unique, so no membership check is needed)
    lasso_train_dict[key] = {
        'train_y':train_y,
        'train_yhat':lasso_train_pred
    }

    lasso_test_dict[key] = {
        'test_y':test_y,
        'test_yhat':lasso_test_pred}

    # keep the fitted coefficients next to their feature names for the feature selection step
    lasso_coef_dict[key] = {'features':lasso.feature_names_in_,
                            'coef':lasso.coef_}
        

2. Use XGBoost for the final model

In [None]:
total_train_dict = {}
total_test_dict = {}


for key, val in lasso_coef_dict.items():

    # filter out data and drop the same non-predictor columns as in the Lasso step.
    # The result has to be assigned back: reset_index()/drop() return new frames.
    data = train_df[train_df.unique_str_dep_key == key]
    data = data.reset_index().drop(columns = ['index', 'date', 'unique_str_dep_key', 'store_nbr', 'family'])

    # perform feature engineering -> one hot coding
    new_data = convert_all_features(data)

    # train test split (70:30)
    train, test = time_split(new_data, 0.7)
    # reassign instead of inplace=True: train and test are slices of new_data
    train = train.drop_duplicates()
    test = test.drop_duplicates()

    # the target is the same whichever predictors are selected
    train_y = train.sales.values
    test_y = test.sales.values

    coef = np.asarray(val['coef'])

    # check every coefficient rather than their sum: non-zero coefficients can cancel out
    if np.all(coef == 0):

        # Lasso kept no feature -> use xgboost directly on all predictors
        train_X = train.drop(columns = ['sales'])
        test_X = test.drop(columns = ['sales'])

    else:

        # drop features that are not important (zero Lasso coefficient)
        filtered_features = [feature for feature, weight in zip(val['features'], coef) if weight != 0]

        # select features
        train_X = train[filtered_features]
        test_X = test[filtered_features]

    xgbr = XGBRegressor()

    # start training
    xgbr.fit(train_X, train_y)

    # train y hat and test y hat
    xgb_train_pred = xgbr.predict(train_X)
    xgb_test_pred = xgbr.predict(test_X)

    total_train_dict[key] = {'actual':train_y,
                             'yhat':xgb_train_pred}

    total_test_dict[key] = {'actual':test_y,
                            'yhat':xgb_test_pred}

### Export the model

- linear regression

In [None]:
# Create the target folder first so this cell also works on a fresh checkout.
# NOTE(review): the cell that fits XGBRegressor per store rebinds lr_train_dict after the
# LinearRegression cell, so this file may hold XGBoost output — confirm.
os.makedirs('../asset/lr_model', exist_ok=True)
with open('../asset/lr_model/lr_train_dict.pkl', 'wb') as f:
    pickle.dump(lr_train_dict, f)

In [None]:
# Create the target folder first so this cell also works on a fresh checkout.
# NOTE(review): the cell that fits XGBRegressor per store rebinds lr_test_dict after the
# LinearRegression cell, so this file may hold XGBoost output — confirm.
os.makedirs('../asset/lr_model', exist_ok=True)
with open('../asset/lr_model/lr_test_dict.pkl', 'wb') as f:
    pickle.dump(lr_test_dict, f)

- lasso

In [None]:
# Create the target folder first so this cell also works on a fresh checkout.
os.makedirs('../asset/lasso_model', exist_ok=True)
with open('../asset/lasso_model/lasso_train_dict.pkl', 'wb') as f:
    pickle.dump(lasso_train_dict, f)

In [None]:
# Create the target folder first so this cell also works on a fresh checkout.
os.makedirs('../asset/lasso_model', exist_ok=True)
with open('../asset/lasso_model/lasso_test_dict.pkl', 'wb') as f:
    pickle.dump(lasso_test_dict, f)

- xgb

In [None]:
# Create the target folder first so this cell also works on a fresh checkout.
os.makedirs('../asset/xgb_model', exist_ok=True)
with open('../asset/xgb_model/xgb_train_dict.pkl', 'wb') as f:
    pickle.dump(xgb_train_dict, f)

In [None]:
# Create the target folder first so this cell also works on a fresh checkout.
os.makedirs('../asset/xgb_model', exist_ok=True)
with open('../asset/xgb_model/xgb_test_dict.pkl', 'wb') as f:
    pickle.dump(xgb_test_dict, f)

- combined

In [None]:
# Create the target folder first so this cell also works on a fresh checkout.
os.makedirs('../asset/combined_model', exist_ok=True)
with open('../asset/combined_model/total_train_dict.pkl', 'wb') as f:
    pickle.dump(total_train_dict, f)

In [None]:
# Create the target folder first so this cell also works on a fresh checkout.
os.makedirs('../asset/combined_model', exist_ok=True)
with open('../asset/combined_model/total_test_dict.pkl', 'wb') as f:
    pickle.dump(total_test_dict, f)