### Import libraries

In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Helper

In [16]:
def calculate_total_errors(pred_dict, actual, yhat):
    """Add up the mean absolute error of every entry in a prediction dictionary.

    Input:
    pred_dict -> dict (key: unique_key, value: container indexable by the
                 `actual` and `yhat` labels)
    actual -> label under which each entry stores the actual y values
    yhat -> label under which each entry stores the model's predicted values

    Output:
    error_total -> the per-key mean absolute errors added together
                   (0 when pred_dict has no keys). Note this is a sum over
                   keys, not one mean absolute error over all rows.
    """
    # One mean absolute error per key, folded into a single running total.
    return sum(
        mean_absolute_error(pred_dict[unique_key][actual], pred_dict[unique_key][yhat])
        for unique_key in pred_dict
    )

### Import data

- linear regression

In [2]:
# Unpickle the linear regression training dictionary from the asset folder.
# pickle can execute code while loading, so only open files this project produced.
with open('../asset/lr_model/lr_train_dict.pkl', mode='rb') as pkl_file:
    lr_train_dict = pickle.load(pkl_file)

In [3]:
# Unpickle the linear regression test dictionary from the asset folder.
with open('../asset/lr_model/lr_test_dict.pkl', mode='rb') as pkl_file:
    lr_test_dict = pickle.load(pkl_file)

- lasso

In [29]:
# Unpickle the lasso training dictionary from the asset folder.
with open('../asset/lasso_model/lasso_train_dict.pkl', mode='rb') as pkl_file:
    lasso_train_dict = pickle.load(pkl_file)

In [30]:
# Unpickle the lasso test dictionary from the asset folder.
with open('../asset/lasso_model/lasso_test_dict.pkl', mode='rb') as pkl_file:
    lasso_test_dict = pickle.load(pkl_file)

- xgb

In [6]:
# Unpickle the XGBoost training dictionary from the asset folder.
with open('../asset/xgb_model/xgb_train_dict.pkl', mode='rb') as pkl_file:
    xgb_train_dict = pickle.load(pkl_file)

In [7]:
# Unpickle the XGBoost test dictionary from the asset folder.
with open('../asset/xgb_model/xgb_test_dict.pkl', mode='rb') as pkl_file:
    xgb_test_dict = pickle.load(pkl_file)

- combined

In [8]:
# Unpickle the combined-model training dictionary from the asset folder.
with open('../asset/combined_model/total_train_dict.pkl', mode='rb') as pkl_file:
    total_train_dict = pickle.load(pkl_file)

In [9]:
# Unpickle the combined-model test dictionary from the asset folder.
with open('../asset/combined_model/total_test_dict.pkl', mode='rb') as pkl_file:
    total_test_dict = pickle.load(pkl_file)

- prophet

In [10]:
# Unpickle the Prophet dictionary; unlike the other models it is a single file
# that the error check below splits into train and test rows.
with open('../asset/prophet_model/combined_dict.pkl', mode='rb') as pkl_file:
    prophet_combined_dict = pickle.load(pkl_file)

### Error Check

- lr

In [17]:
# Summed per-key MAE of the linear regression predictions, train then test.
lr_train_error = calculate_total_errors(pred_dict=lr_train_dict, actual='train_y', yhat='train_yhat')
lr_test_error = calculate_total_errors(pred_dict=lr_test_dict, actual='test_y', yhat='test_yhat')

- lasso

In [31]:
# Summed per-key MAE of the lasso predictions, train then test.
lasso_train_error = calculate_total_errors(pred_dict=lasso_train_dict, actual='train_y', yhat='train_yhat')
lasso_test_error = calculate_total_errors(pred_dict=lasso_test_dict, actual='test_y', yhat='test_yhat')

- xgb

In [19]:
# Summed per-key MAE of the XGBoost predictions, train then test.
xgb_train_error = calculate_total_errors(pred_dict=xgb_train_dict, actual='train_y', yhat='train_yhat')
xgb_test_error = calculate_total_errors(pred_dict=xgb_test_dict, actual='test_y', yhat='test_yhat')

- combined

In [21]:
# Summed per-key MAE of the combined model; its entries use the labels
# 'actual' and 'yhat' rather than the 'train_*' / 'test_*' labels above.
total_train_error = calculate_total_errors(pred_dict=total_train_dict, actual='actual', yhat='yhat')
total_test_error = calculate_total_errors(pred_dict=total_test_dict, actual='actual', yhat='yhat')

- prophet

In [23]:
# Running totals of the per-key mean absolute error for the Prophet forecasts.
train_error = 0
test_error = 0

for key, data in prophet_combined_dict.items():
    # Each entry has an `indicator` field labelling rows as 'train' or 'test'
    # (presumably a pandas DataFrame column; confirm against the Prophet notebook).
    is_train = data['indicator'] == 'train'
    is_test = data['indicator'] == 'test'
    train = data[is_train]
    test = data[is_test]

    # Score both splits first, then update the totals together.
    train_mae = mean_absolute_error(train['y'], train['yhat'])
    test_mae = mean_absolute_error(test['y'], test['yhat'])

    train_error = train_error + train_mae
    test_error = test_error + test_mae

In [35]:
# (model label, training error, validation error) in report order.
# Each value is a sum of per-key mean absolute errors computed in the cells above.
error_report = [
    ("Linear Regression", lr_train_error, lr_test_error),
    ("Lasso", lasso_train_error, lasso_test_error),
    ("XGBoost", xgb_train_error, xgb_test_error),
    ("Lasso & XGBoost", total_train_error, total_test_error),
    ("Prophet", train_error, test_error),
]

for position, (model_label, train_value, test_value) in enumerate(error_report):
    # A dashed rule separates consecutive models; none before the first.
    if position > 0:
        print('-' * 90)
    print(f"Using {model_label}, the mean absolute error in the training dataset is {round(train_value, 4)}")
    print(f"Using {model_label}, the mean absolute error in the validation dataset is {round(test_value, 4)}")

Using Linear Regression, the mean absolute error in the training dataset is 136632.364
Using Linear Regression, the mean absolute error in the validation dataset is 293779.7383
------------------------------------------------------------------------------------------
Using Lasso, the mean absolute error in the training dataset is 136624.5913
Using Lasso, the mean absolute error in the validation dataset is 292813.3353
------------------------------------------------------------------------------------------
Using XGBoost, the mean absolute error in the training dataset is 23999.8968
Using XGBoost, the mean absolute error in the validation dataset is 216646.2786
------------------------------------------------------------------------------------------
Using Lasso & XGBoost, the mean absolute error in the training dataset is 26357.1988
Using Lasso & XGBoost, the mean absolute error in the validation dataset is 224199.7479
------------------------------------------------------------------

XGBoost has the lowest validation error of the models shown above, but the large gap between its training and validation errors suggests it is overfitting. Next, let's tune its hyperparameters.