In [71]:
import numpy as np
import pandas as pd
from fbprophet import Prophet
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
import copy

# Introduction

The basis of the model will be the Facebook Prophet package. But, the first thing we need to do is load the preprocessed data.


In [72]:
df = pd.read_csv("data/preprocessed.csv")

In [73]:
df = df[df.columns[1:]]
df.head()

Unnamed: 0,ds,y
0,2015-07-01 05:00:00,162827.0
1,2015-07-01 06:00:00,335153.0
2,2015-07-01 07:00:00,333837.0
3,2015-07-01 08:00:00,398386.0
4,2015-07-01 09:00:00,388954.0


Now we run the model with a 75-25 train test split.

In [74]:
spoint = int(df.shape[0]*0.75)
train_df = df.iloc[:spoint]
test_df = df.iloc[spoint:]
#Check split since I did it manually
train_df.shape[0] + test_df.shape[0] == df.shape[0]

True

Now we train the basic models

In [75]:
model = Prophet()
model.fit(train_df)

<fbprophet.forecaster.Prophet at 0x16af28cef70>

In [76]:
eval_df = model.predict(pd.DataFrame(train_df['ds']))
eval_df.head()

Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,additive_terms,additive_terms_lower,additive_terms_upper,daily,...,weekly,weekly_lower,weekly_upper,yearly,yearly_lower,yearly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
0,2015-07-01 05:00:00,432089.920648,428110.69637,517390.91534,432089.920648,432089.920648,38640.777508,38640.777508,38640.777508,-25279.942936,...,11005.316492,11005.316492,11005.316492,52915.403951,52915.403951,52915.403951,0.0,0.0,0.0,470730.698156
1,2015-07-01 06:00:00,432093.576126,402058.609139,493360.046875,432093.576126,432093.576126,16304.402767,16304.402767,16304.402767,-47721.15766,...,11030.513571,11030.513571,11030.513571,52995.046855,52995.046855,52995.046855,0.0,0.0,0.0,448397.978892
2,2015-07-01 07:00:00,432097.231603,386704.451059,480149.040575,432097.231603,432097.231603,-763.670208,-763.670208,-763.670208,-64916.221782,...,11077.716076,11077.716076,11077.716076,53074.835497,53074.835497,53074.835497,0.0,0.0,0.0,431333.561394
3,2015-07-01 08:00:00,432100.88708,377887.304382,466891.293891,432100.88708,432100.88708,-11441.822973,-11441.822973,-11441.822973,-75742.065391,...,11145.472993,11145.472993,11145.472993,53154.769424,53154.769424,53154.769424,0.0,0.0,0.0,420659.064106
4,2015-07-01 09:00:00,432104.542557,369102.358477,461170.252107,432104.542557,432104.542557,-14906.350065,-14906.350065,-14906.350065,-79373.208471,...,11232.010233,11232.010233,11232.010233,53234.848173,53234.848173,53234.848173,0.0,0.0,0.0,417198.192492


In [77]:
print(metrics.r2_score(train_df['y'], model.predict(pd.DataFrame(train_df['ds'])).yhat), 
      metrics.r2_score(test_df['y'],model.predict(pd.DataFrame(test_df['ds'])).yhat), 
      metrics.r2_score(df['y'], model.predict(pd.DataFrame(df['ds'])).yhat))

0.7824039517999025 0.687577002082496 0.7638128022011801


As expected, the model does the best on the training set with noticably worse performance on the test set. Prophet is performing worse than the random forest model from the EDA, so we are going to combine it with the random forest model to improve the accuracy when compared against either.

In [78]:
rfr = RandomForestRegressor()

In [79]:
sdf = pd.DataFrame(df)
sdf['sy'] = sdf.y.shift(-24)
sdf = sdf.iloc[:-24]
spoint = int(0.75*sdf.shape[0])
X_train, X_test, y_train, y_test = sdf['y'].iloc[:spoint], sdf['y'].iloc[spoint:], sdf.sy.iloc[:spoint], sdf.sy.iloc[spoint:]
y_test.shape

(12316,)

In [80]:
rfr.fit(np.array(X_train).reshape(-1,1), y_train)


RandomForestRegressor()

In [81]:
#use the scores on the trainging data set to determine weights for adding the models together
rf_score = metrics.r2_score(y_train, np.array(rfr.predict(np.array(X_train).reshape(-1,1))))
p_score = metrics.r2_score(train_df['y'],model.predict(pd.DataFrame(train_df['ds'])).yhat)

In [82]:
score_sum = rf_score+p_score
def combine_predict(X):
    return (p_score*model.predict(pd.DataFrame(X['ds'])).yhat + rf_score*rfr.predict(np.array(X['sy']).reshape(-1,1)))/score_sum


In [83]:
prep_df = pd.DataFrame()
ds_train, ds_test = sdf.ds.iloc[:int(0.75*sdf.shape[0])], sdf.ds.iloc[int(0.75*sdf.shape[0]):]
y_test = sdf.sy.iloc[int(0.75*sdf.shape[0]):]
prep_df = pd.concat([pd.DataFrame(ds_test), pd.DataFrame(y_test)], axis=1)

In [84]:
pred_df = combine_predict(prep_df)

In [85]:
comb_score = metrics.r2_score(y_test, pred_df)

In [86]:
#set r2_scores to those for the test set for comparision
rf_score = metrics.r2_score(y_test, np.array(rfr.predict(np.array(X_test).reshape(-1,1))))
p_score = metrics.r2_score(test_df['y'],model.predict(pd.DataFrame(test_df['ds'])).yhat)
p_score, rf_score, comb_score

(0.687577002082496, 0.8539424443706733, 0.9052450075498651)

The combine model has a better score in the test set than both Prophet and the RandomForestRegressor. We will now create a class to find optimal hyper-parameters and save the model with.

In [90]:
class CombinedModel():
    def __init__(self):
        self.rfr = RandomForestRegressor()
        self.prophet = Prophet()
    
    def fit(self,X, y):
        self.rfr.fit(X.loc[:, X.columns != 'ds'], y)
        self.prophet.fit(pd.concat([pd.DataFrame(X['ds']), y], axis=1))
    
    def set_score(self,X, y):
        self.rf_score = metrics.r2_score(self.rfr.predict(X.loc[:, X.columns != 'ds']), y)
        self.p_score = metrics.r2_score(self.prophet.predict(pd.DataFrame(X['ds'])), y)
    
    def predict(self,X):
        rf_pred = self.rfr.predict(X.loc[:, X.columns != 'ds'])
        p_pred = self.prophet.predict(pd.DataFrame(X['ds']))
        return self.rf_score*rf_pred + self.p_score*p_pred
    
    def optimize(self, X, y):
        #optimize RandomForestRegressor
        rf_params = {'n_estimators' : list(range(1, 1001)),
                    'criterion' : ['mse','mae'],
                    'max_features' : ['auto', 'sqrt', "log2"]}
        rscv = RandomizedSearchCV(rfr, rf_params)
        rscv.fit(X,y)
        self.rfr = rscv.best_estimator_

In [91]:
comodel = CombinedModel()

In [92]:
comodel.optimize(pd.DataFrame(X_train), y_train)

In [93]:
comodel.fit(pd.DataFrame(X_train), y_train)

KeyError: 'ds'

In [None]:
y_train.head()