# Forecasting #

In [1]:
# imports
import os
import time

import numpy as np
import pandas as pd

In [2]:
# define data location
# Directory that holds the pickled inputs. Set the FORECAST_DATA_DIR environment
# variable to point the notebook at another copy of the data; the original local
# path remains the default, so existing setups keep working unchanged.
# os.path.join(..., '') guarantees a trailing separator, because the loading cell
# builds file paths by plain string concatenation (location + filename).
location = os.path.join(
    os.environ.get('FORECAST_DATA_DIR') or '/Users/loki/Documents/Data/Forecasting Project/',
    '',
)

In [26]:
# load data
# `granularity` selects which set of pre-built pickles is read.
granularity = "daily"

# Pickle file names per granularity, in the order:
# (usage history, sufficiency-filtered usage, weather)
_pickle_names = {
    "daily": ('fcast_daily.pkl.zip', 'peco_sufficient_daily.pkl.zip', 'daily_weather.pkl.zip'),
    "hourly": ('fcast_hourly.pkl.zip', 'peco_sufficient_hourly.pkl.zip', 'hourly_weather.pkl.zip'),
}

if granularity not in _pickle_names:
    # Fail here rather than let later cells hit a NameError on `use`/`weather`.
    raise ValueError(
        "Granularity not in ('daily','hourly'): got {!r}".format(granularity)
    )

_use_file, _sufficient_file, _weather_file = _pickle_names[granularity]

# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
use = pd.read_pickle(location + _use_file)  # 2 years of data
sufficient_use = pd.read_pickle(location + _sufficient_file)  # most recent year, id/date sufficiency
weather = pd.read_pickle(location + _weather_file)

In [47]:
# merge
# The account/meter identifier columns are not referenced by any later cell shown
# in this notebook, so they are dropped before joining usage to weather on date.
# `use` is rebound to the trimmed frame (a later cell reads it again), which made
# the original cell fail with KeyError on a second run; errors='ignore' keeps the
# cell safe to re-run.
use = use.drop(
    columns=['DACCOUNTID','DMETERNO','DCUSTOMERID','FUELTYPE','REVENUCODE'],
    errors='ignore',
)
data = pd.merge(use, weather, how='inner', on=['Dt'])
# CustIDs || Date | Consumption |||| Weather_variables
data.head()

Unnamed: 0,ID,Dt,Use,TARIFF,Weekday,Holiday,AvgTemp,HighDB,LowDB,Temp1,...,AvgHumid,Humid1,Humid1delta,AvgWind,Wind1,Wind1delta,AvgDaytimeCloud,AvgCloud,Cloud1,Cloud1delta
0,"(156414999216, 4464803701784)",2018-06-30,0.0,GH0,0,0,84.458333,94,74,82.625,...,53.791667,54.916667,-1.125,5.291667,7.833333,-2.541667,50.0,40.0,50.416667,-10.416667
1,"(469011116536, 604933845496)",2018-06-30,1.14,GH0,0,0,84.458333,94,74,82.625,...,53.791667,54.916667,-1.125,5.291667,7.833333,-2.541667,50.0,40.0,50.416667,-10.416667
2,"(504636767772, 606835012504)",2018-06-30,0.0,GH0,0,0,84.458333,94,74,82.625,...,53.791667,54.916667,-1.125,5.291667,7.833333,-2.541667,50.0,40.0,50.416667,-10.416667
3,"(585291286356, 4464618413296)",2018-06-30,0.0,GH0,0,0,84.458333,94,74,82.625,...,53.791667,54.916667,-1.125,5.291667,7.833333,-2.541667,50.0,40.0,50.416667,-10.416667
4,"(587387949408, 4464383833568)",2018-06-30,1.14,GH0,0,0,84.458333,94,74,82.625,...,53.791667,54.916667,-1.125,5.291667,7.833333,-2.541667,50.0,40.0,50.416667,-10.416667


### Train/Test split ###

In [29]:
# split by dates (we trained our clusters on the recent year, so test against older data)
# Rows on/after the cutoff form the training set; rows before it are held out.
# Rows containing any missing value are removed from both splits.
split_date = "10-1-2017"
train = data[data['Dt'] >= split_date].dropna()
# `test` must be defined here: the next cell prints test.shape, which raised
# NameError on a fresh kernel while this line was commented out.
test = data[data['Dt'] < split_date].dropna()

In [32]:
# Sanity check: (rows, columns) of the test frame, then of the train frame.
print(test.shape,train.shape)

(1462424, 25) (2272597, 25)


### Forecast by Revenuecode ###

In [40]:
# Treat revenuecode as kmeans cluster
# Build regression model per revenuecode centroid
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
import math
from itertools import chain
from timeit import default_timer as timer

# Identifier, date, target and grouping columns are excluded from the regressors.
# The same list labels the coefficient table, so labels always match the columns
# the model was actually fitted on (instead of relying on column positions).
non_feature_cols = ['ID', 'Dt', 'Use', 'TARIFF']
feature_cols = [c for c in train.columns if c not in non_feature_cols]

# models: tariff code -> fitted LinearRegression
# coef_rows: tariff code -> [intercept, coefficients..., in-sample MSE, in-sample R2]
models = {}
coef_rows = {}

# Iterate over the codes that actually occur in `train`: a code present only in
# `data` (e.g. removed by the date filter or dropna) would give an empty frame
# and make fit() raise.
for code in train['TARIFF'].unique():
    dat = train[train['TARIFF'] == code]
    X = dat[feature_cols]
    y = dat['Use']

    # fit the regression model
    regr = linear_model.LinearRegression(fit_intercept=True)
    regr.fit(X, y)
    models[code] = regr

    # extract model coefficients plus in-sample fit statistics
    err = mean_squared_error(y, regr.predict(X))
    r2 = regr.score(X, y)
    coef_rows[code] = [regr.intercept_] + list(regr.coef_) + [err, r2]
    del (dat, X, y)  # release the per-tariff slices before the next iteration

# create table of each cluster's regression equations:
# one row per tariff code, one column per term / statistic
rTable = pd.DataFrame(
    coef_rows,
    index=['intercept'] + feature_cols + ['MSE', 'R2'],
).T

In [39]:
# Display the regression table (one row per TARIFF code after the transpose).
rTable

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [61]:
# create testing data (average daily use per ratecode)
# Pipeline: keep usage rows dated before the cutoff, drop the customer ID,
# average the remaining columns per (date, tariff, weekday, holiday) group,
# attach that day's weather, and discard rows with any missing value.
test = (
    use[use['Dt'] < "10-1-2017"]
    .drop(columns='ID')
    .groupby(['Dt', 'TARIFF', 'Weekday', 'Holiday'])
    .mean()
    .reset_index()
    .merge(weather, how='inner', on=['Dt'])
    .dropna()
)
test.head()

Unnamed: 0,Dt,TARIFF,Weekday,Holiday,Use,AvgTemp,HighDB,LowDB,Temp1,Temp1delta,...,AvgHumid,Humid1,Humid1delta,AvgWind,Wind1,Wind1delta,AvgDaytimeCloud,AvgCloud,Cloud1,Cloud1delta
0,2016-10-01,TFG,0,0,243.39,61.125,64,58,60.125,1.0,...,90.666667,91.041667,-0.375,11.0,15.208333,-4.208333,100.0,100.0,100.0,0.0
1,2016-10-02,TFG,0,0,223.09,63.458333,67,61,61.125,2.333333,...,90.875,90.666667,0.208333,5.5,11.0,-5.5,95.0,96.25,100.0,-3.75
2,2016-10-03,TFG,1,0,174.88,67.208333,76,63,63.458333,3.75,...,74.5,90.875,-16.375,4.625,5.5,-0.875,54.166667,62.5,96.25,-33.75
3,2016-10-04,TFG,1,0,214.29,62.833333,68,59,67.208333,-4.375,...,79.291667,74.5,4.791667,7.666667,4.625,3.041667,90.0,64.583333,62.5,2.083333
4,2016-10-05,TFG,1,0,223.77,61.75,71,56,62.833333,-1.083333,...,70.458333,79.291667,-8.833333,8.5,7.666667,0.833333,27.5,31.25,64.583333,-33.333333


In [67]:
# For each revenuecode, forecast the held-out (older) year of use
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
import math
from itertools import chain
from timeit import default_timer as timer

# Score every fitted model against the aggregated test rows for its tariff.
# Iterating over `models` (rather than data['TARIFF'].unique()) means every code
# has a model, and the loop no longer depends on `data`, which a later cell rebinds.
rmse_by_code = {}
skipped_codes = []
for code, regr in models.items():
    dat = test[test['TARIFF'] == code]
    if dat.empty:
        # No test rows for this tariff: predict() would raise on an empty frame.
        skipped_codes.append(code)
        continue

    X = dat.drop(columns=['Dt', 'Use', 'TARIFF'])
    y = dat['Use']

    yhat = regr.predict(X)

    # root-mean-squared error of the forecast for this tariff
    rmse_by_code[code] = math.sqrt(mean_squared_error(y, yhat))

if skipped_codes:
    print("No test rows for tariff code(s), skipped:", skipped_codes)

# Per-tariff RMSE values, in the order the models were fitted.
error = list(rmse_by_code.values())

In [68]:
# sum error
# `error` holds one RMSE per tariff code, so this is the total of the
# per-tariff RMSE values (not an RMSE over all rows pooled together).
sum(error)

2224.8588539812636

### Forecast by Cluster ###

In [None]:
# merge
# Join the sufficiency-filtered usage to weather on date, then discard the
# account/meter identifier columns.
# NOTE(review): this rebinds `data`, replacing the frame built by the earlier
# merge cell; re-running earlier cells afterwards will see this version.
data = (
    sufficient_use
    .merge(weather, how='inner', on=['Dt'])
    .drop(columns=['DACCOUNTID','DMETERNO','DCUSTOMERID','FUELTYPE','REVENUCODE'])
)
data.head()
# CustIDs || Date | Consumption |||| Weather_variables 

In [None]:
# Precluster Groups

In [None]:
# Within Group Clusters

In [None]:
# KNN

In [None]:
# Build regression model per cluster centroid

In [None]:
# For each centroid, forecast subsequent year of use

In [None]:
# sum error