In [71]:
import pandas as pd
import sys
import requests
from xlsxwriter.utility import xl_rowcol_to_cell
sys.path.append(r'/home/jupyter/reusable_code')
import google_api_functions as gaf
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 500)


from pandas import read_csv
from pandas import datetime
from matplotlib import pyplot
from pandas.plotting import autocorrelation_plot
import itertools
from sklearn.metrics import mean_squared_error
import numpy as np

import re
from statsmodels.tsa.holtwinters import SimpleExpSmoothing
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Set up modules for Google functionality
from google.cloud import bigquery # To run BQ statements
from google_auth_oauthlib import flow # To authorise as user
from googleapiclient.discovery import build # To pull in from sheets, slides etc. API
from google.auth.transport.requests import Request
from google.cloud.bigquery import magics
import google

from datetime import datetime

In [72]:
def mean_absolute_percentage_error(actual, estimated):
    """Return the mean absolute percentage error (MAPE) as a fraction.

    Both inputs are flattened to 1-D before they are compared, so a
    single-column frame of shape (n, 1) and a forecast of shape (n,) are
    matched element by element rather than being broadcast by numpy into an
    (n, n) matrix of every actual against every estimate.

    Parameters
    ----------
    actual : array-like
        Observed values. They are the denominator, so a zero in ``actual``
        yields inf/nan in the result (plain numpy division semantics).
    estimated : array-like
        Predicted values in the same order as ``actual``.

    Returns
    -------
    float
        ``mean(abs((actual - estimated) / actual))``. Multiply by 100 to get
        a percentage.
    """
    actual = np.asarray(actual, dtype=float).ravel()
    estimated = np.asarray(estimated, dtype=float).ravel()
    return np.mean(np.abs((actual - estimated) / actual))

In [73]:
# Authenticate with Google through the shared helper module (GAF is a package
# Steve created with a list of useful functions), then apply those credentials
# to the BigQuery client "bq" that later cells use to run queries.
creds = gaf.Authenticate_Google(r"/home/jupyter/reusable_code/")
bq = bigquery.Client(
    project='itv-bde-analytics-dev',
    credentials=creds,
)

In [173]:
# Read in the FTS table created in BQ.
# The query counts rows per day (start truncated to DAY) for reporting event
# 'A', restricted to the Stripe and iTunes billing providers.

query="""
select 
  timestamp_trunc(start,DAY) as reportingDate
  , count(*) as vol 
from `itv-bde-analytics-dev.britbox_sandbox.ss_entitlements` 
where eventSubType.reportingEvent = 'A' and billingProvider in ('Stripe', 'iTunes') 
group by 
  1
"""
# Run the query and pull the full result set into a DataFrame.
FTS_df = (
    bq.query(query)
    .to_dataframe()
)
FTS_df

In [174]:
# Rename the query columns (was relevant when using Steve's code) and make the
# date column the index, in a single chained expression.
FTS_df = (
    FTS_df
    .rename(columns={'reportingDate': 'reporting date', 'vol': 'N'})
    .set_index("reporting date")
)


In [175]:
# Order the data by its index (the "reporting date" column set as index in the
# previous cell); later cells slice this frame by date labels.
FTS_df = FTS_df.sort_index()
# Display the last rows as a quick check of the most recent dates loaded.
FTS_df.tail()

In [121]:
# Get Apple Mobility Data (one wide CSV: one row per region/transport type,
# one column per calendar date).
mobility_data=pd.read_csv('https://covid19-static.cdn-apple.com/covid19-mobility-data/2021HotfixDev18/v3/en-us/applemobilitytrends-2020-11-19.csv')
# Keep only the United Kingdom "walking" series.
walkingdataUK=mobility_data[(mobility_data['region']=='United Kingdom')&(mobility_data['transportation_type']=='walking')]
# Date columns are the ones whose name starts with YYYY-MM-DD. The pattern is a
# raw string so "\d" is not parsed as an (invalid) string escape sequence.
datecols=[x for x in walkingdataUK.columns if re.match(r'\d{4}-\d{2}-\d{2}',x)]
# Turn the wide row into a long series keyed by the date column names.
walkingdataUK_TS=walkingdataUK[datecols].stack().droplevel(level=0)
walkingdataUK_TS.index=pd.to_datetime(walkingdataUK_TS.index)
# Fill in known blanks in May 11/12 2020
# Reindex onto a continuous daily range first so the two dates exist as rows,
# then interpolate linearly between 10 May and 13 May (1/3 and 2/3 of the way).
walkingdataUK_TS=walkingdataUK_TS.reindex(pd.date_range(start=walkingdataUK_TS.index.min(), end=walkingdataUK_TS.index.max()))
walkingdataUK_TS['2020-05-11']=(walkingdataUK_TS['2020-05-10']+(1/3)*(walkingdataUK_TS['2020-05-13']-walkingdataUK_TS['2020-05-10']))
walkingdataUK_TS['2020-05-12']=(walkingdataUK_TS['2020-05-10']+(2/3)*(walkingdataUK_TS['2020-05-13']-walkingdataUK_TS['2020-05-10']))

In [164]:
# Remove timezone from FTS_df index so its timestamps can be compared with the
# mobility series index.
FTS_df = FTS_df.tz_localize(None)

# Work out the overlapping index window: latest start and earliest end of the
# two series.
min_idx = max(walkingdataUK_TS.index.min(), FTS_df.index.min())
max_idx = min(walkingdataUK_TS.index.max(), FTS_df.index.max())

# Align both objects to that common window.
FTS_df = FTS_df[min_idx:max_idx]
walkingdataUK_TS = walkingdataUK_TS[min_idx:max_idx]

# Plot the daily counts against the inverted, rescaled mobility series
# (1 / mobility * 100000) on the same axes.
from matplotlib import pyplot as plt
plt.plot(FTS_df)
plt.plot((1/walkingdataUK_TS)*100000)
plt.show()

In [166]:
# Exogenous regressors for the SARIMAX models: the inverted, rescaled mobility
# series plus an 'SI' column that is zero everywhere except a countdown
# 7, 6, ..., 0 over 2020-10-03 .. 2020-10-10.
exogdata = pd.DataFrame((1/walkingdataUK_TS)*100000)
exogdata['SI'] = 0
exogdata.loc['2020-10-03':'2020-10-10', 'SI'] = list(range(7, -1, -1))
# Not the last expression of the cell, so this slice is evaluated but not shown.
exogdata['2020-10-03':'2020-10-10']

# Split the regressors on the same date windows used for the train/test split.
exogtrain = exogdata['2019-11-11 00:00:00':'2020-10-31 20:00:00']
exogtest = exogdata['2020-11-01 00:00:00':'2020-11-16 20:00:00']

In [178]:
# Alternative way of generating loop
# Build every (p, d, q) x (P, D, Q, s) combination to try in the grid search.
import itertools
p_values = range(0,3)
d_values = range(0,2)
q_values = range(0,2)
s_values=[7,0] # Seasonal period: 7 = weekly, 0 = no seasonal component
pdq = list(itertools.product(p_values, d_values, q_values))
PDQ = list(itertools.product(p_values, d_values, q_values,s_values))
arima_combos=list(itertools.product(pdq,PDQ))
arima_combos=[i for i in arima_combos if i[0]!=(0,0,0)] # Remove all the 0,0,0 combos
print(len(pdq))
print(len(arima_combos))

# Train / test split by date label.
train,test = FTS_df['2019-11-11 00:00:00':'2020-10-31 20:00:00'], FTS_df['2020-11-01 00:00:00':'2020-11-16 20:00:00']
# Cap outliers in the training data at mean + 3 standard deviations.
Upper_Bound=train.mean()+(3*train.std())
# Look the cap up by column label: positional [0] on a label-indexed Series is
# deprecated in pandas.
upper_bound_n=Upper_Bound['N']
# Build the capped series directly instead of assigning a new column into
# `train`, which is a slice of FTS_df (that raised SettingWithCopyWarning).
# The result keeps the original index and the name 'N2'.
train=pd.Series(np.where(train['N']>upper_bound_n, upper_bound_n, train['N']), index=train.index, name='N2')

In [168]:
# Grid search over every (pdq, PDQ) combination in arima_combos.
# Each candidate is fitted on `train`, forecast over the length of `test`, and
# scored with MSE and MAPE; the lowest-MAPE model is kept in `best_model`.
errors=[]            # MSE of every model that fitted successfully
errors2=[]           # MAPE (as a fraction) of every model that fitted successfully
predictions = list() # forecasts of every model that fitted successfully

# Initialise best model dictionary
best_model={'MAPE':999999999.0}

# Generate dictionary of the parameters that will be needed for each iteration of the model
ARIMA_params={i:(['ar.L{}'.format(x+1) for x in range(0,i[0])],['ma.L{}'.format(x+1) for x in range(0,i[2])]) for i in pdq}
SARIMA_params={i:(['ar.S.L{}'.format((x+1)*i[3]) for x in range(0,i[0])],['ma.S.L{}'.format((x+1)*i[3]) for x in range(0,i[2])]) for i in PDQ}


def _combo_param_names(combo):
    """Return the AR, MA, seasonal AR, seasonal MA and 'sigma2' names for one (pdq, PDQ) combo."""
    groups = list(ARIMA_params[combo[0]]) + list(SARIMA_params[combo[1]]) + [['sigma2']]
    return [name for group in groups for name in group]


# Pool of every parameter name any combo can need, each seeded with 0.01.
all_params=[]
for i in arima_combos:
    all_params.extend(_combo_param_names(i))
all_params=all_params+['SI','0']
start_params=pd.Series({x:0.01 for x in set(all_params)})


for n,i in enumerate(reversed(arima_combos)):
    print(n)
    # Names (in order) of the parameters this combo uses: the two exog columns
    # first, then the ARMA / seasonal ARMA terms and sigma2.
    keep_paramids = pd.Series(['0','SI']+_combo_param_names(i))
    print(keep_paramids)
    if i[0]==(0,0,0):
        continue
    try:
        if i[1][3]==0:
            # Seasonal period 0: fit a plain ARIMA on the (p, d, q) order only.
            # NOTE(review): this branch ignores the seasonal (P, D, Q) values and
            # the exog data, so combos that differ only there refit the same model.
            model=ARIMA(train,order=i[0],freq='d') # Initialise ARIMA class

            print('Starting ARIMA for {}  at {}'.format(i,datetime.now().strftime("%d/%m/%Y %H:%M:%S")))
            model_fit = model.fit(disp=0)  # Actually fit the ARIMA

            print('Finished ARIMA for {}  at {}'.format(i,datetime.now().strftime("%d/%m/%Y %H:%M:%S")))
            pred_y = model_fit.forecast(steps=len(test))[0]# create a forecast for the "expected" values on the Test Dataset

        else:
            model=SARIMAX(train, exog=exogtrain,order=i[0], seasonal_order=i[1],freq='d', simple_differencing=True) # Initialise SARIMAX class

            print('Starting SARIMAX for {} at {}'.format(i,datetime.now().strftime("%d/%m/%Y %H:%M:%S")))
            # Start params are filtered and sorted according to keep_paramids
            init_params=start_params.filter(items=keep_paramids)
            print(init_params)
            model_fit = model.fit(disp=0,low_memory=True,start_params=init_params) # Actually fit the SARIMAX
            print('Finished SARIMAX for {}  at {}'.format(i,datetime.now().strftime("%d/%m/%Y %H:%M:%S")))
            print(model_fit.params)

            pred_y = model_fit.forecast(steps=len(test),exog=exogtest) # create a forecast for the "expected" values on the Test Dataset

        predictions.append(pred_y)

        error = mean_squared_error(test,pred_y)
        error2 = mean_absolute_percentage_error(test,pred_y)

        if error2*100<best_model['MAPE']:

            print("{} was this model's error, and {} is the previous best so model will update".format(error2*100,best_model['MAPE']))
            # Update view of what best model is
            best_model={
                'model':model_fit,
                'modelParams':model_fit.params,
                'modelParamsConfInt':model_fit.conf_int(),
                'ARIMA':i[0],
                'SARIMA':i[1],
                'RMSE':np.sqrt(error),
                'MAPE':error2*100,
            }
            # Update Start Params to hopefully optimise future loops
            start_params=model_fit.params.combine_first(start_params)
        errors.append(error)
        errors2.append(error2)
        print(i[0],i[1], np.sqrt(error), error2*100)
    except Exception as exc:
        # Best effort: a combo that fails to fit is reported and skipped.
        # Catching Exception (not a bare except) lets Ctrl-C still stop the search.
        print("Guess what? It didn't work for {}: {!r}".format(i,exc))

In [169]:
# Display the best-model dictionary as populated by the grid-search cell
# (fit object, parameters, confidence intervals, orders, RMSE and MAPE).
best_model

In [170]:
# Plot the held-out actuals against the best model's predictions.
# copy=True so that adding the 'est' column below cannot write back into `test`
# (on some pandas versions DataFrame(frame) shares data with the original).
df=pd.DataFrame(test, copy=True)
# Predict over the test window, supplying the exogenous regressors for it.
pred_y =best_model['model'].predict(start=df.index.min(), end=df.index.max(),exog=exogtest)
df['est']=pred_y
df.plot()


In [136]:
# Refit one specific SARIMAX configuration and forecast the test window.
fixed_order=(0,0,1)
fixed_seasonal_order=(2,1,1,7)
model=SARIMAX(train, exog=exogtrain,order=fixed_order, seasonal_order=fixed_seasonal_order,freq='d', simple_differencing=True) # Initialise SARIMAX class
# Build the start params for THIS model from the shared pool, instead of
# reusing whatever `init_params` the last grid-search iteration left behind
# (that set belongs to a different order and need not match this model).
fixed_paramids=['0','SI']+[item for sublist in list(ARIMA_params[fixed_order])+list(SARIMA_params[fixed_seasonal_order])+[['sigma2']] for item in sublist]
fixed_init_params=start_params.filter(items=fixed_paramids) # filtered and sorted according to fixed_paramids
model_fit = model.fit(disp=0,low_memory=True,start_params=fixed_init_params)
# Log the configuration actually fitted here rather than the leaked loop variable.
print('Finished SARIMAX for {}  at {}'.format((fixed_order,fixed_seasonal_order),datetime.now().strftime("%d/%m/%Y %H:%M:%S")))
print(model_fit.params)

pred_y = model_fit.forecast(steps=len(test),exog=exogtest) # create a forecast for the "expected" values on the Test Dataset


In [97]:
# Plot the held-out actuals against the best model's predictions (no exog passed).
# copy=True so that adding the 'est' column below cannot write back into `test`
# (on some pandas versions DataFrame(frame) shares data with the original).
df=pd.DataFrame(test, copy=True)
pred_y =best_model['model'].predict(start=df.index.min(), end=df.index.max())
df['est']=pred_y
df.plot()

In [46]:
# Show the summary of `model_fit`. Several cells assign `model_fit`, so this
# reports whichever fit ran most recently in the kernel.
model_fit.summary()

In [180]:

# Try every Holt-Winters combination of trend ('add'/'mul') and seasonal
# ('add'/'mul'/'None') and print RMSE and MAPE (%) on the test window.
# The string 'None' is a sentinel meaning "no seasonal component".
hw_combos = list(itertools.product(['add','mul'],['add','mul','None'])) # All 6 (trend, seasonal) combos

for n,model_var in enumerate(hw_combos):
    print("Trying Holt Winter's Model {}, model number {}".format(model_var,n))

    # Creates the time series model using the ExponentialSmoothing function
    if model_var[1]=='None':
        model=ExponentialSmoothing(train, trend=model_var[0],freq='d')
    else:
        model=ExponentialSmoothing(train, trend=model_var[0],seasonal=model_var[1],
                                   seasonal_periods=7,freq='d')

    # Fit on every iteration. Previously this sat inside the else branch, so
    # the non-seasonal models were never fitted and a stale fit was scored.
    model_fit=model.fit()
    pred_y = model_fit.predict(start=test.index.min(), end=test.index.max()) # create a forecast for the "expected" values on the Test Dataset
    error = mean_squared_error(test,pred_y)
    error2 = mean_absolute_percentage_error(test,pred_y)
    print(model_var[0],model_var[1], np.sqrt(error), error2*100)


In [179]:
# Fit the multiplicative-trend / multiplicative-seasonal Holt-Winters model
# with a 7-day season, then plot actuals against its predictions.
model=ExponentialSmoothing(train, trend='mul',seasonal='mul',
                           seasonal_periods=7,freq='d')
model_fit=model.fit()
print(model_fit.params)
# copy=True so that adding the 'est' column below cannot write back into `test`
# (on some pandas versions DataFrame(frame) shares data with the original).
df=pd.DataFrame(test, copy=True)
pred_y =model_fit.predict(start=df.index.min(), end=df.index.max())
df['est']=pred_y

# The original ylim=(0) is just the integer 0, not a tuple; written plainly here.
df.plot(ylim=0)

In [103]:
# Display the raw predictions of the current `model_fit` over the date range
# spanned by `df` (both are whatever the most recently run cells left behind).
model_fit.predict(start=df.index.min(), end=df.index.max())

In [143]:
# Rebuild the pool of start parameters from scratch: the two exog names plus
# every AR / MA / seasonal AR / seasonal MA / sigma2 name used by any combo,
# each seeded with 0.01.
all_params=['SI','0']
for i in arima_combos:
    name_groups=[*ARIMA_params[i[0]], *SARIMA_params[i[1]], ['sigma2']]
    all_params.extend(name for group in name_groups for name in group)
start_params=pd.Series(dict.fromkeys(set(all_params), 0.01))
start_params