In [6]:
# Set up modules for Google functionality
from google.cloud import bigquery # To run BQ statements
from google_auth_oauthlib import flow # To authorise as user
from googleapiclient.discovery import build # To pull in from sheets, slides etc. API
from google.auth.transport.requests import Request

# Display
import pprint

# Operating system stuff
import pickle
import os.path
import sys

# Data handling
import json
import requests
from pandas import read_csv
from pandas import datetime

# Stats, models, datasheets
import pandas as pd
import pyreadstat

# Visualisation
from matplotlib import pyplot
import matplotlib.pyplot as plt
import matplotlib_venn # For venn diagrams
from pandas.plotting import autocorrelation_plot

# Network graphs
import networkx as nx


# Misc
from xlsxwriter.utility import xl_rowcol_to_cell # Used to create cell references
import itertools

# Load custom scripts in reusable_code folder
sys.path.append(r'/home/jupyter/reusable_code')

import google_api_functions as gaf


In [7]:

from pandas import read_csv
from pandas import datetime
from matplotlib import pyplot
from pandas.plotting import autocorrelation_plot
import itertools

from statsmodels.tsa.holtwinters import SimpleExpSmoothing
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import pacf


In [2]:
creds=gaf.Authenticate_Google(r'/home/jupyter/reusable_code/')
bq = bigquery.Client(project='itv-bde-analytics-dev',credentials=creds)

In [15]:
# Count distinct britbox_id values on trial plans per 4-hour bucket.
# The hours between start_date (truncated to the hour) and the anchor
# 2019-11-07 00:00:00 are floored to a multiple of 4 and added back to the
# anchor, so starting_hour is the start of the bucket each row falls into.
query='''select datetime(timestamp_add('2019-11-07 00:00:00', 
   INTERVAL cast(floor(timestamp_diff(timestamp_trunc(start_date, HOUR),'2019-11-07 00:00:00',HOUR)/4)*4 as int64) HOUR)) 
as starting_hour
,count(distinct britbox_id) as N from 
`itv-bde-svod-prd.reporting.Entitlements_oldf_reporting`
where date(start_date)>='2019-11-11'
and plan_type='trial'
group by 1
order by 1'''
# Run the query through the BigQuery client and load the result into pandas
FTS_df = bq.query(query).to_dataframe()
FTS_df

In [17]:
FTS_df=FTS_df.set_index('starting_hour')

In [65]:
def parse_datasets(complete_df, training_start, forecast_start, forecast_end, seasonality_period\
                   ,eval_duration=(1,'s'),training_end=None):
    """Split a time-indexed frame into training, evaluation and forecast sections.

    Three datasets are produced:
      * training         - the period the model is built on
      * eval actuals     - the last rows before the forecast, used to test model fit
      * forecast actuals - the period to be predicted (rows only exist when the
                           actuals are already known, e.g. for incremental analysis)

    Parameters
    ----------
    complete_df : DataFrame indexed by the time axis.
    training_start, forecast_start, forecast_end : index labels bounding the sections.
        Rows at or after ``forecast_start`` never appear in training/eval.
    seasonality_period : number of rows in one season.
    eval_duration : ``(count, unit)``. With unit ``'s'`` the evaluation window is
        ``count * seasonality_period`` rows; with any other unit it is ``count`` rows.
    training_end : optional index label. When omitted, training stops where the
        evaluation window begins (the two are mutually exclusive). When given,
        training runs up to that label and may overlap the evaluation window.

    Returns
    -------
    tuple ``(training_df, eval_actuals, forecast_actuals)``. The frames are
    independent copies, so callers can add columns without touching
    ``complete_df``. ``forecast_actuals`` is ``None`` if the forecast slice
    could not be taken.
    """
    # Everything from the training start up to the forecast boundary.
    history = complete_df[training_start:forecast_start]
    if forecast_start is None:
        # No boundary to compare against: keep the historical "drop the last row" rule.
        history = history.iloc[:-1]
    else:
        # Label slices include their end point, so remove the rows that belong to
        # the forecast. Comparing against the boundary (instead of always dropping
        # one row) keeps the last observation when forecast_start is not itself
        # a label in the index.
        history = history[history.index < forecast_start]

    try:
        forecast_actuals = complete_df[forecast_start:forecast_end].copy()
    except Exception:
        # Best effort: the forecast period may not be sliceable from this frame.
        forecast_actuals = None

    # Size of the evaluation window in rows.
    periods, unit = eval_duration[0], eval_duration[1]
    n_eval = periods * seasonality_period if unit == 's' else periods
    n_eval = max(n_eval, 0)

    # Explicit split position: a plain [-n:] / [:-n] pair misbehaves for n == 0
    # (empty training set, everything in eval).
    split_at = max(len(history) - n_eval, 0)

    if training_end is None:
        training_df = history.iloc[:split_at]
    else:
        training_df = history[:training_end]
    eval_actuals = history.iloc[split_at:]

    return (training_df.copy(), eval_actuals.copy(), forecast_actuals)

def model_fit_results(df,observed_col,forecast_col=None,resid_col=None,showplot=True):
    """Compute average error metrics for a model against observed values.

    At least one of ``forecast_col`` / ``resid_col`` must be supplied:
      * both          - modelled values and errors are taken from those columns
      * residual only - modelled = observed - residual
      * forecast only - error = observed - forecast

    Parameters
    ----------
    df : DataFrame holding the observed values and the forecast and/or residuals.
        NOTE: the working columns 'modelled', 'Error', 'Squared error',
        'Absolute error', 'Percentage error' and 'Absolute Percentage error'
        are written onto ``df`` itself.
    observed_col : label of the observed-values column.
    forecast_col : label of the forecast column (optional).
    resid_col : label of the residual column (optional).
    showplot : when truthy, plot observed against modelled values.

    Returns
    -------
    dict mapping each error-metric name to its mean, or ``None`` (after printing
    a message) when neither a forecast nor a residual column was given.
    """
    # Compare against None rather than truthiness so a column labelled 0 is accepted.
    has_forecast = forecast_col is not None
    has_resid = resid_col is not None

    if not (has_forecast or has_resid):
        print("Error: you need either the forecasted values or the residuals to show model fit results")
        return

    if has_forecast:
        df['modelled'] = df[forecast_col]
        if has_resid:
            df['Error'] = df[resid_col]
        else:
            df['Error'] = df[observed_col]-df['modelled']
    else:
        df['Error'] = df[resid_col]
        df['modelled'] = df[observed_col]-df['Error']

    # Percentage errors are undefined where the observed value is zero. Mask those
    # rows to NaN so they are left out of the means instead of turning them into inf.
    observed_nonzero = df[observed_col].where(df[observed_col] != 0)

    df['Squared error'] = df['Error']**2
    df['Absolute error'] = df['Error'].abs()
    df['Percentage error'] = df['Error']/observed_nonzero
    df['Absolute Percentage error'] = df['Absolute error']/observed_nonzero

    metric_cols = ['Error','Squared error','Absolute error','Percentage error','Absolute Percentage error']
    results = df[metric_cols].mean()

    if showplot:
        df[[observed_col,'modelled']].plot()
        pyplot.show()
    return results.to_dict()
    


def _evaluate_fit(label, model_fit, eval_actuals, var_to_model):
    """Score one fitted model on the evaluation window and return its metrics row."""
    # Produce a 'forecast' for the evaluation time period
    eval_actuals['estimate']=model_fit.predict(start=eval_actuals.index.min(), end=eval_actuals.index.max())
    my_eval=model_fit_results(eval_actuals,var_to_model,forecast_col='estimate',showplot=False)
    return {'model':label,
            'AIC':model_fit.aic,
            'avg error':my_eval['Error'],
            'avg Squared error':my_eval['Squared error'],
            'avg Absolute error':my_eval['Absolute error'],
            'avg Percentage error':my_eval['Percentage error'],
            'avg Absolute Percentage error':my_eval['Absolute Percentage error']}


def _best_by(all_model_fits, metric):
    """Return the first metrics row with the lowest non-missing ``metric``, or None."""
    scored = [fit for fit in all_model_fits if pd.notna(fit[metric])]
    if not scored:
        return None
    return min(scored, key=lambda fit: fit[metric])


def Fit_various_models(full_df,training_start,forecast_start,forecast_end,seasonality_lag,eval_periods,frequency='d'):
    """Fit a grid of Holt-Winters and SARIMAX models and score each on a hold-out window.

    The first column of ``full_df`` is the variable that is modelled. The frame is
    split with ``parse_datasets``; the last ``eval_periods`` rows before
    ``forecast_start`` form the evaluation window and the rows before that are
    used for training.

    Parameters
    ----------
    full_df : time-indexed DataFrame; column 0 is the series to model.
    training_start, forecast_start, forecast_end : index labels passed to ``parse_datasets``.
    seasonality_lag : number of rows in one season (used as ``seasonal_periods``
        for Holt-Winters and as the seasonal periodicity for SARIMAX).
    eval_periods : number of rows held out for evaluation.
    frequency : passed as ``freq`` to the model constructors.

    Returns
    -------
    list of dicts, one per model that fitted successfully, holding the model
    label, its AIC and the mean error metrics on the evaluation window. The
    list is empty when every candidate failed.
    """
    var_to_model=full_df.columns[0]

    # Call the previously defined "parse datasets" function to return the full_df split
    # into components for fitting the model and forecasting
    training, eval_actuals, _ = parse_datasets(full_df,training_start,forecast_start,
                                               forecast_end,seasonality_lag,
                                               eval_duration=(eval_periods,'d'))
    # Work on a private copy: an 'estimate' column and the error columns are written
    # to this frame for every candidate model.
    eval_actuals = eval_actuals.copy()
    # Only the modelled column is handed to the estimators.
    endog = training[var_to_model]

    print('Training on dates between {} and {}'.format(training.index.min(),training.index.max()))
    print('Evaluating error on dates between {} and {}'.format(eval_actuals.index.min(),eval_actuals.index.max()))
    # Initialise a list holding one metrics row per successfully fitted model
    all_model_fits=[]

    ####### Part 1: Fit a combination of Holt Winters Models, looping through each version:
    ####### additive or multiplicative trend, combined with additive, multiplicative or no seasonality

    hw_combos = list(itertools.product(['add','mul'],['add','mul','None'])) # All 6 (trend, seasonal) combos

    for n,model_var in enumerate(hw_combos):
        print("Trying Holt Winter's Model {}, model number {}".format(model_var,n))
        trend, seasonal = model_var
        try:
            # Creates the time series model using the ExponentialSmoothing function
            if seasonal=='None':
                model=ExponentialSmoothing(endog, trend=trend, freq=frequency)
            else:
                model=ExponentialSmoothing(endog, trend=trend, seasonal=seasonal,
                                           seasonal_periods=seasonality_lag, freq=frequency)

            model_fit=model.fit()
            print("Model fitted")

            # Evaluation of accuracy in predicting the data in the "evaluation" period
            all_model_fits.append(_evaluate_fit("Holt Winter's {}".format(model_var),
                                                model_fit, eval_actuals, var_to_model))
        except Exception as exc:
            # Keep going with the remaining candidates, but say why this one failed.
            print("Model {} Failed: {!r}".format(model_var, exc))

    ####### Part 2: Fit a range of ARIMA models

    # p and P take values 0-2; d, q, D and Q take values 0-1
    p = P = range(0,3)
    d = q = D = Q = range(0, 2)
    s=[seasonality_lag,0]
    # Generate all combinations of (p, d, q) triplets and (P, D, Q, s) seasonal orders
    pdq = list(itertools.product(p, d, q))
    PDQ = list(itertools.product(P,D,Q,s))
    arima_combos=list(itertools.product(pdq,PDQ))

    # Loop through models
    for n,model_var in enumerate(arima_combos):
        print("Trying ARIMA {}, model number {}".format(model_var,n))
        order, seasonal_order = model_var
        if order==(0,0,0):
            continue
        if seasonal_order[3]==0 and any(seasonal_order[:3]):
            # Seasonal AR/MA/differencing terms with zero periodicity are not a
            # meaningful specification, so do not spend a fit attempt on them.
            continue
        try:
            model=SARIMAX(endog, order=order, seasonal_order=seasonal_order,
                          freq=frequency, simple_differencing=True)
            model_fit=model.fit()

            # Evaluation of accuracy in predicting the data in the "evaluation" period
            all_model_fits.append(_evaluate_fit("ARIMA {}".format(model_var),
                                                model_fit, eval_actuals, var_to_model))
        except Exception as exc:
            print("Model {} Failed: {!r}".format(model_var, exc))

    if not all_model_fits:
        # Nothing to rank or plot; returning the empty list lets the caller decide what to do.
        print("No model could be fitted")
        return all_model_fits

    best_AIC=_best_by(all_model_fits,'AIC')
    if best_AIC is not None:
        print(' The best AIC was on model : {} at a value of {}. The MAPE was {}'\
                  .format(best_AIC['model'],best_AIC['AIC'],best_AIC['avg Absolute Percentage error']))

    best_MAPE=_best_by(all_model_fits,'avg Absolute Percentage error')
    if best_MAPE is not None:
        # Report the MAPE as the winning value and the AIC alongside it.
        print(' The best MAPE was on model : {} at a value of {}. The AIC was {}'\
                  .format(best_MAPE['model'],best_MAPE['avg Absolute Percentage error'],best_MAPE['AIC']))

    pd.DataFrame(all_model_fits).plot.scatter(x='avg Absolute Percentage error',y='AIC')
    pyplot.show()
    return all_model_fits
  
       

In [11]:
# Correlation with the previous 4-hour bucket (lag 1)
print("Correlation at lag 1 is :",FTS_df['N'].autocorr())

# Correlation with the same bucket one day earlier (6 buckets x 4 hours = 24 hours)
print("Correlation at lag 6 is :",FTS_df['N'].autocorr(6))

# Correlation with the same bucket one week earlier (42 buckets x 4 hours = 7 days)
print("Correlation at lag 42 is :",FTS_df['N'].autocorr(42))

# Plot the autocorrelation to see how the series correlates with each lag
autocorrelation_plot(FTS_df['N'])
pyplot.show()

# Get Partial ACF

PACF_values=pd.DataFrame(pacf(FTS_df['N'],nlags=200)) #PACF numpy array converted to df
PACF_values.plot() #Plot
pyplot.show()

# In this example it appears a PACF can be >1. We have some erratic values going back beyond 25 instances which mask the pattern
# Only 200 lags were requested above, so the [0:1000] slice keeps every row; the zoom comes from xlim
sub_df=PACF_values[0:1000]
sub_df.plot(style='.-',xlim=(0,26))
pyplot.show()

PACF_values

In [18]:
my_training, my_eval_actuals, my_forecast_actuals=parse_datasets(FTS_df,'2019-11-11 00:00:00','2020-02-29 16:00:00',\
                                                                 '2020-03-03 23:00:00',42,eval_duration=(42,'d'))


In [66]:
results6=Fit_various_models(FTS_df,'2019-11-11','2020-02-29 16:00:00','2020-03-03 23:00:00',6,42,frequency='4h')

In [69]:
results6_df=pd.DataFrame(results6)
results6_df[(results6_df['avg Absolute Percentage error']<.25)]\
.sort_values(by='avg Absolute Percentage error')

In [23]:
results42=Fit_various_models(FTS_df,'2019-11-11','2020-02-29 16:00:00','2020-03-03 23:00:00',42,42,frequency='4h')

In [64]:
results42_df=pd.DataFrame(results42)
results42_df[(results42_df['avg Absolute Percentage error']<.25)]\
.sort_values(by='AIC')

In [24]:
best_model42=ExponentialSmoothing(my_training, trend='add',seasonal='mul',seasonal_periods=42,freq='4h')

In [35]:
# Refit to the latest model
print(my_training.index.min(),my_training.index.max())
print(my_eval_actuals.index.min(),my_eval_actuals.index.max())
print(my_forecast_actuals.index.min(),my_forecast_actuals.index.max())

In [36]:
# Training rows followed by the evaluation rows. pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0.
full_training=pd.concat([my_training,my_eval_actuals])
print(full_training.index.min(),full_training.index.max())

In [126]:
#best_model=ExponentialSmoothing(full_training, trend='add',seasonal='mul',seasonal_periods=42,freq='4h')
#best_model=SARIMAX(my_training, order=(0,0,1), seasonal_order=(1,0,1,6),freq='4h', simple_differencing=True)
best_model=SARIMAX(my_training, order=(1, 0, 0), seasonal_order=(1, 0, 1, 42),freq='4h', simple_differencing=True)

In [44]:
best_model42.fit().params

In [47]:
best_model.fit().params

In [130]:
# Fit once and reuse the result for both windows instead of re-estimating the
# same model for every prediction.
best_model_fit=best_model.fit()

my_eval_actuals['expected']=best_model_fit.predict(start=my_eval_actuals.index.min(),end=my_eval_actuals.index.max())

my_forecast_actuals['expected']=best_model_fit.predict(start=my_forecast_actuals.index.min(),end=my_forecast_actuals.index.max())
my_forecast_actuals

In [131]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
pd.concat([my_eval_actuals,my_forecast_actuals]).plot()

In [132]:

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
pd.concat([my_eval_actuals,my_forecast_actuals])

# Full prediction period

In [73]:
# Observed N over the evaluation window followed by the forecast window.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
full_pred_period=pd.DataFrame(pd.concat([my_eval_actuals['N'],my_forecast_actuals['N']]))

In [74]:
full_pred_period['expected']=best_model.fit().predict(start=full_pred_period.index.min(),end=full_pred_period.index.max())

In [75]:
full_pred_period.plot()

In [85]:
# Daily total of actual minus expected. Each day is compared with its own
# expected total (previously every day was compared with 2020-03-01's).
# .loc is used because selecting rows with df['YYYY-MM-DD'] is no longer
# supported by current pandas.
for day in ('2020-02-29','2020-03-01','2020-03-02','2020-03-03'):
    day_rows=full_pred_period.loc[day]
    print(day_rows['N'].sum()-day_rows['expected'].sum())


In [67]:
results6A=Fit_various_models(FTS_df,'2019-11-11','2020-02-29 16:00:00','2020-03-03 23:00:00',seasonality_lag=6,\
                            eval_periods=10,frequency='4h')

Regressing N as a function of hours since start, the same 4-hour bucket yesterday (lag 6) and the same bucket last week (lag 42)

In [89]:
# Same 4-hour bucket counts as the earlier query (distinct britbox_id on trial
# plans, buckets anchored at 2019-11-07 00:00:00), plus three regressors:
#   hours_since_start - hours between the bucket start and the anchor
#   yesterday         - N from 6 buckets earlier (6 x 4 hours = 24 hours)
#   last_week         - N from 42 buckets earlier (42 x 4 hours = 7 days)
# The lag columns are empty for the first rows, where no earlier bucket exists.
query='''with base as (select datetime(timestamp_add('2019-11-07 00:00:00', 
   INTERVAL cast(floor(timestamp_diff(timestamp_trunc(start_date, HOUR),'2019-11-07 00:00:00',HOUR)/4)*4 as int64) HOUR)) 
as starting_hour
,count(distinct britbox_id) as N from 
`itv-bde-svod-prd.reporting.Entitlements_oldf_reporting`
where date(start_date)>='2019-11-11'
and plan_type='trial'
group by 1)

select *,
timestamp_diff(timestamp(starting_hour),'2019-11-07 00:00:00',HOUR) as hours_since_start,
lag(N,6) over (order by starting_hour) as yesterday,
lag(N,42) over (order by starting_hour) as last_week
from base
order by 1'''
# Run the query through the BigQuery client and load the result into pandas
FTS_df2 = bq.query(query).to_dataframe()
FTS_df2

In [91]:
FTS_df2=FTS_df2.set_index('starting_hour')
FTS_df2.head()

In [98]:
FTS_df3=FTS_df2[42:][:'2020-02-29 12:00:00']
print(FTS_df3.index.min(),FTS_df3.index.max())

In [103]:
import statsmodels.api as sm
# Ordinary least squares of N on elapsed hours and the two lagged counts
X = FTS_df3[['hours_since_start','yesterday','last_week']]
y = FTS_df3['N']
# NOTE(review): no constant column is added to X here, so presumably this fit
# has no intercept - confirm that is intended
model = sm.OLS(y, X).fit()
# In-sample fitted values; not referenced again in the visible cells
predictions = model.predict(X)
model.summary()

In [110]:
to_predict=FTS_df2['2020-02-29 12:00:00':][1:][['hours_since_start','yesterday','last_week']]
actuals=FTS_df2['2020-02-29 12:00:00':][1:]['N']
print(to_predict.index.min(),to_predict.index.max())

In [111]:
to_predict

In [112]:
fitted=model.predict(to_predict)

In [118]:
actual_vs_predicted=pd.DataFrame(fitted,columns=['fitted'])

In [121]:
actual_vs_predicted['actuals']=actuals

In [124]:
actual_vs_predicted[:48].plot()

In [125]:
actual_vs_predicted['2020-03-01 20:00:00':]

# This is what I'm sending to BBC

In [133]:
model1=ExponentialSmoothing(full_training, trend='add',seasonal='mul',seasonal_periods=42,freq='4h')
model2=SARIMAX(my_training, order=(0,0,1), seasonal_order=(1,0,1,6),freq='4h', simple_differencing=True)
model3=SARIMAX(my_training, order=(1, 0, 0), seasonal_order=(1, 0, 1, 42),freq='4h', simple_differencing=True)

In [134]:
# Fit each candidate once (previously every model was re-estimated for each
# window) and write its predictions onto both the evaluation and forecast frames.
blend_fits={'expected':model1.fit(),
            'expected2':model2.fit(),
            'expected3':model3.fit()}

for expected_col,fit_result in blend_fits.items():
    for target_frame in (my_eval_actuals,my_forecast_actuals):
        target_frame[expected_col]=fit_result.predict(start=target_frame.index.min(),
                                                      end=target_frame.index.max())


In [135]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
pd.concat([my_eval_actuals,my_forecast_actuals]).plot()

In [136]:
# Actuals alongside the two SARIMAX expectations over the evaluation + forecast windows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
blended=pd.concat([my_eval_actuals,my_forecast_actuals])[['N','expected2','expected3']]

In [140]:
pd.options.display.max_rows = 999
blended