In [23]:
# modules
import numpy as np
import matplotlib.pyplot as plt
import scipy
from   scipy import optimize
import pandas as pd
from scipy.optimize import curve_fit
import scipy.stats as stats
import sys
import pmdarima as pm

# Install/import plotly packages- this package has lots of graphical properties
import plotly.graph_objects as go
import plotly.offline as pyo

# Load custom scripts in reusable_code folder
sys.path.append(r'/home/jupyter/reusable_code')

import google_api_functions as gaf

from google.cloud import bigquery

from statsmodels.tsa.stattools import adfuller
from datetime import timedelta, datetime


In [24]:
#pip install pmdarima

In [25]:
def Interpolate_Missing (df,freq='d'):
    """Fill gaps in a date-indexed frame/series.

    The index is expanded to every period between its first and last entry
    (at the given `freq`), and the rows created by that expansion are filled
    with pandas' default interpolation. A new object is returned.
    """
    complete_index=pd.date_range(start=df.index.min(), end=df.index.max(),freq=freq)
    return df.reindex(complete_index).interpolate()

def Cap_data (ser,show_info=True):
    """Cap the extreme tail of a skewed series at a percentile threshold.

    The sample skew decides which tail (if any) is capped:
      * skew > 1  : values above the 99th percentile are set to that percentile;
      * skew < -1 : values below the 1st percentile are set to that percentile;
      * otherwise : the data is left unchanged.

    Parameters
    ----------
    ser : pandas.Series or one-column pandas.DataFrame
        Data to cap. It is never modified; all work is done on a copy.
    show_info : bool, default True
        When true, print describe() and draw a histogram before and after capping.

    Returns
    -------
    A copy of `ser` (same type) with the capping applied. For any other input
    a message is printed and the unmodified copy is returned.
    """
    newSer=ser.copy() # Work on a copy so the caller's object is untouched

    is_series=type(ser) is pd.core.series.Series
    is_one_col_frame=type(ser) is pd.core.frame.DataFrame and ser.shape[1]==1
    if not (is_series or is_one_col_frame):
        print("Can't run data, needs to be a 1 column dataframe or a series")
        return newSer

    # Both supported inputs are reduced to a single Series of values so the
    # skew/threshold logic exists only once.
    columnname=None if is_series else ser.columns[0]
    values=newSer if is_series else newSer[columnname]

    skewvalue=values.skew()
    print('The skew is {:.2f}. A value of <-1 indicates lots of small values, and a value of >1 indicates the presence of some large values.'.format(skewvalue))

    if show_info:
        print(ser.describe())
        ser.hist()
        plt.show()

    if skewvalue>1:
        threshold=values.quantile(0.99) # Long right tail: cap at the 99th percentile
        mask=values>threshold
        percentile_label,direction='99th','greater'
    elif skewvalue<-1:
        threshold=values.quantile(0.01) # Long left tail: cap at the 1st percentile
        mask=values<threshold
        percentile_label,direction='1st','less'
    else:
        mask=None # Also reached when the skew is NaN (too few values)
        print('Skew is ok, no capping applied')

    if mask is not None:
        affected_records=int(mask.sum()) # How many records get capped
        series_size=values.size # How many records in dataset
        pc_affected=float(affected_records)/float(series_size)*100

        if is_series:
            newSer=newSer.mask(mask,threshold)
            values=newSer
            counts='{:.0f} records and {:.1f}% of the dataset.'.format(affected_records,pc_affected)
        else:
            newSer[columnname]=np.where(mask,threshold,values)
            values=newSer[columnname]
            counts='{:.0f}/{:.0f} records ({:.1f}% of the dataset).'.format(affected_records,series_size,pc_affected)

        print('The {} percentile is {}. Values {} than this will be capped, amounting to {}'.format(percentile_label,threshold,direction,counts))

    print('The skew is now {:.2f}.'.format(values.skew()))

    if show_info:
        print(values.describe())
        newSer.hist()
        plt.show()

    return newSer

def generate_test_trains(df,HoldoutPC=0.05,recenttrain=90,lowerCutoff=None,upperCutoff=None,holdoutsize=None):
    """Split a time-ordered frame/series into a holdout and two training sets.

    The holdout is taken from the most recent rows and is meant for evaluating
    model fit. Two training sets are produced from what remains: one with all
    remaining rows, and one with only the most recent `recenttrain` rows (on
    the premise that older data may no longer be relevant).

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series, ordered oldest to newest
    HoldoutPC : float, default 0.05
        Fraction of rows to hold out; used only when `holdoutsize` is None.
    recenttrain : int, default 90
        Number of trailing training rows kept in the "recent" training set.
    lowerCutoff, upperCutoff : optional index labels
        When given, `df` is first trimmed to `df[lowerCutoff:]` / `df[:upperCutoff]`.
    holdoutsize : int, optional
        Explicit number of rows to hold out; overrides `HoldoutPC`.

    Returns
    -------
    (holdout, fullTrain, recentTrain)
    """
    if lowerCutoff:
        df=df[lowerCutoff:]
    if upperCutoff:
        df=df[:upperCutoff]

    if holdoutsize is None:
        holdoutsize=int(np.floor(len(df)*HoldoutPC))

    # Split on an explicit position rather than df[-holdoutsize:]: with a
    # holdout of 0 the negative slice becomes df[-0:] (the whole frame) and
    # df[:-0] (empty), which silently swaps the holdout and training sets.
    split_at=max(len(df)-holdoutsize,0)

    holdout=df.iloc[split_at:]
    print('{} records used as a test dataset ranging from {} and {}.'.format(holdoutsize,holdout.index.min() ,holdout.index.max()))
    fullTrain=df.iloc[:split_at]
    print('{} records used in the "Full Training" dataset ranging from {} and {}.'.format(len(fullTrain),fullTrain.index.min() ,fullTrain.index.max()))
    recentTrain=fullTrain.iloc[-recenttrain:]
    print('{} records used in the "Recent Data" training dataset ranging from {} and {}.'.format(len(recentTrain),recentTrain.index.min() ,recentTrain.index.max()))
    return(holdout,fullTrain,recentTrain)

def timeSeries_diagnostics(ser,hypothesised_seasonality=7):
    """Print autocorrelations and draw ACF/PACF plots for a series.

    Parameters
    ----------
    ser : pandas.Series
        Series to inspect. Any other type only prints a message.
    hypothesised_seasonality : int, default 7
        Seasonal lag to check; the ACF plot spans three such periods plus one lag.

    Returns
    -------
    None
    """
    if type(ser) is not pd.core.series.Series:
        print('Please pass a Pandas Series for evaluation')
        return

    # Imported here because the notebook's import cell only pulls `adfuller`
    # from statsmodels; without this the plotting calls raise NameError.
    from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

    print("Correlation at lag 1 is :",ser.autocorr())

    # Correlation with itself one hypothesised season ago
    print("Correlation at lag {} is :".format(hypothesised_seasonality),ser.autocorr(hypothesised_seasonality))

    xcutoff=hypothesised_seasonality*3+1 # Show three seasonal periods to see if that is validated
    plot_acf(ser, lags=xcutoff)

    # Partial ACF (statsmodels' default number of lags)
    plot_pacf(ser)
    print("Reminder- the point at which PACF drops to 0 is the point at which no further value is added by this lag over and above the points before it")

def HW_grid(seasonal=[7]):
    """Build a grid of (first component, second component, seasonal period) tuples.

    The grid is the product of ['add','mul'] x ['add','mul',None] x `seasonal`.
    When the second component is None the period is irrelevant, so those
    combinations are collapsed to a single entry with period 0.

    Returns
    -------
    list of tuples, de-duplicated, in a deterministic (product) order.
    """
    import itertools # Not imported by the notebook's import cell

    Initial_grid=itertools.product(['add','mul'],['add','mul',None],seasonal)
    collapsed=(
        (first,second,period if second is not None else 0)
        for first,second,period in Initial_grid
    )
    # dict.fromkeys de-duplicates while keeping first-seen order, so the grid
    # is reproducible between runs (a set's order depends on string hashing).
    return list(dict.fromkeys(collapsed))


def Remove_duplicate_training_sets(training_sets):
    """Drop training sets whose data duplicates an earlier entry.

    `training_sets` must be a dict of {label: pandas object}. Every pair of
    entries is compared once with `.equals`; when two match, the later label
    is removed. The dict is edited in place and also returned. Non-dict input
    only triggers an error message and is returned untouched.
    """
    if type(training_sets)!=dict:
        print ('ERROR: training_sets parameter needs to take the form of a dictionary {label: pandas series}')
        return training_sets

    labels=list(training_sets)
    duplicates=set()
    for position,earlier in enumerate(labels):
        # Only look forward so each unordered pair is compared exactly once
        for later in labels[position+1:]:
            print('Comparing {} with {}'.format(earlier,later))
            if training_sets[earlier].equals(training_sets[later]):
                duplicates.add(later)

    # Remove the flagged entries only after all comparisons are done
    for label in labels:
        if label in duplicates:
            del training_sets[label]
    return training_sets

def error_calcs(actual, estimated): 
    """Compute fit metrics between actual and estimated values.

    Parameters
    ----------
    actual, estimated : array-like
        Sequences, Series or one-column DataFrames with the same number of
        values. Both are flattened to 1-D before any metric is computed.

    Returns
    -------
    dict with keys 'MAPE', 'MSE', 'RMSE', 'MAE' and 'Correlation'.
    MAPE is a fraction (not multiplied by 100); it is inf/nan where `actual`
    contains zeros.

    Raises
    ------
    ValueError
        If the two inputs do not hold the same number of values.
    """
    # Flatten first: a one-column DataFrame becomes an (n,1) array, and
    # np.corrcoef would then treat every row as its own variable and
    # return NaN instead of the correlation between the two series.
    actual=np.asarray(actual,dtype=float).ravel()
    estimated=np.asarray(estimated,dtype=float).ravel()
    if actual.shape!=estimated.shape:
        raise ValueError('actual and estimated must contain the same number of values')

    residuals=actual-estimated
    abs_residuals=np.abs(residuals)
    mse=np.mean(residuals**2)

    fit_dict={'MAPE':np.mean(abs_residuals/np.abs(actual)), # abs() so negative actuals cannot cancel positive ones
             'MSE':mse,
             'RMSE':np.sqrt(mse),
             'MAE':np.mean(abs_residuals),
             'Correlation': np.corrcoef(actual , estimated)[0][1]}
    
    return fit_dict

In [26]:
# Obtain Google credentials through the project helper module.
# NOTE(review): the argument is presumably the folder holding the credential files — confirm against google_api_functions.
creds=gaf.Authenticate_Google(r'/home/jupyter/reusable_code/')

In [27]:
# Extracting data to determine LTV
# BigQuery client bound to the 'itv-bde-analytics-prd' project, authenticated with `creds`.
bq = bigquery.Client(project='itv-bde-analytics-prd',credentials=creds)

In [28]:
from pandas.io import gbq

In [29]:


# Daily churn counts pulled from BigQuery.
#   b: entitlement events whose reportingEvent is one of the listed churn codes, billed through
#      Stripe or iTunes, dated on or before 2021-03-17, with the event timestamp truncated to the day.
#   c: per britbox_ID, the earliest Event_partition (from 2020-01-01) for a programme whose title
#      contains "love island".
#   b is LEFT JOINed to c on itvid = britbox_ID where that first viewing is before the churn date.
# NOTE(review): `churns` is count(distinct britbox_ID), a column of c, so rows of b without a
# matching viewing add nothing to the count — confirm that counting only those churners is intended.
# The outer select keeps churn dates from 2020-01-01 onwards, ordered by date.
query="""

with breakdown as 
    (
    select 
    date(churnDate) as churnDate
      , count(distinct britbox_ID) as churns

    from
    
    (
    
    (
    select itvid,
  timestamp_trunc(eventDate,DAY) as churnDate
    from `itv-bde-analytics-prd.britbox_mart.E_entitlements`
    where (eventSubType.reportingEvent in ('H3','H4','HX') -- from paid to churned
          or eventSubType.reportingEvent in ('Q1','Q3') -- from paid - AR off to churn
          or eventSubType.reportingEvent in ('P2', 'P3','PX') -- from grace period to churn
          ) and billingProvider in ('Stripe', 'iTunes') 
    and eventDate <= '2021-03-17'
    ) as b
    


        left join 

        (select distinct
        britbox_ID
        ,min(Event_partition) as first_watched_LI
            from `itv-bde-analytics-prd.britbox_analytics.Viewing_clean` where Event_partition >= '2020-01-01'
            and lower(title.programme) like '%love island%' group by 1
            ) as c

            on b.itvid = c.britbox_ID and c.first_watched_LI <b.churnDate

    
) group by 1 )

select * from breakdown where churnDate >= '2020-01-01' order by 1


    """
# Run the query and load the full result set into a DataFrame.
df = bq.query(query ).to_dataframe()

# Bare expression: displays the frame as the cell output.
df


In [30]:
# Rename the query's 'churnDate' column to 'date', the name the following cells use.
df=df.rename(columns={'churnDate':'date'})

In [31]:
# Convert 'date' to pandas datetimes so it can serve as a DatetimeIndex (needed for the date-string slicing later on).
df['date'] = pd.to_datetime(df['date'])

In [32]:
# Move 'date' out of the columns and into the index, then preview the result.
df=df.set_index('date')
df.head()

In [33]:

# Cap the skewed tail of the churn series (diagnostic describe/histogram output switched off).
TS_capped=Cap_data(df,show_info=False)

# Split by date label: rows up to and including 2021-01-31 feed the model,
# rows from 2021-02-01 onwards are kept aside as the analysis period.
df_excl_analysis=TS_capped.loc[:'2021-01-31']
df_for_analysis=TS_capped.loc['2021-02-01':]

# Show the two boundary dates to confirm the split landed where expected.
print(df_excl_analysis.index.max(),df_for_analysis.index.min(),sep='\n')



In [34]:
# Number of trailing observations treated as the hold-out window; reused by the model fit, the plots and the fit metrics.
holdout_sample_size=42
# Quick look at the modelling series before fitting.
plt.plot(df_excl_analysis)

In [35]:
# Stepwise seasonal ARIMA search with a 7-period season; the last
# `holdout_sample_size` observations are held out for out-of-sample scoring.
# Changes from the previous call:
#   * `supress_warnings` was misspelt, so it never reached the real
#     `suppress_warnings` option;
#   * `start_d` / `start_D` are not auto_arima parameters (only d/D and
#     max_d/max_D exist) and were being forwarded as stray fit arguments.
# NOTE(review): `n_fits` is documented for the random search — confirm it has any effect with stepwise=True.
model=pm.auto_arima(
    df_excl_analysis,
    start_p=0,max_p=5,
    max_d=2,
    start_q=0,max_q=5,
    start_P=0,max_P=5,
    max_D=2,
    start_Q=0,max_Q=5,
    m=7,seasonal=True,
    error_action='warn',trace=True,suppress_warnings=True,
    stepwise=True,random_state=20,n_fits=50,
    out_of_sample_size=holdout_sample_size,
)

In [45]:
# In-sample predictions from the fitted model, placed on the same date index as the modelling data
# so they can be plotted and merged against the actuals.
estimated=pd.DataFrame(model.predict_in_sample(),index=df_excl_analysis.index)
# Name the single column so it is distinguishable from the actuals after merging.
estimated.columns = ['predicted_GrossChurn']


In [46]:
# Overall fit: actuals against in-sample predictions over the whole modelling period
fig,ax=plt.subplots(figsize=(8,5))
ax.plot(df_excl_analysis,label="Training")
ax.plot(estimated,label="Training (Predicted)")
ax.legend(loc = 'upper left')
plt.show()

In [47]:
# Fit on the hold-out window only (the last `holdout_sample_size` observations)
fig,ax=plt.subplots(figsize=(8,5))
ax.plot(df_excl_analysis.iloc[-holdout_sample_size:],label="Test")
ax.plot(estimated.iloc[-holdout_sample_size:],label="Test (Predicted)")
ax.legend(loc = 'upper left')
plt.show()

In [48]:
# Generate actual prediction
# Forecast one value per row of the analysis period and index the result by that period's dates.
prediction = pd.DataFrame(model.predict(n_periods = len(df_for_analysis.index)),index=df_for_analysis.index)
# Same column name as the in-sample estimates so the two frames can be concatenated.
prediction.columns = ['predicted_GrossChurn']


In [49]:
# Everything on one chart: training, hold-out and analysis period, each as actuals vs predictions
fig,ax=plt.subplots(figsize=(8,5))
ax.plot(df_excl_analysis.iloc[:-holdout_sample_size],label="Training")
ax.plot(estimated.iloc[:-holdout_sample_size],label="Training (Predicted)")
ax.plot(df_excl_analysis.iloc[-holdout_sample_size:],label="Test")
ax.plot(estimated.iloc[-holdout_sample_size:],label="Test (Predicted)")
ax.plot(df_for_analysis,label="Non-exclusive period (Actuals)")
ax.plot(prediction,label="Non-exclusive period (Predicted)")
ax.legend(loc = 'upper left')
plt.show()

In [50]:
# Same chart without the training portion, to zoom in on the hold-out and analysis periods
fig,ax=plt.subplots(figsize=(8,5))
ax.plot(df_excl_analysis.iloc[-holdout_sample_size:],label="Test")
ax.plot(estimated.iloc[-holdout_sample_size:],label="Test (Predicted)")
ax.plot(df_for_analysis,label="Non-exclusive period (Actuals)")
ax.plot(prediction,label="Non-exclusive period (Predicted)")
ax.legend(loc = 'upper left')
plt.show()

In [51]:
# Display the fitted model's summary as the cell output.
model.summary()

In [52]:
# Stack modelling-period and analysis-period rows, then pair actuals with predictions by date.
all_actuals=pd.concat([df_excl_analysis,df_for_analysis])
all_estimates=pd.concat([estimated,prediction])
outputdf=pd.merge(all_actuals,all_estimates,how="inner",left_index=True,right_index=True)

# Calendar helper columns derived straight from the DatetimeIndex (vectorized,
# no temporary column or row-wise apply needed).
dates=outputdf.index
outputdf['Date']=dates.strftime("%Y-%m-%d")
outputdf["Day_of_Week"]=dates.weekday
# NOTE(review): a monthly Period is formatted with a day field here — confirm which day of the month is wanted.
outputdf['Month']=dates.to_period('M').strftime("%Y-%m-%d")
# Week label = the date minus its weekday number, i.e. the Monday of that week.
outputdf["Week"]=(dates-pd.to_timedelta(dates.weekday,unit='D')).strftime("%Y-%m-%d")

# Push the combined table to the 'Churn_TimeSeries' tab of the target Google Sheet.
gaf.Write_whole_df_to_gsheet(creds, outputdf, '1vFoQTXSD6Kw_fLj5JwWBv1VYTuPDnKkgkD7Z8UJrY-c', 'Churn_TimeSeries')

In [44]:
# Fit metrics for the training portion and for the hold-out window (last `holdout_sample_size` rows).
# NOTE(review): this cell carries execution count 44 but sits after cell 52 and reads `estimated`,
# which is assigned in cell 45 — re-run the notebook top to bottom to confirm these numbers.
print("Fit on Training is: ",error_calcs(df_excl_analysis[:-holdout_sample_size],estimated[:-holdout_sample_size]))
print("Fit on Test is: ",error_calcs(df_excl_analysis[-holdout_sample_size:],estimated[-holdout_sample_size:]))