# Customer lifetime value

In [2]:
# Install this package for use of graphical properties
!pip install plotly 

### STEPS
##### 1. Extract the data required
##### 2. Plot actual data - what does this look like? 
##### 3. Fit a curve based on actual data
##### 4. Select best fitted curve
##### 5. Integrate the selected curve: what is the expected tenure per segment?
##### 6. Multiply this by revenue (for now use 5.99 as the base)

In [55]:
# modules
import numpy as np
import matplotlib.pyplot as plt
import scipy
from   scipy import optimize
import pandas as pd
from scipy.optimize import curve_fit
import scipy.stats as stats
from scipy.stats import norm
import sys

# Install/import plotly packages- this package has lots of graphical properties
import plotly.graph_objects as go
import plotly.offline as pyo
from pandas.io import gbq
from scipy import integrate 

from matplotlib.patches import Polygon

# Load custom scripts in reusable_code folder
sys.path.append(r'/home/jupyter/reusable_code')

import google_api_functions as gaf

from google.cloud import bigquery

# Raise pandas' display limits so wide/long result tables are shown without truncation
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)


In [56]:
# Obtain Google credentials through the shared helper module.
# The argument is the reusable_code folder (presumably where the helper looks
# for its key files - TODO confirm against google_api_functions).
creds = gaf.Authenticate_Google(
    r'/home/jupyter/reusable_code/',
)

#### 0. Define Functions for later use

In [87]:
def Get_Revenues(Cohort_table,End_date):
    """Validate the inputs for a cohort revenue extract (work in progress).

    Only the End_date handling is implemented so far: the function does not
    query anything yet and always returns None.

    Parameters
    ----------
    Cohort_table :
        Not read yet. See the planning notes at the end of the body for the
        intended table layout.
    End_date : str or None
        Date in YYYY-MM-DD format. A falsy value (None, "") is replaced by
        today's date.

    Returns
    -------
    None
        A message is printed when End_date is not in YYYY-MM-DD format.
    """
    # Imported here because the notebook's import cell does not provide them
    import re
    from datetime import date

    if not End_date:
        End_date=str(date.today())

    # Check end date is in the correct format.
    # fullmatch requires the WHOLE string to be a date, so values that merely
    # contain a date somewhere (e.g. "x2020-11-10y") are reported as well.
    date_pattern = r"(([12]\d{3})-(0[1-9]|1[012])-(0\d|1\d|2\d|3[01]))"
    if re.fullmatch(date_pattern, End_date) is None:
        print("Error- End Date in wrong format. Please use YYYY-MM-DD")

    # Check that the BQ table has the correct columns
    # User should input a table of Segment_Name, BritBox_ID, Subscription_ID and AcqnDate

    # The code should then take this table and join it to the revenues data and generate the resultant output in the format needed for the rest of the LTV code
    
    

def plot_df(df, x, y, title="Daily Revenue", xlabel='Date', ylabel='Value', dpi=100):
    """Draw y against x as one red line on a new 16x5 inch figure and show it.

    `df` is accepted so existing calls keep working, but it is not read;
    only `x` and `y` are plotted. `title`, `xlabel` and `ylabel` label the
    axes and `dpi` sets the figure resolution.
    """
    plt.figure(dpi=dpi, figsize=(16, 5))
    axes = plt.gca()
    axes.plot(x, y, color='tab:red')
    axes.set(title=title, xlabel=xlabel, ylabel=ylabel)
    plt.show()


# Determine the fitted curves per segment and plot this 

# This is used for the curve-fitting procedure later, which requires the function as an input
def func_expdecay(xdata, a, b ,k):
    """Exponential decay with an offset: a * exp(-b * xdata) + k."""
    decay = np.exp(-b * xdata)
    return a * decay + k

def func_log(xdata,a,b):
    """Negated, scaled natural logarithm: a * -log(b * xdata)."""
    negated_log = -np.log(b * xdata)
    return a * negated_log

def func_linear(xdata,a,b):
    """Straight line with gradient a and intercept b (y = m*x + c)."""
    scaled = a * xdata
    return scaled + b

def func_normal(xdata,a,mu,std):
    """Gaussian bell curve of height a, centred on mu with width std.

    This is the un-normalised form: the value at xdata == mu is exactly a.
    """
    squared_deviation = (xdata - mu) ** 2
    exponent = -squared_deviation / (2 * std ** 2)
    return a * (np.exp(exponent))

def func_weib(xdata,n,a):
    """Weibull-shaped curve: (a/n) * (xdata/n)**(a-1) * exp(-(xdata/n)**a).

    n divides xdata (scale) and a is the exponent (shape).
    """
    scaled_x = xdata / n
    return (a / n) * scaled_x ** (a - 1) * np.exp(-(scaled_x ** a))

def func_lognormal(xdata, a, mu, std):
    """Log-normal density in xdata (log-mean mu, log-std std) multiplied by a."""
    density_scale = 1.0 / (xdata * std * np.sqrt(2.0 * np.pi))
    exponent = -1.0 * (((np.log(xdata) - mu) ** 2.0) / (2.0 * (std ** 2.0)))
    return a * (density_scale * np.exp(exponent))

def func_power(x, a, b):
    """Power law: a * x**b."""
    raised = x ** b
    return a * raised

def fit_test(ydata,y2,test='CHI2'):
    """Score how closely fitted values y2 match observed values ydata.

    Parameters
    ----------
    ydata : array-like
        Observed values.
    y2 : array-like
        Fitted values, in the same order and of the same length as ydata.
    test : {'CHI2', 'RMSE'}
        'RMSE' returns the root-mean-square error as a float.
        'CHI2' returns the result of scipy.stats.chisquare, but only when every
        observed value is greater than 5, because the chi-square test needs
        large frequencies
        (ref:https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chisquare.html)

    Returns
    -------
    float, chisquare result, or None
        None (after printing 'check conditions') when `test` is not recognised
        or the CHI2 frequency condition is not met.
    """
    # Coerce to float arrays so plain lists and pandas Series behave the same way
    # as numpy arrays (the subtraction below does not work on Python lists)
    ydata = np.asarray(ydata, dtype=float)
    y2 = np.asarray(y2, dtype=float)

    if (test=='CHI2') and (np.min(ydata)>5):
        return stats.chisquare(f_obs=ydata, f_exp=y2)
    elif test == 'RMSE':
        return np.sqrt(np.mean((ydata - y2)**2))
    else:
        print('check conditions')
        return None

        
        
def mean_absolute_percentage_error(actual, estimated): 
    """Mean of |actual - estimated| / |actual| over all points.

    The result is a fraction, not a percentage (0.25 means 25%); multiply by
    100 for display.
    """
    observed = np.array(actual)
    predicted = np.array(estimated)
    relative_error = (observed - predicted) / observed
    return np.mean(np.abs(relative_error))

#### 1. Data Extraction


In [57]:
# BigQuery client used to pull the data for the LTV calculation;
# it runs against the analytics project with the credentials in `creds`
bq = bigquery.Client(
    credentials=creds,
    project='itv-bde-analytics-prd',
)

In [59]:
# Build the per-segment, per-day revenue table that the LTV curves are fitted to.
# Only segments whose Total_Customers is greater than 500 are returned.
#
# Query outline:
#   table1  - daily revenue rows (LTV_Entlmt_Daily_Revenue) inner-joined to entitlements
#             on itvid/britbox_ID and subscriptionid, keeping revenue dated on or after
#             the sign-up date. Entitlements with utm_source 'Non_web' or a first start
#             after 2020-11-10 are excluded. Segment is utm_content, except for 'CRM'
#             rows, which take level2 from last_click_referrer instead.
#   table1a - numbers each ID's revenue rows 1, 2, 3, ... in date_value order -> Day
#   table2  - per Day and Segment: distinct Customers, summed revenue divided by 100
#             (presumably pence -> pounds; TODO confirm), avg_dailyrev = revenue / Customers,
#             Total_Customers = largest daily Customers count in the segment,
#             max_rev = largest avg daily revenue in the segment.

query="""

with table1 as 

(select a.*,concat(a.itvid, a.subscriptionid) as ID, b.Segment, b.signuptime from 

(select distinct itvid, subscriptionid,  proratedDailyRevenue,date_value  from `itv-bde-analytics-prd.britbox_model.LTV_Entlmt_Daily_Revenue`) as a 

inner join 

(select a.*, case when a.utm_content = 'CRM' then b.level2 else a.utm_content end as Segment

from (select distinct britbox_ID, subscriptionid , subscription.firstStart as signuptime, acquisition.utm_content as utm_content from  `itv-bde-analytics-prd.britbox_analytics.entitlements`
where acquisition.utm_source != 'Non_web' and date(subscription.firstStart) <= '2020-11-10' ) as a 
left join 

(select  * from `itv-bde-analytics-prd.britbox_mart.last_click_referrer`  ) as b

on a.britbox_ID = b.user_ID and  a.subscriptionid = b.subscription_ID) as b

on a.itvid = b.britbox_ID 
    and a.subscriptionid = b.subscriptionid

and date(b.signuptime) <= date(a.date_value) and date(b.signuptime) <= (current_date() -1))

,table1a as 

(select distinct id,proratedDailyRevenue, segment, row_number() over (partition by id , Segment , signuptime order by date_value) as Day from table1)

, table2 as
(
select distinct  * , proratedDailyRevenue/Customers as avg_dailyrev,
max(Customers) over (partition by Segment) as Total_Customers,
max(ifnull(proratedDailyRevenue/Customers, 0.00000000001)) over (partition by Segment) as max_rev 
from (select distinct 
Day,
Segment,
count(distinct id) as Customers,
sum(ifnull(proratedDailyRevenue,0.00000000001))/100 as proratedDailyRevenue
from  table1a

group by 1,2
order by 1,2)

order by 1,2,3)

select * from table2 where Total_Customers > 500

    """
# Run the query and load the result into a DataFrame (one row per Day and Segment)
df = bq.query(query ).to_dataframe()

# Show the result table
df


In [60]:
# Reshape to wide format for plotting: one row per Segment, one column per Day,
# each cell holding avg_dailyrev (duplicates, if any, are averaged by pivot_table's default)
df_2_pivot = df.pivot_table(
    values='avg_dailyrev',
    index=['Segment'],
    columns=['Day'],
)
df_2_pivot

In [61]:
#df.to_excel('Actual Acq Channel graph data.xlsx')

#### 2. Plot data

In [62]:
# Plotting actual data
# One line per segment: each row of the pivoted table supplies the y values for one
# trace, which is why the data had to be pivoted to one row per segment first.

def _segment_trace(segment):
    """Build the line trace for one segment from its row of df_2_pivot."""
    return go.Scatter(
        x=df_2_pivot.columns,
        y=df_2_pivot.loc[segment],
        mode='lines',
        name=segment,
    )

traces = [_segment_trace(segment) for segment in df_2_pivot.index]

# Plot the data
graph = go.Figure(data=traces)
#graph.update_xaxes(type="category",)
graph

In [65]:
# Draw Plot
# Sort the whole frame by Day and take x and y from the SAME sorted frame.
# Sorting Day on its own would leave y in its original row order, so each
# revenue value would be plotted against the wrong day.
df_by_day = df.sort_values('Day')
x = df_by_day.Day
y = df_by_day.avg_dailyrev
plot_df(df_by_day, x, y, title='Daily Revenue')

In [66]:
# Draw Plot
# Sort the whole frame by Day and take x and y from the SAME sorted frame.
# Sorting Day on its own would leave y in its original row order, so each
# customer count would be plotted against the wrong day.
df_by_day = df.sort_values('Day')
x = df_by_day.Day
y = df_by_day.Customers
plot_df(df_by_day, x, y, title='Daily Customers')

#### 3. Curve Fitting

### SW edit>> functions are held in a dictionary

In [86]:
#################### INITIALISATION  ####################
# Goodness-of-fit measure used to rank the fitted curves (passed to fit_test)
test_type = "RMSE"

# Integration horizon in days, e.g. 365 is one year
integrate_to_days = 1095

# Candidate curve functions to fit to each segment
function = [
    func_expdecay,
    func_normal,
    func_weib,
    func_lognormal,
    func_power,
]

# Empty containers where parameters can be stored for later evaluation if needed
list_parameters, list_parameter_covariances = [], []

# Empty DataFrame to hold the final output
LTV_dataset3 = pd.DataFrame()

# Best function and parameters per segment (dict), and one LTV summary record per segment (list)
selected_functions = {}
LTV_selections = []

#### Data structures used in the loop below
#
# ALL SEGMENTS AND DAYS
#   df          - the original, full table: one record per segment and day
#   df_2_pivot  - the same data with days in columns and one row per segment
#
# ONE SEGMENT AND ALL DAYS >> USED FOR PLOTTING AND EMPIRICAL LTV CALCS
#   sub_df      - the rows of df for just one segment (recreated in each loop)
#   df_pivot    - pivot of sub_df: one row, all days held in columns
#
# ONE SEGMENT AND ONLY THE DAYS AFTER THE MAX >> USED FOR CURVE FITTING
#   sub_df2     - the subset of sub_df from the max point onwards
#   sub_pivot   - pivot of sub_df2, again with days held in columns; used for curve fitting

#################### LOOP THROUGH SEGMENTS AND FIT CURVES  ####################

def _initial_guess(func, xdata):
    """Return the curve_fit starting parameters (p0) for `func`.

    `xdata` is only used to seed mu/std for the normal and log-normal curves.
    Returns None for a function without a seed here, in which case curve_fit
    falls back to its own default starting values.
    """
    name = func.__name__
    if name == 'func_expdecay':
        return (0.2, 0.1, 1.5)
    if name == 'func_normal':
        mu, std = norm.fit(xdata)
        return (85, mu, std)
    if name == 'func_lognormal':
        mu, std = norm.fit(xdata)
        return (10, mu, std)
    if name in ('func_linear', 'func_log'):
        return (2, 3)
    if name in ('func_weib', 'func_power'):
        return (1, 1)
    if name == 'func_exp2':
        return (0.2, 0.1, 1.5, 2)
    return None


# SciPy renamed integrate.trapz to integrate.trapezoid and later removed the old
# name, so use whichever one this installation provides.
_trapezoid = getattr(integrate, 'trapezoid', None) or integrate.trapz

# Loop through each segment name (table is structured as one row per segment, one column per day)
for rowname in df_2_pivot.index:

    print('''\n ---------------------------------------------------------------------- \n \n Segment: ''' + str(rowname))

    #################### Create a Dataframe that finds the maximum daily revenue and only holds data from there onwards
    # Assumption is that there will pretty much always be a decay curve, barring any major changes in price
    # This dataframe is the one used to fit a function, as it is much easier to fit a decay curve
    # than something which also accounts for the trial period

    # Subset columns needed and only this segment
    sub_df = df[['Day','Segment','avg_dailyrev','max_rev','Total_Customers']][df['Segment']==rowname]
    # Order by Day first: the "from the peak onwards" slice below is positional,
    # and the query result is not guaranteed to arrive in Day order
    sub_df = sub_df.sort_values('Day').reset_index(drop=True)

    # Find the first row in which avg_dailyrev = max_rev (several days may share the max).
    # If the equality never holds exactly, fall back to the row with the largest avg_dailyrev.
    peak_rows = sub_df.index[sub_df.avg_dailyrev == sub_df.max_rev]
    Starting_Curve_Value2 = peak_rows[0] if len(peak_rows) > 0 else sub_df.avg_dailyrev.idxmax()

    # Subset this data so the curve fitting starts from the max value
    sub_df2 = sub_df.iloc[Starting_Curve_Value2:]

    #################### Transpose sub_df and sub_df2 to get days into columns for the selected segment
    sub_pivot = pd.pivot_table(sub_df2, values='avg_dailyrev', index=['Segment'], columns=['Day'])
    df_pivot = pd.pivot_table(sub_df, values='avg_dailyrev', index=['Segment'], columns=['Day'])

    # Arrays for this segment: _all = every observed day, _max = only the days from the peak onwards.
    # These do not depend on the function being fitted, so they are built once per segment.
    xdata_all = np.array(df_pivot.columns)
    ydata_all = np.array(df_pivot.loc[rowname])
    xdata_max = np.array(sub_pivot.columns)
    ydata_max = np.array(sub_pivot.loc[rowname])

    #################### Loop through the range of functions to be fitted to the LTV data
    func_dict = {}
    for func in function:

        p0 = _initial_guess(func, xdata_max)  # parameter seeds for this function

        #################### Attempt to fit the function and score the fit
        try:
            params, _ = optimize.curve_fit(func, xdata_max, ydata_max, p0, maxfev=100000, method='dogbox')

            # Evaluate the fitted curve on the observed days themselves, so that each fitted
            # value is compared with the observation for the same day even if days are missing.
            # Negative fitted values are set to 0 as we wouldn't want a negative revenue.
            y2 = np.clip(func(xdata_max, *params), 0, None)

            fit_error = fit_test(ydata_max, y2, test=test_type)  # goodness of fit using the test chosen before the loop
            abserr = mean_absolute_percentage_error(ydata_max, y2)  # mean absolute percentage error (as a fraction)
        except Exception as fit_exc:
            # A failed fit for one function must not stop the other candidates, but say why it failed
            print("Failed to fit function {} to segment {}: {}".format(func.__name__, rowname, fit_exc))
            continue

        if fit_error is None:
            # fit_test could not produce a score for this test type / data
            print("No {} score available for function {} on segment {}".format(test_type, func.__name__, rowname))
            continue

        # A chi-square result carries (statistic, pvalue); rank on the statistic. RMSE is already a number.
        score = float(getattr(fit_error, 'statistic', fit_error))
        if not np.isfinite(score):
            print("Non-finite {} score for function {} on segment {}".format(test_type, func.__name__, rowname))
            continue

        print('Function {} on segment {} has an average error of {:.2f}% and a value of {:.2f} for {}'.\
              format(func.__name__, rowname, abserr*100, score, test_type))

        func_dict[func] = {'error': score, 'params': params, 'mape': abserr}

        # Plot each function; matplotlib's default colour cycle gives each curve a distinct,
        # repeatable colour
        plt.plot(xdata_max, y2, label=func.__name__)

    if not func_dict:
        # Nothing could be fitted, so there is no curve to forecast with for this segment
        print("No function could be fitted to segment {}; skipping LTV calculation".format(rowname))
        plt.close()
        continue

    #################### Select best fitting function according to error, store the function and the parameters
    # min() keeps the first function (in fitting order) when several share the lowest error
    selected_func = min(func_dict, key=lambda fitted: func_dict[fitted]['error'])
    selected_params = func_dict[selected_func]['params']
    selected_functions[rowname] = {'function': selected_func, 'params': selected_params}  # choices for all segments

    #################### Plot actuals on same graph as above
    plt.plot(xdata_all, ydata_all, 'bo', label='Real Data', markersize=0.5)

    # Format the plot (the x axis is the Day number, i.e. days since the first revenue row)
    plt.xlabel("Days Subscribed")
    plt.ylabel("Daily Revenue")
    plt.title("Segment Decay Curve")
    plt.legend(loc='best')
    plt.show()  # Display the plot

    #################### Integrate the observed data and the best fitted curve to determine LTV

    observed_days = int(max(xdata_all))  # number of days of actual data we have
    predicted_days = max(integrate_to_days - observed_days, 0)  # number of days we're predicting

    ###### Select the necessary data ranges
    empirical_x, empirical_y = xdata_all, ydata_all  # the actual data observed: used for the visual and the integration

    # The forecast into the future: starts the day after the observed data ends, used for the integration.
    # NOTE(review): the one-day interval between observed_days and observed_days + 1 is covered by
    # neither integral - confirm this is intended.
    forecast_x = np.arange(observed_days + 1, integrate_to_days + 1)
    forecast_y = np.clip(selected_func(forecast_x, *selected_params), 0, None)  # negative forecasts set to 0

    # The fitted curve from the peak to the horizon (fitted region + forecast): used for the visual only
    curve_x = np.arange(min(xdata_max), integrate_to_days + 1)
    curve_y = np.clip(selected_func(curve_x, *selected_params), 0, None)

    ###### Plot and Integrate
    # Plot & integrate empirical data
    plt.plot(empirical_x, empirical_y, c='b')  # Plot line in blue
    plt.fill_between(empirical_x, empirical_y, where=(empirical_x >= 0), color='blue', alpha=0.3)  # Fill in under the graph
    val_actual_curve = _trapezoid(empirical_y, empirical_x)  # Integrate under the curve

    # Plot & integrate the fitted curve
    plt.plot(curve_x, curve_y, c='r')  # Plot the line in red
    plt.fill_between(forecast_x, forecast_y, color='red', alpha=0.3)  # Fill in under the forecasted area only
    val_fitted_curve = _trapezoid(forecast_y, forecast_x)

    # Generate clean name, add as title and save figure
    title = str(rowname) + ": " + str(selected_func.__name__).replace('func_','')
    plt.title(title)
    save_name = title + '.png'
    #plt.savefig(save_name)
    plt.show()

    LTV = (val_actual_curve + val_fitted_curve)
    print('LTV Value of Cohort: {} is £{:.2f}'.format(str(rowname), LTV))
    print('The observed component (blue) totals £{:.2f} across the first {} days'.format(val_actual_curve,observed_days))
    print('The forecasted component (red) totals £{:.2f} across the next {} days'.format(val_fitted_curve,predicted_days))

    LTV_selections.append({'Segment':rowname,\
                           'LTV':round(LTV,2),'LTV_days':integrate_to_days,\
                           'LTV_Observed':round(val_actual_curve,2),'LTV_observed_days':observed_days,\
                           'LTV_Predicted':round(val_fitted_curve,2),'LTV_predicted_days':predicted_days,\
                           'Fitted_Function':str(selected_func.__name__).replace('func_',''),'Fitted_Params':str(selected_params),\
                          })

    # Export forecasted data sets to csv
    # All Data
    #export_dataset = pd.DataFrame({'Avg_Daily_Rev':empirical_y} , index=empirical_x) 
    # Curve Data 
    #export_dataset2 = pd.DataFrame({'Avg_Daily_Rev':curve_y} , index=curve_x)
    # Integrate Data
    #export_dataset3 = pd.DataFrame({'Avg_Daily_Rev':forecast_y} , index=forecast_x)

    #export_dataset4 = pd.concat([export_dataset, export_dataset2], axis = 1)

    #export_dataset5 = pd.merge(export_dataset4, export_dataset3, left_index=True, right_index = True, how = 'left')

    #export_dataset5.to_excel(rowname + ' Graph Data.xlsx')

# One row per segment that could be fitted, with the LTV split into observed and predicted parts
LTV_final_df=pd.DataFrame(LTV_selections)
LTV_final_df