# Customer Lifetime Value (LTV)

In [2]:
# Install this package for use of graphical properties
#!pip install plotly 

In [32]:
# modules
import numpy as np
import matplotlib.pyplot as plt
import scipy
from   scipy import optimize
import pandas as pd
from scipy.optimize import curve_fit
import scipy.stats as stats
from scipy.stats import norm
import sys
from datetime import date
import datetime
import re

# Install/import plotly packages- this package has lots of graphical properties
import plotly.graph_objects as go
import plotly.offline as pyo
from pandas.io import gbq
from scipy import integrate 

from matplotlib.patches import Polygon
from datetime import datetime
from datetime import timedelta
from dateutil.relativedelta import relativedelta


# Load custom scripts in reusable_code folder
sys.path.append(r'/home/jupyter/reusable_code')

import google_api_functions as gaf

from google.cloud import bigquery

# Raise pandas' display limits so result tables of up to 500 rows / 500 columns are shown untruncated
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)


In [4]:
# Obtain Google credentials via the project helper module `google_api_functions`;
# `creds` is later passed as `credentials=` to bigquery.Client.
# NOTE(review): presumably the folder argument is where the helper looks for key/token files - confirm in the helper.
creds=gaf.Authenticate_Google(r'/home/jupyter/reusable_code/')

#### 0. Define Functions for later use

In [6]:
# Checking the input dataset follows the required format for the LTV model

# Required Structure : 

# Britbox_ID
# Subscription ID
# signuptime - date format
# Segment

def Check_input_table_structure(dataset): 
    """Validate that `dataset` has the layout the LTV model expects and print the findings.

    Checks performed (a failed check prints a message; it does not raise):
      * the columns are exactly britbox_id, subscriptionid, signuptime and segment
        (compared case-insensitively, order is not important);
      * every non-null signuptime value starts with a YYYY-MM-DD date;
      * no sign-up date is today or later;
      * no sign-up date is before the launch date (2019-11-07).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Segment table to validate. signuptime may hold dates, timestamps or
        ISO-formatted strings.

    Returns
    -------
    None
        'Table Structure all ok' is printed when every check passes.
    """
    # Imported locally under an alias: the notebook's import cell rebinds the name
    # `datetime` to the datetime *class*, so `datetime.date(2019, 11, 7)` raised a TypeError.
    import datetime as _dt

    Error_flag = False
    required_columns = ['britbox_id', 'subscriptionid', 'signuptime', 'segment']
    launch_date = _dt.date(2019, 11, 7)

    # ---- Column names (case-insensitive, order-independent) ----
    mydata = [str(x).lower() for x in dataset.columns]
    missing_required_variables = [x for x in required_columns if x not in mydata]
    additional_variables = [x for x in mydata if x not in required_columns]

    if missing_required_variables:
        Error_flag = True
        print('Missing fields required ' + str(missing_required_variables))
    if additional_variables:
        Error_flag = True
        print('Additional fields included, please delete ' + str(additional_variables))

    # ---- Sign-up date checks (only possible when the column exists) ----
    # Look the column up case-insensitively so e.g. 'SignUpTime' is still validated.
    signup_column = next((c for c in dataset.columns if str(c).lower() == 'signuptime'), None)

    if signup_column is not None:
        signup_values = dataset[signup_column].dropna()

        # Check signupdate format - must be in YYYY-MM-DD.
        # Every value is tested individually; matching against the printed Series would
        # only look at the handful of rows pandas shows and pass if any one of them matched.
        date_pattern = r"[12]\d{3}-(0[1-9]|1[012])-(0[1-9]|[12]\d|3[01])"
        well_formatted = signup_values.astype(str).str.match(date_pattern)

        if signup_values.empty or not well_formatted.all():
            Error_flag = True
            print("Error- Sign Up Date in wrong format. Please use YYYY-MM-DD")

        # Normalise to plain dates so dates, naive timestamps and tz-aware timestamps
        # can all be compared against date boundaries without a TypeError.
        signup_dates = pd.to_datetime(
            signup_values[well_formatted.astype(bool)], errors='coerce', utc=True
        ).dropna().dt.date

        if len(signup_dates) > 0:
            # Check date validity - should not be today or in the future
            if max(signup_dates) >= _dt.date.today():
                Error_flag = True
                print("Error - Sign Up Dates in the future, check entitlements table or joins")

            # Check signupdate must be on or after the launch date
            if min(signup_dates) < launch_date:
                Error_flag = True
                print("Error- Minimum Sign Up Date before Launch Date, check data")

    if Error_flag == False:
        print('Table Structure all ok')
    
# The code should then take this table and join it to the revenues data and generate the resultant output in the format needed for the rest of the LTV code

def plot_df(df, x, y, title="Daily Revenue", xlabel='Date', ylabel='Value', dpi=100):
    """Draw `y` against `x` as a single red line on a new 16x5 inch figure and show it.

    `df` is accepted for call-site convenience but is not read; `title`, `xlabel`
    and `ylabel` label the axes and `dpi` sets the figure resolution.
    """
    plt.figure(figsize=(16, 5), dpi=dpi)
    axes = plt.gca()
    axes.plot(x, y, color='tab:red')
    axes.set(title=title, xlabel=xlabel, ylabel=ylabel)
    plt.show()


# Determine the fitted curves per segment and plot this 

# This is used for the curve-fitting procedure later, which requires the function as an input
def func_expdecay(xdata, a, b ,k):
    """Exponential decay with offset: a * exp(-b * xdata) + k."""
    decay = np.exp(-b * xdata)
    return a * decay + k

def func_log(xdata,a,b):
    """Negative logarithmic curve: a * -log(b * xdata)."""
    scaled = b * xdata
    return a * (-np.log(scaled))

def func_linear(xdata,a,b):
    """Straight line with gradient `a` and intercept `b` (y = mx + c)."""
    gradient_term = a * xdata
    return gradient_term + b

def func_normal(xdata,a,mu,std):
    """Gaussian bell curve of height `a`, centred on `mu` with width `std` (not normalised)."""
    squared_deviation = (xdata - mu) ** 2
    two_variance = 2 * std ** 2
    return a * (np.exp(-squared_deviation / two_variance))

def func_weib(xdata,n,a):
    """Weibull density evaluated at `xdata`, with `n` dividing x (scale) and `a` as the exponent (shape)."""
    ratio = xdata / n
    prefactor = a / n
    return prefactor * ratio ** (a - 1) * np.exp(-(ratio ** a))

def func_lognormal(xdata, a, mu, std):
    """Log-normal density in `xdata` (log-mean `mu`, log-std `std`) scaled by `a`."""
    normaliser = 1.0 / (xdata * std * np.sqrt(2.0 * np.pi))
    exponent = -1.0 * (((np.log(xdata) - mu) ** 2.0) / (2.0 * (std ** 2.0)))
    return a * (normaliser * np.exp(exponent))

def func_power(x, a, b):
    """Power law: a * x**b."""
    raised = x ** b
    return a * raised

def fit_test(ydata,y2,test='CHI2'):
    """Score how well the fitted values `y2` match the observed values `ydata`.

    test='CHI2' returns scipy's chi-square result, but only when every observed
    value exceeds 5 (chisquare needs large frequencies, see
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chisquare.html).
    test='RMSE' returns the root-mean-square error.
    In any other situation 'check conditions' is printed and None is returned.
    """
    chi2_requested_and_valid = (test == 'CHI2') and (min(ydata) > 5)
    if chi2_requested_and_valid:
        return stats.chisquare(f_obs=ydata, f_exp=y2)

    if test == 'RMSE':
        squared_residuals = (ydata - y2) ** 2
        return np.sqrt(np.mean(squared_residuals))

    print('check conditions')

def mean_absolute_percentage_error(actual, estimated): 
    """Mean of |actual - estimated| / |actual|, returned as a fraction (multiply by 100 for a percentage)."""
    actual_arr = np.array(actual)
    estimated_arr = np.array(estimated)
    relative_errors = np.abs((actual_arr - estimated_arr) / actual_arr)
    return np.mean(relative_errors)

In [27]:

def _ltv_initial_guess(func, xdata):
    """Return the curve_fit starting parameters (p0) for one candidate curve function."""
    name = func.__name__
    if name == 'func_expdecay':
        return (0.2, 0.1, 1.5)
    if name == 'func_normal':
        mu, std = norm.fit(xdata)
        return (85, mu, std)
    if name == 'func_lognormal':
        mu, std = norm.fit(xdata)
        return (10, mu, std)
    if name in ('func_linear', 'func_log'):
        return (2, 3)
    if name in ('func_weib', 'func_power'):
        return (1, 1)
    if name == 'func_exp2':
        return (0.2, 0.1, 1.5, 2)
    # Previously an unknown function silently reused the previous function's seed.
    raise ValueError('No initial parameter guess defined for ' + name)


def Run_LTV(signup_start,signup_end):
    """Estimate lifetime value (LTV) per segment for customers who signed up in a date window.

    Steps:
      1. Query BigQuery for the average daily revenue per customer by day since sign-up,
         for subscriptions first started between `signup_start` and `signup_end`
         (inclusive) and billed through iTunes or Stripe. Only segments with more
         than 500 customers are returned by the query.
      2. For each segment, fit every candidate curve to the data from the peak
         average daily revenue onwards and keep the curve with the lowest RMSE.
      3. LTV = area under the observed data + area under the selected curve from the
         day after the last observation up to day 1825.

    Diagnostic plots and a text summary are produced for every segment as a side effect.
    Relies on the notebook-level `creds`, the `func_*` curve functions, `fit_test`
    and `mean_absolute_percentage_error`.

    Parameters
    ----------
    signup_start, signup_end : date or str
        Window boundaries; they are interpolated into the SQL as quoted date
        literals, so they must render as YYYY-MM-DD.

    Returns
    -------
    pandas.DataFrame
        One row per segment that could be fitted, with the LTV split into observed
        and predicted parts, the selected curve and its parameters, plus the
        `signup_window_Start` / `signup_window_End` columns.
    """
    # Extracting data to determine LTV
    bq = bigquery.Client(project='itv-bde-analytics-prd',credentials=creds)

    Segment_Query = """ (select distinct britbox_ID, subscriptionid, subscription.firstStart as signuptime , 'All' as Segment 
    from  `itv-bde-analytics-prd.britbox_analytics.entitlements`
    where date(subscription.firstStart) >= '{}' and date(subscription.firstStart) <= '{}'
    and billingprovider in ('iTunes', 'Stripe') 
    )  """.format(signup_start,signup_end)

    base_query="""
    select distinct itvid, subscriptionid,  proratedDailyRevenue,date_value  
    from `itv-bde-analytics-prd.britbox_model.LTV_Entlmt_Daily_Revenue`
        """

    Merged_query = """
    WITH
      table1 AS (
      SELECT
        a.*,
        CONCAT(a.itvid, a.subscriptionid) AS ID,
        b.Segment,
        b.signuptime
      FROM  ( """ +  base_query +  """ ) AS a INNER JOIN (  """  +  Segment_Query +  """) AS b
      ON
        a.itvid = b.britbox_ID
        AND a.subscriptionid = b.subscriptionid
        AND DATE(b.signuptime) <= DATE(a.date_value)
        AND DATE(b.signuptime) <= (CURRENT_DATE() -1)),
      table1a AS (
      SELECT
        DISTINCT id,
        proratedDailyRevenue,
        segment,
        ROW_NUMBER() OVER (PARTITION BY id, Segment, signuptime ORDER BY date_value) AS Day
      FROM
        table1),

      table2 AS (
      SELECT
        DISTINCT *,
        proratedDailyRevenue/Customers AS avg_dailyrev,
        MAX(Customers) OVER (PARTITION BY Segment) AS Total_Customers,
        MAX(ifnull(proratedDailyRevenue/Customers, 0.00000000001)) OVER (PARTITION BY Segment) AS max_rev
      FROM (
        SELECT
          DISTINCT Day,
          Segment,
          COUNT(DISTINCT id) AS Customers,
          SUM(ifnull(proratedDailyRevenue,0.00000000001))/100 AS proratedDailyRevenue
        FROM
          table1a
        GROUP BY
          1,
          2
        ORDER BY
          1,
          2)
      ORDER BY
        1,
        2,
        3)

    SELECT
      *
    FROM
      table2
    WHERE
      Total_Customers > 500

        """

    Final_Table = bq.query(Merged_query).to_dataframe()

    # One row per segment, one column per day; used here to enumerate the segments
    df_2_pivot = pd.pivot_table(Final_Table,  values = 'avg_dailyrev' , index = ['Segment'], columns = ['Day'])

    #################### INITIALISATION  ####################
    test_type = "RMSE"       # Goodness-of-fit measure used to rank the fitted curves
    integrate_to_days=1825   # How many days to integrate to e.g. 365 is one year
    function = [func_expdecay, func_normal,func_weib, func_lognormal,func_power] # Candidate curves

    # scipy.integrate.trapz was renamed to trapezoid and later removed from SciPy;
    # use whichever name this SciPy version provides.
    trapezoid = getattr(scipy.integrate, 'trapezoid', None) or scipy.integrate.trapz

    LTV_selections=[]

    #################### LOOP THROUGH SEGMENTS AND FIT CURVES  ####################
    for rowname in df_2_pivot.index:

        print('''\n ---------------------------------------------------------------------- \n \n Segment: ''' + str(rowname))

        # Rows of this segment only. Sorted by Day explicitly because the query's final
        # SELECT has no ORDER BY, and the positional slice below needs day order.
        sub_df = (Final_Table.loc[Final_Table['Segment'] == rowname,
                                  ['Day', 'Segment', 'avg_dailyrev', 'max_rev', 'Total_Customers']]
                  .sort_values('Day')
                  .reset_index(drop=True))

        # Curves are fitted from the peak average daily revenue onwards: a pure decay is
        # much easier to fit than a shape that also has to cover the trial period.
        # Several days may share the maximum, so start from the first of them.
        Starting_Curve_Value = sub_df[sub_df.avg_dailyrev==sub_df.max_rev].index.values
        Starting_Curve_Value2 = Starting_Curve_Value[0]
        sub_df2 = sub_df.iloc[Starting_Curve_Value2:]

        # Days into columns: sub_pivot = peak onwards (curve fitting), df_pivot = all days
        sub_pivot=pd.pivot_table(sub_df2,  values = 'avg_dailyrev' , index = ['Segment'], columns = ['Day'])
        df_pivot=pd.pivot_table(sub_df,  values = 'avg_dailyrev' , index = ['Segment'], columns = ['Day'])

        # Number of customers in the segment
        Total_Customers = max(sub_df.Total_Customers)

        # These arrays do not depend on the candidate function, so build them once per segment
        xdata_all = np.array(df_pivot.columns)
        ydata_all = np.array(df_pivot.loc[rowname])
        xdata_max = np.array(sub_pivot.columns)
        ydata_max = np.array(sub_pivot.loc[rowname])

        #################### Fit every candidate curve
        func_dict={}
        for func in function :
            try:
                p0 = _ltv_initial_guess(func, xdata_max)
                params, params_covariance = optimize.curve_fit(func, xdata_max, ydata_max,p0, maxfev=100000,method='dogbox')

                # Evaluate the fitted curve over this segment's own day range (peak to last observed day)
                x2 = np.linspace(min(xdata_max),max(xdata_max),len(xdata_max))
                y2 = func(x2,*params)
                y2[y2 < 0] = 0 # revenue cannot be negative

                fit_error = fit_test(ydata_max,y2,test=test_type)
                abserr=mean_absolute_percentage_error(ydata_max,y2)

                print('Function {} on segment {} has an average error of {:.2f}% and a value of {:.2f} for {}'.\
                      format(func.__name__,rowname,abserr*100,fit_error,test_type))

                # A NaN/inf score cannot be ranked and would corrupt the best-fit selection
                if not np.isfinite(fit_error):
                    raise ValueError('non-finite {} score'.format(test_type))

                func_dict[func]={'error':fit_error,'params':params,'mape':abserr}
                # Colours come from matplotlib's default cycle, so plots are reproducible
                plt.plot(x2, y2, label=func.__name__)

            except Exception as exc:
                # Best effort: one curve failing must not stop the others, but say why it failed
                print("Failed to fit function {} to segment {}".format(func.__name__,rowname))
                print("    reason: {!r}".format(exc))

        if not func_dict:
            # Nothing could be fitted, so there is no curve to forecast with
            print("No function could be fitted to segment {} - segment skipped".format(rowname))
            continue

        #################### Select the best fitting function (lowest error; first one wins ties)
        selected_func = min(func_dict, key=lambda candidate: func_dict[candidate]['error'])
        selected_params = func_dict[selected_func]['params']

        #################### Plot actuals on same graph as the fitted curves
        plt.plot(xdata_all, ydata_all, 'bo', label='Real Data', markersize=0.5)
        plt.xlabel("Days Subscribed")
        plt.ylabel("Daily Revenue")
        plt.title("Segment Decay Curve")
        plt.legend(loc='best')
        plt.show()

        #################### Integrate observed data + forecast to determine LTV
        observed_days=max(xdata_all) # Number of days of actual data we have
        predicted_days=integrate_to_days-observed_days # Number of days we're predicting

        empirical_x, empirical_y=xdata_all,ydata_all # Observed data: plotted and integrated

        # Forecast: the days after the observed data ends, up to the integration horizon
        forecast_x=np.arange(observed_days+1,integrate_to_days + 1)
        forecast_y=selected_func(forecast_x,*selected_params)

        # Fitted curve from the peak to the horizon, used for the visual only
        curve_x=np.arange(min(xdata_max),integrate_to_days + 1)
        curve_y=selected_func(curve_x,*selected_params)

        ###### Set -ve forecasts to 0
        forecast_y[forecast_y<0]=0
        curve_y[curve_y<0]=0

        # Observed component (blue)
        plt.plot(empirical_x,empirical_y,c='b')
        plt.fill_between(empirical_x,empirical_y, color = 'blue', alpha = 0.3)
        val_actual_curve = trapezoid(empirical_y,empirical_x)

        # Forecast component (red); only the forecast area is shaded
        # NOTE(review): the one-day interval between the last observed day and the first
        # forecast day is not part of either integral - confirm this is acceptable.
        plt.plot(curve_x,curve_y,c='r')
        plt.fill_between(forecast_x,forecast_y, color = 'red', alpha = 0.3)
        val_fitted_curve = trapezoid(forecast_y, forecast_x)

        fitted_name = str(selected_func.__name__).replace('func_','')
        plt.title(str(rowname) + ": " + fitted_name)
        plt.show()

        LTV = (val_actual_curve + val_fitted_curve)
        print('LTV Value of Cohort: {} is £{:.2f}'.format(str(rowname), LTV))
        print('The observed component (blue) totals £{:.2f} across the first {} days'.format(val_actual_curve,observed_days))
        print('The forecasted component (red) totals £{:.2f} across the next {} days'.format(val_fitted_curve,predicted_days))

        LTV_selections.append({'Segment':rowname,
                               'Customers' : Total_Customers,
                               'LTV':round(LTV,2),'LTV_days':integrate_to_days,
                               'LTV_Observed':round(val_actual_curve,2),'LTV_observed_days':observed_days,
                               'LTV_Predicted':round(val_fitted_curve,2),'LTV_predicted_days':predicted_days,
                               'Fitted_Function':fitted_name,'Fitted_Params':str(selected_params),
                              })

    LTV_final_df=pd.DataFrame(LTV_selections)
    LTV_final_df['signup_window_Start']=signup_start
    LTV_final_df['signup_window_End']=signup_end
    return LTV_final_df

    
    
    

In [None]:
# Single example run of the LTV model.
# The call was missing its closing parenthesis and used start_date/end_date before any
# cell had defined them; the window here is the month ending one month ago - adjust as needed.
start_date = datetime.now().date() - relativedelta(months=2)
end_date = datetime.now().date() - relativedelta(months=1)
Run_LTV(start_date, end_date)

In [33]:
# Run the LTV model for eleven consecutive one-month sign-up windows, from the month
# ending one month ago back to the month ending eleven months ago.
today = datetime.now().date()  # fixed once so every window is measured from the same day
monthly_results = []

for i in range(2, 13):
    #start_date=today - timedelta(weeks=i)
    #end_date=today - timedelta(weeks=(i-1))
    start_date = today - relativedelta(months=i)
    end_date = today - relativedelta(months=(i-1))
    print('Sign-up window {} to {}'.format(start_date, end_date))

    results = Run_LTV(start_date, end_date)
    monthly_results.append(results)

# Concatenate once at the end instead of growing the frame inside the loop
final_df = pd.concat(monthly_results)

In [34]:
# Display the combined Run_LTV results for all sign-up windows
final_df

#### 1. Data Extraction


In [7]:
# Extracting data to determine LTV
# BigQuery client for the 'itv-bde-analytics-prd' project, using the credentials held in `creds`
bq = bigquery.Client(project='itv-bde-analytics-prd',credentials=creds)

In [10]:
# Sign-up cohort: distinct subscriptions from the entitlements table whose first start date
# lies between 6 months ago and 10 weeks ago, billed through iTunes or Stripe.
# Every row is labelled with the single segment 'All'.
# The query is wrapped in parentheses because it is also embedded as a sub-query in Merged_query.
Segment_Query = """ (select distinct britbox_ID, subscriptionid, subscription.firstStart as signuptime , 'All' as Segment 
from  `itv-bde-analytics-prd.britbox_analytics.entitlements`
where date(subscription.firstStart) >= date_sub(current_date(),INTERVAL 6 MONTH) and date(subscription.firstStart) <= date_sub(current_date(),INTERVAL 10 WEEK)
and billingprovider in ('iTunes', 'Stripe') 
)  """

# Run the cohort query on its own so its structure can be validated before it is joined to revenue
Segment_table = bq.query(Segment_Query).to_dataframe()

# Prints any problems with the column names or sign-up dates
Check_input_table_structure(Segment_table)

In [11]:
# Base table is currently set to 2020+ in the view within BigQuery. Earlier data can be looked at, but analysis
# showed that earlier cohorts skew the behaviour and produce bad fits for the LTV model.
# A filter of more than 500 customers per segment is applied at the end of the merged query.

# Daily revenue rows per customer (itvid) and subscription
base_query="""
select distinct itvid, subscriptionid,  proratedDailyRevenue,date_value  
from `itv-bde-analytics-prd.britbox_model.LTV_Entlmt_Daily_Revenue`
    """

# Merged query, built by embedding base_query and Segment_Query as sub-queries:
#   table1  - revenue rows joined to the cohort on customer + subscription, keeping only revenue
#             dated on/after the sign-up date and sign-ups up to yesterday
#   table1a - numbers each subscription's revenue rows in date order -> Day (1 = first revenue day)
#   table2  - per Day and Segment: distinct customers, total revenue divided by 100
#             (NOTE(review): presumably pence -> pounds - confirm), average revenue per customer,
#             the segment's largest daily customer count (Total_Customers) and its peak
#             average daily revenue (max_rev)
# The final SELECT keeps only segments with Total_Customers > 500.
Merged_query = """
WITH
  table1 AS (
  SELECT
    a.*,
    CONCAT(a.itvid, a.subscriptionid) AS ID,
    b.Segment,
    b.signuptime
  FROM  ( """ +  base_query +  """ ) AS a INNER JOIN (  """  +  Segment_Query +  """) AS b
  ON
    a.itvid = b.britbox_ID
    AND a.subscriptionid = b.subscriptionid
    AND DATE(b.signuptime) <= DATE(a.date_value)
    AND DATE(b.signuptime) <= (CURRENT_DATE() -1)),
  table1a AS (
  SELECT
    DISTINCT id,
    proratedDailyRevenue,
    segment,
    ROW_NUMBER() OVER (PARTITION BY id, Segment, signuptime ORDER BY date_value) AS Day
  FROM
    table1),
  
  table2 AS (
  SELECT
    DISTINCT *,
    proratedDailyRevenue/Customers AS avg_dailyrev,
    MAX(Customers) OVER (PARTITION BY Segment) AS Total_Customers,
    MAX(ifnull(proratedDailyRevenue/Customers, 0.00000000001)) OVER (PARTITION BY Segment) AS max_rev
  FROM (
    SELECT
      DISTINCT Day,
      Segment,
      COUNT(DISTINCT id) AS Customers,
      SUM(ifnull(proratedDailyRevenue,0.00000000001))/100 AS proratedDailyRevenue
    FROM
      table1a
    GROUP BY
      1,
      2
    ORDER BY
      1,
      2)
  ORDER BY
    1,
    2,
    3)

SELECT
  *
FROM
  table2
WHERE
  Total_Customers > 500
    
    """

# Execute the merged query and load the result into a DataFrame
Final_Table = bq.query(Merged_query).to_dataframe()

# Preview the first rows
Final_Table.head()



In [12]:
# Reshape to one row per segment and one column per day - the layout the plots below expect
df_2_pivot = Final_Table.pivot_table(
    values='avg_dailyrev',
    index=['Segment'],
    columns=['Day'],
)
df_2_pivot

#### 2. Plot data

In [13]:
# Plot the actual data: one line per segment.
# Each row of the pivoted table supplies the y values for one segment,
# which is why the data had to be pivoted to one row per segment.
traces = [
    go.Scatter(x=df_2_pivot.columns, y=df_2_pivot.loc[segment], mode='lines', name=segment)
    for segment in df_2_pivot.index
]

# Build and display the figure
graph = go.Figure(data=traces)
#graph.update_xaxes(type="category",)
graph

In [14]:
# Draw Plot for Daily Revenue, sense check for any data anomalies.
# Sort the whole frame by Day and take x and y from the same rows: sorting x on its own
# paired each day with the wrong revenue whenever the table was not already in day order.
daily_revenue = Final_Table.sort_values('Day')
x = daily_revenue.Day
y = daily_revenue.avg_dailyrev
plot_df(daily_revenue, x, y, title='Daily Revenue', xlabel='Day')

In [15]:
# Draw Plot for Daily Customers, expecting a constantly decreasing volume.
# Sort the whole frame by Day and take x and y from the same rows: sorting x on its own
# paired each day with the wrong customer count whenever the table was not already in day order.
daily_customers = Final_Table.sort_values('Day')
x = daily_customers.Day
y = daily_customers.Customers
plot_df(daily_customers, x, y, title='Daily Customers', xlabel='Day')

#### 3. Curve Fitting and Determining LTV

In [16]:
#################### INITIALISATION  ####################
# Specify the test used to determine Goodness of Fit of the fitted curves
test_type = "RMSE"
integrate_to_days=1825 # How many days to integrate to e.g. 365 is one year
function = [func_expdecay, func_normal,func_weib, func_lognormal,func_power] # Choose which functions to fit

# scipy.integrate.trapz was renamed to trapezoid and later removed from SciPy;
# use whichever name this SciPy version provides.
trapezoid = getattr(scipy.integrate, 'trapezoid', None) or scipy.integrate.trapz

# Initialise empty lists where the parameters can be stored for later evaluation if needed
list_parameters = []
list_parameter_covariances = []

# Initialise an empty DataFrame to hold the final output 
LTV_dataset3 = pd.DataFrame()

# Initialise empty containers to hold the best functions (curves) and parameters
selected_functions = {}
LTV_selections=[]

#### ALL SEGMENTS AND DAYS
# Final_Table is the original, full table holding all segments and days both as records
# df_2_pivot still holds all data, but now days are in columns, and there is just one row per segment

#### ONE SEGMENT AND ALL DAYS >> USED FOR PLOTTING AND EMPIRICAL LTV CALCS
## sub_df is the original table but for just one segment (recreated in each loop)
#### df_pivot is a pivot of sub_df that has one row and all days held in columns

#### ONE SEGMENT AND ONLY THE DAYS AFTER THE MAX >> USED FOR CURVE FITTING
### sub_df2 is the subset of sub_df that only looks at the data from the Max point onwards
#### sub_pivot is a pivot of sub_df2 that again transposes so that days are held in columns. This is used for curve fitting


#################### LOOP THROUGH SEGMENTS AND FIT CURVES  ####################

# Loop through each segment name (table is structured as one row per segment, one column per day)
for n,rowname in enumerate(df_2_pivot.index) : 
    
    print('''\n ---------------------------------------------------------------------- \n \n Segment: ''' + str(rowname))
    
    #Initialise empty lists for later use
    errors = []
    param_list= []
    
    #################### Create a Dataframe that finds the maximum daily revenue and only holds data from there onwards
    # Assumption is that there will pretty much always be a decay curve, barring any major changes in price
    # This dataframe is the one used to fit a function, as it is much easier to fit a decay curve than something which also accounts for the trial period
    
    # Subset columns needed and only this segment. Sorted by Day explicitly because the query's
    # final SELECT has no ORDER BY, and the positional slice below needs day order.
    sub_df = (Final_Table.loc[Final_Table['Segment'] == rowname,
                              ['Day', 'Segment', 'avg_dailyrev', 'max_rev', 'Total_Customers']]
              .sort_values('Day')
              .reset_index(drop = True))
    
    # Find Index in which avg_dailyrev = max_rev
    Starting_Curve_Value = sub_df[sub_df.avg_dailyrev==sub_df.max_rev].index.values
    Starting_Curve_Value2 = Starting_Curve_Value[0] # There may be multiple days with the same (max) revenue, so start from the first
    
    # Subset this data so you start the curve fitting from the max value
    sub_df2 = sub_df.iloc[Starting_Curve_Value2:]

    #################### Transpose sub_df and sub_df2 to get days into columns for the selected segment    
    sub_pivot=pd.pivot_table(sub_df2,  values = 'avg_dailyrev' , index = ['Segment'], columns = ['Day'])
    df_pivot=pd.pivot_table(sub_df,  values = 'avg_dailyrev' , index = ['Segment'], columns = ['Day'])
    
    # Determines number of customers per segment
    Total_Customers = max(sub_df.Total_Customers)
    
    # Series/arrays for this segment: _all = all data, _max = only data since the max.
    # They do not depend on the candidate function, so they are built once per segment.
    xdata_all = np.array(df_pivot.columns)
    ydata_all = np.array(df_pivot.loc[rowname])
    xdata_max = np.array(sub_pivot.columns)
    ydata_max = np.array(sub_pivot.loc[rowname])
    
    #################### Loop through the range of functions to be fitted to the LTV data     
    func_dict={}   
    for func in function : 

        #################### Attempt to fit the function, return the parameters & covariance of the best fit (per function)
        try:
            # p0 for each function (effectively parameter seeds). Chosen inside the try so a
            # function without a seed is reported as a failed fit instead of reusing a stale p0.
            if func.__name__ == 'func_expdecay':
                p0 = (0.2,0.1, 1.5)
            elif func.__name__ == 'func_normal':
                mu, std = norm.fit(xdata_max)
                p0 = (85,mu,std)
            elif func.__name__ == 'func_lognormal':
                mu, std = norm.fit(xdata_max)
                p0 = (10,mu,std)
            elif func.__name__ in ('func_linear', 'func_log'):
                p0 = (2,3)
            elif func.__name__ in ('func_weib', 'func_power'):
                p0 = (1,1)
            elif func.__name__ == 'func_exp2':
                p0 = (0.2,0.1,1.5,2)
            else:
                raise ValueError('No initial parameter guess defined for ' + func.__name__)

            params, params_covariance = optimize.curve_fit(func, xdata_max, ydata_max,p0, maxfev=100000,method='dogbox') 
            
            # Generate an array of x values (days) from the peak to the end of the observed dataset.
            # We do this because the number of days for any given segment may not = the total number of days across all segments 
            # E.g. some segments we may only have 100 days for, some we have 200 for, or the peak is at a different place.
            x2 = np.linspace(min(xdata_max),max(xdata_max),len(xdata_max))
            y2 = func(x2,*params) # Apply the fitted curve to the dummy X values

            # Revenue cannot be negative, so clip the fitted values at zero
            y2[y2 < 0] = 0

            fit_error = fit_test(ydata_max,y2,test=test_type) # Scores the generated curve against the observed data
            abserr=mean_absolute_percentage_error(ydata_max,y2) # Mean absolute error as a fraction
            
            print('Function {} on segment {} has an average error of {:.2f}% and a value of {:.2f} for {}'.\
                  format(func.__name__,rowname,abserr*100,fit_error,test_type))

            # A NaN/inf score cannot be ranked and would corrupt the best-fit selection
            if not np.isfinite(fit_error):
                raise ValueError('non-finite {} score'.format(test_type))

            func_dict[func]={'error':fit_error,'params':params,'mape':abserr}
            # Plot each function; colours come from matplotlib's default cycle so plots are reproducible
            plt.plot(x2, y2, label=func.__name__)
        
        except Exception as exc:
            # Best effort: one curve failing must not stop the others, but say why it failed
            print("Failed to fit function {} to segment {}".format(func.__name__,rowname))
            print("    reason: {!r}".format(exc))

    if not func_dict:
        # Nothing could be fitted, so there is no curve to forecast with
        print("No function could be fitted to segment {} - segment skipped".format(rowname))
        continue
       
    #################### Select best fitting function according to error (first one wins ties), store the function and the parameters
    selected_func = min(func_dict, key=lambda candidate: func_dict[candidate]['error'])
    selected_params = func_dict[selected_func]['params']
    selected_functions[rowname]= {'function':selected_func,'params':selected_params} # Add to dictionary storing choices for all segments

    #################### Plot actuals on same graph as above
    plt.plot(xdata_all, ydata_all, 'bo', label='Real Data', markersize=0.5)
    
    # Format the plot
    plt.xlabel("Days Subscribed")
    plt.ylabel("Daily Revenue")
    plt.title("Segment Decay Curve")
    plt.legend(loc='best')
    plt.show() # Display the plot
    
    #################### Determine the best fitted curve to the real data and integrate to determine LTV
    
    observed_days=max(xdata_all) # Calculate the number of days of actual data we have
    predicted_days=integrate_to_days-observed_days # Calculate the number of days we're predicting

    ###### Select the necessary data ranges
    empirical_x, empirical_y=xdata_all,ydata_all # The actual data observed. This will be both used for visual and for integration
    
    # The forecast into the future: sequentially after the observed data ends, used for the integration
    forecast_x=np.arange(observed_days+1,integrate_to_days + 1)
    forecast_y=selected_func(forecast_x,*selected_params)
    
    # The curve fitted to both the empirical data after the max, and the forecast moving forwards. Used for the visual
    curve_x=np.arange(min(xdata_max),integrate_to_days + 1)
    curve_y=selected_func(curve_x,*selected_params)

    ###### Set -ve forecasts to 0
    forecast_y[forecast_y<0]=0
    curve_y[curve_y<0]=0
    
    ###### Plot and Integrate
    # Plot & integrate empirical data
    plt.plot(empirical_x,empirical_y,c='b') # Plot line in blue
    plt.fill_between(empirical_x,empirical_y, color = 'blue', alpha = 0.3) # Fill in under the graph
    val_actual_curve = trapezoid(empirical_y,empirical_x) # Integrate under the curve
    
    # Plot & integrate the fitted curve
    # NOTE(review): the one-day interval between the last observed day and the first
    # forecast day is not part of either integral - confirm this is acceptable.
    plt.plot(curve_x,curve_y,c='r') # Plot the line in red
    plt.fill_between(forecast_x,forecast_y, color = 'red', alpha = 0.3) # Fill in under the forecasted area only
    val_fitted_curve = trapezoid(forecast_y, forecast_x)
   
    # Generate clean name, add as title and (optionally) save figure
    title=str(rowname) + ": " +str(selected_func.__name__).replace('func_','')
    plt.title(title)
    save_name=title+ '.png'
    #plt.savefig(save_name)
    plt.show()

    LTV = (val_actual_curve + val_fitted_curve)
    print('LTV Value of Cohort: {} is £{:.2f}'.format(str(rowname), LTV))
    print('The observed component (blue) totals £{:.2f} across the first {} days'.format(val_actual_curve,observed_days))
    print('The forecasted component (red) totals £{:.2f} across the next {} days'.format(val_fitted_curve,predicted_days))
    
    LTV_selections.append({'Segment':rowname,
                           'Customers' : Total_Customers,
                           'LTV':round(LTV,2),'LTV_days':integrate_to_days,
                           'LTV_Observed':round(val_actual_curve,2),'LTV_observed_days':observed_days,
                           'LTV_Predicted':round(val_fitted_curve,2),'LTV_predicted_days':predicted_days,
                           'Fitted_Function':str(selected_func.__name__).replace('func_',''),'Fitted_Params':str(selected_params),
                          })

    # Data sets behind the graph, ready for export.
    # Each series gets its own column name: three columns all called 'Avg_Daily_Rev'
    # made the final merge produce duplicate suffixed columns, which newer pandas rejects.
    # All Data
    export_dataset = pd.DataFrame({'Observed_Avg_Daily_Rev':empirical_y} , index=empirical_x) 
    # Curve Data 
    export_dataset2 = pd.DataFrame({'Curve_Avg_Daily_Rev':curve_y} , index=curve_x)
    # Integrate Data
    export_dataset3 = pd.DataFrame({'Forecast_Avg_Daily_Rev':forecast_y} , index=forecast_x)

    export_dataset4 = pd.concat([export_dataset, export_dataset2], axis = 1)

    # Left join on the day index: keeps every observed/curve day, forecast filled where available
    export_dataset5 = export_dataset4.join(export_dataset3, how = 'left')

    #export_dataset5.to_excel(str(rowname) + ' Graph Data.xlsx')
          
LTV_final_df=pd.DataFrame(LTV_selections)
LTV_final_df