# Customer Lifetime Value (LTV)

In [1]:
# Install this package for use of graphical properties
!pip install plotly 

### STEPS
##### 1. Extract the data required
##### 2. Plot actual data - what does this look like? 
##### 3. Fit a curve based on actual data
##### 4. Select best fitted curve
##### 5. Integrate the selected curve - what is the expected tenure per segment?
##### 6. Multiply this by revenue (for now use 5.99 as base)

In [2]:
# modules
import numpy as np
import matplotlib.pyplot as plt
import scipy
from   scipy import optimize
import pandas as pd
from scipy.optimize import curve_fit
import scipy.stats as stats
import sys

# Install/import plotly packages- this package has lots of graphical properties
import plotly.graph_objects as go
import plotly.offline as pyo

from scipy import integrate 
from scipy.stats import norm

from matplotlib.patches import Polygon

# Load custom scripts in reusable_code folder
sys.path.append(r'/home/jupyter/reusable_code')

import google_api_functions as gaf

from google.cloud import bigquery

# Raise pandas' display limits so long/wide frames are rendered without truncation
# (up to 5000 rows and 500 columns).
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 500)


In [3]:
# Obtain Google credentials through the project helper module; the argument is the
# path of the reusable_code folder.
# NOTE(review): presumably the helper reads a key/token file from that folder - confirm
# in google_api_functions.
creds=gaf.Authenticate_Google(r'/home/jupyter/reusable_code/')

#### 1. Data Extraction


In [4]:
# Extracting data to determine LTV
# BigQuery client for the 'itv-bde-analytics-prd' project, authenticated with `creds`.
bq = bigquery.Client(project='itv-bde-analytics-prd',credentials=creds)

In [5]:
from pandas.io import gbq

In [6]:
# Build the per-day, per-segment revenue table used for curve fitting.
#   table1  : daily revenue rows joined to entitlements first started between 2020-04-01 and
#             2020-11-10 (billing provider iTunes or Stripe), left-joined to each britbox_ID's
#             top platform by distinct stream count, kept only when that platform name
#             contains 'connected' (this becomes Segment).
#   table1a : numbers each subscription's rows by date_value -> Day.
#   table2  : per Day and Segment: distinct customers, summed revenue / 100, the average
#             daily revenue per customer, and per-segment maxima (Total_Customers, max_rev).
# Only segments with MORE than 500 customers are returned (final filter is `> 500`).
# NOTE(review): the viewing table is read from the `-dev` project while everything else
# comes from `-prd` - confirm this is intended.

query="""

with table1 as 

(select distinct a.*,concat(a.itvid, a.subscriptionid) as ID, c.Segment, b.signuptime from 

(select distinct  itvid, subscriptionid,  proratedDailyRevenue,date_value from `itv-bde-analytics-prd.britbox_model.LTV_Entlmt_Daily_Revenue`) as a 

inner join 

(select distinct britbox_ID, subscriptionid, subscription.firstStart as signuptime from  `itv-bde-analytics-prd.britbox_analytics.entitlements`
where date(subscription.firstStart) >= '2020-04-01' and billingprovider in ('iTunes', 'Stripe')
and date(subscription.firstStart) <= '2020-11-10'
) as b

on a.itvid = b.britbox_ID 
    and a.subscriptionid = b.subscriptionid 

and date(b.signuptime) <= date(a.date_value) and date(b.signuptime) <= (current_date() -1)

left join 

(select * from (select distinct 
britbox_ID, 
platform_clean as Segment, 
count(distinct stream_ID) as Streams
,row_number() over (partition by britbox_ID order by count(distinct stream_ID) desc) as row_num
from `itv-bde-analytics-dev.britbox_analytics.Viewing_clean`
where open_event_time >= '2020-04-01' and  open_event_time <= '2020-11-10' 
group by 1,2) where row_num = 1 and lower(Segment) like '%connected%' ) as c

on a.itvid = c.britbox_ID

)

,table1a as 

(select * , row_number() over (partition by id , Segment , signuptime order by date_value ) as Day from table1)

 , table2 as
 (
 select distinct  * , proratedDailyRevenue/Customers as avg_dailyrev,
 max(Customers) over (partition by Segment) as Total_Customers,
 max(ifnull(proratedDailyRevenue/Customers, 0.00000000001)) over (partition by Segment) as max_rev 
 from (select distinct 
 Day,
 Segment,
 count(distinct id) as Customers,
 sum(ifnull(proratedDailyRevenue,0.00000000001))/100 as proratedDailyRevenue
from  table1a

group by 1,2
order by 1,2)

order by 1,2,3)

 select * from table2 where Total_Customers > 500

    """
# Run the query and load the full result into a pandas DataFrame.
# The final SELECT has no ORDER BY, so the row order of `df` should not be relied on.
df = bq.query(query ).to_dataframe()

df


In [7]:
# Reshape to one row per Segment and one column per Day (cell values: avg_dailyrev),
# which is the layout the plotting cell below expects.
df_2_pivot = df.pivot_table(
    values='avg_dailyrev',
    index=['Segment'],
    columns=['Day'],
)
df_2_pivot

In [8]:
#df.to_excel('Actual device type graph data.xlsx')

#### 2. Plot data

In [9]:
# Plot the actual data, one line per segment.
# Every row of the pivoted frame holds the y-values of a single segment, which is why the
# data had to be pivoted to one row per segment beforehand.
traces = [
    go.Scatter(
        x=df_2_pivot.columns,
        y=df_2_pivot.loc[segment],
        mode='lines',
        name=segment,
    )
    for segment in df_2_pivot.index
]

# Assemble the figure; leaving it as the last expression renders it in the notebook.
graph = go.Figure(data=traces)
#graph.update_xaxes(type="category",)
graph

In [10]:
# Draw Plot
# Sort the frame by Day once and take BOTH series from the sorted frame. Sorting only the
# x values (as `sorted(df.Day)` did) breaks the pairing between each Day and its
# Customers value.
df_by_day = df.sort_values('Day')
x = df_by_day.Day
y = df_by_day.Customers

def plot_df(df, x, y, title="Daily Revenue", xlabel='Date', ylabel='Value', dpi=100):
    """Draw a single red line of ``y`` against ``x`` on a 16x5 inch matplotlib figure.

    Parameters
    ----------
    df : not used by the function; kept so existing calls keep working.
    x, y : sequences of equal length holding the x and y coordinates.
    title, xlabel, ylabel : text for the figure title and axis labels.
    dpi : figure resolution passed to ``plt.figure``.
    """
    plt.figure(figsize=(16,5), dpi=dpi)
    plt.plot(x, y, color='tab:red')
    plt.gca().set(title=title, xlabel=xlabel, ylabel=ylabel)
    plt.show()

# Axis labels describe the columns actually plotted (Day vs Customers).
# NOTE(review): the title still says 'Daily Revenue' although the y series is Customers -
# confirm which of the two was intended.
plot_df(df, x, y, title='Daily Revenue', xlabel='Day', ylabel='Customers')

#### 3. Curve Fitting

In [11]:
# Determine the fitted curves per segment and plot this 

# This is used for the curve-fitting procedure later, which requires the function as an input
def func_expdecay(xdata, a, b ,k):
    """Exponential decay with offset: ``a * exp(-b * xdata) + k``.

    ``a`` is the amplitude, ``b`` the decay rate and ``k`` the value added to every point.
    """
    decay = np.exp(-b * xdata)
    return a * decay + k

#def func_exp2(xdata, a, b, c, d):
#    return a*np.exp(b*xdata + c) + d 

def func_log(xdata,a,b):
    """Negative logarithmic curve: ``-a * ln(b * xdata)``.

    The result is NaN wherever ``b * xdata`` is negative.
    """
    log_term = np.log(b * xdata)
    return a * -log_term

#def func_linear(xdata,a,b):
#    return a * xdata +b  #mx +c

def func_normal(xdata,a,mu,std):
    """Gaussian bell curve with peak height ``a``, centre ``mu`` and width ``std``.

    Computes ``a * exp(-(xdata - mu)**2 / (2 * std**2))``; the peak value at
    ``xdata == mu`` is ``a`` (the curve is not normalised to unit area).
    """
    squared_dev = (xdata - mu) ** 2
    exponent = -squared_dev / (2 * std ** 2)
    return a * np.exp(exponent)

def func_weib(xdata,n,a):
    """Weibull density with scale ``n`` and shape ``a``.

    Computes ``(a / n) * (xdata / n)**(a - 1) * exp(-(xdata / n)**a)``.
    """
    prefactor = a / n
    power_term = (xdata / n) ** (a - 1)
    decay_term = np.exp(-(xdata / n) ** a)
    return prefactor * power_term * decay_term

def func_lognormal(xdata, a, mu, std):
    """Log-normal density in ``xdata`` (log-mean ``mu``, log-std ``std``) scaled by ``a``.

    Computes ``a / (xdata * std * sqrt(2*pi)) * exp(-(ln(xdata) - mu)**2 / (2 * std**2))``.
    """
    normaliser = 1.0 / (xdata * std * np.sqrt(2.0 * np.pi))
    log_dev_sq = (np.log(xdata) - mu) ** 2.0
    exponent = -1.0 * (log_dev_sq / (2.0 * (std ** 2.0)))
    return a * (normaliser * np.exp(exponent))

def func_power(x, a, b):
    """Power-law curve: ``a * x**b``."""
    raised = x ** b
    return a * raised

In [12]:
# Candidate curve shapes fitted to every segment. The fitting loop selects the winner by
# its position in this list, so the order matters for index-based lookups.
function = [
    func_expdecay,
    func_log,
    func_normal,
    func_weib,
    func_lognormal,
    func_power,
]

In [13]:
def fit_test(ydata,y2,test='CHI2'):
    """Score how closely the fitted values ``y2`` match the observed values ``ydata``.

    Parameters
    ----------
    ydata : observed values.
    y2 : fitted values, same length as ``ydata``.
    test : 'RMSE' or 'CHI2'.

    Returns
    -------
    * test == 'RMSE': the root-mean-square error (a float).
    * test == 'CHI2' and every observed value is greater than 5: the result object of
      ``scipy.stats.chisquare`` (statistic and p-value). The > 5 requirement follows
      https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chisquare.html
    * anything else: prints 'check conditions' and returns None.
    """
    if test == 'RMSE':
        residuals = ydata - y2
        return np.sqrt(np.mean(residuals ** 2))
    if test == 'CHI2' and min(ydata) > 5:
        return stats.chisquare(f_obs=ydata, f_exp=y2)
    print('check conditions')
    return None

In [14]:


# Fit-quality metric handed to fit_test() for every candidate curve
test_type = "RMSE"
# Accumulators filled while fitting: fitted parameters (list_1) and their covariance
# matrices (list_2), one entry per segment/curve combination
list_1 = []
list_2 = []
list_3 = []  # not used anywhere in the code visible in this notebook
LTV_dataset3 = pd.DataFrame()  # ends up holding one summary row per segment
# Per-segment winners keyed by segment name: best-scoring curve function and its parameters
best_curves = {}
best_params = {}

# Determine the best fitted curve to the real data and integrate to determine LTV
def plot_best_values(data,best_params,best_curve):
    """Plot one segment's actual curve together with its best fitted curve.

    The actual data is drawn in blue (area shaded blue); the fitted curve is drawn in red
    from the first fitted day up to day 1095, and the part beyond the last actual day is
    shaded red (the forecast region).

    Parameters
    ----------
    data : not used by the function; kept so existing calls keep working.
    best_params : dict mapping segment name -> fitted parameters of the best curve.
    best_curve : dict mapping segment name -> best curve function.

    Notes
    -----
    Relies on notebook-level variables set by the fitting loop for the segment being
    processed: ``rowname``, ``xdata_all``, ``ydata_all`` and ``xdata_max``.
    """
    horizon_day = 1095  # 3 years of data

    # Use the dictionaries passed in rather than reaching for the notebook globals.
    func = best_curve[rowname]
    params = best_params[rowname]

    # Evaluate the fitted curve once, on a real numpy array (a bare `range` only works
    # through implicit numpy coercion); negative revenue is clipped to zero.
    x2 = np.arange(min(xdata_max), horizon_day + 1, dtype=float)
    y2 = func(x2, *params)
    y2[y2 < 0] = 0

    last_actual_day = max(xdata_all)  # hoisted: previously recomputed for every element

    plt.plot(xdata_all,ydata_all,c='b')
    plt.plot(x2,y2,c='r')
    plt.title(rowname + ": " + func.__name__)
    plt.fill_between(xdata_all, ydata_all, where = [(x >= 0) and (x <= last_actual_day) for x in xdata_all], color = 'blue', alpha = 0.3)
    plt.fill_between(x2, y2, where = [(x > last_actual_day) for x in x2 ], color = 'red', alpha = 0.3)
    save_name = rowname + ": " + func.__name__ + '.png'
    #plt.savefig(save_name)
    plt.show()
    

def mean_absolute_percentage_error(ydata_max, y2): 
    """Return the mean absolute percentage error as a fraction (0.25 means 25%).

    Computes ``mean(|actual - fitted| / |actual|)``.

    Parameters
    ----------
    ydata_max : observed values (any array-like).
    y2 : fitted values, same length as ``ydata_max`` (any array-like).

    Observed values of zero lead to a division by zero, so the result is then inf or NaN.
    """
    # Convert BOTH inputs; the fitted values were previously assigned to a misspelled,
    # unused name and never converted.
    actual, fitted = np.asarray(ydata_max), np.asarray(y2)
    return np.mean(np.abs((actual - fitted) / actual))
    
# Last day (counted from sign-up) up to which the fitted curve is extrapolated
forecast_horizon_day = 1095  # 3 years

# Trapezoidal integration that works across SciPy versions: `trapz` was renamed to
# `trapezoid`, and the old name no longer exists in recent releases.
_trapezoid = getattr(integrate, 'trapezoid', None) or integrate.trapz

ltv_frames = []  # one single-row summary frame per segment, combined after the loop

for n,rowname in enumerate(df_2_pivot.index) : 
    
    print('Segment: ' + str(rowname))
    errors = []       # raw fit_test() result per candidate curve
    param_list= []    # fitted parameters per candidate curve
    fitted_list = []  # fitted values per candidate curve, on the observed days
    
    sub_df = df.loc[df['Segment']==rowname, ['Day','Segment','avg_dailyrev','max_rev','Total_Customers']]
    
    # The query's final SELECT has no ORDER BY, so order by Day explicitly: the positional
    # slice below ("everything from the peak onwards") depends on it.
    sub_df = sub_df.sort_values('Day').reset_index(drop = True)

    # Find Index in which avg_dailyrev = max_rev
    Starting_Curve_Value = sub_df[sub_df.avg_dailyrev==sub_df.max_rev].index.values
    Starting_Curve_Value2 = Starting_Curve_Value[0]
    
    # Subset this data so you start the curve fitting from the max value
    sub_df2 = sub_df.iloc[Starting_Curve_Value2:]

    sub_pivot=pd.pivot_table(sub_df2,  values = 'avg_dailyrev' , index = ['Segment'], columns = ['Day'])
    df_pivot=pd.pivot_table(sub_df,  values = 'avg_dailyrev' , index = ['Segment'], columns = ['Day'])
    
    Total_Customers = max(sub_df.Total_Customers)

    # Same for every candidate curve, so built once per segment.
    # *_all = every observed day, *_max = observed days from the revenue peak onwards.
    xdata_all = np.array(df_pivot.columns)
    ydata_all = np.array(df_pivot.loc[rowname])
    xdata_max = np.array(sub_pivot.columns)
    ydata_max = np.array(sub_pivot.loc[rowname])

    # Each fit is scored on the observed days themselves, so every fitted value is
    # compared with the observation of the same day.
    x2 = np.asarray(xdata_max, dtype=float)
    
    for func in function : 
        curve_name = func.__name__

        # Initial parameter guess per curve shape
        if curve_name == 'func_expdecay':
            p0 = (0.2,0.1, 1.5)
        elif curve_name == 'func_normal':
            mu, std = norm.fit(xdata_max)
            p0 = (85,mu,std)
        elif curve_name == 'func_lognormal':
            mu, std = norm.fit(xdata_max)
            p0 = (10,mu,std)
        elif curve_name in ('func_linear', 'func_log'):
            p0 = (2,3)
        elif curve_name in ('func_weib', 'func_power'):
            p0 = (1,1)
    
        params, params_covariance = optimize.curve_fit(func, xdata_max, ydata_max,p0, maxfev=100000,method='dogbox') 
        
        list_1.append(params)
        list_2.append(params_covariance)
        
        y2 = func(x2,*params)
    
        # Account for when y values trend into negatives as we wouldn't want a negative revenue
        y2[y2 < 0] = 0
    
        fit_error = fit_test(ydata_max,y2,test=test_type)
        
        errors.append(fit_error)
        param_list.append(params)
        fitted_list.append(y2)
        
    # Rank the candidates on a scalar score. A chi-square result is a (statistic, p-value)
    # tuple, so its statistic is used; missing or non-finite scores (None, NaN) are treated
    # as the worst possible fit instead of accidentally winning np.argmin.
    scores = np.array([e[0] if isinstance(e, tuple) else e for e in errors], dtype=float)
    scores[~np.isfinite(scores)] = np.inf
    best_fit_idx = int(np.argmin(scores))
    best_curves[rowname] = function[best_fit_idx]
    best_params[rowname] = param_list[best_fit_idx]

    # From here on work with the WINNING curve. After the inner loop `func`, `params` and
    # `y2` would otherwise still belong to the last candidate in `function`.
    func = best_curves[rowname]
    params = best_params[rowname]
    y2 = fitted_list[best_fit_idx]
    
    percentage_error = "{:.2%}".format(mean_absolute_percentage_error(ydata_max, y2))

    # Forecast region: from the last day with actual data up to the horizon
    x3 = np.arange(int(max(xdata_all)), forecast_horizon_day + 1)
    y3 = func(x3.astype(float),*params)
    y3[y3 < 0] = 0 
        
    # Integrate actual data
    val_actual_curve = round(_trapezoid(ydata_all,xdata_all),2)
      
    # Integrate forecasted data between end of actual and 'end'
    val_fitted_curve = round(_trapezoid(y3, x3),2)
       
    LTV = round((val_actual_curve + val_fitted_curve),2)
        
    print(str(rowname) + ' Integrated Value for Actual Data : ' + str(val_actual_curve))
    print(str(rowname) + ' Integrated Value for Forecast Data : ' + str(val_fitted_curve))
    print(str(rowname) + ' LTV Value : ' + str(LTV))
                
    # Data sets for export. Each series gets its own column name so that combining them
    # does not produce duplicate column labels.
    # All Data
    export_dataset = pd.DataFrame({'Avg_Daily_Rev':ydata_all} , index=xdata_all) 
    # Curve Data (best fit on the observed days)
    export_dataset2 = pd.DataFrame({'Fitted_Avg_Daily_Rev':y2} , index=xdata_max)
    # Integrate Data (forecast region)
    export_dataset3 = pd.DataFrame({'Forecast_Avg_Daily_Rev':y3} , index=x3)
    export_dataset4 = pd.concat([export_dataset, export_dataset2], axis = 1)
    export_dataset5 = pd.merge(export_dataset4, export_dataset3, left_index=True, right_index = True, how = 'left')
    #export_dataset5.to_excel(rowname + ' Graph Data.xlsx')
        
    plot_best_values(df,best_params,best_curves)
    
    LTV_dataset2 = pd.DataFrame({"Segment" : rowname, 'LTV': [LTV], 'Total Customers': [Total_Customers], 'Mean absolute % error' : [percentage_error]}) 
    ltv_frames.append(LTV_dataset2)

# Combine the per-segment rows in one step (DataFrame.append no longer exists in current
# pandas, and growing a frame row by row is quadratic).
if ltv_frames:
    LTV_dataset3 = pd.concat(ltv_frames, ignore_index=True)

print(LTV_dataset3)   
