# Customer lifetime value

In [1]:
# Install this package for use of graphical properties
!pip install plotly 

### STEPS
##### 1. Extract the data required
##### 2. Plot actual data - what does this look like? 
##### 3. Fit a curve based on actual data
##### 4. Select best fitted curve
##### 5. Integrate the fitted curve: what is the expected tenure per segment?
##### 6. Multiply this by revenue (for now use 5.99 as base)

In [1]:
# modules
import numpy as np
import matplotlib.pyplot as plt
import scipy
from   scipy import optimize
import pandas as pd
from scipy.optimize import curve_fit
import scipy.stats as stats
import sys

# Install/import plotly packages- this package has lots of graphical properties
import plotly.graph_objects as go
import plotly.offline as pyo

# Load custom scripts in reusable_code folder
# NOTE(review): this is an absolute path, so the import of google_api_functions
# below presumably only resolves on a machine that has this exact folder — confirm.
sys.path.append(r'/home/jupyter/reusable_code')

import google_api_functions as gaf

from google.cloud import bigquery

# Raise pandas' display limits (up to 5000 rows / 500 columns) so large frames
# and wide pivots are rendered without truncation.
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 500)


In [2]:
# Obtain Google credentials through the shared helper module. The folder path is
# handed straight to gaf.Authenticate_Google; what it reads from there is not
# visible in this notebook.
creds = gaf.Authenticate_Google(r'/home/jupyter/reusable_code/')

#### 1. Data Extraction


In [3]:
# BigQuery client used to extract the data for the LTV calculation
bq = bigquery.Client(
    project='itv-bde-analytics-prd',
    credentials=creds,
)

In [4]:
from pandas.io import gbq

In [5]:


# Extraction query.
#   table1: joins every LTV_Daily_Revenue row (a) to the distinct
#           (user_ID, signuptime, level2) rows of last_click_referrer (b) on
#           itvid = user_ID, keeping rows dated on/after the signup date and
#           signups up to yesterday. `Day` is a running row number per
#           (user_ID, signuptime, level2), ordered by date_value.
#   result: one row per (Day, level2) with the distinct customer count, the
#           summed proratedDailyRevenue, and avg_dailyrev = revenue / customers
#           (NULL replaced by 0), ordered by Day then level2.
query="""

with table1 as 

(select a.*,b.level2, row_number() over (partition by user_ID, signuptime, level2 order by date_value) as Day from 

(select *  from `itv-bde-analytics-prd.britbox_model.LTV_Daily_Revenue`) as a 

inner join 

(select distinct user_ID, signuptime , level2 from  `itv-bde-analytics-prd.britbox_mart.last_click_referrer`
--where level2 = 'CRM'  and extract(month from signuptime ) = 4
) as b

on a.itvid = b.user_ID and date(b.signuptime) <= date(a.date_value) and date(b.signuptime) <= (current_date() -1))

(
select distinct  * , ifnull(proratedDailyRevenue/Customers, 0) as avg_dailyrev from (select distinct 
Day,
level2,
count(distinct itvid) as Customers,
sum(ifnull(proratedDailyRevenue,0)) as proratedDailyRevenue
from  table1
group by 1,2
order by 1,2)
order by 1,2)

    """
# Run the query and pull the whole result set into a pandas DataFrame
df = bq.query(query ).to_dataframe()

# Display the extracted frame (columns: Day, level2, Customers, proratedDailyRevenue, avg_dailyrev)
df


In [6]:
#date_value2 = df.date_value.astype(str)
#date_value2

In [7]:
#from matplotlib.dates import date2num
#date_value3 = date2num(df['date_value'])

In [8]:
#df['Date_Value2'] = date_value2
#df['Date_Value3'] = date_value3
#df = df.reset_index()
#df['index'] = range(1,len(df)+1)
#df

In [9]:
# Reshape to one row per segment (level2) and one column per Day, holding
# avg_dailyrev, which is the layout the line charts below expect.
df_2_pivot = df.pivot_table(
    values='avg_dailyrev',
    index=['level2'],
    columns=['Day'],
)
df_2_pivot

#### 2. Plot data

In [10]:
# Plot the actual data: one line per segment.
# Each pivot row supplies the y-values of one trace, which is why the data had
# to be pivoted to a single row per segment first.
traces = []
for segment_name in df_2_pivot.index:
    segment_trace = go.Scatter(
        x=df_2_pivot.columns,
        y=df_2_pivot.loc[segment_name],
        mode='lines',
        name=segment_name,
    )
    traces.append(segment_trace)

# Assemble the figure from the per-segment traces and display it
graph = go.Figure(data=traces)
#graph.update_xaxes(type="category",)
graph

In [11]:
# Draw Plot
# Sort the frame once and take both axes from the sorted rows, so every Day stays
# paired with its own avg_dailyrev. Sorting x on its own would silently mis-pair
# the points whenever df is not already in Day order. mergesort is stable, so
# rows sharing a Day keep their original relative order.
df_by_day = df.sort_values('Day', kind='mergesort')
x = df_by_day['Day'].tolist()
y = df_by_day['avg_dailyrev']

def plot_df(df, x, y, title="Daily Revenue", xlabel='Date', ylabel='Value', dpi=100):
    """Draw y against x as a single red line on a wide (16x5) figure and show it.

    `df` is accepted for call-site compatibility but is not read; the plotted
    values come solely from `x` and `y`.
    """
    plt.figure(figsize=(16, 5), dpi=dpi)
    axes = plt.gca()
    axes.plot(x, y, color='tab:red')
    axes.set(title=title, xlabel=xlabel, ylabel=ylabel)
    plt.show()

# Overview chart: all rows of df (every segment) drawn as one red line
plot_df(df, x, y, title='Daily Revenue')  

#### 3. Curve Fitting

In [12]:
# Determine the fitted curves per segment and plot this 

# This is used for the curve-fitting procedure later, which requires the function as an input
def func_expdecay(xdata, a, b ,k):
    """Exponential decay with offset: a * exp(-b * x) + k."""
    decay = np.exp(-b * xdata)
    return a * decay + k

def func_log(xdata,a,b):
    """Negative logarithmic curve: -a * ln(b * x)."""
    log_term = np.log(b * xdata)
    return a * -log_term

def func_linear(xdata,a,b):
    """Straight line with slope `a` and intercept `b` (y = m*x + c)."""
    slope_part = a * xdata
    return slope_part + b

def func_normal(xdata,a,mu,std):
    """Gaussian bump of height `a`, centred on `mu`, with width `std`."""
    squared_deviation = (xdata - mu) ** 2
    two_variance = 2 * std ** 2
    return a * (np.exp(-squared_deviation / two_variance))

def func_weib(xdata,a ):
    """Weibull-shaped curve with shape parameter `a`: a * x**(a-1) * exp(-x**a)."""
    power_term = xdata ** (a - 1)
    decay_term = np.exp(-(xdata ** a))
    return a * power_term * decay_term

def func_lognormal(xdata, a, mu, std):
    """Log-normal density in x (log-mean `mu`, log-std `std`) scaled by `a`."""
    normaliser = 1.0 / (xdata * std * np.sqrt(2.0 * np.pi))
    exponent = -1.0 * (((np.log(xdata) - mu) ** 2.0) / (2.0 * (std ** 2.0)))
    return a * (normaliser * np.exp(exponent))

In [13]:
# Candidate curve shapes, tried in this order when fitting each segment
function = [
    func_expdecay,
    func_log,
    func_linear,
    func_normal,
    func_weib,
    func_lognormal,
]

In [14]:
def fit_test(ydata,y2,test='CHI2'):
    """Score how well fitted values `y2` match observed values `ydata`.

    Parameters
    ----------
    ydata : array-like
        Observed values.
    y2 : array-like
        Fitted/expected values, same length as `ydata`.
    test : {'CHI2', 'RMSE'}
        'CHI2' runs scipy.stats.chisquare, but only when every observed value
        is greater than 5 (chisquare needs large frequencies, ideally > 5, see
        https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chisquare.html).
        'RMSE' returns the root-mean-square error.

    Returns
    -------
    The chisquare result, the RMSE as a float, or None (after printing
    'check conditions') when the requested test cannot be applied.
    """
    # Coerce once so plain lists / Series work as well as numpy arrays
    observed = np.asarray(ydata, dtype=float)
    expected = np.asarray(y2, dtype=float)

    if (test == 'CHI2') and (observed.min() > 5):
        return stats.chisquare(f_obs=observed, f_exp=expected)
    if test == 'RMSE':
        return np.sqrt(np.mean((observed - expected) ** 2))

    # Unknown test name, or CHI2 requested on data with small frequencies
    print('check conditions')
    return None

In [15]:
from scipy.stats import norm

In [33]:
# Segment labels (the level2 values) that index the pivot
df_2_pivot.index

In [35]:
# (number of segments, number of Day columns)
df_2_pivot.shape

In [42]:
# Single-segment example: pivot only the 'CRM BB' rows, so the Day columns are
# limited to the days that segment actually has data for.
is_crm_bb = df['level2'] == 'CRM BB'
sub_df = df.loc[is_crm_bb, ['Day', 'level2', 'avg_dailyrev']]
sub_pivot = sub_df.pivot_table(
    values='avg_dailyrev',
    index=['level2'],
    columns=['Day'],
)
sub_pivot

In [32]:
# 'CRM BB' row of the full pivot; dropna() removes the row altogether if any
# Day column is NaN for it, so an empty result means the segment has gaps.
df_2_pivot[df_2_pivot.index=='CRM BB'].dropna()

In [44]:

# Fit every candidate curve to every segment, score each fit, and remember the
# best-scoring curve (and its parameters) per segment.
test_type = "RMSE"
list_1 = []  # parameters of every successful fit, in fit order
list_2 = []  # covariance matrices matching list_1
list_3 = []  # not filled in this cell; kept so the name stays defined

best_curves = {}  # segment -> best-fitting function
best_params = {}  # segment -> parameters of that function

# Fixed initial guesses per curve. The normal/lognormal guesses depend on the
# data and are built inside the loop. A function missing from this table gets
# p0=None, which lets curve_fit start every parameter at 1.
initial_guesses = {
    'func_expdecay': (0.2, 0.1, 1.5),
    'func_linear': (2, 3),
    'func_log': (2, 3),
    'func_weib': (2,),
}

for rowname in df_2_pivot.index:
    errors = []      # one score per entry of `function` (np.inf = unusable fit)
    param_list = []  # parameters aligned with `errors` (None = fit failed)

    # Pivot this segment on its own so only the days it actually has are used
    sub_df = df[['Day','level2','avg_dailyrev']][df['level2']==rowname]
    sub_pivot = pd.pivot_table(sub_df,  values = 'avg_dailyrev' , index = ['level2'], columns = ['Day'])

    # Plain float arrays, so integer/nullable column labels cannot upset the optimiser
    xdata = np.asarray(sub_pivot.columns, dtype=float)
    ydata = np.asarray(sub_pivot.loc[rowname], dtype=float)

    # Whole-number grid 1..n (n = number of observed days) used to draw the fitted curves
    x2 = np.linspace(1, len(ydata), len(ydata))

    for func in function:

        print(rowname)
        print(func)

        if func.__name__ in ('func_normal', 'func_lognormal'):
            mu, std = norm.fit(xdata)
            p0 = (1000, mu, std)
        else:
            p0 = initial_guesses.get(func.__name__)

        try:
            params, params_covariance = optimize.curve_fit(
                func, xdata, ydata, p0, maxfev=100000, method='dogbox'
            )
        except Exception as exc:
            # Best effort: a curve that cannot be fitted is skipped, never scored
            # with the parameters left over from the previous curve.
            print("Failed to fit function {} to segment {}".format(func,rowname))
            print(repr(exc))
            errors.append(np.inf)
            param_list.append(None)
            continue

        print(params)
        list_1.append(params)
        list_2.append(params_covariance)
        y2 = func(x2, *params)  # fitted curve on the plotting grid

        # Score the fit at the x-values it was fitted on, so every fitted value
        # is compared with the observation for the same day.
        fit_error = fit_test(ydata, func(xdata, *params), test=test_type)
        print(test_type,fit_error)

        # Reduce the result to one comparable number: chisquare results expose
        # `.statistic`; None/NaN/inf mean "unusable". np.argmin would otherwise
        # pick a NaN score as the minimum.
        score = getattr(fit_error, 'statistic', fit_error)
        if score is None or not np.isfinite(score):
            score = np.inf

        errors.append(score)
        param_list.append(params)

        # No explicit colour: Matplotlib's colour cycle gives each curve a
        # distinct colour that is the same on every run.
        plt.plot(x2, y2, label=func.__name__)
        plt.xlabel("Days Subscribed")  # x is the Day index, not months
        plt.ylabel("Daily Revenue")
        plt.title("Segment Decay Curve")

    best_fit_idx = int(np.argmin(errors))
    if np.isfinite(errors[best_fit_idx]):
        best_curves[rowname] = function[best_fit_idx]
        best_params[rowname] = param_list[best_fit_idx]
    else:
        print("No candidate curve could be fitted to segment {}".format(rowname))

    # Indentation controls the output: finishing the figure inside the segment
    # loop gives one chart per segment; dedenting these lines out of the loop
    # would put every segment on a single chart.
    plt.plot(xdata, ydata, 'bo', label='Real Data')
    plt.legend(loc='best')
    plt.show()


#### 4. Select best fitted curve

In [46]:
def plot_est_values(data,best_params,best_curve):
    """Plot each segment's observed values against its best-fitting curve.

    Works only from its arguments, so the result does not depend on variables
    left behind by the fitting loop.

    Parameters
    ----------
    data : DataFrame
        Must contain the columns 'level2' (segment), 'Day' and 'avg_dailyrev'.
    best_params : dict
        Segment -> fitted parameters for that segment's best curve.
    best_curve : dict
        Segment -> best-fitting curve function.

    One figure is shown per segment in `best_curve`: observed points in blue,
    the fitted curve evaluated on days 1..500 in red.
    """
    x2 = np.linspace(1, 500, 500)  # whole days 1..500
    for segment, func in best_curve.items():
        params = best_params[segment]
        observed = data.loc[data['level2'] == segment].sort_values('Day')
        func_out = func(x2, *params)
        plt.scatter(observed['Day'], observed['avg_dailyrev'], c='b')
        plt.scatter(x2, func_out, c='r')
        #plt.xticks(df["date_value"])
        plt.title(str(segment) + ": " + func.__name__)
        plt.show()

In [47]:
# Plot the best fitted curve against the observed data
plot_est_values(df,best_params,best_curves)

#### 5. Integrate the fitted curve: what is the expected tenure per segment?

In [109]:
from scipy import integrate

# Area under each segment's best-fitting curve (trapezoidal rule).
# The curve comes from best_curves/best_params rather than from whichever
# y2/x2 the fitting loop happened to leave behind (that would be the last
# candidate tried for the last segment, not the best fit).
# scipy.integrate.trapz was removed in SciPy 1.14; only fall back to it on old versions.
_trapezoid = getattr(integrate, 'trapezoid', None) or integrate.trapz

areas = {}
for segment, curve in best_curves.items():
    # Integrate over days 1..n, where n is the number of observed days for the segment
    n_days = df.loc[df['level2'] == segment, 'Day'].nunique()
    days = np.linspace(1, n_days, n_days)
    areas[segment] = _trapezoid(curve(days, *best_params[segment]), days)

# NOTE(review): the fitted quantity is avg_dailyrev, so this area looks like
# revenue per customer rather than tenure — confirm before multiplying by a price.
val = pd.Series(areas, name='area_under_best_curve')
val



#### 6. Multiply this by revenue (for now use 5.99 as base)

In [21]:

# Multiply this by 5.99 to estimate LTV per customer
# To do: determine a methodology to make revenue more accurate instead of hard-coded
#LTV = val * 5.99
#LTV