# Customer Lifetime Value

In [69]:
# Install this package for use of graphical properties
# !pip install plotly 

### STEPS
##### 1. Extract the data required
##### 2. Plot actual data - what does this look like? 
##### 3. Fit a curve based on actual data
##### 4. Integrate the fitted curve from step 3: what is the expected tenure per segment?
##### 5. Multiply this by revenue (for now use 5.99 as the base)

In [1]:
# modules
import numpy as np
import matplotlib.pyplot as plt
from   scipy import optimize
import pandas as pd
from scipy.optimize import curve_fit
import scipy.stats as stats
import sys

# Install/import plotly packages- this package has lots of graphical properties
import plotly.graph_objects as go
import plotly.offline as pyo

# Load custom scripts in reusable_code folder
sys.path.append(r'/home/jupyter/reusable_code')

import google_api_functions as gaf

from google.cloud import bigquery

In [2]:
# Authenticate with Google through the shared helper module (google_api_functions).
# The argument is the reusable_code directory; presumably the helper looks for
# credential files there — TODO confirm against the helper's implementation.
creds=gaf.Authenticate_Google(r'/home/jupyter/reusable_code/')

#### 1. Data Extraction

In [3]:
# Extracting data to determine LTV
# BigQuery client bound to the analytics project, using the `creds` object created earlier in the notebook.
bq = bigquery.Client(project='itv-bde-analytics-prd',credentials=creds)

In [4]:
# Cohort retention query.
#   * `tablea`: distinct customers per (joining month, subscription month, segment),
#     plus `row_num`, the position of the subscription month within each
#     (joining month, segment) cohort.
#   * final select: adds `max` (largest customer count in the cohort) and
#     `pct_remaining` = customers / max, rounded to 2 decimals.
#
# The yearmonth labels are built with concat(year, month) and are NOT zero-padded
# ('20201' = Jan 2020, '202010' = Oct 2020). Sorting those strings lexically puts
# October-December before February, so `row_num` is ordered on the numeric year and
# month parsed back out of the label instead of on the raw string.
query="""
with tablea as  (
select distinct * , row_number() over (
    partition by yearmonth_joined, segment
    order by cast(substr(yearmonth_sub, 1, 4) as int64), cast(substr(yearmonth_sub, 5) as int64)
  ) as row_num
  from 
  (
  SELECT
    DISTINCT 
    concat(extract(year from account.firststart),extract(month from account.firststart)) as yearmonth_joined,
    concat(extract(year from openEntitlement.start),extract(month from openEntitlement.start)) as yearmonth_sub,
    b.level2 as segment,
    COUNT( DISTINCT britbox_ID ) AS Customers
  FROM
    `itv-bde-analytics-prd.britbox_analytics.entitlements` as a left join `itv-bde-svod-prd.reporting.Sign_up_Funnel_web` as b
    on a.britbox_ID = b.user_ID
        where billingprovider in ('Stripe','iTunes') and date(account.firststart) >= '2019-11-07' and 
        date(openEntitlement.start) <= DATE_SUB(DATE_TRUNC(CURRENT_DATE(), MONTH), INTERVAL 1 DAY)
  GROUP BY
    1,2,3
    order by 1,2,3)
    where yearmonth_sub is not null
    )
    
  
  (select distinct 
  *, round(customers / max , 2) as pct_remaining from
  
    (select distinct *, max(customers) over (partition by segment, yearmonth_joined ) as max
  from tablea
)
where yearmonth_joined is not null and yearmonth_sub is not null
  order by yearmonth_joined, yearmonth_sub, segment  )
"""

In [5]:
# create dataframe from BQ query
def create_df(results=None):
    """Build the cohort DataFrame from BigQuery result rows.

    Parameters
    ----------
    results : iterable of row-like objects, optional
        Each row must support ``row[<column name>]`` for the query's output
        columns. When omitted, the notebook-level ``query_results`` variable is
        used, which keeps the original zero-argument call working.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: yearmonth_joined, yearmonth_sub, segment, Customers,
        row_num, max_, pct_remaining. The query's ``max`` column is exposed as
        ``max_``.
    """
    if results is None:
        results = query_results  # fall back to the notebook global

    # Materialise once: the input may be a single-pass iterator, and every
    # output column needs its own pass over the rows.
    rows = list(results)

    # DataFrame column name -> key to read from each result row.
    column_sources = {
        'yearmonth_joined': 'yearmonth_joined',
        'yearmonth_sub': 'yearmonth_sub',
        'segment': 'segment',
        'Customers': 'Customers',
        'row_num': 'row_num',
        'max_': 'max',
        'pct_remaining': 'pct_remaining',
    }
    return pd.DataFrame({
        column: [row[key] for row in rows]
        for column, key in column_sources.items()
    })

In [6]:
# Run the query and keep the result object; create_df() iterates over this global.
# NOTE(review): the result is presumably a single-pass iterator, so this cell may need
# re-running before create_df() is called a second time — confirm.
query_results = bq.query(query).result()

In [27]:
# Same query loaded straight into a DataFrame for a quick look at the first rows.
# NOTE(review): this submits the query again rather than reusing the earlier result.
x=bq.query(query).to_dataframe()
x.head()

In [7]:
# Build the working DataFrame from the query results held in `query_results`.
df = create_df()

In [8]:
df.head()

In [10]:
# Work with a single joining cohort as an example: '20201' is year 2020 + month 1,
# in the un-padded concat(year, month) format produced by the query.
# Later: decide how to run the same steps across multiple joining months.
yearmonth_selected = df['yearmonth_joined'].eq('20201')
df_2 = df.loc[yearmonth_selected]
df_2.head()

In [11]:
# Reshape to one row per segment and one column per row_num so that each
# segment can be drawn as its own series.
df_2_pivot = df_2.pivot_table(
    values='pct_remaining',
    index=['segment'],
    columns=['row_num'],
)
df_2_pivot

#### 2. Plot data

In [12]:
# Plot the actual data: one marker series per segment.
# The pivoted frame has one row per segment, so each row supplies the y values
# for a separate trace and the column labels supply the shared x values.
traces = [
    go.Scatter(
        x=df_2_pivot.columns,
        y=segment_values,
        mode='markers',
        name=segment_name,
    )
    for segment_name, segment_values in df_2_pivot.iterrows()
]

# Assemble the traces into a figure and display it
graph = go.Figure(data=traces)
#graph.update_xaxes(type="category",)
graph

#### 3. Curve Fitting

In [13]:
# Determine the fitted curves per segment and plot this 

# Define the exponential function>> 
# This is used for the curve-fitting procedure later, which requires the function as an input
def func_expdecay(xdata, a, b ,k):
    """Exponential decay model: ``a * exp(-b * xdata) + k``.

    ``a`` scales the decaying term, ``b`` is the decay rate and ``k`` is the
    constant offset added to the result.
    """
    decay = np.exp(-b * xdata)
    return a * decay + k

def func_log(xdata,a,b,k):
    """Logarithmic model: ``a * -log(b * xdata) + k``.

    ``b`` scales the input before the natural log is taken, ``a`` scales the
    negated log and ``k`` is the constant offset.
    """
    scaled_input = b * xdata
    return a * -np.log(scaled_input) + k

def func_linear(xdata,a,b,k):
    """Linear model: ``a * xdata + k + b`` (i.e. mx + c).

    ``a`` is the slope; ``k`` and ``b`` are both added as constant offsets, so
    together they form the intercept.
    """
    slope_term = a * xdata
    return slope_term + k + b

In [14]:
# Candidate model functions; the curve-fitting loop tries each one for every segment.
function = [func_expdecay, func_log, func_linear]

In [15]:
def fit_test(ydata,y2,test='CHI2'):
    """Score how well fitted values ``y2`` match observed values ``ydata``.

    Parameters
    ----------
    ydata : array-like
        Observed values.
    y2 : array-like
        Fitted values, same length as ``ydata``.
    test : str
        ``'CHI2'`` or ``'RMSE'``.

    Returns
    -------
    * ``test == 'CHI2'`` and every observed value is greater than 5: the result
      of ``scipy.stats.chisquare`` (statistic and p-value). Chi-square needs
      large frequencies, ideally greater than 5
      (ref: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chisquare.html).
    * ``test == 'RMSE'``: the root-mean-square error as a float.
    * anything else (unknown ``test``, or CHI2 with small observed values):
      prints ``'check conditions'`` and returns ``None``.
    """
    # Coerce up front so plain Python lists work as well as numpy arrays.
    observed = np.asarray(ydata, dtype=float)
    fitted = np.asarray(y2, dtype=float)

    if test == 'CHI2' and observed.min() > 5:
        return stats.chisquare(f_obs=observed, f_exp=fitted)
    if test == 'RMSE':
        return np.sqrt(np.mean((observed - fitted) ** 2))

    print('check conditions')
    return None

In [20]:
p0 = (0.2,0.1, 1.5) # initial guess for (a, b, k), shared by every candidate function
test_type = "RMSE"
list_1 = []  # fitted parameters, one entry per (segment, function) fit
list_2 = []  # parameter covariance matrices, same order as list_1
list_3 = []

# Per-segment winner, keyed by segment name; consumed by plot_est_values().
best_curves = {}
best_params = {}

for n,rowname in enumerate(df_2_pivot.index) :
    errors = []      # RMSE of each candidate function for this segment
    param_list= []   # fitted parameters, same order as `function`

    # The observed data does not depend on the candidate function, so build it
    # once per segment rather than once per fit.
    xdata = np.array(df_2_pivot.columns)
    ydata = np.array(df_2_pivot.loc[rowname])

    for func in function :

        print(rowname)
        print(func)

        params, params_covariance = optimize.curve_fit(func, xdata, ydata,p0,maxfev=2000,method='dogbox') # method is one of 'lm', 'trf', 'dogbox'
        print(params)
        list_1.append(params)
        list_2.append(params_covariance)

        # Evaluate the fitted curve at the real x values so the error compares like
        # with like even when the column labels are not exactly 1..N.
        y2 = func(xdata,*params)

        fit_error = fit_test(ydata,y2,test=test_type)
        print(test_type,fit_error)

        # Candidates are always ranked by RMSE (lower is better), whichever
        # statistic is printed above.
        rmse = fit_error if test_type == "RMSE" else fit_test(ydata,y2,test="RMSE")
        errors.append(rmse)
        param_list.append(params)

        plt.plot(xdata, y2, color=np.random.rand(3,), label=func.__name__) # random colour per fitted curve
        plt.xlabel("Months Subscribed")
        plt.ylabel("% Remaining")
        plt.title("Segment Decay Curve")

    # SELECTING THE BEST FITTED CURVE PER SEGMENT
    # Keep the candidate with the smallest RMSE. nanargmin skips fits whose error is
    # NaN (e.g. the log model evaluated with a negative scale) so they cannot win.
    best_fit_idx = int(np.nanargmin(errors))
    best_curves[rowname] = function[best_fit_idx]
    best_params[rowname] = param_list[best_fit_idx]

    # Plotting inside the segment loop produces one figure per segment; moving these
    # calls outside the loop would draw every segment on a single figure.
    plt.plot(xdata, ydata, 'bo', label='Real Data')
    plt.legend(loc='best')
    plt.show()


In [71]:
def plot_est_values(data,best_params,best_curve):
    """Plot observed vs fitted values, one figure per row of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per segment; the column labels are used as x values and each
        row's values as the observed y values.
    best_params : dict
        Maps each row label of ``data`` to the parameters passed to its curve.
    best_curve : dict
        Maps each row label of ``data`` to a callable ``f(x, *params)``.

    Observed points are drawn in blue and fitted points in red. A row label
    missing from either dict raises ``KeyError``.
    """
    x = np.array(data.columns)
    for title,ydata in data.iterrows():
        # Use the mappings that were passed in, not notebook globals.
        func = best_curve[title]
        params = best_params[title]
        func_out = func(x,*params)
        plt.scatter(x,ydata,c='b')
        plt.scatter(x,func_out,c='r')
        # f-string so a non-string row label does not break the title.
        plt.title(f"{title}: {func.__name__}")
        plt.show()

In [72]:
# Plot observed vs fitted values per segment, using the per-segment entries in
# best_params and best_curves.
plot_est_values(df_2_pivot,best_params,best_curves)

In [None]:
#NEXT STEPS

# Find the best fitting curve
# How to do this? 
#    Iterate through the functions comparing p values and chi stat and only output when it's max
#    Do you do above after the loop? If so how do you reference the data per segment still?

In [189]:
df