In [4]:
# Install plotly (used further down for the interactive scatter plot) if it is not already available
#!pip install plotly 

In [5]:
# Customer life time value

# STEPS
# 1. Extract the data required
# 2. Plot actual data - what does this look like? 
# 3. Fit a curve based on actual data
# 4. Integrate the fitted curve from step 3 - what is the expected tenure per segment?
# 5. Multiply this by revenue (for now use 5.99 as base)

# modules
import numpy as np
import matplotlib.pyplot as plt
from   scipy import optimize
import pandas as pd
from scipy.optimize import curve_fit
import sys

# Load custom scripts in reusable_code folder
sys.path.append(r'/home/jupyter/reusable_code')

import google_api_functions as gaf

from google.cloud import bigquery

# Authenticate with the shared helper from the reusable_code folder
creds = gaf.Authenticate_Google(r'/home/jupyter/reusable_code/')

################
###### 1 #######
################

# Extract the data used to determine LTV.
# One output row per (yearmonth_joined, yearmonth_sub, segment) with:
#   Customers     - distinct britbox_IDs in that group
#   row_num       - position of yearmonth_sub within its cohort/segment (1 = earliest)
#   max           - largest Customers value within the cohort/segment
#   pct_remaining - Customers / max, rounded to 2 decimals
bq = bigquery.Client(project='itv-bde-analytics-prd', credentials=creds)

# NOTE: yearmonth_joined / yearmonth_sub are built with CONCAT(year, month), so the
# month is not zero-padded ('20201' = Jan 2020, '202010' = Oct 2020). Sorting those
# labels as strings puts Oct-Dec ahead of Feb-Sep ('202010' < '20202'), which would
# number a cohort's months in the wrong order. row_number() therefore orders by a
# numeric year*100+month key derived from the label. The labels themselves are left
# unchanged so filters on them (e.g. == '20201') keep working.
query = """
with tablea as  (
select distinct * ,
  row_number() over (
    partition by yearmonth_joined, segment
    -- chronological order: year * 100 + month, not the unpadded string label
    order by cast(substr(yearmonth_sub, 1, 4) as int64) * 100
           + cast(substr(yearmonth_sub, 5) as int64)
  ) as row_num
  from
  (
  SELECT
    DISTINCT
    concat(extract(year from account.firststart),extract(month from account.firststart)) as yearmonth_joined,
    concat(extract(year from openEntitlement.start),extract(month from openEntitlement.start)) as yearmonth_sub,
    b.level2 as segment,
    COUNT( DISTINCT britbox_ID ) AS Customers
  FROM
    `itv-bde-analytics-prd.britbox_analytics.entitlements` as a left join `itv-bde-svod-prd.reporting.Sign_up_Funnel_web` as b
    on a.britbox_ID = b.user_ID
        where billingprovider in ('Stripe','iTunes') and date(account.firststart) >= '2019-11-07' and
        date(openEntitlement.start) <= DATE_SUB(DATE_TRUNC(CURRENT_DATE(), MONTH), INTERVAL 1 DAY)
  GROUP BY
    1,2,3
    order by 1,2,3)
    where yearmonth_sub is not null
    )


  (select distinct
  *, round(customers / max , 2) as pct_remaining from

    (select distinct *, max(customers) over (partition by segment, yearmonth_joined ) as max
  from tablea
)
where yearmonth_joined is not null and yearmonth_sub is not null
  order by yearmonth_joined, yearmonth_sub, segment  )
"""

# Run the query and pull the full result set into a DataFrame
df = bq.query(query).to_dataframe()

# Work with a single joining cohort for now (example purposes);
# running through several cohorts is still to be worked out.
yearmonth_selected = df['yearmonth_joined'].eq('20201')
df_2 = df.loc[yearmonth_selected]
df_2

# Reshape to one row per segment and one column per row_num so that each
# segment can be plotted as its own series of pct_remaining values
df_2_pivot = df_2.pivot_table(
    values='pct_remaining',
    index=['segment'],
    columns=['row_num'],
)
df_2_pivot.head()

# Install/import plotly packages- this package has lots of graphical properties
import plotly.graph_objects as go
import plotly.offline as pyo

################
###### 2 #######
################

# Plot the observed data.
# The pivot holds one row per segment, so every row becomes its own marker
# series - this is why the data had to be pivoted first.
traces = [
    go.Scatter(
        x=df_2_pivot.columns,
        y=segment_values,
        mode='markers',
        name=segment_name,
    )
    for segment_name, segment_values in df_2_pivot.iterrows()
]

# Build and display the figure
graph = go.Figure(data=traces)
#graph.update_xaxes(type="category",)
graph

In [33]:
# Scratch cell: different ways of pulling a single segment's row out of the pivot.
# A notebook cell only displays its last expression, so just the iloc[0] result is shown.
df_2_pivot.head()
# Label-based lookup of the row whose segment is 'CRM'
df_2_pivot.loc['CRM']
# 'segment' was used as the pivot's index, not as a column, hence this stays disabled
#df_2_pivot['segment']

# Position-based lookup of the first row
df_2_pivot.iloc[0]

In [35]:
# List the segment labels held in the pivot index
for i in df_2_pivot.index:
    print(i)

# Blank lines to separate the two listings
print("\n\n")

# Same labels again, each followed by its position in the index
for n, i in enumerate(df_2_pivot.index):
    print(i, n, sep="\n")

In [40]:
# Fit the decay curve separately for every segment of the selected cohort.
# Requires `func` (the exponential decay model) to be defined before this cell runs.
p0 = (2, 1, 1.5)  # initial guess for (a, b, k)
list_1 = []  # fitted (a, b, k) per segment, in df_2_pivot.index order
list_2 = []  # matching parameter covariance matrices

for rowname in df_2_pivot.index:
    print(rowname)
    # Cast explicitly so np.exp receives float arrays whatever dtype the pivot labels have
    xdata = np.asarray(df_2_pivot.columns, dtype=float)
    ydata = np.asarray(df_2_pivot.loc[rowname], dtype=float)

    # The pivot leaves NaN where a segment has no row for a month, and curve_fit
    # rejects non-finite input, so fit only the months that were actually observed
    observed = np.isfinite(ydata)
    xdata, ydata = xdata[observed], ydata[observed]
    print(xdata)
    print(ydata)

    params, params_covariance = optimize.curve_fit(func, xdata, ydata, p0, maxfev=2000)
    print(params)
    list_1.append(params)
    list_2.append(params_covariance)
list_1

In [24]:
################
###### 3 #######
################

# Fit a curve per segment and plot it

# x / y observations taken from the full extract `df`
xdata = np.array(df['row_num'])
ydata = np.array(df['pct_remaining'])

# Which segments are there?
for rowname in df_2_pivot.index:
    print(rowname)

In [11]:
# Determine the fitted curves per segment and plot this 

# Define the exponential function
def func(xdata, a, b ,k):
    """Exponential decay model ``a * exp(-b * xdata) + k``.

    Parameters
    ----------
    xdata : scalar or numpy array
        Point(s) at which to evaluate the curve.
    a : float
        Multiplier applied to the exponential term.
    b : float
        Rate inside the exponent (applied with a negative sign).
    k : float
        Constant added to the result.

    Returns
    -------
    Same shape as ``xdata``: the curve evaluated at each point.
    """
    decay = np.exp(-b * xdata)
    return a * decay + k

In [27]:
# Estimate the curve parameters separately for every segment.
#
# The pivot has one row per segment, so each iteration fits that segment's own
# row. Passing the shared xdata / ydata arrays to curve_fit inside the loop fits
# the same data every time and therefore yields identical parameters per segment.

p0 = (2, 1, 1.5)  # initial guess for (a, b, k)

list_1 = []  # fitted (a, b, k) per segment, in df_2_pivot.index order
list_2 = []  # matching parameter covariance matrices

for rowname in df_2_pivot.index:
    # Cast explicitly so np.exp receives float arrays whatever dtype the pivot labels have
    segment_x = np.asarray(df_2_pivot.columns, dtype=float)
    segment_y = np.asarray(df_2_pivot.loc[rowname], dtype=float)

    # The pivot leaves NaN where a segment has no row for a month; curve_fit
    # rejects non-finite input, so fit only the observed months
    observed = np.isfinite(segment_y)
    params, params_covariance = optimize.curve_fit(
        func, segment_x[observed], segment_y[observed], p0, maxfev=2000
    )

    # Unpack the fitted values. Writing `params = a, b, k` does the opposite:
    # it overwrites the fit with whatever a, b, k currently hold.
    a, b, k = params
    list_1.append(params)
    list_2.append(params_covariance)

# After the loop a, b, k hold the parameters of the last segment in the index.


In [28]:
# Inspect the parameters collected by the fitting loop: list_1 receives one entry
# per segment, in the same order as df_2_pivot.index.
# The entries would normally differ between segments; identical entries mean every
# iteration was fitted on the same xdata / ydata rather than on that segment's own row.
print(list_1)

In [29]:
from scipy import integrate

# Revenue per customer per month.
# To do: determine a methodology to make revenue more accurate instead of hard coded.
MONTHLY_REVENUE = 5.99

# Evaluate the fitted curve on a grid of months 1..24
x2 = np.linspace(1, 24, 24)

# Single-curve view: uses the a, b, k currently in scope from the fitting cell.
# Per-segment figures are computed further down from list_1.
y2 = func(x2, a, b, k)

fig, ax = plt.subplots()
ax.plot(x2, y2, color='r', label='Fitted Curve')
ax.plot(xdata, ydata, 'bo', label='Real Data')
ax.legend(loc='best')
ax.set_xlabel("Months Subscribed")
ax.set_ylabel("% Remaining")
ax.set_title("Segment Decay Curve")
plt.show()

# Visual representation of the area that will be integrated.
# A grid of len(x2) points has len(x2) - 1 trapezoids; indexing x2[i + 1] for
# every i in range(len(x2)) would run past the end of the grid.
N = len(x2) - 1

fig, ax = plt.subplots()
for i in range(N):
    xs = [x2[i], x2[i], x2[i + 1], x2[i + 1]]
    ys = [0, y2[i], y2[i + 1], 0]
    ax.fill(xs, ys, 'b', edgecolor='b', alpha=0.2)

ax.set_title('Trapezoid Rule, N = {}'.format(N))
ax.plot(x2, y2, color='r', label='Fitted Curve')
ax.plot(xdata, ydata, 'bo', label='Real Data')
plt.show()

################
###### 4 #######
################

# Integrate the area under the curve to estimate expected tenure.
# `trapz` was renamed to `trapezoid` and later removed from SciPy, so use
# whichever of the two the installed version provides.
trapezoid = getattr(integrate, "trapezoid", None) or integrate.trapz
val = trapezoid(y2, x2)

# Expected tenure per segment: list_1 holds one (a, b, k) entry per segment in
# df_2_pivot.index order, so pair them up and integrate each segment's curve.
segment_tenure = {
    segment: trapezoid(func(x2, *segment_params), x2)
    for segment, segment_params in zip(df_2_pivot.index, list_1)
}

################
###### 5 #######
################

# Multiply expected tenure by monthly revenue to estimate LTV per customer
segment_ltv = {
    segment: tenure * MONTHLY_REVENUE
    for segment, tenure in segment_tenure.items()
}
LTV = val * MONTHLY_REVENUE
LTV