In [45]:

# STEPS
# 1. Extract the data required
# 2. Plot actual data - what does this look like?
# 3. Fit a curve based on actual data
# 4. Integrate the fitted curve - what is the expected tenure?
# 5. Multiply this by revenue (for now use 5.99 as base)

# modules
import numpy as np
import matplotlib.pyplot as plt
from   scipy import optimize
from scipy.optimize import curve_fit
import sys

# Load custom scripts in reusable_code folder
sys.path.append(r'/home/jupyter/reusable_code')

import google_api_functions as gaf

from google.cloud import bigquery

creds=gaf.Authenticate_Google(r'/home/jupyter/reusable_code/')

################
###### 1 #######
################

# Gather Data
# Cohort keys are zero-padded 'YYYYMM' strings (FORMAT_DATE) so that they sort
# chronologically. Un-padded concat(year, month) keys sort '202010' before
# '20202', which scrambles row_num (the month index within a cohort) for any
# cohort whose history spans October-December.
bq = bigquery.Client(project='itv-bde-analytics-prd',credentials=creds)
query="""
with tablea as  (
   select distinct * , row_number() over (partition by yearmonth_joined order by yearmonth_joined, yearmonth_sub) as row_num
  from 
  
  (SELECT
    DISTINCT 
    format_date('%Y%m', date(account.firststart)) as yearmonth_joined,
    format_date('%Y%m', date(openEntitlement.start)) as yearmonth_sub,
    COUNT( DISTINCT britbox_ID ) AS Customers
  FROM
    `itv-bde-analytics-prd.britbox_analytics.entitlements` 
        where billingprovider in ('Stripe','iTunes') and date(account.firststart) >= '2019-11-07' and 
        date(openEntitlement.start) <= DATE_SUB(DATE_TRUNC(CURRENT_DATE(), MONTH), INTERVAL 1 DAY)
        
  GROUP BY
    1,2
    order by 1,2)
    where yearmonth_sub is not null
    order by 1,2)
    
  
  (select distinct 
  *, round(customers / max , 2) as pct_remaining from
  
    (select distinct *, max(customers) over (partition by yearmonth_joined ) as max
  from tablea
)
where yearmonth_joined is not null and yearmonth_sub is not null
  order by yearmonth_joined, yearmonth_sub  )

"""
df = bq.query(query).to_dataframe()

# Select a given month for example purposes (January 2020 cohort)
# at a later stage determine how to run through with multiple months
yearmonth_selected = df['yearmonth_joined'] == '202001'
df_2 = df[yearmonth_selected]

# x = month index within the cohort, y = share of the cohort's peak customer count
xdata = np.array(df_2.row_num)
ydata = np.array(df_2.pct_remaining)

################
###### 2 #######
################

# Plot actual volumes
plt.scatter(xdata, ydata)
plt.xlabel("Months Subscribed")
plt.ylabel("% Remaining")
plt.title("Actual Retention")
plt.show()


In [52]:

################
###### 3 #######
################
# Determine the fitted curves per segment and plot this 

# Define the candidate model functions (exponential decay, log, linear). Each one is passed to the curve-fitting procedure later, which requires the function as an input
def func_expdecay(xdata, a, b ,k):
    """Exponential decay model: ``a * exp(-b * xdata) + k``.

    ``xdata`` may be a scalar or a NumPy array; the result has the same shape.
    """
    decay = np.exp(-b * xdata)
    return a * decay + k

def func_log(xdata,a,b,k):
    """Logarithmic decay model: ``a * -log(b * xdata) + k``.

    ``xdata`` may be a scalar or a NumPy array; the result has the same shape.
    """
    negative_log = -np.log(b * xdata)
    return a * negative_log + k

def func_linear(xdata,a,k):
    """Straight-line model ``a * xdata + k`` (slope ``a``, intercept ``k``)."""
    scaled = a * xdata
    return scaled + k  # mx + c

In [58]:
def Fit_a_distribution(p0, func, xdata, ydata, n_months=24):
    """Fit ``func`` to the observed data and plot the fitted curve against it.

    Parameters
    ----------
    p0 : sequence of float
        Initial parameter guesses passed to ``scipy.optimize.curve_fit``.
    func : callable
        Model called as ``func(x, *params)``.
    xdata, ydata : array-like
        Observed points the model is fitted to (also drawn as blue dots).
    n_months : int, optional
        The fitted curve is evaluated and drawn at the whole numbers
        ``1..n_months``. Defaults to 24.

    Returns
    -------
    tuple
        ``(params, params_covariance)`` exactly as returned by ``curve_fit``.
    """
    params, params_covariance = optimize.curve_fit(func, xdata, ydata, p0, maxfev=5000)

    # Whole-number grid 1..n_months on which the fitted curve is evaluated
    x2 = np.linspace(1, n_months, n_months)
    y2 = func(x2, *params)

    # Draw on the explicit Axes rather than the implicit pyplot state
    fig, ax = plt.subplots()
    ax.plot(x2, y2, color='r', label='Fitted Curve')
    ax.plot(xdata, ydata, 'bo', label='Real Data')
    ax.legend(loc='best')
    ax.set_xlabel("Months Subscribed")
    ax.set_ylabel("% Remaining")
    ax.set_title("Segment Decay Curve")
    plt.show()

    return params, params_covariance


def loop_functions(xdata, ydata):
    """Fit and plot every candidate model against the same data.

    Calls ``Fit_a_distribution`` once per candidate, in the order log,
    exponential decay, linear.

    Returns
    -------
    dict
        Maps each model function's ``__name__`` to the
        ``(params, params_covariance)`` tuple returned for it.
    """
    # (initial parameters, model function) pairs
    candidates = (
        ((2, 1, 1.5), func_log),
        ((2, 1, 1.5), func_expdecay),
        ((2, 1), func_linear),
    )
    return {
        func.__name__: Fit_a_distribution(p0, func, xdata, ydata)
        for p0, func in candidates
    }


fits = loop_functions(xdata, ydata)
woo = fits['func_linear']  # the linear fit is the last one run
print(woo[0])

In [49]:
# Estimate the parameters by fitting the exponential decay model to the actual data.
# The bare name `func` is not defined anywhere in this notebook (NameError on a
# fresh kernel); the three-value start point matches func_expdecay(xdata, a, b, k).
p0 = (2, 1, 1.5)  # initialisation parameters (a, b, k)
params, params_covariance = optimize.curve_fit(func_expdecay, xdata, ydata, p0, maxfev=5000)  # Curve fit procedure returns the parameters fitted
a, b, k = params  # Store the parameters individually too

print(params)
print(params_covariance)

In [50]:

# Plot the fitted exponential decay curve on a dummy x grid.
# `func` is not defined in this notebook; a, b, k are the three fitted
# parameters of the exponential decay model, so that model is evaluated here.
x2 = np.linspace(1, 24, 24)  # Plot values 1 to 24 in 24 separate increments (i.e. whole numbers!)
y2 = func_expdecay(x2, a, b, k)  # Apply the fitted curve to the dummy X values
fig, ax = plt.subplots()
ax.plot(x2, y2, color='r', label='Fitted Curve')
ax.plot(xdata, ydata, 'bo', label='Real Data')
ax.legend(loc='best')
ax.set_xlabel("Months Subscribed")
ax.set_ylabel("% Remaining")
ax.set_title("Segment Decay Curve")
plt.show()




In [51]:

# Re-draws the fitted exponential decay curve against the actual data.
# `func` is not defined in this notebook; a, b, k are the three fitted
# parameters of the exponential decay model, so that model is evaluated here.
x2 = np.linspace(1, 24, 24)  # Plot values 1 to 24 in 24 separate increments (i.e. whole numbers!)
y2 = func_expdecay(x2, a, b, k)  # Apply the fitted curve to the dummy X values
fig, ax = plt.subplots()
ax.plot(x2, y2, color='r', label='Fitted Curve')
ax.plot(xdata, ydata, 'bo', label='Real Data')
ax.legend(loc='best')
ax.set_xlabel("Months Subscribed")
ax.set_ylabel("% Remaining")
ax.set_title("Segment Decay Curve")
plt.show()

In [42]:
# Fit the straight-line model to the actual data and overlay it on the observations.
x2 = np.linspace(1, 24, 24)  # whole months 1..24 used to draw the fitted line
params, params_covariance = curve_fit(func_linear, xdata, ydata, p0=(1, 1), maxfev=5000)
a, k = params  # slope and intercept, kept as separate names
print(params)
y2 = func_linear(x2, *params)  # fitted line evaluated on the dummy x grid

fig, ax = plt.subplots()
ax.plot(x2, y2, color='r', label='Fitted Curve')
ax.plot(xdata, ydata, 'bo', label='Real Data')
ax.legend(loc='best')
ax.set(xlabel="Months Subscribed", ylabel="% Remaining", title="Segment Decay Curve")
plt.show()

In [44]:
# Fit the logarithmic model to the actual data and overlay it on the observations.
x2 = np.linspace(1, 24, 24)  # whole months 1..24 used to draw the fitted curve
params, params_covariance = curve_fit(func_log, xdata, ydata, p0=(1, 1, 1), maxfev=5000)
a, b, k = params  # fitted parameters, kept as separate names
print(params)
y2 = func_log(x2, *params)  # fitted curve evaluated on the dummy x grid

fig, ax = plt.subplots()
ax.plot(x2, y2, color='r', label='Fitted Curve')
ax.plot(xdata, ydata, 'bo', label='Real Data')
ax.legend(loc='best')
ax.set(xlabel="Months Subscribed", ylabel="% Remaining", title="Segment Decay Curve")
plt.show()

In [20]:
# Visual representation of the area that will be integrated.
# The trapezoids are drawn directly from the fitted grid (x2, y2), i.e. the
# same points that are passed to the trapezoid-rule integration, instead of
# re-evaluating an undefined `func` on a separate 25-point grid over [1, 24].
N = len(x2) - 1  # one trapezoid between each pair of consecutive grid points

fig, ax = plt.subplots()
for i in range(N):
    # Corners of trapezoid i: down to the x-axis on both sides, curve on top
    xs = [x2[i], x2[i], x2[i + 1], x2[i + 1]]
    ys = [0, y2[i], y2[i + 1], 0]
    ax.fill(xs, ys, 'b', edgecolor='b', alpha=0.2)

ax.set_title('Trapezoid Rule, N = {}'.format(N))
ax.plot(x2, y2, color='r', label='Fitted Curve')
ax.plot(xdata, ydata, 'bo', label='Real Data')
plt.show()

In [13]:

################
###### 4 #######
################

from scipy import integrate

# Integrate the area under the fitted curve to estimate expected tenure.
# Only `integrate` is imported, so the name `scipy` itself is not bound here;
# the function is therefore reached through `integrate`. Newer SciPy releases
# expose the trapezoid rule as `trapezoid`; older ones only have `trapz`.
_trapezoid = getattr(integrate, "trapezoid", None) or integrate.trapz
val = _trapezoid(y2, x2)

################
###### 5 #######
################

# Multiply expected tenure by monthly revenue to estimate LTV per customer
# To do : determine methodology to make revenue more accurate instead of hard coded
MONTHLY_REVENUE = 5.99
LTV = val * MONTHLY_REVENUE
LTV
