In [0]:
import os
import sys
import pickle
path=!pwd
sys.path.append(os.path.join(path[0], '..'))
import numpy as np
import math
import pandas as pd
import re
from datetime import date, datetime, timedelta
import json
from abc import ABCMeta, abstractmethod
import boto3
# import snowflake.connector
from io import StringIO
import logging 
from scipy.optimize import curve_fit
from scipy.stats import percentileofscore


# Configure root logging at INFO for the whole notebook, then keep a handle to
# the root logger so helpers below can report what they read and write.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
logger.info('Starting Notebook')

from utils import *

class Utils():
    """Static helpers for moving DataFrames and pickled objects to and from S3.

    Every method creates its own ``boto3`` S3 client, builds the object key
    from ``key_path`` and ``filename``, and logs the location it touched.
    """

    @staticmethod
    def _s3_key(key_path, filename):
        """Join ``key_path`` and ``filename`` into an S3 object key.

        S3 keys always use ``/`` as the separator, so the join is done with
        ``posixpath`` instead of ``os.path`` (which would insert ``\\`` when
        this runs on Windows). On POSIX hosts the result is identical to what
        ``os.path.join`` produced before.
        """
        import posixpath  # function-scope import keeps this block self-contained
        return posixpath.join(key_path, filename)

    @staticmethod
    def to_csv_s3(content, bucket, key_path, filename):
        """Serialize ``content`` with its ``to_csv`` method and upload it to S3.

        Args:
            content: object exposing ``to_csv(buffer)`` (e.g. a DataFrame).
            bucket: destination S3 bucket name.
            key_path: key prefix inside the bucket.
            filename: final key component.
        """
        client = boto3.client('s3')
        key = Utils._s3_key(key_path, filename)
        csv_buffer = StringIO()
        # Default ``to_csv`` options are kept, so the index is written as well.
        content.to_csv(csv_buffer)
        client.put_object(Bucket=bucket, Key=key, Body=csv_buffer.getvalue())
        logger.info(f'Saved to {bucket}/{key}')

    @staticmethod
    def to_pkl_s3(content, bucket, key_path, filename):
        """Pickle ``content`` and upload the bytes to ``bucket`` under the joined key."""
        client = boto3.client('s3')
        key = Utils._s3_key(key_path, filename)
        obj = pickle.dumps(content)
        client.put_object(Bucket=bucket, Key=key, Body=obj)
        logger.info(f'Saved to {bucket}/{key}')

    @staticmethod
    def read_csv_s3(bucket, key_path,filename):
        """Download a CSV object from S3 and parse it into a DataFrame.

        The literal ``\\N`` is treated as a missing value while parsing.

        Returns:
            pandas.DataFrame with the parsed contents.
        """
        client = boto3.client('s3')
        key = Utils._s3_key(key_path, filename)
        obj = client.get_object(Bucket=bucket, Key=key)
        df = pd.read_csv(obj['Body'], na_values="\\N")
        logger.info(f'Read from {bucket}/{key}')
        return df

    @staticmethod
    def read_pkl_s3(bucket, key_path,filename):
        """Download an S3 object and unpickle it.

        Returns:
            Whatever object was pickled into the S3 object.
        """
        client = boto3.client('s3')
        key = Utils._s3_key(key_path, filename)
        obj = client.get_object(Bucket=bucket, Key=key)
        body = obj['Body'].read()
        # SECURITY: pickle.loads executes arbitrary code embedded in the payload;
        # only read objects from buckets whose writers are trusted.
        model = pickle.loads(body)
        logger.info(f'Read from {bucket}/{key}')
        return model

class Credentials(metaclass=ABCMeta):
    """Base type for credential providers; it declares no members of its own."""
    
    
class SSMPSCredentials(Credentials):
    """Credential provider that looks a secret up in AWS Secrets Manager."""

    def __init__(self, secretid: str):
        # `_secrets` starts out empty; nothing in this class populates it.
        self._secrets = {}
        self._secretid = secretid

    def get_keys(self):
        """
        Fetch the secret identified by ``self._secretid``.

        Returns the raw ``get_secret_value`` response from a Secrets Manager
        client created for region ``us-east-1``.
        """
        secrets_client = boto3.client(
            service_name='secretsmanager',
            region_name='us-east-1',
        )
        return secrets_client.get_secret_value(SecretId=self._secretid)
    
    
class BaseConnector(metaclass=ABCMeta):
    """Abstract base for connectors; concrete subclasses must provide ``connect``."""

    @abstractmethod
    def connect(self):
        """Open a connection. Must be overridden by subclasses."""
        raise NotImplementedError()

        
class SnowflakeConnector(BaseConnector):
    """Connector that opens Snowflake sessions using secrets from a credential provider.

    The provider's ``get_keys()`` response is expected to carry a JSON document
    under ``'SecretString'``; the keys read from it are ``login_name``,
    ``login_password``, ``account`` and ``warehouse``.
    """

    def __init__(self, credentials: Credentials):
        keys = credentials.get_keys()
        # A missing 'SecretString' yields an empty dict; connect() will then
        # fail with KeyError on the first secret it looks up.
        self._secrets = json.loads(keys.get('SecretString', "{}"))

    def connect(self, dbname: str, schema: str = 'DEFAULT'):
        """Open and return a Snowflake connection for ``dbname`` / ``schema``.

        Raises:
            ImportError: if the ``snowflake.connector`` package is not installed.
            KeyError: if a required secret is missing.
        """
        # The notebook's top-level `import snowflake.connector` is commented out,
        # so without this import the name `snowflake` may be undefined here.
        # Importing lazily keeps the rest of the notebook usable without the
        # package installed.
        import snowflake.connector

        ctx = snowflake.connector.connect(
            user=self._secrets['login_name'],
            password=self._secrets['login_password'],
            account=self._secrets['account'],
            warehouse=self._secrets['warehouse'],
            database=dbname,
            schema=schema
        )
        return ctx

    
def run_query(querystr, ctx):
    """Run ``querystr`` through ``ctx.execute_string`` and return the last result set.

    Only the final cursor returned by ``execute_string`` is materialized.
    Column names come from that cursor's ``description`` and are lower-cased.

    Returns:
        pandas.DataFrame holding the rows of the last statement.
    """
    last_cursor = ctx.execute_string(querystr)[-1]
    rows = last_cursor.fetchall()
    column_names = [col[0] for col in last_cursor.description]
    result = pd.DataFrame.from_records(rows, columns=column_names)
    result.columns = result.columns.str.lower()
    return result

import plotly.express as px
def get_simple_plot(df_plt, x, y, grpby, text, title=''):
    """Show an 800x400 plotly line chart of ``y`` against ``x``, one line per ``grpby`` value.

    Args:
        df_plt: DataFrame holding the columns named by the other arguments.
        x, y: column names for the axes.
        grpby: column whose values (rendered as strings) select the line colour.
        text: column added to the hover tooltip.
        title: chart title; defaults to ``'<y> vs <x>'`` when empty.

    Returns:
        None. The figure is displayed via ``fig.show()``.
    """
    if title=='':
        title = f'{y} vs {x}'
    # Cast the grouping column on a copy so the caller's frame keeps its
    # original dtype (the cast used to be written back into `df_plt`).
    plot_df = df_plt.copy()
    plot_df[grpby] = plot_df[grpby].astype(str)
    fig = px.line(plot_df,
                  x=x,
                  y=y,
                  title=title,
                  color=grpby,
                  hover_data=[text],
                  width=800, height=400)
    fig.show()
    return


## Credentials: secret id used by the (currently disabled) Snowflake connection below
SF_CREDS = 'datascience-max-dev-sagemaker-notebooks'

## Snowflake connection (disabled)
# conn=SnowflakeConnector(SSMPSCredentials(SF_CREDS))
# ctx=conn.connect("MAX_PROD","DATASCIENCE_STAGE")
# cur = ctx.cursor()

## S3 locations used by the Utils read/write helpers
key_path = 'cost_allocation/dev'
input_bucket = "hbo-ingest-datascience-content-dev"
output_bucket = "hbo-outbound-datascience-content-dev"

## SVOD Monthly, Retail,  domestic

In [0]:
def exponential_decay(x, a, b, c):
    """Evaluate ``a * exp(b * x) + c``; ``x`` may be a scalar or a numpy array."""
    exp_term = np.exp(b * x)
    return a * exp_term + c

def exponential_decay_slope(x, a, b):
    """Evaluate ``a * b * exp(b * x)``, the x-derivative of ``a * exp(b * x) + c``."""
    amplitude = a * b
    return amplitude * np.exp(b * x)

def fit_exponential(x_data, y_data, p0, param_bounds):
    """Fit ``exponential_decay`` to the data with ``scipy.optimize.curve_fit``.

    Args:
        x_data: x values; must expose ``.max()``.
        y_data: observed y values.
        p0: initial guess for ``(a, b, c)``.
        param_bounds: ``(lower, upper)`` bounds forwarded to ``curve_fit``.

    Returns:
        ``(x_fit, params)``: a 100-point grid from 0 to ``x_data.max()`` and
        the fitted parameters. The covariance from ``curve_fit`` is discarded.
    """
    upper_x = x_data.max()
    grid = np.linspace(0, upper_x, 100)
    fit_result = curve_fit(
        exponential_decay, np.array(x_data), y_data, p0, bounds=param_bounds
    )
    fitted_params = fit_result[0]
    return grid, fitted_params


def get_churn_bin(df_in, grpby):
    """Bin rows by monthly hours viewed and compute the churn rate per bin.

    Rows with ``monthly_hours_viewed`` above 60 are dropped. The remaining rows
    are cut into quantile bins (100 quantile edges, duplicate edges dropped),
    each bin is labelled by the midpoint of its interval, and the rows are then
    aggregated per ``hours_viewed_bin`` plus the ``grpby`` columns.

    Args:
        df_in: DataFrame with at least ``hbo_uuid``, ``is_cancel`` and
            ``monthly_hours_viewed`` plus every column named in ``grpby``.
            It is not modified.
        grpby: list of extra column names to group by (may be empty).

    Returns:
        DataFrame with the group keys and, per group: ``churn`` (mean of
        ``is_cancel``), ``hbo_uuid`` (row count), ``is_cancel`` (sum) and
        ``monthly_hours_viewed`` (sum).
    """
    # NOTE(review): this function used to start with a groupby-sum over
    # ['hbo_uuid', 'sub_month'] + grpby + ['is_cancel'] whose result was
    # overwritten on the very next line, so it never influenced the output.
    # That dead (and expensive) aggregation is removed; the callers in this
    # notebook already aggregate before calling.
    nbins = 100
    # .copy() so the column assignments below write to an independent frame
    # instead of a slice of `df_in` (avoids SettingWithCopyWarning).
    df = df_in[df_in.monthly_hours_viewed<=60].copy()
    df['hours_viewed_bin'] = pd.qcut(df['monthly_hours_viewed'], np.linspace(0,1,nbins), duplicates='drop')
    # Replace each interval by its midpoint so the bin can be used as a numeric x value.
    df['hours_viewed_bin'] = df['hours_viewed_bin'].apply(lambda x: (x.left+x.right)/2)
    df['hours_viewed_bin'] = df['hours_viewed_bin'].astype('float')
    # Numeric copy of the cancel flag so its mean is the churn rate.
    df['churn'] = 1*df['is_cancel']

    df_bin = df.groupby(['hours_viewed_bin']+grpby).agg({'churn':'mean', 'hbo_uuid':'count',
                                                         'is_cancel':'sum','monthly_hours_viewed':'sum'}).reset_index()
    return(df_bin)



def get_df_60_h(list_df):
    """Tag every frame in ``list_df`` with its position in the list.

    A ``num_df`` column holding the 0-based list index is written onto each
    frame in place; the same frame objects are returned in a new list.
    """
    tagged = []
    for position, frame in enumerate(list_df):
        frame['num_df'] = position
        tagged.append(frame)
    return tagged


def get_simple_plot_multiple(df_plt, x, y, x_fit, y_fit, params, title=''):
    """Scatter the binned data, overlay the fitted curve, and annotate the fitted formula.

    Args:
        df_plt: DataFrame with the columns named by ``x`` and ``y``.
        x, y: column names for the scatter points.
        x_fit, y_fit: coordinates of the fitted curve; ``y_fit`` must expose
            ``.min()`` / ``.max()`` (used to place the annotation vertically).
        params: the three fitted coefficients ``(a, b, c)``.
        title: chart title; defaults to ``'<y> vs <x>'`` when empty.

    Returns:
        The plotly figure. It is also displayed via ``fig.show()``. Returning
        it matches ``get_simple_plot_multiple_dot`` and lets callers that store
        the result get a real figure instead of ``None``.
    """
    if title=='':
        title = f'{y} vs {x}'

    a_fit, b_fit, c_fit = params
    # The formula is right-anchored at x=50 (the edge of the fixed x range),
    # halfway up the fitted curve.
    annotation_x_loc = 50
    annotation_y_loc = y_fit.min() +(y_fit.max()  - y_fit.min() )/2

    fig = px.scatter(df_plt,
                  x=x,
                  y=y,
                  title=title,
                  width=500, height=400)
    fig.add_scatter(
              x=x_fit,
              y=y_fit)

    fig.update_layout(
        template='simple_white',
        showlegend=False,
        xaxis=dict(range=[0,50]),
        annotations=[
            dict(
                x=annotation_x_loc,  # x-coordinate for the text
                y=annotation_y_loc,  # y-coordinate for the text
                text='y= {:.2f} * e^({:.2f} * hours_viewed) + {:.2f}'.format(a_fit, b_fit, c_fit),  # the text to display
                showarrow=False,  # disable arrow for the annotation
                xanchor='right',
                font=dict(
                    family='Arial',  # specify font family
                    size=18,  # specify font size
                    color='black'  # specify font color
                )
            )
        ]
    )
    fig.show()
    return fig

def get_simple_plot_multiple_dot(df_plt, x, y, x_fit, y_fit, params, x_med, y_med, title=''):
    """Scatter the binned data, overlay the fitted curve, and highlight one point.

    Prints the fitted formula and the formula of its derivative, then draws
    the scatter, the fitted curve, and a red marker at ``(x_med, y_med)``
    labelled with its coordinates.

    Args:
        df_plt: DataFrame with the columns named by ``x`` and ``y``.
        x, y: column names for the scatter points.
        x_fit, y_fit: coordinates of the fitted curve.
        params: the three fitted coefficients ``(a, b, c)``.
        x_med, y_med: coordinates of the highlighted point.
        title: chart title; defaults to ``'<y> vs <x>'`` when empty.

    Returns:
        The plotly figure. It is also displayed via ``fig.show()``.
    """
    if title=='':
        title = f'{y} vs {x}'

    a_fit, b_fit, c_fit = params
    print('y= {:.2f} * e^({:.2f} * hours_viewed) + {:.2f}'.format(a_fit, b_fit, c_fit))
    print('y= {:.3f} * e^({:.2f} * hours_viewed)'.format(a_fit*b_fit,b_fit))
    # The unused annotation_x_loc / annotation_y_loc locals were dropped: this
    # variant anchors its label on the highlighted point instead.

    fig = px.scatter(df_plt,
                  x=x,
                  y=y,
                  title=title,
                  width=500, height=400)
    fig.add_scatter(
              x=x_fit,
              y=y_fit)

    fig.add_scatter(
              x=x_med,
              y=y_med,
              mode='markers',
              marker=dict(size=14, color='red', line=dict(color='black', width=2)))

    fig.update_layout(
        template='simple_white',
        showlegend=False,
        xaxis=dict(range=[0,50]),
        annotations=[
            dict(
                x=x_med+0.2,  # x-coordinate for the text
                y=y_med+0.01,  # y-coordinate for the text
                text='{:.2f}, {:.2f}'.format(x_med, y_med),  # the text to display
                showarrow=False,  # disable arrow for the annotation
                xanchor='left',
                font=dict(
                    family='Arial',  # specify font family
                    size=18,  # specify font size
                    color='black'  # specify font color
                )
            )
        ]
    )
    fig.show()
    return fig



def get_churn_plot_simple(df_i, title, param_dic, x_med=0):
    """Fit the exponential churn curve to binned data and plot it.

    Bins with fewer than 20 cancellations are dropped before fitting. When
    ``x_med`` is non-zero, the point ``(x_med, fitted churn at x_med)`` is
    highlighted on the plot and both values are printed.

    Args:
        df_i: binned frame with ``hours_viewed_bin``, ``churn`` and ``is_cancel``.
        title: plot title.
        param_dic: dict that receives the fitted parameters under the key
            ``'acquired'`` (the key does not depend on ``title``).
        x_med: x position to highlight; 0 means no highlighted point.

    Returns:
        ``(fig, params)``: whatever the plotting helper returned, and the
        fitted ``(a, b, c)``.
    """
    binned = df_i[df_i.is_cancel >= 20]

    initial_guess = [0.5, -0.1, 0.01]
    lower_bounds = [0, -0.8, 0.01]
    upper_bounds = [np.inf, -0.1, np.inf]

    x_fit, params = fit_exponential(
        binned.hours_viewed_bin,
        binned.churn,
        initial_guess,
        (lower_bounds, upper_bounds),
    )
    a_fit, b_fit, c_fit = params
    y_fit = exponential_decay(x_fit, a_fit, b_fit, c_fit)

    if x_med != 0:
        y_med = exponential_decay(x_med, a_fit, b_fit, c_fit)
        print(x_med)
        print(y_med)
        fig = get_simple_plot_multiple_dot(
            binned, 'hours_viewed_bin', 'churn', x_fit, y_fit, params,
            x_med, np.array(y_med), f'{title}',
        )
    else:
        fig = get_simple_plot_multiple(
            binned, 'hours_viewed_bin', 'churn', x_fit, y_fit, params, f'{title}',
        )

    display(binned.head())
    param_dic['acquired'] = params
    return fig, params



def get_simple_plot_dot(df_plt, x, y, x_fit, y_fit, params, x_med, y_med, title=''):
    """Plot the slope curve as a line and highlight one point on it.

    Prints the fitted formula and the formula of its derivative, then draws
    ``y_fit`` against ``x_fit`` as a line with a red marker at
    ``(x_med, y_med)`` labelled with its coordinates.

    Args:
        df_plt: accepted but not used by this function.
        x, y: only used to build the default title.
        x_fit, y_fit: coordinates of the curve to draw.
        params: the three fitted coefficients ``(a, b, c)``.
        x_med, y_med: coordinates of the highlighted point.
        title: chart title; defaults to ``'<y> vs <x>'`` when empty.

    Returns:
        The plotly figure. It is also displayed via ``fig.show()``.
    """
    if title=='':
        title = f'{y} vs {x}'

    a_fit, b_fit, c_fit = params
    print('y= {:.2f} * e^({:.2f} * hours_viewed) + {:.2f}'.format(a_fit, b_fit, c_fit))
    print('y= {:.3f} * e^({:.2f} * hours_viewed)'.format(a_fit*b_fit,b_fit))
    # The unused annotation_x_loc / annotation_y_loc locals were dropped: the
    # label is anchored on the highlighted point.

    fig = px.line(x=x_fit,
                  y=y_fit,
                  title=title,
                  width=500, height=400)
    fig.add_scatter(
              x=x_med,
              y=y_med,
              mode='markers',
              marker=dict(size=14, color='red', line=dict(color='black', width=2)))

    fig.update_layout(
        template='simple_white',
        showlegend=False,
        xaxis=dict(range=[0,50]),
        xaxis_title= "hours_viewed_bin",
        yaxis_title= "Change in churn rate (slope)",
        annotations=[
            dict(
                x=x_med+0.25,  # x-coordinate for the text
                y=y_med+0.0005,  # y-coordinate for the text
                text='{:.2f}, {:.4f}'.format(x_med, y_med),  # the text to display
                showarrow=False,  # disable arrow for the annotation
                xanchor='left',
                font=dict(
                    family='Arial',  # specify font family
                    size=18,  # specify font size
                    color='black'  # specify font color
                )
            )
        ]
    )
    fig.show()
    return fig

def get_churn_slope_plot_simple(df_i, title, params, x_med=0, param_dic=None):
    """Plot the slope (x-derivative) of an already-fitted churn curve.

    Bins with fewer than 20 cancellations are dropped; the remaining bins only
    determine the x range of the curve. The slope at ``x_med`` is printed and
    highlighted on the plot.

    Args:
        df_i: binned frame with ``hours_viewed_bin`` and ``is_cancel``.
        title: plot title.
        params: fitted ``(a, b, c)``; ``c`` does not affect the slope.
        x_med: x position to highlight.
        param_dic: optional dict that receives ``params`` under ``'acquired'``.
            When omitted, a notebook-level ``param_dic`` is used if one exists
            (the previous implicit behaviour); otherwise nothing is recorded.

    Returns:
        The plotly figure returned by ``get_simple_plot_dot``.
    """
    df_i = df_i[df_i.is_cancel>=20]

    x_var = df_i.hours_viewed_bin
    x_fit = np.linspace(0, x_var.max(), 100)
    a_fit, b_fit, c_fit = params
    y_fit = exponential_decay_slope(x_fit, a_fit, b_fit)

    y_med = exponential_decay_slope(x_med, a_fit, b_fit)
    print(x_med)
    print(y_med)
    fig = get_simple_plot_dot(df_i, 'hours_viewed_bin', 'churn', x_fit, y_fit, params, x_med, np.array(y_med), f'{title}')
    display(df_i.head())

    # This function used to write to a bare global `param_dic`, which raised
    # NameError whenever the notebook had not defined it. Prefer the explicit
    # argument and fall back to the global only if it is present.
    if param_dic is None:
        param_dic = globals().get('param_dic')
    if param_dic is not None:
        param_dic['acquired'] = params
    return fig




## LATAM markets

In [0]:
# NOTE(review): scratch cell that only prints 1; looks like a leftover kernel check — confirm and delete.
print(1)

In [0]:
# Load four user-stream CSV extracts from the input bucket. The file names are
# tagged `latam_60d` and differ only in their date stamp (20230101, 20230301,
# 20230501, 20230701).
df_60_00 = Utils.read_csv_s3(input_bucket, key_path, 'churn_user_stream_full_latam_60d_20230101_full.csv')
df_60_0 = Utils.read_csv_s3(input_bucket, key_path, 'churn_user_stream_full_latam_60d_20230301_full.csv')
df_60_1 = Utils.read_csv_s3(input_bucket, key_path, 'churn_user_stream_full_latam_60d_20230501_full.csv')
df_60_2 = Utils.read_csv_s3(input_bucket, key_path, 'churn_user_stream_full_latam_60d_20230701_full.csv')

def get_df_test(df_test):
    """Derive per-row features on a user-stream frame and keep tenures above 2.

    Columns written onto ``df_test`` in place:
      * ``tenure_months``: copy of ``sub_month``.
      * ``monthly_hours_viewed``: ``hours_viewed`` halved when tenure > 1,
        otherwise unchanged.
      * ``frc``: the row's share of its user's total ``monthly_hours_viewed``.
      * ``program_type``: rows that are ``'original'`` with
        ``old_new == 'library'`` are relabelled ``'acquired'``.

    Returns:
        A new frame holding only rows with ``tenure_months > 2``, with every
        missing value replaced by 0.
    """
    df_test['tenure_months'] = df_test['sub_month']

    past_first_month = df_test['tenure_months'] > 1
    df_test['monthly_hours_viewed'] = np.where(
        past_first_month, df_test['hours_viewed'] / 2, df_test['hours_viewed']
    )

    per_user_hours = df_test.groupby(['hbo_uuid'])['monthly_hours_viewed'].transform('sum')
    df_test['frc'] = df_test['monthly_hours_viewed'] / per_user_hours

    library_original = (df_test.program_type == 'original') & (df_test.old_new == 'library')
    df_test['program_type'] = np.where(library_original, 'acquired', df_test.program_type)

    return df_test[df_test.tenure_months > 2].fillna(0)

# Run the per-snapshot feature/filter step; each name is rebound to the
# filtered result returned by get_df_test.
df_60_00=get_df_test(df_60_00)
df_60_0=get_df_test(df_60_0)
df_60_1=get_df_test(df_60_1)
df_60_2=get_df_test(df_60_2)

# Tag each frame with its list position (`num_df`), then stack all four.
df_list = get_df_60_h([df_60_00, df_60_0, df_60_1, df_60_2])
df_60 = pd.concat(df_list)
# Cache the combined frame as a pickle in the *input* bucket; a later cell
# reads the same key back with Utils.read_pkl_s3.
Utils.to_pkl_s3(df_60,input_bucket, key_path, 'df_latam_o_2023.pkl')


In [0]:
# NOTE(review): within the visible notebook `df_60_pr` is first assigned in a
# LATER cell (the groupby on hbo_uuid/sub_month/num_df/program_type), so this
# cell raises NameError on a fresh Restart & Run All — TODO move it below that
# cell or delete it (a similar describe() call already exists there).
display(df_60_pr[['frc']][df_60_pr.program_type=='acquired'].describe(percentiles=[0.1,0.2,0.25,0.3,0.35,0.4,0.5,0.55,0.8,0.9,1]))


In [0]:
# ## LATAM (this heading previously read "Nordic"; the pickle loaded below is the LATAM one)

# Reload the combined frame that the build cell cached on S3.
df_60 = Utils.read_pkl_s3(input_bucket, key_path, 'df_latam_o_2023.pkl')

## Hours viewed: one row per (user, cancel flag, snapshot, sub_month), all other columns summed
print('hours viewed distribution')
df_60_user = df_60.groupby(by=['hbo_uuid','is_cancel','num_df','sub_month']).sum().reset_index()
display(df_60_user.describe())
# get_histogram(df_60_user[df_60_user.monthly_hours_viewed<=50].sample(n=1000), 'monthly_hours_viewed', f'Hours_viewed audience distribution,total')

## Hours viewed breakdown: same aggregation, additionally split by program_type
print('hours viewed breakdown median')
df_60_pr = df_60.groupby(by=['hbo_uuid','sub_month','num_df','program_type']).sum().reset_index()
display(df_60_pr.groupby(by=['program_type']).median())
display(df_60_pr[['frc']][df_60_pr.program_type=='acquired'].describe(percentiles=[0.1,0.2,0.25,0.3,0.4,0.5,0.75,1]))

# For each target acquired-hours fraction, record its percentile rank within
# the 'acquired' rows' `frc` distribution.
# NOTE(review): `nor` is built here but not read anywhere in the visible cells.
target_frc_list = [0.99, 0.9, 0.8, 0.7]
nor = []
for target_frc in target_frc_list:
    percentile = percentileofscore(df_60_pr[df_60_pr.program_type=='acquired']['frc'], target_frc)
    nor.append({'acquired_hr_fraction': target_frc, 'percentile': percentile})

## Average churn: shape of the per-user frame and the share of its rows flagged is_cancel
print(df_60_user.shape, df_60_user.is_cancel.sum()/df_60_user.shape[0])

## Tenure churn: cancellations divided by row count, per sub_month
print('tenure churn')
df_60_month = df_60.groupby(by=['sub_month']).agg({'hbo_uuid':'count','is_cancel':'sum'})
df_60_month['churn'] = df_60_month['is_cancel'] /df_60_month['hbo_uuid'] 
display(df_60_month)

## Some months show high churn (e.g. 17) — investigate later.
## Everything below this line only sees rows with sub_month <= 24 (df_60 is rebound here).
df_60 = df_60[df_60.sub_month<=24]
 

## Total: churn vs. hours viewed across all content
param_dic = {}
df_60_t = df_60.groupby(by=['hbo_uuid','is_cancel','sub_month']).sum().reset_index()
df_60_s = get_churn_bin(df_60_t, [])
# get_churn_plot_simple returns a (fig, params) tuple. It used to be assigned to
# `param_dic`, which silently turned the dict into a tuple; unpack it instead.
fig_total, params_total = get_churn_plot_simple(df_60_s, 'total', param_dic)

## Acquired: restart the parameter dict and aggregate per program type
param_dic = {}
df_60_p= df_60.groupby(by=['hbo_uuid','is_cancel','sub_month','num_df','program_type']).sum().reset_index()

## Get median acquired hours per user row. The left merge keeps user rows that
## have no 'acquired' viewing; they are filled with 0 before taking the median.
df_med= df_60_user[['hbo_uuid','sub_month','num_df']].merge(df_60_p[df_60_p.program_type=='acquired'], on=['hbo_uuid','sub_month','num_df'], how='left')
df_med = df_med.fillna(0)
med_x = df_med.monthly_hours_viewed.median()

# Fit only on rows where a single program type accounts for >= 90% of the
# user's hours (frc >= 0.9), then plot the curve and its slope for 'acquired'.
df_60_s = df_60_p[df_60_p.frc>=0.9]
df_60_s = get_churn_bin(df_60_s, ['program_type'])
fig_nor, params = get_churn_plot_simple(df_60_s[df_60_s.program_type=='acquired'], 'LATAM acquired', param_dic, np.array(med_x))
fig_nor_slope = get_churn_slope_plot_simple(df_60_s[df_60_s.program_type=='acquired'], 'LATAM acquired slope', params, np.array(med_x))

# Binned 'acquired' churn table, exported by the next cell.
df_nor = df_60_s[df_60_s.program_type=='acquired']

In [0]:
# Write the binned 'acquired' churn table to a local CSV, renaming the
# aggregated count columns so their meaning is explicit.
df = (
    df_nor[['hours_viewed_bin', 'churn', 'hbo_uuid', 'is_cancel']]
    .rename(columns={'hbo_uuid': 'uuid_count', 'is_cancel': 'cancel_count'})
)
df.to_csv('latam_market_churn.csv')