### Viewing Subs % EDA: season-over-season comparison

In [0]:
# Import Packages
import sys, os, re 
import io
import pandas as pd
import numpy as np
import itertools as it
import logging
import boto3
import json
from datetime import datetime, timedelta


import lib.util_snowflake as sfk

import warnings
warnings.filterwarnings('ignore')

# --- Notebook-wide display / logging setup ---------------------------------
# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Root logger at INFO so logger.info(...) calls in later cells are emitted.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
logger.info('Starting Notebook')

# Only silence warnings when no -W option was supplied on the command line.
if len(sys.warnoptions) == 0:
    import warnings
    warnings.simplefilter("ignore")

# --- Environment constants read by later cells ------------------------------
sf_creds, database = 'hbo-max-content-datascience-snowflake-dev', 'max_dev'
input_bucket, output_bucket = (
    "hbo-ingest-datascience-content",
    "hbo-outbound-datascience-content-dev",
)


In [0]:
## Open the Snowflake session used by the query cells below.
schema = 'workspace'
logger.info(f'TEST: {sf_creds}')

# Credentials are looked up by the name held in `sf_creds`; the connector then
# connects to `database` / `schema` and a cursor is taken from that connection.
credentials = sfk.SSMPSCredentials(sf_creds)
conn = sfk.SnowflakeConnector(credentials)
ctx = conn.connect(database, schema)
cur = ctx.cursor()

In [0]:
# Pull utils folder for SageMaker env
repo = 'SageMaker'
root_path = os.getcwd()[:re.search(repo, os.getcwd()).start()] + repo + '/'

utils_sf = root_path + 'utils/util_snowflake.py'
%run $utils_sf $root_path
print(f"ran paths: {utils_sf}, root_path: {root_path}")

In [0]:
## 1.0.1 Read Metadata New
# Builds one row per (region, series, season) paired with the season before it.
#
# * `filter_base` joins season-level platform metrics to season metadata and keeps
#   REGION-level rows at days_on_max = 28, first offering window, season_number > 0,
#   with an offering start date after 2021-06-01.
# * The outer select left-joins `filter_base` to itself on region, days_on_max and
#   series id, matching season N to season N-1 (and requiring the earlier season to
#   have started first). Because it is a left join, the *_prev columns are NULL when
#   no qualifying previous season exists.
# * Only seasons > 1 are returned, ordered by region, series title and season.
#
# NOTE(review): the string is an f-string but contains no placeholders.
# NOTE(review): later cells index `df` with lower-case column names and call
# DataFrame methods on it, so sfk.execute_query presumably returns a pandas
# DataFrame with lower-cased columns -- confirm against lib.util_snowflake.
query = f"""
with filter_base as
    (
    select  
        metric.geo_value,
        metric.title_series, 
        metric.title_season,
        metric.season_number,
        meta.reporting_net_studio,
        meta.content_source,
        meta.ckg_series_id,
        metric.offering_start_date,
        case when meta.geo_value = 'LATAM' then latam_home_medal else predicted_medal_us end as medal,
        meta.derived_genre,
        meta.program_type,
        metric.cume_title_viewing_subs, 
        cume_platform_viewing_subs, 
        percent_cumulative_viewing_subs,
        days_on_max
    from max_prod.content_datascience.title_season_metrics_platform as  metric
    inner join max_prod.content_datascience.all_titles_season_metadata as meta
        ON metric.ckg_match_id = meta.ckg_match_id
        and metric.geo_value = meta.geo_value
        and metric.season_number = meta.season_number
    where  metric.geo_level = 'REGION'
        and metric.days_on_max = (28) 
        and metric.season_number >0 
        and metric.offering_window_num = 1
        and metric.offering_start_date > to_date('2021-06-01')
 --       and percent_cumulative_viewing_subs >1
 --       and predicted_medal_us is not null
)
select table_a.geo_value,
    table_a.title_series,
    table_a.ckg_series_id,
    case when table_a.program_type = 'Original' and table_a.content_source = 'MAX' then 'Max Original'
        when table_a.program_type = 'Original' and table_a.content_source = 'HBO' then 'HBO Original'
        when table_a.program_type = 'Acquired' and table_a.content_source = 'MAX' then 'Acquired'
        else 'other' end as lop,
    table_a.medal,
    table_a.reporting_net_studio,
    table_a.content_source,
    table_a.program_type,
    table_a.season_number as season_number,
    table_b.season_number as season_prev,
    table_a.offering_start_date as start_date,
    table_b.offering_start_date as start_date_prev,
    datediff('days', start_date_prev, start_date) as date_diff,
    table_a.percent_cumulative_viewing_subs as vs_pct_curr,
    table_a.cume_title_viewing_subs as vs_title_curr, 
    table_a.cume_platform_viewing_subs as vs_platform_curr,
    table_b.percent_cumulative_viewing_subs as vs_pct_prev,
    table_b.cume_title_viewing_subs as vs_title_prev, 
    table_b.cume_platform_viewing_subs as vs_platform_prev
from filter_base as table_a
left join filter_base as table_b
on table_a.geo_value = table_b.geo_value
    and table_a.days_on_max = table_b.days_on_max
    and table_a.ckg_series_id = table_b.ckg_series_id
    and table_a.season_number = (table_b.season_number+1) -- Add 1 to last season (table_b) to equal target season (table_a) 
    and table_a.offering_start_date > table_b.offering_start_date -- Start date of last season must be earlier than current season
where table_a.season_number > 1 -- must not be a movie
and start_date > '2021-06-01'
order by table_a.geo_value, table_a.title_series, table_a.season_number;"""
# Log the full SQL text, then run it on the open Snowflake connection `ctx`.
logger.info(f'TEST: {query}')
df = sfk.execute_query(query = query, ctx=ctx)

In [0]:
import seaborn as sns
import matplotlib.pyplot as plt 

In [0]:
# Cast both viewing-subs percentage columns to float for the numeric plots below.
for pct_col in ('vs_pct_curr', 'vs_pct_prev'):
    df[pct_col] = df[pct_col].astype('float')

In [0]:
# North America, season 2+, Platinum/Gold/Silver titles that started after
# 2021-06-01 and have a positive previous-season count. Not plotted in this
# cell; the following cell reads df_graph.
df_graph = df[(df['geo_value']=='NORTH AMERICA')
              &(df['season_number']>1)
              &(df['vs_title_prev']>0)
              &(df['medal'].isin(['Platinum', 'Gold', 'Silver']))
              &(df['start_date']>pd.to_datetime('2021-06-01'))
             ]

# Current- vs previous-season title viewing subs, scaled to millions,
# one facet per region (all regions, seasons > 1).
x = 'vs_title_curr'
y = 'vs_title_prev'
data = df[df['season_number']>1]
sns.set_theme(style="whitegrid")
# relplot is figure-level and returns a FacetGrid (still bound to `ax`).
ax = sns.relplot(data=data, x=data[x]/1e6, y=data[y]/1e6,
                 hue='medal',
                 col='geo_value',
                )
# pyplot-level axvline/axhline only draw on the current (last) facet, so the
# zero reference lines are added to every facet explicitly.
for facet_ax in ax.axes.flat:
    facet_ax.axvline(x=0, linewidth=.5, color='black', ls='--')
    facet_ax.axhline(y=0, linewidth=.5, color='black', ls='--')

# The plotted values are divided by 1e6, so state the unit on the axes.
ax.set_axis_labels(f'{x} (millions)', f'{y} (millions)')
ax.set(xlim=(-1, 8), ylim=(-1, 8))
plt.tight_layout()

In [0]:
# Current- vs previous-season title viewing subs for the rows in df_graph,
# coloured by medal, one facet per `lop` value.
x = 'vs_title_curr'
y = 'vs_title_prev'

# relplot is figure-level and returns a FacetGrid (still bound to `ax`).
ax = sns.relplot(data=df_graph, x=x, y=y,
                 hue='medal',
                 col='lop',
                )
# pyplot-level axvline/axhline only draw on the current (last) facet, so the
# zero reference lines are added to every facet explicitly.
for facet_ax in ax.axes.flat:
    facet_ax.axvline(x=0, linewidth=.5, color='black', ls='--')
    facet_ax.axhline(y=0, linewidth=.5, color='black', ls='--')

ax.set(xlim=(-1, 8000000), ylim=(-1, 8000000))
plt.tight_layout()

In [0]:
# North America, season 2+, Platinum/Gold/Silver titles started after
# 2021-06-01 with a positive previous-season viewing-subs count.
df_graph = df.loc[
    df['geo_value'].eq('NORTH AMERICA')
    & df['season_number'].gt(1)
    & df['vs_title_prev'].gt(0)
    & df['medal'].isin(['Platinum', 'Gold', 'Silver'])
    & df['start_date'].gt(pd.to_datetime('2021-06-01'))
]

# Current vs previous season title viewing subs, coloured by `lop`.
x, y = 'vs_title_curr', 'vs_title_prev'

fig, ax = plt.subplots(figsize=(7, 7))
sns.scatterplot(data=df_graph, x=x, y=y, hue='lop', ax=ax)
ax.set_xlim(0, 7500000)
ax.set_ylim(0, 7500000)

In [0]:
# Rebuild the North America working slice: season 2+, medalled
# (Platinum/Gold/Silver), positive previous-season count, started after 2021-06-01.
df_graph = df.loc[
    df['geo_value'].eq('NORTH AMERICA')
    & df['season_number'].gt(1)
    & df['vs_title_prev'].gt(0)
    & df['medal'].isin(['Platinum', 'Gold', 'Silver'])
    & df['start_date'].gt(pd.to_datetime('2021-06-01'))
]

In [0]:
# Relative change in title viewing subs versus the previous season.
df['title_pct_change'] = df['vs_title_curr'].sub(df['vs_title_prev']).div(df['vs_title_prev'])

In [0]:
# Rank the filtered titles by season-over-season change in title viewing subs.
# df_graph was sliced from df before `title_pct_change` was added to df, and the
# slice is a copy, so the column is derived here instead of being assumed to
# exist on df_graph (which raised KeyError on a fresh top-to-bottom run).
(
    df_graph
    .assign(title_pct_change=lambda d: (d['vs_title_curr'] - d['vs_title_prev']) / d['vs_title_prev'])
    [['title_series', 'season_number', 'start_date', 'start_date_prev', 'content_source',
      'medal', 'vs_title_prev', 'vs_title_curr', 'vs_pct_prev', 'vs_pct_curr',
      'title_pct_change']]
    .sort_values(by='title_pct_change', ascending=False)
)

In [0]:
# Refresh the North America slice (season 2+, Platinum/Gold/Silver, positive
# previous-season count, started after 2021-06-01) from the current df.
df_graph = df.loc[
    df['geo_value'].eq('NORTH AMERICA')
    & df['season_number'].gt(1)
    & df['vs_title_prev'].gt(0)
    & df['medal'].isin(['Platinum', 'Gold', 'Silver'])
    & df['start_date'].gt(pd.to_datetime('2021-06-01'))
]

# Current vs previous season title viewing subs, coloured by medal.
x, y = 'vs_title_curr', 'vs_title_prev'

fig, ax = plt.subplots(figsize=(7, 7))
sns.scatterplot(data=df_graph, x=x, y=y, hue='medal', ax=ax)
ax.set_xlim(0, 7500000)
ax.set_ylim(0, 7500000)

In [0]:
# Current vs previous season title viewing subs, coloured by program type.
# Axis limits are left to matplotlib's defaults.
x, y = 'vs_title_curr', 'vs_title_prev'

fig, ax = plt.subplots(figsize=(7, 7))
sns.scatterplot(data=df_graph, x=x, y=y, hue='program_type', ax=ax)

In [0]:
# Current vs previous season viewing-subs percentage, coloured by medal.
# Axis limits are left to matplotlib's defaults.
x, y = 'vs_pct_curr', 'vs_pct_prev'

fig, ax = plt.subplots(figsize=(7, 7))
sns.scatterplot(data=df_graph, x=x, y=y, hue='medal', ax=ax)

In [0]:
# Medal-coloured scatter of df_graph. This cell does not set `x`, `y` or
# `df_graph` itself; it plots whatever the preceding cells left in those names.
fig, ax = plt.subplots(figsize=(7, 7))
sns.scatterplot(data=df_graph, x=x, y=y, hue='medal', ax=ax)

In [0]:
# Broader North America slice: season 2+ with a positive previous-season count
# (no medal or start-date restriction here).
df_graph = df.loc[
    df['geo_value'].eq('NORTH AMERICA')
    & df['season_number'].gt(1)
    & df['vs_title_prev'].gt(0)
]

# Viewing-subs percentage: current season (x) against previous season (y).
x, y = 'vs_pct_curr', 'vs_pct_prev'

fig, ax = plt.subplots(figsize=(7, 3))
sns.scatterplot(data=df_graph, x=x, y=y, hue='medal', ax=ax)
ax.set_title('Season-over-season')

# Dashed zero reference lines on both axes.
ax.axvline(x=0, linewidth=.5, color='black', ls='--')
ax.axhline(y=0, linewidth=.5, color='black', ls='--')

ax.set_xlabel('Current Season')
ax.set_ylabel('Previous Season')
fig.tight_layout()