In [2]:
##nodejs:  https://docs.aws.amazon.com/sdk-for-javascript/v2/developer-guide/setting-up-node-on-ec2-instance.html

# !pip install "jupyterlab>=3" "ipywidgets>=7.6" --user
# !pip install jupyter-dash --user
# !jupyter lab build --user

# !pip install snowflake --user
# !pip install snowflake-connector-python --user
import os
import sys

# Make the parent directory importable so the shared `utils` module resolves.
# os.getcwd() replaces the IPython-only `!pwd` shell capture, so this cell no
# longer depends on a shell being available. The value is kept in a
# one-element list so that `path[0]` keeps working exactly as before.
path = [os.getcwd()]
_parent_dir = os.path.join(path[0], '..')
if _parent_dir not in sys.path:  # avoid duplicate entries when the cell is re-run
    sys.path.append(_parent_dir)

# NOTE(review): the star import presumably supplies BaseConnector, Credentials,
# SSMPSCredentials, json, pd and datetime, which later cells use without
# importing them — confirm against utils before replacing it with explicit imports.
from utils import *
import snowflake.connector

class SnowflakeConnector(BaseConnector):
    """Opens Snowflake connections using secrets obtained from a credentials object."""

    def __init__(self, credentials: Credentials):
        """Load and parse the JSON secret payload.

        The payload is read from the 'SecretString' entry of
        ``credentials.get_keys()``; a missing entry is treated as an empty
        JSON object.
        """
        secret_payload = credentials.get_keys().get('SecretString', "{}")
        self._secrets = json.loads(secret_payload)

    def connect(self, dbname: str, schema: str = 'DEFAULT'):
        """Return a new Snowflake connection bound to ``dbname`` and ``schema``.

        User, password, account and warehouse come from the parsed secrets;
        a missing secret key raises ``KeyError``.
        """
        secrets = self._secrets
        # Secret lookups are kept in the same order as the connect() arguments.
        connection_args = {
            'user': secrets['login_name'],
            'password': secrets['login_password'],
            'account': secrets['account'],
            'warehouse': secrets['warehouse'],
            'database': dbname,
            'schema': schema,
        }
        return snowflake.connector.connect(**connection_args)

def run_query(querystr, ctx):
    """Execute SQL text and return the final statement's rows as a DataFrame.

    Args:
        querystr: SQL text handed to ``ctx.execute_string``. It may hold
            several statements; only the last one's result is returned.
        ctx: Connection-like object whose ``execute_string`` returns a
            sequence of cursors.

    Returns:
        pandas.DataFrame with every row fetched from the last cursor and
        lower-cased column names.

    Raises:
        ValueError: If ``execute_string`` returned no cursors, so there is
            no result to read.
    """
    cursor_list = ctx.execute_string(querystr)
    try:
        if not cursor_list:
            raise ValueError("run_query: the query string produced no statements to execute")

        last_cursor = cursor_list[-1]
        rows = last_cursor.fetchall()
        # The first element of each description entry is the column name.
        column_names = [column[0] for column in last_cursor.description]

        df = pd.DataFrame.from_records(rows, columns=column_names)
        df.columns = df.columns.str.lower()
        return df
    finally:
        # Release every cursor (including the intermediate ones) once the rows
        # have been materialised, so repeated calls do not accumulate open cursors.
        for cursor in cursor_list or ():
            cursor.close()
# Identifier handed to SSMPSCredentials (presumably the name of the stored
# secret holding the Snowflake login — confirm).
SF_CREDS = 'datascience-max-dev-sagemaker-notebooks'

# Build the connector from those credentials, open a session on
# MAX_PROD / DATASCIENCE_STAGE, and keep a cursor on that session.
conn = SnowflakeConnector(SSMPSCredentials(SF_CREDS))
ctx = conn.connect(dbname="MAX_PROD", schema="DATASCIENCE_STAGE")
cur = ctx.cursor()

## Query for extrapolated prediction

In [3]:
# For each validation date (YYYYMMDD), rebuild the Snowflake working tables and
# export two CSV snapshots to S3: the training first-views frame (fv_train_*)
# and the extrapolated daily viewership frame (fv_ext_*).
for val_date in ['20210801', '20210901']:
    # Dashed form (YYYY-MM-DD) of the validation date, used only in the output file names.
    val_date_file = datetime.strptime(val_date, '%Y%m%d').strftime('%Y-%m-%d')

    ### Base assets
    # Rebuilds max_dev.workspace.psi_past_base_assets_temp and selects it back.
    # This string is not passed through .format(), so it is identical on every
    # loop iteration.
    querystr_base_assets = '''
    --Step 4: Gather past metrics and create the basic heuristic forecast, plus median metrics tables
    create or replace table max_dev.workspace.psi_past_base_assets_temp as 
    (select distinct
          a.title_id
        , coalesce(a.season_number,0) as season_number
        , a.viewable_id
        , title_name
        , first_offered_date::date as asset_max_premiere
        , end_utc_max::date as asset_max_end_dt
        , coalesce(raod.season_first_offered_date::date,raod.title_first_offered_date::date) as season_premiere
        , asset_run_time
        , a.content_category
        , episode_number_in_season
        , content_source
        , program_type
        , category
        , tier
        , viewership_start_date as effective_start_date
        , viewership_end_date as effective_end_date
    from max_prod.catalog.reporting_asset_dim a
    join max_prod.catalog.reporting_asset_offering_dim raod
        on a.viewable_id = raod.viewable_id
        and brand = 'HBO MAX'
        and territory = 'HBO MAX DOMESTIC'
        and channel = 'HBO MAX SUBSCRIPTION'
    inner join max_prod.content_analytics.psi_past_title_metadata b
        on a.title_id = b.viewership_title_id
        and coalesce(a.season_number,0) = coalesce(b.viewership_season_number,0)
    where 1 = 1
    and asset_type IN ('FEATURE','ELEMENT')
    and start_utc_max is not null
    and a.content_category in ('movies','series','special')
    and coalesce(raod.season_first_offered_date,raod.title_first_offered_date)  >= '2020-05-27 07:01:00.000'
    order by season_premiere, title_name 
    );
    select * from max_dev.workspace.psi_past_base_assets_temp;
    '''


    ### Train fv
    # Sets the Snowflake session variable val_date from the loop value, rebuilds
    # max_dev.workspace.psi_past_base_temp (which reads the base-assets table
    # created by the previous query) and selects it back. The table holds one row
    # per title/season and request_date, covering up to 90 days after the
    # effective start date, with the observed first_views (0 when none matched).
    querystr_train_fv=''' 
    set val_date = date({val_date}, 'YYYYMMDD');
    --Step 4: Gather past metrics and create the basic heuristic forecast, plus median metrics tables
    create or replace table max_dev.workspace.psi_past_base_temp as (
    with fv as (
        select
              b.title_id
            , b.title_name
            , b.season_number
            , b.content_category
            , b.category
            , tier
            , request_time_gmt::date as request_date
            , count(distinct concat(hbo_uuid, subscription_id)) as first_views
        from MAX_PROD.BI_ANALYTICS.SUBSCRIPTION_FIRST_CONTENT_WATCHED a
        inner join max_dev.workspace.psi_past_base_assets_temp b
            on a.viewable_id = b.viewable_id
            --and request_time_gmt::date between season_premiere_date and dateadd('day',90,season_premiere_date)
            --and season_premiere_date >= '2020-05-27 07:00:01'
        where 1 = 1
            and request_time_gmt::date between asset_max_premiere and asset_max_end_dt
            and request_time_gmt::date between effective_start_date and effective_end_date
            and request_time_gmt::date < dateadd('days',-1,$val_date)
            and country_iso_code in ('US','PR','GU')
        group by 1,2,3,4,5,6,7
        --order by 2,4
    )
    , dates as (
        select distinct
              rs.title_id
            , rs.title_name
            , rs.season_number
            , rs.content_category
            , rs.content_source
            , rs.program_type
            , rs.category
            , rs.tier
            --, rs.season_premiere
            , rs.effective_start_date
            , request_date
            , case when request_date::date = effective_start_date::date then 1 else 0 end as premiere_ind
            , count(distinct case when request_date::date = asset_max_premiere::date then viewable_id else null end) as asset_premiere_count
            , round(sum(distinct case when request_date::date = asset_max_premiere::date then asset_run_time else 0 end)/3600,3) as premiering_hours_runtime
        from max_dev.workspace.psi_past_base_assets_temp rs
        cross join (
            select distinct seq_date as request_date 
            from max_prod.staging.date_range 
            where seq_date < '2024-12-31'::date
        ) rd
        where rd.request_date between 
        coalesce(rs.effective_start_date,rs.season_premiere,rs.asset_max_premiere) 
        and dateadd('days',90,coalesce(rs.effective_start_date,rs.season_premiere,rs.asset_max_premiere))
          and rd.request_date between effective_start_date and effective_end_date
        group by 1,2,3,4,5,6,7,8,9,10,11
        order by 2,3,8
    )
        select dt.*
            , coalesce(first_views,0) as first_views
            , dt.request_date - effective_start_date as days_since_premiere
            , $val_date - effective_start_date -1 as days_on_platform
            , case when $val_date - effective_start_date - 1 >=
                case when dt.category = 'Popcorn' and year(effective_start_date) < 2022 then 31 else 90 end
            then 1 else 0 end as finished_window_flag
        from dates dt
        left join fv
            on dt.title_id = fv.title_id
            and dt.season_number = fv.season_number
            and dt.request_date = fv.request_date
            and dt.content_category = fv.content_category
            and dt.category = fv.category
            and dt.tier = fv.tier
        where 1 = 1
        --and dt.title_name like 'In Treatment'
        order by title_id, title_name, season_number, category, request_date
    );
    select 
    * 
    from max_dev.workspace.psi_past_base_temp;
    '''.format(val_date = val_date)

    
    # Rebuilds max_dev.workspace.psi_median_decay from psi_past_base_temp: for
    # rows with finished_window_flag = 1, the median first_views_pct per category
    # and days_since_premiere, divided by that category's sum of medians.
    querystr_psi_median_decay='''
    set val_date = date({val_date}, 'YYYYMMDD');
    create or replace table max_dev.workspace.psi_median_decay as (
    with title_totals as (
        select
              title_id
            , title_name
            , season_number
            , content_category
            , category
            , tier
            , finished_window_flag
            , sum(first_views) as total_first_views
        from max_dev.workspace.psi_past_base_temp
        --where finished_window_flag = 1
        group by 1,2,3,4,5,6,7
        )
    , enriched_base as (
        select
            base.*
            , div0(first_views,total_first_views) as first_views_pct
        from max_dev.workspace.psi_past_base_temp base
        left join title_totals tt
        on base.title_id = tt.title_id
        and base.season_number = tt.season_number
        and base.content_category = tt.content_category
        and base.category = tt.category
        and base.tier = tt.tier
        and tt.finished_window_flag = 1
        where 1 = 1
        and total_first_views >= 0
        or base.finished_window_flag = 0
        )
    , median_decay_pre as (
        select
              category
            , days_since_premiere
            , median(first_views_pct) as med_first_views_pct
        from enriched_base
        where finished_window_flag = 1
        group by 1,2
        order by 1,2
        )
    , median_decay_modifier as (
        select
              category
            , sum(med_first_views_pct) as med_fv_mod
        from median_decay_pre
        group by 1
        )
    select
          a.category
        , days_since_premiere
        , med_first_views_pct/med_fv_mod as med_first_views_pct
    from median_decay_pre a
    join median_decay_modifier b
    on a.category = b.category
    );
    select 
    * 
    from max_dev.workspace.psi_median_decay;
    '''.format(val_date = val_date)

    
    # Rebuilds max_dev.workspace.psi_past_current_daily_viewership from
    # psi_past_base_temp and psi_median_decay. For titles with
    # finished_window_flag = 0, the predicted total is fv_so_far / fv_pct_so_far;
    # predicted_first_views keeps the observed first_views for request dates
    # earlier than val_date minus one day and otherwise uses
    # med_first_views_pct * predicted_total_first_views. The trailing select adds
    # schedule_label = 'past' and keeps rows whose effective_start_date is more
    # than four days before val_date.
    querystr_psi_daily_viewership='''
    set val_date = date({val_date}, 'YYYYMMDD');
    create or replace table max_dev.workspace.psi_past_current_daily_viewership as (
    with current_running_assets as (
    select
          title_id
        , title_name
        , season_number
        , content_category
        , category
        , tier
        , effective_start_date
        , max(days_since_premiere) days_so_far
        , sum(first_views) as fv_so_far
    from max_dev.workspace.psi_past_base_temp
    where 1 = 1
    and request_date < dateadd('days',-1,$val_date)
    and effective_start_date < dateadd('days',-4,$val_date)
    and finished_window_flag = 0
    group by 1,2,3,4,5,6,7
    )
    , current_running_assets_enriched as (
    select
          title_id
        , title_name
        , season_number
        , content_category
        , a.category
        , a.tier
        , effective_start_date
        , days_so_far
        , fv_so_far
        , sum(med_first_views_pct) fv_pct_so_far
    from current_running_assets a
    join max_dev.workspace.psi_median_decay b
    on case when a.category = 'Popcorn' and year(effective_start_date) >= 2022 then 'Scripted Features'
        else a. category end = b.category
    and days_since_premiere <= days_so_far
    group by 1,2,3,4,5,6,7,8,9
    )
    , current_running_assets_predicted_totals as (
    select
          title_id
        , title_name
        , season_number
        , content_category
        , category
        , tier
        , effective_start_date
        , fv_so_far/fv_pct_so_far as predicted_total_first_views
    from current_running_assets_enriched
    )
    select
          a.*
        , case when request_date < dateadd('days',-1, $val_date) then first_views
        else round(b.med_first_views_pct * c.predicted_total_first_views,0) end as predicted_first_views
    from max_dev.workspace.psi_past_base_temp a
    left join max_dev.workspace.psi_median_decay b
    on case when a.category = 'Popcorn' and year(effective_start_date) >= 2022 then 'Scripted Features'
        else a. category end = b.category
    and a.days_since_premiere = b.days_since_premiere
    left join current_running_assets_predicted_totals c
    on a.title_id = c.title_id
    and a.season_number = c.season_number
    and a.content_category = c.content_category
    and a.category = c.category
    and a.tier =  c.tier
    --where a.title_name like '%Snyder%'
    );
    select 
    *,
    'past' as schedule_label
    from max_dev.workspace.psi_past_current_daily_viewership
    where 1 = 1
    and effective_start_date < dateadd('days',-4,$val_date::date);
    '''.format(val_date = val_date)

    
    # Execution order matters: each query reads the table created by the one
    # before it. run_query returns the rows of the last statement in each string,
    # i.e. the trailing select.
    # NOTE(review): `df` is not used afterwards in this cell; the call is still
    # needed because it creates the base-assets table.
    df = run_query(querystr_base_assets, ctx)
    df_train_fv = run_query(querystr_train_fv, ctx)
    # Export the training frame to S3, keyed by the dashed validation date.
    df_train_fv.to_csv('s3://hbo-ingest-datascience-content-dev/psi_first_views/dev/fv_train_{}.csv'.format(val_date_file))

    # The median-decay table must exist before the daily-viewership query runs;
    # df_median_decay itself is not exported.
    df_median_decay= run_query(querystr_psi_median_decay, ctx)
    df_fv_ext= run_query(querystr_psi_daily_viewership, ctx)
    # Export the extrapolated daily viewership frame to S3 under the same date key.
    df_fv_ext.to_csv('s3://hbo-ingest-datascience-content-dev/psi_first_views/dev/fv_ext_{}.csv'.format(val_date_file))
     
    print(f'{val_date} saved')


20210801 saved
20210901 saved


In [4]:
# Date label used only in the name of the exported file.
val_date_file = '2022-02-11'

# Rows of psi_daily_rw_mean_forecast with finished_window_flag = 0 and
# days_on_platform > 0.
querystr='''
select 
* 
from max_prod.content_analytics.psi_daily_rw_mean_forecast 
where finished_window_flag=0 and days_on_platform>0  
'''

# Fetch the rows and write them to S3 as rwm_<date>.csv.
df_rwm = run_query(querystr, ctx)
df_rwm.to_csv(f's3://hbo-ingest-datascience-content-dev/psi_first_views/dev/rwm_{val_date_file}.csv')
