In [108]:
##nodejs:  https://docs.aws.amazon.com/sdk-for-javascript/v2/developer-guide/setting-up-node-on-ec2-instance.html

# !pip install "jupyterlab>=3" "ipywidgets>=7.6" --user
# !pip install jupyter-dash --user
# !jupyter lab build --user

# !pip install snowflake --user
# !pip install snowflake-connector-python --user
import os
import sys
path=!pwd
sys.path.append(os.path.join(path[0], '..'))
from utils import *
import snowflake.connector

class SnowflakeConnector(BaseConnector):
    def __init__(self, credentials: Credentials):
        keys = credentials.get_keys()
        self._secrets = json.loads(keys.get('SecretString', "{}"))

    def connect(self, dbname: str, schema: str = 'DEFAULT'):
        ctx = snowflake.connector.connect(
            user=self._secrets['login_name'],
            password=self._secrets['login_password'],
            account=self._secrets['account'],
            warehouse=self._secrets['warehouse'],
            database=dbname,
            schema=schema
        )

        return ctx

def run_query(querystr, ctx):
    cursor_list = ctx.execute_string(
        querystr
        )
    df = pd.DataFrame.from_records(cursor_list[-1].fetchall(), columns=[x[0] for x in cursor_list[-1].description])
    df.columns= df.columns.str.lower()
    
    return df
## Credentials
SF_CREDS = 'datascience-max-dev-sagemaker-notebooks'

## Snowflake connection 
conn=SnowflakeConnector(SSMPSCredentials(SF_CREDS))
ctx=conn.connect("MAX_PROD","DATASCIENCE_STAGE")
cur = ctx.cursor()

## Query for extrapolated prediction

In [72]:
for val_date in ['20220101']:#,'20220108','20211224','20211217','20211210']: #['20220115', '20220122', '20220201', '20220207', '20220215', 
                 #'20210801', '20210901', '20211001', '20211101', '20211201']:
    val_date_file = datetime.strptime(val_date, '%Y%m%d').strftime('%Y-%m-%d')

    ### Base assets 
    querystr_base_assets = '''
    --Step 4: Gather past metrics and create the basic heuristic forecast, plus median metrics tables
    create or replace table max_dev.workspace.psi_past_base_assets_temp as 
    (select distinct
          a.title_id
        , coalesce(a.season_number,0) as season_number
        , a.viewable_id
        , title_name
        , first_offered_date::date as asset_max_premiere
        , end_utc_max::date as asset_max_end_dt
        , coalesce(raod.season_first_offered_date::date,raod.title_first_offered_date::date) as season_premiere
        , asset_run_time
        , a.content_category
        , episode_number_in_season
        , content_source
        , program_type
        , category
        , tier
        , viewership_start_date as effective_start_date
        , viewership_end_date as effective_end_date
    from max_prod.catalog.reporting_asset_dim a
    join max_prod.catalog.reporting_asset_offering_dim raod
        on a.viewable_id = raod.viewable_id
        and brand = 'HBO MAX'
        and territory = 'HBO MAX DOMESTIC'
        and channel = 'HBO MAX SUBSCRIPTION'
    inner join max_prod.content_analytics.psi_past_title_metadata b
        on a.title_id = b.viewership_title_id
        and coalesce(a.season_number,0) = coalesce(b.viewership_season_number,0)
    where 1 = 1
    and asset_type IN ('FEATURE','ELEMENT')
    and start_utc_max is not null
    and a.content_category in ('movies','series','special')
    and coalesce(raod.season_first_offered_date,raod.title_first_offered_date)  >= '2020-05-27 07:01:00.000'
    order by season_premiere, title_name 
    );
    select * from max_dev.workspace.psi_past_base_assets_temp;
    '''

    ### Train fv 
    querystr_train_fv=''' 
    set val_date = date({val_date}, 'YYYYMMDD');
    --Step 4: Gather past metrics and create the basic heuristic forecast, plus median metrics tables
    create or replace table max_dev.workspace.psi_past_base_temp as (
    with fv as (
        select
              b.title_id
            , b.title_name
            , b.season_number
            , b.content_category
            , b.category
            , tier
            , request_time_gmt::date as request_date
            , count(distinct concat(hbo_uuid, subscription_id)) as first_views
        from MAX_PROD.BI_ANALYTICS.SUBSCRIPTION_FIRST_CONTENT_WATCHED a
        inner join max_dev.workspace.psi_past_base_assets_temp b
            on a.viewable_id = b.viewable_id
            --and request_time_gmt::date between season_premiere_date and dateadd('day',90,season_premiere_date)
            --and season_premiere_date >= '2020-05-27 07:00:01'
        where 1 = 1
            and request_time_gmt::date between asset_max_premiere and asset_max_end_dt
            and request_time_gmt::date between effective_start_date and effective_end_date
            and request_time_gmt::date < dateadd('days',-1,$val_date)
            and country_iso_code in ('US','PR','GU')
        group by 1,2,3,4,5,6,7
        --order by 2,4
    )
    , dates as (
        select distinct
              rs.title_id
            , rs.title_name
            , rs.season_number
            , rs.content_category
            , rs.content_source
            , rs.program_type
            , rs.category
            , rs.tier
            --, rs.season_premiere
            , rs.effective_start_date
            , request_date
            , case when request_date::date = effective_start_date::date then 1 else 0 end as premiere_ind
            , count(distinct case when request_date::date = asset_max_premiere::date then viewable_id else null end) as asset_premiere_count
            , round(sum(distinct case when request_date::date = asset_max_premiere::date then asset_run_time else 0 end)/3600,3) as premiering_hours_runtime
        from max_dev.workspace.psi_past_base_assets_temp rs
        cross join (
            select distinct seq_date as request_date 
            from max_prod.staging.date_range 
            where seq_date < '2024-12-31'::date
        ) rd
        where rd.request_date between 
        coalesce(rs.effective_start_date,rs.season_premiere,rs.asset_max_premiere) 
        and dateadd('days',90,coalesce(rs.effective_start_date,rs.season_premiere,rs.asset_max_premiere))
          and rd.request_date between effective_start_date and effective_end_date
        group by 1,2,3,4,5,6,7,8,9,10,11
        order by 2,3,8
    )
        select dt.*
            , coalesce(first_views,0) as first_views
            , dt.request_date - effective_start_date as days_since_premiere
            , $val_date - effective_start_date -1 as days_on_platform
            , case when $val_date - effective_start_date - 1 >=
                case when dt.category = 'Popcorn' and year(effective_start_date) < 2022 then 31 else 90 end
            then 1 else 0 end as finished_window_flag
            , sum(case when days_since_premiere<=4 then first_views else 0 end) over (partition by dt.title_id, dt.season_number) as fv_4d
            , case when fv_4d>=20000 and dt.tier <=1 and dt.category in ('Scripted Drama Series','Scripted Comedy Series') then 'Top IP Scripted Series' 
              else (case when dt.tier <=1 and dt.category in ('Scripted Drama Series','Scripted Comedy Series') then 'Tier 0,1 Scripted Series'
                  else (case when dt.tier = 2 and dt.category in ('Scripted Drama Series','Scripted Comedy Series') then 'Tier 2 Scripted Series'
                      else (case when dt.tier = 3 and dt.category in ('Scripted Drama Series','Scripted Comedy Series') then 'Tier 3 Scripted Series' 
                      else dt.category end) end) end) end as decay_category
        from dates dt
        left join fv
            on dt.title_id = fv.title_id
            and dt.season_number = fv.season_number
            and dt.request_date = fv.request_date
            and dt.content_category = fv.content_category
            and dt.category = fv.category
            and dt.tier = fv.tier
        where 1 = 1
        --and dt.title_name like 'In Treatment'
        order by title_id, title_name, season_number, category, request_date
    );
    select 
    * 
    from max_dev.workspace.psi_past_base_temp;
    '''.format(val_date = val_date)

    querystr_train_fv=''' 
    set val_date = date({val_date}, 'YYYYMMDD');
    --Step 4: Gather past metrics and create the basic heuristic forecast, plus median metrics tables
    create or replace table max_dev.workspace.psi_past_base_temp as (
        select
              b.title_id
            , b.title_name
            , b.season_number
            , b.content_category
            , b.category
            , tier
            , request_time_gmt::date as request_date
            , count(distinct concat(hbo_uuid, subscription_id)) as first_views
        from MAX_PROD.BI_ANALYTICS.SUBSCRIPTION_FIRST_CONTENT_WATCHED a
        inner join max_dev.workspace.psi_past_base_assets_temp b
            on a.viewable_id = b.viewable_id
            --and request_time_gmt::date between season_premiere_date and dateadd('day',90,season_premiere_date)
            --and season_premiere_date >= '2020-05-27 07:00:01'
        where 1 = 1
            and request_time_gmt::date between asset_max_premiere and asset_max_end_dt
            and request_time_gmt::date between effective_start_date and effective_end_date
            and request_time_gmt::date < dateadd('days',-1,$val_date)
            and country_iso_code in ('US','PR','GU')
        group by 1,2,3,4,5,6,7
        --order by 2,4
        );
        select 
        * 
        from max_dev.workspace.psi_past_base_temp;
        '''.format(val_date = val_date)
    
    querystr_psi_median_decay='''
    set val_date = date({val_date}, 'YYYYMMDD');
    create or replace table max_dev.workspace.psi_median_decay as (
    with title_totals as (
        select
              title_id
            , title_name
            , season_number
            , content_category
            , tier
            , finished_window_flag
            , decay_category
            , sum(first_views) as total_first_views
        from max_dev.workspace.psi_past_base_temp
        where effective_start_date>='2021-01-01'
        group by 1,2,3,4,5,6,7
        )
    , enriched_base as (
        select
            base.*
            , div0(first_views,total_first_views) as first_views_pct
        from max_dev.workspace.psi_past_base_temp base
        left join title_totals tt
        on base.title_id = tt.title_id
        and base.season_number = tt.season_number
        and base.content_category = tt.content_category
        and base.decay_category = tt.decay_category
        and base.tier = tt.tier
        and tt.finished_window_flag = 1
        where 1 = 1
        and base.effective_start_date>='2021-01-01'
        and total_first_views >= 0
        or base.finished_window_flag = 0
        )
    , median_decay_pre as (
        select
              decay_category
            , days_since_premiere
            , median(first_views_pct) as med_first_views_pct
        from enriched_base
        where finished_window_flag = 1
        group by 1,2
        order by 1,2
        )
    , median_decay_modifier as (
        select
              decay_category
            , sum(med_first_views_pct) as med_fv_mod
        from median_decay_pre
        group by 1
        )
    select
          a.decay_category
        , days_since_premiere
        , med_first_views_pct/med_fv_mod as med_first_views_pct
    from median_decay_pre a
    join median_decay_modifier b
    on a.decay_category = b.decay_category
    );
    select 
    * 
    from max_dev.workspace.psi_median_decay;
    '''.format(val_date = val_date)

    
    querystr_psi_daily_viewership='''
    set val_date = date({val_date}, 'YYYYMMDD');
    create or replace table max_dev.workspace.psi_past_current_daily_viewership as (
    with current_running_assets as (
    select
          title_id
        , title_name
        , season_number
        , content_category
        , tier
        , effective_start_date
        , decay_category
        , max(days_since_premiere) days_so_far
        , sum(first_views) as fv_so_far
    from max_dev.workspace.psi_past_base_temp
    where 1 = 1
    and request_date < dateadd('days',-1,$val_date)
    and effective_start_date < dateadd('days',-4,$val_date)
    and finished_window_flag = 0
    group by 1,2,3,4,5,6,7
    )
    , current_running_assets_enriched as (
    select
          title_id
        , title_name
        , season_number
        , content_category
        , a.tier
        , a.decay_category
        , effective_start_date
        , days_so_far
        , fv_so_far
        , sum(med_first_views_pct) fv_pct_so_far
    from current_running_assets a
    join max_dev.workspace.psi_median_decay b
    on case when a.decay_category = 'Popcorn' and year(effective_start_date) >= 2022 then 'Scripted Features'
        else a. decay_category end = b.decay_category
    and days_since_premiere <= days_so_far
    group by 1,2,3,4,5,6,7,8,9
    )
    , current_running_assets_predicted_totals as (
    select
          title_id
        , title_name
        , season_number
        , content_category
        , tier
        , decay_category
        , effective_start_date
        , fv_so_far/fv_pct_so_far as predicted_total_first_views
    from current_running_assets_enriched
    )
    select
          a.*
        , case when request_date < dateadd('days',-1, $val_date) then first_views
        else round(b.med_first_views_pct * c.predicted_total_first_views,0) end as predicted_first_views
    from max_dev.workspace.psi_past_base_temp a
    left join max_dev.workspace.psi_median_decay b
    on case when a.decay_category = 'Popcorn' and year(effective_start_date) >= 2022 then 'Scripted Features'
        else a. decay_category end = b.decay_category
    and a.days_since_premiere = b.days_since_premiere
    left join current_running_assets_predicted_totals c
    on a.title_id = c.title_id
    and a.season_number = c.season_number
    and a.content_category = c.content_category
    and a.tier =  c.tier
    --where a.title_name like '%Snyder%'
    );
    select 
    *,
    'past' as schedule_label
    from max_dev.workspace.psi_past_current_daily_viewership
    where 1 = 1
    and effective_start_date < dateadd('days',-4,$val_date::date);
    '''.format(val_date = val_date)

    
    df = run_query(querystr_base_assets, ctx)
    df_train_fv = run_query(querystr_train_fv, ctx)
#     df_train_fv.to_csv('s3://hbo-ingest-datascience-content-dev/psi_first_views/other/fv_train_{}.csv'.format(val_date_file))

#     df_median_decay= run_query(querystr_psi_median_decay, ctx)
#     df_fv_ext= run_query(querystr_psi_daily_viewership, ctx)
#     df_fv_ext.to_csv('s3://hbo-ingest-datascience-content-dev/psi_first_views/other/fv_ext_{}_fix.csv'.format(val_date_file))
     
    print(f'{val_date} saved')


20220101 saved


In [73]:
a = '''select * from max_dev.workspace.psi_past_current_daily_viewership'''
df = run_query(a, ctx)


Unnamed: 0,season_number,tier,first_views
count,90234.0,90234.0,90234.0
mean,3.105193,2.588548,282.326196
std,9.730944,0.696053,4486.638812
min,0.0,0.0,1.0
25%,0.0,2.0,3.0
50%,1.0,3.0,11.0
75%,1.0,3.0,55.0
max,52.0,3.0,732748.0


In [4]:
val_date_file='2022-02-11'
querystr='''
select 
* 
from max_prod.content_analytics.psi_daily_rw_mean_forecast 
where finished_window_flag=0 and days_on_platform>0  
'''

df_rwm= run_query(querystr, ctx)
df_rwm.to_csv('s3://hbo-ingest-datascience-content-dev/psi_first_views/dev/rwm_{}.csv'.format(val_date_file))


Pull request 

In [126]:
ci_db = 'max_dev'
ci_schema= 'workspace'
ds = '20220201'
# querystr_pred ='''
# '''
# .format(ci_db=ci_db, ci_schema=ci_schema , ds=ds)

querystr_past ='''
set ds = date({ds}, 'YYYYMMDD');
create or replace table {ci_db}.{ci_schema}.psi_past_base_temp as (
with assets as (
select distinct
      a.title_id
    , coalesce(a.season_number,0) as season_number
    , a.viewable_id
    , title_name
    , first_offered_date::date as asset_max_premiere
    , end_utc::date as asset_max_end_dt
    , coalesce(raod.season_first_offered_date::date,raod.title_first_offered_date::date) as season_premiere
    , asset_run_time
    , a.content_category
    , episode_number_in_season
    , content_source
    , program_type
    , category
    , tier
    , viewership_start_date as effective_start_date
    , viewership_end_date as effective_end_date
from max_prod.catalog.reporting_asset_dim a
join max_prod.catalog.reporting_asset_offering_dim raod
on a.viewable_id = raod.viewable_id
and brand = 'HBO MAX'
and territory = 'HBO MAX DOMESTIC'
and channel = 'HBO MAX SUBSCRIPTION'
inner join max_prod.content_analytics.psi_past_title_metadata b
on a.title_id = b.viewership_title_id
and coalesce(a.season_number,0) = coalesce(b.viewership_season_number,0)
where 1 = 1
and title_name='Free Guy'
and asset_type IN ('FEATURE','ELEMENT')
and start_utc is not null
and a.content_category in ('movies','series','special')
and coalesce(raod.season_first_offered_date,raod.title_first_offered_date)  >= '2020-05-27 07:01:00.000'
order by season_premiere, title_name-- desc
)
, fv as (
    select
          b.title_id
        , b.title_name
        , b.season_number
        , b.content_category
        , b.category
        , tier
        , request_time_gmt::date as request_date
        , count(distinct concat(hbo_uuid, subscription_id)) as first_views
    from MAX_PROD.BI_ANALYTICS.SUBSCRIPTION_FIRST_CONTENT_WATCHED a
    inner join assets b
        on a.viewable_id = b.viewable_id
        --and request_time_gmt::date between season_premiere_date and dateadd('day',90,season_premiere_date)
        --and season_premiere_date >= '2020-05-27 07:00:01'
    where 1 = 1
    and request_time_gmt::date between asset_max_premiere and asset_max_end_dt
    and request_time_gmt::date between effective_start_date and effective_end_date
    and request_time_gmt::date < dateadd('days',-1, $ds)
    and country_iso_code in ('US','PR','GU')
    group by 1,2,3,4,5,6,7
    --order by 2,4
)
, hv as (
    select
          b.title_id
        , b.title_name
        , b.season_number
        , b.content_category
        , b.category
        , tier
        , request_time_gmt::date as request_date
        , coalesce(round(sum(stream_elapsed_play_seconds)/3600,3), 0) as hours_viewed
    from max_prod.viewership.max_user_stream_heartbeat a
    inner join assets b
        on a.viewable_id = b.viewable_id
    where 1 = 1
    and stream_elapsed_play_seconds >= 120
    and request_time_gmt > '2020-05-27 07:00:00'
    and request_time_gmt::date between asset_max_premiere and asset_max_end_dt
    and request_time_gmt::date between effective_start_date and effective_end_date
    and request_time_gmt::date < dateadd('days',-1, $ds)
    group by 1,2,3,4,5,6,7
)
, dates as (
    select distinct
          rs.title_id
        , rs.title_name
        , rs.season_number
        , rs.content_category
        , rs.content_source
        , rs.program_type
        , rs.category
        , rs.tier
        --, rs.season_premiere
        , rs.effective_start_date
        , request_date
        , case when request_date::date = effective_start_date::date then 1 else 0 end as premiere_ind
        , count(distinct case when request_date::date = asset_max_premiere::date then viewable_id else null end) as asset_premiere_count
        , round(sum(distinct case when request_date::date = asset_max_premiere::date then asset_run_time else 0 end)/3600,3) as premiering_hours_runtime
    from assets rs
    cross join (
        select distinct seq_date as request_date from max_prod.staging.date_range where seq_date < '2024-12-31'::date
    ) rd
    where rd.request_date between coalesce(rs.effective_start_date,rs.season_premiere,rs.asset_max_premiere) and dateadd('days',90,coalesce(rs.effective_start_date,rs.season_premiere,rs.asset_max_premiere))
      and rd.request_date between effective_start_date and effective_end_date
    group by 1,2,3,4,5,6,7,8,9,10,11
    order by 2,3,8
)
    select dt.*
        , coalesce(first_views,0) as first_views
        , coalesce(hours_viewed,0) as hours_viewed
        , dt.request_date::date - effective_start_date::date as days_since_premiere
        , $ds - effective_start_date::date -1 as days_on_platform    
        , case when $ds - effective_start_date::date - 1 >=
            case when dt.category = 'Popcorn' and year(effective_start_date::date) < 2022 then 31 else 90 end
        then 1 else 0 end as finished_window_flag
        , sum(case when days_since_premiere<=4 then first_views else 0 end) over (partition by dt.title_id, dt.season_number) as fv_4d
            , case when fv_4d>=20000 and dt.tier <=1 and dt.category in ('Scripted Drama Series','Scripted Comedy Series') then 'Top IP Scripted Series' 
              else (case when dt.tier <=1 and dt.category in ('Scripted Drama Series','Scripted Comedy Series') then 'Tier 0,1 Scripted Series'
                  else (case when dt.tier = 2 and dt.category in ('Scripted Drama Series','Scripted Comedy Series') then 'Tier 2 Scripted Series'
                      else (case when dt.tier = 3 and dt.category in ('Scripted Drama Series','Scripted Comedy Series') then 'Tier 3 Scripted Series' 
                      else dt.category end) end) end) end as decay_category
    from dates dt
    left join hv
    on dt.title_id = hv.title_id
    and dt.season_number = hv.season_number
    and dt.request_date = hv.request_date
    and dt.content_category = hv.content_category
    and dt.category = hv.category
    and dt.tier = hv.tier
    left join fv
    on dt.title_id = fv.title_id
    and dt.season_number = fv.season_number
    and dt.request_date = fv.request_date
    and dt.content_category = fv.content_category
    and dt.category = fv.category
    and dt.tier = fv.tier
    where 1 = 1
    --and dt.title_name like 'In Treatment'
    order by title_id, title_name, season_number, category, request_date
)
;
select 
    * 
    from max_dev.workspace.psi_past_base_temp
'''.format(ci_db=ci_db, ci_schema=ci_schema , ds=ds)


# querystr_past ='''
# create or replace table {ci_db}.{ci_schema}.psi_past_base_temp as (
# select distinct
#       a.title_id
#     , coalesce(a.season_number,0) as season_number
#     , a.viewable_id
#     , title_name
#     , first_offered_date::date as asset_max_premiere
#     , end_utc::date as asset_max_end_dt
#     , coalesce(raod.season_first_offered_date::date,raod.title_first_offered_date::date) as season_premiere
#     , asset_run_time
#     , a.content_category
#     , episode_number_in_season
#     , content_source
#     , program_type
#     , category
#     , tier
#     , viewership_start_date as effective_start_date
#     , viewership_end_date as effective_end_date
# from max_prod.catalog.reporting_asset_dim a
# join max_prod.catalog.reporting_asset_offering_dim raod
# on a.viewable_id = raod.viewable_id
# and brand = 'HBO MAX'
# and territory = 'HBO MAX DOMESTIC'
# and channel = 'HBO MAX SUBSCRIPTION'
# inner join max_prod.content_analytics.psi_past_title_metadata b
# on a.title_id = b.viewership_title_id
# and coalesce(a.season_number,0) = coalesce(b.viewership_season_number,0)
# where 1 = 1
# and asset_type IN ('FEATURE','ELEMENT')
# and start_utc is not null
# and a.content_category in ('movies','series','special')
# and coalesce(raod.season_first_offered_date,raod.title_first_offered_date)  >= '2020-05-27 07:01:00.000'
# order by season_premiere, title_name-- desc
# );
# select 
#     * 
#     from {ci_db}.{ci_schema}.psi_past_base_temp
# '''.format(ci_db=ci_db, ci_schema=ci_schema , ds=ds)

querystr_decay ='''
set ds = date({ds}, 'YYYYMMDD');
create or replace table {ci_db}.{ci_schema}.psi_median_decay as (
with title_totals as (
select
      title_id
    , title_name
    , season_number
    , content_category
    , decay_category
    , tier
    , finished_window_flag
    , sum(first_views) as total_first_views
    , sum(hours_viewed) as total_hours_viewed
from {ci_db}.{ci_schema}.psi_past_base_temp
where effective_start_date>='2021-01-01'
group by 1,2,3,4,5,6,7
)
, enriched_base as (
select
    base.*
    , div0(first_views,total_first_views) as first_views_pct
    , div0(hours_viewed,total_hours_viewed) as hours_viewed_pct
from {ci_db}.{ci_schema}.psi_past_base_temp base
left join title_totals tt
on base.title_id = tt.title_id
and base.season_number = tt.season_number
and base.content_category = tt.content_category
and base.decay_category = tt.decay_category
and base.tier = tt.tier
and tt.finished_window_flag = 1
where 1 = 1
and base.effective_start_date>='2021-01-01'
and total_first_views >= 0
or base.finished_window_flag = 0
)
, median_decay_pre as (
select
      decay_category
    , days_since_premiere
    , median(hours_viewed_pct) as med_hours_viewed_pct
    , median(first_views_pct) as med_first_views_pct
from enriched_base
where finished_window_flag = 1
group by 1,2
order by 1,2
)
, median_decay_modifier as (
select
      decay_category
    , sum(med_hours_viewed_pct) as med_hv_mod
    , sum(med_first_views_pct) as med_fv_mod
from median_decay_pre
group by 1
)
--, median_decay as (
select
      a.decay_category
    , days_since_premiere
    , med_hours_viewed_pct/med_hv_mod as med_hours_viewed_pct
    , med_first_views_pct/med_fv_mod as med_first_views_pct
from median_decay_pre a
join median_decay_modifier b
on a.decay_category = b.decay_category
)
;
select * from {ci_db}.{ci_schema}.psi_median_decay
'''.format(ci_db=ci_db, ci_schema=ci_schema , ds=ds)

# querystr_decay ='''
# create or replace table {ci_db}.{ci_schema}.psi_median_decay as (
# select
#       title_id
#     , title_name
#     , season_number
#     , content_category
#     , decay_category
#     , tier
#     , finished_window_flag
#     , sum(first_views) as total_first_views
#     , sum(hours_viewed) as total_hours_viewed
# from {ci_db}.{ci_schema}.psi_past_base_temp
# group by 1,2,3,4,5,6,7
# )
# ;
# select * from {ci_db}.{ci_schema}.psi_median_decay
# '''.format(ci_db=ci_db, ci_schema=ci_schema , ds=ds)


query_past_current='''
set ds = date({ds}, 'YYYYMMDD');
create or replace temporary table {ci_db}.{ci_schema}.psi_past_current_daily_viewership as (
with current_running_assets as (
select
      title_id
    , title_name
    , season_number
    , content_category
    , decay_category
    , tier
    , effective_start_date
    , max(days_since_premiere) days_so_far
    , sum(hours_viewed) as hv_so_far
    , sum(first_views) as fv_so_far
from {ci_db}.{ci_schema}.psi_past_base_temp
where 1 = 1
and request_date < dateadd('days',-1,$ds)
and effective_start_date < dateadd('days',-4, $ds)
and finished_window_flag = 0
group by 1,2,3,4,5,6,7
)
, current_running_assets_enriched as (
select
      title_id
    , title_name
    , season_number
    , content_category
    , a.decay_category
    , a.tier
    , effective_start_date
    , days_so_far
    , hv_so_far
    , sum(med_hours_viewed_pct) hv_pct_so_far
    , fv_so_far
    , sum(med_first_views_pct) fv_pct_so_far
from current_running_assets a
join {ci_db}.{ci_schema}.psi_median_decay b
on case when a.decay_category = 'Popcorn' and year(effective_start_date::date) >= 2022 then 'Scripted Features'
    else a. decay_category end = b.decay_category
and days_since_premiere <= days_so_far
group by 1,2,3,4,5,6,7,8,9,11
)
, current_running_assets_predicted_totals as (
select
      title_id
    , title_name
    , season_number
    , content_category
    , decay_category
    , tier
    , effective_start_date
    , hv_so_far/hv_pct_so_far as predicted_total_hours_viewed
    , fv_so_far/fv_pct_so_far as predicted_total_first_views
from current_running_assets_enriched
)
--, past_current_daily_predictions as (
select
      a.*
    , case when request_date::date < dateadd('days',-1,$ds) then first_views
    else round(b.med_first_views_pct * c.predicted_total_first_views,0) end as predicted_first_views
    , case when request_date::date < dateadd('days',-1,$ds) then hours_viewed
    else round(b.med_hours_viewed_pct * c.predicted_total_hours_viewed,3) end as predicted_hours_viewed
from {ci_db}.{ci_schema}.psi_past_base_temp a
left join {ci_db}.{ci_schema}.psi_median_decay b
on case when a.decay_category = 'Popcorn' and year(effective_start_date::date) >= 2022 then 'Scripted Features'
    else a.decay_category end = b.decay_category
and a.days_since_premiere = b.days_since_premiere
left join current_running_assets_predicted_totals c
on a.title_id = c.title_id
and a.season_number = c.season_number
and a.content_category = c.content_category
and a.decay_category = c.decay_category
and a.tier =  c.tier
--where a.title_name like '%Snyder%'
);
select * from {ci_db}.{ci_schema}.psi_past_current_daily_viewership;
'''.format(ci_db=ci_db, ci_schema=ci_schema , ds=ds)


querystr_decay_values ='''
set ds = date({ds}, 'YYYYMMDD');
create or replace table {ci_db}.{ci_schema}.psi_past_current_inferred_decay_values as (
with title_totals as (
select
      title_id
    , title_name
    , season_number
    , content_category
    , category
    , tier
    , effective_start_date
    , sum(predicted_hours_viewed) as total_hours_viewed
    , sum(predicted_first_views) as total_first_views
from {ci_db}.{ci_schema}.psi_past_current_daily_viewership
--where tier = 1 and category = 'Scripted Drama Series'
group by 1,2,3,4,5,6,7
)
, recency_rank as (
select *
    , dense_rank() over (partition by category, tier order by effective_start_date) as recency_rank
    , total_first_views*(dense_rank() over (partition by category, tier order by effective_start_date)) as weighted_first_views
    , total_hours_viewed*(dense_rank() over (partition by category, tier order by effective_start_date)) as weighted_hours_viewed
from title_totals
order by category, tier, effective_start_date
)
-- , tier_x_catg_weighted_avgs as (
select
      category
    , round(tier,0)::varchar as tier
    , sum(weighted_hours_viewed)/sum(recency_rank) as avg_hours_viewed
    , sum(weighted_first_views)/sum(recency_rank) as avg_first_views
from recency_rank
group by 1,2
order by 1,2
)
;
select * from {ci_db}.{ci_schema}.psi_past_current_inferred_decay_values
'''.format(ci_db=ci_db, ci_schema=ci_schema , ds=ds)


querystr_rwm ='''
set ds = date({ds}, 'YYYYMMDD');
create or replace table {ci_db}.{ci_schema}.psi_daily_rw_mean_forecast as (
select
    *,
    'past' as schedule_label
from {ci_db}.{ci_schema}.psi_past_current_daily_viewership
where 1 = 1
and effective_start_date < dateadd('days',-4,$ds)
union
select
      null as title_id
    , a.title as title_name
    , a.season as season_number
    , null as content_category
    , source as content_source
    , null as program_type
    , initcap(a.category) as category
    , a.tier
    --, a.season_premiere
    , a.premiere_date
    , a.seq_date
    , num_premiering_titles
    , num_episodes_released
    , num_hours_released
    , 0 as first_views
    , 0 as hours_viewed
    , seq_date::date - a.premiere_date as days_since_premiere
    , $ds - 1 - a.premiere_date as days_on_platform
    , 0 as finished_window_flag
    , 0 as fv_4d
    , case when a.tier <=1 and initcap(a.category) in ('Scripted Drama Series','Scripted Comedy Series') then 'Tier 0,1 Scripted Series' else 
    (case when a.tier = 2 and initcap(a.category) in ('Scripted Drama Series','Scripted Comedy Series') then 'Tier 2 Scripted Series' else 
     (case when a.tier = 3 and initcap(a.category) in ('Scripted Drama Series','Scripted Comedy Series') then 'Tier 3 Scripted Series' 
      else initcap(a.category) end) end) end as decay_category
    , round(c.avg_hours_viewed * b.med_hours_viewed_pct,4) as predicted_hours_viewed
    , round(c.avg_first_views * b.med_first_views_pct,0) as predicted_first_views
--     , b.predicted_first_views
--     , b.predicted_hours_viewed
    , schedule_label
from max_prod.content_analytics.daily_future_programming_schedule a
left join {ci_db}.{ci_schema}.psi_median_decay b
on case when initcap(a.category) = 'Popcorn' and year(a.premiere_date) >= 2022 then 'Scripted Features' else
    (case when a.tier <=1 and initcap(a.category) in ('Scripted Drama Series','Scripted Comedy Series') then 'Tier 0,1 Scripted Series' else 
    (case when a.tier = 2 and initcap(a.category) in ('Scripted Drama Series','Scripted Comedy Series') then 'Tier 2 Scripted Series' else 
     (case when a.tier = 3 and initcap(a.category) in ('Scripted Drama Series','Scripted Comedy Series') then 'Tier 3 Scripted Series' 
      else initcap(a.category) end) end) end) end  = b.decay_category
and seq_date - a.premiere_date = b.days_since_premiere
left join {ci_db}.{ci_schema}.psi_past_current_inferred_decay_values c
on case when initcap(a.category) = 'Popcorn' and year(a.premiere_date) >= 2022 then 'Scripted Features'
    else initcap(a. category) end = initcap(c.category)
and a.tier::varchar = c.tier::varchar
where 1 = 1
and first_window_flag = 1
and finished_window_flag = 0
and premiere_date >= dateadd('days',-4,$ds)
and concat(title,season) not in (
    select distinct concat(psi_title,a.season_number)
    from {ci_db}.{ci_schema}.psi_past_current_daily_viewership a
    join max_prod.content_analytics.psi_past_title_metadata b
    on initcap(a.title_name) = initcap(b.viewership_title)
    and coalesce(a.season_number,0) = coalesce(b.viewership_season_number,0)
    where premiere_ind = 1
    and a.effective_start_date < dateadd('days',-4,$ds)
    )
);
select * from {ci_db}.{ci_schema}.psi_daily_rw_mean_forecast'''.format(ci_db=ci_db, ci_schema=ci_schema , ds=ds)


# df_past= run_query(querystr_past, ctx) 
# print('done')
# display(df_past.describe())

# df_decay= run_query(querystr_decay, ctx) 
# print('done')
# display(df_decay.head(2))

# df_current= run_query(query_past_current, ctx) 
# print('done')
# display(df_current.head(2))

# df_decay_values= run_query(querystr_decay_values, ctx) 
# print('done')
# display(df_decay_values.head(2))

df_rwm=run_query(querystr_rwm, ctx) 
print('done')
display(df_rwm.head(2))

done


Unnamed: 0,title_id,title_name,season_number,content_category,content_source,program_type,category,tier,premiere_date,seq_date,num_premiering_titles,num_episodes_released,num_hours_released,first_views,hours_viewed,days_since_premiere,days_on_platform,finished_window_flag,fv_4d,decay_category,predicted_hours_viewed,predicted_first_views,schedule_label
0,,Endangered,0,,,,Documentary Features,3,2022-03-23,2022-05-17,0,0,0.0,0,0,55,-51,0,0,Documentary Features,989.864,5.0,alpha
1,,Minx,1,,,,Scripted Comedy Series,2,2022-03-17,2022-05-17,0,0,0.0,0,0,61,-45,0,0,Tier 2 Scripted Series,35592.7668,146.0,alpha


In [135]:
display(df_rwm = df_rwm.isnull().sum())
df_rwm = df_rwm.fillna(0)
df_rwm['premiere_date'] = pd.to_datetime(df_rwm['premiere_date'])
df_rwm['predicted_first_views'] = df_rwm['predicted_first_views'].astype(int)
a = df_rwm[['title_name','premiere_date','category','first_views','predicted_first_views']].groupby(by=['title_name','premiere_date','category']).sum()

a = a.reset_index()
a.sort_values(by='premiere_date')

Unnamed: 0,title_name,premiere_date,category,first_views,predicted_first_views
23,And Just Like That... The Documentary,2022-02-03,Documentary Features,0,42478
338,Looney Tunes,2022-02-03,Kids & Family,0,4479
422,Raised By Wolves,2022-02-03,Scripted Drama Series,0,107913
534,The Invisible Pilot,2022-02-07,Docu-Series,0,4854
71,Carole King & James Taylor: Just Call Out My Name,2022-02-10,Documentary Features,0,2702
...,...,...,...,...,...
73,Charlotte's Web,2024-12-12,Kids & Family,0,942
229,HBO 2024 TBD Doc Feature 31,2024-12-15,Documentary Features,0,1990
226,HBO 2024 TBD Doc Feature 29,2024-12-15,Documentary Features,0,1990
82,Co-Pro Series #2,2024-12-19,Kids & Family,0,789


In [131]:
df_rwm[df_rwm.title_name=='Free Guy']

Unnamed: 0,title_id,title_name,season_number,content_category,content_source,program_type,category,tier,premiere_date,seq_date,num_premiering_titles,num_episodes_released,num_hours_released,first_views,hours_viewed,days_since_premiere,days_on_platform,finished_window_flag,fv_4d,decay_category,predicted_hours_viewed,predicted_first_views,schedule_label
51,0,Free Guy,0,0,0,0,Pay1,1,2022-02-23,2022-05-17,0,0,0.0,0,0,83,-23,0,0,Pay1,13138.546,89.0,alpha
73,0,Free Guy,0,0,0,0,Pay1,1,2022-02-23,2022-05-18,0,0,0.0,0,0,84,-23,0,0,Pay1,14794.6809,83.0,alpha
169,0,Free Guy,0,0,0,0,Pay1,1,2022-02-23,2022-05-19,0,0,0.0,0,0,85,-23,0,0,Pay1,11929.8315,73.0,alpha
233,0,Free Guy,0,0,0,0,Pay1,1,2022-02-23,2022-05-20,0,0,0.0,0,0,86,-23,0,0,Pay1,11436.1326,76.0,alpha
302,0,Free Guy,0,0,0,0,Pay1,1,2022-02-23,2022-05-21,0,0,0.0,0,0,87,-23,0,0,Pay1,10400.0078,70.0,alpha
363,0,Free Guy,0,0,0,0,Pay1,1,2022-02-23,2022-05-22,0,0,0.0,0,0,88,-23,0,0,Pay1,10176.2753,62.0,alpha
458,0,Free Guy,0,0,0,0,Pay1,1,2022-02-23,2022-05-23,0,0,0.0,0,0,89,-23,0,0,Pay1,10086.4133,59.0,alpha
511,0,Free Guy,0,0,0,0,Pay1,1,2022-02-23,2022-05-24,0,0,0.0,0,0,90,-23,0,0,Pay1,11380.6355,71.0,alpha
6834,0,Free Guy,0,0,HBO,0,Pay1,1,2022-02-23,2022-02-23,1,1,2.0,0,0,0,-23,0,0,Pay1,359569.975,1414.0,alpha
6846,0,Free Guy,0,0,HBO,0,Pay1,1,2022-02-23,2022-02-24,0,0,0.0,0,0,1,-23,0,0,Pay1,234700.9456,1130.0,alpha
