In [5]:
##### nodejs:  https://docs.aws.amazon.com/sdk-for-javascript/v2/developer-guide/setting-up-node-on-ec2-instance.html

# !pip install "jupyterlab>=3" "ipywidgets>=7.6"
# !pip install jupyter-dash
# !jupyter lab build

# !pip install snowflake --user
# !pip install snowflake-connector-python --userqr4
# !pip install category_encoders
# !pip install xgboost
# !pip install fuzzywuzzy --user
# !pip install lightgbm --user


import os
import sys
path=!pwd
sys.path.append(os.path.join(path[0], '..'))
sys.path.append('/home/ec2-user/SageMaker/jupyter-notebooks/')
from utils import *
import snowflake.connector
from datetime import timedelta

from category_encoders import OneHotEncoder
import xgboost as xgb
import lightgbm as lgbm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_percentage_error as MAPE
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import r2_score as r2_score
import sklearn.model_selection
from fuzzywuzzy import fuzz
from fuzzywuzzy import process


pd.options.mode.chained_assignment = None

In [14]:
class SnowflakeConnector(BaseConnector):
    def __init__(self, credentials: Credentials):
        keys = credentials.get_keys()
        self._secrets = json.loads(keys.get('SecretString', "{}"))

    def connect(self, dbname: str, schema: str = 'DEFAULT'):
        ctx = snowflake.connector.connect(
            user=self._secrets['login_name'],
            password=self._secrets['login_password'],
            account=self._secrets['account'],
            warehouse=self._secrets['warehouse'],
            database=dbname,
            schema=schema
        )

        return ctx


def run_query(querystr, ctx):
    cursor_list = ctx.execute_string(
        querystr
        )
    df = pd.DataFrame.from_records(cursor_list[-1].fetchall(), columns=[x[0] for x in cursor_list[-1].description])
    df.columns= df.columns.str.lower()
    
    return df

## Credentials
SF_CREDS = 'datascience-max-dev-sagemaker-notebooks'

## Snowflake connection 
conn=SnowflakeConnector(SSMPSCredentials(SF_CREDS))
ctx=conn.connect("MAX_PROD","DATASCIENCE_STAGE")


val_date = '2021-01-01'

### psi_past_base_assets

In [12]:
querystr = '''
--Step 4: Gather past metrics and create the basic heuristic forecast, plus median metrics tables
create or replace table max_dev.workspace.psi_past_base_assets as 
(select distinct
      a.title_id
    , coalesce(a.season_number,0) as season_number
    , a.viewable_id
    , title_name
    , first_offered_date::date as asset_max_premiere
    , end_utc_max::date as asset_max_end_dt
    , coalesce(raod.season_first_offered_date::date,raod.title_first_offered_date::date) as season_premiere
    , asset_run_time
    , a.content_category
    , episode_number_in_season
    , content_source
    , program_type
    , category
    , tier
    , viewership_start_date as effective_start_date
    , viewership_end_date as effective_end_date
from max_prod.catalog.reporting_asset_dim a
join max_prod.catalog.reporting_asset_offering_dim raod
    on a.viewable_id = raod.viewable_id
    and brand = 'HBO MAX'
    and territory = 'HBO MAX DOMESTIC'
    and channel = 'HBO MAX SUBSCRIPTION'
inner join max_prod.content_analytics.psi_past_title_metadata b
    on a.title_id = b.viewership_title_id
    and coalesce(a.season_number,0) = coalesce(b.viewership_season_number,0)
where 1 = 1
and asset_type IN ('FEATURE','ELEMENT')
and start_utc_max is not null
and a.content_category in ('movies','series','special')
and coalesce(raod.season_first_offered_date,raod.title_first_offered_date)  >= '2020-05-27 07:01:00.000'
order by season_premiere, title_name 
);
select * from max_dev.workspace.psi_past_base_assets;
'''

df = run_query(querystr, ctx)


### training_first_views

In [15]:
querystr='''
set val_date = to_date(convert_timezone('America/Los_Angeles', {val_date}));
--Step 4: Gather past metrics and create the basic heuristic forecast, plus median metrics tables
create or replace table max_dev.workspace.psi_past_base_full as (
with fv as (
    select
          b.title_id
        , b.title_name
        , b.season_number
        , b.content_category
        , b.category
        , tier
        , request_time_gmt::date as request_date
        , count(distinct concat(hbo_uuid, subscription_id)) as first_views
    from MAX_PROD.BI_ANALYTICS.SUBSCRIPTION_FIRST_CONTENT_WATCHED a
    inner join max_dev.workspace.psi_past_base_assets b
        on a.viewable_id = b.viewable_id
        --and request_time_gmt::date between season_premiere_date and dateadd('day',90,season_premiere_date)
        --and season_premiere_date >= '2020-05-27 07:00:01'
    where 1 = 1
        and request_time_gmt::date between asset_max_premiere and asset_max_end_dt
        and request_time_gmt::date between effective_start_date and effective_end_date
        and request_time_gmt::date < dateadd('days',-1,$val_date)
        and country_iso_code in ('US','PR','GU')
    group by 1,2,3,4,5,6,7
    --order by 2,4
)
, hv as (
    select
          b.title_id
        , b.title_name
        , b.season_number
        , b.content_category
        , b.category
        , tier
        , request_time_gmt::date as request_date
        , coalesce(round(sum(stream_elapsed_play_seconds)/3600,3), 0) as hours_viewed
    from max_prod.viewership.max_user_stream_heartbeat a
    inner join max_dev.workspace.psi_past_base_assets b
        on a.viewable_id = b.viewable_id
    where 1 = 1
    and stream_elapsed_play_seconds >= 120
    and request_time_gmt > '2020-05-27 07:00:00'
    and request_time_gmt::date between asset_max_premiere and asset_max_end_dt
    and request_time_gmt::date between effective_start_date and effective_end_date
    and request_time_gmt::date < dateadd('days',-1,$val_date)
    group by 1,2,3,4,5,6,7
)
, dates as (
    select distinct
          rs.title_id
        , rs.title_name
        , rs.season_number
        , rs.content_category
        , rs.content_source
        , rs.program_type
        , rs.category
        , rs.tier
        --, rs.season_premiere
        , rs.effective_start_date
        , request_date
        , case when request_date::date = effective_start_date::date then 1 else 0 end as premiere_ind
        , count(distinct case when request_date::date = asset_max_premiere::date then viewable_id else null end) as asset_premiere_count
        , round(sum(distinct case when request_date::date = asset_max_premiere::date then asset_run_time else 0 end)/3600,3) as premiering_hours_runtime
    from max_dev.workspace.psi_past_base_assets rs
    cross join (
        select distinct seq_date as request_date 
        from max_prod.staging.date_range 
        where seq_date < '2024-12-31'::date
    ) rd
    where rd.request_date between 
    coalesce(rs.effective_start_date,rs.season_premiere,rs.asset_max_premiere) 
    and dateadd('days',90,coalesce(rs.effective_start_date,rs.season_premiere,rs.asset_max_premiere))
      and rd.request_date between effective_start_date and effective_end_date
    group by 1,2,3,4,5,6,7,8,9,10,11
    order by 2,3,8
)
    select dt.*
        , coalesce(first_views,0) as first_views
        , coalesce(hours_viewed,0) as hours_viewed
        , dt.request_date - effective_start_date as days_since_premiere
        , $val_date - effective_start_date -1 as days_on_platform
        , case when $val_date - effective_start_date - 1 >=
            case when dt.category = 'Popcorn' and year(effective_start_date) < 2022 then 31 else 90 end
        then 1 else 0 end as finished_window_flag
    from dates dt
    left join hv
        on dt.title_id = hv.title_id
        and dt.season_number = hv.season_number
        and dt.request_date = hv.request_date
        and dt.content_category = hv.content_category
        and dt.category = hv.category
        and dt.tier = hv.tier
    left join fv
        on dt.title_id = fv.title_id
        and dt.season_number = fv.season_number
        and dt.request_date = fv.request_date
        and dt.content_category = fv.content_category
        and dt.category = fv.category
        and dt.tier = fv.tier
    where 1 = 1
    --and dt.title_name like 'In Treatment'
    order by title_id, title_name, season_number, category, request_date
);
select
* 
from max_dev.workspace.psi_past_base_full
'''.format(val_date = val_date)

df = run_query(querystr, ctx)
df.to_csv('s3://hbo-ingest-datascience-content-dev/psi_firstviews/fv_actual_{}.csv'.format(val_date))


ProgrammingError: 001044 (42P13): SQL compilation error: error line 1 at position 23
Invalid argument types for function 'CONVERT_TIMEZONE': (VARCHAR(19), NUMBER(6,0))

### training_features

In [None]:

## Credentials
SF_CREDS = 'datascience-max-dev-sagemaker-notebooks'

## Snowflake connection 
conn=SnowflakeConnector(SSMPSCredentials(SF_CREDS))
ctx=conn.connect("MAX_PROD","DATASCIENCE_STAGE")


class SnowflakeConnector(BaseConnector):
    def __init__(self, credentials: Credentials):
        keys = credentials.get_keys()
        self._secrets = json.loads(keys.get('SecretString', "{}"))

    def connect(self, dbname: str, schema: str = 'DEFAULT'):
        ctx = snowflake.connector.connect(
            user=self._secrets['login_name'],
            password=self._secrets['login_password'],
            account=self._secrets['account'],
            warehouse=self._secrets['warehouse'],
            database=dbname,
            schema=schema
        )

        return ctx


def run_query(self, query, ctx):
    conn=SnowflakeConnector(SSMPSCredentials(SF_CREDS))
    ctx=conn.connect(dbname,schema)
    
#     cursor = ctx.cursor()
#     cursor.execute(query)
#     df = pd.DataFrame(cursor.fetchall(), columns = [desc[0] for desc in cursor.description])
#     df.columns= df.columns.str.lower()
    return df

# print ('Create Table: ' + table_name)
#         self.run_query('''
#         create or replace table {table_name}(
#         match_id varchar,
#         title varchar,
#         title_id varchar, 
#         available_date varchar,
#         originals_type varchar,
#         content_category varchar,
#         prediction_start_date varchar,
#         real_date varchar,
#         prediction_start_day int,
#         days_after_launch int,
#         actuals float,
#         prediction float
#         )
#         '''.format(table_name = table_name), dbname, schema)

In [7]:
querystr='''
select distinct
      pba.title_id
    , coalesce(pba.season_number,0) as season_number
    , pba.viewable_id
    , pba.title_name
    , pba.content_category
    , pba.program_type
    , pba.category
    , pba.tier
    , pba.effective_start_date
    , pba.effective_end_date
    , coalesce(ivm.imdb_id, ivm.imdb_series_id) as imdb_imdb_series_id
    , imc.reference_type
    , itr.original_title as reference_title
    , itr.title_id as reference_title_id
    , itr.title_type as reference_title_type
    , imcr.reference_type as reference_reference_type
    , itrr.title_id as reference_reference_title_id
from max_dev.workspace.psi_past_base_assets pba
left join max_prod.editorial.imdb_viewable_map ivm
    on pba.title_id = coalesce(ivm.viewable_id, ivm.viewable_series_id) 
left join enterprise_data.catalog.imdb_title it 
    on coalesce(ivm.imdb_id, ivm.imdb_series_id) = it.title_id
left join enterprise_data.catalog.imdb_movie_connection imc 
    on it.title_id = imc.title_id
left join enterprise_data.catalog.imdb_title itr 
    on itr.title_id = imc.reference_title_id
left join enterprise_data.catalog.imdb_movie_connection imcr
    on itr.title_id = imcr.title_id
    and imcr.reference_type in ('featured_in')
left join enterprise_data.catalog.imdb_title itrr 
    on itrr.title_id = imcr.reference_title_id
where 1 = 1
  and imc.reference_type in ('follows','spin_off_from','remake_of','version_of','featured_in')
order by effective_start_date, title_name
;
'''

cursor_list = ctx.execute_string(
    querystr
    )
df = pd.DataFrame.from_records(cursor_list[-1].fetchall(), columns=[x[0] for x in cursor_list[-1].description])
df.columns= df.columns.str.lower()
display(df.head())
df.to_csv('s3://hbo-ingest-datascience-content-dev/psi_firstviews/prod_df_imdb_2021-12-13.csv')

Unnamed: 0,title_id,season_number,viewable_id,title_name,content_category,program_type,category,tier,effective_start_date,effective_end_date,imdb_imdb_series_id,reference_type,reference_title,reference_title_id,reference_title_type,reference_reference_type,reference_reference_title_id
0,GXsRXGAJrtsPDFwEAAAAC,1,GXsRY2gdA0jijwwEAAAAH,Legendary,series,original,Unscripted Series,2,2020-05-27,2036-02-01,tt11048090,featured_in,Top 10 HBO Max Shows You Should Be Watching,tt15209946,tvEpisode,,
1,GXsRXGAJrtsPDFwEAAAAC,1,GXvTLLwb_h8JZjgEAAALK,Legendary,series,original,Unscripted Series,2,2020-05-27,2036-02-01,tt11048090,featured_in,Top 10 HBO Max Shows You Should Be Watching,tt15209946,tvEpisode,,
2,GXsRXGAJrtsPDFwEAAAAC,1,GXsRXmgTOd5hbJQEAAAAD,Legendary,series,original,Unscripted Series,2,2020-05-27,2036-02-01,tt11048090,featured_in,Top 10 HBO Max Shows You Should Be Watching,tt15209946,tvEpisode,,
3,GXsRXGAJrtsPDFwEAAAAC,1,GXs1DUw0VFbuKjQEAAAAC,Legendary,series,original,Unscripted Series,2,2020-05-27,2036-02-01,tt11048090,featured_in,Top 10 HBO Max Shows You Should Be Watching,tt15209946,tvEpisode,,
4,GXsRXGAJrtsPDFwEAAAAC,1,GXvzCQg0DQMPDwgEAAAGI,Legendary,series,original,Unscripted Series,2,2020-05-27,2036-02-01,tt11048090,featured_in,Top 10 HBO Max Shows You Should Be Watching,tt15209946,tvEpisode,,


### predict data pull

In [2]:
querystr='''
select distinct
    fp.title as title_name
    , ft.imdb_title_id as imdb_imdb_series_id
    , fp.season as season_number
    , fp.tier
    , fp.category
    , ft.content_category
    , fp.premiere_date as effective_start_date
    , fp.schedule_label
    , it.original_title as imdb_title_name
    , it.number_of_votes as n_votes
    , imc.reference_type
    , itr.original_title as reference_title
    , itr.title_id as reference_title_id
    , itr.title_type as reference_title_type
    , itr.number_of_votes as reference_n_votes
    , imcr.reference_type as reference_reference_type
    , itrr.title_id as reference_reference_title_id
from max_prod.content_analytics.daily_future_programming_schedule fp
left join max_dev.workspace.future_title_imdb_map ft
    on fp.title = ft.title_name
left join enterprise_data.catalog.imdb_title it 
    on ft.imdb_title_id = it.title_id
left join enterprise_data.catalog.imdb_movie_connection imc 
    on it.title_id = imc.title_id
    and imc.reference_type in ('follows','spin_off_from','remake_of','version_of','featured_in')
left join enterprise_data.catalog.imdb_title itr 
    on itr.title_id = imc.reference_title_id
left join enterprise_data.catalog.imdb_movie_connection imcr
    on itr.title_id = imcr.title_id
    and imcr.reference_type in ('featured_in')
left join enterprise_data.catalog.imdb_title itrr 
    on itrr.title_id = imcr.reference_title_id
order by effective_start_date, title_name
;
'''

cursor_list = ctx.execute_string(
    querystr
    )
df = pd.DataFrame.from_records(cursor_list[-1].fetchall(), columns=[x[0] for x in cursor_list[-1].description])
df.columns= df.columns.str.lower()
display(df.head())
df['title_id'] = 0
df.to_csv('s3://hbo-ingest-datascience-content-dev/psi_firstviews/prod_df_pred_2021-12-13.csv')


Unnamed: 0,title_name,imdb_imdb_series_id,season_number,tier,category,content_category,effective_start_date,schedule_label,imdb_title_name,n_votes,reference_type,reference_title,reference_title_id,reference_title_type,reference_n_votes,reference_reference_type,reference_reference_title_id
0,Curb Your Enthusiasm,tt0264235,11,2,Scripted Comedy Series,series,2021-11-01,beta,Curb Your Enthusiasm,116658,featured_in,Uncensored Comedy: That's Not Funny!,tt0371359,tvMovie,50,,
1,Curb Your Enthusiasm,tt0264235,11,2,Scripted Comedy Series,series,2021-11-01,beta,Curb Your Enthusiasm,116658,featured_in,The 54th Annual Primetime Emmy Awards,tt0313160,tvSpecial,394,,
2,Curb Your Enthusiasm,tt0264235,11,2,Scripted Comedy Series,series,2021-11-01,beta,Curb Your Enthusiasm,116658,featured_in,Top 10 Best TV Shows This Fall,tt9261650,tvEpisode,0,,
3,Curb Your Enthusiasm,tt0264235,11,2,Scripted Comedy Series,series,2021-11-01,beta,Curb Your Enthusiasm,116658,featured_in,Screenwipe USA,tt0937503,tvEpisode,28,featured_in,tt1135730
4,Curb Your Enthusiasm,tt0264235,11,2,Scripted Comedy Series,series,2021-11-01,beta,Curb Your Enthusiasm,116658,featured_in,The 62nd Primetime Emmy Awards,tt1703245,tvSpecial,198,featured_in,tt1718060
