# Run Post Greenlight Model

In [0]:
# Runtime parameters. Later cells read `sf_creds`, `database`, `input_bucket` and
# `output_bucket`, but this notebook never assigns them, so they must already exist
# in the kernel namespace before the cells below run (`schema` is assigned further down).
# NOTE(review): presumably injected by the job runner or a parameters cell — confirm.
# The commented-out values below appear to be the dev settings.
# sf_creds = 'hbo-max-content-datascience-snowflake-dev'
# database = 'max_dev'
# input_bucket = "hbo-ingest-datascience-content"
# output_bucket = "hbo-outbound-datascience-content-dev"
# schema = 'delphi'

In [0]:
# Import Packages
import sys, os, re 
import io
import pandas as pd
import numpy as np
import itertools as it
import logging
import boto3
import json
from datetime import datetime, timedelta


import lib.util_snowflake as sfk
from snowflake.connector.errors import ProgrammingError
from snowflake.connector.pandas_tools import write_pandas

import warnings
warnings.filterwarnings('ignore')

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Configure the root logger at INFO level and announce the start of the run.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
logger.info('Starting Notebook')

# Silence warnings unless warning options were supplied to the interpreter.
if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [0]:
# Reload imported modules before each cell executes, so edits to local modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Run configuration: the Snowflake schema handed to the connection, the KPI name
# passed to the model classes, and the region used to filter every input table.
schema = 'delphi'
kpi = 'viewing_subs'
geo_value = 'NORTH AMERICA'
# Yesterday's date, formatted as 'YYYY-MM-DD'.
current_date = f"{datetime.now() - timedelta(days=1):%Y-%m-%d}"

In [0]:
## Open the Snowflake connection that every query in this notebook goes through
logger.info(f'TEST: {sf_creds}')  # record which credential reference is in use
sf_credentials = sfk.SSMPSCredentials(sf_creds)
conn = sfk.SnowflakeConnector(sf_credentials)
ctx = conn.connect(database, schema)
cur = ctx.cursor()

## 1.0 Query Data

In [0]:
# train_test_scope = 'lib/dev_train_test_scope.py'
# %run $train_test_scope

In [0]:
# Training metadata for the selected region
query_metadata = (
    f"select * from {database}.content_datascience.viewingsubs_metadata_train "
    f"where geo_value='{geo_value}'"
)
logger.info(f'TEST: {query_metadata}')
metadata_feature = sfk.execute_query(query=query_metadata, ctx=ctx)

# Future schedule (rows of viewingsubs_metadata_pred) for the same region
query_schedule = (
    f"select * from {database}.content_datascience.viewingsubs_metadata_pred "
    f"where geo_value='{geo_value}'"
)
logger.info(f'TEST: {query_schedule}')
df_pred = sfk.execute_query(query=query_schedule, ctx=ctx)

# Engagement metrics (rows of viewingsubs_metrics_train) for the same region
query_metric = (
    f"select * from {database}.content_datascience.viewingsubs_metrics_train "
    f"where geo_value='{geo_value}'"
)
logger.info(f'{query_metric}')
df_metric = sfk.execute_query(query=query_metric, ctx=ctx)

### 1.1 Train Test Scope

In [0]:
from lib.dev_train_test_scope import train_test_scope

# Recompute yesterday's date so this cell does not rely on an earlier cell's value.
current_date = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')

# Build the scoping object from the run parameters and the two metadata frames,
# then run it so the datasets read below are populated.
train_test_data = train_test_scope(
    current_date, input_bucket, output_bucket,
    database, schema, geo_value, kpi,
    metadata_feature, df_pred,
)
train_test_data.run()

# Expose the resulting datasets as notebook variables.
train_dataset = train_test_data.train_dataset
score_pgl = train_test_data.score_pgl
score_pre = train_test_data.score_pre
score_post = train_test_data.score_post

## Model

## 2.0 Post-GreenLight Model

In [0]:
from lib.dev_Post_GreenLight_Model import post_greenlight_model

In [0]:
# Post_GreenLight_Model = 'lib/dev_Post_GreenLight_Model.py'
# %run $Post_GreenLight_Model

### 2.1 Pull in Data

In [0]:
# Inputs for the post-greenlight model: `score_pgl` is the frame to score,
# `train_dataset` is the frame to train on.
test_data_set = train_test_data.score_pgl
train_data_set = train_test_data.train_dataset

In [0]:
# Build the model, run its feature engineering, then cross-validate with 3 folds.
pgl_model = post_greenlight_model(
    train_data_set, test_data_set, kpi,
    input_bucket, output_bucket, geo_value,
    database, schema, df_metric,
)
pgl_model.feature_engineer()
input_train, input_test = pgl_model.train_data, pgl_model.test_data
pgl_model.cv(NUM_FOLD=3)

# Kept as notebook variables so the validation predictions and the feature
# importances can be inspected interactively.
validation_set = pgl_model.validation_set
feature_importances = pgl_model.feature_importances

In [0]:
# Run the model's scoring step, then read the predictions it exposes as
# `prediction_set`; the formatting cells below work from this frame.
pgl_model.scoring()
prediction_set_post_gl = pgl_model.prediction_set

### 2.2 Format Post-GreenLight Predictions and Write Them to Delphi

In [0]:
# Constant columns required by the Delphi prediction tables. Every prediction row
# gets the same values, so they are built as a one-row frame and joined onto the
# predictions through a temporary `key` column (one row x N rows -> N rows).
data = {
    'model_name': ['pct_viewing_subs_2.0'],
    'table_name': ['pct_viewing_subs_postgl'],
    'model_version': ['2.0'],
    'sub_type': ['Max Retail+Wholesale'],
    'sub_plan': ['Platform'],
    'unit': ['percent'],
    'region': [geo_value],
    'days_after_premiere': [28],
    'publish_date': [current_date],
    'key': [1]
}
df_delphi = pd.DataFrame(data=data, columns=data.keys())

# `assign` adds the join key on a copy, so the model's own prediction frame is
# left untouched and this cell can be re-run safely.
# The key is dropped by name: the positional form drop('key', 1) is rejected by
# pandas >= 2.0, where `axis` is keyword-only.
df_delphi_postgl = (
    df_delphi
    .merge(prediction_set_post_gl.assign(key=1), on='key')
    .drop(columns='key')
    .reset_index(drop=True)
)

# Attach the identifier columns from the scoring set. The inner join keeps only
# predictions that have a matching (title_season, title_series, season_number) row.
id_columns = ['delphi_id', 'ckg_match_id', 'ckg_series_id', 'imdb_series_id',
              'title_season', 'title_series', 'season_number']
df_delphi_postgl = df_delphi_postgl.merge(
    test_data_set[id_columns],
    on=['title_season', 'title_series', 'season_number'],
    how='inner',
)

In [0]:
# Map the source column names onto the names used by the Delphi table.
rename_set = dict(
    ckg_series_id='title_id',
    imdb_series_id='imdb_id',
    title_series='title_name',
    derived_genre='category',
    first_release_date='premiere_date',
)
df_delphi_postgl = df_delphi_postgl.rename(columns=rename_set)


In [0]:
# Parse the date columns so that day arithmetic can be done on them.
df_delphi_postgl['premiere_date'] = pd.to_datetime(df_delphi_postgl['premiere_date'])
df_delphi_postgl['publish_date'] = pd.to_datetime(df_delphi_postgl['publish_date'])

# Whole days elapsed between the premiere and the publish date.
df_delphi_postgl['current_days_from_premiere'] = (
    df_delphi_postgl['publish_date'] - df_delphi_postgl['premiere_date']
).dt.days

# target_date = premiere_date + days_after_premiere. The day counts are converted
# to timedeltas in one vectorized call instead of building a DateOffset object
# per row, which forces a slow object-dtype addition.
df_delphi_postgl['target_date'] = (
    df_delphi_postgl['premiere_date']
    + pd.to_timedelta(df_delphi_postgl['days_after_premiere'], unit='D')
)

# Back to 'YYYY-MM-DD' strings to stay compatible with the sfk package.
# Formatting is kept per value so a missing date still raises here rather than
# being written out silently.
for date_column in ('publish_date', 'target_date', 'premiere_date'):
    df_delphi_postgl[date_column] = df_delphi_postgl[date_column].apply(
        lambda value: value.strftime('%Y-%m-%d')
    )


In [0]:
# Remove the identifier columns that are not exported with the predictions.
df_delphi_postgl = df_delphi_postgl.drop(['ckg_match_id', 'title_id'], axis='columns')

In [0]:
# sfk_lib = 'lib/util_snowflake.py'
# %run $sfk_lib

In [0]:
# Export the formatted predictions to "<table_name>_staging" in the delphi schema.
# The base table name is taken from the first distinct value of the `table_name` column.
table_name = '{}_staging'.format(df_delphi_postgl['table_name'].unique()[0])
sfk.export_dataframe_to_table(
    database=database,
    schema='delphi',
    df=df_delphi_postgl,
    table=table_name,
    ctx=ctx,
)

## Clean Up

In [0]:
# # # ## Create table to post predictions into
# drop_table = f"""drop table if exists {database}.delphi.pct_viewing_subs_postgl_staging"""
# logger.info(f'TEST: {drop_table}')
# sfk.execute_query(query = drop_table, ctx=ctx)

# create_table = f"""
# create table if not exists {database}.delphi.pct_viewing_subs_postgl_staging (
#     model_name varchar, table_name varchar, model_version float,
#     sub_type varchar, sub_plan string, unit string, region string,
#     days_after_premiere number(38,0), publish_date date, title_season string,
#     title_name varchar, season_number varchar, premiere_date date, observed_medal_num float,
#     prequel_featured_count float, prequel_count float, category string, lifecycle string,
#     prediction float, delphi_id string, 
#     imdb_id varchar, current_days_from_premiere number(38,0), target_date date);
# """
# sfk.execute_query(query = create_table, ctx=ctx)

In [0]:
# df_delphi_postgl = df_delphi_postgl.astype({
#     'model_name':'string', 'table_name':'string', 'model_version':'string', 
#     'sub_type':'string', 'sub_plan':'string','unit':'string', 'region':'string', 
#     'days_after_premiere':'int', 'title_season':'string', 'title_name':'string', 
#     'season_number':'string', 
#     'category':'string', 'lifecycle':'string', 
#     'delphi_id':'string', 'ckg_match_id':'string', 'title_id':'string', 'imdb_id':'string',
# })

In [0]:
# table_name = 'max_dev.delphi.pct_viewing_subs_postgl_staging'
# sfk.execute_query(query=f'describe table max_dev.delphi.pct_viewing_subs_postgl_staging', ctx=ctx)