# Run Pre Launch Model - Dev
https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
- try with cutoff
- try with log
- mae

In [0]:
# Environment settings for this dev run: the credential name handed to
# sfk.SSMPSCredentials, the Snowflake database, and the input/output buckets
# passed to the model classes.
sf_creds = "hbo-max-content-datascience-snowflake-dev"
database = "max_dev"
input_bucket = "hbo-ingest-datascience-content"
output_bucket = "hbo-outbound-datascience-content-dev"


In [0]:
#def smape(y, pred):
#    return 100/len(y) * np.sum(2 * np.abs(pred - y) / (np.abs(y) + np.abs(pred)))

def smape(A, F):
    """Return the symmetric mean absolute percentage error (sMAPE), in percent.

    Args:
        A: Actual values (array-like).
        F: Forecast values (array-like, same shape as ``A``).

    Returns:
        ``100 * mean(2 * |F - A| / (|A| + |F|))`` taken over the elements where
        the ratio is defined. Pairs where both values are zero evaluate to 0/0
        and are left out of the average. If no element is defined (including
        empty input), 100 is returned.
    """
    # Positional float arrays: plain lists work, and pandas inputs are compared
    # element-by-element instead of being aligned on their index labels.
    actual = np.asarray(A, dtype=float)
    forecast = np.asarray(F, dtype=float)
    # 0/0 pairs are expected and excluded below, so silence the warning.
    with np.errstate(divide="ignore", invalid="ignore"):
        ratio = 2 * np.abs(forecast - actual) / (np.abs(actual) + np.abs(forecast))
    n_defined = np.count_nonzero(~np.isnan(ratio))
    if n_defined == 0:  # nothing to average; avoid dividing by zero
        return 100
    return 100 / n_defined * np.nansum(ratio)

def evaluate_performance(y_train, y_test, y_pred, is_baseline):
    """Print regression metrics for predictions against the test targets.

    Args:
        y_train: Training targets; only read when ``is_baseline`` is true.
        y_test: True target values to score against.
        y_pred: Predictions for ``y_test``. Ignored (may be ``None``) when
            ``is_baseline`` is true.
        is_baseline: When true, score a constant prediction equal to the mean
            of ``y_train`` instead of ``y_pred``.

    Raises:
        ValueError: If ``is_baseline`` is false and ``y_pred`` is ``None``.

    Prints MAE, MAPE, sMAPE, MSE, RMSE and R^2; returns nothing.
    """
    # Imported locally so the function does not depend on another cell having
    # imported these first (mean_squared_error in particular).
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

    if is_baseline:
        print("Baseline metrics:")
        # "Learn" the mean from the training data
        mean_train = np.mean(y_train)
        # Constant prediction of that mean for every test row
        y_pred = np.ones(y_test.shape) * float(mean_train)
    elif y_pred is None:
        raise ValueError("y_pred is required when is_baseline is False")

    mae = mean_absolute_error(y_test, y_pred)
    print("MAE: {:.2f}".format(mae))
    # MAPE and sMAPE come from this notebook's own helpers (values in percent).
    mape = mean_absolute_percentage_error(y_test, y_pred)
    print("MAPE: {:.2f}".format(mape))
    smape_res = smape(y_test, y_pred)
    print("sMAPE: {:.2f}".format(smape_res))
    mse = mean_squared_error(y_test, y_pred)
    print("MSE: {:.2f}".format(mse))
    print("RMSE: {:.2f}".format(np.sqrt(mse)))
    print("R^2: {:.2f}".format(r2_score(y_test, y_pred)))
 

In [0]:

#log_wrapped_model = TransformedTargetRegressor(
#    regressor=multioutputregressor,
#    func = np.log1p, 
#    inverse_func=np.expm1
#)


In [0]:
# Score the constant mean-of-y_train baseline on the test targets; with
# is_baseline=True evaluate_performance builds its own predictions, so y_pred
# is passed as None.
evaluate_performance(
    y_train=y_train,
    y_test=y_test,
    y_pred=None,
    is_baseline=True
)

In [0]:
# NOTE(review): np.expm1 is the inverse of np.log1p, so these variables hold
# values mapped back from the log scale even though their names end in `_log`
# (assumes y_pred / y_test are log1p-scaled — confirm).
y_pred_log = np.expm1(y_pred)
y_test_log = np.expm1(y_test)

In [0]:
# log1p is elementwise, so grouping by days_on_hbo_max adds nothing; applying
# it to the column directly keeps the result on the frame's own index (a
# groupby/apply result can come back with the group key in the index and then
# fail to align on assignment).
viewingsubs_df['percentage_of_viewing_subs_log'] = np.log1p(
    viewingsubs_df['percentage_of_viewing_subs']
)

In [0]:
# Import Packages
import sys, os, re 
import io
import pandas as pd
import numpy as np
import itertools as it
import logging
import boto3
import json
from datetime import datetime, timedelta


import lib.util_snowflake as sfk
from snowflake.connector.errors import ProgrammingError
from snowflake.connector.pandas_tools import write_pandas

import warnings

# One blanket filter silences every warning for the session; no additional
# simplefilter("ignore") call is needed on top of it.
warnings.filterwarnings('ignore')

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Root logger at INFO, used by all cells below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
logger.info('Starting Notebook')

In [0]:
%load_ext autoreload
%autoreload 2
# Run parameters shared by the cells below.
# Yesterday's date as YYYY-MM-DD.
current_date = (datetime.now() - timedelta(1)).strftime('%Y-%m-%d')
kpi = 'viewing_subs_log'
geo_value = 'NORTH AMERICA'
# Schema used for the Snowflake connection and passed to the model classes.
# (The queries in this notebook spell out `content_datascience` explicitly,
# so a single assignment here is all that takes effect.)
schema = 'delphi'

In [0]:
## Run Credentials to connect to Snowflake
# Logs the value of sf_creds (the credential name configured above).
logger.info(f'TEST: {sf_creds}')
## Snowflake connection 
# `ctx` is the connection object handed to sfk.execute_query in later cells.
conn = sfk.SnowflakeConnector(sfk.SSMPSCredentials(sf_creds))
ctx= conn.connect(database, schema)
# NOTE(review): `cur` is not referenced by any other visible cell.
cur = ctx.cursor()

In [0]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error of ``y_pred``, in percent.

    Both inputs are converted to NumPy arrays, so they are compared by
    position. Zeros in ``y_true`` are not handled specially: the division
    then yields inf/nan.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = (actual - predicted) / actual
    return 100 * np.mean(np.abs(relative_error))

## 1.0 Query Data

In [0]:
# train_test_scope = 'lib/dev_train_test_scope.py'
# %run $train_test_scope

In [0]:
## 1.0.1 Read Metadata New
# Training metadata for the selected region.
query_metadata = f"""select * from {database}.content_datascience.viewingsubs_metadata_train where geo_value='{geo_value}'"""
logger.info(f'TEST: {query_metadata}')
metadata_feature = sfk.execute_query(query = query_metadata, ctx=ctx)

## 1.0.2 Read Future Schedule
# Titles to score; rows without an imdb_series_id are excluded by the query.
query_schedule = f"""select * from {database}.content_datascience.viewingsubs_metadata_pred where geo_value='{geo_value}' and imdb_series_id is not null"""
logger.info(f'TEST: {query_schedule}')
df_pred = sfk.execute_query(query = query_schedule, ctx = ctx)
# print('Loading SFK table file {}'.format(file_ref))

## 1.0.3 Read Metric Data
# query_metric = f"""select * from {database}.content_datascience.viewingsubs_metrics_train where geo_value='{geo_value}'"""
# logger.info(f'{query_metric}')
# df_metric = sfk.execute_query(query = query_metric, ctx = ctx)
# df_metric['first_release_date'] = pd.to_datetime(df_metric['first_release_date']).apply(lambda x: x.strftime('%Y-%m-%d'))

## 1.0.3 Read Metric Data
# Metric rows for the selected region, restricted by the query to
# viewing_subs_pct > 1.0.
query_metric = f"""select * from {database}.content_datascience.viewingsubs_metrics_train_test where geo_value='{geo_value}' and 
viewing_subs_pct >1.0"""
logger.info(f'{query_metric}')
df_metric = sfk.execute_query(query = query_metric, ctx = ctx)
# Normalise to 'YYYY-MM-DD' strings with the vectorised accessor (the same
# form used for the wiki data); missing dates stay missing instead of raising.
df_metric['first_release_date'] = pd.to_datetime(df_metric['first_release_date']).dt.strftime('%Y-%m-%d')

In [0]:
# Log-scale target column: log1p applied to each viewing_subs value.
df_metric['viewing_subs_log'] = df_metric['viewing_subs'].map(np.log1p)


## 1.1 Train Test Scope

In [0]:
# Train_Test_Scope = 'lib/dev_Post_GreenLight_Model.py'
# %run $Post_GreenLight_Model

In [0]:
from lib.dev_train_test_scope import train_test_scope

# Recomputed here (same expression as in the parameter cell) so this cell can
# be re-run on its own.
current_date = (datetime.now() - timedelta(1)).strftime('%Y-%m-%d')
train_test_data = train_test_scope(current_date, input_bucket, output_bucket, 
                                   database, schema, geo_value, kpi, metadata_feature, df_pred)
train_test_data.run()
# Pull the datasets produced by run() into notebook variables.
# presumably score_pgl / score_pre / score_post are the scoring sets for the
# post-greenlight, pre-launch and post-launch stages — TODO confirm in lib
train_dataset = train_test_data.train_dataset
score_pgl = train_test_data.score_pgl
score_pre = train_test_data.score_pre
score_post = train_test_data.score_post

## Model

## 2.0 Post-GreenLight Model

### 2.1 Pull in Data

In [0]:
from lib.dev_Post_GreenLight_Model import post_greenlight_model
# Name the train and test sets used by the post-greenlight model
# (train_dataset and score_pgl from the train/test scope object).
train_data_set = train_test_data.train_dataset
test_data_set = train_test_data.score_pgl


### 2.2 Run Model

In [0]:
from sklearn.metrics import mean_absolute_error
# mean_absolute_error(y_true, y_pred)

In [0]:
# Post_GreenLight_Model = 'lib/dev_Post_GreenLight_Model.py'
# %run $Post_GreenLight_Model

In [0]:
# df_new = {
#     'delphi_id':'df_test', 'ckg_match_id':'df_test', 
#     'ckg_series_id':'df_test',
#     'title_season' : 'The Hedge Knight S1',
#     'title_series' : 'The Hedge Knight',
#     'imdb_series_id': 'None',
#     'season_number' : 1,
#     'first_release_date': '2025-07-02',
#     'observed_medal': 'Gold',
#     'medal_adj':'Gold', 
#     'prequel_count' : 2.0,
#     'prequel_featured_count': 400.0,
#     'derived_genre':'Scripted Drama Series', 
#     'geo_value':'NORTH AMERICA',
#     'observed_medal_num':1.0, 
#     'medal_adj_num':1.0
# }

# test_data_set = test_data_set.append(df_new, ignore_index=True)

In [0]:
# Run Model with Cross Validation
pgl_model = post_greenlight_model(train_data_set, test_data_set, kpi, 
                                  input_bucket, output_bucket, geo_value, 
                                  database, schema, df_metric)
pgl_model.feature_engineer()
# Expose the model's train_data / test_data attributes for inspection.
input_train = pgl_model.train_data
input_test = pgl_model.test_data
pgl_model.cv(NUM_FOLD = 5)

# To review predictions and feature importances in notebook
feature_importances = pgl_model.feature_importances
validation_set = pgl_model.validation_set

In [0]:
# Keep the raw model output under `prediction_log`, then overwrite
# `prediction` and add `viewing_subs` with the expm1 (inverse of log1p) values.
# presumably the model was fit on the log1p target (kpi = 'viewing_subs_log')
# — TODO confirm
validation_set['prediction_log'] = validation_set['prediction']
validation_set['prediction'] = np.expm1(validation_set['prediction_log'])
validation_set['viewing_subs'] = np.expm1(validation_set['viewing_subs_log'])

In [0]:
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.metrics import mean_absolute_error, r2_score
def mape_score(y_true, y_pred):
    """Return the mean absolute percentage error of ``y_pred``, in percent.

    The inputs are used as given (no array conversion), so they must support
    elementwise subtraction and division, e.g. NumPy arrays or pandas Series.
    """
    abs_pct_error = np.abs(y_true - y_pred) / y_true
    return 100 * np.mean(abs_pct_error)

In [0]:
# Evaluation Scores for Absolute Value Viewing Sub Predictions - Title cutoff set to 1
from sklearn.metrics import r2_score

for fold in validation_set['fold'].drop_duplicates().to_list():
    in_fold = validation_set['fold'] == fold  # row mask, computed once per fold
    y_true = validation_set.loc[in_fold, 'viewing_subs'].copy()
    y_pred = validation_set.loc[in_fold, 'prediction'].copy()

    r2 = r2_score(y_true.to_list(), y_pred.to_list())
    mape = mape_score(y_true, y_pred)

    # Built-in round() works for both Python floats and NumPy scalars; only
    # the latter have a .round() method.
    logger.info(f'Fold {fold}: R2 {round(r2, 2)} | MAPE {round(mape, 0)}')

In [0]:
# Evaluation Scores for Log-Scale Viewing Sub Predictions
# (viewing_subs_log vs prediction_log) - Title cutoff set to 1
from sklearn.metrics import r2_score

for fold in validation_set['fold'].drop_duplicates().to_list():
    in_fold = validation_set['fold'] == fold  # row mask, computed once per fold
    y_true = validation_set.loc[in_fold, 'viewing_subs_log'].copy()
    y_pred = validation_set.loc[in_fold, 'prediction_log'].copy()

    r2 = r2_score(y_true.to_list(), y_pred.to_list())
    mape = mape_score(y_true, y_pred)

    # Built-in round() works for both Python floats and NumPy scalars; only
    # the latter have a .round() method.
    logger.info(f'Fold {fold}: R2 {round(r2, 2)} | MAPE {round(mape, 0)}')

In [0]:
# Score the test set with the post-greenlight model and keep the result.
pgl_model.scoring()
prediction_set_post_gl = pgl_model.prediction_set

In [0]:
feature_importances

In [0]:
# Rows with a negative prediction, sorted by the column named in `y`.
# NOTE(review): `data` and `y` are only assigned in the plotting cells further
# down, so this cell depends on one of those having been run first.
data[data['prediction']<0].sort_values(by=y)

In [0]:
# Compare Absolute Value Predictions with cutoff
# NOTE(review): this cell plots the *_log columns yet still divides them by
# 1e6 like the absolute-value plot — confirm the scaling is intended here.
# fig = plt.figure(figsize=(7,3))
x = f'viewing_subs_log'
y = f'prediction_log'
data = validation_set
sns.set_theme(style="whitegrid")
# One scatter facet per CV fold, coloured by observed_medal_num.
ax = sns.relplot(data=data, x=data[x]/1e6, y=data[y]/1e6, 
                     hue = 'observed_medal_num',
                     palette = 'pastel', col = 'fold'
#                 col = 'tier',
#                 row='observed_medal_num'
               )
# plt.title(f'Viewing Subs - 28 Training Data')
# Dashed reference lines at zero.
plt.axvline(x=0, linewidth=.5, color='black', ls= '--')
plt.axhline(y=0, linewidth=.5, color='black', ls= '--')

# plt.xlim(-1,70)
# plt.ylim(-1,70)
# Put the legend above the figure.
sns.move_legend(
    ax, "lower center",
    bbox_to_anchor=(.5, 1), ncol=4, title='Tiers', frameon=False,
)
plt.tight_layout()

In [0]:
# Compare Absolute Value Predictions with cutoff
# Actual vs predicted viewing subs, both divided by 1e6 for the axes.
# fig = plt.figure(figsize=(7,3))
x = f'viewing_subs'
y = f'prediction'
data = validation_set
sns.set_theme(style="whitegrid")
# One scatter facet per CV fold, coloured by observed_medal_num.
ax = sns.relplot(data=data, x=data[x]/1e6, y=data[y]/1e6, 
                     hue = 'observed_medal_num',
                     palette = 'pastel', col = 'fold'
#                 col = 'tier',
#                 row='observed_medal_num'
               )
# plt.title(f'Viewing Subs - 28 Training Data')
# Dashed reference lines at zero.
plt.axvline(x=0, linewidth=.5, color='black', ls= '--')
plt.axhline(y=0, linewidth=.5, color='black', ls= '--')

# plt.xlim(-1,70)
# plt.ylim(-1,70)
# Put the legend above the figure.
sns.move_legend(
    ax, "lower center",
    bbox_to_anchor=(.5, 1), ncol=4, title='Tiers', frameon=False,
)
plt.tight_layout()

In [0]:
# Compare Old Dataset to new dataset
# NOTE(review): as written the cell plots viewing_subs against prediction from
# validation_set (unscaled); no second dataset is involved — confirm the title.
# fig = plt.figure(figsize=(7,3))
x = f'viewing_subs'
y = f'prediction'
data = validation_set
sns.set_theme(style="whitegrid")
# One scatter facet per CV fold, coloured by observed_medal_num.
ax = sns.relplot(data=data, x=x, y=y, 
                     hue = 'observed_medal_num',
                     palette = 'pastel', col = 'fold'
#                 col = 'tier',
#                 row='observed_medal_num'
               )
# plt.title(f'Viewing Subs - 28 Training Data')
# Dashed reference lines at zero.
plt.axvline(x=0, linewidth=.5, color='black', ls= '--')
plt.axhline(y=0, linewidth=.5, color='black', ls= '--')

# plt.xlim(-1,70)
# plt.ylim(-1,70)
# Put the legend above the figure.
sns.move_legend(
    ax, "lower center",
    bbox_to_anchor=(.5, 1), ncol=4, title='Tiers', frameon=False,
)
plt.tight_layout()

In [0]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error of ``y_pred``, in percent.

    Both inputs are converted to NumPy arrays, so they are compared by
    position. Zeros in ``y_true`` are not handled specially: the division
    then yields inf/nan.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = (actual - predicted) / actual
    return 100 * np.mean(np.abs(relative_error))
# Reference: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics

In [0]:
prediction_set_post_gl[prediction_set_post_gl['observed_medal_num'].isna()]

In [0]:
prediction_set_post_gl[prediction_set_post_gl['title_series'].str.contains('The Hedge')]

In [0]:
from sklearn.metrics import r2_score
# NOTE: this rebinds validation_set to the 'Scripted Features' rows only, so
# every later cell that reads validation_set sees the filtered frame.
validation_set = validation_set[validation_set['derived_genre']=='Scripted Features'].copy()
for fold in validation_set['fold'].drop_duplicates().to_list():
    fold_rows = validation_set.loc[validation_set['fold'] == fold]  # select once per fold
    r2 = r2_score(fold_rows['viewing_subs'].to_list(),
                  fold_rows['prediction'].to_list())
    mape = mean_absolute_percentage_error(fold_rows['viewing_subs'],
                                          fold_rows['prediction'])
    # Built-in round() works for both Python floats and NumPy scalars; only
    # the latter have a .round() method.
    logger.info(f'Fold {fold}: R2 {round(r2, 2)} | MAPE {round(mape, 0)}')

In [0]:
from sklearn.metrics import r2_score
# NOTE: this rebinds validation_set to the 'Scripted Features' rows only, so
# every later cell that reads validation_set sees the filtered frame.
validation_set = validation_set[validation_set['derived_genre']=='Scripted Features'].copy()
for fold in validation_set['fold'].drop_duplicates().to_list():
    for medal in validation_set['observed_medal_num'].drop_duplicates().to_list():
        group = validation_set.loc[(validation_set['fold']==fold)&(validation_set['observed_medal_num']==medal)]
        if group.empty:
            # Not every medal occurs in every fold; r2_score rejects empty input.
            continue
        r2 = r2_score(group['viewing_subs'].to_list(),
                      group['prediction'].to_list())
        mape = mean_absolute_percentage_error(group['viewing_subs'],
                                              group['prediction'])
        # Built-in round() works for both Python floats and NumPy scalars.
        logger.info(f'Fold {fold} & Medal {medal}: R2 {round(r2, 2)} | MAPE {round(mape, 0)}')

In [0]:
from sklearn.metrics import r2_score

# R2 and MAPE per observed_medal_num value, across all folds.
for medal in validation_set['observed_medal_num'].drop_duplicates().to_list():
    medal_rows = validation_set.loc[validation_set['observed_medal_num'] == medal]  # select once
    r2 = r2_score(medal_rows['viewing_subs'].to_list(),
                  medal_rows['prediction'].to_list())
    mape = mean_absolute_percentage_error(medal_rows['viewing_subs'],
                                          medal_rows['prediction'])
    # Built-in round() works for both Python floats and NumPy scalars; only
    # the latter have a .round() method.
    logger.info(f'Medal {medal}: R2 {round(r2, 2)} | MAPE {round(mape, 0)}')

In [0]:
from sklearn.metrics import r2_score

# R2 and MAPE per CV fold.
for fold in validation_set['fold'].drop_duplicates().to_list():
    fold_rows = validation_set.loc[validation_set['fold'] == fold]  # select once per fold
    r2 = r2_score(fold_rows['viewing_subs'].to_list(),
                  fold_rows['prediction'].to_list())
    mape = mean_absolute_percentage_error(fold_rows['viewing_subs'],
                                          fold_rows['prediction'])
    # Built-in round() works for both Python floats and NumPy scalars; only
    # the latter have a .round() method.
    logger.info(f'Fold {fold}: R2 {round(r2, 2)} | MAPE {round(mape, 0)}')

In [0]:
import seaborn as sns
import matplotlib.pyplot as plt

In [0]:
# Compare Old Dataset to new dataset
# fig = plt.figure(figsize=(7,3))
x = f'viewing_subs'
y = f'prediction'
data = validation_set
sns.set_theme(style="whitegrid")
# relplot (figure-level) is required for faceting: scatterplot is axes-level
# and has no `row` parameter. Assumes validation_set has a 'tier' column —
# TODO confirm.
ax = sns.relplot(data=data, x=x, y=y,
                 hue = 'observed_medal_num',
                 palette = 'husl', row='tier')
# plt.title(f'Viewing Subs - 28 Training Data')
# Dashed reference lines at zero.
plt.axvline(x=0, linewidth=.5, color='black', ls= '--')
plt.axhline(y=0, linewidth=.5, color='black', ls= '--')

# plt.xlim(-1,70)
# plt.ylim(-1,70)
# Put the legend above the figure.
sns.move_legend(
    ax, "lower center",
    bbox_to_anchor=(.5, 1), ncol=4, title='Tiers', frameon=False,
)
plt.tight_layout()

In [0]:
prediction_set

## 3.0 Pre-Launch Model

### 3.1 Import Data

In [0]:
dev_pre_launch_model = 'lib/dev_Pre_Launch_Model.py'
%run $dev_pre_launch_model

In [0]:
# Import Datasets

# Import Wikipedia Data
# Daily page-view rows for the 38 days up to and including premiere day.
query_wiki_train = f"""select * 
from {database}.content_datascience.title_season_wiki_daily_region 
where geo_value='{geo_value}'
and days_from_premiere >= -38 and days_from_premiere <=0"""
# Log the query that is actually executed below.
logger.info(f'{query_wiki_train}')
df_wiki_train = sfk.execute_query(query = query_wiki_train, ctx = ctx)

# Import Future Wiki Data
# Same window, read from the prediction-side table.
query_wiki_pred = f"""select * 
from {database}.content_datascience.title_season_wiki_daily_pred
where geo_value='{geo_value}'
and days_from_premiere >= -38 and days_from_premiere <=0
"""
# logger.info(f'{wiki_query_pred}')
df_wiki_pred = sfk.execute_query(query = query_wiki_pred, ctx = ctx)

In [0]:
# Name the train and test sets
train_dataset = train_test_data.train_dataset
test_dataset = train_test_data.score_pgl

# Combine train and prediction wikipedia data
wiki_columns = ['title_series','title_season','season_number', 'imdb_series_id',
                'first_release_date', 'days_from_premiere', 'request_date', 'page_views']
# NOTE(review): only the train frame's first_release_date is normalised to a
# 'YYYY-MM-DD' string; confirm df_wiki_pred already uses the same format,
# otherwise the combined column mixes types.
df_wiki_train['first_release_date'] = pd.to_datetime(df_wiki_train['first_release_date']).dt.strftime('%Y-%m-%d')
df_wiki_train = df_wiki_train[wiki_columns].drop_duplicates()
df_wiki_pred = df_wiki_pred[wiki_columns].drop_duplicates()

# Tag each row with its origin before stacking the two frames.
df_wiki_train['type'] = 'train'
df_wiki_pred['type'] = 'test'
df_wiki = pd.concat([df_wiki_train, df_wiki_pred]).drop_duplicates().reset_index(drop=True)


### 3.2 Run Pre-Launch Training Model

In [0]:
# NUM_FOLD = 3
# TARGET_COL =  kpi
# test_data = pre_train_data
# CAT_COL = ['derived_genre']
# FEATURE_COLS = [ 'observed_medal_num', 'prequel_featured_count', 'prequel_count', 
#                              'season_number', 'page_view_smooth']
# META_FEATURE = ['title_season', 'title_series', 'imdb_series_id',
#                             'first_release_date', 'days_from_premiere']

# FEATURE_COLS.remove('page_view_smooth')

In [0]:
dev_pre_launch_model = 'lib/dev_Pre_Launch_Model.py'
%run $dev_pre_launch_model

In [0]:
# Build the pre-launch model, run its feature engineering, then
# cross-validate with 3 folds.
pre_model = pre_launch_model(train_dataset, test_dataset, kpi, current_date, input_bucket, output_bucket,
                            geo_value, database, df_metric, df_wiki)
pre_model.feature_engineer()
# Expose the model's train_data / test_data attributes for inspection.
pre_train_data = pre_model.train_data
pre_test_data = pre_model.test_data
pre_model.cv(NUM_FOLD = 3)

# pre_model.scoring()

In [0]:
# Score the test set with the pre-launch model and keep the result.
pre_model.scoring()
prediction_set_pre_launch = pre_model.prediction_set

In [0]:
# Spot-check the pre-launch predictions for specific titles ('The Iron Claw'
# is the spelling used by the other lookup in this notebook).
prediction_set_pre_launch[prediction_set_pre_launch['title_series'].isin(['The Iron Claw', 'Dune: Part Two', ''])]

In [0]:
from sklearn.metrics import r2_score
validation_set = pre_model.validation_set
days = validation_set.days_from_premiere.drop_duplicates().to_list()
folds = validation_set.fold.drop_duplicates().to_list()
# Per days_from_premiere value: the R2 of each CV fold.
for day in days:
    fold_score = []
    for fold in folds:
        rows = validation_set.loc[(validation_set['days_from_premiere']==day)&(validation_set['fold']==fold)]
        r2 = r2_score(rows['viewing_subs'].to_list(),
                      rows['prediction'].to_list())
        # Built-in round() works for both Python floats and NumPy scalars;
        # only the latter have a .round() method.
        fold_score.append(round(r2, 2))
    print(f'R2 for {day}: {fold_score}')

In [0]:
from sklearn.metrics import r2_score

validation_set = pre_model.validation_set
feature_importance = pre_model.feature_importances
days = validation_set.days_from_premiere.drop_duplicates().to_list()
folds = validation_set.fold.drop_duplicates().to_list()

for day in days:
    # Features whose importance exceeds 0.03 for this day, most important first.
    feature_list = feature_importance.loc[(feature_importance['feature_importance']>0.03)&
                       (feature_importance['days_from_premiere']==day)].sort_values(
    by='feature_importance', ascending=False)['features'].to_list()
    logger.info(f'Features for day: {day}')

    fold_score = []
    for fold in folds:
        rows = validation_set.loc[(validation_set['days_from_premiere']==day)&(validation_set['fold']==fold)]
        r2 = r2_score(rows['viewing_subs'].to_list(),
                      rows['prediction'].to_list())
        # Built-in round() works for both Python floats and NumPy scalars;
        # only the latter have a .round() method.
        fold_score.append(round(r2, 2))
    logger.info(f'{feature_list}')
    logger.info(f'R2 for {day}: {fold_score}')

In [0]:
# train_data = self.train_data
# TARGET_COL = self.TARGET_COL
# META_FEATURE = self.META_FEATURE
# FEATURE_COLS = self.FEATURE_COLS
# Manual per-day fit/predict outside the model class.
# NOTE(review): train_data, test_data, FEATURE_COLS and TARGET_COL must
# already exist in the notebook namespace; they are not assigned in this cell.
day_frames = []  # one scored frame per day, concatenated once after the loop
feature_importances = pd.DataFrame()
model = XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8)

for day in range(-27, -25):
    print ("run for day: " + str(day))
    train_df = train_data[train_data['days_from_premiere'] == day]
    # reset_index returns a new frame, so the slice of test_data is never
    # modified in place and the positional index lines up with `pred` below.
    test_df = test_data[test_data['days_from_premiere'] == day].reset_index(drop=True)

    X_train, X_test = train_df[FEATURE_COLS], test_df[FEATURE_COLS]
    y_train = train_df[TARGET_COL]

    model.fit(X_train, y_train)
    pred = pd.DataFrame(model.predict(X_test), columns=['prediction'])
    test_df = pd.concat([test_df, pred], axis = 1)
    day_frames.append(test_df)

prediction_set = pd.concat(day_frames, axis = 0)

# prediction_set['lifecycle'] = 'prelaunch'
# prediction_set = prediction_set[self.META_FEATURE + [
#     'home_territory_observed_medal', 'prequel_featured_count', 'prequel_count', 'season_number',
#     'wikipedia_page_view_smooth', 'prelaunch_trailer_view_smooth', 'programming_provided_genre', 
#     'lifecycle', 'prediction']].sort_values(by = ['first_release_date'])

# Utils.to_csv_s3(content=prediction_set, bucket=self.output_bucket, 
#                 key_path=self.TARGET_COL,
#                 filename=f'prediction_set_prelaunch_new_{self.geo_value}.csv')
# logger.info(f'model_kernel.feature_engineer.prediction_set')
# self.prediction_set = prediction_set

In [0]:
# Tag the manual predictions as pre-launch, keep the metadata columns plus
# the listed feature/prediction columns, and order rows by release date.
prediction_set['lifecycle'] = 'prelaunch'
prediction_set = prediction_set[META_FEATURE + [
    'observed_medal_num', 'prequel_featured_count', 'prequel_count', 'season_number', 'derived_genre', 
    'lifecycle', 'prediction']].sort_values(by = ['first_release_date'])

In [0]:
prediction_set[prediction_set['title_series']=='The Iron Claw']

In [0]:
# Re-run scoring on the pre-launch model and refresh the stored predictions.
pre_model.scoring()
prediction_set_pre_launch = pre_model.prediction_set

### 3.3 Format Pre-Launch Predictions and Write to Delphi

In [0]:
# Add Columns needed for Delphi Prediction Tables

# Single-row frame of constant columns for the pre-launch table; `key` is a
# dummy column used to join this row onto every prediction.
data = {
    'model_name': ['pct_viewing_subs_2.0'],
    'table_name': ['pct_viewing_subs_prelaunch'],
    'model_version': ['2.0'],
    'sub_type': ['Max Retail+Wholesale'],
    'sub_plan': ['Platform'],
    'unit': ['percent'],
    'region': [geo_value],
    'days_after_premiere': [28],
    'publish_date': [current_date],
    'key': [1]
}
df_delphi = pd.DataFrame(data=data, columns = data.keys())




In [0]:
df_pred

In [0]:
# Attach the single-row Delphi constants to every prediction by joining on a
# dummy key, then drop the key again.
prediction_set_pre_launch['key'] = 1
df_delphi_prelaunch = df_delphi.merge(prediction_set_pre_launch, on='key').drop(
    columns='key').reset_index(drop=True).copy()

# Add the Delphi / CKG identifiers from the test set. Each column name needs
# its own list entry: adjacent string literals without a comma are silently
# concatenated into one (non-existent) column name.
df_delphi_prelaunch = df_delphi_prelaunch.merge(
    test_data_set[['delphi_id', 'ckg_match_id', 'ckg_series_id',
                   'title_season', 'title_series', 'season_number']],
    on=['title_season', 'title_series', 'season_number'], how='inner')

In [0]:
df_delphi_prelaunch

### 2.3 Format Post-GreenLight Predictions and Write to Delphi

In [0]:
# Add Columns needed for Delphi Prediction Tables

# Single-row frame of constant columns for the post-greenlight table; `key`
# is a dummy column used to join this row onto every prediction.
data = {
    'model_name': ['pct_viewing_subs_2.0'],
    'table_name': ['pct_viewing_subs_postgl'],
    'model_version': ['2.0'],
    'sub_type': ['Max Retail+Wholesale'],
    'sub_plan': ['Platform'],
    'unit': ['percent'],
    'region': [geo_value],
    'days_after_premiere': [28],
    'publish_date': [current_date],
    'key': [1]
}
df_delphi = pd.DataFrame(data=data, columns = data.keys())


# Join the constants onto every prediction via the dummy key, then drop it.
# `columns=` is used because DataFrame.drop no longer takes the axis
# positionally.
prediction_set_post_gl['key'] = 1
df_delphi_postgl = df_delphi.merge(prediction_set_post_gl, on='key').drop(
    columns='key').reset_index(drop=True).copy()

# Add imdb_id and ckg_match_id
df_delphi_postgl = df_delphi_postgl.merge(
    test_data_set[['delphi_id', 'ckg_match_id', 'ckg_series_id', 'imdb_series_id',
                   'title_season', 'title_series', 'season_number']],
    on=['title_season', 'title_series', 'season_number'], how='inner')

In [0]:
# Rename Columns to match Delphi
# Maps this notebook's column names (keys) to the Delphi names (values).
rename_set = {
    'ckg_series_id' : 'title_id',
    'imdb_series_id' : 'imdb_id',
    'title_series' : 'title_name', 
    'derived_genre' :'category',   
    'first_release_date' : 'premiere_date'
}
# Renames in place on df_delphi_postgl.
df_delphi_postgl.rename(columns = rename_set, inplace=True)


In [0]:
# Update dates to datetime object
df_delphi_postgl['premiere_date'] = pd.to_datetime(df_delphi_postgl['premiere_date'])
df_delphi_postgl['publish_date'] = pd.to_datetime(df_delphi_postgl['publish_date'])
# Whole days between publish date and premiere (negative before premiere).
df_delphi_postgl['current_days_from_premiere'] = (df_delphi_postgl['publish_date'] - df_delphi_postgl['premiere_date']).dt.days
# Vectorised day offset instead of building one DateOffset object per row.
df_delphi_postgl['target_date'] = df_delphi_postgl['premiere_date'] + pd.to_timedelta(
    df_delphi_postgl['days_after_premiere'], unit='D')

# Back to string to make compatible with sfk package. The .dt accessor leaves
# missing dates as missing values instead of raising on NaT.
for date_col in ('publish_date', 'target_date', 'premiere_date'):
    df_delphi_postgl[date_col] = df_delphi_postgl[date_col].dt.strftime('%Y-%m-%d')


In [0]:
# Write to Snowflake
# Keep only rows that have a delphi_id. The export call itself is currently
# commented out, so this cell does not write anything.
df_delphi_postgl = df_delphi_postgl[df_delphi_postgl['delphi_id'].notnull()].reset_index(drop=True).copy()
# sfk.export_dataframe_to_table(database=database, schema='delphi', df=df_delphi_postgl, 
#                               table=f'{df_delphi_postgl.loc[0,"table_name"]}_staging', conn=ctx)

In [0]:
# NOTE(review): elsewhere in this notebook post_greenlight_model is imported
# from lib.dev_Post_GreenLight_Model; confirm lib.dev_Pre_Launch_Model really
# exports it as well.
from lib.dev_Pre_Launch_Model import post_greenlight_model

In [0]:
dev_pre_launch_model = 'lib/dev_Pre_Launch_Model.py'
%run $dev_pre_launch_model


In [0]:
# NOTE(review): this binds the name `pre_launch_model` — called as the model
# constructor in an earlier cell — to a path string; presumably the %run below
# defines the callable again. Confirm, or use a distinct variable name.
pre_launch_model = 'lib/Pre_Launch_Model.py'
%run $pre_launch_model

### 3.1 Pull in Data

In [0]:
# create or replace table max_dev.workspace.forecasting_signals_search as (
# select gs.TRACKING_GROUP_ID as imdb_id, date,indexed_volume::float as value, 'search' as metric,geo_name
# ,CONVERT_TIMEZONE('UTC', 'America/Los_Angeles',current_timestamp::TIMESTAMP_NTZ)::timestamp  as create_ts
# from "MAX_PROD"."CKG"."GST_COUNTRY_LEVEL_VOLUME" as gs
# where date::date > '2020-01-01' 
# and gs.TRACKING_GROUP_ID in (select imdb_id from forecasting_premieres)
# and lower(geo_name) in (select lower(country_iso_code) from forecasting_geo_map)
# );