In [1]:
import pandas as pd
import numpy as np
import itertools as it
import os
import io
import logging

import boto3
import sys
import json

from lib.model import ModelMain

# configs
from lib.config import percent_data_process_info
from lib.config import prelaunch_process_info
from lib.config import metadata_process_info
from lib.config import default_params_dict as params_dict
from lib.config import model_name_list
from lib.config import params_tunning_dict

# Reading Data 

Read the input data from multiple sources; each input file is produced by its corresponding query.

### Step 1 of the Prediction Process: Getting Data
Step 1.1: Update the funnel metrics by running the SageMaker notebook 'query_pipeline' in the '/query' folder.

Step 1.2: Run each of the queries in '/day28_prediction/query/' to extract the input CSV files loaded below.

In [2]:
# Name of the S3 bucket that the loading cells below read the input CSV files from.
input_bucket = 'hbo-ingest-datascience-content-dev'

In [3]:
# Load every feature CSV stored under the 'input_percent_view' prefix of the
# input bucket, clean it, and collect the resulting DataFrames in `data_list`
# (in the order the bucket listing returns them).
#
# Each frame is also bound to a notebook-level variable named after its file
# (e.g. 'input_percent_view/metadata_feature.csv' -> `metadata_feature`), as the
# previous exec()-based version did. The frames are now built with plain pandas
# calls instead of exec() on source text assembled from the S3 key, so an
# unusual key name can no longer inject or break code.
logger = logging.getLogger()
logger.info('Loading inputs')
data_list = []

# match_id_platform values removed from every feature table:
#   * the old Mortal Kombat movie, because the trailer percent view matching
#     matches the trailer of the new movie to the old movie
#   * Tom & Jerry, due to an unresolvable data issue
excluded_match_ids = [
    '1-GYGQBcwsaCIW2XgEAAAAL', '0-GYGQBcwsaCIW2XgEAAAAL',
    '1-GYEb9QwLgFF9_ZwEAAAA7', '0-GYEb9QwLgFF9_ZwEAAAA7',
]

s3 = boto3.resource('s3')
bucket = s3.Bucket(input_bucket)
# Iterates through all the objects, doing the pagination for you. Each obj
# is an ObjectSummary, so it doesn't contain the body. You'll need to call
# get to get the whole body.
for obj in bucket.objects.filter(Prefix='input_percent_view'):
    key = obj.key
    if key.endswith('/'):
        # "Folder" placeholder object: it has no file name and nothing to parse.
        continue
    logger.info('Loading csv file {}'.format(key))
    body = obj.get()['Body']
    # '<prefix>/<name>.<ext>' -> '<name>'
    var_name = key.split('.')[0].split('/')[1]
    print('Reading {0} features'.format(var_name))

    # NOTE(review): this NA marker is backslash-backslash-N, exactly what the
    # earlier exec() string evaluated to; confirm the extracts do not use a
    # single-backslash marker instead.
    feature_df = pd.read_csv(body, na_values=[r'\\N'])
    feature_df.columns = feature_df.columns.str.lower()

    # exclude the full null columns
    feature_df = feature_df.loc[:, feature_df.isnull().sum() != feature_df.shape[0]]

    # drop the excluded titles and renumber the remaining rows
    keep_rows = ~feature_df['match_id_platform'].isin(excluded_match_ids)
    feature_df = feature_df.loc[keep_rows, :].reset_index(drop=True)

    # keep the per-file notebook variable and append the feature df
    globals()[var_name] = feature_df
    data_list.append(feature_df)

Reading funnel_metric_feature features
Reading media_cost_postlaunch_feature features
Reading media_cost_prelaunch_feature features
Reading metadata_feature features
Reading prelaunch_trailer_feature features
Reading prelaunch_trailer_feature_before28 features
Reading sub_total_feature features
Reading trailer_feature features
Reading vtp_feature features
Reading wiki_view_feature_before28 features
Reading wiki_view_post_feature features
Reading wiki_view_pre_feature features


In [4]:
# Load the CSV file(s) under 'pct_actives_prediction/pct_actives', clean them,
# and append the resulting DataFrame(s) to `data_list`.
#
# Relies on `s3`, `input_bucket`, `logger` and `data_list` created by the
# earlier cells. Each frame is also bound to a notebook-level variable named
# after its file, as the previous exec()-based version did. The frames are now
# built with plain pandas calls instead of exec() on source text assembled from
# the S3 key.
bucket = s3.Bucket(input_bucket)

# Ids excluded from the table: the old Mortal Kombat movie (the trailer percent
# view matching matches the trailer of the new movie to the old movie) and
# Tom & Jerry (unresolvable data issue).
pct_actives_excluded_ids = [
    '1-GYGQBcwsaCIW2XgEAAAAL', '0-GYGQBcwsaCIW2XgEAAAAL',
    '1-GYEb9QwLgFF9_ZwEAAAA7', '0-GYEb9QwLgFF9_ZwEAAAA7',
]

for obj in bucket.objects.filter(Prefix='pct_actives_prediction/pct_actives'):
    key = obj.key
    if key.endswith('/'):
        # "Folder" placeholder object: it has no file name and nothing to parse.
        continue
    logger.info('Loading csv file {}'.format(key))
    body = obj.get()['Body']
    # '<prefix>/<name>.<ext>' -> '<name>'
    var_name = key.split('.')[0].split('/')[1]
    print('Reading {0} features'.format(var_name))

    # NOTE(review): this NA marker is backslash-backslash-N, exactly what the
    # earlier exec() string evaluated to; confirm the extracts do not use a
    # single-backslash marker instead.
    actives_df = pd.read_csv(body, na_values=[r'\\N'])
    actives_df.columns = actives_df.columns.str.lower()

    # exclude the full null columns
    actives_df = actives_df.loc[:, actives_df.isnull().sum() != actives_df.shape[0]]

    # NOTE(review): the filter runs on 'match_id', but the excluded values carry
    # the '<digit>-' prefix seen on match_id_platform values elsewhere in this
    # notebook. If 'match_id' has no such prefix this filter removes nothing;
    # confirm whether 'match_id_platform' was intended.
    keep_rows = ~actives_df['match_id'].isin(pct_actives_excluded_ids)
    actives_df = actives_df.loc[keep_rows, :].reset_index(drop=True)

    # keep the per-file notebook variable and append the feature df
    globals()[var_name] = actives_df
    data_list.append(actives_df)

Reading pct_actives features


In [5]:
# Build one row per (match_id_platform, days_after_launch) holding the mean
# pct_actives of that pair, taken from the last frame in `data_list`.
#
# Written as a single chain that returns new frames at every step, so no column
# is assigned into a sliced frame (the SettingWithCopyWarning pattern) and no
# frame is mutated in place. The original row index is kept, not reset.
_actives_keys = ['match_id_platform', 'days_after_launch']
active_data = (
    data_list[-1][_actives_keys + ['pct_actives']]
    # mean of pct_actives within each key pair, broadcast back to every row
    .assign(
        pct_actives_values=lambda frame: frame.groupby(_actives_keys)['pct_actives'].transform('mean')
    )
    .loc[:, _actives_keys + ['pct_actives_values']]
    # rows without a title id or a day cannot be pivoted later
    .dropna(subset=_actives_keys)
    .drop_duplicates()
)

In [6]:
# Reshape to one row per match_id_platform with one column per day, then give
# the columns flat names: 'match_id_platform_actives' followed by
# 'day001_percent_actives' ... 'day028_percent_actives'.
# NOTE(review): the names are assigned by position, so this assumes the pivot
# yields exactly 28 day columns in ascending order; confirm that
# days_after_launch runs from 1 to 28.
active_data = (
    active_data
    .pivot(index='match_id_platform',
           columns='days_after_launch',
           values=['pct_actives_values'])
    .reset_index()
)
columns = ['day{:03d}_percent_actives'.format(day_number) for day_number in range(1, 29)]
active_data.columns = ['match_id_platform_actives'] + columns

In [7]:
# Keep only the percent-actives rows whose id also appears in the first frame of
# `data_list` (inner join), drop the duplicate join key, and put the result in
# place of the last entry of `data_list`.
funnel_metric_feature = data_list[0]
active_data = pd.merge(
    funnel_metric_feature[['match_id_platform']],
    active_data,
    how='inner',
    left_on='match_id_platform',
    right_on='match_id_platform_actives',
).drop(columns=['match_id_platform_actives'])

# swap the last entry of data_list for the reshaped frame
data_list[-1] = active_data

In [8]:
# Create the model object that the prediction loop below works with.
logger.info('Setting up the prediction model')

# Settings passed to the model calls further down.
back_consideration_date = 180
percentile_used = 0.8
# One fold per 30 units of back_consideration_date (np.floor returns a float).
nfold = np.floor(back_consideration_date / 30)

cv_func = ModelMain(
    data_list,
    metadata_process_info['label_columns'],
    metadata_process_info['num_columns'],
)

Final title size: 6058, All title size: 6058


# New Title Prediction, Post Launch 

### Cross Validations

In [9]:
# Run the model for every prediction day from -27 to 27 (inclusive).
# For each day this cell:
#   1. picks the feature config, target transformations and model by day range,
#   2. builds X / y through cv_func.get_X_y,
#   3. either tunes parameters ('lgb') or runs cross prediction ('lr'),
#   4. appends the cross-validation output to `existing_title_output`,
#   5. unless cv_func.pred_empty_flag is set, predicts new titles and appends
#      them to `new_title_output`.
# After the loop both accumulated frames are sorted and cut to a fixed column order.
percent_data_process_info['exact_X_pred'] = False
# not read anywhere in this cell
output_flag = True
new_title_output = pd.DataFrame()
existing_title_output = pd.DataFrame()
back_consideration_date = 180

for day in range(-27,28):
    # renew the percent_data_process_info data every time
    # NOTE(review): a repeated `from ... import` rebinds the names to the objects
    # already held by the imported module; it does not reload lib.config. The
    # key assignments in the branches below run before the dict(...) copy is
    # taken, so they write into that shared config dict.
    from lib.config import percent_data_process_info
    from lib.config import prelaunch_process_info
    from lib.config import metadata_process_info

    # determine prelaunch or postlaunch
    if day < 1:
        # day <= 0: prelaunch feature config, 'lr' model, both transformations off
        input_process_info = dict(prelaunch_process_info)
        percent_data_process_info['target_log_transformation'] = False
        percent_data_process_info['log_ratio_transformation'] = False
        input_percentile_used = percentile_used
        model_name = 'lr'
        model_name_list = [model_name]
    elif day<14:
        # days 1..13: metadata feature config, 'lgb' model, both transformations on
        input_process_info = dict(metadata_process_info)
        percent_data_process_info['target_log_transformation'] = True
        percent_data_process_info['log_ratio_transformation'] = True
        input_percentile_used = percentile_used
        model_name = 'lgb'
        model_name_list = [model_name]
    else:
        # days 14..27: metadata feature config, 'lr' model, both transformations off
        input_process_info = dict(metadata_process_info)
        percent_data_process_info['target_log_transformation'] = False
        percent_data_process_info['log_ratio_transformation'] = False
        input_percentile_used = percentile_used
        model_name = 'lr'
        model_name_list = [model_name]

    # just to make the values in the dict back to the initial values
    # (shallow copy: 'max_num_day' is written to the copy only)
    percent_data_process_info = dict(percent_data_process_info)
    percent_data_process_info['max_num_day'] = day
    
    # get x and y
    logger.info('Get X and y for day {}'.format(day))
    cv_func.get_X_y(percent_data_process_info, 
                     input_process_info, 
                     day001_popularity_threshold = input_percentile_used)

    # tune parameter
    if model_name not in  ['lr', 'enet']:
        logger.info('Tune parameter for day {}'.format(day))
        print('Tune parameter for day {}'.format(day))
        cv_func.parameter_tuning(model_name, 
                            params_tunning_dict, 
                            percent_data_process_info,
                            nfold = nfold,
                            back_consideration_date = back_consideration_date)
        
        # NOTE(review): this rebinds the notebook-level `params_dict`, so the
        # value chosen here is also what later iterations (including the 'lr'
        # days >= 14) and re-runs of this cell pass to cross_prediction and
        # predict_new_titles. Confirm that carry-over is intended.
        params_dict = cv_func.min_smape_param['min_smape_original']
        param_stats = cv_func.parameter_tuning_stats
        logger.info('SMAPE for all titles {}'.format(param_stats['min_smape_all']))
        logger.info('SMAPE for the originals {}'.format(param_stats['min_smape_original']))
        print('SMAPE for all titles {}'.format(param_stats['min_smape_all']))
        print('SMAPE for the originals {}'.format(param_stats['min_smape_original']))
    
    else:
        logger.info('Do cross prediction for day {}'.format(day))
        print('Do cross prediction for day {}'.format(day))
        cv_func.cross_prediction(
                         model_name_list, 
                         params_dict, 
                         percent_data_process_info, 
                         nfold = nfold, 
                         back_consideration_date = back_consideration_date)
        
        # mean SMAPE over all rows, and over the rows with program_type == 1
        logger.info('SMAPE for all titles {}'.format(cv_func.output['smape_' + model_name].mean()))
        logger.info('SMAPE for the originals {}'.format(cv_func.output.loc[cv_func.output['program_type']==1,'smape_' + model_name].mean()))
        print('SMAPE for all titles {}'.format(cv_func.output['smape_' + model_name].mean()))
        print('SMAPE for the originals {}'.format(cv_func.output.loc[cv_func.output['program_type']==1,'smape_' + model_name].mean()))
    
    # make prediction
    print('running cvs at day {}'.format(day))
    # no copy is taken here, so adding 'pred_day' below writes to cv_func.output
    cur_existing_title_output = cv_func.output
    # first column whose name contains the model name as a substring
    # NOTE(review): this relies on column order; presumably the prediction
    # column comes before 'smape_<model>' / 'mae_<model>' - verify.
    pred_column = cur_existing_title_output.columns[cur_existing_title_output.columns.str.contains(model_name)][0]
    cur_existing_title_output['pred_day'] = day
    cur_existing_title_output = cur_existing_title_output.rename(columns = {pred_column:'prediction'})
    # strip the model suffix so the per-day frames share the same metric columns
    cur_existing_title_output = cur_existing_title_output.rename(columns = {'smape_lgb':'smape'
                                                                    ,'smape_lr':'smape'
                                                                    ,'smape_enet':'smape'
                                                                    ,'mae_lgb':'mae'
                                                                    ,'mae_lr':'mae'
                                                                    ,'mae_enet':'mae'
                                                                    })
    existing_title_output = pd.concat([existing_title_output, cur_existing_title_output], axis = 0)
            
    # skip the new-title prediction when the model reports pred_empty_flag as True
    if cv_func.pred_empty_flag == True:
        pass
    else:
        logger.info('Making prediction for day {}'.format(day))
        print('Making prediction for day {}'.format(day))
        cv_func.predict_new_titles(model_name_list, 
                                   params_dict, 
                                   percent_data_process_info)
    
        # process the output
        cur_new_title_output = cv_func.new_title_output
        # same substring-based column lookup as for the cross-validation output
        pred_column = cur_new_title_output.columns[cur_new_title_output.columns.str.contains(model_name)][0]
        cur_new_title_output['pred_day'] = day
        cur_new_title_output = cur_new_title_output.rename(columns = {pred_column:'prediction'})

        new_title_output = pd.concat([new_title_output,cur_new_title_output], axis = 0)
        print (new_title_output.head())

          
# final formatting

# new-title predictions: drop 'target', sort by title and day, fix column order
if new_title_output.shape[0]>0:    
    new_title_output = new_title_output.drop(columns = ['target']).sort_values(['title_name','pred_day'])
    new_title_output = new_title_output[['title_name'
                                        ,'match_id'
                                        ,'match_id_platform'
                                        ,'platform_name'
                                        ,'program_type'
                                        ,'pred_day'
                                        ,'prediction']]

# cross-validation results: sort by id and day, fix column order
if existing_title_output.shape[0]>0:           
    existing_title_output = existing_title_output.sort_values(['match_id_platform','pred_day'])
    # platform_name is set to the first character of match_id_platform
    existing_title_output['platform_name'] = existing_title_output['match_id_platform'].apply(lambda x: x[0])
    existing_title_output = existing_title_output[['title_name'
                                                ,'match_id'
                                                ,'match_id_platform'
                                                ,'platform_name'
                                                ,'program_type'
                                                ,'target'
                                                ,'pred_day'
                                                ,'prediction'
                                                ,'smape'
                                                ,'mae'
                                                ,'fold']]

keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
only 993 titles considered after prelaunch filter
the number of days is not large enough to use log ratio transformation
X and y are ready based on the input params
Do cross prediction for day -27


  self.X_base = self.X[self.y!=-100]
  self.X_pred = self.X[((self.y==-100) & (self.X['platform_name']==1))]


SMAPE for all titles 0.6892789385508999
SMAPE for the originals 0.8053692753949397
running cvs at day -27
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
only 992 titles considered after prelaunch filter
the number of days is not large enough to use log ratio transformation
X and y are ready based on the input params
Do cross prediction for day -26


  self.X_base = self.X[self.y!=-100]
  self.X_pred = self.X[((self.y==-100) & (self.X['platform_name']==1))]


SMAPE for all titles 0.7224919786427927
SMAPE for the originals 0.8135550222991741
running cvs at day -26
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
only 994 titles considered after prelaunch filter
the number of days is not large enough to use log ratio transformation
X and y are ready based on the input params
Do cross prediction for day -25


  self.X_base = self.X[self.y!=-100]
  self.X_pred = self.X[((self.y==-100) & (self.X['platform_name']==1))]


SMAPE for all titles 0.720684213780345
SMAPE for the originals 0.8201911438119611
running cvs at day -25
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
only 994 titles considered after prelaunch filter
the number of days is not large enough to use log ratio transformation


  self.X_base = self.X[self.y!=-100]
  self.X_pred = self.X[((self.y==-100) & (self.X['platform_name']==1))]


X and y are ready based on the input params
Do cross prediction for day -24
SMAPE for all titles 0.7196267551566051
SMAPE for the originals 0.818568891346505
running cvs at day -24
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
only 994 titles considered after prelaunch filter
the number of days is not large enough to use log ratio transformation
X and y are ready based on the input params
Do cross prediction for day -23


  self.X_base = self.X[self.y!=-100]
  self.X_pred = self.X[((self.y==-100) & (self.X['platform_name']==1))]


SMAPE for all titles 0.7112178195195015
SMAPE for the originals 0.8206968855479955
running cvs at day -23
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
only 996 titles considered after prelaunch filter
the number of days is not large enough to use log ratio transformation
X and y are ready based on the input params
Do cross prediction for day -22


  self.X_base = self.X[self.y!=-100]
  self.X_pred = self.X[((self.y==-100) & (self.X['platform_name']==1))]


SMAPE for all titles 0.7106892433935055
SMAPE for the originals 0.8272565388587921
running cvs at day -22
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
only 999 titles considered after prelaunch filter
the number of days is not large enough to use log ratio transformation
X and y are ready based on the input params
Do cross prediction for day -21


  self.X_base = self.X[self.y!=-100]
  self.X_pred = self.X[((self.y==-100) & (self.X['platform_name']==1))]


SMAPE for all titles 0.7109344690732016
SMAPE for the originals 0.8285112461700308
running cvs at day -21
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
only 1000 titles considered after prelaunch filter
the number of days is not large enough to use log ratio transformation
X and y are ready based on the input params
Do cross prediction for day -20


  self.X_base = self.X[self.y!=-100]
  self.X_pred = self.X[((self.y==-100) & (self.X['platform_name']==1))]


SMAPE for all titles 0.7062918140860899
SMAPE for the originals 0.8274208224525689
running cvs at day -20
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
only 1000 titles considered after prelaunch filter
the number of days is not large enough to use log ratio transformation
X and y are ready based on the input params
Do cross prediction for day -19


  self.X_base = self.X[self.y!=-100]
  self.X_pred = self.X[((self.y==-100) & (self.X['platform_name']==1))]


SMAPE for all titles 0.7164243590378155
SMAPE for the originals 0.8781070173788124
running cvs at day -19
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
only 999 titles considered after prelaunch filter
the number of days is not large enough to use log ratio transformation
X and y are ready based on the input params
Do cross prediction for day -18


  self.X_base = self.X[self.y!=-100]
  self.X_pred = self.X[((self.y==-100) & (self.X['platform_name']==1))]


SMAPE for all titles 0.7169330312021511
SMAPE for the originals 0.8744296741454818
running cvs at day -18
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
only 1000 titles considered after prelaunch filter
the number of days is not large enough to use log ratio transformation
X and y are ready based on the input params
Do cross prediction for day -17


  self.X_base = self.X[self.y!=-100]
  self.X_pred = self.X[((self.y==-100) & (self.X['platform_name']==1))]


SMAPE for all titles 0.7201957362941823
SMAPE for the originals 0.8689358057171779
running cvs at day -17
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
only 1002 titles considered after prelaunch filter
the number of days is not large enough to use log ratio transformation
X and y are ready based on the input params
Do cross prediction for day -16


  self.X_base = self.X[self.y!=-100]
  self.X_pred = self.X[((self.y==-100) & (self.X['platform_name']==1))]


SMAPE for all titles 0.7206514777265453
SMAPE for the originals 0.8674555497231291
running cvs at day -16
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
only 1000 titles considered after prelaunch filter
the number of days is not large enough to use log ratio transformation
X and y are ready based on the input params
Do cross prediction for day -15


  self.X_base = self.X[self.y!=-100]
  self.X_pred = self.X[((self.y==-100) & (self.X['platform_name']==1))]


SMAPE for all titles 0.721049061684952
SMAPE for the originals 0.8618147402497474
running cvs at day -15
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
only 1003 titles considered after prelaunch filter
the number of days is not large enough to use log ratio transformation
X and y are ready based on the input params
Do cross prediction for day -14


  self.X_base = self.X[self.y!=-100]
  self.X_pred = self.X[((self.y==-100) & (self.X['platform_name']==1))]


SMAPE for all titles 0.7207082729803872
SMAPE for the originals 0.848575820410195
running cvs at day -14
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
only 1004 titles considered after prelaunch filter
the number of days is not large enough to use log ratio transformation
X and y are ready based on the input params
Do cross prediction for day -13


  self.X_base = self.X[self.y!=-100]
  self.X_pred = self.X[((self.y==-100) & (self.X['platform_name']==1))]


SMAPE for all titles 0.7186609435660929
SMAPE for the originals 0.8121537045984324
running cvs at day -13
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
only 1005 titles considered after prelaunch filter
the number of days is not large enough to use log ratio transformation
X and y are ready based on the input params
Do cross prediction for day -12


  self.X_base = self.X[self.y!=-100]
  self.X_pred = self.X[((self.y==-100) & (self.X['platform_name']==1))]


SMAPE for all titles 0.7296510786961585
SMAPE for the originals 0.8758684941519528
running cvs at day -12
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
only 1006 titles considered after prelaunch filter
the number of days is not large enough to use log ratio transformation
X and y are ready based on the input params
Do cross prediction for day -11


  self.X_base = self.X[self.y!=-100]
  self.X_pred = self.X[((self.y==-100) & (self.X['platform_name']==1))]


SMAPE for all titles 0.7311438058103419
SMAPE for the originals 0.8756003381516732
running cvs at day -11
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
only 1005 titles considered after prelaunch filter
the number of days is not large enough to use log ratio transformation
X and y are ready based on the input params
Do cross prediction for day -10


  self.X_base = self.X[self.y!=-100]
  self.X_pred = self.X[((self.y==-100) & (self.X['platform_name']==1))]


SMAPE for all titles 0.730008367014203
SMAPE for the originals 0.8779151396716488
running cvs at day -10
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
only 1006 titles considered after prelaunch filter
the number of days is not large enough to use log ratio transformation
X and y are ready based on the input params
Do cross prediction for day -9


  self.X_base = self.X[self.y!=-100]
  self.X_pred = self.X[((self.y==-100) & (self.X['platform_name']==1))]


SMAPE for all titles 0.7418947232231934
SMAPE for the originals 0.9585026581451902
running cvs at day -9
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
only 1009 titles considered after prelaunch filter
the number of days is not large enough to use log ratio transformation
X and y are ready based on the input params
Do cross prediction for day -8


  self.X_base = self.X[self.y!=-100]
  self.X_pred = self.X[((self.y==-100) & (self.X['platform_name']==1))]


SMAPE for all titles 0.7392090114309577
SMAPE for the originals 0.9555031814412321
running cvs at day -8
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
only 1009 titles considered after prelaunch filter
the number of days is not large enough to use log ratio transformation
X and y are ready based on the input params
Do cross prediction for day -7


  self.X_base = self.X[self.y!=-100]
  self.X_pred = self.X[((self.y==-100) & (self.X['platform_name']==1))]


SMAPE for all titles 0.7123056035898216
SMAPE for the originals 0.9301351005924875
running cvs at day -7
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
only 1008 titles considered after prelaunch filter
the number of days is not large enough to use log ratio transformation
X and y are ready based on the input params
Do cross prediction for day -6


  self.X_base = self.X[self.y!=-100]
  self.X_pred = self.X[((self.y==-100) & (self.X['platform_name']==1))]


SMAPE for all titles 0.7103329818635723
SMAPE for the originals 0.9833651583690267
running cvs at day -6
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
only 1009 titles considered after prelaunch filter
the number of days is not large enough to use log ratio transformation
X and y are ready based on the input params
Do cross prediction for day -5


  self.X_base = self.X[self.y!=-100]
  self.X_pred = self.X[((self.y==-100) & (self.X['platform_name']==1))]


SMAPE for all titles 0.7059065128434913
SMAPE for the originals 0.9785188514428446
running cvs at day -5
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
only 1009 titles considered after prelaunch filter
the number of days is not large enough to use log ratio transformation
X and y are ready based on the input params
Do cross prediction for day -4


  self.X_base = self.X[self.y!=-100]
  self.X_pred = self.X[((self.y==-100) & (self.X['platform_name']==1))]


SMAPE for all titles 0.710932326459334
SMAPE for the originals 1.0222691655414486
running cvs at day -4
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
only 1010 titles considered after prelaunch filter
the number of days is not large enough to use log ratio transformation
X and y are ready based on the input params
Do cross prediction for day -3


  self.X_base = self.X[self.y!=-100]
  self.X_pred = self.X[((self.y==-100) & (self.X['platform_name']==1))]


SMAPE for all titles 0.7104734733580301
SMAPE for the originals 1.003011046980316
running cvs at day -3
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
only 1010 titles considered after prelaunch filter
the number of days is not large enough to use log ratio transformation
X and y are ready based on the input params
Do cross prediction for day -2


  self.X_base = self.X[self.y!=-100]
  self.X_pred = self.X[((self.y==-100) & (self.X['platform_name']==1))]


SMAPE for all titles 0.7098344394002768
SMAPE for the originals 1.0024526182152693
running cvs at day -2
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
only 1010 titles considered after prelaunch filter
the number of days is not large enough to use log ratio transformation
X and y are ready based on the input params
Do cross prediction for day -1


  self.X_base = self.X[self.y!=-100]
  self.X_pred = self.X[((self.y==-100) & (self.X['platform_name']==1))]


SMAPE for all titles 0.7085588187699436
SMAPE for the originals 0.998992023494787
running cvs at day -1
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
only 1012 titles considered after prelaunch filter
the number of days is not large enough to use log ratio transformation
X and y are ready based on the input params
Do cross prediction for day 0


  self.X_base = self.X[self.y!=-100]
  self.X_pred = self.X[((self.y==-100) & (self.X['platform_name']==1))]


SMAPE for all titles 0.7007345368586007
SMAPE for the originals 0.9816108033813208
running cvs at day 0
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
X and y are ready based on the input params
Tune parameter for day 1
parameter combination 1




parameter combination 2




parameter combination 3




parameter combination 4




parameter combination 5




parameter combination 6




parameter combination 7




parameter combination 8




SMAPE for all titles 0.4151634736840162
SMAPE for the originals 0.5103643156060886
running cvs at day 1
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
X and y are ready based on the input params
Tune parameter for day 2
parameter combination 1




parameter combination 2




parameter combination 3




parameter combination 4




parameter combination 5




parameter combination 6




parameter combination 7




parameter combination 8




SMAPE for all titles 0.4075364644239609
SMAPE for the originals 0.46232007755722615
running cvs at day 2
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
X and y are ready based on the input params
Tune parameter for day 3
parameter combination 1




parameter combination 2




parameter combination 3




parameter combination 4




parameter combination 5




parameter combination 6




parameter combination 7




parameter combination 8




SMAPE for all titles 0.3938157992282658
SMAPE for the originals 0.4723289914705869
running cvs at day 3
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
X and y are ready based on the input params
Tune parameter for day 4
parameter combination 1




parameter combination 2




parameter combination 3




parameter combination 4




parameter combination 5




parameter combination 6




parameter combination 7




parameter combination 8




SMAPE for all titles 0.3804796080933705
SMAPE for the originals 0.4639024385618051
running cvs at day 4
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
X and y are ready based on the input params
Tune parameter for day 5
parameter combination 1




parameter combination 2




parameter combination 3




parameter combination 4




parameter combination 5




parameter combination 6




parameter combination 7




parameter combination 8




SMAPE for all titles 0.3609728561559843
SMAPE for the originals 0.4453458588800561
running cvs at day 5
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
X and y are ready based on the input params
Tune parameter for day 6
parameter combination 1




parameter combination 2




parameter combination 3




parameter combination 4




parameter combination 5




parameter combination 6




parameter combination 7




parameter combination 8




SMAPE for all titles 0.343647421674003
SMAPE for the originals 0.44280944724597415
running cvs at day 6
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
X and y are ready based on the input params
Tune parameter for day 7
parameter combination 1




parameter combination 2




parameter combination 3




parameter combination 4




parameter combination 5




parameter combination 6




parameter combination 7




parameter combination 8




SMAPE for all titles 0.35777074572888784
SMAPE for the originals 0.46683887197272705
running cvs at day 7
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
X and y are ready based on the input params
Tune parameter for day 8
parameter combination 1




parameter combination 2




parameter combination 3




parameter combination 4




parameter combination 5




parameter combination 6




parameter combination 7




parameter combination 8




SMAPE for all titles 0.3817567795051302
SMAPE for the originals 0.470195059669256
running cvs at day 8
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
X and y are ready based on the input params
Tune parameter for day 9
parameter combination 1




parameter combination 2




parameter combination 3




parameter combination 4




parameter combination 5




parameter combination 6




parameter combination 7




parameter combination 8




SMAPE for all titles 0.3856724840121662
SMAPE for the originals 0.505154403735527
running cvs at day 9
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
X and y are ready based on the input params
Tune parameter for day 10
parameter combination 1




parameter combination 2




parameter combination 3




parameter combination 4




parameter combination 5




parameter combination 6




parameter combination 7




parameter combination 8




SMAPE for all titles 0.38901328525859225
SMAPE for the originals 0.4967494900422976
running cvs at day 10
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
X and y are ready based on the input params
Tune parameter for day 11
parameter combination 1




parameter combination 2




parameter combination 3




parameter combination 4




parameter combination 5




parameter combination 6




parameter combination 7




parameter combination 8




SMAPE for all titles 0.3831687831690932
SMAPE for the originals 0.5006495813370867
running cvs at day 11
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
X and y are ready based on the input params
Tune parameter for day 12
parameter combination 1




parameter combination 2




parameter combination 3




parameter combination 4




parameter combination 5




parameter combination 6




parameter combination 7




parameter combination 8




SMAPE for all titles 0.3764173633955396
SMAPE for the originals 0.46258376613064406
running cvs at day 12
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
X and y are ready based on the input params
Tune parameter for day 13
parameter combination 1




parameter combination 2




parameter combination 3




parameter combination 4




parameter combination 5




parameter combination 6




parameter combination 7




parameter combination 8




SMAPE for all titles 0.37037184384905447
SMAPE for the originals 0.4572588119833669
running cvs at day 13
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
X and y are ready based on the input params
Do cross prediction for day 14
SMAPE for all titles 0.23377340765298835
SMAPE for the originals 0.34909102694102534
running cvs at day 14
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
X and y are ready based on the input params
Do cross prediction for day 15
SMAPE for all titles 0.2361763450753097
SMAPE for the originals 0.3705051122289715
running cvs at day 15
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1314 titles considered after popularity filter
X and y are ready based on the input params
Do cross prediction for day 16
SMAPE for all titles 0.2333421914967058
SMAPE for the originals 0.3675488516694943
ru

# Write CSVs to S3

### Step 3: Write the prediction results to S3

In [13]:
# Name of the S3 bucket passed to to_s3() as the destination for the prediction CSVs.
output_bucket = 'hbo-outbound-datascience-content-dev'

In [14]:
def to_s3(filename, output_bucket, content, client=None):
    """Upload ``content`` to S3 as object ``filename`` in ``output_bucket``.

    Parameters
    ----------
    filename : str
        Object key to write.
    output_bucket : str
        Name of the destination bucket.
    content
        Object body, forwarded to ``put_object`` unchanged (the cells in
        this notebook pass CSV text).
    client : optional
        Any object exposing ``put_object(Bucket=..., Key=..., Body=...)``.
        When omitted, a new boto3 S3 client is created for this call, which
        is the original behaviour. Passing one lets several uploads share a
        client and lets the function be exercised without AWS access.

    Returns
    -------
    None
    """
    if client is None:
        client = boto3.client('s3')
    client.put_object(Bucket=output_bucket, Key=filename, Body=content)

In [15]:
# Serialise the new-title predictions to CSV in memory and upload them to S3.
status_message = 'Writing new title predictions over time to S3 as an csv file'
logger.info(status_message)
print(status_message)

filename = 'output_percent_actives/new_title_prediction.csv'

# Render the frame into an in-memory text buffer (no index column), then
# hand the resulting string to the uploader.
csv_buffer = io.StringIO()
new_title_output.to_csv(csv_buffer, index=False)
content = csv_buffer.getvalue()
to_s3(filename, output_bucket, content)

Writing new title predictions over time to S3 as an csv file


In [16]:
# Serialise the existing-title predictions to CSV in memory and upload them to S3.
status_message = 'Writing existing title predictions over time to S3 as an csv file'
logger.info(status_message)
print(status_message)

filename = 'output_percent_actives/existing_title_prediction.csv'

# Render the frame into an in-memory text buffer (no index column), then
# hand the resulting string to the uploader.
csv_buffer = io.StringIO()
existing_title_output.to_csv(csv_buffer, index=False)
content = csv_buffer.getvalue()
to_s3(filename, output_bucket, content)

Writing existing title predictions over time to S3 as an csv file


# Results

In [14]:
# df = pd.read_csv('df.csv')
# df[df['match_id_platform'] == '1-GYUjdLgBiJp5otAEAAAAJ']

In [28]:
# import os
# import sys
# import logging
# import boto3
# import itertools as it
# import io
# # from utils import *
# import snowflake.connector

In [29]:
# from matplotlib.pyplot import figure
# import matplotlib.pyplot as plt
# import datetime
# from datetime import timedelta
# import scipy.stats as st

In [30]:
# pd.set_option('display.max_rows', 1000)
# pd.set_option('display.max_columns', None)
# pd.options.mode.chained_assignment = None  # default='warn'

In [32]:
# import numpy as np
# import math
# import pandas as pd
# import re
# from datetime import date, datetime
# import json
# from abc import ABCMeta, abstractmethod
# import boto3

# class Credentials(metaclass=ABCMeta):
#     pass
    
    
# class SSMPSCredentials(Credentials):
#     def __init__(self, secretid: str):
#         self._secretid = secretid
#         self._secrets = {}
        
#     def get_keys(self):
#         """
#         credential fetching 
#         """
#         _aws_sm_args = {'service_name': 'secretsmanager', 'region_name': 'us-east-1'}
#         secrets_client = boto3.client(**_aws_sm_args)
#         get_secret_value_response = secrets_client.get_secret_value(SecretId=self._secretid)
#         return get_secret_value_response
    
    
# class BaseConnector(metaclass=ABCMeta):
#     @abstractmethod
#     def connect(self):
#         raise NotImplementedError

In [33]:
# class SnowflakeConnector(BaseConnector):
#     def __init__(self, credentials: Credentials):
#         keys = credentials.get_keys()
#         self._secrets = json.loads(keys.get('SecretString', "{}"))

#     def connect(self, dbname: str, schema: str = 'DEFAULT'):
#         ctx = snowflake.connector.connect(
#             user=self._secrets['login_name'],
#             password=self._secrets['login_password'],
#             account=self._secrets['account'],
#             warehouse=self._secrets['warehouse'],
#             database=dbname,
#             schema=schema
#         )

#         return ctx
    
# ## Credentials
# SF_CREDS = 'datascience-max-dev-sagemaker-notebooks'

# ## Snowflake connection 
# conn=SnowflakeConnector(SSMPSCredentials(SF_CREDS))
# ctx=conn.connect("MAX_PROD","DATASCIENCE_STAGE")

In [34]:
# def run_query(query):
#     cursor = ctx.cursor()
#     cursor.execute(query)
#     df = pd.DataFrame(cursor.fetchall(), columns = [desc[0] for desc in cursor.description])
#     df.columns= df.columns.str.lower()
#     return df

In [18]:
# existing_title_output.to_csv('existing_title_output.csv')
# new_title_output.to_csv('new_title_output.csv')

In [13]:
# existing_title_output = pd.read_csv('existing_title_output.csv')
# new_title_output = pd.read_csv('new_title_output.csv')

In [25]:
# metadata_feature = data_list[3]

In [35]:
# popcorn_titles = run_query('''
# SELECT * FROM MAX_PROD.CATALOG.POPCORN_TITLES
# ''')