In [1]:
import pandas as pd
import numpy as np
import itertools as it
import os
import io
import logging

import boto3
import sys
import json

from lib.model import ModelMain

# configs
from lib.config import percent_data_process_info
from lib.config import prelaunch_process_info
from lib.config import metadata_process_info
from lib.config import default_params_dict as params_dict
from lib.config import model_name_list
from lib.config import params_tunning_dict

In [None]:
# Suppress every Python warning for the rest of the kernel session. This keeps
# the long per-day loop output readable, but it also hides pandas/numpy
# deprecation and chained-assignment warnings raised by later cells.
import warnings
warnings.filterwarnings('ignore')

# Reading Data 

Read the input data from multiple sources, each produced by its corresponding query.

### Step 1 of the Prediction Process: Getting Data
Step 1.1: Update the funnel metrics by running the SageMaker notebook 'query_pipeline' under the '/query' folder.

Step 1.2: Run each of the queries in '/day28_prediction/query/' to extract the input CSV files loaded below.

In [2]:
# S3 bucket that holds the input feature CSVs read by the loading cells below.
input_bucket = 'hbo-ingest-datascience-content-dev'

In [3]:
logger = logging.getLogger()
logger.info('Loading inputs')
data_list = []

# Rows removed from every feature table:
#  * the old Mortal Kombat movie, because the trailer percent-view matching
#    attaches the trailer of the new movie to the old one;
#  * Tom & Jerry, because of an unresolvable data issue.
EXCLUDED_MATCH_ID_PLATFORMS = [
    '1-GYGQBcwsaCIW2XgEAAAAL',
    '0-GYGQBcwsaCIW2XgEAAAAL',
    '1-GYEb9QwLgFF9_ZwEAAAA7',
    '0-GYEb9QwLgFF9_ZwEAAAA7',
]

s3 = boto3.resource('s3')
bucket = s3.Bucket(input_bucket)
# bucket.objects.filter paginates for us. Each item is an ObjectSummary without
# the payload, so obj.get() is needed to obtain the streaming body.
for obj in bucket.objects.filter(Prefix='input_percent_view'):
    key = obj.key

    # The feature name is the file stem of "<folder>/<name>.csv". Keys that do
    # not have that shape (e.g. the "folder/" placeholder object) cannot be
    # turned into a feature table, so skip them instead of crashing.
    key_parts = key.split('.')[0].split('/')
    if len(key_parts) < 2 or not key_parts[1]:
        logger.warning('Skipping S3 key without a feature file name: {}'.format(key))
        continue
    var_name = key_parts[1]

    logger.info('Loading csv file {}'.format(key))
    body = obj.get()['Body']
    print('Reading {0} features'.format(var_name))

    # NOTE(review): this token is backslash-backslash-N, exactly what the
    # previous exec()-based code evaluated to. If the exports mark nulls with a
    # single-backslash "\N", this entry never matches -- confirm against a file.
    feature_df = pd.read_csv(body, na_values=[r'\\N'])
    feature_df.columns = feature_df.columns.str.lower()

    # exclude the columns that are null in every row
    feature_df = feature_df.dropna(axis=1, how='all')

    # drop the excluded titles and renumber the rows
    keep_rows = ~feature_df['match_id_platform'].isin(EXCLUDED_MATCH_ID_PLATFORMS)
    feature_df = feature_df.loc[keep_rows, :].reset_index(drop=True)

    # Keep publishing each table under its per-file variable name (as the old
    # exec() calls did) so existing references by name continue to work, but
    # without executing code assembled from S3 key names.
    globals()[var_name] = feature_df

    # append the feature df
    data_list.append(feature_df)

Reading funnel_metric_feature features
Reading media_cost_postlaunch_feature features
Reading media_cost_prelaunch_feature features
Reading metadata_feature features
Reading prelaunch_trailer_feature features
Reading prelaunch_trailer_feature_before28 features
Reading sub_total_feature features
Reading trailer_feature features
Reading vtp_feature features
Reading wiki_view_feature_before28 features
Reading wiki_view_post_feature features
Reading wiki_view_pre_feature features


In [4]:
# Rows removed from the table: the old Mortal Kombat movie (its trailer
# percent-view matching attaches the new movie's trailer to the old movie) and
# Tom & Jerry (unresolvable data issue).
# NOTE(review): these ids carry a "0-"/"1-" prefix like the match_id_platform
# values filtered in the previous cell, while this cell filters on 'match_id'.
# If match_id has no such prefix this filter removes nothing -- confirm.
EXCLUDED_MATCH_IDS = [
    '1-GYGQBcwsaCIW2XgEAAAAL',
    '0-GYGQBcwsaCIW2XgEAAAAL',
    '1-GYEb9QwLgFF9_ZwEAAAA7',
    '0-GYEb9QwLgFF9_ZwEAAAA7',
]

bucket = s3.Bucket(input_bucket)
for obj in bucket.objects.filter(Prefix='pct_actives_prediction/pct_actives_metric_values'):
    key = obj.key

    # The feature name is the file stem of "<folder>/<name>.csv"; skip keys
    # that do not have that shape instead of crashing on them.
    key_parts = key.split('.')[0].split('/')
    if len(key_parts) < 2 or not key_parts[1]:
        logger.warning('Skipping S3 key without a feature file name: {}'.format(key))
        continue
    var_name = key_parts[1]

    logger.info('Loading csv file {}'.format(key))
    body = obj.get()['Body']
    print('Reading {0} features'.format(var_name))

    # NOTE(review): this token is backslash-backslash-N, exactly what the
    # previous exec()-based code evaluated to. If the exports mark nulls with a
    # single-backslash "\N", this entry never matches -- confirm against a file.
    feature_df = pd.read_csv(body, na_values=[r'\\N'])
    feature_df.columns = feature_df.columns.str.lower()

    # exclude the columns that are null in every row
    feature_df = feature_df.dropna(axis=1, how='all')

    # drop the excluded titles and renumber the rows
    keep_rows = ~feature_df['match_id'].isin(EXCLUDED_MATCH_IDS)
    feature_df = feature_df.loc[keep_rows, :].reset_index(drop=True)

    # Keep publishing the table under its per-file variable name (as the old
    # exec() calls did), without executing code assembled from S3 key names.
    globals()[var_name] = feature_df

    # append the feature df
    data_list.append(feature_df)

Reading pct_actives_metric_values features


In [5]:
# Pick the metadata table by its position in data_list. In the recorded run the
# fourth file read was metadata_feature; this relies on the order in which S3
# listed the input files, so re-check the index if input files are added or renamed.
metadata_feature = data_list[3]

In [6]:
# Take the daily percent-actives measurements (the last table loaded), attach
# the platform-level id from the metadata table through an inner join on
# match_id, then discard match_id, which was only needed as the join key.
active_data = (
    data_list[-1][['match_id', 'days_on_hbo_max', 'pct_actives']]
    .merge(metadata_feature[['match_id', 'match_id_platform']], on='match_id')
    .drop(columns=['match_id'])
)

In [7]:
# Collapse to one value per (title-platform, day): every row receives the mean
# pct_actives of its group, rows without a complete key are discarded, and the
# now-identical rows of each group are reduced to a single row.
active_key_columns = ['match_id_platform', 'days_on_hbo_max']
active_data = (
    active_data
    .assign(pct_actives_values=active_data.groupby(active_key_columns)['pct_actives'].transform('mean'))
    [active_key_columns + ['pct_actives_values']]
    .dropna(subset=active_key_columns)
    .drop_duplicates()
)

In [8]:
# Reshape to one row per title-platform with one column per day on HBO Max.
active_data = active_data.pivot(
    index='match_id_platform',
    columns='days_on_hbo_max',
    values=['pct_actives_values'],
).reset_index()

# Flat column names day001_percent_actives ... day028_percent_actives. They are
# assigned by position, so the pivot is expected to yield exactly 28 day columns.
columns = ['day{:03d}_percent_actives'.format(day_number) for day_number in range(1, 29)]
active_data.columns = ['match_id_platform'] + columns

In [9]:
# Swap the raw percent-actives table (last entry) for its pivoted version.
data_list[-1] = active_data

In [10]:
# Create the model object that the per-day loop below drives.
logger.info('Setting up the prediction model')
# Passed to get_X_y as day001_popularity_threshold in the loop below.
percentile_used = 0.8
# Passed to the tuning / cross-prediction calls as back_consideration_date.
back_consideration_date = 180
# One fold per 30 units of back_consideration_date.
# NOTE(review): np.floor returns a float (6.0 here), not an int -- confirm that
# ModelMain's nfold handling accepts a float.
nfold = np.floor(back_consideration_date/30)
cv_func = ModelMain(data_list, metadata_process_info['label_columns'], metadata_process_info['num_columns'])

Final title size: 8052, All title size: 8052


# New Title Prediction, Post Launch 

### Cross Validations

In [None]:
# Bind the name to the module-level config dict once. Python caches imported
# modules, so repeating this import inside the loop (as before) returned the
# very same object every time and reset nothing; a single import here does the
# same job and also keeps the cell correct when it is re-run after the name
# was rebound to a per-day copy below.
from lib.config import percent_data_process_info

percent_data_process_info['exact_X_pred'] = False
output_flag = True
back_consideration_date = 180

# Shared base config. Every day works on its own shallow copy so the per-day
# flags are no longer written into the shared dict (nested values, if any, are
# still shared between the copies).
shared_percent_info = percent_data_process_info

# Metric columns carry the model name as a suffix; the outputs use generic names.
METRIC_RENAMES = {'smape_lgb': 'smape'
                  ,'smape_lr': 'smape'
                  ,'smape_enet': 'smape'
                  ,'mae_lgb': 'mae'
                  ,'mae_lr': 'mae'
                  ,'mae_enet': 'mae'
                  }


def _log_and_print(message):
    """Send a progress message to both the logger and the cell output."""
    logger.info(message)
    print(message)


def _stage_settings(day):
    """Return (feature config, use log transformations, model name) for a prediction day."""
    if day < 1:
        # prelaunch: linear model on the prelaunch features
        return prelaunch_process_info, False, 'lr'
    if day < 14:
        # first two weeks after launch: LightGBM with log transformations
        return metadata_process_info, True, 'lgb'
    # day 14 onwards: linear model again, no log transformations
    return metadata_process_info, False, 'lr'


# Per-day results are collected here and concatenated once after the loop
# instead of re-copying a growing DataFrame on every iteration.
existing_title_frames = []
new_title_frames = []

for day in range(-27,28):
    stage_process_info, use_log_transformation, model_name = _stage_settings(day)
    input_process_info = dict(stage_process_info)
    input_percentile_used = percentile_used
    model_name_list = [model_name]

    # fresh per-day copy of the base config with this day's settings
    percent_data_process_info = dict(shared_percent_info)
    percent_data_process_info['target_log_transformation'] = use_log_transformation
    percent_data_process_info['log_ratio_transformation'] = use_log_transformation
    percent_data_process_info['max_num_day'] = day

    # get x and y
    logger.info('Get X and y for day {}'.format(day))
    cv_func.get_X_y(percent_data_process_info,
                    input_process_info,
                    day001_popularity_threshold = input_percentile_used)

    if model_name not in ['lr', 'enet']:
        # tune parameter
        _log_and_print('Tune parameter for day {}'.format(day))
        cv_func.parameter_tuning(model_name,
                                 params_tunning_dict,
                                 percent_data_process_info,
                                 nfold = nfold,
                                 back_consideration_date = back_consideration_date)

        # NOTE(review): params_dict is rebound here and is not restored, so the
        # tuned parameters are also what later (non-tuned) days pass to
        # cross_prediction / predict_new_titles. Kept as-is; confirm it is intended.
        params_dict = cv_func.min_smape_param['min_smape_original']
        param_stats = cv_func.parameter_tuning_stats
        smape_all_titles = param_stats['min_smape_all']
        smape_originals = param_stats['min_smape_original']
    else:
        _log_and_print('Do cross prediction for day {}'.format(day))
        cv_func.cross_prediction(model_name_list,
                                 params_dict,
                                 percent_data_process_info,
                                 nfold = nfold,
                                 back_consideration_date = back_consideration_date)

        smape_column = 'smape_' + model_name
        smape_all_titles = cv_func.output[smape_column].mean()
        smape_originals = cv_func.output.loc[cv_func.output['program_type']==1, smape_column].mean()

    _log_and_print('SMAPE for all titles {}'.format(smape_all_titles))
    _log_and_print('SMAPE for the originals {}'.format(smape_originals))

    # collect the cross-validation output for this day
    print('running cvs at day {}'.format(day))
    cur_existing_title_output = cv_func.output
    # the first column whose name contains the model name is taken as the prediction
    pred_column = cur_existing_title_output.columns[cur_existing_title_output.columns.str.contains(model_name)][0]
    # as before, pred_day is written onto cv_func.output itself
    cur_existing_title_output['pred_day'] = day
    # pred_column is listed last so it wins if it is also one of the metric names
    cur_existing_title_output = cur_existing_title_output.rename(
        columns = {**METRIC_RENAMES, pred_column: 'prediction'})
    existing_title_frames.append(cur_existing_title_output)

    # make prediction for the new titles, when there are any to predict
    if not cv_func.pred_empty_flag:
        _log_and_print('Making prediction for day {}'.format(day))
        cv_func.predict_new_titles(model_name_list,
                                   params_dict,
                                   percent_data_process_info)

        # process the output
        cur_new_title_output = cv_func.new_title_output
        pred_column = cur_new_title_output.columns[cur_new_title_output.columns.str.contains(model_name)][0]
        cur_new_title_output['pred_day'] = day
        cur_new_title_output = cur_new_title_output.rename(columns = {pred_column:'prediction'})
        new_title_frames.append(cur_new_title_output)

new_title_output = pd.concat(new_title_frames, axis = 0) if new_title_frames else pd.DataFrame()
existing_title_output = pd.concat(existing_title_frames, axis = 0) if existing_title_frames else pd.DataFrame()

# final formatting

if new_title_output.shape[0]>0:
    # selecting the output columns already leaves 'target' out, so no separate drop is needed
    new_title_output = new_title_output.sort_values(['title_name','pred_day'])
    new_title_output = new_title_output[['title_name'
                                        ,'match_id'
                                        ,'match_id_platform'
                                        ,'platform_name'
                                        ,'program_type'
                                        ,'pred_day'
                                        ,'prediction']]

if existing_title_output.shape[0]>0:
    existing_title_output = existing_title_output.sort_values(['match_id_platform','pred_day'])
    # platform_name is the first character of match_id_platform
    existing_title_output['platform_name'] = existing_title_output['match_id_platform'].str[0]
    existing_title_output = existing_title_output[['title_name'
                                                ,'match_id'
                                                ,'match_id_platform'
                                                ,'platform_name'
                                                ,'program_type'
                                                ,'target'
                                                ,'pred_day'
                                                ,'prediction'
                                                ,'smape'
                                                ,'mae'
                                                ,'fold']]

keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1909 titles considered after popularity filter
only 1416 titles considered after prelaunch filter
the number of days is not large enough to use log ratio transformation
X and y are ready based on the input params
Do cross prediction for day -27
SMAPE for all titles 0.6229156161907226
SMAPE for the originals 0.6865927899132753
running cvs at day -27
Making prediction for day -27
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1909 titles considered after popularity filter
only 1416 titles considered after prelaunch filter
the number of days is not large enough to use log ratio transformation
X and y are ready based on the input params
Do cross prediction for day -26
SMAPE for all titles 0.6431574958552408
SMAPE for the originals 0.686821528926303
running cvs at day -26
Making prediction for day -26
keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1909 tit

# Write csvs to S3

### Step 3: Write the prediction result to S3

In [None]:
# S3 bucket that receives the prediction CSVs written by the cells below.
output_bucket = 'hbo-outbound-datascience-content-dev'

In [None]:
def to_s3(filename, output_bucket, content):
    """Upload ``content`` to S3 as the object ``filename`` in ``output_bucket``.

    filename: key of the object to write.
    output_bucket: name of the destination bucket.
    content: body of the object, passed to ``put_object`` unchanged.
    """
    boto3.client('s3').put_object(Body=content, Key=filename, Bucket=output_bucket)

In [None]:
# Announce the upload on both the logger and the cell output.
for emit in (logger.info, print):
    emit('Writing new title predictions over time to S3 as an csv file')

# to_csv without a target returns the CSV text directly, so no buffer is needed.
filename = 'output_percent_actives/new_title_prediction.csv'
content = new_title_output.to_csv(index=False)

to_s3(filename, output_bucket, content)

In [None]:
# Announce the upload on both the logger and the cell output.
for emit in (logger.info, print):
    emit('Writing existing title predictions over time to S3 as an csv file')

# to_csv without a target returns the CSV text directly, so no buffer is needed.
filename = 'output_percent_actives/existing_title_prediction.csv'
content = existing_title_output.to_csv(index=False)

to_s3(filename, output_bucket, content)