In [None]:
import pandas as pd
import numpy as np
import itertools as it
import os
import io
import logging

import boto3
import sys

from lib.model import ModelMain

# configs
from lib.config import percent_data_process_info
from lib.config import prelaunch_process_info
from lib.config import metadata_process_info
from lib.config import default_params_dict as params_dict
from lib.config import model_name_list
from lib.config import params_tuning_dict

In [None]:
!pip freeze

# Reading Data 

Reading data from multiple sources, with the corresponding query

### Step 1 of the Prediction Process: Getting Data
Step 1.1: Update the funnel metrics by running the SageMaker notebook 'query_pipeline' in the '/query' folder.

Step 1.2: Run each of the queries in '/day28_prediction/query/' to extract the input CSV files loaded below.

In [None]:
logger = logging.getLogger()
logger.info('Loading inputs')
data_list = []

# NOTE(review): `input_bucket` is not defined in the cells visible here;
# presumably it is injected as a run parameter -- confirm before running.
s3 = boto3.resource('s3')
bucket = s3.Bucket(input_bucket)

# `bucket.objects.filter` handles pagination. Each item is an ObjectSummary
# without the payload, so `get()` is required to fetch the body.
for obj in bucket.objects.filter(Prefix='input_percent_view'):
    key = obj.key

    # The feature name is the second path component with everything after the
    # first '.' removed, e.g. 'input_percent_view/foo.csv' -> 'foo'.
    key_parts = key.split('.')[0].split('/')
    if len(key_parts) < 2 or not key_parts[1]:
        # Folder placeholders ('input_percent_view/') and keys without a
        # sub-path carry no feature name; they used to crash the cell.
        logger.warning('Skipping S3 object without a feature name: {}'.format(key))
        continue
    var_name = key_parts[1]

    logger.info('Loading csv file {}'.format(key))
    body = obj.get()['Body']
    print('Reading {0} features'.format(var_name))

    # Same NA marker the previous exec-built statement evaluated to: two
    # backslashes followed by 'N'.
    # NOTE(review): if the exports use a single-backslash '\N' marker this
    # never matches -- confirm against the query output.
    frame = pd.read_csv(body, na_values=[r'\\N'])
    frame.columns = frame.columns.str.lower()

    # exclude the columns that are null in every row
    frame = frame.loc[:, frame.isnull().sum() != frame.shape[0]]

    # Kept for backward compatibility: the cell used to bind every frame to a
    # notebook-level variable named after its file. Done without `exec`, so
    # the S3 key is never interpreted as Python source.
    globals()[var_name] = frame

    # append the feature df
    data_list.append(frame)

In [None]:
# Create the model driver from the loaded feature frames.
logger.info('Setting up the prediction model')

# Tunables used by the prediction loop below: `nfold` is handed to
# `parameter_tuning`, and `percentile_used` is passed to `get_X_y` as
# `day001_popularity_threshold` for days >= 1.
nfold = 10
percentile_used = 0.8

cv_func = ModelMain(
    data_list,
    metadata_process_info['label_columns'],
    metadata_process_info['num_columns'],
)
# Optional filter (disabled): restrict the model data to movies only.
#cv_func.df = cv_func.df.loc[cv_func.df['content_category']=='movies',:]

# New Title Prediction, Post Launch 

### Step 2: Make Prediction
Note: Because this is a post-launch prediction, only titles with partial percent-view and view-through-portion data will be predicted.

In [None]:
# Get the prediction trajectory over the length of available data: for every
# day offset rebuild X/y, tune the model, predict, and collect the outputs.

model_name = 'lgb'
model_name_list = [model_name]
percent_data_process_info['exact_X_pred'] = False


def _log_and_print(message):
    """Send a progress message to both the logger and stdout."""
    logger.info(message)
    print(message)


def _label_predictions(frame, day, model_name):
    """Return a copy of ``frame`` tagged with the prediction day.

    Adds a ``pred_day`` column set to ``day`` and renames the first column
    whose name contains ``model_name`` to ``percent_view_pred``. The input
    frame is left untouched, so the model object's own frames are not mutated.

    Raises IndexError when no column name contains ``model_name``.
    """
    # First match wins. The existing-title output also appears to carry metric
    # columns such as 'smape_lgb' (they are selected in the final formatting),
    # so this relies on the prediction column coming first -- TODO confirm.
    pred_column = frame.columns[frame.columns.str.contains(model_name, regex=False)][0]
    return frame.assign(pred_day=day).rename(columns={pred_column: 'percent_view_pred'})


# Per-day results are collected here and concatenated once after the loop.
new_title_frames = []
existing_title_frames = []

for day in range(-27, 27):
    # determine prelaunch or postlaunch
    if day < 1:
        input_process_info = dict(prelaunch_process_info)
        input_percentile_used = 1
    else:
        input_process_info = dict(metadata_process_info)
        input_percentile_used = percentile_used

    # Work on a fresh shallow copy every day so the imported config dict keeps
    # its initial top-level values; no re-import is needed because the
    # imported name is never rebound. Nested values are still shared.
    day_percent_info = dict(percent_data_process_info)
    day_percent_info['max_num_day'] = day

    # get x and y
    logger.info('Get X and y for day {}'.format(day))
    cv_func.get_X_y(day_percent_info,
                    input_process_info,
                    day001_popularity_threshold=input_percentile_used)

    if cv_func.pred_empty_flag:
        print('no title needs to be predicted at day {}'.format(day))
        continue

    # tune parameter
    _log_and_print('Tune parameter for day {}'.format(day))
    cv_func.parameter_tuning(model_name,
                             params_tuning_dict,
                             day_percent_info,
                             nfold=nfold)

    tuned_params = cv_func.min_smape_param['min_smape_original']
    param_stats = cv_func.parameter_tuning_stats
    _log_and_print('SMAPE for all titles {}'.format(param_stats['min_smape_all']))
    _log_and_print('SMAPE for the originals {}'.format(param_stats['min_smape_original']))

    # make prediction
    _log_and_print('Making prediction for day {}'.format(day))
    cv_func.predict_new_titles(model_name_list,
                               tuned_params,
                               day_percent_info)

    # process the new-title and existing-title outputs
    new_title_frames.append(_label_predictions(cv_func.new_title_output, day, model_name))
    existing_title_frames.append(_label_predictions(cv_func.output, day, model_name))

# Single concatenation avoids repeatedly copying the growing frame; an empty
# DataFrame is kept when no day produced predictions.
new_title_output = pd.concat(new_title_frames, axis=0) if new_title_frames else pd.DataFrame()
existing_title_output = pd.concat(existing_title_frames, axis=0) if existing_title_frames else pd.DataFrame()
          
# final formatting: order the rows and keep only the published columns

new_title_columns = [
    'title_name',
    'match_id',
    'match_id_platform',
    'platform_name',
    'program_type',
    'pred_day',
    'percent_view_pred',
]

existing_title_columns = [
    'title_name',
    'match_id',
    'match_id_platform',
    'platform_name',
    'program_type',
    'target',
    'pred_day',
    'percent_view_pred',
    'smape_lgb',
    'mae_lgb',
    'fold',
]

if len(new_title_output) > 0:
    new_title_sorted = new_title_output.drop(columns=['target'])
    new_title_sorted = new_title_sorted.sort_values(['title_name', 'pred_day'])
    new_title_output = new_title_sorted[new_title_columns]

if len(existing_title_output) > 0:
    existing_title_sorted = existing_title_output.sort_values(['match_id_platform', 'pred_day'])
    # platform_name is the first element of match_id_platform
    # (the first character when the value is a string).
    existing_title_sorted = existing_title_sorted.assign(
        platform_name=existing_title_sorted['match_id_platform'].apply(
            lambda platform_key: platform_key[0]
        )
    )
    existing_title_output = existing_title_sorted[existing_title_columns]

# Write csvs to S3

### Step 3: Write the prediction result to S3

In [None]:
def to_s3(filename, output_bucket, content):
    """Upload ``content`` to S3.

    Parameters
    ----------
    filename : object key to write inside the bucket.
    output_bucket : name of the destination bucket.
    content : payload passed as the object body.
    """
    s3_client = boto3.client('s3')
    s3_client.put_object(Body=content, Key=filename, Bucket=output_bucket)

In [None]:
# Serialise the new-title predictions to CSV text and upload them to S3.
message = 'Writing new title predictions over time to S3 as an csv file'
logger.info(message)
print(message)

filename = 'output_percent_view/new_title_prediction.csv'

# With no target path, to_csv returns the CSV text as a string.
content = new_title_output.to_csv(index=False)

to_s3(filename, output_bucket, content)

In [None]:
# Serialise the existing-title predictions to CSV text and upload them to S3.
message = 'Writing existing title predictions over time to S3 as an csv file'
logger.info(message)
print(message)

filename = 'output_percent_view/existing_title_prediction.csv'

# With no target path, to_csv returns the CSV text as a string.
content = existing_title_output.to_csv(index=False)

to_s3(filename, output_bucket, content)