In [None]:
import copy
import io
import itertools as it
import logging
import os
import sys

import boto3
import numpy as np
import pandas as pd

from lib.model import ModelMain

# configs
from lib.config import percent_data_process_info
from lib.config import metadata_process_info
from lib.config import default_params_dict as params_dict
from lib.config import model_name_list
from lib.config import params_tunning_dict

In [None]:
# Print the installed packages and their versions via a shell call to pip.
# NOTE(review): `!pip` runs whichever pip is first on the shell PATH; `%pip freeze`
# would target the kernel's own environment — confirm the IPython version supports it.
!pip freeze

# Reading Data 

Read the input data from multiple sources, each produced by its corresponding query.

### Step 1 of the Prediction Process: Getting Data
Step 1.1: Update the funnel metrics by running the SageMaker ipynb file 'query_pipeline' under the '/query' folder.

Step 1.2: Run each of the queries in the '/day28_prediction/query/' folder to extract each of the input CSV files below.

In [None]:
logger = logging.getLogger()
# NOTE(review): no handler or level is configured in this notebook. Unless an
# imported module sets one up, INFO records on the root logger are not shown
# (the root logger defaults to WARNING) — confirm.
logger.info('Loading inputs')

s3 = boto3.resource('s3')
bucket = s3.Bucket('hbo-ingest-datascience-content-dev')

# Load every object whose key does not contain 'sagemaker' as a CSV and bind
# it to a notebook-level variable named after the part of the key before the
# first '.', e.g. 'vtp_feature.csv' -> vtp_feature. Later cells refer to these
# variables by name.
#
# bucket.objects.all() handles pagination. Each item is an ObjectSummary,
# which does not contain the body, so get() is called to fetch it.
for obj in bucket.objects.all():
    key = obj.key
    if 'sagemaker' in key:
        continue

    var_name = key.split('.')[0]
    # The name is derived from external data (the S3 key). Bind it through
    # globals() instead of exec() so the key is never interpreted as code,
    # and skip keys that cannot be used as a variable name (e.g. keys that
    # contain '/' or '-') instead of failing on them.
    if not var_name.isidentifier():
        logger.warning('Skipping {}: {!r} is not a valid variable name'.format(key, var_name))
        continue

    logger.info('Loading csv file {}'.format(key))
    body = obj.get()['Body']
    # The NA marker is kept exactly as before: two backslashes followed by N.
    # NOTE(review): if the exports mark nulls as a single-backslash \N this
    # marker will not match — confirm against the query output.
    frame = pd.read_csv(body, na_values=[r'\\N'])
    frame.columns = frame.columns.str.lower()
    globals()[var_name] = frame

In [None]:
# Remove the columns that hold no data at all (every value is null) from the
# three feature tables. Rows are left untouched.
funnel_metric_feature = funnel_metric_feature.dropna(axis='columns', how='all')
vtp_feature = vtp_feature.dropna(axis='columns', how='all')
sub_total_feature = sub_total_feature.dropna(axis='columns', how='all')


In [None]:
# Value passed to get_X_y as day001_popularity_threshold in the prediction cell.
percentile_used = 0.8

# Input tables handed to ModelMain, in this order.
data_list = [
    funnel_metric_feature,
    metadata_feature,
    mp_click_prelaunch_feature,
    trailer_feature,
    sub_total_feature,
    vtp_feature,
]

In [None]:
# Create the model object from the loaded input tables.
logger.info('Setting up the prediction model')

# Passed as nfold to parameter_tunning in the prediction cell.
nfold = 10

cv_func = ModelMain(
    data_list,
    metadata_process_info['label_columns'],
    metadata_process_info['num_columns'],
)

# Optional filter (disabled): keep movies only.
# cv_func.df = cv_func.df.loc[cv_func.df['content_category']=='movies',:]

# New Title Prediction, Post Launch 

### Step 2: Make Prediction
Note: Because this is a post-launch prediction, predictions are made only for titles that have partial percent-view and view-through-portion data.

In [None]:
# Get the prediction trajectory over the length of available data: for each
# value of max_num_day (1, 2, 3) rebuild X/y, tune the 'lgb' parameters,
# predict the new titles, and merge the per-day predictions into one table.

model_name = 'lgb'
model_name_list = ['lgb']
percent_data_process_info['exact_X_pred'] = False
output_flag = False

# Snapshot of the baseline settings. A deep copy is taken so that nested
# values (lists/dicts inside the config) are also isolated; a shallow
# dict(...) copy would share them, letting changes made while processing one
# day leak into the next.
percent_data_process_info_copy = copy.deepcopy(percent_data_process_info)

for day in [1, 2, 3]:
    # Start every iteration from the pristine baseline, then set the number
    # of days for this run.
    percent_data_process_info = copy.deepcopy(percent_data_process_info_copy)
    percent_data_process_info['max_num_day'] = day

    # get x and y
    logger.info('Get X and y for day {}'.format(day))
    cv_func.get_X_y(percent_data_process_info,
                    metadata_process_info,
                    day001_popularity_threshold = percentile_used)

    # tune parameter
    logger.info('Tune parameter for day {}'.format(day))
    cv_func.parameter_tunning(model_name,
                              params_tunning_dict,
                              percent_data_process_info,
                              nfold = nfold)
    # Use the parameter set stored under 'min_smape_original' for the
    # prediction below, and log the tuning statistics.
    params_dict = cv_func.min_smape_param['min_smape_original']
    param_stats = cv_func.parameter_tunning_stats
    logger.info('SMAPE for all titles {}'.format(param_stats['min_smape_all']))
    logger.info('SMAPE for the originals {}'.format(param_stats['min_smape_original']))

    # make prediction
    logger.info('Making prediction for day {}'.format(day))
    cv_func.predict_new_titles(model_name_list,
                               params_dict,
                               percent_data_process_info)

    # The first day's output seeds the table; later days are outer-merged
    # onto it so titles missing from some days are still kept.
    if output_flag:
        new_title_output = new_title_output.merge(cv_func.new_title_output,
                                                  how = 'outer',
                                                  on = ['title_name', 'match_id', 'target', 'program_type'])
    else:
        new_title_output = cv_func.new_title_output
        output_flag = True

# Drop the merge-only columns and order titles by the 'day_1_lgb' column,
# largest first.
new_title_output = new_title_output.drop(columns = ['target', 'program_type']).sort_values('day_1_lgb', ascending = False)

# Write csvs to S3

### Step 3: Write the prediction result to S3

In [None]:
def to_s3(filename, content, bucket='hbo-outbound-datascience-content-dev'):
    """Upload ``content`` to S3 under the object key ``filename``.

    Parameters
    ----------
    filename : str
        Object key to write.
    content : str or bytes
        Payload passed as the object body.
    bucket : str, optional
        Destination bucket name. Defaults to the bucket this notebook has
        always written to, so existing two-argument calls are unaffected.

    Returns
    -------
    None
    """
    client = boto3.client('s3')
    client.put_object(Bucket=bucket, Key=filename, Body=content)

In [None]:
logger.info('Writing prediction over time to S3 as an csv file')

filename = 'prediction_over_time.csv'

# Serialise the table (without its index) into an in-memory text buffer and
# upload the resulting CSV text.
csv_buffer = io.StringIO()
new_title_output.to_csv(csv_buffer, index=False)
content = csv_buffer.getvalue()

to_s3(filename, content)

In [None]:
# Build a two-column table: title_name and its most recent available prediction.

# Columns whose names contain 'lgb' hold the per-day predictions.
lgb_predictions = new_title_output.loc[:, new_title_output.columns.str.contains('lgb')]

# Forward-fill along each row so the right-most column carries the last
# non-null prediction of that row, then take that column. The Series is
# renamed directly: renaming after the merge via new_title_output.columns[-1]
# only works when the frame's last column happens to be an 'lgb' column,
# and otherwise leaves no 'last_pred' column to sort on.
latest_prediction = lgb_predictions.ffill(axis=1).iloc[:, -1].rename('last_pred')

last_pred = new_title_output[['title_name']].merge(latest_prediction, left_index = True, right_index = True)
last_pred = last_pred.sort_values('last_pred', ascending = False)

In [None]:
logger.info('Writing past prediction to S3 as an csv file')

filename = 'last_prediction.csv'

# Serialise the table (without its index) into an in-memory text buffer and
# upload the resulting CSV text.
csv_buffer = io.StringIO()
last_pred.to_csv(csv_buffer, index=False)
content = csv_buffer.getvalue()

to_s3(filename, content)