In [1]:
#!pip install lightgbm

In [2]:
import pandas as pd
import numpy as np
import itertools as it
import os
import io

import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from pre_post_launch.model import ModelMain
from pre_post_launch.data_preprocessing import DataPreprocessing

# configs
from pre_post_launch.config import percent_data_process_info
from pre_post_launch.config import metadata_process_info
from pre_post_launch.config import default_params_dict as params_dict
from pre_post_launch.config import model_name_list
from pre_post_launch.config import params_tunning_dict

# Reading Data 

Read the input data from multiple sources; each input file is produced by its corresponding query.

### Step 1 of the Prediction Process: Getting Data
Step 1.1: Update the funnel metrics by running the SageMaker notebook 'query_pipeline' under the '/query' folder.

Step 1.2: Run each of the queries in '/day28_prediction/query/' to extract the input CSV files loaded below.

In [3]:
import boto3
import sys
s3 = boto3.resource('s3')
bucket = s3.Bucket('hbo-ingest-datascience-content-dev')
# Load every object in the ingest bucket into a notebook-level DataFrame named
# after the object's key (the text before the first '.'), so the key
# 'vtp_feature.csv' becomes the variable `vtp_feature`.
# bucket.objects.all() handles pagination; each item is an ObjectSummary that
# carries no payload, so obj.get() is needed to reach the object's 'Body'.
for obj in bucket.objects.all():
    key = obj.key
    print('reading csv file {}'.format(key))
    var_name = key.split('.')[0]
    # The name comes from external data (the S3 key). Bind it through the
    # namespace dict instead of exec() so a key can never be executed as code,
    # and fail with a clear message when the key cannot be a variable name.
    if not var_name.isidentifier():
        raise ValueError(
            'S3 key {!r} does not map to a valid variable name'.format(key))
    body = obj.get()['Body']
    # NOTE(review): the NA marker is the three characters backslash,
    # backslash, N -- exactly what the former exec() string evaluated to.
    # Confirm this matches the null marker written by the export queries.
    frame = pd.read_csv(body, na_values=[r'\\N'])
    frame.columns = frame.columns.str.lower()
    globals()[var_name] = frame

reading csv file funnel_metric_feature.csv
reading csv file metadata_feature.csv
reading csv file mp_click_feature.csv
reading csv file mp_click_prelaunch_feature.csv
reading csv file sub_count_feature.csv
reading csv file sub_total_feature.csv
reading csv file trailer_feature.csv
reading csv file vtp_feature.csv


In [4]:
def _drop_all_null_columns(frame):
    """Return `frame` restricted to the columns that hold at least one non-null value."""
    has_value = frame.notnull().any()
    return frame.loc[:, has_value]


# Columns that are null in every row are removed from these three tables.
funnel_metric_feature = _drop_all_null_columns(funnel_metric_feature)
vtp_feature = _drop_all_null_columns(vtp_feature)
sub_total_feature = _drop_all_null_columns(sub_total_feature)


In [5]:
# Passed to cv_func.get_X_y as `day001_popularity_threshold`.
percentile_used = 0.8

# Feature tables handed to ModelMain, in this order.
data_list = [
    funnel_metric_feature,
    metadata_feature,
    mp_click_prelaunch_feature,
    trailer_feature,
    sub_total_feature,
    vtp_feature,
]

In [6]:
# Number of folds handed to parameter_tunning later on.
nfold = 10

# Create the modelling object from the feature tables and the
# 'label_columns' / 'num_columns' entries of metadata_process_info.
cv_func = ModelMain(
    data_list,
    metadata_process_info['label_columns'],
    metadata_process_info['num_columns'],
)

# Optional filter (disabled): keep movies only.
#cv_func.df = cv_func.df.loc[cv_func.df['content_category']=='movies',:]

Final title size: 7714, All title size: 7714


# Parameter Tuning (when 3 days of data are available)

### Step 2 (optional): Tune the Parameters

If this step is skipped, the default parameter values in the 'config.py' file will be used.

In [7]:
# exact_X_pred:
#   True  -> train the model with the maximum percent view data length, then predict
#   False -> build one model for each length of data
#            (e.g. if day3 is the max length of data: day1 model, day2 model, day3 model)
# max_num_day:
#   the length of percent view and view through portion data
percent_data_process_info.update(exact_X_pred=False, max_num_day=3)

# Name of the model whose parameters are tuned.
model_name = 'lgb'

# Build X and y before tuning.
cv_func.get_X_y(
    percent_data_process_info,
    metadata_process_info,
    day001_popularity_threshold=percentile_used,
)

keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1543 titles considered
X and y are ready based on the input params


In [8]:
# Tune the model parameters.
#   model_name                : name of the model to tune, only lgb available for now
#   params_tunning_dict       : candidate values for each parameter; parameters
#                               can be added to or removed from this dict
#   percent_data_process_info : parameters related to post-launch viewership
#   nfold                     : number of folds to use
cv_func.parameter_tunning(
    model_name,
    params_tunning_dict,
    percent_data_process_info,
    nfold=nfold,
)


parameter combination 1




parameter combination 2




parameter combination 3




parameter combination 4




parameter combination 5




parameter combination 6




parameter combination 7




parameter combination 8




# New Title Prediction, Post Launch 

### Step 3: Make Prediction
Note: Because this is a post-launch prediction, only titles that already have partial percent-view and view-through-portion data will be predicted.

In [9]:
# Use the parameter set that minimised the SMAPE during tuning (Step 2).
# Step 2 is documented as optional, so when no tuning result is available the
# `params_dict` already in scope (imported from config as default_params_dict)
# is kept instead of failing on the missing attribute.
tuned_params = getattr(cv_func, 'min_smape_param', None)
if tuned_params is not None:
    params_dict = tuned_params['min_smape_original']
percent_data_process_info['exact_X_pred'] = False

In [10]:
# Get the prediction trajectory over the length of available data: for each
# day count 1..21, rebuild X and y, train and predict, then merge the per-day
# outputs into one wide table.

# Snapshot of the settings; every iteration restarts from a copy of it.
# NOTE(review): dict() is a shallow copy, so nested values (if any) are still
# shared between iterations -- confirm nothing downstream mutates them.
percent_data_process_info_copy = dict(percent_data_process_info)
model_name_list = ['lgb']
merge_keys = ['title_name', 'match_id', 'target', 'program_type']

trained_model_list = []
train_data = []
test_data = []
new_title_output = None

for day in range(1, 22):
    # Reset the settings from the snapshot. Re-importing the config here would
    # not reset anything: the module is cached, so the import returns the same
    # dict object that earlier cells have already modified.
    percent_data_process_info = dict(percent_data_process_info_copy)
    percent_data_process_info['max_num_day'] = day

    cv_func.get_X_y(percent_data_process_info,
                    metadata_process_info,
                    day001_popularity_threshold=percentile_used)
    cv_func.predict_new_titles(model_name_list,
                               params_dict,
                               percent_data_process_info)

    # Keep the fitted model and the train / prediction inputs of each day.
    trained_model_list.append(cv_func.trained_model)
    train_data.append(cv_func.X_base)
    test_data.append(cv_func.X_pred)

    # The first day seeds the table; later days are outer-merged onto it.
    if new_title_output is None:
        new_title_output = cv_func.new_title_output
    else:
        new_title_output = new_title_output.merge(cv_func.new_title_output,
                                                  how='outer',
                                                  on=merge_keys)

new_title_output = (new_title_output
                    .drop(columns=['target', 'program_type'])
                    .sort_values('day_1_lgb', ascending=False))

keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1543 titles considered
X and y are ready based on the input params




keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1543 titles considered
X and y are ready based on the input params




keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1543 titles considered
X and y are ready based on the input params




keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1543 titles considered
X and y are ready based on the input params




keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1543 titles considered
X and y are ready based on the input params




keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1543 titles considered
X and y are ready based on the input params




keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1543 titles considered
X and y are ready based on the input params




keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1543 titles considered
X and y are ready based on the input params




keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1543 titles considered
X and y are ready based on the input params




keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1543 titles considered
X and y are ready based on the input params




keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1543 titles considered
X and y are ready based on the input params




keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1543 titles considered
X and y are ready based on the input params




keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1543 titles considered
X and y are ready based on the input params




keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1543 titles considered
X and y are ready based on the input params




keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1543 titles considered
X and y are ready based on the input params




keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1543 titles considered
X and y are ready based on the input params




keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1543 titles considered
X and y are ready based on the input params




keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1543 titles considered
X and y are ready based on the input params




keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1543 titles considered
X and y are ready based on the input params




keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1543 titles considered
X and y are ready based on the input params




keeps the titles above 80.0 percentile day1 viewed over all titles only
only 1543 titles considered
X and y are ready based on the input params




In [11]:
# Placeholder cell: it contains only the note below, written as a bare string
# (echoed as the cell's output). No title selection is performed here.
'''
Fill in the title names in the format below to get the predictions
*The title with 28 days of history does not show in the output here
'''

'\nFill in the title names in the format below to get the predictions\n*The title with 28 days of history does not show in the output here\n'

# Write csvs to S3

### Step 4: Write the prediction result to S3

In [12]:
def to_s3(filename, content, bucket='hbo-outbound-datascience-content-dev'):
    """Upload `content` to S3 under the object key `filename`.

    Parameters
    ----------
    filename : str
        Object key to write inside the bucket.
    content : str or bytes
        Payload passed to `put_object` as the object body.
    bucket : str, optional
        Destination bucket name. Defaults to the outbound content bucket, so
        existing two-argument calls behave as before.

    Returns
    -------
    None
    """
    client = boto3.client('s3')
    client.put_object(Bucket=bucket, Key=filename, Body=content)

In [13]:
# Serialise the per-day prediction table to CSV text (no index column) and
# upload it; to_csv returns the text itself when no target is given.
filename = 'prediction_over_time.csv'
content = new_title_output.to_csv(index=False)

to_s3(filename, content)

In [14]:
# Most recent available prediction per title: forward-fill across the 'lgb'
# columns (left to right) and take the right-most one.
lgb_columns = new_title_output.columns[new_title_output.columns.str.contains('lgb')]
latest_prediction = (new_title_output[lgb_columns]
                     .ffill(axis=1)
                     .iloc[:, -1]
                     # Name the Series directly instead of renaming a column by
                     # its literal name afterwards, so this still works when the
                     # last prediction column is not 'day_21_lgb'.
                     .rename('last_pred'))
last_pred = (new_title_output[['title_name']]
             .merge(latest_prediction, left_index=True, right_index=True)
             .sort_values('last_pred', ascending=False))

In [15]:
# Serialise the latest-prediction table to CSV text (no index column) and
# upload it; to_csv returns the text itself when no target is given.
filename = 'last_prediction.csv'
content = last_pred.to_csv(index=False)

to_s3(filename, content)