In [5]:
import os
import sys

import pandas as pd
import numpy as np
import pickle
from datetime import date, datetime, timedelta

from category_encoders import OneHotEncoder
import xgboost as xgb

import boto3
import logging 

logger = logging.getLogger()
logging.basicConfig(level=logging.INFO)
logger.info(f'Starting Notebook')

INFO:root:Starting Notebook


In [6]:
from io import StringIO
class Utils():
    @staticmethod
    def to_csv_s3(content, bucket, key_path, filename):
        client = boto3.client('s3')
        key = os.path.join(key_path, filename)
        csv_buffer = StringIO()
        content.to_csv(csv_buffer)
        client.put_object(Bucket=bucket, Key=key, Body=csv_buffer.getvalue(), ACL="bucket-owner-full-control")
        logger.info(f'Saved to {bucket}/{key}')
    
    @staticmethod
    def to_pkl_s3(content, bucket, key_path, filename):
        client = boto3.client('s3')
        key = os.path.join(key_path, filename)
        obj = pickle.dumps(content)
        client.put_object(Bucket=bucket, Key=key, Body=obj)
        logger.info(f'Saved model to {os.path.join(bucket, key)}')
        logger.info(f'Saved to {bucket}/{key}')

    @staticmethod
    def read_csv_s3(bucket, key_path,filename):
        client = boto3.client('s3')
        key = os.path.join(key_path, filename)
        logger.info(f'Reading from {bucket}/{key}')
        obj = client.get_object(Bucket=bucket, Key=key)
        df = pd.read_csv(obj['Body'], na_values="\\N")
        return df
        
    @staticmethod
    def read_pkl_s3(bucket, key_path,filename):
        client = boto3.client('s3')
        key = os.path.join(key_path, filename)
        logger.info(f'Reading from {bucket}/{key}')
        obj = client.get_object(Bucket=bucket, Key=key)
        body = obj['Body'].read()
        model = pickle.loads(body)
        return model


class BaseFeatureMunger():
    def __init__(self, df_in, df_imdb_munged, df_talent, mode, date_run):
        self.df_in = df_in
        self.df_imdb_munged = df_imdb_munged
        self.df_talent = df_talent
        self.mode = mode
        self.date_run = date_run
        self.clean_df()
        self.fill_missing_data()
        if mode=='train':
            self.filter_df()
        self.get_datetime_features()
        self.aggregate_df()
        self.munge_talent_feature()
        self.merge_features()
        self.get_first_views()
        self.return_df()
        
    def return_df(self):
        col_base = ['title_id', 'title_name', 'season_number',
       'content_category', 'content_source', 'category',
       'tier', 'effective_start_date', 'request_date', 'first_views',
       'days_since_premiere', 'days_on_platform', 'start_year']
        
        col_title = ['title_name', 'title_id','season_number','tier','content_category','category',
                'effective_start_date', 'schedule_label', 'prequel_count', 'prequel_featured_count','page_views','first_views','first_views_log']
        if self.mode=='train':
            return self.df_in[col_base], self.df_in_title[col_title]
        elif self.mode=='predict':
            return self.df_in_title[col_title]

    def clean_df(self):
        logger.info(f'base_data shape: {self.df_in.shape}')
        logger.info(f'base_data null: {self.df_in.isnull().sum()}')
        self.df_in.columns= self.df_in.columns.str.lower()
        self.dic_dtype = {'season_number':int, 'tier':int}
        self.df_in = self.df_in.astype(self.dic_dtype)
        self.df_in['effective_start_date'] = pd.to_datetime(self.df_in['effective_start_date'])
        
    def fill_missing_data(self):
        self.df_in['content_category'] = self.df_in['content_category'].fillna('series')
        self.df_in.loc[self.df_in['category'].isin(['Specials']), 'content_category']='special'
        self.df_in.loc[self.df_in['category'].isin(['Popcorn','Pay 1 WB Theatricals','Scripted Features', 'Pay1']), 'content_category']='movies'
        self.df_in.loc[self.df_in['title_name'].str.contains('Harry Potter'), 'content_category']='special'
        self.df_in = self.df_in.fillna(0)

    def filter_df(self):
        self.date_max = datetime.strptime(self.date_run, '%Y-%m-%d')- timedelta(days=60)
        self.df_in = self.df_in[(self.df_in['effective_start_date']<=self.date_max)]

    def get_datetime_features(self):
        self.df_in['start_year'] = self.df_in['effective_start_date'].dt.year
        self.df_in['start_quarter'] = self.df_in['effective_start_date'].dt.quarter
        
    def aggregate_df(self):
        grpby_title= ['title_name', 'title_id','tier','content_category','category','season_number',
                'effective_start_date', 'schedule_label']
        self.df_in_title = self.df_in[grpby_title + ['first_views']].groupby(by=grpby_title).sum().reset_index()
    
    def munge_talent_feature(self):
        grpby = ['title_name', 'season_number', 'category', 'tier', 'effective_start_date'] 
        self.df_talent.columns= self.df_talent.columns.str.lower()
        self.df_talent = self.df_talent.fillna(0)
        self.df_talent['effective_start_date'] = pd.to_datetime(self.df_talent['effective_start_date']).dt.date
        self.df_talent['effective_start_date'] = pd.to_datetime(self.df_talent['effective_start_date'])
        self.df_talent['tier'] = self.df_talent['tier'].astype(int)
        
        self.df_pv_title = self.df_talent[self.df_talent.talent_category=='cast']
        self.df_pv_title = self.df_pv_title.sort_values(by=grpby+['page_views'], ascending=False)
        self.df_pv_title = self.df_pv_title.groupby(by=grpby).head(3).reset_index(drop=True).groupby(by=grpby).sum().reset_index()

    def merge_features(self):
        key_merge = ['title_name','season_number', 'category', 'tier', 'effective_start_date']
        self.df_in_title = self.df_in_title.merge(self.df_imdb_munged, on= key_merge+['content_category'], how='left')\
                                        .merge(self.df_pv_title, on=key_merge, how='left')
        logger.info(f'title imdb shape: {self.df_in_title.shape}')
#         logger.info(f'title imdb features null: {self.df_in_title.isnull().sum()}')
        self.df_in_title = self.df_in_title.fillna(0)
        
    def get_first_views(self):
        ## Scale first_views 
        self.df_in_title['first_views_log'] = np.log(self.df_in_title['first_views']) 
        self.df_in_title.loc[(self.df_in_title.first_views==0), 'first_views_log']=0
        

class IMDBFeatureMunger(BaseFeatureMunger):
    
    def __init__(self, df_in, mode, date_run):
        self.df_in = df_in
        self.date_run = date_run 
        
        self.clean_df()
        if mode=='train':
            self.filter_df()
        self.get_series_features()
#         if mode=='train':
#             self.adjust_series_for_training_data()
        self.get_non_series_prequel_features()
        self.get_non_series_prequel_ref_features()
        self.merge_non_series_features()
        self.return_df()
        
    def return_df(self):
        col_imdb = ['title_name', 'season_number', 'content_category', 'category', 
                    'tier', 'effective_start_date', 'prequel_count', 'prequel_featured_count']
        self.df_imdb_munged = pd.concat([self.df_series_title, self.df_nseries_title])
#         self.df_imdb_munged = self.df_nseries_title.copy()
        self.df_imdb_munged = self.df_imdb_munged[col_imdb].groupby(by=col_imdb[:-2]).sum().reset_index()
        return self.df_imdb_munged
    
    def get_series_features(self):
        grpby_series = ['title_name','season_number','content_category','category','tier','effective_start_date']
        self.df_series = self.df_in[(self.df_in.content_category.isin(['series'])) & (self.df_in.reference_type.isin(['featured_in']))] 
        self.df_series_title = self.df_series.groupby(by=grpby_series).agg({'reference_title_id':'nunique'}).reset_index()
        self.df_series_title = self.df_series_title.rename(columns={'reference_title_id':'ref_ref_featured_in'})
        self.df_series_title['prequel_count'] = self.df_series_title['season_number']-1
        self.df_series_title['prequel_featured_count'] = self.df_series_title['ref_ref_featured_in']

    def adjust_series_for_training_data(self):     
        self.df_series_title['ref_ref_featured_in'] = (self.df_series_title['ref_ref_featured_in'] * (self.df_series_title['season_number']-1))/self.df_series_title['season_number']
        self.df_series_title.loc[self.df_series_title.season_number>1, 'prequel_featured_count'] = (self.df_series_title['ref_ref_featured_in'])/(self.df_series_title['season_number']-1)     
        
        
    def get_non_series_prequel_features(self):
        self.grpby_nseries=['title_name','season_number','content_category','category','tier','effective_start_date','imdb_imdb_series_id']
        self.df_nseries = self.df_in[(self.df_in.reference_type.isin(['follows','spin_off_from','remake_of','version_of'])) & ((self.df_in.reference_title_type.isin(['movie','tvSeries', 'videoGame'])))] 
        self.df_nseries['ref_ref_count'] = self.df_nseries.groupby('reference_title_id').reference_reference_title_id.transform('nunique')
        self.df_nseries = self.df_nseries[self.df_nseries['ref_ref_count']>0] ## Only count references that have secondary references 
        
        self.df_nseries_preq = self.df_nseries.groupby(by=self.grpby_nseries+['reference_type']).agg({'reference_title_id':'nunique'}).reset_index()
        self.df_nseries_preq = pd.pivot_table(self.df_nseries_preq, 
                          index = self.grpby_nseries, 
                          columns = 'reference_type', 
                          values = 'reference_title_id').reset_index()
        self.df_nseries_preq = self.df_nseries_preq.rename(columns={'follows':'ref_follows',  
                                                                          'spin_off_from':'ref_spin_off_from', 
                                                                          'remake_of':'ref_remake_of',
                                                                          'version_of':'ref_version_of'})
    def get_non_series_prequel_ref_features(self):
        self.df_nseries_preq_ref = self.df_nseries.groupby(by=self.grpby_nseries+['reference_reference_type']).agg({'reference_reference_title_id':'nunique'}).reset_index()
#         display(self.df_nseries_preq_ref[self.df_nseries_preq_ref.title_name=='The Matrix Resurrections'])
        self.df_nseries_preq_ref = pd.pivot_table(self.df_nseries_preq_ref, 
                  index = self.grpby_nseries, 
                  columns = 'reference_reference_type', 
                  values = 'reference_reference_title_id').reset_index()        
        self.df_nseries_preq_ref = self.df_nseries_preq_ref.rename(columns={'featured_in':'ref_ref_featured_in'})
        
    def merge_non_series_features(self):
        col_preq = ['title_name', 'tier','season_number','content_category','category','effective_start_date','imdb_imdb_series_id','ref_follows','ref_spin_off_from','ref_remake_of', 'ref_version_of']
        col_ref_ref = ['title_name','season_number','category','ref_ref_featured_in']
        self.df_nseries_title = self.df_nseries_preq[col_preq].merge(self.df_nseries_preq_ref[col_ref_ref], how='outer', on=['title_name','season_number','category'])
        self.df_nseries_title['prequel_count'] = self.df_nseries_title[['ref_follows','ref_spin_off_from','ref_remake_of','ref_version_of']].sum(axis=1)
        self.df_nseries_title['prequel_featured_count'] = self.df_nseries_title['ref_ref_featured_in']/self.df_nseries_title['prequel_count']

        
        
class PreProcessor():
    def __init__(self, df_in, mode, date_run):
        self.df_in = df_in
        self.date_run = date_run
        self.get_parameters()
        self.adjust_tiers()
        self.get_series_nonseries_feature()
        self.categorize_talent_feature()
        self.winsorize_feature()
        if mode=='train':
            self.winsorize_label()
            self.resample_data()
        self.return_df()
        
    def return_df(self):
        return self.df_in
    
    def get_parameters(self):
        self.max_firstviews = 1000000
        self.old_sampling_rate = 0.4
        self.recent_sampling_rate = 0.6
        self.pay1_sampling_rate = 0.5
        
    def adjust_tiers(self):
        self.df_in['tier_adj'] = self.df_in['tier']
        self.df_in.loc[(self.df_in.title_name=='Reminiscence'), 'tier_adj'] = 2
        self.df_in.loc[(self.df_in.category=='Pay1') & (self.df_in.tier==1), 'tier_adj'] = 2
        self.df_in.loc[(self.df_in.category=='Pay1') & (self.df_in.tier==2), 'tier_adj'] = 3
        self.df_in.loc[(self.df_in.effective_start_date < '2022-04-01') & (self.df_in.category=='Popcorn') & (self.df_in.tier==1), 'tier_adj'] = 0
        self.df_in.loc[self.df_in.title_name=='Avatar 2', 'tier_adj'] = 1
        
    def get_series_nonseries_feature(self):
        self.df_in['prequel_count_s'] = self.df_in['prequel_count']
        self.df_in['prequel_featured_count_s'] = self.df_in['prequel_featured_count']
        self.df_in['page_views_s'] = self.df_in['page_views']
        
        self.df_in.loc[(self.df_in.content_category!='movies'),['page_views','prequel_count','prequel_featured_count']] = -1
        self.df_in.loc[(self.df_in.content_category=='movies'), ['page_views_s','prequel_count_s','prequel_featured_count_s']] = -1
        
        self.df_in.loc[(self.df_in.category=='Kids & Family') & ~(self.df_in.title_name.str.contains('Degrassi')), ['page_views_s','prequel_count_s','prequel_featured_count_s']] = 0
        self.df_in.loc[(self.df_in.tier_adj>=2) & (self.df_in.category!='Popcorn'), ['page_views','prequel_featured_count','page_views_s','prequel_featured_count_s']] = -2
        self.df_in.loc[(self.df_in.tier_adj==2) & (self.df_in.category=='Popcorn'), ['page_views','page_views_s','prequel_count','prequel_featured_count_s']] = 0
        
    def categorize_talent_feature(self):
        page_views_bin = [-10, -2, -1, 100000, 100000000]  # -1: series, -2: low tier, 
        page_views_s_bin = [-10, -2, -1, 100000, 100000000]  # -1: movie, -2: low tier, 
        page_views_label = [-2, -1, 1000, 100000]
        page_views_s_label = [-2, -1, 100000, 200000]
        
        self.df_in['page_views']= pd.cut(self.df_in['page_views'], bins=page_views_bin,labels=page_views_label).fillna(100000).astype(int)
        self.df_in['page_views_s']= pd.cut(self.df_in['page_views_s'], bins=page_views_s_bin,labels=page_views_s_label).fillna(200000).astype(int)        

    def winsorize_feature(self):
        ## IMDB Features 
        self.df_in.loc[(self.df_in['prequel_count']>5)& (self.df_in.category!='Popcorn'), ['prequel_count']] = 5
        self.df_in.loc[(self.df_in['prequel_count_s']>5)& (self.df_in.category!='Popcorn'), ['prequel_count_s']] = 5
        self.df_in.loc[(self.df_in['prequel_featured_count']>20)& (self.df_in.category!='Popcorn'), ['prequel_featured_count']] = 20   
        self.df_in.loc[(self.df_in['prequel_featured_count_s']>40)& (self.df_in.category!='Popcorn'), 'prequel_featured_count_s'] = 40
#         self.df_in.loc[(self.df_in.tier_adj==2) & (self.df_in.category=='Popcorn') & (self.df_in['prequel_count']>2), ['prequel_count']] = 1
        self.df_in.loc[(self.df_in.tier_adj==2) & (self.df_in.category=='Popcorn') & (self.df_in['prequel_featured_count']<=20), ['prequel_featured_count']] = 0
#         self.df_in.loc[(self.df_in.tier_adj==2) & (self.df_in.category=='Popcorn') & (self.df_in['prequel_featured_count']>30), ['prequel_featured_count']] = 1

    def winsorize_label(self):
        self.df_in.loc[self.df_in.first_views>self.max_firstviews, 'first_views'] = self.max_firstviews
    
    def resample_data(self):  
        self.df_in_old = self.df_in[(self.df_in.effective_start_date < datetime.strptime(self.date_run, '%Y-%m-%d')- timedelta(days=270))]
        self.df_in_recent = self.df_in[(self.df_in.effective_start_date >= datetime.strptime(self.date_run, '%Y-%m-%d')- timedelta(days=270))]
        target_count = self.df_in.groupby(by=['tier_adj','category'])['title_name'].count().max()
        print(target_count)
        target_count_old = int(round(target_count*self.old_sampling_rate))
        target_count_new = int(round(target_count*self.recent_sampling_rate))
        
        ## Loop through old and recent data
        df_resample_list = []
        for df_sample, target_count in zip([self.df_in_old, self.df_in_recent], [target_count_old, target_count_new]):
            list_tier_cat= df_sample[['tier_adj','category']].drop_duplicates(subset=['tier_adj','category']).values.tolist()
            
            ## Loop through unique combination of tier & category to resample 
            for tier_cat in list_tier_cat:
                if tier_cat[1]=='Pay1':
                    df_resample_list.append(df_sample[(df_sample.tier_adj==tier_cat[0]) & (df_sample.category==tier_cat[1])]\
                                        .sample(int(round(target_count*self.pay1_sampling_rate)), replace=True))                
                else:
                    df_resample_list.append(df_sample[(df_sample.tier_adj==tier_cat[0]) & (df_sample.category==tier_cat[1])]\
                                        .sample(target_count, replace=True))
        self.df_in = pd.concat(df_resample_list, axis=0)

    
    
class XGB(Utils):
    def __init__(self, df_in, mode, date_run):
        self.df_in = df_in    
        self.mode = mode
        self.date_run = date_run
        self.get_parameters()
        
        if mode=='train':
            self.train_xgb()
            self.save_model()
        elif mode=='predict':
            self.get_model()
            self.predict_xgb()
            self.return_df()
            
    @staticmethod
    def _encode(df, categoricals):
        
        """
        perform category encoding on the data
        :param df: dataframe to be encoded
        :param categoricals: list of name of categorical columns
        :return ohe, x_ohe: OHE object and OHE-encoded data
        """
        ohe = OneHotEncoder(cols=categoricals, 
                            handle_unknown='return_nan',
                           handle_missing='return_nan',  
                           use_cat_names=True) 
        x_ohe = ohe.fit_transform(df)
        return ohe, x_ohe
    
    def return_df(self):
        return self.df_pred
    
    def get_parameters(self):
        self.target = 'first_views_log'
        self.features_cat=['tier_adj','category','content_category','prequel_count', 'page_views','page_views_s'] 
        self.features_cont= ['prequel_featured_count','prequel_featured_count_s'] 
        self.param_xgb = {"booster":"gbtree",
                     "objective":"reg:squarederror",
                    "gamma":1}

    def train_xgb(self):
        x_train = self.df_in[self.features_cat + self.features_cont]
        y_train = self.df_in[self.target]
        self.ohe, x_ohe = self._encode(x_train, self.features_cat)
        dm_train = xgb.DMatrix(x_ohe, label=y_train)

        ## train 
        self.model = xgb.train(params = self.param_xgb, dtrain = dm_train, num_boost_round = 10)
        
    def save_model(self):
        dict_model = {'model': self.model, 'ohe': self.ohe}
        Utils.to_pkl_s3(dict_model, input_bucket, key_path, f'fv_{self.date_run}.pkl')
        logger.info(f'Done model training {self.date_run}')
    
    def get_model(self):
        dict_model = Utils.read_pkl_s3(input_bucket, key_path, f'fv_{self.date_run}.pkl')
        self.ohe = dict_model['ohe']
        self.model = dict_model['model']
        
    def predict_xgb(self):
        x_test = self.df_in[self.features_cat + self.features_cont]
        x_ohe_test = self.ohe.transform(x_test)
        dm_test = xgb.DMatrix(x_ohe_test)
        pred = self.model.predict(dm_test)

        self.df_pred = self.df_in[list(set(['title_name','category','season_number', 'effective_start_date', 'tier','tier_adj', 'schedule_label'] + self.features_cat + self.features_cont))]
        self.df_pred['first_views_log_pred'] = pred
        self.df_pred[f'first_views_pred'] = np.exp(self.df_pred[f'first_views_log_pred'])
        self.df_pred['pred_date'] = self.date_run
        self.df_pred['pred_date'] = pd.to_datetime(self.df_pred['pred_date'])
        self.df_pred.loc[(self.df_pred.category=='Popcorn') & (self.df_pred.effective_start_date>='2022-04-01'), 'category']='Pay 1 WB Theatricals'


class FVDecay():
    def __init__(self, df_in, mode, date_run, grpby=''):
        self.df_in = df_in    
        self.mode = mode
        self.date_run = date_run
        self.grpby = grpby

        if mode=='train':
            self.get_decay_data_title()
            self.get_decay_curve()
            self.return_df()

    def return_df(self):
        return self.df_decay
        
    @staticmethod
    def apply_decay_curve(df_in, df_decay):
        category_list_train = df_decay.category.unique().tolist()
        df_in['decay_category'] = df_in['category']
        df_in.loc[(~df_in['decay_category'].isin(category_list_train)) & (df_in['decay_category']!='Popcorn'), 'decay_category'] = 'Pay1'
        
        
        ## Apply decay to prediction 
        df_pred_decay = df_in.merge(df_decay[['category','days_since_premiere','first_views_pct']], 
                                                left_on=['decay_category'], right_on=['category'], how='left', suffixes=['', '_category'])
#                             .merge(df_popcorn_decay[['category','days_since_premiere','first_views_pct']], 
#                                         on=['category','days_since_premiere'], how='left', suffixes=['', '_popcorn'])
        print(df_pred_decay.columns)
#         df_pred_decay.loc[(df_pred_decay.category=='Popcorn') & (df_pred_decay.first_views_pct.isnull()), 'first_views_pct'] = df_pred_decay['first_views_pct_popcorn']
        
        df_pred_decay[f'first_views_pred_decay'] = df_pred_decay[f'first_views_pred'] * df_pred_decay['first_views_pct']
        df_pred_decay['start_month'] = df_pred_decay['effective_start_date'].dt.to_period('M').dt.to_timestamp()
        df_pred_decay['request_date'] = df_pred_decay['effective_start_date'] + pd.to_timedelta(df_pred_decay.days_since_premiere, unit="D")
        df_pred_decay['first_view_quarter'] = df_pred_decay['request_date'].dt.to_period('Q').dt.to_timestamp()
        df_pred_decay['first_view_month'] = df_pred_decay['request_date'].dt.to_period('M').dt.to_timestamp()

        return df_pred_decay
    
    def get_decay_data_title(self):
        self.key_col = ['title_name','tier','content_category','category','season_number','effective_start_date']
        self.df_in = self.df_in[self.key_col + ['title_id','days_since_premiere','first_views']].sort_values(by=self.key_col+['days_since_premiere'])
        self.df_in['first_views_sum'] = self.df_in.groupby(self.key_col)['first_views'].transform('sum')
        self.df_in['first_views_cumsum'] = self.df_in.groupby(by=self.key_col)['first_views'].cumsum()
        self.df_in['first_views_cumpct'] = self.df_in['first_views_cumsum'] / self.df_in['first_views_sum']
        self.df_in.reset_index(inplace=True)
            
    def get_decay_curve(self):
        self.df_decay = self.df_in[[self.grpby] + ['title_id','days_since_premiere','first_views_cumpct']].sort_values(by=[self.grpby]+['days_since_premiere'])
        self.df_decay = self.df_decay.groupby(by=[self.grpby] + ['days_since_premiere']).agg({'first_views_cumpct':'median','title_id':'nunique'})
        self.df_decay = self.df_decay.rename(columns={'title_id':'title_count_training'})
        self.df_decay = self.df_decay.reset_index()
        self.df_decay['first_views_pct'] = self.df_decay.groupby(self.grpby)['first_views_cumpct'].diff()
        self.df_decay.reset_index(inplace=True)
        
        self.df_decay['days_since_premiere'] = self.df_decay['days_since_premiere'].astype(int)
        self.df_decay.loc[(self.df_decay.days_since_premiere==0),'first_views_pct'] = self.df_decay['first_views_cumpct']



In [None]:
        
#     def adjust_tiers_label(self):
#         self.df_in.loc[(self.df_in.first_views<=75000) & (self.df_in.tier_adj==1), 'tier_adj'] = 2
#         self.df_in.loc[(self.df_in.first_views<=25000) & (self.df_in.tier_adj==2), 'tier_adj'] = 3
    

In [7]:
from io import StringIO

dic_key_path = {'prod':'psi_first_views', 'dev':'psi_first_views', 'backtest':'psi_first_views/dev'}
dic_input_bucket = {'prod':'hbo-ingest-datascience-content', 'dev':"hbo-ingest-datascience-content-dev", 'backtest':"hbo-ingest-datascience-content-dev"}
dic_output_bucket = {'prod':'hbo-outbound-datascience-content', 'dev':"hbo-outbound-datascience-content-dev", 'backtest':"hbo-outbound-datascience-content-dev"}
# input_bucket="hbo-ingest-datascience-content-dev"
# output_bucket="hbo-outbound-datascience-content-dev"

prod_backtest = 'backtest' ## 'prod', 'backtest'
key_path = dic_key_path[prod_backtest]
input_bucket = dic_input_bucket[prod_backtest]
output_bucket = dic_output_bucket[prod_backtest]

if (prod_backtest =='prod') | (prod_backtest =='dev') :
    list_date_train = [(datetime.today()).strftime('%Y-%m-%d')]
elif prod_backtest =='backtest':
    list_date_train = ['2022-06-24']#['2021-01-01', '2021-02-01', '2021-03-01','2021-04-01', '2021-05-01', '2021-06-01','2021-07-01', '2021-08-01', '2021-09-01']

df_pred_list = []
# df_pred_decay_list = [] 
for date_train in list_date_train:
    date_pred = date_train 
   
# #     #### Train 
#     mode = 'train'
#     logger.info(f'TRAINING MODEL FOR {date_train}, Mode:{prod_backtest}')
    
#     ## Read data 
#     df_raw = Utils.read_csv_s3(input_bucket, key_path, f'fv_train_fv_{date_train}.csv')
#     df_imdb = Utils.read_csv_s3(input_bucket, key_path, f'fv_train_imdb_{date_train}.csv')
#     df_talent = Utils.read_csv_s3(input_bucket, key_path, f'fv_train_talent_{date_train}.csv')
#     df_raw = df_raw.assign(schedule_label='beta')
    
#     ## Munge features, get decay curves, resample  
#     df_imdb_munged = IMDBFeatureMunger(df_imdb, mode, date_train).df_imdb_munged
#     df_base_munged_decay, df_base_munged = BaseFeatureMunger(df_raw, df_imdb_munged, df_talent, mode, date_train).return_df()
#     df_decay_category = FVDecay(df_base_munged_decay, mode, date_train, 'category').return_df()
#     df_in_train = PreProcessor(df_base_munged, mode, date_train).return_df()  
#     Utils.to_csv_s3(df_decay_category, input_bucket, key_path, f'fv_decay_category_{date_train}.csv')
#     Utils.to_csv_s3(df_in_train, input_bucket, key_path, f'fv_train_munged_{date_train}.csv')
#     Utils.to_csv_s3(df_in_train, 'hbo-ingest-datascience-content-dev', key_path, f'fv_train_munged_{date_train}.csv')
    
#     df_in_train.assign(upload_date_time=datetime.now().strftime("%Y-%m-%d %H:%M"))
#     Utils.to_csv_s3(df_in_train, output_bucket, key_path, f'fv_train_munged.csv')
    
#     # Train xgb
#     XGB(df_in_train, mode, date_train)
    
    #### Predict 
    mode = 'predict'
    logger.info(f'PREDICTING FOR {date_pred}, Mode:{prod_backtest}')
    
    ## Read data  
    df_imdb_pred = Utils.read_csv_s3(input_bucket, key_path, f'fv_pred_{date_pred}.csv')
    df_imdb_pred = df_imdb_pred.assign(first_views=0, title_id=0)
    df_raw_pred = df_imdb_pred.copy()
    df_talent_pred = Utils.read_csv_s3(input_bucket, key_path, f'fv_pred_talent_{date_pred}.csv')
    

    df_raw_pred.loc[df_raw_pred.title_name.str.contains('The Back Nine'), 'tier']=2
    ## Munge features and resample 
    df_imdb_munged_pred = IMDBFeatureMunger(df_imdb_pred, mode, date_pred).return_df()
    df_base_munged_pred = BaseFeatureMunger(df_raw_pred, df_imdb_munged_pred, df_talent_pred, mode, date_pred).return_df()
    df_in_pred = PreProcessor(df_base_munged_pred, mode, date_pred).return_df()      
    
    
    ## Data adjustments 

    df_in_pred.loc[df_in_pred.title_name.str.contains('The Back Nine'), 'tier']=2
    df_in_pred.loc[df_in_pred.title_name.str.contains('The Back Nine'), 'tier_adj']=2    
    df_in_pred.loc[df_in_pred.title_name.str.contains('Love & Death'), 'page_views_s']=100000
    df_in_pred.loc[df_in_pred.title_name.str.contains('Degrassi'), 'category']='Unscripted Series'
#     df_in_pred.loc[df_in_pred.title_name.str.contains('Meg'), ['prequel_count']]=0
    
    ## Predict and apply decay 
    df_out_pred = XGB(df_in_pred, mode, '2022-06-14').return_df()
#     df_decay_category = Utils.read_csv_s3(input_bucket, key_path, f'fv_decay_category_{date_train}.csv') 
#     df_pred_decay = FVDecay.apply_decay_curve(df_out_pred, df_decay_category)
    df_pred_list.append(df_out_pred)
#     df_pred_decay_list.append(df_pred_decay)

df_pred = pd.concat(df_pred_list)
df_pred.loc[df_pred.title_name.str.contains('Degrassi'), 'category']='Kids & Family'
df_pred.loc[df_pred.title_name.str.contains('Shazam! Fury Of The Gods')& (df_pred.effective_start_date=='2023-02-04'), 'first_views_pred']=224421.110
df_pred.loc[(df_pred.title_name.str.contains('And Just Like That')) & (df_pred.effective_start_date=='2023-06-08'), 'first_views_pred']=368094.280


# df_pred_decay = pd.concat(df_pred_decay_list)
# df_pred_decay = df_pred_decay.assign(upload_date_time=datetime.now().strftime("%Y-%m-%d %H:%M"))


df_pred = df_pred[['title_name','season_number','category', 
           'tier', 'effective_start_date','first_views_pred',  'content_category', 'prequel_count', 'prequel_featured_count','prequel_featured_count_s',
            'page_views','page_views_s',
           'tier_adj', 'pred_date','schedule_label']]

df_pred = df_pred[(df_pred.effective_start_date>='2022-06-01') & (df_pred.schedule_label=='alpha')].sort_values(by=['effective_start_date','tier'])

# ## save daily forecast to local file, identified by upload_date_time 
# Utils.to_csv_s3(df_pred, input_bucket, key_path, f'fv_pred_munged_{date_train}_adhoc.csv')
# Utils.to_csv_s3(df_pred_decay, input_bucket, key_path, f'psi_daily_xgb_forecast_{date_train}.csv')

# # Make duplicates in dev bucket 
# Utils.to_csv_s3(df_pred, 'hbo-ingest-datascience-content-dev', key_path, f'fv_pred_munged_{date_train}.csv')
# Utils.to_csv_s3(df_pred_decay, 'hbo-ingest-datascience-content-dev', key_path, f'psi_daily_xgb_forecast_{date_train}.csv')

# df_pred.assign(upload_date_time=datetime.now().strftime("%Y-%m-%d %H:%M"))
# Utils.to_csv_s3(df_pred, output_bucket, key_path, f'fv_pred_munged.csv')

INFO:root:PREDICTING FOR 2022-06-24, Mode:backtest
INFO:root:Reading from hbo-ingest-datascience-content-dev/psi_first_views/dev/fv_pred_2022-06-24.csv
INFO:root:Reading from hbo-ingest-datascience-content-dev/psi_first_views/dev/fv_pred_talent_2022-06-24.csv
INFO:root:base_data shape: (15399, 20)
INFO:root:base_data null: Unnamed: 0                         0
title_name                         0
imdb_imdb_series_id              571
season_number                      0
tier                               0
category                           0
content_category                 254
effective_start_date               0
schedule_label                     0
imdb_title_name                  571
n_votes                          571
reference_type                   744
reference_title                  744
reference_title_id               744
reference_title_type             744
reference_n_votes                744
reference_reference_type        2767
reference_reference_title_id    2767
first_vie

In [108]:
df_imdb_munged_pred.describe()

Unnamed: 0,season_number,tier,prequel_count,prequel_featured_count
count,121.0,121.0,121.0,121.0
mean,5.256198,1.966942,5.710744,36.053768
std,10.728101,0.874966,11.453848,61.25363
min,0.0,0.0,0.0,1.0
25%,1.0,1.0,1.0,2.0
50%,2.0,2.0,2.0,10.0
75%,4.0,3.0,4.0,38.0
max,55.0,3.0,61.0,374.0


In [107]:
df_imdb_munged_pred[(df_imdb_munged_pred.category=='Popcorn') & (df_imdb_munged_pred.tier==2)]

Unnamed: 0,title_name,season_number,content_category,category,tier,effective_start_date,prequel_count,prequel_featured_count
25,Dune: Part Two,0,movies,Popcorn,2,2023-12-04,2.0,38.0
33,Fantastic Beasts: The Secrets of Dumbledore,0,movies,Popcorn,2,2022-05-30,4.0,32.0
106,The Meg 2,0,movies,Popcorn,2,2023-09-18,2.0,30.0
120,Wonka,0,movies,Popcorn,2,2024-01-29,3.0,58.666667


In [156]:
date_train = '2022-06-14'
df_old = Utils.read_csv_s3('hbo-ingest-datascience-content-dev', 'psi_first_views', f'fv_pred_munged_{date_train}.csv')
df_old = df_old[df_old.schedule_label=='alpha']
df_old = df_old[(df_old.effective_start_date<'2024-01-01') & (df_old.effective_start_date>='2022-07-01')]

df = df_pred[df_pred.schedule_label=='alpha'].copy()
df = df[(df.effective_start_date<'2024-01-01') & (df.effective_start_date>='2022-07-01')]
df = df.merge(df_old[['title_name','category','season_number','first_views_pred','tier']], on=['title_name','category','season_number',], suffixes=('', '_old'), how='outer')
display(df.isnull().sum())
df['diff'] = (df['first_views_pred_old'] - df['first_views_pred']).abs()
df.loc[df.title_name.str.contains('Degrassi'), 'tier_old'] = 2
df.loc[df.title_name.str.contains('Degrassi'), 'first_views_pred_old'] = 454

display(df[(df['diff']>0.1) & (df.first_views_pred_old.notnull()) & (df.first_views_pred.notnull()) & ~(df.title_name=='Love & Death') | (df['title_name'].str.contains('Degrassi'))][['title_name','effective_start_date','tier_old','tier','category', 'first_views_pred_old','first_views_pred']].sort_values(by='effective_start_date')[:-1])

# display(df[(df.schedule_label=='alpha')  & ((df.title_name.str.contains('Degrassi')) | (df.title_name.str.contains('Barbie'))| (df.title_name.str.contains('Dune')))].sort_values(by=['effective_start_date']))


INFO:root:Reading from hbo-ingest-datascience-content-dev/psi_first_views/fv_pred_munged_2022-06-14.csv


title_name                   0
season_number                0
category                     0
tier                        81
effective_start_date        81
first_views_pred            81
content_category            81
prequel_count               81
prequel_featured_count      81
prequel_featured_count_s    81
page_views                  81
page_views_s                81
tier_adj                    81
pred_date                   81
schedule_label              81
first_views_pred_old        73
tier_old                    73
dtype: int64

Unnamed: 0,title_name,effective_start_date,tier_old,tier,category,first_views_pred_old,first_views_pred
28,Elvis,2022-09-02,1.0,2.0,Pay 1 WB Theatricals,129818.35,42327.269531
52,Salem's Lot,2022-10-24,1.0,2.0,Pay 1 WB Theatricals,148872.1,42327.269531
62,Don't Worry Darling,2022-11-07,1.0,2.0,Pay 1 WB Theatricals,129818.35,42327.269531
218,Barbie,2023-09-04,1.0,2.0,Pay 1 WB Theatricals,129818.35,42327.269531
223,The Meg 2,2023-09-18,1.0,2.0,Pay 1 WB Theatricals,154323.14,45502.605469
224,Degrassi (New),2023-09-21,2.0,2.0,Kids & Family,454.0,20461.474609
240,The Nun 2,2023-10-23,1.0,2.0,Pay 1 WB Theatricals,148872.1,42327.269531
243,The Back Nine,2023-11-02,1.0,2.0,Scripted Features,174372.77,30147.78125
248,Wise Guys,2023-11-09,1.0,2.0,Scripted Features,174372.77,30147.78125
259,Untitled Holiday Movie #1,2023-11-23,1.0,2.0,Scripted Features,174372.77,30147.78125


In [99]:
df_pred[df_pred.title_name.str.contains('Degrassi')]

Unnamed: 0,title_name,season_number,tier,category,effective_start_date,first_views_pred,content_category,prequel_count,prequel_featured_count,prequel_featured_count_s,page_views,page_views_s,tier_adj,pred_date,schedule_label
146,Degrassi (New),1,2,Kids & Family,2023-09-21,20461.474609,series,-1.0,-2.0,-2.0,-2,-2,2,2022-06-14,alpha


In [65]:
display(df[(df['diff']>0.1) & ~(df.title_name=='Love & Death') | (df['title_name'].str.contains('Degrassi'))].sort_values(by='effective_start_date'))
a

Unnamed: 0,title_name,season_number,tier,category,effective_start_date,first_views_pred,content_category,prequel_count,prequel_featured_count,prequel_featured_count_s,page_views,page_views_s,tier_adj,pred_date,schedule_label,first_views_pred_old,tier_old,diff
28,Elvis,0,2.0,Pay 1 WB Theatricals,2022-09-02,123419.429688,movies,0.0,0.0,-1.0,100000.0,-1.0,2.0,2022-06-14,alpha,129818.35,1.0,6398.920313
52,Salem's Lot,0,2.0,Pay 1 WB Theatricals,2022-10-24,42327.269531,movies,0.0,0.0,-1.0,1000.0,-1.0,2.0,2022-06-14,alpha,148872.1,1.0,106544.830469
62,Don't Worry Darling,0,2.0,Pay 1 WB Theatricals,2022-11-07,123419.429688,movies,0.0,0.0,-1.0,100000.0,-1.0,2.0,2022-06-14,alpha,129818.35,1.0,6398.920313
218,Barbie,0,2.0,Pay 1 WB Theatricals,2023-09-04,123419.429688,movies,0.0,0.0,-1.0,100000.0,-1.0,2.0,2022-06-14,alpha,129818.35,1.0,6398.920313
223,The Meg 2,0,2.0,Pay 1 WB Theatricals,2023-09-18,461561.1875,movies,2.0,20.0,-1.0,100000.0,-1.0,2.0,2022-06-14,alpha,154323.14,1.0,307238.0475
240,The Nun 2,0,2.0,Pay 1 WB Theatricals,2023-10-23,42327.269531,movies,0.0,0.0,-1.0,1000.0,-1.0,2.0,2022-06-14,alpha,148872.1,1.0,106544.830469
243,The Back Nine,0,2.0,Scripted Features,2023-11-02,30147.78125,movies,0.0,-2.0,-2.0,-2.0,-2.0,2.0,2022-06-14,alpha,174372.77,1.0,144224.98875
248,Wise Guys,0,2.0,Scripted Features,2023-11-09,30147.78125,movies,0.0,-2.0,-2.0,-2.0,-2.0,2.0,2022-06-14,alpha,174372.77,1.0,144224.98875
259,Untitled Holiday Movie #1,0,2.0,Scripted Features,2023-11-23,30147.78125,movies,0.0,-2.0,-2.0,-2.0,-2.0,2.0,2022-06-14,alpha,174372.77,1.0,144224.98875
261,Untitled Holiday Movie #2,0,2.0,Scripted Features,2023-11-30,30147.78125,movies,0.0,-2.0,-2.0,-2.0,-2.0,2.0,2022-06-14,alpha,174372.77,1.0,144224.98875


In [29]:
df_old[df_old.title_name.str.contains('Santa')]

Unnamed: 0.1,Unnamed: 0,title_name,season_number,effective_start_date,tier_adj,prequel_featured_count,prequel_featured_count_s,schedule_label,prequel_count,page_views_s,tier,content_category,page_views,category,first_views_log_pred,first_views_pred,pred_date,decay_category
936,936,Santa Camp,0,2022-11-17,3,-2.0,-2.0,alpha,0.0,-2,3,movies,-2,Documentary Features,7.450268,1720.3237,2022-06-14,Documentary Features
938,938,Santa's Stolen Jingle Bells,0,2022-12-15,2,-2.0,-2.0,alpha,0.0,-2,2,movies,-2,Scripted Features,10.313867,30147.781,2022-06-14,Scripted Features
940,940,"Santa, Inc.",1,2021-12-02,2,-2.0,-2.0,alpha,-1.0,-2,2,series,-2,Scripted Comedy Series,10.134343,25193.545,2022-06-14,Scripted Comedy Series


In [23]:
display(df[df.tier_old.isnull()].head(30))

Unnamed: 0,title_name,season_number,tier,category,effective_start_date,first_views_pred,content_category,prequel_count,prequel_featured_count,prequel_featured_count_s,page_views,page_views_s,tier_adj,pred_date,schedule_label,first_views_pred_old,tier_old,diff
17,The Anarchists,0,2.0,Docu-Series,2022-07-10,25193.544922,series,-1.0,-2.0,-2.0,-2.0,-2.0,2.0,2022-06-14,alpha,0.0,,25193.544922
23,The Rehearsal,1,3.0,Unscripted Series,2022-07-15,2947.108398,series,-1.0,-2.0,-2.0,-2.0,-2.0,3.0,2022-06-14,alpha,0.0,,2947.108398
24,Nikki Glaser: Good Clean Filth,0,3.0,Specials,2022-07-16,1585.492554,special,-1.0,-2.0,-2.0,-2.0,-2.0,3.0,2022-06-14,alpha,0.0,,1585.492554
27,The Last Movie Stars,0,3.0,Docu-Series,2022-07-21,2666.045166,series,-1.0,-2.0,-2.0,-2.0,-2.0,3.0,2022-06-14,alpha,0.0,,2666.045166
50,Dionne Warwick/TBD,0,3.0,Documentary Features,2022-09-15,2228.288574,series,-1.0,-2.0,-2.0,-2.0,-2.0,3.0,2022-06-14,alpha,0.0,,2228.288574
54,Barbarians,0,2.0,Pay1,2022-09-27,10103.203125,movies,0.0,-2.0,-2.0,-2.0,-2.0,3.0,2022-06-14,alpha,0.0,,10103.203125
55,Hostages,0,3.0,Docu-Series,2022-09-28,2666.045166,series,-1.0,-2.0,-2.0,-2.0,-2.0,3.0,2022-06-14,alpha,0.0,,2666.045166
62,38 At The Garden,0,3.0,Documentary Features,2022-10-11,2228.288574,series,-1.0,-2.0,-2.0,-2.0,-2.0,3.0,2022-06-14,alpha,0.0,,2228.288574
66,Yvonne Orji Special,0,3.0,Specials,2022-10-15,1585.492554,special,-1.0,-2.0,-2.0,-2.0,-2.0,3.0,2022-06-14,alpha,0.0,,1585.492554
82,Ramy Youssef Special,0,3.0,Specials,2022-11-12,1585.492554,special,-1.0,-2.0,-2.0,-2.0,-2.0,3.0,2022-06-14,alpha,0.0,,1585.492554


In [18]:
df[df.title_name.str.contains('Girls')]

Unnamed: 0,title_name,season_number,tier,category,effective_start_date,first_views_pred,content_category,prequel_count,prequel_featured_count,prequel_featured_count_s,page_views,page_views_s,tier_adj,pred_date,schedule_label,first_views_pred_old,tier_old,diff
87,The Sex Lives of College Girls,2,1,Scripted Comedy Series,2022-11-17,95436.648438,series,-1.0,-1.0,0.0,-1,100000,1,2022-06-14,alpha,95436.65,1.0,0.001562
206,The Girls on the Bus,1,2,Scripted Drama Series,2023-06-15,26966.986328,series,-1.0,-2.0,-2.0,-2,-2,2,2022-06-14,alpha,,,
272,The Sex Lives of College Girls,3,1,Scripted Comedy Series,2023-11-16,95436.648438,series,-1.0,-1.0,0.0,-1,100000,1,2022-06-14,alpha,95436.65,1.0,0.001562


In [7]:
df_pred[(df_pred.effective_start_date>='2022-06-01') & (df_pred.schedule_label=='alpha')].sort_values(by=['effective_start_date','tier'])

Unnamed: 0,title_name,effective_start_date,season_number,tier,content_category,category,prequel_count,prequel_featured_count,prequel_featured_count_s,page_views,page_views_s,tier_adj,first_views_pred,pred_date,schedule_label
425,Irma Vep,2022-06-06,0,3,series,Scripted Drama Series,-1.0,-2.0,-2.0,-2,-2,3,7015.209473,2022-06-14,alpha
782,The Janes,2022-06-08,0,3,movies,Documentary Features,0.0,-2.0,-2.0,-2,-2,3,1720.323730,2022-06-14,alpha
624,Roadrunner: A Film About Anthony Bourdain,2022-06-09,0,2,movies,Documentary Features,0.0,-2.0,-2.0,-2,-2,2,6854.866211,2022-06-14,alpha
689,Summer Camp Island,2022-06-09,6,3,series,Kids & Family,-1.0,-2.0,-2.0,-2,-2,3,702.428772,2022-06-14,alpha
738,The Card Counter,2022-06-10,0,3,movies,Pay1,0.0,-2.0,-2.0,-2,-2,3,10103.203125,2022-06-14,alpha
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
344,HBO 2024 TBD Doc Feature 31,2024-12-15,0,3,movies,Documentary Features,0.0,-2.0,-2.0,-2,-2,3,1720.323730,2022-06-14,alpha
398,Holiday TBD 3,2024-12-19,0,2,movies,Scripted Features,0.0,-2.0,-2.0,-2,-2,2,30147.781250,2022-06-14,alpha
120,Co-Pro Series #2,2024-12-19,1,3,series,Kids & Family,-1.0,-2.0,-2.0,-2,-2,3,702.428772,2022-06-14,alpha
355,HBO 2024 TBD Docu-Series 11,2024-12-19,0,3,series,Docu-Series,-1.0,-2.0,-2.0,-2,-2,3,2666.045166,2022-06-14,alpha


In [189]:
display(df_in_train[df_in_train['tier']==2].groupby(by=['tier','category']).median())

Unnamed: 0_level_0,Unnamed: 1_level_0,season_number,prequel_count,prequel_featured_count,page_views,first_views,first_views_log,tier_adj,prequel_count_s,prequel_featured_count_s,page_views_s
tier,category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2,Docu-Series,1,-1.0,-2.0,-2,34646,10.452938,2,0.0,-2.0,-2
2,Documentary Features,0,0.0,-2.0,-2,5217,8.559678,2,-1.0,-2.0,-2
2,International,1,-1.0,-2.0,-2,8841,9.087155,2,0.0,-2.0,-2
2,Kids & Family,1,-1.0,-2.0,-2,1531,7.333676,2,0.0,-2.0,-2
2,Pay1,0,0.0,-2.0,-2,14812,9.603193,3,-1.0,-2.0,-2
2,Scripted Comedy Series,1,-1.0,-2.0,-2,44513,10.703537,2,0.0,-2.0,-2
2,Scripted Drama Series,1,-1.0,-2.0,-2,38471,10.55766,2,0.0,-2.0,-2
2,Scripted Features,0,0.0,-2.0,-2,59759,10.998075,2,-1.0,-2.0,-2
2,Specials,0,-1.0,-2.0,-2,112661,11.632139,2,0.0,-2.0,-2
2,Unscripted Series,9,-1.0,-2.0,-2,18655,9.833869,2,5.0,-2.0,-2


In [181]:
df.groupby(by=['tier','category']).median()

Unnamed: 0_level_0,Unnamed: 1_level_0,tier_adj,prequel_featured_count_s,season_number,prequel_count,page_views,prequel_featured_count,page_views_s,first_views_log_pred,first_views_pred,first_views_pred_old,tier_old,diff
tier,category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,Scripted Drama Series,0.0,40.0,1.0,-1.0,-1.0,-1.0,200000.0,13.193606,536921.0625,536921.06,0.0,0.0025
1,Pay 1 WB Theatricals,1.0,-1.0,0.0,1.0,100000.0,20.0,-1.0,12.019143,165900.421875,165900.42,1.0,0.001875
1,Pay1,1.0,-1.0,0.0,0.0,1000.0,0.0,-1.0,11.910843,148872.09375,148872.1,1.0,0.00625
1,Scripted Comedy Series,1.0,0.0,2.0,-1.0,-1.0,-1.0,100000.0,11.466218,95436.648438,95436.65,1.0,0.001562
1,Scripted Drama Series,1.0,3.5,2.0,-1.0,-1.0,-1.0,100000.0,11.529366,101893.046875,152880.56,1.0,0.004375
1,Scripted Features,1.0,-1.0,0.0,0.0,1000.0,0.0,-1.0,12.068951,174372.765625,174372.77,1.0,0.004375
2,Docu-Series,2.0,-2.0,0.5,-1.0,-2.0,-2.0,-2.0,10.134343,25193.544922,25193.545,2.0,7.8e-05
2,Documentary Features,2.0,-2.0,0.0,0.0,-2.0,-2.0,-2.0,8.832714,6854.866211,6854.866,2.0,0.000211
2,Kids & Family,2.0,-2.0,3.5,-1.0,-2.0,-2.0,-2.0,6.117097,453.546082,453.54608,2.0,2e-06
2,Pay 1 WB Theatricals,2.0,-2.0,0.0,0.0,-2.0,-2.0,-2.0,10.097023,24270.646484,148872.1,1.0,124601.453516


In [14]:
def get_agg_first_views(df_in, agg_col, grpby_title, first_view_date_col, first_view_col):
    grpby_title_agg = grpby_title + [agg_col]
    df_in[first_view_date_col] = pd.to_datetime(df_in[first_view_date_col])
    df_in['first_view_quarter'] = df_in[first_view_date_col].dt.to_period("Q").dt.to_timestamp()   
    df_in['first_view_month'] = df_in[first_view_date_col].dt.to_period('M').dt.to_timestamp()
    df_in = df_in[grpby_title_agg + first_view_col].groupby(by=grpby_title_agg).sum().reset_index()
    return df_in

dic_rename = {'tier_adj':'tier_adjusted', 'effective_start_date':'premiere_date', 'pred_date':'model_train_date'} 
out_col = ['title_name', 'first_view_month', 'premiere_date', 'season_number',
           'tier','content_category', 'category', 'prequel_count', 'prequel_featured_count','prequel_featured_count_s',
            'page_views','page_views_s',
           'tier_adjusted', 'first_views_pred_decay', 'model_train_date','schedule_label']

dtype = {'season_number':int}

if  (prod_backtest =='prod') | (prod_backtest =='dev'):
    agg_var = 'first_view_month'    
    df_pred_future_out = df_pred_decay.rename(columns=dic_rename)
    df_pred_future_out = df_pred_future_out[df_pred_future_out.premiere_date>=date_pred]
    df_pred_future_out = df_pred_future_out[out_col].groupby(by=out_col[:-4]+['model_train_date', 'schedule_label']).sum().reset_index()
    df_pred_future_out = df_pred_future_out[out_col].set_index('title_name')
    df_pred_future_out['season_number'] = df_pred_future_out['season_number'].astype(int)
    Utils.to_csv_s3(df_pred_future_out, output_bucket, key_path, f'psi_monthly_xgb_forecast.csv')
    
display(df_pred_future_out.head(2))
display(df_pred_future_out[(df_pred_future_out.tier<=1)].groupby(by=['title_name']+out_col[2:-4]+['schedule_label']).sum().reset_index().sort_values(by=['premiere_date']))

INFO:root:Saved to hbo-outbound-datascience-content-dev/psi_first_views/psi_monthly_xgb_forecast.csv


Unnamed: 0_level_0,first_view_month,premiere_date,season_number,tier,content_category,category,prequel_count,prequel_featured_count,prequel_featured_count_s,page_views,page_views_s,tier_adjusted,first_views_pred_decay,model_train_date,schedule_label
title_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
#BAMARush,2023-01-01,2023-01-19,0,3,movies,Documentary Features,0.0,-2.0,-2.0,-2,-2,39,930.2851,2022-06-20,alpha
#BAMARush,2023-01-01,2023-01-19,0,3,movies,Documentary Features,0.0,-2.0,-2.0,-2,-2,39,930.2851,2022-06-20,beta


Unnamed: 0,title_name,premiere_date,season_number,tier,content_category,category,prequel_count,prequel_featured_count,prequel_featured_count_s,page_views,page_views_s,schedule_label,tier_adjusted,first_views_pred_decay
115,Westworld,2022-06-26,4,1,series,Scripted Drama Series,-1.0,-1.0,40.0,-1,200000,beta,91,160158.484375
114,Westworld,2022-06-26,4,1,series,Scripted Drama Series,-1.0,-1.0,40.0,-1,200000,alpha,91,160158.484375
61,Pretty Little Liars: Original Sin,2022-07-07,1,1,series,Scripted Drama Series,-1.0,-1.0,40.0,-1,100000,alpha,91,89476.687500
62,Pretty Little Liars: Original Sin,2022-07-07,1,1,series,Scripted Drama Series,-1.0,-1.0,40.0,-1,100000,beta,91,89476.687500
35,Elvis,2022-08-08,0,1,movies,Pay 1 WB Theatricals,0.0,0.0,-1.0,100000,-1,alpha,91,141247.062500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,The Last Of Us,2024-10-27,2,0,series,Scripted Drama Series,-1.0,-1.0,40.0,-1,200000,beta,0,412245.656250
97,The Sex Lives of College Girls,2024-11-14,4,1,series,Scripted Comedy Series,-1.0,-1.0,0.0,-1,100000,beta,91,69348.523438
106,Untitled DC Event Film #4,2024-11-18,0,1,movies,Pay 1 WB Theatricals,0.0,0.0,-1.0,1000,-1,beta,91,141247.062500
63,Pretty Little Liars: Original Sin,2024-11-28,3,1,series,Scripted Drama Series,-1.0,-1.0,0.0,-1,100000,beta,91,69348.523438


In [19]:
import snowflake.connector
from abc import ABCMeta, abstractmethod
import boto3
import json
## Credentials
SF_CREDS = 'datascience-max-dev-sagemaker-notebooks'

## Snowflake connection 
conn=SnowflakeConnector(SSMPSCredentials(SF_CREDS))
ctx=conn.connect("MAX_PROD","DATASCIENCE_STAGE")
cur = ctx.cursor()


class Credentials(metaclass=ABCMeta):
    pass
    
    
class SSMPSCredentials(Credentials):
    def __init__(self, secretid: str):
        self._secretid = secretid
        self._secrets = {}
        
    def get_keys(self):
        """
        credential fetching 
        """
        _aws_sm_args = {'service_name': 'secretsmanager', 'region_name': 'us-east-1'}
        secrets_client = boto3.client(**_aws_sm_args)
        get_secret_value_response = secrets_client.get_secret_value(SecretId=self._secretid)
        return get_secret_value_response
    
    
class BaseConnector(metaclass=ABCMeta):
    @abstractmethod
    def connect(self):
        raise NotImplementedError
        

class SnowflakeConnector(BaseConnector):
    def __init__(self, credentials: Credentials):
        keys = credentials.get_keys()
        self._secrets = json.loads(keys.get('SecretString', "{}"))

    def connect(self, dbname: str, schema: str = 'DEFAULT'):
        ctx = snowflake.connector.connect(
            user=self._secrets['login_name'],
            password=self._secrets['login_password'],
            account=self._secrets['account'],
            warehouse=self._secrets['warehouse'],
            database=dbname,
            schema=schema
        )

        return ctx


def run_query(query, dbname, schema):
    SF_CREDS = 'datascience-max-dev-sagemaker-notebooks'

    conn=SnowflakeConnector(SSMPSCredentials(SF_CREDS))
    ctx=conn.connect(dbname,schema)
    cursor = ctx.cursor()
    cursor.execute(query)
    df = pd.DataFrame(cursor.fetchall(), columns = [desc[0] for desc in cursor.description])
    df.columns= df.columns.str.lower()
    return df    


def cvdf_to_snowflake(df, table_name):
    stage = '@HBO_OUTBOUND_DATASCIENCE_CONTENT_DEV'
    output_bucket = "hbo-outbound-datascience-content-dev"
    dbname, schema = 'MAX_DEV', 'WORKSPACE'
    
    csv_buffer = io.StringIO()
    df.to_csv(csv_buffer, index = False)
    content = csv_buffer.getvalue()
    client = boto3.client('s3')
    client.put_object(Bucket=output_bucket, Key=table_name, Body=content)

    print ('Create Table: ' + table_name)
 
    run_query('''
    create or replace table {table_name} (
    title_name varchar,
    effective_start_date varchar,
    season_number int, 
    tier varchar,
    content_category  varchar,
    category varchar,
    prequel_count int,
    prequel_featured_count int,
    prequel_featured_count_s int,
    page_views int,
    page_views_s int,
    tier_adjusted int,
    first_views_pred float,
    model_pred_date varchar,
    schedule_label varchar
    )
    '''.format(table_name = table_name), dbname, schema)

    print ('Begin Uploading')
    run_query('''
    insert into max_dev.workspace.{table_name}

    select 
              $1
            , $2
            , $3
            , $4
            , $5
            , $6
            , $7
            , $8
            , $9
            , $10
            , $11
            , $12
            , $13
            , $14
            , $15
    from {stage}/psi_first_views/dev/fv_pred_munged_2022-06-23_adhoc.csv

     (FILE_FORMAT => csv_v2)

    '''.format(stage = stage, table_name = table_name,
              file_name = table_name+'.csv')
            , dbname, schema)

    print ('Finish Uploading')   
    
    
import io
from abc import ABCMeta, abstractmethod
# output_bucket = 'hbo-outbound-datascience-content-dev'
# key_path = 'psi_first_views/dev'

# Utils.read_csv_s3(output_bucket, key_path, f'fv_pred_munged_2022-06-23_adhoc.csv')
df_pred_future_out = df_pred.reset_index()
col =  ['title_name' ,
    'effective_start_date' ,
    'season_number' , 
    'tier' ,
    'content_category'  ,
    'category' ,
    'prequel_count' ,
    'prequel_featured_count' ,
    'prequel_featured_count_s' ,
    'page_views' ,
    'page_views_s' ,
    'tier_adj' ,
    'first_views_pred' ,
    'pred_date' ,
    'schedule_label' ]
df_pred_future_out = df_pred_future_out[col]
cvdf_to_snowflake(df_pred_future_out, 'firstview_postgl_temp')

INFO:snowflake.connector.connection:Snowflake Connector for Python Version: 2.7.4, Python Version: 3.6.13, Platform: Linux-4.14.252-131.483.amzn1.x86_64-x86_64-with-glibc2.9
INFO:snowflake.connector.connection:This connection is in OCSP Fail Open Mode. TLS Certificates would be checked for validity and revocation status. Any other Certificate Revocation related exceptions or OCSP Responder failures would be disregarded in favor of connectivity.
INFO:snowflake.connector.connection:Snowflake Connector for Python Version: 2.7.4, Python Version: 3.6.13, Platform: Linux-4.14.252-131.483.amzn1.x86_64-x86_64-with-glibc2.9
INFO:snowflake.connector.connection:This connection is in OCSP Fail Open Mode. TLS Certificates would be checked for validity and revocation status. Any other Certificate Revocation related exceptions or OCSP Responder failures would be disregarded in favor of connectivity.


Create Table: firstview_postgl_temp


INFO:snowflake.connector.cursor:query: [create or replace table firstview_postgl_temp ( title_name varchar, effective_st...]
INFO:snowflake.connector.cursor:query execution done
INFO:snowflake.connector.connection:Snowflake Connector for Python Version: 2.7.4, Python Version: 3.6.13, Platform: Linux-4.14.252-131.483.amzn1.x86_64-x86_64-with-glibc2.9
INFO:snowflake.connector.connection:This connection is in OCSP Fail Open Mode. TLS Certificates would be checked for validity and revocation status. Any other Certificate Revocation related exceptions or OCSP Responder failures would be disregarded in favor of connectivity.


Begin Uploading


INFO:snowflake.connector.cursor:query: [insert into max_dev.workspace.firstview_postgl_temp  select $1 , $2 , $3 , $4 , ...]
INFO:snowflake.connector.cursor:query execution done


ProgrammingError: 100038 (22018): 01a54154-0504-bfd4-00f8-41026224003f: Numeric value '2023-01-19' is not recognized

In [21]:
df_pred_future_out[df_pred_future_out.effective_start_date=='2023-01-19']

Unnamed: 0,title_name,effective_start_date,season_number,tier,content_category,category,prequel_count,prequel_featured_count,prequel_featured_count_s,page_views,page_views_s,tier_adj,first_views_pred,pred_date,schedule_label
122,#BAMARush,2023-01-19,0,3,movies,Documentary Features,0.0,-2.0,-2.0,-2,-2,3,1720.32373,2022-06-14,alpha
123,HBO 2023 TBD Doc Feature 1,2023-01-19,0,3,movies,Documentary Features,0.0,-2.0,-2.0,-2,-2,3,1720.32373,2022-06-14,alpha
124,I Hate Suzie,2023-01-19,2,3,series,International,-1.0,-2.0,-2.0,-2,-2,3,794.719666,2022-06-14,alpha


In [22]:
df_pred_future_out.isnull().sum()

title_name                  0
effective_start_date        0
season_number               0
tier                        0
content_category            0
category                    0
prequel_count               0
prequel_featured_count      0
prequel_featured_count_s    0
page_views                  0
page_views_s                0
tier_adj                    0
first_views_pred            0
pred_date                   0
schedule_label              0
dtype: int64