In [22]:
import os
import sys
import boto3
import pandas as pd
import numpy as np
import pickle
from datetime import date, datetime, timedelta
import logging 
sys.path.append('/home/ec2-user/SageMaker/jupyter-notebooks/')
from utils import * 
from category_encoders import OneHotEncoder
import xgboost as xgb
import lightgbm as lgbm

logger = logging.getLogger()
logging.basicConfig(level=logging.INFO)


input_bucket="hbo-ingest-datascience-content-dev"
output_bucket="hbo-outbound-datascience-content-dev"



In [26]:
from io import StringIO
class Utils():
    @staticmethod
    def to_csv_s3(content, bucket, filename):
        client = boto3.client('s3')
        key = os.path.join('psi_first_views', filename)
        csv_buffer = StringIO()
        content.to_csv(csv_buffer)
        client.put_object(Bucket=bucket, Key=key, Body=csv_buffer.getvalue())
        logger.info(f'Saved to {key}')
    
    @staticmethod
    def to_pkl_s3(content, filename):
        client = boto3.client('s3')
        bucket = input_bucket
        key = os.path.join('psi_first_views', filename)
        obj = pickle.dumps(content)
        client.put_object(Bucket=bucket, Key=key, Body=obj)
        logger.info(f'Saved model to {os.path.join(bucket, key)}')
        logger.info(f'Saved to {key}')

    @staticmethod
    def read_csv_s3(filename):
        client = boto3.client('s3')
        bucket = input_bucket
        key = os.path.join('psi_first_views', filename)
        obj = client.get_object(Bucket=bucket, Key=key)
        df = pd.read_csv(obj['Body'])
        logger.info(f'Read from {key}')
        return df
        
    @staticmethod
    def read_pkl_s3(filename):
        client = boto3.client('s3')
        bucket = input_bucket
        key = os.path.join('psi_first_views', filename)
        obj = client.get_object(Bucket=bucket, Key=key)
        body = obj['Body'].read()
        model = pickle.loads(body)
        logger.info(f'Read from {key}')
        return model


class BaseFeatureMunger():
    def __init__(self, df_in, df_imdb_munged, mode, date_run):
        self.df_in = df_in
        self.df_imdb_munged = df_imdb_munged
        self.mode = mode
        self.date_run = date_run
        self.clean_df()
        self.fill_missing_data()
        if mode=='train':
            self.filter_df()
        self.get_datetime_features()
        self.aggregate_df()
        self.merge_imdb_features()
        self.get_first_views()
        self.return_df()
        
    def return_df(self):
        col_base = ['title_id', 'title_name', 'season_number',
       'content_category', 'content_source', 'category',
       'tier', 'effective_start_date', 'request_date', 'first_views',
       'days_since_premiere', 'days_on_platform', 'start_year']
        col_title = ['tier','content_category','category','title_name', 'title_id','season_number',
                'effective_start_date', 'prequel_count', 'prequel_featured_count','first_views','first_views_log']
        if self.mode=='train':
            return self.df_in[col_base], self.df_in_title[col_title]
        elif self.mode=='predict':
            return self.df_in_title[col_title]

    def clean_df(self):
        logger.info(f'base_data shape: {self.df_in.shape}')
#         logger.info(f'base_data null: {self.df_in.isnull().sum()}')
        self.dic_dtype = {'season_number':int, 'tier':int}
        self.df_in = self.df_in.astype(self.dic_dtype)
        self.df_in['effective_start_date'] = pd.to_datetime(self.df_in['effective_start_date'])
        
    def fill_missing_data(self):
        self.df_in['content_category'] = self.df_in['content_category'].fillna('series')
        self.df_in.loc[self.df_in['category'].isin(['Specials']), 'content_category']='special'
        self.df_in.loc[self.df_in['category'].isin(['Popcorn','Pay 1 WB Theatricals','Scripted Features', 'Pay1']), 'content_category']='movies'
        self.df_in.loc[self.df_in['title_name'].str.contains('Harry Potter'), 'content_category']='special'
        self.df_in = self.df_in.fillna(0)

    def filter_df(self):
        self.date_max = datetime.strptime(self.date_run, '%Y-%m-%d')- timedelta(days=60)
        self.df_in = self.df_in[(self.df_in['effective_start_date']<=self.date_max)]

    def get_datetime_features(self):
        self.df_in['start_year'] = self.df_in['effective_start_date'].dt.year
        self.df_in['start_quarter'] = self.df_in['effective_start_date'].dt.quarter
        
    def aggregate_df(self):
        grpby_title= ['title_name', 'title_id','tier','content_category','category','season_number',
                'effective_start_date']
        self.df_in_title = self.df_in[grpby_title + ['first_views']].groupby(by=grpby_title).sum().reset_index()

    def merge_imdb_features(self):
        key_merge = ['title_name','season_number', 'content_category', 'category', 'tier', 'effective_start_date']
        self.df_in_title = self.df_in_title.merge(self.df_imdb_munged, 
                                         on= key_merge, how='left')
        logger.info(f'title imdb shape: {self.df_in_title.shape}')
#         logger.info(f'title imdb features null: {self.df_in_title.isnull().sum()}')
        self.df_in_title = self.df_in_title.fillna(0)
        
    def get_first_views(self):
        ## Scale first_views 
        self.df_in_title['first_views_log'] = np.log(self.df_in_title['first_views']) 
        self.df_in_title.loc[(self.df_in_title.first_views==0), 'first_views_log']=0
        

class IMDBFeatureMunger(BaseFeatureMunger):
    
    def __init__(self, df_in, mode, date_run):
        self.df_in = df_in
        self.date_run = date_run 
        
        self.clean_df()
        if mode=='train':
            self.filter_df()
        self.get_series_features()
        if mode=='train':
            self.adjust_series_for_training_data()
        self.get_non_series_prequel_features()
        self.get_non_series_prequel_ref_features()
        self.merge_non_series_features()
        self.return_df()
        
    def return_df(self):
        col_imdb = ['title_name', 'season_number', 'content_category', 'category', 
                    'tier', 'effective_start_date', 'prequel_count', 'prequel_featured_count']
        self.df_imdb_munged = pd.concat([self.df_series_title, self.df_nseries_title])
        self.df_imdb_munged = self.df_imdb_munged[col_imdb]
        return self.df_imdb_munged
    
    def get_series_features(self):
        grpby_series = ['title_name','title_id','season_number','content_category','category','tier','effective_start_date']
        self.df_series = self.df_in[(self.df_in.content_category=='series') & (self.df_in.reference_type.isin(['featured_in']))] 
        self.df_series_title = self.df_series.groupby(by=grpby_series).agg({'reference_title_id':'nunique'}).reset_index()
        self.df_series_title = self.df_series_title.rename(columns={'reference_title_id':'ref_ref_featured_in'})
#         self.df_series_title[['ref_follows','ref_spin_off_from','ref_remake_of']] = 0
        self.df_series_title['prequel_count'] = self.df_series_title['season_number']

    def adjust_series_for_training_data(self):     
        self.df_series_title['ref_ref_featured_in'] = (self.df_series_title['ref_ref_featured_in'] * (self.df_series_title['season_number']-1))/self.df_series_title['season_number']
        self.df_series_title.loc[self.df_series_title.season_number>1, 'prequel_featured_count'] = (self.df_series_title['ref_ref_featured_in'])/(self.df_series_title['season_number']-1)

    def get_non_series_prequel_features(self):
        self.grpby_nseries=['title_name','title_id','season_number','content_category','category','tier','effective_start_date','imdb_imdb_series_id']
        
        self.df_nseries = self.df_in[(self.df_in.content_category!='series')\
                                     & (self.df_in.reference_type.isin(['follows','spin_off_from','remake_of','version_of']))] 
        self.df_nseries_preq = self.df_nseries.groupby(by=self.grpby_nseries+['reference_type']).agg({'reference_title_id':'nunique'}).reset_index()
        self.df_nseries_preq = self.df_nseries_preq.pivot(index=self.grpby_nseries, 
                                                                columns='reference_type', 
                                                                values='reference_title_id').reset_index()
        self.df_nseries_preq = self.df_nseries_preq.rename(columns={'follows':'ref_follows',  
                                                                          'spin_off_from':'ref_spin_off_from', 
                                                                          'remake_of':'ref_remake_of',
                                                                          'version_of':'ref_version_of'})
        
    def get_non_series_prequel_ref_features(self):
        self.df_nseries_preq_ref = self.df_nseries.groupby(by=self.grpby_nseries+['reference_reference_type']).agg({'reference_reference_title_id':'nunique'}).reset_index()
        self.df_nseries_preq_ref = self.df_nseries_preq_ref.pivot(index=self.grpby_nseries, \
                                                                        columns='reference_reference_type', values='reference_reference_title_id').reset_index()
        self.df_nseries_preq_ref = self.df_nseries_preq_ref.rename(columns={'featured_in':'ref_ref_featured_in'})

    def merge_non_series_features(self):
        col_preq = ['title_name', 'tier','season_number','content_category','category','effective_start_date','imdb_imdb_series_id','ref_follows','ref_spin_off_from','ref_remake_of', 'ref_version_of']
        col_ref_ref = ['title_name','season_number','category','ref_ref_featured_in']
        self.df_nseries_title = self.df_nseries_preq[col_preq].merge(self.df_nseries_preq_ref[col_ref_ref], how='outer', on=['title_name','season_number','category'])
        self.df_nseries_title['prequel_count'] = self.df_nseries_title[['ref_follows','ref_spin_off_from','ref_remake_of','ref_version_of']].sum(axis=1)
        self.df_nseries_title['prequel_featured_count'] = self.df_nseries_title['ref_ref_featured_in']/self.df_nseries_title['prequel_count']
        
        
class PreProcessor():
    def __init__(self, df_in, mode, date_run):
        self.df_in = df_in
        self.date_run = date_run
        self.get_parameters()
        self.winsorize_features()
        self.adjust_tiers()
        if mode=='train':
            self.winsorize_label()
            self.resample_data()
        self.return_df()
        
    def return_df(self):
        return self.df_in
    
    def get_parameters(self):
        self.max_firstviews = 1000000
        self.old_sampling_rate = 0.25
        self.recent_sampling_rate = 0.75
        self.popcorn_old_sampling_rate = 0.2
        self.popcorn_recent_sampling_rate = 0.3
    
    def adjust_tiers(self):
        self.df_in['tier_adj'] = self.df_in['tier']
        self.df_in.loc[(self.df_in.title_name=='Reminiscence'), 'tier_adj'] = 2
        self.df_in.loc[(self.df_in.category=='Pay1') & (self.df_in.tier==1), 'tier_adj'] = 2
        self.df_in.loc[(self.df_in.category=='Pay1') & (self.df_in.tier==2), 'tier_adj'] = 3
#         self.df_in.loc[(self.df_in.effective_start_date < '2022-04-01') & (self.df_in.category=='Popcorn') & (self.df_in.tier==1), 'tier_adj'] = 0
#         self.df_in.loc[(self.df_in.effective_start_date < '2022-04-01') & (self.df_in.category=='Popcorn') & (self.df_in.tier==2), 'tier_adj'] = 1
        
    def winsorize_label(self):
        self.df_in.loc[self.df_in.first_views>self.max_firstviews, 'first_views'] = self.max_firstviews
    
    def winsorize_features(self):
        ## winsorize season to 10
        ## winsorize ref_tot to 5 
        ## winsorize ref_ref tot to 20
        ## Penalize kids & family 
        ## for series future titles, nullify ref_ features to keep consistent 
        self.df_in.loc[(self.df_in['prequel_count']>5), ['prequel_count']] = 5
        self.df_in.loc[(self.df_in['prequel_featured_count']>20), ['prequel_featured_count']] = 20   
#         self.df_in.loc[(self.df_in.content_category=='series') & (self.df_in['prequel_count']>3), 'prequel_count'] = 3
#         self.df_in.loc[(self.df_in['category']=='Kids & Family'), ['prequel_featured_count']] = -1
        self.df_in.loc[(self.df_in['content_category']=='series'), ['prequel_count']] = -1
        self.df_in.loc[(self.df_in['content_category']=='series'), ['prequel_featured_count']] = -1
       
    def resample_data(self):  
        self.df_in_old = self.df_in[(self.df_in.effective_start_date < datetime.strptime(self.date_run, '%Y-%m-%d')- timedelta(days=120))]
        self.df_in_recent = self.df_in[(self.df_in.effective_start_date >= datetime.strptime(self.date_run, '%Y-%m-%d')- timedelta(days=120))]
        target_count = self.df_in[self.df_in.tier==3].shape[0]
        target_count_old = int(round(target_count*self.old_sampling_rate))
        target_count_new = int(round(target_count*self.recent_sampling_rate))
        
        df_resample_list = []
        
        ## resample old data. try/except in place in case categories are unavailable  
        for i in [0,1,2]:
            try:
                df_resample_list.append(self.df_in_old[(self.df_in_old.tier==i) & (self.df_in_old.category!='Popcorn')].sample(target_count_old, replace=True))
            except:
                pass
            try:    
                df_resample_list.append(self.df_in_recent[(self.df_in_recent.tier==i) & (self.df_in_recent.category!='Popcorn')].sample(target_count_new, replace=True))
            except:
                pass
        try:
            df_resample_list.append(self.df_in_old[(self.df_in_old.category=='Popcorn')].sample(int(round(target_count_old*self.popcorn_old_sampling_rate)), replace=True))
        except:
            pass
        try:
            df_resample_list.append(self.df_in_recent[(self.df_in_recent.category=='Popcorn')].sample(int(round(target_count_new*self.popcorn_recent_sampling_rate)), replace=True))
        except:
            pass
        df_resample_list.append(self.df_in_old[(self.df_in_old.tier==3) & (self.df_in_old.category!='Popcorn')])
        df_resample_list.append(self.df_in_recent[(self.df_in_recent.tier==3) & (self.df_in_recent.category!='Popcorn')])
        self.df_in = pd.concat(df_resample_list, axis=0)
    
    
class XGB(Utils):
    def __init__(self, df_in, mode, date_run):
        self.df_in = df_in    
        self.mode = mode
        self.date_run = date_run
        self.get_parameters()
        
        if mode=='train':
            self.train_xgb()
            self.save_model()
        elif mode=='predict':
            self.get_model()
            self.predict_xgb()
            self.return_df()
            
    @staticmethod
    def _encode(df, categoricals):
        
        """
        perform category encoding on the data
        :param df: dataframe to be encoded
        :param categoricals: list of name of categorical columns
        :return ohe, x_ohe: OHE object and OHE-encoded data
        """
        ohe = OneHotEncoder(cols=categoricals, 
                            handle_unknown='return_nan',
                           handle_missing='return_nan',  
                           use_cat_names=True) 
        x_ohe = ohe.fit_transform(df)
        return ohe, x_ohe
    
    def return_df(self):
        return self.df_pred
    
    def get_parameters(self):
        self.target = 'first_views_log'
        self.features_cat=['tier_adj','category','content_category','prequel_count'] #
        self.features_cont=['prequel_featured_count']
        self.param_xgb = {"booster":"gbtree",
                     "objective":"reg:squarederror",
                    "gamma":1}

    def train_xgb(self):
        x_train = self.df_in[self.features_cat + self.features_cont]
        y_train = self.df_in[self.target]
        self.ohe, x_ohe = self._encode(x_train, self.features_cat)
        dm_train = xgb.DMatrix(x_ohe, label=y_train)

        ## train 
        self.model = xgb.train(params = self.param_xgb, dtrain = dm_train, num_boost_round = 10)
        
    def save_model(self):
        dict_model = {'model': self.model, 'ohe': self.ohe}
        Utils.to_pkl_s3(dict_model, f'fv_{date_train}.pkl')
        logger.info(f'Done model training {date_train}')
    
    def get_model(self):
        dict_model = Utils.read_pkl_s3(f'fv_{date_train}.pkl')
        self.ohe = dict_model['ohe']
        self.model = dict_model['model']
        
    def predict_xgb(self):
        x_test = self.df_in[self.features_cat + self.features_cont]
        x_ohe_test = self.ohe.transform(x_test)
        dm_test = xgb.DMatrix(x_ohe_test)
        pred = self.model.predict(dm_test)

        self.df_pred = self.df_in[list(set(['title_name','category','season_number', 'effective_start_date', 'tier','tier_adj'] + self.features_cat + self.features_cont))]
        self.df_pred['first_views_log_pred'] = pred
        self.df_pred[f'first_views_pred'] = np.exp(self.df_pred[f'first_views_log_pred'])
        self.df_pred['pred_date'] = self.date_run
        self.df_pred['pred_date'] = pd.to_datetime(self.df_pred['pred_date'])
        self.df_pred.loc[(self.df_pred.category=='Popcorn') & (self.df_pred.effective_start_date>='2022-04-01'), 'category']='Pay 1 WB Theatricals'


class FVDecay(Utils):
    def __init__(self, df_in, mode, date_run, grpby=''):
        self.df_in = df_in    
        self.mode = mode
        self.date_run = date_run
        self.grpby = grpby

        if mode=='train':
            self.get_decay_data_title()
            self.get_decay_curve()
            self.save_decay_curve()
            self.return_df()

    def return_df(self):
        return self.df_decay
        
    @staticmethod
    def apply_decay_curve(df_in, df_decay, df_popcorn_decay):
        category_list_train = df_decay.category.unique().tolist()
        df_in['decay_category'] = df_in['category']
        df_in.loc[(~df_in['decay_category'].isin(category_list_train)) & (df_in['decay_category']!='Popcorn'), 'decay_category'] = 'Pay1'
        
        
        ## Apply decay to prediction 
        df_pred_decay = df_in.merge(df_decay[['category','days_since_premiere','first_views_pct']], 
                                                left_on=['decay_category'], right_on=['category'], how='left', suffixes=['', '_category'])\
                            .merge(df_popcorn_decay[['category','days_since_premiere','first_views_pct']], 
                                        on=['category','days_since_premiere'], how='left', suffixes=['', '_popcorn'])
        print(df_pred_decay.columns)
        df_pred_decay.loc[(df_pred_decay.category=='Popcorn') & (df_pred_decay.first_views_pct.isnull()), 'first_views_pct'] = df_pred_decay['first_views_pct_popcorn']
        
        df_pred_decay[f'first_views_pred_decay'] = df_pred_decay[f'first_views_pred'] * df_pred_decay['first_views_pct']
        df_pred_decay['start_month'] = df_pred_decay['effective_start_date'].dt.to_period('M').dt.to_timestamp()
        df_pred_decay['request_date'] = df_pred_decay['effective_start_date'] + pd.to_timedelta(df_pred_decay.days_since_premiere, unit="D")
        df_pred_decay['first_view_quarter'] = df_pred_decay['request_date'].dt.to_period('Q').dt.to_timestamp()
        df_pred_decay['first_view_month'] = df_pred_decay['request_date'].dt.to_period('M').dt.to_timestamp()

        return df_pred_decay
    
    def get_decay_data_title(self):
        self.key_col = ['title_name','tier','content_category','category','season_number','effective_start_date']
        self.df_in = self.df_in[self.key_col + ['title_id','days_since_premiere','first_views']].sort_values(by=self.key_col+['days_since_premiere'])
        self.df_in['first_views_sum'] = self.df_in.groupby(self.key_col)['first_views'].transform('sum')
        self.df_in['first_views_cumsum'] = self.df_in.groupby(by=self.key_col)['first_views'].cumsum()
        self.df_in['first_views_cumpct'] = self.df_in['first_views_cumsum'] / self.df_in['first_views_sum']
        self.df_in.reset_index(inplace=True)
            
    def get_decay_curve(self):
        self.df_decay = self.df_in[[self.grpby] + ['title_id','days_since_premiere','first_views_cumpct']].sort_values(by=[self.grpby]+['days_since_premiere'])
        self.df_decay = self.df_decay.groupby(by=[self.grpby] + ['days_since_premiere']).agg({'first_views_cumpct':'median','title_id':'nunique'})
        self.df_decay = self.df_decay.rename(columns={'title_id':'title_count_training'})
        self.df_decay = self.df_decay.reset_index()
        self.df_decay['first_views_pct'] = self.df_decay.groupby(self.grpby)['first_views_cumpct'].diff()
        self.df_decay.reset_index(inplace=True)
        
        self.df_decay['days_since_premiere'] = self.df_decay['days_since_premiere'].astype(int)
        self.df_decay.loc[(self.df_decay.days_since_premiere==0),'first_views_pct'] = self.df_decay['first_views_cumpct']
        
    def save_decay_curve(self):
        Utils.to_csv_s3(self.df_decay, input_bucket, f'fv_decay_{self.grpby}_{self.date_run}.csv')

mode = 'test' ## 'prod', 'test', 'backtest'
df_pred_list = []
df_pred_decay_list = [] ##
if mode=='backtest':
    list_date_train = ['2021-01-01', '2021-02-01', '2021-03-01','2021-04-01', '2021-05-01', '2021-06-01','2021-07-01', '2021-08-01', '2021-09-01']
elif mode =='test':
    list_date_train = ['2021-12-01']
elif mode =='prod':
    list_date_train = [(datetime.today()).strftime('%Y-%m-%d')]
    
for date_train in list_date_train:
    date_pred = date_train 
    logger.info(f'TRAINING MODEL FOR {date_train}, Mode:{mode}')

    #### Train 
    mode = 'train'
    key_col = ['title_name', 'title_id','tier','content_category','category','season_number','effective_start_date']

    ## Read data 
    df_raw = Utils.read_csv_s3(f'fv_train_fv_{date_train}.csv')
    df_imdb = Utils.read_csv_s3(f'fv_train_imdb_{date_train}.csv')

    ## Munge features 
    df_imdb_munged = IMDBFeatureMunger(df_imdb, mode, date_train).df_imdb_munged
    df_base_munged_decay, df_base_munged = BaseFeatureMunger(df_raw, df_imdb_munged, mode, date_train).return_df()
    df_decay_category = FVDecay(df_base_munged_decay, mode, date_train, 'category').return_df()

    df_in_train = PreProcessor(df_base_munged, mode, date_train).return_df()  
    XGB(df_in_train, mode, date_train)


    #### Predict 
    mode = 'predict'
    logger.info(f'PREDICTING FOR {date_pred}, Mode:{mode}')
    
    ## Read data and munge features 
    if mode=='backtest':
        df_imdb_pred = Utils.read_csv_s3(f'fv_train_imdb_2021-12-01.csv')
        df_raw_pred = Utils.read_csv_s3(f'fv_train_2021-12-15.csv')
        df_imdb_munged_pred = IMDBFeatureMunger(df_imdb_pred, mode, date_pred).return_df()
        df_base_munged_pred = BaseFeatureMunger(df_raw_pred, df_imdb_munged_pred, mode, date_pred).return_df()
        df_base_munged_pred = df_base_munged_pred[(df_base_munged_pred.effective_start_date >= date_pred)\
                                  & (df_base_munged_pred.effective_start_date < datetime.strptime(date_pred, '%Y-%m-%d')+ timedelta(days=92))]
    else:
        df_imdb_pred = Utils.read_csv_s3(f'fv_pred_{date_pred}.csv')
        df_raw_pred = df_imdb_pred[key_col]
        df_raw_pred['first_views'] = 0
        df_imdb_munged_pred = IMDBFeatureMunger(df_imdb_pred, mode, date_pred).return_df()
        df_base_munged_pred = BaseFeatureMunger(df_raw_pred, df_imdb_munged_pred, mode, date_pred).return_df()
        
#     print(df_base_munged_pred[df_base_munged_pred.title_name.str.contains('Curb')].season_number.unique())
    df_in_pred = PreProcessor(df_base_munged_pred, mode, date_pred).return_df()  
    print(df_in_pred[df_in_pred.title_name.str.contains('Curb')].season_number.unique())
    ## Predict and apply decay 
    df_out_pred = XGB(df_in_pred, mode, date_pred).return_df()
    df_decay_category = Utils.read_csv_s3(f'fv_decay_category_{date_train}.csv') 
    df_decay_popcorn = Utils.read_csv_s3(f'fv_decay_category_2021-09-01.csv') 
    df_pred_decay = FVDecay.apply_decay_curve(df_out_pred, df_decay_category, df_decay_popcorn)
    
    df_pred_list.append(df_out_pred)
    df_pred_decay_list.append(df_pred_decay)

df_pred = pd.concat(df_pred_list)
df_pred_decay = pd.concat(df_pred_decay_list)



INFO:root:TRAINING MODEL FOR 2021-12-01, Mode:test
INFO:root:Read from psi_first_views/fv_train_2021-12-01.csv
INFO:root:Read from psi_first_views/fv_train_imdb_2021-12-01.csv
INFO:root:base_data shape: (270369, 18)
INFO:root:base_data shape: (34160, 19)
INFO:root:title imdb shape: (329, 10)
INFO:root:Saved to psi_first_views/fv_decay_category_2021-12-01.csv
INFO:root:Saved model to hbo-ingest-datascience-content-dev/psi_first_views/fv_2021-12-01.pkl
INFO:root:Saved to psi_first_views/fv_2021-12-01.pkl
INFO:root:Done model training 2021-12-01
INFO:root:PREDICTING FOR 2021-12-01, Mode:predict
INFO:root:Read from psi_first_views/fv_pred_2021-12-01.csv

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
INFO:root:base_data shape: (26821, 19)
INFO:root:base_data shape: (26821, 8)
I

[11 12]


INFO:root:Read from psi_first_views/fv_decay_category_2021-09-01.csv


Index(['content_category', 'tier', 'tier_adj', 'title_name',
       'prequel_featured_count', 'season_number', 'prequel_count',
       'effective_start_date', 'category', 'first_views_log_pred',
       'first_views_pred', 'pred_date', 'decay_category', 'category_category',
       'days_since_premiere', 'first_views_pct', 'first_views_pct_popcorn'],
      dtype='object')


In [21]:
df_pred_future_out.head()

Unnamed: 0,title_name,first_view_month,premiere_date,season_number,tier,content_category,category,prequel_count,prequel_featured_count,tier_adjusted,first_views_pred,model_train_date
0,100 Foot Wave,2022-07-01,2022-07-27,2,3,series,Docu-Series,-1.0,-1.0,3,17440.642578,2021-12-01
1,100 Foot Wave,2022-08-01,2022-07-27,2,3,series,Docu-Series,-1.0,-1.0,3,108131.992188,2021-12-01
2,100 Foot Wave,2022-09-01,2022-07-27,2,3,series,Docu-Series,-1.0,-1.0,3,104643.859375,2021-12-01
3,100 Foot Wave,2022-10-01,2022-07-27,2,3,series,Docu-Series,-1.0,-1.0,3,87203.21875,2021-12-01
4,12 Dates of Christmas,2021-11-01,2021-11-25,2,3,series,Unscripted Series,-1.0,-1.0,3,22811.865234,2021-12-01


In [27]:
def get_agg_first_views(df_in, agg_col, grpby_title, first_view_date_col, first_view_col):
    grpby_title_agg = grpby_title + [agg_col]
    df_in[first_view_date_col] = pd.to_datetime(df_in[first_view_date_col])
    df_in['first_view_quarter'] = df_in[first_view_date_col].dt.to_period("Q").dt.to_timestamp()   
    df_in['first_view_month'] = df_in[first_view_date_col].dt.to_period('M').dt.to_timestamp()
    df_in = df_in[grpby_title_agg + first_view_col].groupby(by=grpby_title_agg).sum().reset_index()
    return df_in

### Save to csv     
dic_rename = {'tier_adj':'tier_adjusted', 'effective_start_date':'premiere_date', 'pred_date':'model_train_date'} 
out_col = ['title_name', 'first_view_month', 'premiere_date', 'season_number',
           'tier','content_category', 'category', 'prequel_count', 'prequel_featured_count',
           'tier_adjusted', 'first_views_pred', 'model_train_date']
agg_var = 'first_view_month'

df_pred_future_out = df_pred_decay.rename(columns=dic_rename)
df_pred_future_out = df_pred_future_out[out_col].groupby(by=out_col[:-2]+['model_train_date']).sum().reset_index()
df_pred_future_out = df_pred_future_out[out_col]

Utils.to_csv_s3(df_pred_future_out, output_bucket, f'future_program_xgb_pred.csv')
Utils.to_csv_s3(df_pred_future_out, input_bucket, f'future_program_xgb_pred_{date_train}.csv')


INFO:root:Saved to psi_first_views/future_program_xgb_pred.csv
INFO:root:Saved to psi_first_views/future_program_xgb_pred_2021-12-01.csv


In [164]:
# ### Publish to output table 


# def cvdf_to_snowflake(df, table_name):
#     stage = '@HBO_OUTBOUND_DATASCIENCE_CONTENT_DEV'
#     output_bucket = "hbo-outbound-datascience-content-dev"
#     filename ='psi/' + table_name + '.csv'
#     dbname, schema = 'MAX_DEV', 'WORKSPACE'
    
#     csv_buffer = io.StringIO()
#     df.to_csv(csv_buffer, index = False)
#     content = csv_buffer.getvalue()
#     client = boto3.client('s3')
#     client.put_object(Bucket=output_bucket, Key=filename, Body=content)

#     print ('Create Table: ' + table_name)
#     run_query('''
#     create or replace table {table_name}(
#     title_name varchar,
#     season_number int, 
#     content_category  varchar,
#     category varchar,
#     tier varchar,
#     premiere_date varchar,
#     first_view_month varchar,
#     imdb_prequel_count int,
#     imdb_prequel_references int,
#     tier_adjusted int,
#     category_adjusted varchar,
#     first_views_pred float,
#     model_train_date varchar
#     )
#     '''.format(table_name = table_name), dbname, schema)

#     print ('Begin Uploading')
#     run_query('''
#     insert into max_dev.workspace.{table_name}

#     select 
#           $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13   
#     from {stage}/psi/{file_name}

#      (FILE_FORMAT => csv_v2)

#     '''.format(stage = stage, table_name = table_name,
#               file_name = table_name+'.csv')
#             , dbname, schema)

#     print ('Finish Uploading')    


# def run_query(query, dbname, schema):
#     SF_CREDS = 'datascience-max-dev-sagemaker-notebooks'

#     conn=SnowflakeConnector(SSMPSCredentials(SF_CREDS))
#     ctx=conn.connect(dbname,schema)
#     cursor = ctx.cursor()
#     cursor.execute(query)
#     df = pd.DataFrame(cursor.fetchall(), columns = [desc[0] for desc in cursor.description])
#     df.columns= df.columns.str.lower()
#     return df


    
# dic_rename = {'tier_new':'tier_adjusted', 'effective_start_date':'premiere_date', 
#               'category':'category_adjusted','category_original':'category', 
#               'ref_tot':'imdb_prequel_count', 'ref_ref_tot':'imdb_prequel_references',
#              'first_views_decay_dev_b':'first_views_pred'}

# out_col = ['title_name',  'season_number','content_category', 'category',
#            'tier', 'premiere_date','first_view_month', 
#            'imdb_prequel_count', 'imdb_prequel_references', 'tier_adjusted', 'category_adjusted','first_views_pred', 'model_train_date']


# grpby_title= out_col.copy()
# agg_var = 'first_view_month'
# grpby_title.remove('first_views_pred')
# grpby_title.remove(agg_var)

# df_pred_future_out = df_pred_decay_future.rename(columns=dic_rename)
# df_pred_future_out = get_agg_first_views(df_pred_future_out, agg_var, grpby_title,'request_date', ['first_views_pred'])
# df_pred_future_out = df_pred_future_out[out_col]
# ## QA 
# display(df_pred_future_out.head())
# a = df_pred_future_out.groupby(by=['title_name','category','tier','premiere_date']).sum().reset_index()
# a[a.category=='Pay 1 WB Theatricals']

# print(a.premiere_date.min(),a.premiere_date.max()) 
# print(a.shape, a.title_name.nunique())

# ## Upload to snowflake 

# import io
# df_pred_future_out.to_csv('s3://datascience-hbo-users/users/tjung/psi/future_program_xgb_pred.csv')
# cvdf_to_snowflake(df_pred_future_out, 'future_program_xgb_pred')




Unnamed: 0,title_name,season_number,content_category,category,tier,premiere_date,first_view_month,imdb_prequel_count,imdb_prequel_references,tier_adjusted,category_adjusted,first_views_pred,model_train_date
0,100 Foot Wave,2,series,Docu-Series,3,2022-07-27,2022-07-01,0.0,0.0,3,Docu-Series,900.102632,2021-09-01
1,100 Foot Wave,2,series,Docu-Series,3,2022-07-27,2022-08-01,0.0,0.0,3,Docu-Series,2654.441796,2021-09-01
2,100 Foot Wave,2,series,Docu-Series,3,2022-07-27,2022-09-01,0.0,0.0,3,Docu-Series,576.575886,2021-09-01
3,100 Foot Wave,2,series,Docu-Series,3,2022-07-27,2022-10-01,0.0,0.0,3,Docu-Series,184.206834,2021-09-01
4,12 Dates of Christmas,2,series,Unscripted Series,3,2021-11-25,2021-11-01,0.0,0.0,3,Unscripted Series,1211.584539,2021-09-01


2021-09-02 00:00:00 2024-12-19 00:00:00
(725, 9) 591
Create Table: future_program_xgb_pred
Begin Uploading
Finish Uploading
