In [19]:
##nodejs:  https://docs.aws.amazon.com/sdk-for-javascript/v2/developer-guide/setting-up-node-on-ec2-instance.html

# !pip install "jupyterlab>=3" "ipywidgets>=7.6"
# !pip install jupyter-dash
# !jupyter lab build


# !pip install snowflake --user
# !pip install snowflake-connector-python --user
# !pip install category_encoders
# !pip install xgboost
# !pip install lightgbm --user
import os
import sys
path=!pwd
sys.path.append(os.path.join(path[0], '..'))
from utils import *
import snowflake.connector
from datetime import timedelta

from category_encoders import OneHotEncoder
import xgboost as xgb
import lightgbm as lgbm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_percentage_error as MAPE
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import r2_score as r2_score
import sklearn.model_selection

class SnowflakeConnector(BaseConnector):
    def __init__(self, credentials: Credentials):
        keys = credentials.get_keys()
        self._secrets = json.loads(keys.get('SecretString', "{}"))

    def connect(self, dbname: str, schema: str = 'DEFAULT'):
        ctx = snowflake.connector.connect(
            user=self._secrets['login_name'],
            password=self._secrets['login_password'],
            account=self._secrets['account'],
            warehouse=self._secrets['warehouse'],
            database=dbname,
            schema=schema
        )

        return ctx
    
## Credentials
SF_CREDS = 'datascience-max-dev-sagemaker-notebooks'

## Snowflake connection 
conn=SnowflakeConnector(SSMPSCredentials(SF_CREDS))
ctx=conn.connect("MAX_PROD","DATASCIENCE_STAGE")
cur = ctx.cursor()

## Assets_dim

In [49]:
querystr = '''
select * from max_prod.catalog.asset_dim a
join max_prod.catalog.reporting_asset_offering_dim raod
on a.viewable_id = raod.viewable_id
and brand = 'HBO MAX'
and territory = 'HBO MAX DOMESTIC'
and channel = 'HBO MAX SUBSCRIPTION'
right outer join
(select title, tier, season, category, premiere_date from max_prod.content_analytics.daily_future_programming_schedule group by title, tier, season, category, premiere_date) fp
on a.asset_title_short = fp.title
and zeroifnull(a.season_number) = zeroifnull(fp.season)
;
'''

cursor_list = ctx.execute_string(
    querystr
    )
df = pd.DataFrame.from_records(cursor_list[-1].fetchall(), columns=[x[0] for x in cursor_list[-1].description])
df.columns= df.columns.str.lower()
display(df.head(2))
display(df.shape)
df.to_csv('s3://datascience-hbo-users/users/tjung/psi/future_program_metadata.csv')

Unnamed: 0,asset_id,wm_id,viewable_id,source_asset_id,series_id,season_id,franchise_id,catalog_geo_id,imdb_id,csm_id,grace_notes_id,asset_title_long,asset_title_short,asset_summary_long,asset_summary_short,asset_tags_age,asset_tags_content_category,asset_tags_genre,asset_tags_other,brand_hubs,asset_type,catalog_asset_type,series_title_long,series_title_short,series_type,season_title_long,season_title_short,promoted_asset_id,season_number,season_premiere_ind,season_finale_ind,series_premiere_ind,episode_number_in_season,episode_number_in_series,program_type,content_category,primary_genre_desc,navigation_genre_desc,descriptive_genre_desc,wm_enterprise_genres,leaf_level_genres,animation_type,script_type,scripted_flag,sports_flag,kids_flag,international_flag,latino_flag,short_flag,age_group,original_network,release_year,license_contract_type,license_contract_term,licensor,original_language,first_offered_date_hbo,first_offered_date_now,first_offered_date_max,air_date,latest_availability_start_date,latest_availability_end_date,premiere_air_date,theatrical_release_date,original_linear_air_date,full_asset_tags_age,full_asset_tags_content_category,full_asset_tags_genre,full_asset_tags_other,full_brand_hubs,is_downloadable,asset_url,creation_time,last_modified_time,last_rollup_trigger_time,effective_start_date,effective_end_date,active_flag,title_id,viewable_id.1,start_utc,end_utc,latest_asset_release_date,season_first_offered_date,title_first_offered_date,first_offered_date,latest_availability_start_date.1,latest_availability_end_date.1,brand,territory,channel,ever_popcorn_title,creation_time.1,last_modified_time.1,title,tier,season,category,premiere_date
0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,NaT,NaT,,NaT,,NaT,NaT,NaT,,,,,,,,NaT,NaT,NaT,NaT,,,,,NaT,NaT,NaT,NaT,NaT,NaT,NaT,,,,,,NaT,NaT,Legendary,3,3,Unscripted Series,2022-05-05
1,6.880571200656195e+18,urn:warnermedia:wmid:product:714a9d96bfcb15d26...,GXodE-Q8aYbDCYwEAAAQT,PROD6988,,,,,tt0087182,4a5cb47d-dc8d-478d-8502-ecd33e9609fc,,Dune,Dune,The year: 10191. Two factions war bitterly ove...,Two factions in the year 10191 war bitterly ov...,,movie,fantasy-sci-fi,action | feature | horror-sci-fi | movie | movies,hbo | turner-classic-movies,FEATURE,movie,,,,,,,,0.0,0.0,0.0,,,acquired,movies,action,Fantasy & Sci-Fi,Sci-Fi|Adventure,action | fantasy & sci-fi,action | adventure | fantasy | sci-fi,Live Action,Scripted,1.0,0.0,0.0,0.0,0.0,0.0,,HBO,1984.0,LIBRARY,3.0,"UNIVERSAL CITY STUDIOS PRODUCTIONS, LLLP",,2020-06-01 10:00:00,2020-06-01 10:00:00,2020-06-01 10:00:00,1984-12-14,2020-06-01 12:00:00,2021-11-30 12:00:00,1984-12-14 12:00:00,1984-12-14,1985-11-09 20:00:00,,urn:hbo:content-category:movie,urn:hbo:genre:fantasy-sci-fi,urn:tag:action | urn:tag:feature | urn:tag:hor...,urn:warnermedia:brand:hbo | urn:warnermedia:br...,0.0,/movies/GXodE-Q8aYbDCYwEAAAQT,2020-10-16 22:52:56.548,2021-10-05 14:37:17.321,2020-10-16 22:52:56.548,2020-10-16 22:52:56.548,2999-12-31 00:00:00,1.0,GXodE-Q8aYbDCYwEAAAQT,GXodE-Q8aYbDCYwEAAAQT,2020-06-01 10:00:00,2021-12-01 10:59:00,2020-06-01 10:00:00,NaT,2020-06-01 10:00:00,2020-06-01 10:00:00,2020-06-01 12:00:00,2021-11-30 12:00:00,HBO MAX,HBO MAX DOMESTIC,HBO MAX SUBSCRIPTION,0.0,2021-10-06 00:42:02.338,2021-10-06 00:42:02.338,Dune,1,0,Popcorn,2021-10-22


(857, 99)

In [4]:
df = pd.read_csv('s3://datascience-hbo-users/users/tjung/psi/future_program_metadata.csv')
names = df.columns.tolist()
df = df[names[-5:] + names[:-5]]
df['premiere_date'] = pd.to_datetime(df['premiere_date'])
df['premiere_year_month'] = df['premiere_date'].dt.strftime('%Y-%m')
display(df.head(2))
print(df.shape)
print(df.title.nunique())
print(df.isnull().sum())


Unnamed: 0.1,title,tier,season,category,premiere_date,Unnamed: 0,asset_id,wm_id,viewable_id,source_asset_id,series_id,season_id,franchise_id,catalog_geo_id,imdb_id,csm_id,grace_notes_id,asset_title_long,asset_title_short,asset_summary_long,asset_summary_short,asset_tags_age,asset_tags_content_category,asset_tags_genre,asset_tags_other,brand_hubs,asset_type,catalog_asset_type,series_title_long,series_title_short,series_type,season_title_long,season_title_short,promoted_asset_id,season_number,season_premiere_ind,season_finale_ind,series_premiere_ind,episode_number_in_season,episode_number_in_series,program_type,content_category,primary_genre_desc,navigation_genre_desc,descriptive_genre_desc,wm_enterprise_genres,leaf_level_genres,animation_type,script_type,scripted_flag,sports_flag,kids_flag,international_flag,latino_flag,short_flag,age_group,original_network,release_year,license_contract_type,license_contract_term,licensor,original_language,first_offered_date_hbo,first_offered_date_now,first_offered_date_max,air_date,latest_availability_start_date,latest_availability_end_date,premiere_air_date,theatrical_release_date,original_linear_air_date,full_asset_tags_age,full_asset_tags_content_category,full_asset_tags_genre,full_asset_tags_other,full_brand_hubs,is_downloadable,asset_url,creation_time,last_modified_time,last_rollup_trigger_time,effective_start_date,effective_end_date,active_flag,title_id,viewable_id.1,start_utc,end_utc,latest_asset_release_date,season_first_offered_date,title_first_offered_date,first_offered_date,latest_availability_start_date.1,latest_availability_end_date.1,brand,territory,channel,ever_popcorn_title,creation_time.1,last_modified_time.1,premiere_year_month
0,Legendary,3,3,Unscripted Series,2022-05-05,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2022-05
1,Dune,1,0,Popcorn,2021-10-22,1,6.880571e+18,urn:warnermedia:wmid:product:714a9d96bfcb15d26...,GXodE-Q8aYbDCYwEAAAQT,PROD6988,,,,,tt0087182,4a5cb47d-dc8d-478d-8502-ecd33e9609fc,,Dune,Dune,The year: 10191. Two factions war bitterly ove...,Two factions in the year 10191 war bitterly ov...,,movie,fantasy-sci-fi,action | feature | horror-sci-fi | movie | movies,hbo | turner-classic-movies,FEATURE,movie,,,,,,,,0.0,0.0,0.0,,,acquired,movies,action,Fantasy & Sci-Fi,Sci-Fi|Adventure,action | fantasy & sci-fi,action | adventure | fantasy | sci-fi,Live Action,Scripted,1.0,0.0,0.0,0.0,0.0,0.0,,HBO,1984.0,LIBRARY,3.0,"UNIVERSAL CITY STUDIOS PRODUCTIONS, LLLP",,2020-06-01 10:00:00.000,2020-06-01 10:00:00.000,2020-06-01 10:00:00.000,1984-12-14,2020-06-01 12:00:00.000,2021-11-30 12:00:00,1984-12-14 12:00:00.000,1984-12-14 00:00:00.000,1985-11-09 20:00:00.000,,urn:hbo:content-category:movie,urn:hbo:genre:fantasy-sci-fi,urn:tag:action | urn:tag:feature | urn:tag:hor...,urn:warnermedia:brand:hbo | urn:warnermedia:br...,0.0,/movies/GXodE-Q8aYbDCYwEAAAQT,2020-10-16 22:52:56.548,2021-10-05 14:37:17.321,2020-10-16 22:52:56.548,2020-10-16 22:52:56.548,2999-12-31 00:00:00,1.0,GXodE-Q8aYbDCYwEAAAQT,GXodE-Q8aYbDCYwEAAAQT,2020-06-01 10:00:00.000,2021-12-01 10:59:00.000,2020-06-01 10:00:00.000,,2020-06-01 10:00:00.000,2020-06-01 10:00:00.000,2020-06-01 12:00:00.000,2021-11-30 12:00:00,HBO MAX,HBO MAX DOMESTIC,HBO MAX SUBSCRIPTION,0.0,2021-10-06 00:42:02.338,2021-10-06 00:42:02.338,2021-10


(857, 101)
640
title                     0
tier                      0
season                    0
category                  0
premiere_date             0
                       ... 
channel                 761
ever_popcorn_title      761
creation_time.1         761
last_modified_time.1    761
premiere_year_month       0
Length: 101, dtype: int64


In [5]:
## Data availability for future titles 
def notnullsum(x):
    return x.notnull().sum()

df[df.category=='Popcorn'].groupby('premiere_year_month').agg({'title':'count', 
                                       'imdb_id': lambda x: notnullsum(x),
                                      'primary_genre_desc': lambda x: notnullsum(x),
                                      'release_year': lambda x: notnullsum(x),
                                      'license_contract_type': lambda x: notnullsum(x),
                                      'licensor': lambda x: notnullsum(x),
                                      'air_date': lambda x: notnullsum(x)})

Unnamed: 0_level_0,title,imdb_id,primary_genre_desc,release_year,license_contract_type,licensor,air_date
premiere_year_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-01,1,1,1,1.0,0,1,1
2021-02,2,2,2,2.0,0,2,2
2021-03,1,1,1,1.0,0,1,1
2021-04,2,2,2,2.0,0,2,2
2021-05,1,1,1,1.0,0,1,1
2021-06,3,2,2,2.0,0,2,2
2021-07,1,1,1,1.0,0,1,1
2021-08,2,2,2,2.0,0,2,2
2021-09,2,2,2,2.0,0,2,2
2021-10,3,2,2,2.0,1,2,2


In [48]:
### Duplicate titles from different episode #, category (e.g. suicide squad in popcorn & pay1)
dup = df[df.duplicated(subset=['title','season','episode_number_in_series'])].sort_values(by='title')
print(dup.shape)
display(dup)

df.groupby(by=['title','episode_number_in_series']).sum()

(10, 100)


Unnamed: 0.1,title,tier,season,category,premiere_date,Unnamed: 0,asset_id,wm_id,viewable_id,source_asset_id,series_id,season_id,franchise_id,catalog_geo_id,imdb_id,csm_id,grace_notes_id,asset_title_long,asset_title_short,asset_summary_long,asset_summary_short,asset_tags_age,asset_tags_content_category,asset_tags_genre,asset_tags_other,brand_hubs,asset_type,catalog_asset_type,series_title_long,series_title_short,series_type,season_title_long,season_title_short,promoted_asset_id,season_number,season_premiere_ind,season_finale_ind,series_premiere_ind,episode_number_in_season,episode_number_in_series,program_type,content_category,primary_genre_desc,navigation_genre_desc,descriptive_genre_desc,wm_enterprise_genres,leaf_level_genres,animation_type,script_type,scripted_flag,sports_flag,kids_flag,international_flag,latino_flag,short_flag,age_group,original_network,release_year,license_contract_type,license_contract_term,licensor,original_language,first_offered_date_hbo,first_offered_date_now,first_offered_date_max,air_date,latest_availability_start_date,latest_availability_end_date,premiere_air_date,theatrical_release_date,original_linear_air_date,full_asset_tags_age,full_asset_tags_content_category,full_asset_tags_genre,full_asset_tags_other,full_brand_hubs,is_downloadable,asset_url,creation_time,last_modified_time,last_rollup_trigger_time,effective_start_date,effective_end_date,active_flag,title_id,viewable_id.1,start_utc,end_utc,latest_asset_release_date,season_first_offered_date,title_first_offered_date,first_offered_date,latest_availability_start_date.1,latest_availability_end_date.1,brand,territory,channel,ever_popcorn_title,creation_time.1,last_modified_time.1
619,Godzilla vs. Kong,1,0,Pay1,2021-08-15,619,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
248,Lucas the Spider,3,3,Kids & Family,2022-06-16,248,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
506,Mortal Kombat,1,0,Popcorn,2021-04-23,506,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
363,Scavengers Reign,3,1,Scripted Drama Series,2022-10-13,363,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
796,Space Jam: A New Legacy,1,0,Pay1,2021-12-15,796,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
411,Stateless,3,0,Docu-Series,2022-05-24,411,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
307,The Little Things,2,0,Pay1,2021-06-15,307,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
843,The Suicide Squad,1,0,Pay1,2021-12-15,843,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
337,Those Who Wish Me Dead,2,0,Pay1,2021-10-15,337,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
643,Tom & Jerry,2,0,Popcorn,2021-02-26,643,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


Unnamed: 0_level_0,Unnamed: 1_level_0,tier,season,Unnamed: 0,asset_id,franchise_id,catalog_geo_id,grace_notes_id,asset_tags_content_category,promoted_asset_id,season_number,season_premiere_ind,season_finale_ind,series_premiere_ind,episode_number_in_season,scripted_flag,sports_flag,kids_flag,international_flag,latino_flag,short_flag,release_year,license_contract_type,license_contract_term,theatrical_release_date,full_asset_tags_content_category,is_downloadable,active_flag,ever_popcorn_title
title,episode_number_in_series,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
All That Glitters,5.0,3,1,751,2.480853e+18,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,5.0,1.0,0.0,1.0,0.0,0.0,0.0,2008.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
Axios,47.0,3,4,86,3.417718e+18,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,2021.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
Axios,48.0,3,4,85,7.429259e+18,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,2021.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
Axios,49.0,3,4,87,4.22072e+18,0.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,2021.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
Axios,50.0,3,4,90,3.125906e+18,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,2021.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Axios,51.0,3,4,84,1.379632e+18,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,2021.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Axios,52.0,3,4,88,5.637143e+18,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,2021.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Axios,53.0,3,4,89,3.945143e+18,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,2021.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Bridge,1.0,3,1,6,3.157351e+18,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2021.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
The Dog House: UK,9.0,3,2,384,2.291106e+17,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2021.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## Future title imdb_id match

In [2]:
!pip install python-Levenshtein

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [7]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

df_imdb = pd.read_csv('s3://datascience-hbo-users/users/tjung/psi/future_program_imdb_id.csv')
df_imdb.columns= df_imdb.columns.str.lower()
df_imdb = df_imdb.rename(columns={'imdb id':'imdb_id', 
                          'title':'title_name', 
                          'season':'season_number',
                          'release date':'release_date'})

querystr = '''
select 
title as title_name, 
tier,
season as season_number,
category,
premiere_date 
from max_prod.content_analytics.daily_future_programming_schedule 
group by title, tier, season, category, premiere_date
;
'''

cursor_list = ctx.execute_string(
    querystr
    )
df_fp = pd.DataFrame.from_records(cursor_list[-1].fetchall(), columns=[x[0] for x in cursor_list[-1].description])
df_fp.columns= df_fp.columns.str.lower()


grpby_title= ['title_name', 'season_number',]
df_title = df_fp[grpby_title].drop_duplicates(subset=grpby_title)
df_title['title_name_match'] = df_title['title_name'].apply(lambda x: process.extractOne(x, df_imdb['title_name'].to_list(), score_cutoff=90))
name_from_df2_list = df_title['title_name_match'].to_list()
name_from_df2_list = [_[0] if _ != None else None for _ in name_from_df2_list]
df_title['title_name_imdb'] = name_from_df2_list
df_fp = df_fp.merge(df_title[grpby_title+['title_name_imdb']], on=grpby_title, how='left')
df_fp['premiere_date'] = pd.to_datetime(df_fp['premiere_date'])
df_fp['premiere_month'] = df_fp['premiere_date'].dt.to_period('M').dt.to_timestamp()


print(df_fp.isnull().sum())
print(df_imdb.isnull().sum())
df_imdb['season_number'] = df_imdb['season_number'].fillna(1)
df_imdb.loc[df_imdb.season_number.isin(['1A','1B']), ['season_number']] = 1
df_imdb['season_number'] = df_imdb['season_number'].astype(int)
df_imdb = df_imdb.rename(columns={'title_name':'title_name_imdb'})


df_fp_imdb = df_fp.merge(df_imdb[['title_name_imdb','imdb_id','program_type']], how='left', on=['title_name_imdb'])
display(df_fp_imdb.head())

df_fp_imdb.loc[df_fp_imdb.title_name=='The Matrix Resurrections', 'imdb_id'] = 'tt10838180'
df_fp_imdb.loc[df_fp_imdb.title_name=='The Matrix Resurrections', 'title_name_imdb'] = 'The Matrix Resurrections'
df_fp_imdb.loc[df_fp_imdb.title_name=='The Matrix Resurrections', 'program_type'] = 'movie'


df_fp_imdb.to_csv('s3://datascience-hbo-users/users/tjung/psi/future_program_imdb_id_full.csv')


title_name           0
tier                 0
season_number        0
category             0
premiere_date        0
title_name_imdb    695
premiere_month       0
dtype: int64
imdb_id          7
season_number    5
title_name       0
release_date     0
brand            0
program_type     0
dtype: int64


Unnamed: 0,title_name,tier,season_number,category,premiere_date,title_name_imdb,premiere_month,imdb_id,program_type
0,The Bridge,3,1,Unscripted Series,2021-02-11,,2021-02-01,,
1,Westworld,1,4,Scripted Drama Series,2022-06-26,Westworld S4,2022-06-01,tt0475784,series
2,Tig Notaro: Drawn,3,0,Specials,2021-07-24,,2021-07-01,,
3,Odo,3,3,Kids & Family,2022-04-07,,2022-04-01,,
4,The First Year,3,0,Documentary Features,2022-07-05,,2022-07-01,,


In [None]:
querystr = '''
select title_id,
original_title as title_name,
title_type,
production_release_date
from enterprise_data.catalog.imdb_title 
where ((production_release_date>='2021-11-01' and production_release_date<'2023-01-01') or production_release_date IS NULL)
and title_type in ('tvSeries', 'tvMiniSeries', 'movie')
;
'''

cursor_list = ctx.execute_string(
    querystr
    )
df_imdb_title = pd.DataFrame.from_records(cursor_list[-1].fetchall(), columns=[x[0] for x in cursor_list[-1].description])
df_imdb_title.columns= df_imdb_title.columns.str.lower()

df_fp_null = df_fp[(df_fp.title_name_imdb.isnull()) & (df_fp.premiere_month>='2021-11-01') & (df_fp.premiere_month<'2023-01-01')]
df_fp_null['title_name_match'] = df_fp_null['title_name'].apply(lambda x: process.extractOne(x, df_imdb_title['title_name'].to_list(), score_cutoff=90))
name_from_df2_list = df_fp_null['title_name_match'].to_list()
name_from_df2_list = [_[0] if _ != None else None for _ in name_from_df2_list]
df_fp_null['title_name_imdb_sf'] = name_from_df2_list

df_fp_null.to_csv('s3://datascience-hbo-users/users/tjung/psi/future_program_imdb_id_full_2022.csv')
# df_fp = df_fp.merge(df_fp_null[grpby_title+['title_name_imdb_sf']], on=grpby_title, how='left')
# df_fp['title_name_imdb'] = df_fp['title_name_imdb'].fillna(df_fp['title_name_imdb'])


In [42]:
df_fp_null.head()

Unnamed: 0,title_name,tier,season_number,category,premiere_date,title_name_imdb,premiere_month,title_name_match,title_name_imdb_sf
3,Odo,3,3,Kids & Family,2022-04-07,,2022-04-01,"(Anjo do Lodo, 90)",Anjo do Lodo
4,The First Year,3,0,Documentary Features,2022-07-05,,2022-07-01,"(The First Year, 100)",The First Year
5,Odo,3,2,Kids & Family,2022-02-10,,2022-02-01,"(Anjo do Lodo, 90)",Anjo do Lodo
6,"Stand Up, Yumi Chung",3,0,Kids & Family,2022-11-10,,2022-11-01,"(Stand Up, 90)",Stand Up
13,Frederick Douglas: In Five Speeches,3,0,Documentary Features,2022-02-23,,2022-02-01,"(Frederick, 90)",Frederick


In [43]:
1

1

In [24]:
df_fp_null = df_fp[(df_fp.title_name_imdb.isnull()) & (df_fp.premiere_month>='2021-11-01') & (df_fp.premiere_month<'2023-01-01')]
print(df_imdb_title.shape)
df_fp_null.shape

(186906, 4)


(231, 7)

## Future titles IMDB features

In [123]:
df_fp_conn = df_fp_imdb[(df_fp_imdb.premiere_date>='2021-09-01') & (df_fp_imdb.premiere_date<'2022-04-01')& (df_fp_imdb.imdb_id.notnull())]
display(df_fp_conn)
df_fp_conn.imdb_id.tolist()


Unnamed: 0,title_name,tier,season_number,category,premiere_date,title_name_imdb,imdb_id,program_type
3,Dune,1,0,Popcorn,2021-10-22,Dune,tt1160419,movie
18,Real Time,2,20,Unscripted Series,2022-01-21,Real Time with Bill Maher S18,tt0350448,series
34,Search Party,3,5,Scripted Comedy Series,2022-01-06,Search Party S4,tt5460226,series
85,How To With John Wilson,3,2,Unscripted Series,2021-11-26,How to With John Wilson S2,tt10801534,series
103,Moonshot,2,0,Scripted Features,2022-03-24,Moonshot,tt12585076,movie
111,Our Flag Means Death,2,1,Scripted Comedy Series,2022-02-24,Our Flag Means Death,tt11000902,series
145,The Sex Lives of College Girls,1,1,Scripted Comedy Series,2021-11-18,Sex Lives of College Girls,tt11212276,series
153,Landscapers,2,0,Scripted Drama Series,2021-12-06,Landscapers,tt11471892,series
198,Curb Your Enthusiasm,2,11,Scripted Comedy Series,2021-10-24,Curb Your Enthusiasm S10,tt0264235,series
227,And Just Like That,1,1,Scripted Comedy Series,2021-12-09,And Just Like That,tt13819960,series


['tt1160419',
 'tt0350448',
 'tt5460226',
 'tt10801534',
 'tt12585076',
 'tt11000902',
 'tt11212276',
 'tt11471892',
 'tt0264235',
 'tt13819960',
 'tt10838180',
 'tt9170108',
 'tt11285856',
 'tt12759100',
 'tt3215824',
 'tt9272514',
 'tt8634332',
 'tt7278862',
 'tt9620288',
 'tt8110232',
 'tt10574236',
 'tt8416494',
 'tt0063951',
 'tt5024912',
 'tt0293429',
 'tt10222764',
 'tt12564744',
 'tt1924245',
 'tt14128670',
 'tt1321510',
 'tt10244600',
 'tt11057226',
 'tt12682218',
 'tt12286260',
 'tt0063951',
 'tt7660850',
 'tt11847410',
 'tt3811906',
 'tt8772296',
 'tt13146488',
 'tt14825858',
 'tt4406178',
 'tt2887954',
 'tt3554046',
 'tt10380768',
 'tt0063951',
 'tt11540284',
 'tt6334354',
 'tt11947418']

In [181]:
querystr='''
select 
it.original_title,
it.title_id,
it.title_type,
it.genres,
imc.reference_type,
itr.original_title as reference_title,
itr.title_id as reference_title_id,
itr.title_type as reference_title_type,
imcr.reference_type as reference_referece_type
from enterprise_data.catalog.imdb_title it 
left join enterprise_data.catalog.imdb_movie_connection imc 
    on it.title_id = imc.title_id
left join enterprise_data.catalog.imdb_title itr 
    on itr.title_id = imc.reference_title_id
left join enterprise_data.catalog.imdb_movie_connection imcr
    on itr.title_id = imcr.title_id
    and imcr.reference_type in ('featured_in', 'spoofed_in')
where imc.reference_type in ('follows','spin_off_from','remake_of', 'version_of', 'featured_in')
and it.title_id in ('tt1160419',
 'tt0350448',
 'tt5460226',
 'tt10801534',
 'tt12585076',
 'tt11000902',
 'tt11212276',
 'tt11471892',
 'tt0264235',
 'tt13819960',
 'tt10838180',
 'tt9170108',
 'tt11285856',
 'tt12759100',
 'tt3215824',
 'tt9272514',
 'tt8634332',
 'tt7278862',
 'tt9620288',
 'tt8110232',
 'tt10574236',
 'tt8416494',
 'tt0063951',
 'tt5024912',
 'tt0293429',
 'tt10222764',
 'tt12564744',
 'tt1924245',
 'tt14128670',
 'tt1321510',
 'tt10244600',
 'tt11057226',
 'tt12682218',
 'tt12286260',
 'tt0063951',
 'tt7660850',
 'tt11847410',
 'tt3811906',
 'tt8772296',
 'tt13146488',
 'tt14825858',
 'tt4406178',
 'tt2887954',
 'tt3554046',
 'tt10380768',
 'tt0063951',
 'tt11540284',
 'tt6334354',
 'tt11947418')
;
'''

cursor_list = ctx.execute_string(
    querystr
    )
df = pd.DataFrame.from_records(cursor_list[-1].fetchall(), columns=[x[0] for x in cursor_list[-1].description])
df.columns= df.columns.str.lower()
df
df.to_csv('s3://datascience-hbo-users/users/tjung/psi/future_program_imdb_id_features.csv')

In [148]:
df_ref_ref.head()

reference_referece_type,title_name_imdb,genres,title_type,0,ref_ref_featured_in,ref_ref_spoofed_in
0,And Just Like That...,"[""Comedy"",""Drama"",""Romance""]",tvMiniSeries,,133.0,40.0
1,Cry Macho,"[""Drama"",""Thriller"",""Western""]",movie,1.0,,
2,Curb Your Enthusiasm,"[""Comedy""]",tvSeries,60.0,26.0,1.0
3,Doom Patrol,"[""Action"",""Adventure"",""Comedy"",""Drama"",""Myster...",tvSeries,5.0,7.0,
4,Dune,"[""Action"",""Adventure"",""Drama"",""Sci-Fi""]",movie,12.0,69.0,22.0


In [189]:
df_ref_ref[df_ref_ref.title_name_imdb=='Succession']

reference_referece_type,title_name_imdb,imdb_id,genres,title_type,0,ref_ref_featured_in,ref_ref_spoofed_in
20,Succession,tt7660850,"[""Drama""]",tvSeries,8.0,13.0,


In [188]:
### get imdb data for future titles.  
### features:  imdb_pg available; 
### identify if there're prequels:  reference_type = 'follows', 'remake_of','spin_off_from'
### identify popularity of prequels: 'featured_in'
df = df.fillna(0)
df = df.rename(columns={'original_title':'title_name_imdb','title_id':'imdb_id'})

grpby=['title_name_imdb','imdb_id','genres','title_type','reference_type']
df_ref = df.groupby(by=grpby).agg({'reference_title_id':'nunique'}).reset_index()
df_ref = df_ref.pivot(index=grpby[:-1], columns='reference_type', values='reference_title_id')\
            .reset_index()
df_ref = df_ref.rename(columns={'follows':'ref_follows',
                                'spin_off_from':'ref_spin_off_from','remake_of':'ref_remake_of',
                               'version_of':'ref_version_of', 'featured_in':'ref_featured_in'})


grpby=['title_name_imdb','imdb_id','genres','title_type','reference_referece_type']
df['imdb_title_id'] = df['imdb_id']
df_ref_ref = df.groupby(by=grpby).agg({'imdb_title_id':'count'}).reset_index()
df_ref_ref = df_ref_ref.pivot(index=grpby[:-1], columns='reference_referece_type', values='imdb_title_id')\
            .reset_index()
df_ref_ref = df_ref_ref.rename(columns={'featured_in':'ref_ref_featured_in','spoofed_in':'ref_ref_spoofed_in'})



# grpby_title= ['tier','content_category','category','title_name', 'title_id','season_number',
#         'program_type']
# df_actuals=pd.read_csv('s3://datascience-hbo-users/users/tjung/psi/fv_actual_0922.csv')
# df_actuals = df_actuals[['first_views'] + grpby_title].groupby(by=grpby_title).sum().reset_index()



col_ref = ['imdb_id', 'genres','ref_follows','ref_spin_off_from','ref_remake_of', 'ref_version_of', 'ref_featured_in']
col_ref_ref = ['imdb_id','ref_ref_featured_in','ref_ref_spoofed_in']

df_tot = df_fp_imdb.merge(df_ref[col_ref], how='left', on='imdb_id')\
                    .merge(df_ref_ref[col_ref_ref], how='left', on= 'imdb_id')
df_tot = df_tot.rename(columns={'imdb_id_x':'imdb_id', 'program_type':'content_category'})
print(df_tot.shape)
display(df_tot.isnull().sum())
display(df_tot.describe())

display(df_tot.head(2))
df_tot = df_tot.fillna(0)

df_tot.loc[(df_tot['ref_follows']>5), 'ref_follows'] = 1
# df_tot.to_csv('s3://datascience-hbo-users/users/tjung/psi/imdb_features_engineered_future_program.csv')
# df_tot[df_tot.title_name=='Succession']


(909, 16)


title_name               0
tier                     0
season_number            0
category                 0
premiere_date            0
title_name_imdb        694
imdb_id                697
content_category       694
genres                 856
ref_follows            890
ref_spin_off_from      905
ref_remake_of          903
ref_version_of         898
ref_featured_in        863
ref_ref_featured_in    867
ref_ref_spoofed_in     879
dtype: int64

Unnamed: 0,season_number,ref_follows,ref_spin_off_from,ref_remake_of,ref_version_of,ref_featured_in,ref_ref_featured_in,ref_ref_spoofed_in
count,909.0,19.0,4.0,6.0,11.0,46.0,42.0,30.0
mean,1.607261,1.789474,1.0,1.0,13.636364,33.130435,165.761905,106.7
std,4.772442,1.718492,0.0,0.0,7.47359,49.056253,202.55548,134.284348
min,0.0,1.0,1.0,1.0,2.0,1.0,7.0,1.0
25%,0.0,1.0,1.0,1.0,10.0,2.25,14.5,6.75
50%,1.0,1.0,1.0,1.0,18.0,9.5,102.0,36.5
75%,2.0,2.0,1.0,1.0,18.0,36.0,165.25,306.75
max,55.0,8.0,1.0,1.0,18.0,135.0,546.0,307.0


Unnamed: 0,title_name,tier,season_number,category,premiere_date,title_name_imdb,imdb_id,content_category,genres,ref_follows,ref_spin_off_from,ref_remake_of,ref_version_of,ref_featured_in,ref_ref_featured_in,ref_ref_spoofed_in
0,Westworld,1,4,Scripted Drama Series,2022-06-26,Westworld S4,tt0475784,series,,,,,,,,
1,House Of The Dragon,0,1,Scripted Drama Series,2022-08-21,House of the Dragon S1,tt11198330,series,,,,,,,,


In [191]:
df_tot[df_tot.title_name.str.contains('Sex')]

Unnamed: 0,title_name,tier,season_number,category,premiere_date,title_name_imdb,imdb_id,content_category,genres,ref_follows,ref_spin_off_from,ref_remake_of,ref_version_of,ref_featured_in,ref_ref_featured_in,ref_ref_spoofed_in
145,The Sex Lives of College Girls,1,1,Scripted Comedy Series,2021-11-18,Sex Lives of College Girls,tt11212276,series,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
480,The Sex Lives of College Girls,1,3,Scripted Comedy Series,2023-11-16,Sex Lives of College Girls,tt11212276,series,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
613,Sex and the City - BTS,2,0,Documentary Features,2022-01-27,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
614,Sex Diaries,3,0,Docu-Series,2022-05-06,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
847,The Sex Lives of College Girls,1,2,Scripted Comedy Series,2022-11-17,Sex Lives of College Girls,tt11212276,series,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
906,The Sex Lives of College Girls,1,4,Scripted Comedy Series,2024-11-14,Sex Lives of College Girls,tt11212276,series,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [190]:
df_tot[df_tot.title_name=='Succession']

Unnamed: 0,title_name,tier,season_number,category,premiere_date,title_name_imdb,imdb_id,content_category,genres,ref_follows,ref_spin_off_from,ref_remake_of,ref_version_of,ref_featured_in,ref_ref_featured_in,ref_ref_spoofed_in
662,Succession,1,3,Scripted Drama Series,2021-10-17,Succession S3,tt7660850,series,"[""Drama""]",0.0,0.0,0.0,0.0,11.0,13.0,0.0
697,Succession,1,4,Scripted Drama Series,2023-03-26,Succession S3,tt7660850,series,"[""Drama""]",0.0,0.0,0.0,0.0,11.0,13.0,0.0


## Past titles IMDB

In [135]:
querystr='''
select 
a.asset_title_long,
a.viewable_id,
a.imdb_id,
it.original_title,
it.title_id,
it.title_type,
it.genres,
imc.reference_type,
itr.original_title as reference_title,
itr.title_id as reference_title_id,
itr.title_type as reference_title_type,
imcr.reference_type as reference_referece_type
from max_prod.catalog.asset_dim a
join max_prod.catalog.reporting_asset_offering_dim raod
    on a.viewable_id = raod.viewable_id
left join enterprise_data.catalog.imdb_title it 
    on a.imdb_id = it.title_id
left join enterprise_data.catalog.imdb_movie_connection imc 
    on it.title_id = imc.title_id
left join enterprise_data.catalog.imdb_title itr 
    on itr.title_id = imc.reference_title_id
left join enterprise_data.catalog.imdb_movie_connection imcr
    on itr.title_id = imcr.title_id
    and imcr.reference_type in ('featured_in', 'spoofed_in')
where raod.brand = 'HBO MAX'
and raod.territory = 'HBO MAX DOMESTIC'
and raod.channel = 'HBO MAX SUBSCRIPTION'
and imc.reference_type in ('follows','spin_off_from','remake_of', 'version_of', 'featured_in')
;
'''

cursor_list = ctx.execute_string(
    querystr
    )
df = pd.DataFrame.from_records(cursor_list[-1].fetchall(), columns=[x[0] for x in cursor_list[-1].description])
df.columns= df.columns.str.lower()
df.to_csv('s3://datascience-hbo-users/users/tjung/psi/imdb_features.csv')

# df = pd.read_csv('s3://datascience-hbo-users/users/tjung/psi/imdb_features.csv')
# display(df.head(2))
# display(df.shape)
# display(df.groupby(by='reference_type').count())


In [154]:
df

Unnamed: 0,asset_title_long,viewable_id,imdb_id,original_title,title_id,title_type,genres,reference_type,reference_title,reference_title_id,reference_title_type,reference_referece_type
0,Batman: Gotham Knight,GXdu2VgbP-KXCPQEAADfT,tt1117563,Batman: Gotham Knight,tt1117563,video,"[""Animation"",""Action"",""Crime"",""Sci-Fi"",""Thrill...",spin_off_from,Batman Begins,tt0372784,movie,featured_in
1,Batman: Gotham Knight,GXdu2VgbP-KXCPQEAADfT,tt1117563,Batman: Gotham Knight,tt1117563,video,"[""Animation"",""Action"",""Crime"",""Sci-Fi"",""Thrill...",spin_off_from,Batman Begins,tt0372784,movie,featured_in
2,Batman: Gotham Knight,GXdu2VgbP-KXCPQEAADfT,tt1117563,Batman: Gotham Knight,tt1117563,video,"[""Animation"",""Action"",""Crime"",""Sci-Fi"",""Thrill...",spin_off_from,Batman Begins,tt0372784,movie,featured_in
3,Batman: Gotham Knight,GXdu2VgbP-KXCPQEAADfT,tt1117563,Batman: Gotham Knight,tt1117563,video,"[""Animation"",""Action"",""Crime"",""Sci-Fi"",""Thrill...",spin_off_from,Batman Begins,tt0372784,movie,featured_in
4,Batman: Gotham Knight,GXdu2VgbP-KXCPQEAADfT,tt1117563,Batman: Gotham Knight,tt1117563,video,"[""Animation"",""Action"",""Crime"",""Sci-Fi"",""Thrill...",spin_off_from,Batman Begins,tt0372784,movie,featured_in
...,...,...,...,...,...,...,...,...,...,...,...,...
122220,The Champ,GXsVBnAg6m0CxkQEAAABB,tt0078950,The Champ,tt0078950,movie,"[""Drama"",""Sport""]",remake_of,The Champ,tt0021730,movie,featured_in
122221,The Champ,GXsVBnAg6m0CxkQEAAABB,tt0078950,The Champ,tt0078950,movie,"[""Drama"",""Sport""]",remake_of,The Champ,tt0021730,movie,featured_in
122222,The Champ,GXsVBnAg6m0CxkQEAAABB,tt0078950,The Champ,tt0078950,movie,"[""Drama"",""Sport""]",remake_of,The Champ,tt0021730,movie,spoofed_in
122223,The Champ,GXsVBnAg6m0CxkQEAAABB,tt0078950,The Champ,tt0078950,movie,"[""Drama"",""Sport""]",remake_of,The Champ,tt0021730,movie,spoofed_in


In [131]:
grpby=['asset_title_long','viewable_id','imdb_id','genres','title_type','reference_referece_type']
df_ref_ref = df.groupby(by=grpby).agg({'title_id':'count'}).reset_index()
df_ref_ref = df_ref_ref.pivot(index=grpby[:-1], columns='reference_referece_type', values='title_id')\
            .reset_index()


df_ref_ref

reference_referece_type,asset_title_long,viewable_id,imdb_id,genres,title_type,featured_in,spoofed_in
0,'Tis the Season to Be Smurfy,GX3Uk3gHbrBmDbAEAAAKX,tt0198267,"[""Animation"",""Short"",""Adventure"",""Comedy"",""Dra...",tvShort,40.0,30.0
1,*batteries not included,GXkV4_wx4nsPDwwEAABNU,tt0092494,"[""Comedy"",""Family"",""Fantasy"",""Sci-Fi""]",movie,1.0,
2,-30-,GVU3KIw6iXFFvjSoJAXFK,tt0977179,"[""Crime"",""Drama"",""Thriller""]",tvEpisode,1.0,
3,...To Miss New Orleans,GVU4MGgf_uFFvjSoJAbgS,tt2593078,"[""Drama"",""Music""]",tvEpisode,1.0,
4,10 to Midnight,GXeq3xAVyMMPCwwEAAEcx,tt0085121,"[""Crime"",""Drama"",""Thriller""]",movie,2.0,
...,...,...,...,...,...,...,...
2883,Zoom and Bored,GXo0IGQxP7bLCwgEAABTh,tt0051228,"[""Animation"",""Family"",""Short"",""Comedy""]",short,68.0,2.0
2884,Zoom at the Top,GX5wjuQ39niGvfwEAAAA6,tt0056723,"[""Animation"",""Family"",""Short"",""Comedy""]",short,98.0,2.0
2885,berkman > block,GXJvjrAmO16gjKgEAAAHG,tt8890392,"[""Action"",""Comedy"",""Crime"",""Drama""]",tvEpisode,1.0,
2886,ronny/lily,GXHBPvwbY0IBThwEAAAA9,tt8956332,"[""Action"",""Comedy"",""Crime"",""Drama""]",tvEpisode,1.0,


In [136]:
### get imdb data for future titles.  
### features:  imdb_pg available; 
### identify if there're prequels:  reference_type = 'follows', 'remake_of','spin_off_from'
### identify popularity of prequels: 'featured_in'
df = df.fillna(0)

grpby=['asset_title_long','viewable_id','imdb_id','genres','title_type','reference_type']
df_ref = df.groupby(by=grpby).agg({'reference_title_id':'nunique'}).reset_index()
df_ref = df_ref.pivot(index=grpby[:-1], columns='reference_type', values='reference_title_id')\
            .reset_index()
df_ref = df_ref.rename(columns={'viewable_id':'title_id','follows':'ref_follows',
                                'spin_off_from':'ref_spin_off_from','remake_of':'ref_remake_of',
                               'version_of':'ref_version_of', 'featured_in':'ref_featured_in'})


grpby=['asset_title_long','viewable_id','imdb_id','genres','title_type','reference_referece_type']
df_ref_ref = df.groupby(by=grpby).agg({'title_id':'count'}).reset_index()
df_ref_ref = df_ref_ref.pivot(index=grpby[:-1], columns='reference_referece_type', values='title_id')\
            .reset_index()
df_ref_ref = df_ref_ref.rename(columns={'viewable_id':'title_id','featured_in':'ref_ref_featured_in','spoofed_in':'ref_ref_spoofed_in'})

grpby_title= ['tier','content_category','category','title_name', 'title_id','season_number',
        'program_type']
df_actuals=pd.read_csv('s3://datascience-hbo-users/users/tjung/psi/fv_actual_1025.csv')
df_actuals = df_actuals[['first_views'] + grpby_title].groupby(by=grpby_title).sum().reset_index()


col_ref = ['title_id', 'title_type', 'imdb_id', 'asset_title_long', 'genres','ref_follows','ref_spin_off_from','ref_remake_of', 'ref_version_of','ref_featured_in']
col_ref_ref = ['title_id','ref_ref_featured_in','ref_ref_spoofed_in']

df_tot = df_actuals.merge(df_ref[col_ref], how='left', on='title_id')\
                    .merge(df_ref_ref[col_ref_ref], how='left', on='title_id')
print(df_tot.shape)
display(df_tot.isnull().sum())
display(df_tot.describe())

display(df_tot.head(2))
df_tot = df_tot.fillna(0)


df_tot.loc[(df_tot['ref_follows']>5), 'ref_follows'] = 1
df_tot.to_csv('s3://datascience-hbo-users/users/tjung/psi/imdb_features_engineered.csv')



(346, 19)


tier                     0
content_category         0
category                 0
title_name               0
title_id                 0
season_number            0
program_type             0
first_views              0
title_type             265
imdb_id                265
asset_title_long       265
genres                 265
ref_follows            331
ref_spin_off_from      343
ref_remake_of          332
ref_version_of         336
ref_featured_in        269
ref_ref_featured_in    297
ref_ref_spoofed_in     320
dtype: int64

Unnamed: 0,tier,season_number,first_views,ref_follows,ref_spin_off_from,ref_remake_of,ref_version_of,ref_featured_in,ref_ref_featured_in,ref_ref_spoofed_in
count,346.0,346.0,346.0,15.0,3.0,14.0,10.0,77.0,49.0,26.0
mean,2.560694,2.820809,60651.34,4.533333,1.0,1.285714,10.2,7.285714,93.714286,38.538462
std,0.648801,9.097681,225374.0,9.272591,0.0,0.468807,14.62722,7.940332,237.981004,63.046796
min,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,2.0,0.0,1249.0,1.0,1.0,1.0,1.25,2.0,7.0,6.0
50%,3.0,1.0,5447.5,1.0,1.0,1.0,4.0,4.0,22.0,13.5
75%,3.0,1.0,28034.25,3.0,1.0,1.75,9.75,10.0,90.0,30.25
max,3.0,52.0,2727097.0,37.0,1.0,2.0,47.0,42.0,1650.0,234.0


Unnamed: 0,tier,content_category,category,title_name,title_id,season_number,program_type,first_views,title_type,imdb_id,asset_title_long,genres,ref_follows,ref_spin_off_from,ref_remake_of,ref_version_of,ref_featured_in,ref_ref_featured_in,ref_ref_spoofed_in
0,1,movies,Pay1,Godzilla vs. Kong,GYFEzmwNES16GkQEAAAAC,0,acquired,68095,movie,tt5034838,Godzilla vs. Kong,"[""Action"",""Sci-Fi"",""Thriller""]",3.0,,2.0,,16.0,145.0,18.0
1,1,movies,Pay1,Mortal Kombat,GYGYKfQwaKLheqwEAAAEC,0,acquired,57288,movie,tt0293429,Mortal Kombat,"[""Action"",""Adventure"",""Fantasy"",""Sci-Fi"",""Thri...",,1.0,1.0,,10.0,176.0,62.0


In [None]:
df[(df.asset_title_long.str.contains('The Matrix Reloaded')) & (df.reference_type=='featured_in')]

In [67]:
print(df_tot.columns)
display(df_tot[df_tot.category=='Popcorn'].groupby(by='reference_type').count())

display(df_tot['reference_title_id'].describe(percentiles=[0.25,0.5,0.75,0.8,0.9,0.95,0.99]))
display(df_tot[df_tot['category']=='Popcorn']['reference_title_id'].describe(percentiles=percents))
display(df_tot[(df_tot['category']=='Popcorn') & (df_tot['reference_type'].isin(list_ref))]['reference_title_id'].describe(percentiles=percents))


Index(['Unnamed: 0', 'title_id', 'title_name', 'season_number',
       'content_category', 'content_source', 'program_type', 'category',
       'tier', 'effective_start_date', 'request_date', 'premiere_ind',
       'asset_premiere_count', 'premiering_hours_runtime', 'first_views',
       'hours_viewed', 'days_since_premiere', 'days_on_platform',
       'finished_window_flag', 'asset_title_long', 'viewable_id', 'imdb_id',
       'genres', 'title_type', 'reference_type', 'reference_title_id'],
      dtype='object')


Unnamed: 0_level_0,Unnamed: 0,title_id,title_name,season_number,content_category,content_source,program_type,category,tier,effective_start_date,request_date,premiere_ind,asset_premiere_count,premiering_hours_runtime,first_views,hours_viewed,days_since_premiere,days_on_platform,finished_window_flag,asset_title_long,viewable_id,imdb_id,genres,title_type,reference_title_id
reference_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
edited_from,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32
edited_into,91,91,91,91,91,91,91,91,91,91,91,91,91,91,91,91,91,91,91,91,91,91,91,91,91
featured_in,541,541,541,541,541,541,541,541,541,541,541,541,541,541,541,541,541,541,541,541,541,541,541,541,541
features,225,225,225,225,225,225,225,225,225,225,225,225,225,225,225,225,225,225,225,225,225,225,225,225,225
followed_by,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32
follows,161,161,161,161,161,161,161,161,161,161,161,161,161,161,161,161,161,161,161,161,161,161,161,161,161
referenced_in,541,541,541,541,541,541,541,541,541,541,541,541,541,541,541,541,541,541,541,541,541,541,541,541,541
references,477,477,477,477,477,477,477,477,477,477,477,477,477,477,477,477,477,477,477,477,477,477,477,477,477
remake_of,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64
spin_off,156,156,156,156,156,156,156,156,156,156,156,156,156,156,156,156,156,156,156,156,156,156,156,156,156


count    25081.000000
mean         9.043978
std         19.055089
min          1.000000
25%          1.000000
50%          3.000000
75%          9.000000
80%         11.000000
90%         21.000000
95%         39.000000
99%        102.000000
max        180.000000
Name: reference_title_id, dtype: float64

count    2695.000000
mean       16.770315
std        34.740393
min         1.000000
1%          1.000000
5%          1.000000
10%         1.000000
25%         1.000000
50%         3.000000
75%        16.000000
90%        45.000000
95%        66.000000
99%       180.000000
100%      180.000000
max       180.000000
Name: reference_title_id, dtype: float64

count    734.000000
mean      11.096730
std       20.692649
min        1.000000
1%         1.000000
5%         1.000000
10%        1.000000
25%        1.000000
50%        3.000000
75%       12.000000
90%       21.000000
95%       26.000000
99%      102.000000
100%     102.000000
max      102.000000
Name: reference_title_id, dtype: float64

In [70]:
df_tot[(df_tot['category']=='Popcorn') & (df_tot.reference_type.isin(list_ref))]

Unnamed: 0.1,Unnamed: 0,title_id,title_name,season_number,content_category,content_source,program_type,category,tier,effective_start_date,request_date,premiere_ind,asset_premiere_count,premiering_hours_runtime,first_views,hours_viewed,days_since_premiere,days_on_platform,finished_window_flag,asset_title_long,viewable_id,imdb_id,genres,title_type,reference_type,reference_title_id
8645,5319,GYA79hQZbUsI3gQEAAAB0,The Little Things,0,movies,HBO Max,acquired,Popcorn,2,2021-01-29,2021-01-29,1,1,2.13,20408,950883.193,0,234,1,The Little Things,GYA79hQZbUsI3gQEAAAB0,tt10016180,"[""Crime"",""Drama"",""Mystery"",""Thriller""]",movie,references,3.0
8649,5320,GYA79hQZbUsI3gQEAAAB0,The Little Things,0,movies,HBO Max,acquired,Popcorn,2,2021-01-29,2021-01-30,0,0,0.00,109024,4390993.147,1,234,1,The Little Things,GYA79hQZbUsI3gQEAAAB0,tt10016180,"[""Crime"",""Drama"",""Mystery"",""Thriller""]",movie,references,3.0
8653,5321,GYA79hQZbUsI3gQEAAAB0,The Little Things,0,movies,HBO Max,acquired,Popcorn,2,2021-01-29,2021-01-31,0,0,0.00,106209,3745352.301,2,234,1,The Little Things,GYA79hQZbUsI3gQEAAAB0,tt10016180,"[""Crime"",""Drama"",""Mystery"",""Thriller""]",movie,references,3.0
8657,5322,GYA79hQZbUsI3gQEAAAB0,The Little Things,0,movies,HBO Max,acquired,Popcorn,2,2021-01-29,2021-02-01,0,0,0.00,52329,1747077.483,3,234,1,The Little Things,GYA79hQZbUsI3gQEAAAB0,tt10016180,"[""Crime"",""Drama"",""Mystery"",""Thriller""]",movie,references,3.0
8661,5323,GYA79hQZbUsI3gQEAAAB0,The Little Things,0,movies,HBO Max,acquired,Popcorn,2,2021-01-29,2021-02-02,0,0,0.00,26199,874480.246,4,234,1,The Little Things,GYA79hQZbUsI3gQEAAAB0,tt10016180,"[""Crime"",""Drama"",""Mystery"",""Thriller""]",movie,references,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41557,26208,GYGYKfQwaKLheqwEAAAEC,Mortal Kombat,0,movies,HBO Max,acquired,Popcorn,1,2021-04-23,2021-05-23,0,0,0.00,10429,383013.713,30,150,1,Mortal Kombat,GYGYKfQwaKLheqwEAAAEC,tt0293429,"[""Action"",""Adventure"",""Fantasy"",""Sci-Fi"",""Thri...",movie,remake_of,1.0
41558,26208,GYGYKfQwaKLheqwEAAAEC,Mortal Kombat,0,movies,HBO Max,acquired,Popcorn,1,2021-04-23,2021-05-23,0,0,0.00,10429,383013.713,30,150,1,Mortal Kombat,GYGYKfQwaKLheqwEAAAEC,tt0293429,"[""Action"",""Adventure"",""Fantasy"",""Sci-Fi"",""Thri...",movie,spin_off_from,1.0
41561,26209,GYGYKfQwaKLheqwEAAAEC,Mortal Kombat,0,movies,HBO Max,acquired,Popcorn,1,2021-04-23,2021-05-24,0,0,0.00,4060,136165.431,31,150,1,Mortal Kombat,GYGYKfQwaKLheqwEAAAEC,tt0293429,"[""Action"",""Adventure"",""Fantasy"",""Sci-Fi"",""Thri...",movie,references,12.0
41562,26209,GYGYKfQwaKLheqwEAAAEC,Mortal Kombat,0,movies,HBO Max,acquired,Popcorn,1,2021-04-23,2021-05-24,0,0,0.00,4060,136165.431,31,150,1,Mortal Kombat,GYGYKfQwaKLheqwEAAAEC,tt0293429,"[""Action"",""Adventure"",""Fantasy"",""Sci-Fi"",""Thri...",movie,remake_of,1.0
