In [314]:
%reload_ext autoreload
%autoreload 2

from pathlib import Path
import pandas as pd
from numpy.random import randint
import numpy as np
from toolz import first, merge

from itertools import groupby

import json

In [30]:
files = pd.Series(indir.glob('*'), name='files')

In [162]:
spec = {
    'early_morning': [ 1,2,3,4,5,6],
    'morning': [7,8,9,10,11,12],
    'afternoon': [13,14,15,16,],
    'evening': [17,18,19,20],
    'late_night' : [21,22,23, 0],
}
TIME_PERIODS = {hour: tp  for tp, hours in spec.items() for hour in hours}
TIME_PERIODS

{1: 'early_morning',
 2: 'early_morning',
 3: 'early_morning',
 4: 'early_morning',
 5: 'early_morning',
 6: 'early_morning',
 7: 'morning',
 8: 'morning',
 9: 'morning',
 10: 'morning',
 11: 'morning',
 12: 'morning',
 13: 'afternoon',
 14: 'afternoon',
 15: 'afternoon',
 16: 'afternoon',
 17: 'evening',
 18: 'evening',
 19: 'evening',
 20: 'evening',
 21: 'late_night',
 22: 'late_night',
 23: 'late_night',
 0: 'late_night'}

In [32]:
%%time 
indir = Path('/Users/wmcabee/Dropbox (Cognitive Scale)/NBC Analysis/data/NBC2/batches')

reader = indir.glob('*')
reader = map(pd.read_parquet, reader)
DATA = pd.concat(reader)

CPU times: user 21.1 s, sys: 7.8 s, total: 28.9 s
Wall time: 22 s


In [295]:
print('row_count:', len(DATA))

row_count: 6265392


In [222]:
def remove_bad_genres(df):
    genre = df.genre.value_counts(dropna=False)
    remove = {'None','', 'Not Set'}
    mask = df.genre.isin(remove)
    return  df[~mask].copy()

In [222]:
def mock_segments(df):
    M = df.groupby(['time_period','genre','segment']).size().unstack()
    df = M.sum(level=0)
    df = M.div(df, level=0)
    df = df.stack().to_frame('likelihood')

    df = df.round(2).reset_index().sort_index()
    df = df.set_index(['segment','time_period','genre']).likelihood
    reader = df.groupby(level=[0,1], )
    reader = ( (segment, time_period , v.droplevel([0,1]).to_dict()) for (segment, time_period), v in reader)
    reader = groupby(reader, key=lambda x: x[0] )
    return {g: { tp : periods for seg, tp, periods in v} for g, v in reader}

In [65]:
SEGMENTS = { segment_id : f'segment{segment_id:03d}' for segment_id in range(10)}
SEGMENTS

{0: 'segment000',
 1: 'segment001',
 2: 'segment002',
 3: 'segment003',
 4: 'segment004',
 5: 'segment005',
 6: 'segment006',
 7: 'segment007',
 8: 'segment008',
 9: 'segment009'}

In [279]:
np.random.seed(100)
df = DATA.sample(20000).drop_duplicates('mpid')
df['segment_id'] =  randint(0,10,len(df))
df['segment'] = df.segment_id.map(SEGMENTS)
df['event_start_dt'] = pd.to_datetime(df.event_start_unixtime_ms.astype(np.int).div(1000), unit="s")
df['video_end_hour'] = df['event_start_dt'].dt.hour
df['time_period'] = df.video_end_hour.map(TIME_PERIODS)
df = remove_bad_genres(df)
segment_lkup = mock_segments(df)
df['segment_detail'] = df.segment.map(segment_lkup)
SAMPLE = df

In [310]:
df = SAMPLE[SAMPLE.ip =='24.186.72.123'].to_dict()
df

{'batch_id': {17035: 've_20190703_0010'},
 'file': {17035: 'NBC_20190703110951049147_90249.txt'},
 'file_idx': {17035: 3991},
 'asof_dt': {17035: '2019-10-01T21:44:15.339449Z'},
 'mpid': {17035: -3215838812283921027},
 'nbc_profile': {17035: 'Unauthenticated'},
 'mvpd': {17035: 'Unauthenticated'},
 'event_name': {17035: 'Video End'},
 'event_type': {17035: 'custom_event'},
 'platform': {17035: 'iOS'},
 'data_connection_type': {17035: 'wifi'},
 'ip': {17035: '24.186.72.123'},
 'video_id': {17035: '3980294'},
 'video_type': {17035: 'Full Episode'},
 'show': {17035: 'Days of our Lives'},
 'season': {17035: '54'},
 'episode_number': {17035: '196'},
 'episode_title': {17035: 'Tuesday, July 2, 2019'},
 'genre': {17035: 'Drama'},
 'video_duration': {17035: '37'},
 'video_end_type': {17035: 'Background'},
 'resume': {17035: 'False'},
 'event_id': {17035: '6734797442868504336'},
 'session_id': {17035: '-1980859604918654925'},
 'video_duration_watched': {17035: '29'},
 'event_start_unixtime_ms':

In [312]:
ip = '24.186.72.123'
reader = SAMPLE.itertuples()
reader = ({'mpid': x.mpid, 'segment': {x.segment: x.segment_detail} }for x in reader if x.ip == ip )
MPIDS = list(reader)[0]
MPIDS

{'mpid': -3215838812283921027,
 'segment': {'segment003': {'afternoon': {'Action and Adventure': 0.01,
    'Comedy': 0.06,
    'Drama': 0.55,
    'Family and Kids': 0.01,
    'News and Information': 0.13,
    'Reality and Game Show': 0.2,
    'Soap Opera': 0.0,
    'Talk and Interview': 0.03},
   'early_morning': {'Action and Adventure': 0.0,
    'Comedy': 0.08,
    'Drama': 0.5,
    'Family and Kids': 0.0,
    'News and Information': 0.12,
    'Political': 0.0,
    'Reality and Game Show': 0.26,
    'Sci Fi and Fantasy': 0.0,
    'Talk and Interview': 0.02},
   'evening': {'Action and Adventure': 0.0,
    'Comedy': 0.11,
    'Drama': 0.52,
    'Family and Kids': 0.01,
    'News and Information': 0.12,
    'Reality and Game Show': 0.21,
    'Sci Fi and Fantasy': 0.0,
    'Talk and Interview': 0.02},
   'late_night': {'Action and Adventure': 0.0,
    'Comedy': 0.1,
    'Crime and Mystery': 0.0,
    'Drama': 0.52,
    'Family and Kids': 0.01,
    'News and Information': 0.12,
    'Politi

In [316]:
# info derived from ip - from notebooks/geo/network
ip_info ={'network': '24.186.72.0/21',
 'geoname_id': 5120741.0,
 'registered_country_geoname_id': 6252001.0,
 'represented_country_geoname_id': np.NaN,
 'is_anonymous_proxy': 0,
 'is_satellite_provider': 0,
 'postal_code': '12528',
 'latitude': 41.7167,
 'longitude': -73.9928,
 'accuracy_radius': 5.0,
 'locale_code': 'en',
 'continent_code': np.NaN,
 'continent_name': 'North America',
 'country_iso_code': 'US',
 'country_name': 'United States',
 'subdivision_1_iso_code': 'NY',
 'subdivision_1_name': 'New York',
 'subdivision_2_iso_code': np.NaN,
 'subdivision_2_name': np.NaN,
 'city_name': 'Highland',
 'metro_code': 501.0,
 'time_zone': 'America/New_York',
 'is_in_european_union': 0}


In [317]:
# info derived from postcal_code
loc_info = {'occup_housing_units': '4937',
            'median_income': '71857',
            'median_costs': '1382'}

In [350]:
ip_extended =  merge(loc_info, ip_info )
loc_demograph = { k: ip_extended[k] for  k in  {
    'network','postal_code','city_name', 'subdivision_1_name', 'time_zone', 'occup_housing_units','median_income', 'median_costs',  } }
loc_demograph['education_level'] = 'PhD'
loc_demograph['state'] = loc_demograph.pop('subdivision_1_name', )
loc_demograph['occup_housing_units'] = int(loc_demograph['occup_housing_units'])
loc_demograph['median_costs'] = int(loc_demograph['median_costs'])
loc_demograph['median_income'] = int(loc_demograph['median_income'])


In [351]:
example['loc_demograph'] = loc_demograph

In [352]:
with open('response.json', 'w') as fh:
    json.dump(example, fh, indent=4)

In [353]:
example

{'mpid': 4860154620719807383,
 'segment': {'segment005': {'afternoon': {'Comedy': 0.06,
    'Drama': 0.57,
    'Family and Kids': 0.01,
    'News and Information': 0.14,
    'Reality and Game Show': 0.22},
   'early_morning': {'Action and Adventure': 0.0,
    'Celebrity and Gossip': 0.0,
    'Comedy': 0.1,
    'Drama': 0.51,
    'Family and Kids': 0.0,
    'Live Events and Specials': 0.0,
    'News and Information': 0.15,
    'Reality and Game Show': 0.22,
    'Sci Fi and Fantasy': 0.0,
    'Talk and Interview': 0.02},
   'evening': {'Action and Adventure': 0.0,
    'Comedy': 0.09,
    'Crime and Mystery': 0.0,
    'Drama': 0.49,
    'Family and Kids': 0.02,
    'News and Information': 0.15,
    'Reality and Game Show': 0.24,
    'Talk and Interview': 0.01},
   'late_night': {'Comedy': 0.13,
    'Drama': 0.51,
    'Family and Kids': 0.01,
    'Horror and Thriller': 0.0,
    'Live Events and Specials': 0.0,
    'News and Information': 0.11,
    'Reality and Game Show': 0.23,
    'Sci Fi