In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up on the next cell run without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [228]:
import numpy as np
import pandas as pd
from toolz import concatv, first, peek
from itertools import starmap

import json
from pathlib import Path
import gzip

In [10]:
from nbc_analysis.utils.debug_utils import runit
from nbc_analysis.utils.config_utils import get_config
from nbc_analysis.analysis.generte_profiles import main as gen_profile
from nbc_analysis.transforms.merge_video_ends import main as merge_video_ends

In [229]:
from cortex_common.types import EntityEvent, StringAttributeValue, ListAttributeValue

In [11]:
# Point the default config at the batch files kept in the shared Dropbox folder.
# The folder is resolved from the current user's home directory rather than a
# hardcoded /Users/<name> prefix, so the cell is not tied to one machine.
# NOTE(review): assumes the Dropbox folder sits directly under the home
# directory - confirm on non-macOS setups.
config = get_config(
    config_f='default',
    overrides={
        'BATCHES_D': str(
            Path.home()
            / 'Dropbox (Cognitive Scale)'
            / 'NBC Analysis'
            / 'data'
            / 'NBC2'
            / 'batches'
        ),
    },
)

>> created example config file '/Users/wmcabee/.config/nbc_analysis/config_example.yaml'
>> Using default config


In [15]:
%%time
# Build the merged video-end events frame used by every cell below.
# Per the recorded output this loads the batch files, indexes them on
# {mpid, event_start_unixtime_ms}, and took about 1.5 minutes of wall time.
# NOTE(review): the keyword is named `config_f` but receives the config object
# returned by get_config - confirm that is the intended usage.
DATA = merge_video_ends(config_f=config)

>> Loading batches,batch_cnt=129,indir=/Users/wmcabee/Dropbox (Cognitive Scale)/NBC Analysis/data/NBC2/batches
>> Building index on {mpid,event_start_unixtime_ms}
>> Sorting index
>> end merge ve_events,outfile=/Users/wmcabee/DATA/NBC2/work/merged_ve_events.parquet,records=6265392
CPU times: user 1min 21s, sys: 10 s, total: 1min 31s
Wall time: 1min 24s


In [140]:
# Seed the global NumPy RNG so the mock scores generated later are repeatable.
np.random.seed(7)
df = DATA[['show', 'genre']].reset_index()

# Vocabularies to sample from: distinct genres (in data order) and distinct
# shows (sorted alphabetically).
GENRES = df.genre.drop_duplicates(keep='last').tolist()
SHOWS = df.show.drop_duplicates(keep='last').sort_values().tolist()

# Derive a keyword vocabulary from the words of every genre and show title.
# filter(None, ...) drops missing/empty labels before they are split.
reader = filter(None, concatv(GENRES, SHOWS))
# Flatten label -> words with a nested generator; no extra toolz helper needed.
reader = (word for label in reader for word in label.split(' '))
# Keep purely alphabetic tokens, lower-cased.
reader = (word.lower() for word in reader if word.isalpha())
# Sort the de-duplicated words: plain set order depends on string hashing and
# changes between kernels, which would make the seeded sampling unrepeatable.
KEYWORDS = sorted(set(reader))


In [156]:
# Viewer ids for the mock profiles: the `mpid` values of the first 2000 rows
# of DATA, shuffled (sample(frac=1) returns every row in random order).
MPIDS = (
    DATA[:2000]
    .reset_index()['mpid']
    .sample(frac=1)
    .tolist()
)

In [249]:
def mock_scores(choices):
    """Draw a random handful of ``(choice, score)`` pairs, highest score first.

    Uses the global NumPy RNG (``np.random``); seed it beforehand for
    repeatable output.

    Parameters
    ----------
    choices : sequence
        Candidate labels, sampled without replacement. Entries equal to ``''``
        or ``None`` may be drawn but are dropped from the result.

    Returns
    -------
    list of tuple
        ``(choice, score)`` pairs sorted by descending score. Scores are
        uniform draws from [0, 1) rounded to 5 decimals. Between 5 and 9
        draws are made, fewer when ``choices`` is shorter than the drawn
        count; filtered blank entries shorten the result further. An empty
        ``choices`` yields ``[]``.
    """
    # randint's upper bound is exclusive, so this asks for 5..9 items. Cap it
    # at the pool size: sampling without replacement raises ValueError when
    # more items are requested than `choices` contains.
    count = min(np.random.randint(5, 10), len(choices))
    if count == 0:
        return []

    picked = np.random.choice(choices, count, replace=False)
    scores = np.round(np.random.rand(count), 5)
    pairs = [
        (label, score)
        for label, score in zip(picked, scores)
        if label not in {'', None}
    ]
    return sorted(pairs, key=lambda pair: -pair[1])
  

In [250]:
def mock_profile(mpid):
    """Assemble one mock viewer profile for ``mpid``.

    Returns a dict with the given ``mpid`` plus ``genres``, ``shows`` and
    ``keywords`` entries, each a scored list produced by ``mock_scores`` from
    the matching module-level vocabulary.
    """
    # Draw in this fixed order (genres, shows, keywords) so the sequence of
    # RNG calls stays the same from run to run.
    genre_scores = mock_scores(GENRES)
    show_scores = mock_scores(SHOWS)
    keyword_scores = mock_scores(KEYWORDS)
    return {
        'mpid': mpid,
        'genres': genre_scores,
        'shows': show_scores,
        'keywords': keyword_scores,
    }

In [263]:
def write_dicts2json(inputs, outfile):
    """Write records to a gzip-compressed JSON Lines file.

    Parameters
    ----------
    inputs : iterable
        JSON-serializable objects; each one becomes a single line.
    outfile : str or path-like
        Destination path; the file is created or overwritten.

    Prints a one-line summary with the destination and the record count.
    """
    # Serialize everything first so the record count is known for the summary.
    lines = [json.dumps(record) + "\n" for record in inputs]

    with gzip.open(str(outfile), 'wt') as fh:
        fh.writelines(lines)

    # Report the `outfile` argument, i.e. the path that was actually written.
    print(f"wrote file={outfile},len={len(lines)}")

In [264]:
# Generate one mock profile per viewer id, save them as gzipped JSON lines in
# the working directory, then show the first profile as an example.
INPUTS = [mock_profile(mpid) for mpid in MPIDS]
outfile = Path('.', 'ex_from_cs1.json.gz')
write_dicts2json(outfile=outfile, inputs=INPUTS)
print('example:')
INPUTS[0]

wrote file=ex_profile_input.json.gz,len=2000
example:


{'mpid': -9220011668977242260,
 'genres': [('Drama', 0.96549),
  ('Sports', 0.9191),
  ('Reality and Game Show', 0.59785),
  ('Comedy', 0.53361),
  ('Lifestyle and Fashion', 0.38334),
  ('Family and Kids', 0.36383)],
 'shows': [('Piranhaconda', 0.90077),
  ('Aaron Hernandez, McNair & Dele: Dangerous Games', 0.88741),
  ('Squawk Alley', 0.88174),
  ('Jodi Arias: In Defense Of', 0.60999),
  ('Tremors: A Cold Day in Hell', 0.34893),
  ('NBC News Specials', 0.284),
  ('Ice Loves Coco', 0.05726)],
 'keywords': [('drive', 0.78391),
  ('method', 0.67907),
  ('high', 0.635),
  ('copperhead', 0.57751),
  ('attack', 0.30268),
  ('funny', 0.24901),
  ('love', 0.19209)]}

## Convert to Profile-of-One (po1) input format

In [260]:
def mock_po1(mpid, attr, items):
    """Turn one scored attribute of a mock profile into an entity-event dict.

    Parameters
    ----------
    mpid : viewer id; stored as a string in ``entityId``.
    attr : attribute name; used as the event name.
    items : iterable of ``(value, weight)`` pairs.

    Returns
    -------
    dict
        ``dict(event)`` of an ``EntityEvent`` of type ``"nbc/Viewer"`` whose
        ``properties['value']`` is a ``ListAttributeValue`` wrapping one
        ``StringAttributeValue`` per pair.

    Any exception is re-raised after printing the current ``items``.
    """
    try:
        weighted = [
            StringAttributeValue(value=label, weight=score)
            for label, score in items
        ]
        # Rebind `items` on purpose: the diagnostic print in the handler shows
        # the raw pairs if wrapping failed, or the wrapped list if a later
        # step failed.
        items = ListAttributeValue(value=weighted)

        event = EntityEvent(
            event=attr,
            entityId=str(mpid),
            entityType="nbc/Viewer",
            properties={'value': items},
            meta={},
        )
        return dict(event)
    except Exception:
        print(items)
        raise
        
# Emit one event per (profile, attribute) pair, skipping the 'mpid' key itself
# because it identifies the viewer rather than carrying scored items.
PROFILE_INPUTS = [
    mock_po1(profile['mpid'], attr, items)
    for profile in INPUTS
    for attr, items in profile.items()
    if attr != 'mpid'
]
PROFILE_INPUTS[0]

{'event': 'genres',
 'entityId': '-9220011668977242260',
 'entityType': 'nbc/Viewer',
 'properties': {'value': {'value': [{'value': 'Sci Fi and Fantasy',
     'weight': 0.67676,
     'context': 'cortex/attribute-value-string'},
    {'value': 'Live Events and Specials',
     'weight': 0.65635,
     'context': 'cortex/attribute-value-string'},
    {'value': 'Horror and Thriller',
     'weight': 0.505,
     'context': 'cortex/attribute-value-string'},
    {'value': 'Lifestyle and Fashion',
     'weight': 0.37419,
     'context': 'cortex/attribute-value-string'},
    {'value': 'News and Information',
     'weight': 0.28834,
     'context': 'cortex/attribute-value-string'},
    {'value': 'Sports',
     'weight': 0.14857,
     'context': 'cortex/attribute-value-string'},
    {'value': 'Political',
     'weight': 0.14009,
     'context': 'cortex/attribute-value-string'},
    {'value': 'Action and Adventure',
     'weight': 0.13853,
     'context': 'cortex/attribute-value-string'}],
   'contex

In [265]:
# Save the converted events as gzipped JSON lines in the working directory.
outfile = Path('.', 'ex_profile_input.json.gz')
write_dicts2json(outfile=outfile, inputs=PROFILE_INPUTS)

wrote file=ex_profile_input.json.gz,len=6000


In [216]:
# Scratch check: construct a StringAttributeValue directly to see which
# fields it fills in by default (the recorded repr shows a `context` value).
StringAttributeValue(value='x', weight=3)

StringAttributeValue(value='x', weight=3, context='cortex/attribute-value-string')