In [None]:
%reload_ext autoreload
%autoreload 2

In [10]:
import numpy as np
import pandas as pd
from toolz import concat, concatv, first, peek
from itertools import starmap

import json
from pathlib import Path
import gzip

In [5]:
from nbc_analysis.utils.debug_utils import runit
from nbc_analysis.utils.config_utils import get_config
from nbc_analysis.analysis.gen_profiles import main as gen_profile
from nbc_analysis.transforms.merge_video_ends import main as merge_video_ends

In [6]:
from cortex_common.types import EntityEvent, StringAttributeValue, ListAttributeValue

In [7]:
config = get_config(config_f='default',
                      overrides={'BATCHES_D': '/Users/wmcabee/Dropbox (Cognitive Scale)/NBC Analysis/data/NBC2/batches'})

>> created example config file '/Users/wmcabee/.config/nbc_analysis/config_example.yaml'
>> Using default config


In [8]:
%%time
DATA = merge_video_ends(config_f=config)

>> Loading batches,batch_cnt=129,indir=/Users/wmcabee/Dropbox (Cognitive Scale)/NBC Analysis/data/NBC2/batches
>> Building index on {mpid,event_start_unixtime_ms}
>> Sorting index
>> end merge ve_events,outfile=/Users/wmcabee/DATA/NBC2/work/merged_ve_events.parquet,records=6265392
CPU times: user 1min 18s, sys: 7.26 s, total: 1min 25s
Wall time: 1min 19s


In [11]:
np.random.seed(7)
df= DATA[['show','genre']]
#df = df[:200]
df = df.reset_index()
GENRES = df.genre.drop_duplicates(keep='last').tolist()
SHOWS = df.show.drop_duplicates(keep='last').sort_values().tolist()

reader = filter(None, concatv(GENRES, SHOWS))
reader = concat(map(lambda x: x.split(' '),reader ))
reader = (x.lower() for x in reader if x.isalpha())
KEYWORDS = list(set(reader))


In [12]:
MPIDS = DATA[:2000].reset_index().mpid.sample(frac=1).tolist()

In [13]:
def mock_scores(choices):
    count = np.random.randint(5,10)
    reader = zip(np.random.choice(choices,count, replace=False ),
              np.round(np.random.rand(count),5) )
    reader = filter(lambda x: x[0] not in {'', None}, reader)
    return sorted(reader, key=lambda x: -x[1])
  

In [14]:
def mock_profile(mpid):
    return dict(mpid=mpid,
                genres=mock_scores(GENRES),
                shows=mock_scores(SHOWS), 
                keywords=mock_scores(KEYWORDS))

In [17]:
def write_dicts2json(inputs, outfile):
    reader = map(json.dumps, inputs)
    reader = map(lambda x: x+"\n", reader)
    reader = list(reader)


    with gzip.open(str(outfile), 'wt') as fh:
        fh.writelines(reader )
    print(f"wrote file={outfile},len={len(reader)}")

In [18]:
outfile = Path('.') / 'ex_from_cs1.json.gz'     
INPUTS = list(map(mock_profile, MPIDS))
write_dicts2json(inputs=INPUTS, outfile=outfile)
print('example:')
INPUTS[0]

wrote file=ex_from_cs1.json.gz,len=2000
example:


{'mpid': -9220657203495870507,
 'genres': [('Political', 0.91858),
  ('Soap Opera', 0.74657),
  ('Lifestyle and Fashion', 0.74127),
  ('Crime and Mystery', 0.54787),
  ('Sports', 0.35383),
  ('Comedy', 0.31883),
  ('Talk and Interview', 0.11153),
  ('Action and Adventure', 0.0403)],
 'shows': [('I Feel Bad', 0.88573),
  ('House of the Witch', 0.83055),
  ('The Real Housewives of Beverly Hills', 0.82493),
  ('AM Joy', 0.45652),
  ('State of Affairs', 0.38617)],
 'keywords': [('saved', 0.75176),
  ('lady', 0.74697),
  ('jackson', 0.3372),
  ('cromarties', 0.32888),
  ('arctic', 0.1592)]}

## Convert to profile of 1 input format

In [19]:
def mock_po1( mpid, attr, items):
    
    try: 
        items = ListAttributeValue(value=list(StringAttributeValue(value=value, weight=weight) for value, weight in items))
    
        event = EntityEvent(
              event=attr,
              entityId=str(mpid),
              entityType="nbc/Viewer",
              properties={'value': items},
              meta= {}
        )
        return dict(event)
    except Exception as e:
        print(items)
        raise
        
reader = ( (msg['mpid'], attr, items) for msg in INPUTS for attr, items in msg.items() if attr != 'mpid')
reader = starmap(mock_po1, reader)
PROFILE_INPUTS = list(reader)
PROFILE_INPUTS[0]

{'event': 'genres',
 'entityId': '-9220657203495870507',
 'entityType': 'nbc/Viewer',
 'properties': {'value': {'value': [{'value': 'Political',
     'weight': 0.91858,
     'context': 'cortex/attribute-value-string'},
    {'value': 'Soap Opera',
     'weight': 0.74657,
     'context': 'cortex/attribute-value-string'},
    {'value': 'Lifestyle and Fashion',
     'weight': 0.74127,
     'context': 'cortex/attribute-value-string'},
    {'value': 'Crime and Mystery',
     'weight': 0.54787,
     'context': 'cortex/attribute-value-string'},
    {'value': 'Sports',
     'weight': 0.35383,
     'context': 'cortex/attribute-value-string'},
    {'value': 'Comedy',
     'weight': 0.31883,
     'context': 'cortex/attribute-value-string'},
    {'value': 'Talk and Interview',
     'weight': 0.11153,
     'context': 'cortex/attribute-value-string'},
    {'value': 'Action and Adventure',
     'weight': 0.0403,
     'context': 'cortex/attribute-value-string'}],
   'context': 'cortex/attribute-value-l

In [20]:
outfile = Path('.') / 'ex_profile_input.json.gz'     
write_dicts2json(inputs=PROFILE_INPUTS, outfile=outfile)

wrote file=ex_profile_input.json.gz,len=6000


In [21]:
StringAttributeValue(value='x', weight=3)

StringAttributeValue(value='x', weight=3, context='cortex/attribute-value-string')