In [2]:
%reload_ext autoreload
%autoreload 2

In [3]:
import numpy as np
import pandas as pd
from toolz import concat, concatv, first, peek
from itertools import starmap

import json
from pathlib import Path
import gzip

In [4]:
from nbc_analysis.utils.debug_utils import runit
from nbc_analysis.utils.config_utils import get_config
from nbc_analysis.analysis.gen_profiles import main as gen_profile
from nbc_analysis.transforms.merge_video_ends import main as merge_video_ends

In [5]:
from cortex_common.types import EntityEvent, StringAttributeValue, ListAttributeValue

In [6]:
# Location of the raw batch files. Built from the current user's home directory
# instead of a literal '/Users/<name>/...' string so the notebook is not tied to
# one machine; on the original workstation this resolves to the same path.
BATCHES_DIR = (Path.home() / 'Dropbox (Cognitive Scale)' / 'NBC Analysis'
               / 'data' / 'NBC2' / 'batches')

# Load the default configuration, overriding only the batch directory.
config = get_config(config_f='default',
                    overrides={'BATCHES_D': str(BATCHES_DIR)})

>> created example config file '/Users/wmcabee/.config/nbc_analysis/config_example.yaml'
>> Using default config


In [7]:
%%time
# Merge the video-end events from all batches into a single frame.
# Per the recorded run output this loads 129 batches, builds and sorts an index on
# {mpid, event_start_unixtime_ms}, writes merged_ve_events.parquet (6,265,392 records)
# and takes roughly 1 min 20 s of wall time.
# NOTE(review): the already-loaded config object is passed as `config_f` -- confirm
# that merge_video_ends accepts a config object there and not only a config name.
DATA = merge_video_ends(config_f=config)

>> Loading batches,batch_cnt=129,indir=/Users/wmcabee/Dropbox (Cognitive Scale)/NBC Analysis/data/NBC2/batches
>> Building index on {mpid,event_start_unixtime_ms}
>> Sorting index
>> end merge ve_events,outfile=/Users/wmcabee/DATA/NBC2/work/merged_ve_events.parquet,records=6265392
CPU times: user 1min 19s, sys: 7.16 s, total: 1min 26s
Wall time: 1min 19s


In [8]:
# Seed the global NumPy RNG so the mock data generated further down is repeatable.
np.random.seed(7)

# Vocabulary of distinct genres and shows seen in the merged events.
df = DATA[['show', 'genre']].reset_index()
GENRES = df.genre.drop_duplicates(keep='last').tolist()
SHOWS = df.show.drop_duplicates(keep='last').sort_values().tolist()

# Keyword vocabulary: every purely alphabetic, lower-cased word that appears in a
# genre or show name. Falsy entries (None / '') are skipped before splitting.
reader = filter(None, concatv(GENRES, SHOWS))
reader = concat(name.split(' ') for name in reader)
reader = (token.lower() for token in reader if token.isalpha())
# sorted(), not list(): set iteration order of strings changes between interpreter
# runs (hash randomization), which would make seeded sampling from KEYWORDS
# non-reproducible across kernels.
KEYWORDS = sorted(set(reader))


In [9]:
# Take the first 2000 merged events and shuffle them (sample(frac=1) draws from the
# global NumPy RNG seeded earlier), keeping only the id and timestamp columns.
df = DATA[:2000].reset_index()[['mpid', 'event_start_unixtime_ms']].sample(frac=1)
MPIDS = df.mpid
# `np.int` was a deprecated alias of the builtin int and was removed in NumPy 1.24.
# Use an explicit 64-bit integer: millisecond epoch timestamps do not fit in 32 bits.
MPIDS_TS = df.event_start_unixtime_ms.astype(np.int64)

In [10]:
def mock_scores(choices):
    """Draw a random handful of ``(label, score)`` pairs, highest score first.

    Between 5 and 9 labels are sampled from ``choices`` without replacement
    (fewer when ``choices`` has fewer entries than the drawn count). Each label
    is paired with a uniform random score in [0, 1) rounded to 5 decimals.
    Blank labels (``''`` or ``None``) are discarded after sampling, so the
    result may be shorter than the number of labels drawn.

    The global NumPy RNG is used; call ``np.random.seed`` first for
    repeatable output.

    Args:
        choices: sized sequence of candidate labels.

    Returns:
        List of ``(label, score)`` tuples sorted by descending score; an empty
        list when ``choices`` is empty.
    """
    count = np.random.randint(5, 10)  # upper bound is exclusive -> 5..9
    # Sampling without replacement raises ValueError when asked for more items
    # than the population holds, so never request more than is available.
    count = min(count, len(choices))
    if count == 0:
        return []

    # Keep the draw order (labels first, then scores) so seeded runs are stable.
    labels = np.random.choice(choices, count, replace=False)
    scores = np.round(np.random.rand(count), 5)

    reader = filter(lambda x: x[0] not in {'', None}, zip(labels, scores))
    return sorted(reader, key=lambda x: -x[1])
  

In [11]:
def mock_profile(mpid, mpid_ts):
    """Build one mock viewer profile record.

    Args:
        mpid: viewer identifier stored under ``'mpid'``.
        mpid_ts: timestamp stored under ``'event_start_unixtime_ms'``.

    Returns:
        dict with the id, randomly scored genres and shows (from the
        module-level ``GENRES`` / ``SHOWS`` lists) and the timestamp.
    """
    # Genres are drawn before shows; the order matters for seeded reproducibility.
    genre_scores = mock_scores(GENRES)
    show_scores = mock_scores(SHOWS)
    # Keyword scores are intentionally not part of the mock profile for now.
    return {
        'mpid': mpid,
        'genres': genre_scores,
        'shows': show_scores,
        'event_start_unixtime_ms': mpid_ts,
    }

In [12]:
def write_dicts2json(inputs, outfile):
    """Write records as gzip-compressed, newline-delimited JSON.

    Args:
        inputs: iterable of JSON-serialisable objects, one per output line.
        outfile: destination path (``str`` or ``Path``) of the ``.gz`` file.

    All records are serialised before the file is opened, so a record that
    cannot be encoded fails without touching ``outfile``. A one-line summary
    with the path and record count is printed afterwards.
    """
    lines = [json.dumps(record) + "\n" for record in inputs]

    with gzip.open(str(outfile), 'wt') as sink:
        sink.write("".join(lines))

    print(f"wrote file={outfile},len={len(lines)}")

In [13]:
# Generate one mock profile per (mpid, timestamp) pair and persist them as
# gzip-compressed JSON lines in the working directory.
outfile = Path('ex_from_cs1.json.gz')
INPUTS = [mock_profile(mpid, mpid_ts) for mpid, mpid_ts in zip(MPIDS, MPIDS_TS)]
write_dicts2json(inputs=INPUTS, outfile=outfile)

# Show the first generated record as a worked example.
print('example:')
INPUTS[0]

wrote file=ex_from_cs1.json.gz,len=2000
example:


{'mpid': -9220657203495870507,
 'genres': [('Comedy', 0.88688),
  ('Horror and Thriller', 0.88377),
  ('Sports', 0.67464),
  ('Crime and Mystery', 0.55149),
  ('Action and Adventure', 0.28971),
  ('Reality and Game Show', 0.08159)],
 'shows': [('Jurassic Park III', 0.8296),
  ('Beyond Sherwood Forest', 0.67711),
  ('Law & Order: Special Victims Unit', 0.61222),
  ('Outsourced', 0.53266),
  ('Independence Day-saster', 0.45241),
  ('Killjoys', 0.24296),
  ('Underworld: Rise of the Lycans', 0.16599)],
 'event_start_unixtime_ms': 1562216450152}

## Convert to Profile-of-One (po1) input format

In [14]:
def mock_po1(msg, attr, items):
    """Turn one scored attribute of a mock profile into an entity-event dict.

    Args:
        msg: profile record; ``msg['mpid']`` and
            ``msg['event_start_unixtime_ms']`` are read.
        attr: attribute name, used as the event name.
        items: iterable of ``(value, weight)`` pairs.

    Returns:
        ``dict(EntityEvent(...))`` whose properties hold the pairs as a
        ``ListAttributeValue`` of ``StringAttributeValue`` entries.

    On any failure the payload built so far (the raw ``items`` until the list
    attribute has been constructed, the list attribute afterwards) is printed
    and the exception is re-raised unchanged.
    """
    payload = items
    try:
        weighted = [StringAttributeValue(value=value, weight=weight)
                    for value, weight in items]
        payload = ListAttributeValue(value=weighted)

        event = EntityEvent(
            event=attr,
            entityId=str(msg['mpid']),
            entityType="nbc/Viewer",
            properties=payload,
            meta={},
            eventTime=msg['event_start_unixtime_ms'],
        )
        return dict(event)
    except Exception:
        # Surface the offending payload for debugging, then propagate.
        print(payload)
        raise
        
# Emit one entity event per scored attribute of every profile; the id and
# timestamp keys describe the profile itself and are not attributes.
reader = (
    mock_po1(msg, attr, items)
    for msg in INPUTS
    for attr, items in msg.items()
    if attr not in {'mpid', 'event_start_unixtime_ms'}
)
PROFILE_INPUTS = list(reader)
PROFILE_INPUTS[0]

{'event': 'genres',
 'entityId': '-9220657203495870507',
 'entityType': 'nbc/Viewer',
 'properties': {'value': [{'value': 'Comedy',
    'weight': 0.88688,
    'context': 'cortex/attribute-value-string'},
   {'value': 'Horror and Thriller',
    'weight': 0.88377,
    'context': 'cortex/attribute-value-string'},
   {'value': 'Sports',
    'weight': 0.67464,
    'context': 'cortex/attribute-value-string'},
   {'value': 'Crime and Mystery',
    'weight': 0.55149,
    'context': 'cortex/attribute-value-string'},
   {'value': 'Action and Adventure',
    'weight': 0.28971,
    'context': 'cortex/attribute-value-string'},
   {'value': 'Reality and Game Show',
    'weight': 0.08159,
    'context': 'cortex/attribute-value-string'}],
  'context': 'cortex/attribute-value-list'},
 'meta': {},
 'eventTime': 1562216450152}

In [16]:
# Persist the converted entity events next to the notebook as gzipped JSON lines.
outfile = Path('ex_profile_input.json.gz')
write_dicts2json(inputs=PROFILE_INPUTS, outfile=outfile)

wrote file=ex_profile_input.json.gz,len=4000


In [17]:
# Scratch check: display a single StringAttributeValue to inspect its repr; the
# recorded output shows `context` filled in as 'cortex/attribute-value-string'.
StringAttributeValue(value='x', weight=3)

StringAttributeValue(value='x', weight=3, context='cortex/attribute-value-string')