In [9]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up on the next cell execution without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [10]:
import numpy as np
import pandas as pd
from toolz import concat, concatv, first, peek
from itertools import starmap

import json
from pathlib import Path
import gzip

In [11]:
from nbc_analysis.utils.debug_utils import runit
from nbc_analysis.utils.config_utils import get_config
from nbc_analysis.analysis.gen_profiles import main as gen_profile
from nbc_analysis.transforms.merge_video_ends import main as merge_video_ends

In [12]:
from cortex_common.types import EntityEvent, StringAttributeValue, ListAttributeValue

In [16]:
# Load the 'default' nbc_analysis configuration, overriding only BATCHES_D
# (the directory the batch files are read from, per the log output below).
# NOTE(review): the override is an absolute path on one developer's machine;
# anyone else re-running this notebook has to edit it first.
config = get_config(config_f='default',
                      overrides={'BATCHES_D': '/Users/wmcabee/Dropbox (Cognitive Scale)/NBC Analysis/data/NBC2/batches'})

>> created example config file '/Users/wmcabee/.config/nbc_analysis/config_example.yaml'
>> Using default config


In [17]:
%%time
# Run the merge_video_ends transform with STOP_AFTER_MERGE=True and keep the
# result as DATA. Later cells read the show, genre, mpid and
# event_start_unixtime_ms columns from it.
DATA = merge_video_ends(config_f=config, STOP_AFTER_MERGE=True)

>> Loading batches,batch_cnt=129,indir=/Users/wmcabee/Dropbox (Cognitive Scale)/NBC Analysis/data/NBC2/batches
>> finished read of batch files,files=129,records=6265392
>> add calculated fields
CPU times: user 27.5 s, sys: 5.94 s, total: 33.4 s
Wall time: 37.9 s


In [18]:
# Seed the global NumPy RNG so the mock scores generated further down are
# reproducible from a fresh kernel.
np.random.seed(7)

# Only the two label columns are needed to build the sampling vocabularies.
# (For a quick dry run, truncate here with df = df[:200].)
df = DATA[['show', 'genre']].reset_index()

# Distinct labels. drop_duplicates keeps the last occurrence of each value;
# SHOWS is additionally sorted, GENRES stays in order of (last) appearance.
GENRES = df.genre.drop_duplicates(keep='last').tolist()
SHOWS = df.show.drop_duplicates(keep='last').sort_values().tolist()

# Keep only non-empty strings: missing values may surface as None *or* as a
# float NaN, and NaN is truthy, so a plain truthiness filter would let it
# through to .split() and raise AttributeError.
labels = (label for label in concatv(GENRES, SHOWS)
          if isinstance(label, str) and label)
words = concat(label.split(' ') for label in labels)

# Lower-cased, purely alphabetic words. Sorting makes the list order stable
# across kernels; iterating a set of strings depends on per-process hash
# randomisation, which would silently defeat the seed above for anything that
# samples from KEYWORDS by position.
KEYWORDS = sorted({word.lower() for word in words if word.isalpha()})


In [76]:
df = DATA
# For a quick smoke test, swap in a small shuffled sample instead:
#df = DATA[:2000].reset_index()[['mpid', 'event_start_unixtime_ms']].sample(frac=1)

MPIDS = df.mpid
# np.int was only an alias for the builtin int; it was deprecated in NumPy 1.20
# and removed in 1.24, where this line raises AttributeError. int64 is spelled
# out because millisecond epoch timestamps do not fit in 32 bits.
MPIDS_TS = df.event_start_unixtime_ms.astype(np.int64)

In [77]:
def mock_scores(choices):
    """Draw a random, score-ranked sample of labels from ``choices``.

    Between 5 and 9 distinct entries are sampled without replacement (fewer
    when ``choices`` itself is shorter than the drawn count) and each one is
    paired with a uniform random score rounded to 5 decimals. Entries equal
    to ``''`` or ``None`` are dropped *after* sampling, so the result can be
    shorter than the number of entries drawn.

    Randomness comes from the global ``np.random`` state; call
    ``np.random.seed`` first for reproducible output.

    Parameters
    ----------
    choices : sequence or int
        Population to sample from. As with ``np.random.choice``, an integer
        ``n`` stands for ``range(n)``.

    Returns
    -------
    list of (label, score) tuples, ordered by descending score. Empty when
    ``choices`` is empty.
    """
    count = np.random.randint(5, 10)  # upper bound is exclusive -> 5..9

    # Sampling without replacement cannot return more items than exist;
    # clamp instead of letting np.random.choice raise ValueError.
    population = choices if isinstance(choices, (int, np.integer)) else len(choices)
    count = min(count, population)
    if count <= 0:
        return []

    labels = np.random.choice(choices, count, replace=False)
    scores = np.round(np.random.rand(count), 5)
    pairs = [(label, score) for label, score in zip(labels, scores)
             if label not in {'', None}]
    # reverse=True keeps equal scores in their original relative order,
    # exactly like sorting on the negated score.
    return sorted(pairs, key=lambda pair: pair[1], reverse=True)
  

In [78]:
def mock_profile(mpid, mpid_ts):
    """Assemble one mock viewer profile as a plain dict.

    The profile carries the given ``mpid`` and ``mpid_ts`` (stored under
    ``event_start_unixtime_ms``) together with freshly sampled score lists
    for genres and shows, drawn via ``mock_scores`` from the module-level
    GENRES and SHOWS lists. Keyword scores (from KEYWORDS) are currently
    not generated.
    """
    profile = {'mpid': mpid}
    # Genres are sampled before shows so the sequence of RNG draws is fixed.
    profile['genres'] = mock_scores(GENRES)
    profile['shows'] = mock_scores(SHOWS)
    profile['event_start_unixtime_ms'] = mpid_ts
    return profile

In [79]:
def log_status(reader, input_cnt, msg, every=20000):
    """Pass records through unchanged while printing periodic progress.

    A generator: nothing is consumed or printed until the result is iterated.

    Parameters
    ----------
    reader : iterable
        Source of records; each one is yielded as-is.
    input_cnt : int or None
        Expected total number of records, used only for the percentage in the
        progress line. When it is 0 or None the percentage is reported as
        ``n/a`` instead of raising ZeroDivisionError/TypeError.
    msg : str
        Label placed at the start of every progress line.
    every : int, default 20000
        Print a progress line for record 0 and then every ``every`` records.

    Yields
    ------
    The records of ``reader``, in order.

    Raises
    ------
    ValueError
        On first iteration, if ``every`` is smaller than 1.
    """
    if every < 1:
        raise ValueError(f"every must be >= 1, got {every}")

    for idx, rec in enumerate(reader):
        if idx % every == 0:
            if input_cnt:
                progress = f"{round((idx / input_cnt) * 100, 1)}%"
            else:
                # Total unknown or zero: a percentage would be meaningless.
                progress = "n/a"
            print(f">> {msg}, running {idx}/{input_cnt}: {progress}")
        yield rec
        

def write_dicts2json(inputs, outfile, input_cnt=None):
    """Write records to a gzip-compressed JSON-lines file.

    Each record is serialised with ``json.dumps`` and written on its own
    line. ``inputs`` is consumed lazily, so a generator pipeline is streamed
    to disk rather than materialised in memory.

    Parameters
    ----------
    inputs : iterable
        JSON-serialisable records.
    outfile : str or path-like
        Destination path; opened with ``gzip.open`` in text mode.
    input_cnt : int, optional
        Expected number of records, used only for progress percentages.
        When omitted, ``len(inputs)`` is used if available; otherwise the
        function falls back to a notebook-level global named ``input_cnt``
        (the implicit dependency earlier versions had). If no count can be
        determined, the file is written without progress logging instead of
        failing with NameError.
    """
    if input_cnt is None:
        try:
            input_cnt = len(inputs)
        except TypeError:
            # Generators have no length. Preserve the legacy behaviour of
            # reading the notebook global, but tolerate its absence.
            input_cnt = globals().get('input_cnt')

    lines = (json.dumps(record) + "\n" for record in inputs)
    if input_cnt:
        lines = log_status(lines, input_cnt=input_cnt, msg='write_dicts2json:')

    with gzip.open(str(outfile), 'wt') as fh:
        fh.writelines(lines)
    print(f"wrote file={outfile}")

In [80]:
%%time

#write_dicts2json(inputs=INPUTS, outfile=outfile)
#print('example:')

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 5.96 µs


## Convert to profile of 1 input format

In [81]:

def mock_po1(msg, attr, items):
    """Wrap one scored attribute of a mock profile in an EntityEvent dict.

    Parameters
    ----------
    msg : mapping
        Profile providing ``'mpid'`` (used, stringified, as the entity id)
        and ``'event_start_unixtime_ms'`` (used as the event time).
    attr : str
        Attribute name, passed as the event name.
    items : iterable of (value, weight) pairs
        Each pair becomes a StringAttributeValue; together they form the
        ListAttributeValue stored in the event's ``properties``.

    Returns
    -------
    dict
        ``dict(...)`` applied to the constructed EntityEvent.

    Any exception is re-raised after printing the payload that was being
    processed: the raw ``items`` if building the attribute list failed,
    otherwise the already-built ListAttributeValue.
    """
    payload = items
    try:
        weighted = [StringAttributeValue(value=label, weight=score)
                    for label, score in items]
        payload = ListAttributeValue(value=weighted)
        return dict(
            EntityEvent(
                event=attr,
                entityId=str(msg['mpid']),
                entityType="nbc/Viewer",
                properties=payload,
                meta={},
                eventTime=msg['event_start_unixtime_ms'],
            )
        )
    except Exception:
        print(payload)
        raise

In [83]:
%time
outfile = Path('.') / 'ex_from_cs1.json.gz'   
reader = zip(MPIDS, MPIDS_TS)
input_cnt = len(MPIDS)
reader = log_status(reader, input_cnt, msg="mock_profile:")
INPUTS = starmap(mock_profile,reader)     
reader = ( (msg, attr, items) for msg in INPUTS for attr, items in msg.items() if attr not in {'mpid', 'event_start_unixtime_ms'})
reader = starmap(mock_po1, reader)
outfile = Path('.') / 'ex_profile_input.json.gz'
outfile = '/Users/wmcabee/Dropbox (Cognitive Scale)/NBC Analysis/data/ex_profile_input_6m.json.gz'
write_dicts2json(inputs=reader, outfile=outfile)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 5.96 µs
>> mock_profile:, running 0/6265392: 0.0%
>> write_dicts2json:, running 0/6265392: 0.0%
>> write_dicts2json:, running 20000/6265392: 0.3%
>> mock_profile:, running 20000/6265392: 0.3%
>> write_dicts2json:, running 40000/6265392: 0.6%
>> write_dicts2json:, running 60000/6265392: 1.0%
>> mock_profile:, running 40000/6265392: 0.6%
>> write_dicts2json:, running 80000/6265392: 1.3%
>> write_dicts2json:, running 100000/6265392: 1.6%
>> mock_profile:, running 60000/6265392: 1.0%
>> write_dicts2json:, running 120000/6265392: 1.9%
>> write_dicts2json:, running 140000/6265392: 2.2%
>> mock_profile:, running 80000/6265392: 1.3%
>> write_dicts2json:, running 160000/6265392: 2.6%
>> write_dicts2json:, running 180000/6265392: 2.9%
>> mock_profile:, running 100000/6265392: 1.6%
>> write_dicts2json:, running 200000/6265392: 3.2%
>> write_dicts2json:, running 220000/6265392: 3.5%
>> mock_profile:, running 120000/6265392: 1.9%
>> write_dict

Could not convert int time provided 946702878340
Could not convert int time provided 946702878340


>> write_dicts2json:, running 3820000/6265392: 61.0%
>> mock_profile:, running 1920000/6265392: 30.6%
>> write_dicts2json:, running 3840000/6265392: 61.3%
>> write_dicts2json:, running 3860000/6265392: 61.6%
>> mock_profile:, running 1940000/6265392: 31.0%
>> write_dicts2json:, running 3880000/6265392: 61.9%
>> write_dicts2json:, running 3900000/6265392: 62.2%
>> mock_profile:, running 1960000/6265392: 31.3%
>> write_dicts2json:, running 3920000/6265392: 62.6%
>> write_dicts2json:, running 3940000/6265392: 62.9%
>> mock_profile:, running 1980000/6265392: 31.6%
>> write_dicts2json:, running 3960000/6265392: 63.2%
>> write_dicts2json:, running 3980000/6265392: 63.5%
>> mock_profile:, running 2000000/6265392: 31.9%
>> write_dicts2json:, running 4000000/6265392: 63.8%
>> write_dicts2json:, running 4020000/6265392: 64.2%
>> mock_profile:, running 2020000/6265392: 32.2%
>> write_dicts2json:, running 4040000/6265392: 64.5%
>> write_dicts2json:, running 4060000/6265392: 64.8%
>> mock_profile:,

Could not convert int time provided 2500957991
Could not convert int time provided 2500957991


>> write_dicts2json:, running 4660000/6265392: 74.4%
>> mock_profile:, running 2340000/6265392: 37.3%
>> write_dicts2json:, running 4680000/6265392: 74.7%
>> write_dicts2json:, running 4700000/6265392: 75.0%
>> mock_profile:, running 2360000/6265392: 37.7%
>> write_dicts2json:, running 4720000/6265392: 75.3%
>> write_dicts2json:, running 4740000/6265392: 75.7%
>> mock_profile:, running 2380000/6265392: 38.0%
>> write_dicts2json:, running 4760000/6265392: 76.0%
>> write_dicts2json:, running 4780000/6265392: 76.3%
>> mock_profile:, running 2400000/6265392: 38.3%
>> write_dicts2json:, running 4800000/6265392: 76.6%
>> write_dicts2json:, running 4820000/6265392: 76.9%
>> mock_profile:, running 2420000/6265392: 38.6%
>> write_dicts2json:, running 4840000/6265392: 77.2%
>> write_dicts2json:, running 4860000/6265392: 77.6%
>> mock_profile:, running 2440000/6265392: 38.9%
>> write_dicts2json:, running 4880000/6265392: 77.9%
>> write_dicts2json:, running 4900000/6265392: 78.2%
>> mock_profile:,

Could not convert int time provided 2079345531
Could not convert int time provided 2079345531


>> mock_profile:, running 2720000/6265392: 43.4%
>> write_dicts2json:, running 5440000/6265392: 86.8%
>> write_dicts2json:, running 5460000/6265392: 87.1%
>> mock_profile:, running 2740000/6265392: 43.7%
>> write_dicts2json:, running 5480000/6265392: 87.5%
>> write_dicts2json:, running 5500000/6265392: 87.8%
>> mock_profile:, running 2760000/6265392: 44.1%
>> write_dicts2json:, running 5520000/6265392: 88.1%
>> write_dicts2json:, running 5540000/6265392: 88.4%
>> mock_profile:, running 2780000/6265392: 44.4%
>> write_dicts2json:, running 5560000/6265392: 88.7%
>> write_dicts2json:, running 5580000/6265392: 89.1%
>> mock_profile:, running 2800000/6265392: 44.7%
>> write_dicts2json:, running 5600000/6265392: 89.4%
>> write_dicts2json:, running 5620000/6265392: 89.7%
>> mock_profile:, running 2820000/6265392: 45.0%
>> write_dicts2json:, running 5640000/6265392: 90.0%
>> write_dicts2json:, running 5660000/6265392: 90.3%
>> mock_profile:, running 2840000/6265392: 45.3%
>> write_dicts2json:,

wrote file=ex_profile_input.json.gz,len=4000


In [17]:
# Scratch check of the StringAttributeValue constructor: the echoed repr shows
# a `context` field populated even though none was passed.
StringAttributeValue(value='x', weight=3)

StringAttributeValue(value='x', weight=3, context='cortex/attribute-value-string')