In [1]:
%matplotlib inline
import json
import os

import matplotlib
import numpy as np
import pandas as pd

In [50]:
# filtering and variable generation
def prepare_data(filename):
    """Load a per-category stats CSV, filter it, and add second-based columns.

    The row count (non-null ``key_id`` values) is printed three times: after
    loading, after the stroke/timing filter, and after the ``recognized``
    filter.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read. It must provide the columns
        ``key_id``, ``stroke_in_order``, ``drawing_time_min``,
        ``drawing_time_draw``, ``drawing_time``, ``drawing_time_pause`` and
        ``recognized``.

    Returns
    -------
    pandas.DataFrame
        The filtered rows plus ``dt_sec``, ``t_sec`` and ``dp_sec`` (the
        draw / total / pause time columns divided by 1000 -- presumably
        milliseconds converted to seconds) and a ``*_floor`` companion for
        each holding ``np.floor`` of that value.
    """
    data = pd.read_csv(filename)

    print(data['key_id'].count())

    # filter
    keep = (
        (data['stroke_in_order'] == 0)
        & (data['drawing_time_min'] >= 0)
        & (data['drawing_time_draw'] > 100)
    )
    data = data[keep]

    print(data['key_id'].count())

    # Take an explicit copy: the column assignments below would otherwise be
    # made on a slice of the frame read from disk, which triggers
    # SettingWithCopyWarning in pandas < 3.
    # `== True` is kept (rather than using the column as a mask) so that a
    # non-boolean column still filters instead of raising.
    data = data[data['recognized'] == True].copy()
    print(data['key_id'].count())

    # generate seconds data
    conversions = (
        ('drawing_time_draw', 'dt_sec'),
        ('drawing_time', 't_sec'),
        ('drawing_time_pause', 'dp_sec'),
    )
    for source_col, seconds_col in conversions:
        data[seconds_col] = data[source_col] / 1000.0
        data[seconds_col + '_floor'] = np.floor(data[seconds_col])

    return data
    

In [64]:
def bin_data(data, metric):
    """Count rows per distinct value of ``metric`` and express them as bins.

    Prints ``metric`` and returns a frame with one row per distinct value:

    * ``x0``    -- the value itself (bin start)
    * ``count`` -- number of rows with that value
    * ``x1``    -- ``x0 + 1`` (bin end)
    * ``freq``  -- ``count`` divided by the number of non-null ``key_id``
      values in ``data``
    """
    print(metric)

    per_value = data.groupby(metric)[metric].count()
    total_rows = data['key_id'].count()

    # The counts Series is named after `metric`, the same as its index, so it
    # is renamed before reset_index() to avoid a column-name clash.
    table = (
        per_value
        .rename('count')
        .reset_index()
        .rename(columns={metric: 'x0'})
    )
    table['x1'] = table['x0'] + 1
    table['freq'] = table['count'] / total_rows

    return table

In [63]:
def to_hist(data):
    """Convert a bin table into a list of per-row dicts with rounded ``freq``.

    Each row of ``data`` becomes one ``{column: value}`` dict, and its
    ``'freq'`` entry is rounded to three decimals. The rows are extracted
    through the transposed frame, so values come out with whatever common
    dtype the transpose produces (an all-numeric table with a float column
    yields floats throughout).
    """
    rows_by_index = data.T.to_dict()
    return [
        dict(row, freq=round(row['freq'], 3))
        for row in rows_by_index.values()
    ]

In [57]:
def get_drawings_dict(d_key, data_dir="/Users/vlandham/code/data/quickdraw/simplified/"):
    """Load one ``.ndjson`` drawing file into a dict keyed by ``key_id``.

    Parameters
    ----------
    d_key : str
        Base name of the file; ``<data_dir>/<d_key>.ndjson`` is read.
    data_dir : str, optional
        Directory holding the ``.ndjson`` files. Defaults to the location
        this notebook was originally written against; pass your own
        directory to run it elsewhere.

    Returns
    -------
    dict
        Maps each record's ``'key_id'`` value to the full parsed record.
        If a ``key_id`` occurs more than once, the last record wins.
    """
    filename = os.path.join(data_dir, d_key + ".ndjson")

    by_key_id = {}
    # JSON text is UTF-8; do not depend on the platform default encoding.
    with open(filename, encoding="utf-8") as f:
        for line in f:
            # A blank line (e.g. a trailing newline at end of file) is not a
            # JSON document and would make json.loads raise.
            if not line.strip():
                continue
            drawing = json.loads(line)
            by_key_id[drawing['key_id']] = drawing
    return by_key_id



def get_drawing_ids_dict(data, count, max_bin, random_state=None):
    """Randomly pick up to ``count`` drawing ids per ``dt_sec_floor`` value.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``dt_sec_floor`` and ``key_id``.
    count : int
        Maximum number of ids sampled per group; groups with fewer rows
        contribute all of their ids (in random order).
    max_bin
        Accepted for interface compatibility; currently not used.
    random_state : optional
        Forwarded to ``Series.sample`` for every group. Leave as ``None``
        for a different sample on each call, or pass a seed for a
        reproducible one.

    Returns
    -------
    dict
        ``{int(dt_sec_floor): [key_id, ...]}`` with ids as native Python
        values.
    """
    drawing_id_dict = {}
    # Iterate the groups directly instead of DataFrame.iterrows(): iterrows
    # builds one Series per row, which upcasts an int64 key_id to float64
    # next to the float dt_sec_floor and silently corrupts ids above 2**53.
    for sec, ids in data.groupby('dt_sec_floor')['key_id']:
        picked = ids.sample(n=min(count, len(ids)), random_state=random_state).tolist()
        if picked:
            # setdefault/extend keeps ids together if two group values
            # truncate to the same integer key.
            drawing_id_dict.setdefault(int(sec), []).extend(picked)
    return drawing_id_dict
    
    
def get_drawings(key_id, data, count, max_bin):
    """Return sampled drawing records grouped by ``dt_sec_floor`` bin.

    Samples ids via ``get_drawing_ids_dict(data, count, max_bin)``, loads
    the drawing file named by ``key_id`` via ``get_drawings_dict`` and
    looks each sampled id up in it.

    Returns a dict mapping every bin key to the list of drawing records for
    the ids sampled in that bin. Raises ``KeyError`` if a sampled id is not
    present in the drawing file.
    """
    sampled_ids = get_drawing_ids_dict(data, count, max_bin)
    records_by_id = get_drawings_dict(key_id)

    # The drawing file is keyed by the id as a string, so each sampled id is
    # normalised through int() and then str() before the lookup.
    return {
        bin_key: [records_by_id[str(int(d_id))] for d_id in bin_ids]
        for bin_key, bin_ids in sampled_ids.items()
    }
        
    

In [53]:
def write_results(output):
    """Serialise ``output`` as JSON to ``data/<keys joined by '_'>_out.json``.

    ``output['keys']`` must be a list of strings; it determines the file
    name. The ``data/`` directory must already exist.
    """
    joined_keys = "_".join(output['keys'])
    target_path = "data/" + joined_keys + "_out.json"
    with open(target_path, 'w') as sink:
        json.dump(output, sink)
    

In [66]:
# Categories to process. Each one needs data/<key>.stats.csv and a matching
# <key>.ndjson drawing file.
keys = ['circle', 'bird', 'swan', 'flamingo']
# Maps the name used in the output JSON to the column that gets binned.
metrics = {'hist': 'dt_sec_floor'}

# Alternative configurations from earlier runs, kept for reference:
#keys = ['dog', 'cat']
#metrics = {'hist': 'dt_sec_floor', 'hist_total': 't_sec_floor', 'hist_pause': 'dp_sec_floor'}

#keys = ['butterfly', 'ant', 'mosquito', 'scorpion']
#metrics = {'hist': 'dt_sec_floor'}


# Result container; 'keys' is also what write_results uses to build the
# output file name.
output = {"keys": keys}

for key_id in keys:
    
    
    filename = "data/" + key_id +".stats.csv"
    data = prepare_data(filename)
    output[key_id] = {}
    
    # One histogram per configured metric.
    for out_key, metric in metrics.items():
        
        bins = bin_data(data, metric)
    
    
        hist = to_hist(bins)
    
    
        output[key_id][out_key] = hist
    # 20 = drawings sampled per bin, 25 = max_bin (passed through, but not
    # used by get_drawing_ids_dict).
    output[key_id]['drawings'] = get_drawings(key_id, data, 20, 25)
# Single JSON file covering all categories.
write_results(output)

122876
122774
118709
dt_sec_floor
133572
133381
111468
dt_sec_floor
152088
151901
132834
dt_sec_floor
124569
124443
116143
dt_sec_floor


In [14]:
#get_drawings(key_id, data, 20, 25)
# Scratch check: load the full 'dog' drawing lookup to inspect its keys.
ddd = get_drawings_dict('dog')

In [17]:
# Show the first key of the lookup; the recorded output is a string, which is
# why get_drawings converts ids with str(int(...)) before the lookup.
list(ddd.keys())[0]

'6718004173733888'

In [117]:

#hist = bins.T.to_dict().values()
# for val in hist:
#     print(val['x0'])
#sizes.reset_index()
#sizes.columns()
#sizes = pd.DataFrame(data =counts)

#counts.columns('x0', 'count')
#sizes.columns('x0')
#bins['freq'] = bins['key_id'] / data['key_id'].count()

0.0
1.0
2.0
3.0
4.0
5.0
6.0
7.0
8.0
9.0
10.0
11.0
12.0
13.0
14.0
15.0
16.0
17.0
18.0
19.0
20.0
21.0
22.0
23.0
24.0
26.0
27.0
29.0
30.0
44.0
69.0


In [34]:
#sizes.plot.bar(x = 'x0', y = 'freq')