In [1]:
%matplotlib inline
import json
import pandas as pd
import numpy as np
import matplotlib

In [96]:
# filtering and variable generation
def prepare_data(filename):
    """Load a per-drawing stats CSV, filter it, and add derived columns.

    Three row counts (non-null ``key_id`` values) are printed as progress
    output: after loading, after the quality filter, and after keeping only
    recognized drawings.

    Parameters
    ----------
    filename : str
        Path to a CSV with at least the columns ``key_id``,
        ``stroke_in_order``, ``drawing_time_min``, ``drawing_time_draw``,
        ``drawing_time``, ``drawing_time_pause``, ``stroke_count`` and
        ``recognized``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows (original index labels kept) with these extra
        columns: ``dt_sec``, ``t_sec``, ``dp_sec`` (the three timing columns
        divided by 1000) plus a ``*_floor`` companion for each, and
        ``stroke_count_floor``.
    """
    raw = pd.read_csv(filename)
    print(raw['key_id'].count())

    # Quality filter. NOTE(review): the meaning of stroke_in_order == 0 is not
    # visible in this notebook -- confirm against the script that builds the CSV.
    keep = (
        (raw['stroke_in_order'] == 0)
        & (raw['drawing_time_min'] >= 0)
        & (raw['drawing_time_draw'] > 100)
    )
    filtered = raw[keep]
    print(filtered['key_id'].count())

    # Take an explicit copy: the column assignments below would otherwise
    # write into a slice of `raw` and trigger pandas' SettingWithCopyWarning.
    data = filtered[filtered['recognized'] == True].copy()  # noqa: E712
    print(data['key_id'].count())

    # Seconds versions of the timing columns (presumably the inputs are
    # milliseconds -- TODO confirm), each with a floored integer-valued bin.
    data['dt_sec'] = data['drawing_time_draw'] / 1000.0
    data['dt_sec_floor'] = np.floor(data['dt_sec'])
    data['t_sec'] = data['drawing_time'] / 1000.0
    data['t_sec_floor'] = np.floor(data['t_sec'])
    data['dp_sec'] = data['drawing_time_pause'] / 1000.0
    data['dp_sec_floor'] = np.floor(data['dp_sec'])

    data['stroke_count_floor'] = np.floor(data['stroke_count'])

    return data
    

In [80]:
def bin_data(data, metric):
    """Build a unit-width histogram table for ``metric``.

    Rows are grouped on the pre-computed ``<metric>_floor`` column, so each
    bin covers ``[x0, x0 + 1)``. The metric name is printed as progress output.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``key_id`` and ``<metric>_floor``.
    metric : str
        Base column name, e.g. ``'dt_sec'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``x0`` (bin start), ``count`` (rows in the bin), ``x1``
        (bin end) and ``freq`` (``count`` divided by the number of non-null
        ``key_id`` values in ``data``), one row per non-empty bin.
    """
    print(metric)
    floor_col = metric + "_floor"

    # Denominator for the relative frequencies: every row with a key_id.
    total_rows = data['key_id'].count()

    per_bin = data.groupby(floor_col)[floor_col].count()
    table = (
        per_bin
        .to_frame()
        .rename(columns={floor_col: 'count'})
        .reset_index()
        .rename(columns={floor_col: 'x0'})
    )

    table['x1'] = table['x0'] + 1
    table['freq'] = table['count'] / total_rows
    return table

In [81]:
def to_hist(data):
    """Turn a bin table into a list of per-row dicts with ``freq`` rounded.

    Each row of ``data`` becomes one ``{column: value}`` dict, and its
    ``'freq'`` entry is rounded to 3 decimal places.

    Note: rows are produced by transposing the frame, which may upcast
    mixed integer/float columns to float in the resulting dicts.
    """
    records = []
    for record in data.T.to_dict().values():
        record['freq'] = round(record['freq'], 3)
        records.append(record)
    return records

In [112]:
def get_drawings_dict(d_key, data_dir="/Users/vlandham/code/data/quickdraw/simplified/"):
    """Load ``<data_dir>/<d_key>.ndjson`` into a dict keyed by ``key_id``.

    Parameters
    ----------
    d_key : str
        File stem, e.g. ``'dog'``.
    data_dir : str, optional
        Directory holding the ``.ndjson`` files. Defaults to the location
        this notebook originally hard-coded; pass another directory to run
        the notebook on a different machine.

    Returns
    -------
    dict
        Maps each record's ``'key_id'`` value to the full decoded record.
        If a ``key_id`` repeats, the last record in the file wins.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    KeyError
        If a record has no ``'key_id'`` field.
    """
    import os  # local import: `os` is not among the notebook's top-level imports

    filename = os.path.join(data_dir, d_key + ".ndjson")

    by_key_id = {}
    # JSON text is UTF-8; do not rely on the platform's default encoding.
    with open(filename, encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # failing inside json.loads.
            if not line.strip():
                continue
            drawing = json.loads(line)
            by_key_id[drawing['key_id']] = drawing
    return by_key_id



def get_drawing_ids_dict(data, metric, count, max_bin, random_state=None):
    """Randomly pick up to ``count`` drawing ids from every bin of ``metric``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``key_id`` and the column named by ``metric``.
    metric : str
        Column to group on (e.g. ``'dt_sec_floor'``). Each group value is
        truncated with ``int()`` to form the output key; groups that
        truncate to the same integer are merged.
    count : int
        Maximum number of ids sampled (without replacement) per group.
        Groups with fewer rows contribute all of their ids, in random order.
    max_bin : int
        Currently unused; kept so existing callers keep working.
    random_state : optional
        Forwarded to ``Series.sample`` for reproducible sampling. With an
        integer seed, every group is sampled using that same seed. The
        default ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    dict
        ``{int bin: [key_id, ...]}``. The ids keep their exact stored values
        because the groups are iterated directly rather than via
        ``iterrows()``, which would upcast integer ids to float64 in a
        mixed-dtype row and lose precision above 2**53.
    """
    drawing_id_dict = {}
    for bin_value, ids in data.groupby(metric)['key_id']:
        n_pick = min(count, len(ids))
        picked = ids.sample(n=n_pick, random_state=random_state)
        drawing_id_dict.setdefault(int(bin_value), []).extend(picked.tolist())
    return drawing_id_dict
    
def combine_cords(stroke):
    """Pair a stroke's x values (``stroke[0]``) with its y values (``stroke[1]``).

    Returns a list of ``(x, y)`` tuples, truncated to the shorter of the two
    sequences. Any further elements of ``stroke`` are ignored.
    """
    xs = stroke[0]
    ys = stroke[1]
    return [(x, y) for x, y in zip(xs, ys)]

def convert_drawing(drawing):
    """Slim a drawing record down in place and return it.

    Every stroke in ``drawing['drawing']`` is replaced by its list of
    ``(x, y)`` pairs (via ``combine_cords``), then the ``key_id``,
    ``countrycode``, ``timestamp``, ``word`` and ``recognized`` entries are
    removed. The input dict itself is modified; a missing entry raises
    ``KeyError``.
    """
    drawing['drawing'] = [combine_cords(stroke) for stroke in drawing['drawing']]
    for field in ('key_id', 'countrycode', 'timestamp', 'word', 'recognized'):
        del drawing[field]
    return drawing

def get_drawings(key_id, metric_id, data, count, max_bin):
    """Collect converted sample drawings for every bin of ``metric_id``.

    Ids are sampled per bin with ``get_drawing_ids_dict``, the full drawing
    records for ``key_id`` are loaded with ``get_drawings_dict``, and each
    sampled record is passed through ``convert_drawing``.

    Returns a dict mapping each bin key to its list of converted drawings.
    Ids are looked up as ``str(int(id))`` because the loaded records are
    keyed by string ids.
    """
    ids_by_bin = get_drawing_ids_dict(data, metric_id, count, max_bin)
    all_drawings = get_drawings_dict(key_id)

    return {
        bin_key: [
            convert_drawing(all_drawings[str(int(d_id))])
            for d_id in bin_ids
        ]
        for bin_key, bin_ids in ids_by_bin.items()
    }
        
    

In [104]:
def write_results(output):
    """Serialize ``output`` as JSON to ``data/<keys joined by '_'>_out.json``.

    The file name is derived from ``output['keys']``; the ``data/`` directory
    (relative to the current working directory) must already exist.
    """
    stem = "_".join(output['keys'])
    target = "data/" + stem + "_out.json"
    with open(target, 'w') as sink:
        json.dump(output, sink)
    

In [118]:
#keys = ['bird', 'flamingo', 'owl', 'duck']
#metrics = {'hist': 'dt_sec'}

#keys = ['dog', 'cat']
#metrics = {'hist': 'dt_sec', 'hist_total': 't_sec', 'hist_pause': 'dp_sec'}

#keys = ['ant', 'mosquito', 'butterfly', 'scorpion']
#metrics = {'hist': 'dt_sec'}

#keys = ['circle', 'squiggle', 'triangle', 'square',]
#metrics = {'hist': 'dt_sec'}

# Driver cell: for every drawing category in `keys`, load its stats CSV, build
# one histogram per entry of `metrics`, attach summary statistics and a sample
# of drawings, then write everything to a single JSON file.
#
# `keys`    - category names; each needs data/<key>.stats.csv and a matching
#             .ndjson file readable by get_drawings_dict.
# `metrics` - maps the output field name to the base column to bin; bin_data
#             expects a matching "<column>_floor" column from prepare_data.
keys = ['dog', 'cat', 'horse']
metrics = {'hist': 'dt_sec', 'hist_stroke': 'stroke_count'}


# The key list is stored in the output too; write_results uses it to name the file.
output = {"keys": keys}

# NOTE: `data`, `bins` and `hist` are plain globals; after the loop they hold the
# values from the LAST key/metric processed, and later cells read `data`.
for key_id in keys:
    print(key_id)
    
    
    filename = "data/" + key_id +".stats.csv"
    data = prepare_data(filename)
    output[key_id] = {}
    
    for out_key, metric in metrics.items():
        
        bins = bin_data(data, metric)
    
    
        hist = to_hist(bins)
    
    
        output[key_id][out_key] = hist
        # Summary statistics on the un-floored metric: mean and the 25/50/75% quantiles.
        output[key_id][metric + '_mean'] = data[metric].mean()
        output[key_id][metric + '_quans'] = list(data[metric].quantile([0.25, 0.5, 0.75]))
    # Sample up to 1000 drawings per dt_sec_floor bin. The trailing 25 is passed
    # as max_bin, which get_drawing_ids_dict does not currently use.
    output[key_id]['drawings'] = get_drawings(key_id, 'dt_sec_floor', data, 1000, 25)
    #output[key_id]['drawings_strokes'] = get_drawings(key_id, 'stroke_count_floor', data, 20, 25)
# One combined file for all keys, written once after the loop.
write_results(output)

dog
152159
151962
143102
dt_sec
stroke_count
cat
123202
123077
102943
dt_sec
stroke_count
horse
178286
178062
156113
dt_sec
stroke_count


In [74]:
# Ad-hoc check: mean of the derived `dt_sec` column for whichever frame the
# global `data` currently holds (the driver loop leaves the last processed
# key's frame in it, so this depends on what was run before).
data['dt_sec'].mean()

5.474867421777101

In [14]:
#get_drawings(key_id, data, 20, 25)
# NOTE: the commented call above predates the current get_drawings signature,
# which is (key_id, metric_id, data, count, max_bin).
# Load every 'dog' drawing record into a dict keyed by key_id, for manual inspection.
ddd = get_drawings_dict('dog')

In [17]:
# Peek at the first key to see its type; the recorded output shows key_ids are
# strings, which is why get_drawings looks ids up via str(int(...)).
list(ddd.keys())[0]

'6718004173733888'

In [117]:

#hist = bins.T.to_dict().values()
# for val in hist:
#     print(val['x0'])
#sizes.reset_index()
#sizes.columns()
#sizes = pd.DataFrame(data =counts)

#counts.columns('x0', 'count')
#sizes.columns('x0')
#bins['freq'] = bins['key_id'] / data['key_id'].count()

0.0
1.0
2.0
3.0
4.0
5.0
6.0
7.0
8.0
9.0
10.0
11.0
12.0
13.0
14.0
15.0
16.0
17.0
18.0
19.0
20.0
21.0
22.0
23.0
24.0
26.0
27.0
29.0
30.0
44.0
69.0


In [34]:
#sizes.plot.bar(x = 'x0', y = 'freq')