In [1]:
import os
import json
import pandas
import csv
import pycountry

### The dataset

Each drawing in the ndjson file is stored as one line of JSON, so we read the file line by line. Each JSON object has a "drawing" array that represents the strokes of the drawing.
Each drawing array has the following format:

```python
[ 
  [  // First stroke 
    [x0, x1, x2, x3, ...],
    [y0, y1, y2, y3, ...],
    [t0, t1, t2, t3, ...]
  ],
  [  // Second stroke
    [x0, x1, x2, x3, ...],
    [y0, y1, y2, y3, ...],
    [t0, t1, t2, t3, ...]
  ],
  ... // Additional strokes
]
```

The timestamps `t` are in milliseconds.

In [2]:
# Get the time of the first and last point in a stroke (not drawing)
def time_first_point(stroke):
    """Return the timestamp of the first point of a stroke.

    The stroke is indexed as ``[xs, ys, ts]``: the timestamps are the
    sequence at index 2.
    """
    timestamps = stroke[2]
    return timestamps[0]

def time_last_point(stroke):
    """Return the timestamp of the last point of a stroke.

    The stroke is indexed as ``[xs, ys, ts]``: the timestamps are the
    sequence at index 2.
    """
    timestamps = stroke[2]
    return timestamps[-1]

def time_stroke(stroke):
    """Return how long one stroke took: last timestamp minus first timestamp."""
    finished_at = time_last_point(stroke)
    started_at = time_first_point(stroke)
    return finished_at - started_at

def pause_time(drawing, stroke, stroke_index):
    """Return the gap between the end of the previous stroke and the start of this one.

    Args:
        drawing: Mapping whose ``'drawing'`` entry is the list of strokes.
        stroke: The stroke at position ``stroke_index``.
        stroke_index: Position of ``stroke`` within the drawing.

    Returns:
        0 for the first stroke (it has no predecessor); otherwise the first
        timestamp of ``stroke`` minus the last timestamp of the stroke before it.
    """
    if stroke_index != 0:
        preceding = drawing['drawing'][stroke_index - 1]
        return time_first_point(stroke) - time_last_point(preceding)
    return 0

def gen_stats_stroke(drawing, stroke, stroke_index):
    """Build the timing record for a single stroke.

    Returns a dict with the stroke's index, its duration, its first and last
    timestamps, and the pause that preceded it (0 for the first stroke).
    """
    record = {"stroke_index": stroke_index}
    record["stroke_time"] = time_stroke(stroke)
    record["stroke_time_first"] = time_first_point(stroke)
    record["stroke_time_last"] = time_last_point(stroke)
    record["stroke_time_pause"] = pause_time(drawing, stroke, stroke_index)
    return record

def get_country_name(country_code):
    """Translate a two-letter (alpha-2) country code into a display name.

    Args:
        country_code: Code to look up via ``pycountry.countries``.

    Returns:
        ``'Taiwan'`` for ``'TW'``; otherwise the ``name`` of the matching
        pycountry entry. When no entry (or no name) is found, the input code
        itself is returned unchanged so the caller always gets a value.
    """
    # Hard-coded override, decided before the lookup so it never depends on
    # what pycountry's database contains.
    if country_code == 'TW':
        return 'Taiwan'

    try:
        country = pycountry.countries.get(alpha_2=country_code)
    except LookupError:
        # NOTE(review): some pycountry releases appear to raise KeyError for
        # unknown codes instead of returning None -- confirm against the
        # installed version. Either way we fall back to the raw code.
        country = None

    # getattr with a default covers both "no match" (None) and an entry
    # without a name, without a bare except that would hide unrelated errors.
    return getattr(country, 'name', country_code)

def gen_stats(drawing):
    """Summarise one drawing record into a flat dict of timing statistics.

    Args:
        drawing: Parsed JSON record with the keys ``'drawing'`` (list of
            strokes), ``'key_id'``, ``'recognized'``, ``'word'`` and
            ``'countrycode'``.

    Returns:
        A dict with the identifying fields, the stroke count, the country
        name (stored under ``"countrycode"``), the total time from the first
        point of the first stroke to the last point of the last stroke, the
        summed pauses between strokes and the summed stroke durations.
        Key insertion order matters: it becomes the CSV column order.
    """
    all_strokes = drawing['drawing']
    opening_stroke, closing_stroke = all_strokes[0], all_strokes[-1]

    summary = {
        "key_id": drawing['key_id'],
        "recognized": drawing['recognized'],
        "word": drawing['word'],
        "stroke_count": len(all_strokes),
        "countrycode": get_country_name(drawing['countrycode']),
        "drawing_time_total": (
            time_last_point(closing_stroke) - time_first_point(opening_stroke)
        ),
    }

    per_stroke = [
        gen_stats_stroke(drawing, stroke, position)
        for position, stroke in enumerate(all_strokes)
    ]
    summary['drawing_time_pause'] = sum(
        entry['stroke_time_pause'] for entry in per_stroke
    )
    summary['drawing_time_draw'] = sum(
        entry['stroke_time'] for entry in per_stroke
    )

    return summary

In [3]:
# Path prefix of the raw input files; "<word>.ndjson" is appended to it.
RAW_ROOT = "./data/full_raw_"
# Path prefix of the generated CSV files; "<word>_stats.csv" is appended to it.
PROCESS_ROOT = "./processed_data/time_per_"
# NOTE(review): draw_ids is reassigned in the processing cell further down
# before it is read anywhere, so this value appears to be unused.
draw_ids = ['dog']

In [4]:
def save_stats(drawing_name, filename):
    """Compute per-drawing statistics for an ndjson file and save them as CSV.

    Each non-blank line of ``filename`` is parsed as one JSON drawing record
    and summarised with ``gen_stats``. The results are written to
    ``PROCESS_ROOT + drawing_name + '_stats.csv'`` with a header row taken
    from the keys of the first record.

    Args:
        drawing_name: Name inserted into the output file name.
        filename: Path of the ndjson input file.

    Returns:
        None. When the input contains no records, nothing is written and no
        output file is created or truncated.
    """
    stats = []
    # JSON text is UTF-8; do not rely on the platform default encoding.
    with open(filename, encoding='utf-8') as source:
        for line in source:
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which json.loads would reject.
            if not line.strip():
                continue
            stats.append(gen_stats(json.loads(line)))

    # Without any record there is no header to derive; bail out before
    # opening the output so an existing CSV is not truncated.
    if not stats:
        return

    output_path = PROCESS_ROOT + drawing_name + '_stats.csv'
    # newline='' is required by the csv module; without it every row is
    # followed by a blank line on platforms that translate '\n' to '\r\n'.
    with open(output_path, 'w', newline='', encoding='utf-8') as target:
        writer = csv.DictWriter(target, fieldnames=list(stats[0]))
        writer.writeheader()
        writer.writerows(stats)

In [5]:
# Drawing categories to process in this run; one CSV is produced per entry.
draw_ids = ['square', 'triangle']

for draw_id in draw_ids:
    # Progress marker so long runs show which category is being processed.
    print(draw_id)
    filename = RAW_ROOT + draw_id + ".ndjson"
    save_stats(drawing_name=draw_id, filename=filename)

circle
