# Quickdraw Data

If machine learning is rocket science then data is your fuel! So before
doing anything we will have a close look at the data available and spend
some time bringing it into the "right" form (i.e.
[tf.train.Example](https://www.tensorflow.org/versions/r1.0/api_docs/python/tf/train/Example)).

That's why we start by spending quite a lot of time on this notebook:
downloading the data, understanding it, and transforming it into the right
format for TensorFlow.

The data used in this workshop is taken from Google's quickdraw (click on
the images to see loads of examples):

https://quickdraw.withgoogle.com/data

Table of contents:

- [ 1 Get the data](#1-Get-the-data)
- [ 2 Inspect the data](#2-Inspect-the-data)
- [ 3 Rasterize](#3-Rasterize)
- [ 4 tf.train.Example data format](#4-tf.train.Example-data-format)
- [ 5 Create dataset](#5-Create-dataset)
- [ 6 Prepare dataset for RNN – bonus!](#6-Prepare-dataset-for-RNN-%E2%80%93-bonus!)

In [None]:
import base64, io, itertools, json, os, random, re, time
import numpy as np
import tensorflow as tf
from matplotlib import pyplot
from PIL import Image, ImageDraw
from IPython import display
from six.moves.urllib import request
from xml.dom import minidom

%matplotlib inline
# Always make sure you are running the expected version.
# There are considerable differences between versions...
tf.__version__

# 1 Get the data

In this section we download a set of raw data files from the web.

In [None]:
# Retrieve list of classes.

def list_bucket(bucket, regexp='.*'):
    """Returns a (filtered) list of object keys in a Cloud Storage bucket.

    Args:
      bucket: Name of the bucket; its XML listing is fetched from
          "https://storage.googleapis.com/<bucket>".
      regexp: Regular expression applied with re.match(), i.e. anchored at
          the start of the key only. The default matches every key.

    Returns:
      List of matching keys, in the order they appear in the listing.
    """
    pattern = re.compile(regexp)
    fh = request.urlopen('https://storage.googleapis.com/%s' % bucket)
    # The response object is not a context manager under Python 2, so close
    # it explicitly instead of leaking the connection.
    try:
        content = minidom.parseString(fh.read())
    finally:
        fh.close()
    # NOTE(review): only a single listing response is read; presumably very
    # large buckets are paginated -- confirm against the XML API docs.
    keys = []
    for e in content.getElementsByTagName('Contents'):
        key = e.getElementsByTagName('Key')[0].firstChild.data
        if pattern.match(key):
            keys.append(key)
    return keys

all_ndjsons = list_bucket('quickdraw_dataset', '.*ndjson$')
print('available: (%d)' % len(all_ndjsons))
# Print the base name of every key (directory and extension stripped).
print(' '.join(
    key.rsplit('/', 1)[-1].split('.', 1)[0] for key in all_ndjsons))

In [None]:
# Store all data locally in this directory.
# (Relative path, so it is resolved against the notebook's working directory.)
data_path = '../data'

# Every name below is used verbatim to build both the download key and the
# local "<name>.ndjson" file name, so it must be spelled exactly like the
# file in the bucket (including spaces and dashes).

# Mini group of two animals.
pets = ['cat', 'dog']

# Somewhat larger group of zoo animals.
zoo = ['elephant', 'giraffe', 'kangaroo', 'lion', 'monkey', 'panda',
       'penguin', 'rhinoceros', 'tiger', 'zebra']

# Even larger group of all animals.
animals = ['bat', 'bird', 'butterfly', 'camel', 'cat', 'cow', 'crab',
           'crocodile', 'dog', 'dolphin', 'duck', 'elephant', 'fish',
           'frog', 'giraffe', 'hedgehog', 'horse', 'kangaroo', 'lion',
           'lobster', 'monkey', 'mosquito', 'mouse', 'octopus', 'owl',
           'panda', 'parrot', 'penguin', 'pig', 'rabbit', 'raccoon',
           'rhinoceros', 'scorpion', 'sea turtle', 'shark', 'sheep',
           'snail', 'spider', 'squirrel', 'teddy-bear', 'tiger',
           'whale', 'zebra']

# Create your own group -- the more classes you include the more challenging
# the classification task will be...

# Choose one of above groups for remainder of workshop.
# "classes" is the list of class names that the following cells iterate over;
# the position of a name in this list becomes its integer label.
classes, classes_name = zoo, 'zoo'

In [None]:
# Download above chosen group.

def retrieve(bucket, key, filename):
    """Downloads an object from a Cloud Storage bucket to a local file.

    Args:
      bucket: Name of the bucket.
      key: Key of the object inside the bucket
          (e.g. "full/simplified/cat.ndjson").
      filename: Local path the downloaded object is written to.
    """
    # Local import so that the function only relies on "request" from the
    # notebook's import cell.
    from six.moves.urllib import parse
    # Keys may contain characters that are not valid inside a URL (e.g. the
    # space in "sea turtle"); percent-encode them. "/" is kept as is.
    url = 'https://storage.googleapis.com/%s/%s' % (bucket, parse.quote(key))
    request.urlretrieve(url=url, filename=filename)

if not os.path.exists(data_path):
    os.mkdir(data_path)

print '\n%d classes:' % len(classes),

for name in classes:
    print name,
    dst = '%s/%s.ndjson' % (data_path, name)
    if not os.path.exists(dst):
        retrieve('quickdraw_dataset', 'full/simplified/%s.ndjson' % name, dst)

# 2 Inspect the data

What is the format of the downloaded files?

In [None]:
# So let's check out the downloaded files...
!ls $data_path

In [None]:
# What is the NDJSON file format?
# Seems to be one JSON dictionary per line...
ndjson_paths = tf.gfile.Glob(os.path.join(data_path, '*.ndjson'))
# Look at the second file if there is one; fall back to the first file when
# only a single class was downloaded.
path = ndjson_paths[1] if len(ndjson_paths) > 1 else ndjson_paths[0]
# Only read the first 1000 characters (the files are large) and make sure
# the file is closed again.
with open(path) as f:
    print(f.read(1000) + '...')

In [None]:
# Parse single line (the first drawing of the file), closing the file
# right after reading it.
with open(path) as f:
    data_json = json.loads(f.readline())
data_json.keys()

In [None]:
# Everything except the 'drawing' entry is meta information -- print it.
for k, v in data_json.items():
    if k == 'drawing':
        continue
    print('%20s   ->   %s' % (k, v))

In [None]:
# The remaining entry is the drawing itself.
drawing = data_json['drawing']
# It is a list of strokes -- show the array shape of every stroke:
[np.shape(stroke) for stroke in drawing]

In [None]:
# Plot every stroke. All strokes have shape (2, n), so the first index
# seems to select the x (row 0) or y (row 1) coordinates.
# (y is negated, presumably because image y coordinates grow downwards.)
for stroke in drawing:
    xs, ys = np.array(stroke[0]), np.array(stroke[1])
    pyplot.plot(xs, -ys)
# Would YOU recognize this drawing successfully?

In [None]:
%%time
# Some more code to load many sketches at once.
# Let's ignore the difficult "unrecognized" sketches for now...
# (i.e. unrecognized by the official quickdraw classifier)

def convert(line):
    """Parses one NDJSON line and turns every stroke into a np.array.

    Args:
      line: String containing a single JSON dictionary with a 'drawing'
          entry (a list of strokes).

    Returns:
      The parsed dictionary, with 'drawing' replaced by a list of np.array.
    """
    record = json.loads(line)
    record['drawing'] = list(map(np.array, record['drawing']))
    return record

def loaditer(name, unrecognized=False):
    """Returns iterable of drawings in specified file.

    The file "<data_path>/<name>.ndjson" is read lazily, one line at a time,
    and is closed once the iterator is exhausted or discarded.

    Args:
      name: Name of the downloaded object (e.g. "elephant").
      unrecognized: Whether to include drawings that were not recognized
          by Google AI (i.e. the hard ones).

    Yields:
      Dictionaries as returned by convert().
    """
    with open('%s/%s.ndjson' % (data_path, name)) as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            d = convert(line)
            if d['recognized'] or unrecognized:
                yield d

def loadn(name, n, unrecognized=False):
    """Loads the first drawings of a class into a list.

    Args:
      name: Name of the downloaded object (e.g. "elephant").
      n: Maximum number of drawings to load (fewer are returned if the
          file does not contain that many).
      unrecognized: Whether to include drawings that were not recognized
          by Google AI (i.e. the hard ones).

    Returns:
      List of at most "n" drawings, in file order.
    """
    drawings = loaditer(name, unrecognized=unrecognized)
    first_n = itertools.islice(drawings, n)
    return list(first_n)

# Keep 100 drawings of the first class around for the following cells.
print('loading some "%s"...' % classes[0])
sample = loadn(name=classes[0], n=100)

In [None]:
# Plot the first n*n sample drawings in a n x n grid.
pyplot.figure(figsize=(10, 10))
n = 3
for i in range(n * n):
    # Subplot indices are 1-based.
    pyplot.subplot(n, n, i + 1)
    for stroke in sample[i]['drawing']:
        xs, ys = np.array(stroke[0]), np.array(stroke[1])
        pyplot.plot(xs, -ys)

# 3 Rasterize

Idea: After converting the raw drawing data into rasterized images, we can
use [MNIST](https://www.tensorflow.org/get_started/mnist/beginners)-like
image processing to classify the drawings.

In [None]:
%%writefile _derived/1_json_to_img.py
# (Written into separate file for sharing between notebooks.)

# Function that converts drawing (specified by individual strokes)
# to a rendered black/white image.

def json_to_img(drawing, img_sz=64, lw=3, maximize=True):
    """Renders the strokes of a drawing as a square grayscale image.

    Args:
      drawing: Dictionary whose 'drawing' entry is a list of strokes; every
          stroke is a 2D np.array with x coordinates in row 0 and y
          coordinates in row 1.
      img_sz: Width and height of the image in pixels.
      lw: Line width in pixels.
      maximize: If True, x and y are rescaled independently from their
          (slightly padded) min/max range to the full image size.
          Otherwise the coordinates are simply divided by 1024.

    Returns:
      PIL image of mode 'L' and size (img_sz, img_sz) with the strokes
      drawn in white. The image is left empty if no stroke has at least
      two points.
    """
    img = Image.new('L', (img_sz, img_sz))
    draw = ImageDraw.Draw(img)
    # One (2, 2) array [[x0, x1], [y0, y1]] per line segment.
    lines = np.array([
        stroke[0:2, i:i+2]
        for stroke in drawing['drawing']
        for i in range(stroke.shape[1] - 1)
    ], dtype=np.float32)
    if lines.size == 0:
        # Only single-point strokes (or no strokes at all): there is no
        # segment to draw, and the indexing below would fail.
        return img
    if maximize:
        for i in range(2):
            min_, max_ = lines[:,i,:].min() * 0.95, lines[:,i,:].max() * 1.05
            # Guard against a (near) zero range, e.g. a horizontal line.
            lines[:,i,:] = (lines[:,i,:] - min_) / max(max_ - min_, 1)
    else:
        lines /= 1024
    for line in lines:
        # Flatten to (x0, y0, x1, y1) in pixel coordinates.
        draw.line(tuple(line.T.reshape((-1,)) * img_sz), fill='white', width=lw)
    return img

In [None]:
# (Load code from previous cell -- make sure to have executed above cell first.)
%run -i _derived/1_json_to_img.py

# Show some examples.

def showimg(img):
    """Displays an image inline in the notebook as an embedded PNG.

    Args:
      img: PIL image, or np.ndarray that is interpreted as a mode 'L'
          (grayscale) image.
    """
    if isinstance(img, np.ndarray):
        img = Image.fromarray(img, 'L')
    png = io.BytesIO()
    img.convert('RGB').save(png, format='png')
    # Embed the PNG bytes directly into the HTML as a base64 data URI.
    payload = base64.b64encode(png.getvalue()).decode('utf-8')
    html = '<img src="data:image/png;base64,%s">' % payload
    display.display(display.HTML(html))

# Fetch some images + shuffle order.
rows, cols = 10, 10
# Load just enough drawings per class to fill all rows * cols tiles
# (rounded up, so there are always at least rows * cols drawings).
n_per_class = rows * cols // len(classes) + 1
drawings_matrix = [loadn(name, n_per_class) for name in classes]
# Flatten the per-class lists into a single list.
drawings_list = list(itertools.chain.from_iterable(drawings_matrix))
drawings_list = np.random.permutation(drawings_list)

# Create mosaic of rendered images.
lw = 4
img_sz = 64
tableau = np.zeros((img_sz * rows, img_sz * cols), dtype=np.uint8)
for y in range(rows):
    for x in range(cols):
        # Row-major tile index: every row holds "cols" tiles.
        i = y * cols + x
        img = json_to_img(drawings_list[i], img_sz=img_sz, lw=lw, maximize=True)
        tableau[y*img_sz:(y+1)*img_sz, x*img_sz:(x+1)*img_sz] = np.asarray(img)

showimg(tableau)

# 4 tf.train.Example data format

Tensorflow's "native" format for data storage is the `tf.train.Example`
[protocol buffer](https://en.wikipedia.org/wiki/Protocol_Buffers).

In this section we briefly explore the API needed to access the data
inside the `tf.train.Example` protocol buffer. It's **not necessary** to read
through the
[Python API documentation](https://developers.google.com/protocol-buffers/docs/pythontutorial).

In [None]:
# Start from a fresh proto without any features.
example = tf.train.Example()
# (printing an empty example produces no text)
print(example)

In [None]:
# An example contains a map from feature name to "Feature".
# Every "Feature" contains a list of elements of the same
# type, which is one of:
# - bytes_list (similar to Python's "str")
# - float_list (float number)
# - int64_list (integer number)

# These values can be accessed as follows (no need to understand
# details):

# Add float value "3.1416" to feature "magic_numbers"
# (the feature is created on first access by its name).
example.features.feature['magic_numbers'].float_list.value.append(3.1416)
# Add some more values to the float list "magic_numbers".
example.features.feature['magic_numbers'].float_list.value.extend([2.7183, 1.4142, 1.6180])

# YOUR ACTION REQUIRED:
# Create a second feature named "adversaries" and add the elements
# "Alice" and "Bob".
# (The statement below is intentionally left incomplete -- this cell raises
# a SyntaxError until you finish it.)
example.features.feature['adversaries'].

# This will now print a serialized representation of our protocol buffer
# with features "magic_numbers" and "adversaries" set...
print example

# .. et voila : that's all you need to know about protocol buffers
# for this workshop.

# 5 Create dataset

Now let's create a "dataset" of `tf.train.Example`
[protocol buffers](https://developers.google.com/protocol-buffers/) ("protos").

A single example contains all the information for a drawing (i.e. rasterized
image, label, and meta information).

A dataset consists of non-overlapping sets of examples that will be used for
training and evaluation of the classifier (the "test" set will be used for the
final evaluation). Because these files can quickly become very large, we
"shard" them into multiple smaller files of equal size.

In [None]:
# Let's first check how many [recognized=True] examples we have in each class.
# Depending on your choice of classes you could generate up to 200k examples...
for name in classes:
    total = 0
    recognized = 0
    # Stream through the file once and only count, instead of keeping all
    # lines (and all parsed drawings) in memory at the same time.
    with open('%s/%s.ndjson' % (data_path, name)) as f:
        for line in f:
            total += 1
            if line.strip() and json.loads(line)['recognized']:
                recognized += 1
    print('%s %d recognized %d' % (name, total, recognized))

In [None]:
# Helper code to create sharded recordio files.
# (No need to read through this.)

# Well... Since you continue to read through this cell, I could as
# well explain in more detail what it is about :-)
# Because we work with large amounts of data, we will create "sharded"
# files, that is, we split a single dataset into a number of files, like
# train-00000-of-00005, ..., train-00004-of-00005 (if we're using 5 shards).
# This way we have smaller individual files, and we can also easily access
# e.g. 20% of all data, or have 5 threads reading through the data
# simultaneously.

# The code in this cell simply takes a list of iterators and then
# randomly distributes the values returned by these iterators into sharded
# datasets (e.g. a train/eval/test split).

def rand_key(counts):
    """Returns a random key from "counts", using values as distribution.

    The count of the returned key is decremented in place, so calling this
    function repeatedly draws keys without replacement until every count
    has reached zero.

    Args:
      counts: Dictionary mapping key to a non-negative integer count.
          Modified in place.

    Returns:
      The chosen key, or None if there is nothing left to draw (empty
      dictionary or all counts zero).
    """
    total = sum(counts.values())
    if total <= 0:
        return None
    # randint() includes both endpoints. Drawing from [1, total] gives every
    # key exactly "count" values of r; drawing from [0, total] would give
    # the first key one value too many. Keys with count 0 are never chosen
    # because r >= 1.
    r = random.randint(1, total)
    for key, count in counts.items():
        if r > count:
            r -= count
        else:
            counts[key] -= 1
            return key

def make_sharded_files(make_example, path, classes, iters, splits,
                       shards=10, overwrite=False, report_dt=10):
    """Create sharded files from "iters".

    In every step a split is drawn at random (weighted by the number of
    examples still missing in each split), and one example of every class
    is written to one of the shards of that split.

    Args:
      make_example: Called with the class index and the object returned by
          the corresponding element of "iters"; must return a
          tf.train.Example() proto.
      path: Directory that will contain recordio files.
      classes: Names of classes, will be written to "labels.txt".
      iters: List of iterators, one per class (same order as "classes").
      splits: Dictionary mapping filename to number of examples (of
          every class). Not modified.
      shards: Number of files to be created per split.
      overwrite: Whether a pre-existing directory should be overwritten.
      report_dt: Number of seconds between status updates (0=no updates).

    Returns:
      Total number of examples written to disk (this should be equal to
      the number of classes times the sum of the number of examples of
      all the splits).

    Raises:
      AssertionError: If "iters" and "classes" differ in length, or if any
          of the output files already exists and "overwrite" is not set.
      StopIteration: If one of the iterators runs out of elements.
    """
    assert len(iters) == len(classes)
    if not os.path.exists(path):
        os.makedirs(path)
    paths = {
        split: ['%s/%s-%05d-of-%05d' % (path, split, i, shards)
                for i in range(shards)]
        for split in splits
    }
    # Refuse to clobber any existing shard unless explicitly requested.
    assert overwrite or not any(
        os.path.exists(p) for ps in paths.values() for p in ps)
    # Work on a copy: rand_key() decrements the counts in place and the
    # caller's dictionary should stay untouched.
    remaining = dict(splits)
    n = sum(remaining.values())
    examples = 0
    writers = {
        split: [tf.python_io.TFRecordWriter(p) for p in ps]
        for split, ps in paths.items()
    }
    try:
        t0 = time.time()
        for i in range(n):
            split = rand_key(remaining)
            # Cycle through the shards based on the remaining count.
            writer = writers[split][remaining[split] % shards]
            for j, it in enumerate(iters):
                example = make_example(j, next(it))
                writer.write(example.SerializeToString())
                examples += 1
            if report_dt > 0 and time.time() - t0 > report_dt:
                print('processed %d/%d (%.2f%%)' % (i, n, 100. * i / n))
                t0 = time.time()
    finally:
        # Always close the writers, also if an iterator ran dry or
        # make_example() failed.
        for split_writers in writers.values():
            for writer in split_writers:
                writer.close()
    with open('%s/labels.txt' % path, 'w') as f:
        f.write('\n'.join(classes))
    return examples

In [None]:
%%writefile _derived/1_make_example_img.py
# (Written into separate file for sharing between notebooks.)

# Convert drawing to tf.train.Example proto.
# Uses json_to_img() from previous cell to create raster image.

def make_example_img(label, drawing):
    """Converts a drawing to a tf.train.Example proto with a raster image.

    Args:
      label: Integer class label, stored as feature 'label'.
      drawing: Dictionary with entries 'drawing' (strokes, rendered via
          json_to_img()), 'countrycode', 'recognized', 'word', 'timestamp'
          and 'key_id'.

    Returns:
      tf.train.Example with features 'label', 'img_64' (flattened 64x64
      image), 'countrycode', 'recognized', 'word', 'timestamp' (seconds
      since the epoch) and 'key_id'.
    """
    # Local import: this file is shared between notebooks and should not
    # depend on their import cells for this module.
    import calendar
    example = tf.train.Example()
    example.features.feature['label'].int64_list.value.append(label)
    img_64 = np.asarray(json_to_img(drawing, img_sz=64, lw=4, maximize=True)).reshape(-1)
    example.features.feature['img_64'].int64_list.value.extend(img_64)
    example.features.feature['countrycode'].bytes_list.value.append(drawing['countrycode'].encode('utf-8'))
    example.features.feature['recognized'].int64_list.value.append(drawing['recognized'])
    example.features.feature['word'].bytes_list.value.append(drawing['word'].encode('utf-8'))
    ts = drawing['timestamp']
    # Drop the fractional seconds (and everything after them).
    # NOTE(review): the quickdraw timestamps are documented as UTC, so they
    # are converted with timegm(); mktime() would interpret them in the local
    # timezone of the machine creating the dataset.
    ts = calendar.timegm(time.strptime(ts[:ts.index('.')], '%Y-%m-%d %H:%M:%S'))
    example.features.feature['timestamp'].int64_list.value.append(int(ts))
    example.features.feature['key_id'].int64_list.value.append(int(drawing['key_id']))
    return example

In [None]:
# (Load code from previous cell -- make sure to have executed above cell first.)
%run -i _derived/1_make_example_img.py

# Create the (rasterized) dataset.

path = '%s/dataset_img' % data_path
t0 = time.time()
# Note: We only generate few examples per split here so you won't be
# blocked for too long while waiting for this cell to finish.
# You can re-run the cell with larger values (don't forget to
# update "path" above) in the background to get a larger
# dataset...
n = make_sharded_files(
    make_example_img,
    path,
    classes,
    [loaditer(name) for name in classes],
    dict(train=5000, eval=1000, test=1000),
)

print('stored data to "%s"' % path)
print('generated %d examples in %d seconds' % (n, time.time() - t0))

# 6 Prepare dataset for RNN – bonus!

This section creates another dataset of example protos that contain the raw
stroke data, suitable for usage with a recurrent neural network.

Note that later notebooks will have a "bonus" section that uses this dataset,
but the "non-bonus" parts can be worked through without executing the cells
below...

In [None]:
%%writefile _derived/1_json_to_stroke.py
# (Written into separate file for sharing between notebooks.)

# Convert stroke coordinates into normalized relative coordinates,
# one single list, and add a "third dimension" that indicates when
# a new stroke starts.

def json_to_stroke(d):
    """Converts a drawing to one sequence of relative stroke coordinates.

    All strokes are concatenated, normalized with the common min/max over
    both x and y (so the aspect ratio is preserved), and converted to
    point-to-point differences.

    Args:
      d: Dictionary whose 'drawing' entry is a list of strokes, each of
          shape (2, n) with x in row 0 and y in row 1 (np.array or nested
          lists).

    Returns:
      np.array of shape (3, total_points - 1): rows 0 and 1 are the
      normalized dx/dy steps, row 2 is 1 for steps that lead to the first
      point of a new stroke and 0 otherwise.
    """
    strokes = [np.array(s, dtype=np.float32) for s in d['drawing']]
    xy = np.concatenate(strokes, axis=1)
    z = np.zeros(xy.shape[1])
    if len(strokes) > 1:
        # Index of the first point of every stroke but the first one.
        starts = np.cumsum([s.shape[1] for s in strokes[:-1]])
        z[starts] = 1
    lo, hi = xy.min(), xy.max()
    # max(1, ...) avoids a division by zero for degenerate drawings.
    dxy = np.diff((xy - lo) / max(1, hi - lo))
    # Drop the flag of the very first point to align with the differences.
    return np.concatenate([dxy, z.reshape((1, -1))[:, 1:]])

In [None]:
# (Load code from previous cell -- make sure to have executed above cell first.)
%run -i _derived/1_json_to_stroke.py

# Visualize / control output of json_to_stroke().

stroke = json_to_stroke(sample[3])
# Rows 0 and 1 hold the normalized dx/dy steps; summing them up again
# yields the (relative) pen positions.
xy = np.cumsum(stroke[:2, :], axis=1)
pyplot.plot(xy[0], xy[1])
# Row 2 flags "new stroke" -- mark those positions with red dots.
pxy = xy[:, stroke[2] != 0]
pyplot.plot(pxy[0], pxy[1], 'ro')

In [None]:
%%writefile _derived/1_make_example_stroke.py
# (Written into separate file for sharing between notebooks.)

# Convert drawing to tf.train.Example proto.
# Uses json_to_stroke() from previous cell to create the stroke features.

def make_example_stroke(label, drawing):
    """Converts a drawing to a tf.train.Example proto with stroke data.

    Args:
      label: Integer class label, stored as feature 'label'.
      drawing: Dictionary with entries 'drawing' (strokes, converted via
          json_to_stroke()), 'countrycode', 'recognized', 'word',
          'timestamp' and 'key_id'.

    Returns:
      tf.train.Example with features 'label', 'stroke_x', 'stroke_y',
      'stroke_z' (the three rows returned by json_to_stroke()),
      'stroke_len' (number of steps), 'countrycode', 'recognized', 'word',
      'timestamp' (seconds since the epoch) and 'key_id'.
    """
    # Local import: this file is shared between notebooks and should not
    # depend on their import cells for this module.
    import calendar
    example = tf.train.Example()
    example.features.feature['label'].int64_list.value.append(label)
    stroke = json_to_stroke(drawing)
    example.features.feature['stroke_x'].float_list.value.extend(stroke[0, :])
    example.features.feature['stroke_y'].float_list.value.extend(stroke[1, :])
    example.features.feature['stroke_z'].float_list.value.extend(stroke[2, :])
    example.features.feature['stroke_len'].int64_list.value.append(stroke.shape[1])
    example.features.feature['countrycode'].bytes_list.value.append(drawing['countrycode'].encode('utf-8'))
    example.features.feature['recognized'].int64_list.value.append(drawing['recognized'])
    example.features.feature['word'].bytes_list.value.append(drawing['word'].encode('utf-8'))
    ts = drawing['timestamp']
    # Drop the fractional seconds (and everything after them).
    # NOTE(review): the quickdraw timestamps are documented as UTC, so they
    # are converted with timegm(); mktime() would interpret them in the local
    # timezone of the machine creating the dataset.
    ts = calendar.timegm(time.strptime(ts[:ts.index('.')], '%Y-%m-%d %H:%M:%S'))
    example.features.feature['timestamp'].int64_list.value.append(int(ts))
    example.features.feature['key_id'].int64_list.value.append(int(drawing['key_id']))
    return example

In [None]:
# (Load code from previous cell -- make sure to have executed above cell first.)
%run -i _derived/1_make_example_stroke.py

# Create the (stroke) dataset.

path = '%s/dataset_stroke' % data_path
t0 = time.time()
# One fresh iterator per class, in the same order as "classes".
n = make_sharded_files(
    make_example_stroke,
    path,
    classes,
    [loaditer(name) for name in classes],
    dict(train=50000, eval=10000, test=10000),
)

print('stored examples to "%s"' % path)
print('generated %d examples in %d seconds' % (n, time.time() - t0))