# Notebook for Generating TensorFlow Records for Dataset
A TFRecord allows serialized, structured data to be represented in a binary, cross-platform format.
This allows models to be easily re-trained on platform-independent data that is stored in a compact, serialized form.

To prepare such a record, a few things are needed:
1. A collection of CSV records for the training of the model, including the labels, classifications, and file names
2. The image files themselves.

Credit is due to the TensorFlow team for providing a utility for this, modified below.

[Official TFRecord Reference](https://www.tensorflow.org/tutorials/load_data/tfrecord)

In [1]:
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import

import os
import io
import pandas as pd

from tensorflow.python.framework.versions import VERSION
if VERSION >= "2.0.0a0":
    import tensorflow.compat.v1 as tf
else:
    import tensorflow as tf

from PIL import Image
from object_detection.utils import dataset_util
from collections import namedtuple, OrderedDict

The variables below can be configured depending on the needs and locations of the data.

In [3]:
# CSV file of annotations; it is loaded with pandas when the record is written.
csv_input = "../data/train_labels.csv"
# Directory containing the image files; it is joined onto the current working
# directory before the images are read.
image_dir = "../data/images"
# File name of the TFRecord file that the writer produces.
output_path = "train.record"

The `class_text_to_int` function below converts the textual name of a classification into its integer
representation, which is how the classifications are stored internally. The `split` helper groups the CSV rows by file name so that each image is paired with all of its annotations.

In [6]:
def class_text_to_int(row_label):
    """Map a textual class label to its integer class id.

    Ids start at 1 and follow the order of the labels listed below.

    Args:
        row_label: The class name as it appears in the annotations.

    Returns:
        The integer id of the class, or None when the label is not one of
        the known classes.
    """
    # Position in this tuple (counted from 1) is the class id.
    known_labels = (
        'military tank',
        'military aircraft',
        'military truck',
        'civilian aircraft',
        'civilian car',
        'military helicopter',
    )
    for class_id, label in enumerate(known_labels, start=1):
        if row_label == label:
            return class_id
    return None
    
def split(df, group):
    """Partition a DataFrame into one entry per distinct value of a column.

    Args:
        df: The DataFrame to partition.
        group: The column name (or grouping key) passed to ``df.groupby``.

    Returns:
        A list of ``data(filename, object)`` named tuples, where ``filename``
        is the group key and ``object`` is the sub-DataFrame of rows that
        belong to that key.
    """
    record = namedtuple('data', ['filename', 'object'])
    grouped = df.groupby(group)

    entries = []
    for key in grouped.groups:
        entries.append(record(key, grouped.get_group(key)))
    return entries

The next step is to create the binary representation of the record and return it, formatted so that TF can train on this data. This is done by first reading in the image file as raw JPG data, then setting all of the attributes in the binary format, and finally returning the resulting record.

In [15]:
def create_tf_record(group, path):
    """Build a ``tf.train.Example`` for one image and its annotated objects.

    Args:
        group: An entry with a ``filename`` attribute (the image file name)
            and an ``object`` attribute (a DataFrame whose rows provide the
            ``xmin``, ``xmax``, ``ymin``, ``ymax`` and ``class`` columns).
        path: Directory that contains the image file.

    Returns:
        A ``tf.train.Example`` holding the raw image bytes, the image size,
        the file name, and the per-object boxes and classes. Box coordinates
        are divided by the image width/height.
    """
    image_path = os.path.join(path, '{}'.format(group.filename))
    with tf.gfile.GFile(image_path, 'rb') as image_file:
        raw_image = image_file.read()

    # The image is decoded only to learn its dimensions; the stored payload
    # is the untouched file content.
    width, height = Image.open(io.BytesIO(raw_image)).size

    encoded_name = group.filename.encode('utf8')

    # One entry per annotated object, in DataFrame row order.
    rows = [row for _, row in group.object.iterrows()]
    xmins = [row['xmin'] / width for row in rows]
    xmaxs = [row['xmax'] / width for row in rows]
    ymins = [row['ymin'] / height for row in rows]
    ymaxs = [row['ymax'] / height for row in rows]
    classes_text = [row['class'].encode('utf8') for row in rows]
    classes = [class_text_to_int(row['class']) for row in rows]

    feature = {
        'image/height': dataset_util.int64_feature(height),
        'image/width': dataset_util.int64_feature(width),
        'image/filename': dataset_util.bytes_feature(encoded_name),
        'image/source_id': dataset_util.bytes_feature(encoded_name),
        'image/encoded': dataset_util.bytes_feature(raw_image),
        'image/format': dataset_util.bytes_feature(b'jpg'),
        'image/object/bbox/xmin': dataset_util.float_list_feature(xmins),
        'image/object/bbox/xmax': dataset_util.float_list_feature(xmaxs),
        'image/object/bbox/ymin': dataset_util.float_list_feature(ymins),
        'image/object/bbox/ymax': dataset_util.float_list_feature(ymaxs),
        'image/object/class/text': dataset_util.bytes_list_feature(classes_text),
        'image/object/class/label': dataset_util.int64_list_feature(classes),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature))

In [16]:
# Resolve the image directory against the current working directory and load
# the annotations first, so that no output file is created if the CSV cannot
# be read.
path = os.path.join(os.getcwd(), image_dir)
examples = pd.read_csv(csv_input)
grouped = split(examples, 'filename')

# Using the writer as a context manager guarantees the record file is closed
# even when converting one of the images raises (e.g. a missing image file).
with tf.python_io.TFRecordWriter(output_path) as writer:
    for group in grouped:
        tf_record = create_tf_record(group, path)
        writer.write(tf_record.SerializeToString())

# Report the absolute location of the file that was just written.
output_path = os.path.join(os.getcwd(), output_path)
print(f"Successfully created the TFRecords: {output_path}")

NotFoundError: ; No such file or directory