## Working with TFRecords and Protobufs

This notebook covers writing to, reading from, and working with TFRecords, a very efficient TensorFlow file format. Some parts of the notebook also explain TensorFlow's `Example` protobuf.

In [None]:
import os

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.train import BytesList, Feature, Features, Example, Int64List

In [5]:
# Load Fashion-MNIST through Keras. The result is a nested structure that the
# next cell indexes as [split][array] (split 0 = train, split 1 = test).
fashion_mnist = keras.datasets.fashion_mnist.load_data()

In [6]:
# Unpack the nested (train, test) pairs into separate image and label arrays.
(X_train, y_train), (X_test, y_test) = fashion_mnist

In [7]:
# Sanity check: the leading dimension of images and labels should match.
X_train.shape, y_train.shape

((60000, 28, 28), (60000,))

In [12]:
def _bytes_feature(value):
    """Wrap a single bytes-like value in a ``Feature`` holding a ``BytesList``.

    Args:
        value: Raw ``bytes``, a NumPy array/scalar, or an eager TensorFlow
            tensor. Arrays and numeric tensors are reduced to their raw buffer
            via ``tobytes()``; neither dtype nor shape is stored alongside the
            bytes, so the reader has to know them to decode the feature.

    Returns:
        A ``Feature`` whose ``bytes_list`` contains exactly one element.
    """
    if isinstance(value, type(tf.constant(0))):
        # Eager tensors expose their data through .numpy(). NumPy arrays do
        # not have that method, so the two cases must be handled separately.
        value = value.numpy()
    if isinstance(value, (np.ndarray, np.generic)):
        # Covers arrays passed in directly as well as the array/scalar that
        # .numpy() produced above. String tensors already yielded plain bytes
        # and skip this branch.
        value = value.tobytes()
    return Feature(bytes_list=BytesList(value=[value]))

def _int64_feature(value):
    """Wrap a single integer value in a ``Feature`` holding an ``Int64List``.

    Args:
        value: The integer to store; it becomes the only element of the list.

    Returns:
        A ``Feature`` whose ``int64_list`` contains exactly one element.
    """
    int_list = Int64List(value=[value])
    return Feature(int64_list=int_list)

In [13]:
def serialize_example(image, label):
    """Serialize one (image, label) pair into a binary ``Example`` string.

    Args:
        image: Image payload, stored under the ``'image'`` key as a bytes
            feature.
        label: Class label, stored under the ``'label'`` key as an int64
            feature.

    Returns:
        The ``Example`` protobuf serialized with ``SerializeToString()``.
    """
    feature_map = {}
    feature_map['image'] = _bytes_feature(image)
    feature_map['label'] = _int64_feature(label)

    example_proto = Example(features=Features(feature=feature_map))
    return example_proto.SerializeToString()

In [14]:
def write_TFRExample(dataset, filename='data/fashion_mnist_train.tfrecord'):
    """Write every (image, label) pair of ``dataset`` to a TFRecord file.

    Args:
        dataset: Iterable yielding ``(image, label)`` pairs; each pair is
            passed to ``serialize_example``.
        filename: Destination path of the TFRecord file. Missing parent
            directories are created before the file is opened.
    """
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        # The writer does not create directories itself, so the default
        # 'data/' location would fail on a fresh checkout. makedirs succeeds
        # when the directory already exists.
        tf.io.gfile.makedirs(parent_dir)

    with tf.io.TFRecordWriter(filename) as writer:
        for image, label in dataset:
            writer.write(serialize_example(image, label))

In [None]:
# Slice the training arrays along their first axis so that iterating the
# dataset yields one (image, label) pair at a time, then write all pairs to
# the default TFRecord path.
dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
write_TFRExample(dataset)

In [17]:
# Quick check with a hand-made payload: raw bytes for the image and a plain
# Python int for the label.
serialize_example(b'123212131', 23)

b'\n(\n\x0e\n\x05label\x12\x05\x1a\x03\n\x01\x17\n\x16\n\x05image\x12\r\n\x0b\n\t123212131'