# Setup


In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
tf.enable_eager_execution()

import numpy as np
import IPython.display as display



# 1. tf.Example
## 1.1 Data types for `tf.Example`
Fundamentally a `tf.Example` is a `{"string": tf.train.Feature}` mapping.
The `tf.train.Feature` message type can accept one of the following three types:
1. `tf.train.BytesList`
    * `string`
    * `byte`
2. `tf.train.FloatList`
    * `float(float32)`
    * `double(float64)`
3. `tf.train.Int64List`
    * `bool`
    * `enum`
    * `int32`
    * `uint32`
    * `int64`
    * `uint64`


In [2]:
# Each function takes a scalar input value and returns a `tf.train.Feature` 
# containing one of the three `list` types above.
# The following functions can be used to convert a value to a type compatible
# with tf.Example.

def _bytes_feature(value):
    """Returns a bytes_list from a string / byte.

    Args:
        value: A single `bytes` value, a text (unicode) string, or an eager
            tensor wrapping one of those.

    Returns:
        A `tf.train.Feature` whose `bytes_list` holds exactly one element.
    """
    # `BytesList` won't unpack a string from an EagerTensor, so pull the
    # underlying value out first. Objects without a `numpy()` method
    # (bytes, str, NumPy scalars) pass through untouched.
    if callable(getattr(value, 'numpy', None)):
        value = value.numpy()
    # Protocol buffer `bytes` fields reject text strings, so encode them as
    # UTF-8. `type(u'')` is `unicode` on Python 2 and `str` on Python 3.
    if isinstance(value, type(u'')):
        value = value.encode('utf-8')
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def _float_feature(value):
    """Wrap a single float / double in a `tf.train.Feature`.

    Args:
        value: One scalar floating-point number.

    Returns:
        A `tf.train.Feature` whose `float_list` holds exactly that value.
    """
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)


def _int64_feature(value):
    """Wrap a single bool / enum / int / uint in a `tf.train.Feature`.

    Args:
        value: One scalar integer-like value.

    Returns:
        A `tf.train.Feature` whose `int64_list` holds exactly that value.
    """
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)



In [3]:
# Bytes features: a bytes literal, and a text string explicitly encoded to
# UTF-8 bytes before being wrapped.
print(_bytes_feature(b'test_string'))
print(_bytes_feature(u'test_bytes'.encode('utf-8')))

# Float feature: the printed value is not exactly e, because a `FloatList`
# stores its values with float32 precision.
print(_float_feature(np.exp(1)))

# Integer features: `True` is stored as the integer 1, so both prints match.
print(_int64_feature(True))
print(_int64_feature(1))


bytes_list {
  value: "test_string"
}

bytes_list {
  value: "test_bytes"
}

float_list {
  value: 2.7182817459106445
}

int64_list {
  value: 1
}

int64_list {
  value: 1
}



In [4]:
# `feature` is a `tf.train.Feature` protocol buffer message. Like every proto
# message it can be serialized to a binary string with `SerializeToString`.
# (A bare `type(feature)` in the middle of the cell was dropped: only the last
# expression of a cell is displayed, so it never produced any output.)
feature = _float_feature(np.exp(1))
feature.SerializeToString()


b'\x12\x06\n\x04T\xf8-@'

## 1.2 Creating a `tf.Example` message
In practice, the dataset may come from anywhere, but the procedure of creating
the `tf.Example` message from a single observation will be the same.
1. Within each observation, each value needs to be converted to a `tf.train.Feature`
containing one of the 3 compatible types, using one of the functions above.
2. We create a map (dictionary) from the feature name string to the encoded feature 
value produced in #1.
3. The map produced in #2 is converted to a `Features` message.


In the example, the dataset will have 4 features:
1. A boolean feature, `False` or `True`;
2. An integer feature uniformly randomly chosen from `[0, 5)`;
3. A string feature generated from a string table;
4. A float feature from a standard normal distribution.


In [5]:
# The number of observations in the dataset.
n_observations = int(1e4)

# Draw every random feature from one dedicated, seeded generator so that
# Restart & Run All reproduces the same synthetic dataset, without touching
# NumPy's global random state.
SEED = 0
rng = np.random.RandomState(SEED)

# boolean feature, encoded as False or True
feature0 = rng.choice([False, True], n_observations)

# integer feature, random from 0 ... 4
feature1 = rng.randint(0, 5, n_observations)

# string feature: `feature1` doubles as the index into the string table, so
# feature2 is fully determined by feature1.
strings = np.array([b'cat', b'dog', b'chicken', b'horse', b'goat'])
feature2 = strings[feature1]

# float feature, from a standard normal distribution
feature3 = rng.randn(n_observations)



In [12]:
def serialize_example(feature0, feature1, feature2, feature3):
    """Serialize one observation into a binary `tf.Example` string.

    Args:
        feature0: Value stored under 'feature0' as an int64 feature.
        feature1: Value stored under 'feature1' as an int64 feature.
        feature2: Value stored under 'feature2' as a bytes feature.
        feature3: Value stored under 'feature3' as a float feature.

    Returns:
        The serialized `tf.train.Example` message, ready to be written to a
        file.
    """
    # Feature name -> `tf.Example`-compatible wrapper for each input value.
    name_to_feature = {
        'feature0': _int64_feature(feature0),
        'feature1': _int64_feature(feature1),
        'feature2': _bytes_feature(feature2),
        'feature3': _float_feature(feature3),
    }

    # Wrap the mapping in a `Features` message, then in the `Example` itself.
    features_msg = tf.train.Features(feature=name_to_feature)
    return tf.train.Example(features=features_msg).SerializeToString()


In [13]:
# Serialize one hand-written observation. The displayed value is the raw
# binary (bytes) encoding of the resulting `tf.Example` message.
_serialize_example = serialize_example(False, 4, b"goat", 0.9876)
_serialize_example


b'\nR\n\x11\n\x08feature0\x12\x05\x1a\x03\n\x01\x00\n\x11\n\x08feature1\x12\x05\x1a\x03\n\x01\x04\n\x14\n\x08feature2\x12\x08\n\x06\n\x04goat\n\x14\n\x08feature3\x12\x08\x12\x06\n\x04[\xd3|?'

To decode the message use the `tf.train.Example.FromString` method.


In [14]:
# Round trip: rebuild the message from the bytes held in `_serialize_example`
# and display it in protobuf text format.
example_proto = tf.train.Example.FromString(_serialize_example)
example_proto


features {
  feature {
    key: "feature0"
    value {
      int64_list {
        value: 0
      }
    }
  }
  feature {
    key: "feature1"
    value {
      int64_list {
        value: 4
      }
    }
  }
  feature {
    key: "feature2"
    value {
      bytes_list {
        value: "goat"
      }
    }
  }
  feature {
    key: "feature3"
    value {
      float_list {
        value: 0.9876000285148621
      }
    }
  }
}

# 2. TFRecord files using `tf.data`
The `tf.data` module also provides tools for reading and writing data in TensorFlow.


## 2.1 Writing a TFRecord file
The easiest way to get the data into a dataset is to use the `from_tensor_slices` method.



In [20]:
# Applied to an array, it returns a dataset of scalars
# (one element per entry of `feature1`).
feature1_ds = tf.data.Dataset.from_tensor_slices(feature1)

# Applied to a tuple of arrays, it returns a dataset of tuples:
# each element is one (feature0, feature1, feature2, feature3) observation.
features_dataset = tf.data.Dataset.from_tensor_slices((feature0, feature1, feature2, feature3))


In [22]:
# Use `take(2)` to pull only the first two examples from the dataset.
# Each example unpacks into four scalar tensors, one per feature.
for f0, f1, f2, f3 in features_dataset.take(2):
    print(f0)
    print(f1)
    print(f2)
    print(f3)
    

tf.Tensor(False, shape=(), dtype=bool)
tf.Tensor(3, shape=(), dtype=int64)
tf.Tensor(b'horse', shape=(), dtype=string)
tf.Tensor(-1.2120519723026093, shape=(), dtype=float64)
tf.Tensor(True, shape=(), dtype=bool)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(b'chicken', shape=(), dtype=string)
tf.Tensor(0.5513744864195331, shape=(), dtype=float64)


In [24]:
# `tf.data.Dataset.map` applies a function to each element of a `Dataset`.
# The mapped function must operate in TensorFlow graph mode: it has to take and
# return `tf.Tensor`s, so the plain-Python serializer is wrapped in `tf.py_func`.
def tf_serialize_example(f0, f1, f2, f3):
    """Graph-mode wrapper around `serialize_example` for `Dataset.map`.

    Args:
        f0, f1, f2, f3: The four feature tensors of one dataset element.

    Returns:
        A scalar `tf.string` tensor holding the serialized `tf.Example`.
    """
    serialized = tf.py_func(
        func=serialize_example,
        inp=(f0, f1, f2, f3),   # forwarded as the arguments of `serialize_example`
        Tout=tf.string)         # the wrapped function returns a `tf.string`
    # Reshape to `()` so the result is a scalar.
    return tf.reshape(serialized, shape=())


In [25]:
# Map the serializer over every (feature0, feature1, feature2, feature3)
# element; the result is a dataset of scalar `tf.string` values.
serialized_features_dataset = features_dataset.map(tf_serialize_example)
serialized_features_dataset


<MapDataset shapes: (), types: tf.string>

In [27]:
# Write them to a `TFRecord` file:
# `filename` is a relative path and is reused later when reading the file back.
filename = 'test.tfrecord'
writer = tf.contrib.data.TFRecordWriter(filename)
# The writer consumes the whole dataset of serialized `tf.Example` strings.
writer.write(serialized_features_dataset)


## 2.2 Reading a TFRecord file


In [28]:
# Read the records back from the file written earlier, passed as a one-element
# list of file names. Elements are still scalar `tf.string` tensors.
filenames = [filename]
raw_dataset = tf.data.TFRecordDataset(filenames)
raw_dataset



<TFRecordDataset shapes: (), types: tf.string>

In [29]:
# Peek at the first ten records: each one is still a raw serialized
# `tf.Example` byte string that has not been parsed yet.
for raw_record in raw_dataset.take(10):
    print(repr(raw_record))


<tf.Tensor: id=132, shape=(), dtype=string, numpy=b'\nS\n\x11\n\x08feature1\x12\x05\x1a\x03\n\x01\x03\n\x15\n\x08feature2\x12\t\n\x07\n\x05horse\n\x14\n\x08feature3\x12\x08\x12\x06\n\x04\x85$\x9b\xbf\n\x11\n\x08feature0\x12\x05\x1a\x03\n\x01\x00'>
<tf.Tensor: id=134, shape=(), dtype=string, numpy=b'\nU\n\x11\n\x08feature1\x12\x05\x1a\x03\n\x01\x02\n\x17\n\x08feature2\x12\x0b\n\t\n\x07chicken\n\x14\n\x08feature3\x12\x08\x12\x06\n\x04\xe1&\r?\n\x11\n\x08feature0\x12\x05\x1a\x03\n\x01\x01'>
<tf.Tensor: id=136, shape=(), dtype=string, numpy=b'\nS\n\x11\n\x08feature0\x12\x05\x1a\x03\n\x01\x00\n\x11\n\x08feature1\x12\x05\x1a\x03\n\x01\x03\n\x15\n\x08feature2\x12\t\n\x07\n\x05horse\n\x14\n\x08feature3\x12\x08\x12\x06\n\x04\x13\x08\xc2\xbf'>
<tf.Tensor: id=138, shape=(), dtype=string, numpy=b'\nR\n\x14\n\x08feature2\x12\x08\n\x06\n\x04goat\n\x14\n\x08feature3\x12\x08\x12\x06\n\x04\rS\xdb\xbf\n\x11\n\x08feature0\x12\x05\x1a\x03\n\x01\x00\n\x11\n\x08feature1\x12\x05\x1a\x03\n\x01\x04'>
<tf.Tenso

In [30]:
# These tensors can be parsed using the function below.
# Create a description of the features: one scalar (shape `[]`) entry per key
# written by `serialize_example`.
# Note the dtypes are those of the stored lists, not of the original arrays:
# the boolean feature0 comes back as int64 (0 / 1) and the float64 feature3
# comes back as float32.
feature_description = {
    'feature0': tf.FixedLenFeature([], tf.int64, default_value=0),
    'feature1': tf.FixedLenFeature([], tf.int64, default_value=0),
    'feature2': tf.FixedLenFeature([], tf.string, default_value=''),
    'feature3': tf.FixedLenFeature([], tf.float32, default_value=0.0),
}


def _parse_function(_example_proto):
    """Decode one serialized `tf.Example` into a dict of feature tensors.

    Args:
        _example_proto: A scalar string tensor holding a serialized
            `tf.Example`.

    Returns:
        A dict keyed like `feature_description`, mapping each feature name to
        its parsed tensor.
    """
    parsed_features = tf.parse_single_example(_example_proto, feature_description)
    return parsed_features


# Apply the parser to every raw record; each element becomes a dict of
# scalar tensors keyed by feature name.
parsed_dataset = raw_dataset.map(_parse_function)
parsed_dataset 


<MapDataset shapes: {feature1: (), feature0: (), feature3: (), feature2: ()}, types: {feature1: tf.int64, feature0: tf.int64, feature3: tf.float32, feature2: tf.string}>

In [31]:
# Peek at the first ten parsed records to confirm that the values written
# earlier are recovered (as int64 / float32 / string tensors).
for parsed_record in parsed_dataset.take(10):
    print(repr(parsed_record))


{'feature1': <tf.Tensor: id=197, shape=(), dtype=int64, numpy=3>, 'feature0': <tf.Tensor: id=196, shape=(), dtype=int64, numpy=0>, 'feature3': <tf.Tensor: id=199, shape=(), dtype=float32, numpy=-1.212052>, 'feature2': <tf.Tensor: id=198, shape=(), dtype=string, numpy=b'horse'>}
{'feature1': <tf.Tensor: id=205, shape=(), dtype=int64, numpy=2>, 'feature0': <tf.Tensor: id=204, shape=(), dtype=int64, numpy=1>, 'feature3': <tf.Tensor: id=207, shape=(), dtype=float32, numpy=0.5513745>, 'feature2': <tf.Tensor: id=206, shape=(), dtype=string, numpy=b'chicken'>}
{'feature1': <tf.Tensor: id=213, shape=(), dtype=int64, numpy=3>, 'feature0': <tf.Tensor: id=212, shape=(), dtype=int64, numpy=0>, 'feature3': <tf.Tensor: id=215, shape=(), dtype=float32, numpy=-1.5158714>, 'feature2': <tf.Tensor: id=214, shape=(), dtype=string, numpy=b'horse'>}
{'feature1': <tf.Tensor: id=221, shape=(), dtype=int64, numpy=4>, 'feature0': <tf.Tensor: id=220, shape=(), dtype=int64, numpy=0>, 'feature3': <tf.Tensor: id=22

# 3. TFRecord files using `tf.python_io`


# 4. Walkthrough: Reading/Writing Image data
