In [26]:
import tensorflow as tf

In [31]:
# A 3x3 int32 matrix. from_tensor_slices slices it along the first axis,
# so the resulting dataset yields three elements of shape (3,).
x = tf.constant([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
dataset = tf.data.Dataset.from_tensor_slices(x)
dataset  # bare last expression: display the dataset's element_spec

<_TensorSliceDataset element_spec=TensorSpec(shape=(3,), dtype=tf.int32, name=None)>

In [32]:
# Walk the dataset eagerly; every element is one row of the source matrix.
for row in dataset:
    print(row)

tf.Tensor([1 2 3], shape=(3,), dtype=int32)
tf.Tensor([4 5 6], shape=(3,), dtype=int32)
tf.Tensor([7 8 9], shape=(3,), dtype=int32)


Slicing nested structures (dictionaries, tuples, ...) retains the structure. This can be useful when slicing both inputs (X_train) and labels (y_train) at the same time.

In [33]:
# A dict of two tensors with the same leading dimension (3). Slicing keeps
# the dict structure: each dataset element is {'a': row_of_a, 'b': row_of_b}.
x_nested = {
    'a': tf.constant([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
    'b': tf.constant([[3, 2, 1], [6, 5, 4], [9, 8, 7]])
}
dataset2 = tf.data.Dataset.from_tensor_slices(x_nested)
dataset2  # bare last expression: display the nested element_spec

<_TensorSliceDataset element_spec={'a': TensorSpec(shape=(3,), dtype=tf.int32, name=None), 'b': TensorSpec(shape=(3,), dtype=tf.int32, name=None)}>

In [34]:
# Every element is a dict carrying the same keys as x_nested.
for sliced in dataset2:
    print(sliced)

{'a': <tf.Tensor: shape=(3,), dtype=int32, numpy=array([1, 2, 3], dtype=int32)>, 'b': <tf.Tensor: shape=(3,), dtype=int32, numpy=array([3, 2, 1], dtype=int32)>}
{'a': <tf.Tensor: shape=(3,), dtype=int32, numpy=array([4, 5, 6], dtype=int32)>, 'b': <tf.Tensor: shape=(3,), dtype=int32, numpy=array([6, 5, 4], dtype=int32)>}
{'a': <tf.Tensor: shape=(3,), dtype=int32, numpy=array([7, 8, 9], dtype=int32)>, 'b': <tf.Tensor: shape=(3,), dtype=int32, numpy=array([9, 8, 7], dtype=int32)>}


In [46]:
# Build each pipeline stage once and reuse it for both the size query and
# the iteration below.
repeated = dataset.repeat(4)   # 3 elements repeated 4 times
batched = repeated.batch(6)    # grouped into batches of 6 elements

print(dataset.cardinality())
print(repeated.cardinality())
print(batched.cardinality())
for batch in batched:
    print(batch)

tf.Tensor(3, shape=(), dtype=int64)
tf.Tensor(12, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(
[[1 2 3]
 [4 5 6]
 [7 8 9]
 [1 2 3]
 [4 5 6]
 [7 8 9]], shape=(6, 3), dtype=int32)
tf.Tensor(
[[1 2 3]
 [4 5 6]
 [7 8 9]
 [1 2 3]
 [4 5 6]
 [7 8 9]], shape=(6, 3), dtype=int32)


In [51]:
# Double every element; the map function is applied with two parallel calls.
doubled = dataset.map(lambda row: row * 2, num_parallel_calls=2)
for element in doubled:
    print(element)

tf.Tensor([2 4 6], shape=(3,), dtype=int32)
tf.Tensor([ 8 10 12], shape=(3,), dtype=int32)
tf.Tensor([14 16 18], shape=(3,), dtype=int32)


In [93]:
# Fixed seed so the shuffled order can be reproduced when the notebook is
# re-run from a fresh kernel (an unseeded shuffle changes on every run).
SHUFFLE_SEED = 42

# buffer_size=1: the shuffle buffer only ever holds the next element, so
# the original order is preserved.
for item in dataset.shuffle(buffer_size=1, seed=SHUFFLE_SEED):
    print(item)

print('----')
# buffer_size equal to the dataset size: all elements fit in the buffer,
# so the whole dataset is shuffled.
for item in dataset.shuffle(buffer_size=len(dataset), seed=SHUFFLE_SEED):
    print(item)

tf.Tensor([1 2 3], shape=(3,), dtype=int32)
tf.Tensor([4 5 6], shape=(3,), dtype=int32)
tf.Tensor([7 8 9], shape=(3,), dtype=int32)
----
tf.Tensor([4 5 6], shape=(3,), dtype=int32)
tf.Tensor([7 8 9], shape=(3,), dtype=int32)
tf.Tensor([1 2 3], shape=(3,), dtype=int32)


In [96]:
# list_files shuffles the matched paths by default, which makes the output
# order vary between runs; shuffle=False keeps the listing deterministic.
files = tf.data.Dataset.list_files('./*', shuffle=False)
for item in files:
    print(item)

tf.Tensor(b'./01-tf-data.ipynb', shape=(), dtype=string)


In [103]:
# Eight Python floats followed by a single empty float32 tensor.
# NOTE(review): this rebinds `x`, which earlier held the 3x3 tensor used to
# build `dataset` — consider a distinct name if that tensor is needed again.
x = [0.] * 8 + [tf.constant([], dtype=tf.float32)]
# Stack everything except the trailing empty tensor -> float32, shape (8,).
tf.stack(x[:-1])

<tf.Tensor: shape=(8,), dtype=float32, numpy=array([0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)>

# TFRecord

In [110]:
# Write two raw byte-string records to a GZIP-compressed TFRecord file.
options = tf.io.TFRecordOptions(compression_type='GZIP')
with tf.io.TFRecordWriter('my_data.tfrecord', options=options) as writer:
    for payload in (b'This is the first record', b'And this is the second record'):
        writer.write(payload)

In [111]:
# Read the records back. compression_type matches the GZIP option that was
# used when the file was written.
# NOTE(review): this rebinds `dataset`, which earlier referred to the
# tensor-slices dataset; re-running the earlier cells after this one will
# see the TFRecord dataset unless the defining cell is re-run first.
filepaths = ['my_data.tfrecord']
dataset = tf.data.TFRecordDataset(filepaths, compression_type='GZIP')
# Each element is a scalar string tensor holding one raw record.
for item in dataset:
    print(item)

tf.Tensor(b'This is the first record', shape=(), dtype=string)
tf.Tensor(b'And this is the second record', shape=(), dtype=string)


# Predefined protobufs

In [115]:
def _bytes_feature(*values):
    """Wrap one or more byte strings in a Feature holding a BytesList."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=list(values)))


def _int64_feature(*values):
    """Wrap one or more integers in a Feature holding an Int64List."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))


# Assemble the Example from its feature map: two bytes features and one
# int64 feature, keyed by field name.
person_features = tf.train.Features(
    feature={
        "name": _bytes_feature(b'Alice'),
        "id": _int64_feature(123),
        "emails": _bytes_feature(b'a@b.com', b'c@d.com'),
    }
)
person_example = tf.train.Example(features=person_features)
person_example

features {
  feature {
    key: "name"
    value {
      bytes_list {
        value: "Alice"
      }
    }
  }
  feature {
    key: "id"
    value {
      int64_list {
        value: 123
      }
    }
  }
  feature {
    key: "emails"
    value {
      bytes_list {
        value: "a@b.com"
        value: "c@d.com"
      }
    }
  }
}

In [116]:
# Serialize the Example protobuf to its binary representation (a bytes object).
person_example.SerializeToString()

b'\n@\n\x11\n\x04name\x12\t\n\x07\n\x05Alice\n\x1e\n\x06emails\x12\x14\n\x12\n\x07a@b.com\n\x07c@d.com\n\x0b\n\x02id\x12\x05\x1a\x03\n\x01{'