# Load and parse data with TensorFlow 2.0 (tf.data)

A TensorFlow 2.0 example to build input pipelines for loading data efficiently.

- Numpy Arrays
- Images
- CSV file
- Custom data from a Generator

For more information about creating and loading TensorFlow's TFRecords data format, see: tfrecords.ipynb

In [2]:
import numpy as np
import random
import requests
import string
import tarfile
import tensorflow as tf

# Load Numpy Arrays

Build a data pipeline over numpy arrays.

In [11]:
# Create a toy dataset (even and odd numbers, with respective labels of 0 and 1).
evens = np.arange(0, 100, step=2, dtype=np.int32)
evens_label = np.zeros(50, dtype=np.int32)
odds = np.arange(1, 100, step=2, dtype=np.int32)
odds_label = np.ones(50, dtype=np.int32)
# Concatenate arrays
features = np.concatenate([evens, odds])
labels = np.concatenate([evens_label, odds_label])

# Load a numpy array using tf data api with `from_tensor_slices`.
data = tf.data.Dataset.from_tensor_slices((features, labels))
# Refill data indefinitely.
data = data.repeat()
# Shuffle data.
data = data.shuffle(buffer_size=100)
# Batch data (aggregate records together).
data = data.batch(batch_size=4)
# Prefetch batch (pre-load batch for faster consumption).
data = data.prefetch(buffer_size=1)

In [13]:

for batch_x, batch_y in data.take(4):
    print(batch_x, batch_y)

tf.Tensor([77 54 37 78], shape=(4,), dtype=int32) tf.Tensor([1 0 1 0], shape=(4,), dtype=int32)
tf.Tensor([10 40 52 15], shape=(4,), dtype=int32) tf.Tensor([0 0 0 1], shape=(4,), dtype=int32)
tf.Tensor([91 63 16 21], shape=(4,), dtype=int32) tf.Tensor([1 1 0 1], shape=(4,), dtype=int32)
tf.Tensor([ 2 25 72  0], shape=(4,), dtype=int32) tf.Tensor([0 1 0 0], shape=(4,), dtype=int32)


In [14]:

# Note: If you are planning on calling multiple time,
# you can user the iterator way:
ite_data = iter(data)
for i in range(5):
    batch_x, batch_y = next(ite_data)
    print(batch_x, batch_y)

for i in range(5):
    batch_x, batch_y = next(ite_data)
    print(batch_x, batch_y)

tf.Tensor([44 13 24 85], shape=(4,), dtype=int32) tf.Tensor([0 1 0 1], shape=(4,), dtype=int32)
tf.Tensor([15 49 98 52], shape=(4,), dtype=int32) tf.Tensor([1 1 0 0], shape=(4,), dtype=int32)
tf.Tensor([95  6 89 14], shape=(4,), dtype=int32) tf.Tensor([1 0 1 0], shape=(4,), dtype=int32)
tf.Tensor([10 67 12  8], shape=(4,), dtype=int32) tf.Tensor([0 1 0 0], shape=(4,), dtype=int32)
tf.Tensor([94 10 76  1], shape=(4,), dtype=int32) tf.Tensor([0 0 0 1], shape=(4,), dtype=int32)
tf.Tensor([19 81 91  7], shape=(4,), dtype=int32) tf.Tensor([1 1 1 1], shape=(4,), dtype=int32)
tf.Tensor([43 93 48 29], shape=(4,), dtype=int32) tf.Tensor([1 1 0 1], shape=(4,), dtype=int32)
tf.Tensor([68 80 90 63], shape=(4,), dtype=int32) tf.Tensor([0 0 0 1], shape=(4,), dtype=int32)
tf.Tensor([51 56 35 99], shape=(4,), dtype=int32) tf.Tensor([1 0 1 1], shape=(4,), dtype=int32)
tf.Tensor([71 18 55 64], shape=(4,), dtype=int32) tf.Tensor([1 0 1 0], shape=(4,), dtype=int32)


# Load CSV files

Build a data pipeline from features stored in a CSV file. For this example, Titanic dataset will be used as a toy dataset stored in CSV format.

In [15]:

# Download Titanic dataset (in csv format).
d = requests.get("https://raw.githubusercontent.com/tflearn/tflearn.github.io/master/resources/titanic_dataset.csv")
with open("titanic_dataset.csv", "wb") as f:
    f.write(d.content)

In [35]:

# Load Titanic dataset.
# Original features: survived,pclass,name,sex,age,sibsp,parch,ticket,fare
# Select specific columns: survived,pclass,name,sex,age,fare
column_to_use = [0, 1, 2, 3, 4, 8]
record_defaults = [tf.int32, tf.int32, tf.string, tf.string, tf.float32, tf.float32]

# Load the whole dataset file, and slice each line.
data = tf.data.experimental.CsvDataset("titanic_dataset.csv", record_defaults, header=True, select_cols=column_to_use)
# Refill data indefinitely.
data = data.repeat()
# Shuffle data.
data = data.shuffle(buffer_size=1000)
# Batch data (aggregate records together).
data = data.batch(batch_size=2)
# Prefetch batch (pre-load batch for faster consumption).
data = data.prefetch(buffer_size=1)

In [36]:
for survived, pclass, name, sex, age, fare in data.take(1):
    print(survived.numpy())
    print(pclass.numpy())
    print(name.numpy())
    print(sex.numpy())
    print(age.numpy())
    print(fare.numpy())

[1 0]
[2 3]
[b'Herman, Mrs. Samuel (Jane Laver)' b'Ilmakangas, Miss. Ida Livija']
[b'female' b'female']
[48. 27.]
[65.     7.925]


# Load Images

Build a data pipeline by loading images from disk. For this example, Oxford Flowers dataset will be used.

In [37]:
# Download Oxford 17 flowers dataset
d = requests.get("http://www.robots.ox.ac.uk/~vgg/data/flowers/17/17flowers.tgz")
with open("17flowers.tgz", "wb") as f:
    f.write(d.content)
# Extract archive.
with tarfile.open("17flowers.tgz") as t:
    t.extractall()

In [38]:
with open('jpg/dataset.csv', 'w') as f:
    c = 0
    for i in range(1360):
        f.write("jpg/image_%04i.jpg,%i\n" % (i+1, c))
        if (i+1) % 80 == 0:
            c += 1

In [39]:

# Load Images
with open("jpg/dataset.csv") as f:
    dataset_file = f.read().splitlines()

# Load the whole dataset file, and slice each line.
data = tf.data.Dataset.from_tensor_slices(dataset_file)
# Refill data indefinitely.
data = data.repeat()
# Shuffle data.
data = data.shuffle(buffer_size=1000)

# Load and pre-process images.
def load_image(path):
    # Read image from path.
    image = tf.io.read_file(path)
    # Decode the jpeg image to array [0, 255].
    image = tf.image.decode_jpeg(image)
    # Resize images to a common size of 256x256.
    image = tf.image.resize(image, [256, 256])
    # Rescale values to [-1, 1].
    image = 1. - image / 127.5
    return image
# Decode each line from the dataset file.
def parse_records(line):
    # File is in csv format: "image_path,label_id".
    # TensorFlow requires a default value, but it will never be used.
    image_path, image_label = tf.io.decode_csv(line, ["", 0])
    # Apply the function to load images.
    image = load_image(image_path)
    return image, image_label
# Use 'map' to apply the above functions in parallel.
data = data.map(parse_records, num_parallel_calls=4)

# Batch data (aggregate images-array together).
data = data.batch(batch_size=2)
# Prefetch batch (pre-load batch for faster consumption).
data = data.prefetch(buffer_size=1)

In [41]:
for batch_x, batch_y in data.take(1):
    print(batch_x, batch_y)

tf.Tensor(
[[[[ 0.34748775  0.06513482  0.34748775]
   [ 0.35533088  0.06960785  0.37775737]
   [ 0.34448528  0.0699755   0.3897059 ]
   ...
   [ 0.4196691   0.25496322  0.58621323]
   [ 0.45851713  0.27028185  0.65569854]
   [ 0.47053224  0.28229696  0.6744538 ]]

  [[ 0.38229167  0.09344363  0.37904412]
   [ 0.39350492  0.10268217  0.41268384]
   [ 0.39213914  0.11362064  0.43535537]
   ...
   [ 0.36305147  0.1983456   0.5295956 ]
   [ 0.39405638  0.2058211   0.5912378 ]
   [ 0.409375    0.22113973  0.61329657]]

  [[ 0.4353817   0.12165624  0.4118523 ]
   [ 0.43884802  0.14080882  0.45343137]
   [ 0.44332105  0.15312499  0.48069853]
   ...
   [ 0.36481935  0.1844272   0.5392066 ]
   [ 0.3849442   0.19670892  0.58212554]
   [ 0.4001838   0.21194851  0.60410535]]

  ...

  [[ 0.55012256  0.61286765  0.71482843]
   [ 0.41425925  0.47700435  0.5789651 ]
   [ 0.37647057  0.43137252  0.5568627 ]
   ...
   [ 0.18499684  0.20852625  0.25374687]
   [ 0.13633579  0.16770834  0.16660541]
   [ 

# Load data from a Generator

In [42]:
# Create a dummy generator.
def generate_features():
    # Function to generate a random string.
    def random_string(length):
        return ''.join(random.choice(string.ascii_letters) for m in range(length))
    # Return a random string, a random vector, and a random int.
    yield random_string(4), np.random.uniform(size=4), random.randint(0, 10)

In [43]:

# Load a numpy array using tf data api with `from_tensor_slices`.
data = tf.data.Dataset.from_generator(generate_features, output_types=(tf.string, tf.float32, tf.int32))
# Refill data indefinitely.
data = data.repeat()
# Shuffle data.
data = data.shuffle(buffer_size=100)
# Batch data (aggregate records together).
data = data.batch(batch_size=4)
# Prefetch batch (pre-load batch for faster consumption).
data = data.prefetch(buffer_size=1)

In [44]:

# Display data.
for batch_str, batch_vector, batch_int in data.take(5):
    print(batch_str, batch_vector, batch_int)

tf.Tensor([b'nAHY' b'Coyh' b'uxwg' b'Nsby'], shape=(4,), dtype=string) tf.Tensor(
[[0.2489901  0.5929806  0.8466293  0.3830717 ]
 [0.93540996 0.13090135 0.65848845 0.80020654]
 [0.44918388 0.57991904 0.87788606 0.7736005 ]
 [0.39685148 0.6460428  0.18117687 0.85636437]], shape=(4, 4), dtype=float32) tf.Tensor([1 8 7 7], shape=(4,), dtype=int32)
tf.Tensor([b'tjRB' b'aVQy' b'wxwy' b'Mjvq'], shape=(4,), dtype=string) tf.Tensor(
[[0.3744202  0.84489506 0.13605866 0.8272498 ]
 [0.14347935 0.39562988 0.04686269 0.40373808]
 [0.25841206 0.18076964 0.9293564  0.96094453]
 [0.73246574 0.22075218 0.5999108  0.8323621 ]], shape=(4, 4), dtype=float32) tf.Tensor([1 1 9 6], shape=(4,), dtype=int32)
tf.Tensor([b'LNhW' b'jPsY' b'ZZOj' b'QcyK'], shape=(4,), dtype=string) tf.Tensor(
[[0.42397648 0.96731174 0.3776846  0.5525027 ]
 [0.66115    0.38694072 0.289318   0.09513511]
 [0.22792904 0.7793351  0.40683118 0.8259607 ]
 [0.317663   0.2975126  0.41752937 0.973576  ]], shape=(4, 4), dtype=float32) tf.Te