In [307]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.data import Dataset

# Create dataset pipeline from numpy or lists

Tensorflow dataset is used work with large datasets and to "create complex pipelines from simple, reusable pieces". instead of loading your entire data on the start, it loads each batch when needed.
This comes in handy when working with large amounts of data, which in machine learning is most of the time. think about it like using a file stream, you dont read the entire file but instead you read it line by line. 

In [303]:
# From lists or numpy arrays
data = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
dataset = Dataset.from_tensor_slices(data)

for i in dataset:
    print(i)

tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=int32)
tf.Tensor(11, shape=(), dtype=int32)
tf.Tensor(12, shape=(), dtype=int32)
tf.Tensor(13, shape=(), dtype=int32)
tf.Tensor(14, shape=(), dtype=int32)
tf.Tensor(15, shape=(), dtype=int32)


# Applying instructions (transformations)

Because your data is loaded only when accessed, you can specify a set of instructions that your data will go through right after you load it.

Your instructions would execute only when your dataset is being accessed, an example of it is when you iterate on your dataset yourself or when fitting a model.

When you access your dataset it should execute all instructions (I.e. map, etc) per batch. It doesn’t do it for all the dataset before you need it, that’s the point of the dataset API.

In [304]:
# Map
dataset = dataset.map(lambda x: (x, x*2))

for x, y in dataset:
    print(x, y)

tf.Tensor(1, shape=(), dtype=int32) tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32) tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32) tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32) tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32) tf.Tensor(10, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32) tf.Tensor(12, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32) tf.Tensor(14, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32) tf.Tensor(16, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32) tf.Tensor(18, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=int32) tf.Tensor(20, shape=(), dtype=int32)
tf.Tensor(11, shape=(), dtype=int32) tf.Tensor(22, shape=(), dtype=int32)
tf.Tensor(12, shape=(), dtype=int32) tf.Tensor(24, shape=(), dtype=int32)
tf.Tensor(13, shape=(), dtype=int32) tf.Tensor(26, shape=(), dtype=int32)
tf.Tensor(14, shape=(), dtype=int32) tf.Tensor(28, 

In [305]:
# Shuffle
dataset = dataset.shuffle(5)

# Batch
dataset = dataset.batch(3)

for i in dataset:
    print(i)

(<tf.Tensor: shape=(3,), dtype=int32, numpy=array([5, 1, 2])>, <tf.Tensor: shape=(3,), dtype=int32, numpy=array([10,  2,  4])>)
(<tf.Tensor: shape=(3,), dtype=int32, numpy=array([ 4,  6, 10])>, <tf.Tensor: shape=(3,), dtype=int32, numpy=array([ 8, 12, 20])>)
(<tf.Tensor: shape=(3,), dtype=int32, numpy=array([ 9, 12,  3])>, <tf.Tensor: shape=(3,), dtype=int32, numpy=array([18, 24,  6])>)
(<tf.Tensor: shape=(3,), dtype=int32, numpy=array([13,  7, 11])>, <tf.Tensor: shape=(3,), dtype=int32, numpy=array([26, 14, 22])>)
(<tf.Tensor: shape=(3,), dtype=int32, numpy=array([14,  8, 15])>, <tf.Tensor: shape=(3,), dtype=int32, numpy=array([28, 16, 30])>)


In [301]:
# From generator function
def dataset_generator():
    for x in data:
        yield np.cos(x)

dataset = dataset.from_generator(dataset_generator, tf.float32)
for x in dataset:
    print(x)

tf.Tensor(0.5403023, shape=(), dtype=float32)
tf.Tensor(-0.41614684, shape=(), dtype=float32)
tf.Tensor(-0.9899925, shape=(), dtype=float32)
tf.Tensor(-0.6536436, shape=(), dtype=float32)
tf.Tensor(0.2836622, shape=(), dtype=float32)
tf.Tensor(0.96017027, shape=(), dtype=float32)
tf.Tensor(0.75390226, shape=(), dtype=float32)
tf.Tensor(-0.14550003, shape=(), dtype=float32)
tf.Tensor(-0.91113025, shape=(), dtype=float32)
tf.Tensor(-0.8390715, shape=(), dtype=float32)
tf.Tensor(0.004425698, shape=(), dtype=float32)
tf.Tensor(0.84385395, shape=(), dtype=float32)
tf.Tensor(0.9074468, shape=(), dtype=float32)
tf.Tensor(0.13673721, shape=(), dtype=float32)
tf.Tensor(-0.7596879, shape=(), dtype=float32)


# Working with Tensors

When doing transformations on a dataset pipeline, you are working with Tensors and not Numpy arrays and we need to adjust accordingly.

In [306]:
def func(x, y):
    return x, keras.utils.to_categorical(y, 31)

# one_hot instead of to_categorical
# dataset = dataset.map(func)

# Wrapping functions with py_function
dataset = dataset.map(lambda x, y: tf.py_function(func, [x, y], [tf.int32, tf.int32]))
for i in dataset:
    print(i)

(<tf.Tensor: shape=(3,), dtype=int32, numpy=array([3, 6, 2])>, <tf.Tensor: shape=(3, 31), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0]])>)
(<tf.Tensor: shape=(3,), dtype=int32, numpy=array([5, 7, 8])>, <tf.Tensor: shape=(3, 31), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0]])>)
(<tf.Tensor: shape=(3,), dtype=int32, numpy=array([ 1, 11,  4])>, <tf.Tensor: shape=(3, 31), dtype=int32, numpy=
a

# Training your model with the Dataset API
You can train your model with your dataset with ease! just pass it to the .fit() and your good to go.

In [None]:
train = dataset.skip(2)
valid = dataset.take(2)

model = keras.Model()
model.fit(train, epochs=5, validation_data=valid)