# Working with datasets

In [1]:
import tensorflow as tf
import os
import pathlib
import scipy.ndimage as ndimage
import numpy as np
import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'tensorflow'

## Creating dataset

Construct a Dataset from data in memory:<br>
`tf.data.Dataset.from_tensors()`<br>
`tf.data.Dataset.from_tensor_slices()   # from any tensor-like object (numpy array, list, dictionary)`<br>

Construct from a .tfrec file (for large datasets that do not fit in memory): <br>
`tf.data.TFRecordDataset()`<br>

Construct from generator:<br>
`tf.data.Dataset.from_generator()`<br>

Construct from text files: <br>
`tf.data.TextLineDataset()` <br>

Construct from CSV file: <br>
`tf.data.experimental.make_csv_dataset()`<br>
`tf.data.experimental.CsvDataset` <-- lower-level for finer grained control<br>
or (if the dataset fits into memory) you can consume file with pandas and then use:<br> 
`tf.data.Dataset.from_tensor_slices(dict(df))`

In [None]:
# Build a dataset whose elements are the individual items of an in-memory list.
dataset = tf.data.Dataset.from_tensor_slices([8, 3, 0, 8, 2, 1])
# Trailing bare expression: the notebook displays the dataset's repr.
dataset

In [None]:
# The Dataset object is a Python iterable --> use a for loop
for elem in dataset:
  # .numpy() turns the element tensor into its NumPy value for printing.
  print(elem.numpy())

# or python iterator
# iter() starts a fresh pass over the dataset; next() pulls a single element.
it = iter(dataset)
print(next(it).numpy())

### Dataset structure
 - Each element is the same nested structure of components.<br>
 E.g. tuple, dict, NamedTuple, OrderedDict **(but NOT list!)**
 - Each component can be any type representable by tf.TypeSpec <br>
 (tf.Tensor, tf.sparse.SparseTensor, tf.RaggedTensor, tf.TensorArray, or tf.data.Dataset).<br>

In [None]:
# Slice a tuple of two tensors along their shared first axis (length 4):
# each element pairs one float from the first tensor with one row of
# 100 int32 values from the second.
dataset2 = tf.data.Dataset.from_tensor_slices(
   (tf.random.uniform([4]),
    tf.random.uniform([4, 100], maxval=100, dtype=tf.int32)))

# element_spec describes the type of each component of a single element.
dataset2.element_spec

In [None]:
# Dataset containing a sparse tensor.
# from_tensors() keeps the whole SparseTensor as one single element
# (unlike from_tensor_slices(), it does not slice along the first axis).
dataset3 = tf.data.Dataset.from_tensors(tf.SparseTensor(indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4]))
dataset3.element_spec

In [None]:
# Use value_type to see the type of value represented by the element spec
# (dataset3 was built from a tf.SparseTensor).
dataset3.element_spec.value_type

## Transformations
per-element transformations, eg.: <br>
`Dataset.map()` <br>

multi-element transformations, eg.: <br>
`Dataset.batch()`

reduce all elements to a single result, eg.: <br>
`print(dataset.reduce(0, lambda state, value: state + value).numpy())`

### map() 
Preprocessing data

In [None]:
# get labels from file path:
'''
b'/home/kbuilder/.keras/datasets/flower_photos/daisy/2045022175_ad087f5f60_n.jpg'
b'/home/kbuilder/.keras/datasets/flower_photos/tulips/19425920580_cdc8f49aed_n.jpg'
b'/home/kbuilder/.keras/datasets/flower_photos/dandelion/160456948_38c3817c6a_m.jpg'
'''

# Download (and cache) the flower photos archive, unpack it, and keep the
# resulting location as a Path so it can be joined with a glob pattern.
flowers_root = pathlib.Path(
    tf.keras.utils.get_file(
        fname='flower_photos',
        origin='https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz',
        untar=True))

# One string element per entry matching <root>/<subdirectory>/<file>.
file_pattern = str(flowers_root/'*/*')
list_ds = tf.data.Dataset.list_files(file_pattern)

def preprocess(file_path):
  """Load one image file and derive its label from the file path.

  Args:
    file_path: Scalar string tensor with the path of a JPEG file whose
      parent directory name is the class label.

  Returns:
    A tuple `(image, label)`: `image` is a float32 tensor of shape
    [128, 128, 3] and `label` is a scalar string tensor holding the name
    of the parent directory.
  """
  # The label is the second-to-last path component (the parent directory).
  parts = tf.strings.split(file_path, os.sep)
  label = parts[-2]

  image = tf.io.read_file(file_path)
  # Force three channels: grayscale JPEGs are expanded to RGB and the
  # channel dimension becomes statically known, so every element has the
  # same shape and can be batched.
  image = tf.io.decode_jpeg(image, channels=3)
  # Convert the decoded uint8 pixels to float32 values.
  image = tf.image.convert_image_dtype(image, tf.float32)
  image = tf.image.resize(image, [128, 128])
  return image, label

# Run preprocess on every file path, yielding (image, label) pairs.
flower_ds = list_ds.map(preprocess)

def show(image, label):
  """Draw `image` in a new figure, titled with the decoded `label`.

  Args:
    image: Image data accepted by `plt.imshow`.
    label: Object whose `.numpy()` value is a UTF-8 encoded byte string.
  """
  plt.figure()
  plt.imshow(image)
  # The label arrives as bytes; decode it to text for the title.
  caption = label.numpy().decode('utf-8')
  plt.title(caption)
  plt.axis('off')

# Display the first two preprocessed images together with their labels.
for image, label in flower_ds.take(2):
  show(image, label)


For performance reasons, use TensorFlow operations for preprocessing your data whenever possible. However, it is sometimes useful to call external Python libraries when parsing your input data. 

You can use the `tf.py_function()` operation in a `Dataset.map()` transformation.

For example, if you want to apply a random rotation, the tf.image module only has tf.image.rot90, which is not very useful for image augmentation. 

To demonstrate tf.py_function, try using the scipy.ndimage.rotate function instead:

In [None]:
def random_rotate_image(image, max_angle=30):
  """Rotate an image by a random angle around its center.

  Args:
    image: Array-like image. The rotation happens in the plane of the
      first two axes, so a trailing channel axis is left in place.
    max_angle: The angle is drawn uniformly from [-max_angle, max_angle)
      degrees. Defaults to 30, the range that used to be hard-coded.

  Returns:
    The rotated image as a NumPy array with the same shape as `image`.
  """
  angle = np.random.uniform(-max_angle, max_angle)
  # reshape=False keeps the output the same size as the input instead of
  # growing it to fit the rotated corners.
  return ndimage.rotate(image, angle, reshape=False)

# Try the rotation eagerly on one example before using it in a pipeline.
image, label = next(iter(flower_ds))
image = random_rotate_image(image)
show(image, label)

To use this function with Dataset.map, the same caveats apply as with Dataset.from_generator: you need to describe the return shapes and types when you apply the function:

In [None]:
def tf_random_rotate_image(image, label):
  """Wrap `random_rotate_image` so it can be used inside `Dataset.map`.

  Args:
    image: Image tensor to rotate.
    label: Passed through unchanged.

  Returns:
    A `(rotated_image, label)` tuple; the rotated image is declared as
    float32 and is given the static shape of the input image.
  """
  static_shape = image.shape
  (rotated,) = tf.py_function(
      func=random_rotate_image, inp=[image], Tout=[tf.float32])
  # The py_function output has no static shape; copy it from the input.
  rotated.set_shape(static_shape)
  return rotated, label

# Apply the py_function-backed rotation to every (image, label) pair.
rot_ds = flower_ds.map(tf_random_rotate_image)

# Display the first two randomly rotated images.
for image, label in rot_ds.take(2):
  show(image, label)

Note: tensorflow_addons has a TensorFlow compatible rotate in tensorflow_addons.image.rotate.

### batch()

In [None]:
# Pair an ascending range with a descending one, then group every four
# consecutive pairs into one batch.
inc_dataset = tf.data.Dataset.range(100)
dec_dataset = tf.data.Dataset.range(0, -100, -1)
dataset = tf.data.Dataset.zip((inc_dataset, dec_dataset))
batched_dataset = dataset.batch(4)

# Each batch is a pair of tensors, one per zipped component.
for inc_batch, dec_batch in batched_dataset.take(4):
  print([inc_batch.numpy(), dec_batch.numpy()])

While tf.data tries to propagate shape information, the default settings of Dataset.batch result in an unknown batch size because the last batch may not be full. Note the Nones in the shape:

In [None]:
# Display the dataset's repr to inspect the shapes in its element spec.
batched_dataset

In [None]:
# ignore that last batch, and get full shape propagation:
# drop_remainder=True discards a final batch with fewer than 7 elements,
# so the batch dimension is statically known.
batched_dataset = dataset.batch(7, drop_remainder=True)
batched_dataset

In [None]:
# Batching tensors with padding for elements of varying size
# Element x becomes a vector holding x copies of x, so lengths differ.
dataset = tf.data.Dataset.range(100).map(
    lambda x: tf.fill([tf.cast(x, tf.int32)], x))
# padded_shapes=(None,) pads each vector to the longest one in its batch.
dataset = dataset.padded_batch(4, padded_shapes=(None,))

for batch in dataset.take(2):
  # Leave an empty line after every printed batch.
  print(batch.numpy(), end='\n\n')

### repeat() 
With no arguments, this repeats the input indefinitely.

In [None]:
def plot_batch_sizes(ds):
  """Draw a bar chart with the size of every batch in `ds`.

  Args:
    ds: Finite iterable of batches; each batch exposes `shape`, and
      `shape[0]` is taken as its size.
  """
  sizes = []
  for batch in ds:
    sizes.append(batch.shape[0])

  positions = range(len(sizes))
  plt.bar(positions, sizes)
  plt.xlabel('Batch number')
  plt.ylabel('Batch size')

# Download (and cache) the Titanic training CSV and read it line by line.
titanic_file = tf.keras.utils.get_file("train.csv", "https://storage.googleapis.com/tf-datasets/titanic/train.csv")
titanic_lines = tf.data.TextLineDataset(titanic_file)

# repeat 3 times and then batch
# Batching happens after repeating, so batches can straddle epoch boundaries.
titanic_batches = titanic_lines.repeat(3).batch(128)
plot_batch_sizes(titanic_batches)

In [None]:
# batch and then repeat 3 times
# Each epoch is batched on its own, so batches never mix lines from two epochs.
titanic_batches = titanic_lines.batch(128).repeat(3)
plot_batch_sizes(titanic_batches)

If you would like to perform a custom computation (e.g. to collect statistics) at the end of each epoch then it's simplest to restart the dataset iteration on each epoch:

In [None]:
epochs = 3
dataset = titanic_lines.batch(128)

# Restart the iteration over the dataset once per epoch (no repeat()).
for epoch in range(epochs):
  for batch in dataset:
    print(batch.shape)
  # Reached only after the inner loop has exhausted the dataset, so
  # per-epoch computations can go here.
  print("End of epoch: ", epoch)


### Dataset.shuffle()
Maintains a fixed-size buffer and chooses the next element uniformly at random from that buffer.<br>
While large buffer_sizes shuffle more thoroughly, they can take a lot of memory, and significant time to fill. Consider using Dataset.interleave() across files if this becomes a problem.

In [None]:
lines = tf.data.TextLineDataset(titanic_file)
counter = tf.data.experimental.Counter()

# Tag every line with a running index, then shuffle and batch the pairs.
dataset = (
    tf.data.Dataset.zip((counter, lines))
    .shuffle(buffer_size=100)
    .batch(20))
dataset

Since the buffer_size is 100, and the batch size is 20, the first batch contains no elements with an index over 120.

In [None]:
# Pull the first batch and print only the indices it contains.
n,line_batch = next(iter(dataset))
print(n.numpy())

Dataset.shuffle doesn't signal the end of an epoch until the shuffle buffer is empty. So a shuffle placed before a repeat will show every element of one epoch before moving to the next. Repeat before a shuffle mixes the epoch boundaries together.

In [None]:
# Pair each line with a running index so the order of elements can be traced.
dataset = tf.data.Dataset.zip((counter, lines))

# shuffle --> repeat
shuffled = dataset.shuffle(buffer_size=100).batch(10).repeat(2)
# Mean index of every batch, collected for the plot at the end of the cell.
shuffle_repeat = [n.numpy().mean() for n, line_batch in shuffled]

# Print item IDs near the epoch boundary.
# NOTE(review): skip(60) presumably lands just before the end of the first
# epoch; that depends on the number of lines in the file -- confirm.
print("shuffle --> repeat")
for n, line_batch in shuffled.skip(60).take(5):
  print(n.numpy())

# repeat --> shuffle
shuffled = dataset.repeat(2).shuffle(buffer_size=100).batch(10)
# Same per-batch mean index, now with the epochs mixed by the shuffle.
repeat_shuffle = [n.numpy().mean() for n, line_batch in shuffled]

print("\nrepeat --> shuffle")
for n, line_batch in shuffled.skip(55).take(5):
  print(n.numpy())


# Compare how the mean item ID per batch evolves under the two orderings.
plt.plot(shuffle_repeat, label="shuffle().repeat()")
plt.plot(repeat_shuffle, label="repeat().shuffle()")
plt.ylabel("Mean item ID")
plt.legend()

## Resampling
When you have a very imbalanced dataset, you might want to split it into classes and then randomly sample from them to get a balanced representation.

In [None]:
# Download (and cache) the credit-card fraud archive and extract it.
zip_path = tf.keras.utils.get_file(
    fname='creditcard.zip',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/creditcard.zip',
    extract=True)

# The CSV is expected under the same name as the archive, with a .csv suffix.
csv_path = zip_path.replace('.zip', '.csv')

# Set the column types: 30 floats and an int.
column_types = [float()] * 30 + [int()]
creditcard_ds = tf.data.experimental.make_csv_dataset(
    csv_path, batch_size=1024, label_name="Class",
    column_defaults=column_types)


def _examples_with_label(wanted):
  """Return an endless dataset of single examples labelled `wanted`."""
  per_example = creditcard_ds.unbatch()
  matching = per_example.filter(lambda features, label: label == wanted)
  return matching.repeat()


# negative = 99.73%, positive = 0.27%
negative_ds = _examples_with_label(0)
positive_ds = _examples_with_label(1)

# Draw from both per-class streams with equal weight, ten examples per batch.
balanced_ds = tf.data.Dataset.sample_from_datasets(
    [negative_ds, positive_ds], [0.5, 0.5]).batch(10)

# Now the dataset produces examples of each class with 50/50 probability
for features, labels in balanced_ds.take(10):
  print(labels.numpy())


Using Dataset.filter works, but results in all the data being loaded twice. To avoid this, you can drop elements to achieve balance instead.

The resampler used below, `tf.data.experimental.rejection_resample`, takes a `class_func` argument. This `class_func` is applied to each dataset element and determines which class an example belongs to for the purposes of balancing.

In [None]:
def class_func(features, label):
  """Return the class of one example, which is simply its label.

  Args:
    features: Feature data of the example (not used).
    label: Label of the example.

  Returns:
    `label`, unchanged.
  """
  example_class = label
  return example_class

def count(counts, batch):
  """Add the number of class-0 and class-1 labels in `batch` to `counts`.

  Args:
    counts: Dict with the running totals under 'class_0' and 'class_1';
      it is updated in place.
    batch: `(features, labels)` pair; only `labels` is read.

  Returns:
    The same `counts` dict with both totals increased.
  """
  _, labels = batch

  # The comparisons yield booleans; cast to int32 so they can be summed.
  zeros_in_batch = tf.reduce_sum(tf.cast(labels == 0, tf.int32))
  ones_in_batch = tf.reduce_sum(tf.cast(labels == 1, tf.int32))

  counts['class_0'] += zeros_in_batch
  counts['class_1'] += ones_in_batch
  return counts

# Estimate the class distribution from the first 10 batches only.
counts = creditcard_ds.take(10).reduce(
    initial_state={'class_0': 0, 'class_1': 0},
    reduce_func = count)

# Turn the two tensor totals into a float32 NumPy vector [class_0, class_1].
counts = np.array([counts['class_0'].numpy(),
                   counts['class_1'].numpy()]).astype(np.float32)

fractions = counts/counts.sum() # [0.9973 0.0027]

# Build a transformation that resamples towards a 50/50 class split, using
# the measured fractions as the initial distribution.
resampler = tf.data.experimental.rejection_resample(
    class_func, target_dist=[0.5, 0.5], initial_dist=fractions)

# The resampler deals with individual examples, so you must unbatch the dataset before applying the resampler:
resample_ds = creditcard_ds.unbatch().apply(resampler).batch(10)

# The resampler creates (class, example) pairs from the output of the class_func.
# In this case, the example was already a (feature, label) pair,
# so use map to drop the extra copy of the labels:
balanced_ds = resample_ds.map(lambda extra_label, features_and_label: features_and_label)

# Now the dataset produces examples of each class with 50/50 probability
for features, labels in balanced_ds.take(10):
  print(labels.numpy())

## Iterator Checkpointing
In addition to checkpointing the model variables, you can also checkpoint the progress of the dataset iterator. This could be useful if you have a large dataset and don't want to start the dataset from the beginning on each restart. Note however that iterator checkpoints may be large, since transformations such as shuffle and prefetch require buffering elements within the iterator. 

Note: It is not possible to checkpoint an iterator which relies on external state such as a tf.py_function. 

In [None]:
range_ds = tf.data.Dataset.range(20)

# Track the iterator itself in the checkpoint, next to an example variable.
iterator = iter(range_ds)
ckpt = tf.train.Checkpoint(step=tf.Variable(0), iterator=iterator)
# Checkpoints are written under /tmp/my_ckpt; at most three are kept.
manager = tf.train.CheckpointManager(ckpt, '/tmp/my_ckpt', max_to_keep=3)
# Consume the first five elements before saving.
print([next(iterator).numpy() for _ in range(5)])

# Save the iterator position (five elements consumed so far).
save_path = manager.save()
# Advance five more elements past the saved position.
print([next(iterator).numpy() for _ in range(5)])

# Restoring rewinds the iterator to the saved position, so the next five
# elements repeat the ones printed just above.
ckpt.restore(manager.latest_checkpoint)
print([next(iterator).numpy() for _ in range(5)])

## Using tf.data with tf.keras
`train_ds = tf.data.Dataset.from_tensor_slices((images, labels))`<br>
...

`model.fit(train_ds, epochs=2)`<br>

for infinite dataset:<br>
`model.fit(train_ds.repeat(), epochs=2, steps_per_epoch=20)`

`loss, accuracy = model.evaluate(train_ds)`<br>

For long datasets, set the number of steps to evaluate:<br>
`loss, accuracy = model.evaluate(train_ds.repeat(), steps=10)`

The labels are not required when predicting:<br>
`predict_ds = tf.data.Dataset.from_tensor_slices(images).batch(32)`<br>
`result = model.predict(predict_ds, steps = 10)`

But if you do pass a dataset containing them, the labels are ignored:<br>
`result = model.predict(train_ds, steps = 10)`
