# Import Libraries

In [None]:
# import data manipulation libraries
import numpy as np
import pandas as pd

# import library to display plots/images
import matplotlib.pyplot as plt

# import tensorflow libraries
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds

# import standard libraries to deal with files
import pathlib
import shutil

# Utility Functions

In [None]:
# create huge numpy array and save it to disk
def make_huge_data(path: str,
                   shape: tuple = (3000, 256, 256, 3),
                   dtype=np.float64) -> None:
    """Write a random NumPy array to `path` unless that file already exists.

    Args:
        path: Destination file. The array is written to exactly this path.
        shape: Shape of the generated array.
        dtype: Element type of the saved array.
    """
    huge_data_file = pathlib.Path(path)

    # keep an existing file so that re-running the cell is cheap
    if huge_data_file.exists():
        return

    # copy=False avoids duplicating the whole array in memory when the random
    # values already have the requested dtype
    huge_data = np.random.rand(*shape).astype(dtype, copy=False)

    # writing through a file handle stops np.save from appending ".npy" to the
    # name, so the exists() check above always looks at the file we wrote
    with open(huge_data_file, 'wb') as out_file:
        np.save(out_file, huge_data)

In [None]:
# setup the huge data file
# path of the generated array; it is read again later when the array is loaded
HUGE_DATA_FILE = "/content/huge_data.npy"
# the file is only written when it is not on disk yet
make_huge_data(HUGE_DATA_FILE)

In [None]:
# shows original and augmented image side by side
def visualize_augmentation(original: tf.Tensor, augmented: tf.Tensor) -> None:
    """Draw `original` and `augmented` next to each other in a new figure."""
    plt.figure()
    panels = (('Original image', original), ('Augmented image', augmented))
    # left panel is position 1, right panel is position 2
    for position, (caption, picture) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.title(caption)
        plt.imshow(picture)

In [None]:
# returns a class name given the index number
def get_class_name(info: tfds.core.dataset_info.DatasetInfo,
                   class_num: int) -> str:
    """Look up the label name that `info` associates with index `class_num`."""
    label_feature = info.features['label']
    return label_feature.int2str(class_num)

# Outline
---
1. Basics of `tf.data.Dataset`
> 1.1 Methods for manipulating a `Dataset`\
> 1.2 Creating pipelines by chaining methods\
> 1.3 Working with a structured `Dataset`

2. Creating `Dataset` from data in memory/disk
> 2.1 `Dataset` from a NumPy array\
> 2.2 `Dataset` from a structured image directory\
> 2.3 `Dataset` from a CSV file

3. Create an image classification model using data pipelines
> 3.1 Load Dataset from TensorFlow Datasets\
> 3.2 Build the pipeline\
> 3.3 Define the model\
> 3.4 Train the model\
> 3.5 Manually evaluate the model

# Basic Mechanics of TF Datasets
---

Let's first create a dataset from a normal python list of 10 numbers

In [None]:
lst = list(range(10))
dataset = tf.data.Dataset.from_tensor_slices(lst)

If we look at the dataset object we just created, we can see that it says that shapes is () and the types are `int32`

We'll see why shapes doesn't have any numbers in it shortly

In [None]:
dataset

Let's try to look at an element inside the dataset

In [None]:
dataset[3]

Hmmm... That's interesting. Why can't we specify an element that we want to view in the dataset?

Well, **datasets are actually iterable objects**, meaning that we can't access a specific element by index, like we can with an array. If we want to view the contents of the dataset, we can either iterate over it using a `for` loop, or turn it into a Python iterator with `iter()` and call `next()` on it

In [None]:
# iterate over the dataset using for loop
for x in dataset:
    print(x)

Looking at the output above, we actually see that the shape of each tensor is (), or a scalar value. So looking back at the description of the dataset object, we see that the `shapes: ()` part is actually describing the shape of each item in the dataset.

Also, if you're not familiar with tensors, they are essentially like numpy arrays or multidimensional arrays.

In [None]:
# view the first element of the dataset
next(iter(dataset))

We can also turn the `iter` object into a `list`, but this is highly inadvisable because we're defeating the purpose of datasets, which is to avoid loading the entire dataset into memory

In [None]:
list(iter(dataset))

Okay, now that we're familiar with dataset objects, lets take a look at some methods that we can manipulate the datasets with.

## Manipulating Datasets

### Dataset.shuffle()

First, we'll look at the shuffle method.
Shuffling is an important part of the preprocessing steps, since we always want to make sure that our dataset is shuffled to not introduce bias in our model.

**TensorFlow docs description:**
```
tf.data.Dataset.shuffle(
    buffer_size, 
    seed=None, 
    reshuffle_each_iteration=None
)
```

- Randomly shuffles the elements of this dataset.

- This dataset fills a buffer with `buffer_size` elements, then randomly samples elements from this buffer, replacing the selected elements with new elements. For perfect shuffling, a buffer size greater than or equal to the full size of the dataset is required.

- *We have to give a buffer size argument, which tells the dataset how many samples to use for the random shuffling*

[Documentation](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#shuffle)



In [None]:
list(iter(dataset.shuffle(10)))

Okay, lets try shuffling with a `buffer_size` of 2

In [None]:
list(iter(dataset.shuffle(2)))

Above, we see that a couple of elements were shuffled, but the rest remained relatively in order. 

It's important to specify a large enough `buffer_size`, but we also need to keep in mind that more memory is allocated with a larger `buffer_size`. So if our dataset is very large, we may not want to allocate the entire dataset to memory for shuffling. A typical recommended `buffer_size` is 1000.

Though, having a `buffer_size` larger than the dataset doesn't allocate more memory. For instance, I can have a `buffer_size` of `200000` and it won't crash our colab instance since we're not actually putting any information in the extra buffer

In [None]:
list(iter(dataset.shuffle(200000)))

### Dataset.filter()

The filter method is similar to Python's built-in `filter` function: it keeps only the elements of the dataset for which a function returns `True`

**TensorFlow docs description:**\
`tf.data.Dataset.filter(predicate)`:

- Filters this dataset according to `predicate`.

[Documentation](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#filter)



In [None]:
# filter out odd numbers from dataset
list(iter(dataset.filter(lambda x: x % 2 == 0)))

### Dataset.map()

The `Dataset.map` method is similar to Python's built-in `map` function: it applies a function to each element in the dataset.

**TensorFlow docs description:**
```
tf.data.Dataset.map(
    map_func,
    num_parallel_calls=None, 
    deterministic=None
)
```

- Maps `map_func` across the elements of this dataset.

- This transformation applies `map_func` to each element of this dataset, and returns a new dataset containing the transformed elements, in the same order as they appeared in the input. 

[Documentation](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#map)

In [None]:
# multiply each element by 2
list(iter(dataset.map(lambda x: x*2)))

We can also enable parallel processing by calling the `map` method with the `num_parallel_calls` argument.

And, instead of specifying a number of parallel processes to the `num_parallel_calls` argument, we can pass `tf.data.AUTOTUNE` to let TensorFlow decide the optimal number of parallel calls

In [None]:
# multiply each element by 2 using parallel processing
list(iter(dataset.map(lambda x: x*2, num_parallel_calls=tf.data.AUTOTUNE)))

### Dataset.take()

The `take` method allows us to construct a new dataset taking a specified number of elements from our original dataset

**TensorFlow docs description:**\
`tf.data.Dataset.take(count)`:
- Creates a `Dataset` with at most `count` elements from this dataset.

[Documentation](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#take)

In [None]:
list(iter(dataset.take(3)))

### Dataset.batch()

The `batch` method essentially groups together our dataset elements into a specified batch size. This is a really useful method for when we train our models, as batching essentially allows us to show a limited number of samples to the model every time the model is updated in the training phase.

**TensorFlow docs description:**
```
tf.data.Dataset.batch(
    batch_size, 
    drop_remainder=False, 
    num_parallel_calls=None, 
    deterministic=None
)
```
- Combines consecutive elements of this dataset into batches.

- The components of the resulting element will have an additional outer dimension, which will be `batch_size` (or `N % batch_size` for the last element if `batch_size` does not divide the number of input elements `N` evenly and `drop_remainder` is `False`).

- If your program depends on the batches having the same outer dimension, you should set the `drop_remainder` argument to `True` to prevent the smaller batch from being produced.

[Documentation](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#batch)

In [None]:
list(iter(dataset.batch(3)))

We can see above that the shapes of the tensors are no longer empty because we turned them from scalars into a vector or array of numbers.

In [None]:
dataset.batch(3)

Also, if we look at the shape of the dataset after we apply the `batch` method, we see that it has a shape of `(None,)`. `None` is essentially a placeholder value that shows that the shape of the tensors is indeterminate, because the last batch may be smaller than the others.

When we set the `drop_remainder` argument to `True`, we see that we get a tensor shape of `batch_size` instead

In [None]:
dataset.batch(3, drop_remainder=True)

### Dataset.unbatch()

The `unbatch` method undoes the batch operation and removes a dimension

[Documentation](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#unbatch)

In [None]:
list(iter(dataset.batch(3)))

In [None]:
list(iter(dataset.batch(3).unbatch()))

## Optimizing pipeline efficiency

### Dataset.cache()

Caching allows us to cache a section of the pipeline to improve the performance of subsequent iterations on the pipeline.

**TensorFlow docs description:**
```
tf.data.Dataset.cache(
    filename=''
)
```
- Caches the elements in this dataset.

- The first time the dataset is iterated over, its elements will be cached either in the specified file or in memory. Subsequent iterations will use the cached data.

[Documentation](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#cache)

In [None]:
list(iter(dataset.map(lambda x: x*2)))

In [None]:
list(iter(dataset.shuffle(10).cache()))

### Dataset.prefetch()

Prefetching essentially allows us to specify the number of samples to prepare in the background while the current sample is being used to train a model. It's a vital component of data pipelines allowing us to improve the overall speed of training ML models

**TensorFlow docs description:**
```
tf.data.Dataset.prefetch(
    buffer_size
)
```
- Creates a Dataset that prefetches elements from this dataset.

- **Most dataset input pipelines should end with a call to prefetch.** This allows later elements to be prepared while the current element is being processed. This often improves latency and throughput, at the cost of using additional memory to store prefetched elements.

[Documentation](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#prefetch)

## Chaining methods (a.k.a data pipelines)

Now that you're more familiar with datasets and their methods, let's look at how we can build pipelines from them.

All of the methods shown above don't actually modify the original dataset; instead, they produce a new modified dataset. So we can use this to chain together methods and create a pipeline with our datasets.

In [None]:
# add 1 to every element, keep values above 3, cache that result, then
# shuffle, batch in pairs and prefetch; the closing parenthesis must come
# after the final .prefetch() so the whole chain is one expression
pipeline = (dataset.map(lambda x: x + 1, num_parallel_calls=tf.data.AUTOTUNE)
                   .filter(lambda x: x > 3)
                   .cache()
                   .shuffle(10)
                   .batch(2, num_parallel_calls=tf.data.AUTOTUNE)
                   .prefetch(tf.data.AUTOTUNE))

list(iter(pipeline))

The methods introduced above are only the tip of the iceberg, if you want to see all the available methods, visit the [Dataset documentation](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#methods_2)

## Working with Structured Datasets

So far, we've only been looking at a dataset containing an array. But what if we have more than one array that we want to put in our dataset? 

Typically, we have the label and the features that we want to put into our datasets, so let's look at an example of how we do that.

In [None]:
features = np.random.rand(4, 3, 3)
labels = np.random.rand(4)

structured_ds = tf.data.Dataset.from_tensor_slices((labels, features))

In [None]:
structured_ds

Let's iterate over this dataset and see what we get

In [None]:
for x in structured_ds:
    print(x)

In [None]:
type(next(iter(structured_ds)))

We see that the elements are tuples, which was the data structure that we passed when we constructed the dataset

If we want to perform operations on this dataset, we need to unpack the tuple

In [None]:
for label, feature in structured_ds:
    print(label.numpy(), feature.numpy())

And lets run a map on this structured dataset

In [None]:
def map_fn(label, feature):
    """Return `label` and `feature`, each increased by one."""
    shifted_label = label + 1
    shifted_feature = feature + 1
    return shifted_label, shifted_feature

list(iter(structured_ds.map(map_fn)))

# Create Datasets from memory/disk
---

## Create Dataset from NumPy Array

In [None]:
file_byte_size = pathlib.Path(HUGE_DATA_FILE).stat().st_size
print(f'{(file_byte_size/10**9):.2f}GB')

It would take a long time to load the data and would use a significant amount of memory. Instead, let's load the data using a memory mapping.

In [None]:
# drop a previously loaded copy before loading again; on a fresh run the
# name does not exist yet, so a plain `del huge_data` would raise NameError
globals().pop('huge_data', None)

In [None]:
# load huge dataset using memmap
huge_data = np.load(HUGE_DATA_FILE, mmap_mode='r')

Next, lets create labels for the dataset

In [None]:
# create labels for the huge dataset
huge_data_labels = np.random.randint(0, 10, 3000)

Now lets create a dataset from the array

In [None]:
huge_dataset = tf.data.Dataset.from_tensor_slices((huge_data_labels, huge_data))

In [None]:
huge_dataset

In [None]:
next(iter(huge_dataset))

In [None]:
for label, data in huge_dataset.take(1):
  print(label)
  print(data)

## Create Dataset from structured image directory

### Download the flowers directory from TensorFlow Hub

[TF Datasets Catalog](https://www.tensorflow.org/datasets/catalog/overview)

**TensorFlow docs description:**
```
tf.keras.utils.get_file(
    fname, 
    origin, 
    untar=False,
    md5_hash=None, 
    file_hash=None,
    cache_subdir='datasets', 
    hash_algorithm='auto',
    extract=False, 
    archive_format='auto', 
    cache_dir=None
)
```

- Downloads a file from a URL if it is not already in the cache.

- By default the file at the url `origin` is downloaded to the `cache_dir ~/.keras`, placed in the cache_subdir `datasets`, and given the filename `fname`. The final location of a file `example.txt` would therefore be `~/.keras/datasets/example.txt`.

[Documentation](https://www.tensorflow.org/api_docs/python/tf/keras/utils/get_file)

In [None]:
# fetch the flower photos archive; get_file only downloads when the file is
# not already in its cache
# NOTE(review): untar=True presumably unpacks the .tgz and makes the returned
# path point at the extracted folder — confirm for the installed version
flowers_dir = tf.keras.utils.get_file(
    'flower_photos',
    'https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz',
    untar=True)

In [None]:
flowers_dir

Let's move the downloaded directory into our workspace so we can examine it

In [None]:
# only move the download when the destination is absent: on a re-run
# shutil.move() would otherwise try to place the folder inside the already
# existing /content/flower_photos (or fail moving a directory into itself)
if not pathlib.Path('/content/flower_photos').exists():
    shutil.move(flowers_dir, '/content/flower_photos')

In [None]:
flowers_dir = '/content/flower_photos'

### Load Dataset from an image directory

`tf.keras.preprocessing.image_dataset_from_directory()`

**TensorFlow docs description:**
```
tf.keras.preprocessing.image_dataset_from_directory(
    directory, 
    labels='inferred', 
    label_mode='int',
    class_names=None, 
    color_mode='rgb', 
    batch_size=32, 
    image_size=(256, 256), 
    shuffle=True, 
    seed=None,
    validation_split=None, 
    subset=None,
    interpolation='bilinear', 
    follow_links=False, 
    smart_resize=False
)
```

- Generates a `tf.data.Dataset` from image files in a directory.

- If your directory structure is:
```
main_directory/
...class_a/
......a_image_1.jpg
......a_image_2.jpg
...class_b/
......b_image_1.jpg
......b_image_2.jpg
```

- Then calling `image_dataset_from_directory(main_directory, labels='inferred')` will return a `Dataset` that yields batches of images from the subdirectories `class_a` and `class_b`, together with labels `0` and `1` (`0` corresponding to `class_a` and `1` corresponding to `class_b`).

[Documentation](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/image_dataset_from_directory)

In [None]:
flowers_dataset = tf.keras.preprocessing.image_dataset_from_directory(flowers_dir)

In [None]:
next(iter(flowers_dataset))

In [None]:
next(iter(flowers_dataset.unbatch()))

## Create Dataset from CSV file

### Create a Dataset from a CSV loaded into a DataFrame

In [None]:
housing_csv_file = '/content/sample_data/california_housing_train.csv'
df = pd.read_csv(housing_csv_file)

In [None]:
df.head()

In [None]:
housing_labels = df.loc[:, 'median_house_value']
housing_features = df.drop(columns='median_house_value')

In [None]:
housing_labels.head()

In [None]:
housing_features.head()

In [None]:
housing_dataset = tf.data.Dataset.from_tensor_slices((housing_labels.values, housing_features.values))

### Create a Dataset directly from the CSV file

**TensorFlow docs description:**
```
tf.data.experimental.make_csv_dataset(
    file_pattern, 
    batch_size, 
    column_names=None, 
    column_defaults=None,
    label_name=None, 
    select_columns=None, 
    field_delim=',',
    use_quote_delim=True, 
    na_value='', 
    header=True, 
    num_epochs=None,
    shuffle=True, 
    shuffle_buffer_size=10000, 
    shuffle_seed=None,
    prefetch_buffer_size=None, 
    num_parallel_reads=None, 
    sloppy=False,
    num_rows_for_inference=100, 
    compression_type=None, 
    ignore_errors=False
)
```

- Reads CSV files into a dataset, where each element of the dataset is a (features, labels) tuple that corresponds to a batch of CSV rows. The features dictionary maps feature column names to `Tensor`s containing the corresponding feature data, and labels is a `Tensor` containing the batch's label data.

[Documentation](https://www.tensorflow.org/api_docs/python/tf/data/experimental/make_csv_dataset)

In [None]:
housing_dataset = tf.data.experimental.make_csv_dataset(housing_csv_file, label_name='median_house_value', batch_size=1)

In [None]:
next(iter(housing_dataset))

In [None]:
next(iter(housing_dataset.unbatch()))

# Full workflow example
---



## Load Dataset from TensorFlow Datasets

**Load the `tf_flowers` dataset from the [TensorFlow Dataset catalog](https://www.tensorflow.org/datasets/catalog/overview)**

In [None]:
# load tf_flowers, using the first 80% of its 'train' split for training and
# the remaining 20% for validation; `info` is the dataset metadata that is
# returned because with_info=True
# NOTE(review): as_supervised=True presumably makes every element an
# (image, label) tuple — confirm against the TFDS docs
(train_ds, val_ds), info = tfds.load(
    'tf_flowers',
    split=['train[:80%]', 'train[80%:]'],
    as_supervised=True,
    with_info=True
)

In [None]:
tfds.show_examples(train_ds.take(3), info)

In [None]:
info.features['label'].int2str(2)

## Build Pipeline

Create a function for resizing and rescaling the image

In [None]:
def resize_and_rescale(image, label, size):
    """Resize `image` to `size` and divide its pixel values by 255.

    The image is cast to float32 before resizing; `label` is returned as is.
    """
    as_float = tf.cast(image, tf.float32)
    resized = tf.image.resize(as_float, size)
    return resized / 255.0, label

Visualize the function results

In [None]:
img, label = next(iter(train_ds))

resized_img, _ = resize_and_rescale(img, label, (256, 256))

visualize_augmentation(img, resized_img)

Create a function for augmenting the image

In [None]:
def img_augmentations(image, label):
    """Apply random flips and colour jitter to `image`; `label` passes through.

    Args:
        image: Image tensor. NOTE(review): assumed to be a float image scaled
            to [0, 1], as produced by `resize_and_rescale` — confirm for other
            callers.
        label: Returned unchanged.

    Returns:
        The augmented image and the original label.
    """
    image = tf.image.random_flip_left_right(image)
    image = tf.image.random_flip_up_down(image)
    image = tf.image.random_saturation(image, 1, 2)
    image = tf.image.random_brightness(image, 0.1)
    image = tf.image.random_hue(image, 0.1)

    # the colour adjustments can push float pixels outside [0, 1]; clip them
    # back so later steps and plotting see a valid range (integer images are
    # left alone because their valid range is not [0, 1])
    if image.dtype.is_floating:
        image = tf.clip_by_value(image, 0.0, 1.0)
    return image, label

Create our pipeline

In [None]:
IMG_SIZE = (256, 256)
pipeline = (train_ds
                .map(lambda img, label: resize_and_rescale(img, label, IMG_SIZE), num_parallel_calls=tf.data.AUTOTUNE)
                .map(img_augmentations, num_parallel_calls=tf.data.AUTOTUNE)
                )

Visualize the pipeline results

In [None]:
def show(image, label):
    """Display `image` and use the value of `label` as the plot title."""
    caption = str(label.numpy())
    plt.imshow(image)
    plt.title(caption)

In [None]:
show(*next(iter(pipeline)))

## Define the Model

We will be using the [MobileNet V3 Small feature vector](https://tfhub.dev/google/imagenet/mobilenet_v3_small_100_224/feature_vector/5) model (input image size must be 224x224)

In [None]:
model_handle = 'https://tfhub.dev/google/imagenet/mobilenet_v3_small_100_224/feature_vector/5'

In [None]:
IMG_SIZE = (224, 224)
input_shape = IMG_SIZE + (3,)

In [None]:
num_classes = info.features['label'].num_classes

In [None]:
model = tf.keras.Sequential([
    # fixes the expected input to IMG_SIZE images with 3 channels
    tf.keras.layers.InputLayer(input_shape=input_shape),
    # backbone loaded from `model_handle`
    # NOTE(review): presumably frozen, since hub.KerasLayer is not given
    # trainable=True — confirm
    hub.KerasLayer(model_handle),
    # classification head: one softmax output per class of the dataset
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

In [None]:
model.build((None,)+input_shape)
model.summary()

In [None]:
model.compile(
  optimizer=tf.keras.optimizers.Adam(learning_rate=0.005), 
  loss=tf.keras.losses.SparseCategoricalCrossentropy(),
  metrics=['accuracy'])

In [None]:
BATCH_SIZE = 32
train_ds_size = train_ds.cardinality().numpy()
val_ds_size = val_ds.cardinality().numpy()


def _to_model_input(img, label):
    # resize to the model's input resolution and rescale the pixel values
    return resize_and_rescale(img, label, IMG_SIZE)


# training data: resize -> cache -> augment -> shuffle -> batch -> prefetch
# the cache sits before the augmentation step, so only the resized images are
# cached
train_resized = train_ds.map(_to_model_input,
                             num_parallel_calls=tf.data.AUTOTUNE).cache()
train_augmented = train_resized.map(img_augmentations,
                                    num_parallel_calls=tf.data.AUTOTUNE)
train_processed = (train_augmented.shuffle(train_ds_size)
                                  .batch(BATCH_SIZE)
                                  .prefetch(tf.data.AUTOTUNE))

# validation data: same steps without the augmentation
val_resized = val_ds.map(_to_model_input,
                         num_parallel_calls=tf.data.AUTOTUNE).cache()
val_processed = (val_resized.shuffle(val_ds_size)
                            .batch(BATCH_SIZE)
                            .prefetch(tf.data.AUTOTUNE))

## Train Model

In [None]:
hist = model.fit(
    train_processed,
    validation_data=val_processed,
    epochs=5,
    steps_per_epoch=10,
    validation_steps=10
)

## Create Predictions

In [None]:
test_img, test_label = next(iter(val_processed.unbatch().take(1)))

In [None]:
prediction = model.predict(np.expand_dims(test_img, axis=0))
prediction

In [None]:
plt.imshow(test_img)
plt.title(get_class_name(info, prediction.argmax()))

Make a function to compare the predicted and actual labels for one batch

In [None]:
def show_predictions(processed_val_dataset, info):
    """Plot every image of a batched dataset with its predicted and actual label.

    Args:
        processed_val_dataset: Batched dataset yielding (images, labels); the
            images must already be preprocessed for the model.
        info: Dataset info used to turn class indices into class names.

    Note:
        Inference uses the notebook-level `model`.
    """
    for image_batch, label_batch in processed_val_dataset:
        # predict once per batch instead of once per image
        predicted_classes = model.predict(image_batch).argmax(axis=1)

        for test_img, predicted_class, test_label in zip(
                image_batch, predicted_classes, label_batch.numpy()):
            plt.figure()
            plt.axis('off')
            plt.imshow(test_img)
            prediction_label = get_class_name(info, predicted_class)
            actual_label = get_class_name(info, test_label)
            plt.title(f'Predicted: {prediction_label}, Actual: {actual_label}')

In [None]:
show_predictions(val_processed.take(1), info)