In [1]:
try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False

if IN_COLAB:
  !pip install wget
  #!pip install split-folders
  import wget
  import zipfile
  #import split-folders

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
import keras
from keras.layers import Dense, Flatten, Normalization, Dropout, Conv2D, MaxPooling2D, RandomFlip, RandomRotation, RandomZoom, BatchNormalization, Activation, InputLayer
from keras.models import Sequential
from keras.losses import SparseCategoricalCrossentropy, CategoricalCrossentropy
from keras.callbacks import EarlyStopping
from keras.utils import np_utils
from keras import utils
import os
import matplotlib.pyplot as plt

2023-03-31 09:00:30.212515: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Data Pipelines and Dealing with Larger Data Efficiently

When dealing with larger amounts of data in our neural networks we need some tools to manage the data pipeline. We have started to look at a couple of these in the image generators and the datasets from directories. We can build on the dataset specifically to create data pipelines that both apply any data preparation steps and load our data efficiently. 

<b>Note:</b> in the project stuff I suggested using either generators or datasets for performance reasons, as I read more I found that the speed difference between the two has actually become really large, with datasets being much faster. We'll focus only on those here. Some articles said it is up to 30 times faster, which is pretty massive. Even if the difference isn't near that much, it is substantial, so the datasets will be more efficient for larger data. 

In [None]:
# Download and Unzip Data
# Directory holding the extracted images; the dataset loaders further down read from it.
# NOTE(review): "/content" looks like the Colab working directory -- confirm before running elsewhere.
ROOT_DIR = "/content/simpsons_dataset"

def bar_custom(current, total, width=80):
    """Print a one-line download progress message.

    Has the ``(current, total, width)`` shape expected of the ``bar`` callback
    passed to ``wget.download``.

    Args:
        current: Number of bytes received so far.
        total: Expected size in bytes. A missing or non-positive value is
            treated as "size unknown".
        width: Unused; kept so the callback signature stays compatible.
    """
    if total and total > 0:
        print("Downloading: %d%% [%d / %d] bytes" % (current / total * 100, current, total))
    else:
        # Without a usable total a percentage would divide by zero, so report
        # only the byte count.
        print("Downloading: %d bytes" % current)

# Fetch the dataset archive once, then unpack it next to ROOT_DIR.
zip_name = "simpsons.zip"

url = "https://jrssbcrsefilesnait.blob.core.windows.net/3950data1/simpsons.zip"

# Skip the download when the archive is already on disk, so re-runs are cheap.
if not os.path.exists(zip_name):
    wget.download(url, zip_name, bar=bar_custom)

# Guard and extraction target both derive from ROOT_DIR, so they cannot drift
# apart if the working directory is not ROOT_DIR's parent.
# NOTE(review): assumes the archive's top-level folder is "simpsons_dataset" -- confirm.
if not os.path.exists(ROOT_DIR):
    with zipfile.ZipFile(zip_name, 'r') as zip_ref:
        zip_ref.extractall(os.path.dirname(ROOT_DIR))

In [None]:
# Tunable settings shared by the data pipeline and the training run.
EPOCHS = 10            # passes over the training data in model.fit
BATCH_SIZE = 16        # batch size requested from the image dataset loader
VAL_SPLIT = 0.2        # fraction of the data held out for validation
BUFFER = 10            # buffer size handed to Dataset.shuffle
IMG_SIZE = (224, 224)  # NOTE(review): not referenced by any cell shown here -- confirm intended use


## TF.Data Pipelines

The data pipeline setup in tensorflow is a little different than what we are used to in sklearn. Here, the big thing that our pipeline is going to do for us is to offer greater efficiency. Neural networks are generally used with very large datasets that are too large to fit in memory, like a dataframe that we are used to using. The tensorflow pipeline works to pull the data off of disk and into memory in a way that efficiently uses resources. The pipeline that we create will do a better job of ensuring that data is being prepared while the model is training, and is prepped and ready to go without delay, which can really matter with large datasets. The way that it does this is to do a better job of batching and processing the data, to ensure that we waste as little time idle as possible. Recall that when training models we normally use the GPU (graphics processor) because it is way faster, but most of the work outside of the fit method still relies on the CPU. The data pipelines work to make sure that the CPU can process and prepare data in parallel with the GPU doing the fitting, so as soon as one batch is finished processing, another is ready to go. Less delay means more of the run time spent processing rather than waiting. 

<b>From:</b>
![Data Pipelines](images/tf_data_idle.png "Data Pipelines")
<b>To:</b>
![Data Pipelines](images/tf_data_idle2.png "Data Pipelines")

We have already used these datasets to load data, now we will look at some things that we can do to make the data pipeline more efficient when the data gets larger. 

### Construct the Datasets

We start by making the dataset objects - here we will load in image data from a directory structure, there are other methods to create datasets from other stuff like arrays. 

In [None]:
# Training subset of the images under ROOT_DIR. A fixed seed is passed so the
# shuffle behind the train/validation split is repeatable.
train_ds = tf.keras.utils.image_dataset_from_directory(
    ROOT_DIR,
    validation_split=VAL_SPLIT,
    subset="training",
    seed=123,
    batch_size=BATCH_SIZE,
)

In [None]:
# Take the class names from the dataset itself rather than from os.listdir():
# a raw directory listing comes back in arbitrary order and also counts stray
# files, so labels[i] would not reliably correspond to label index i.
# Run this before train_ds is cached/shuffled/prefetched -- the datasets those
# calls return no longer carry the class_names attribute.
labels = train_ds.class_names
NUM_CLASSES = len(labels)
print(NUM_CLASSES)
print(labels)

### Setup Preprocessing Pipeline

Now that we have the basics of the dataset created, we can work on the pipeline to deliver that data in the form we want it. There are several things that we can do to our dataset, some that we'll focus on here are:
<ul>
<li> Cache - preload some data to speed the process. 
<li> Map - apply a function to all the data, useful to apply transformations like normalization. 
<li> Batch - creates batches of data. 
<li> Prefetch - retrieves data early to eliminate delays. 
<li> Shuffle - pulls data randomly to create batches. 
</ul>

### Autotune

One lesser known fact about neural networks is that T-Pain has done a large amount of research into efficiently loading data, even developing a tool predictably named autotune. In addition to making the voice of T-Pain resemble that of an angel, autotune works to make our data pipeline more efficient by monitoring some metrics on performance as the dataset works, and automatically making adjustments to improve things. The ramp-up process for the autotune to learn can impose some performance penalties on the early steps of training as the algorithm is analyzing the data, but once it learns an optimal set of values the efficiency will improve. This makes the autotune tool good for larger and longer training times, as the initial tuning time will become negligible as training progresses. 

There are a few things that we can apply the autotune tool to in our pipeline. 

### Pipeline Syntax

The creation of the entire pipeline is simple from a coding perspective, we just chain all of the functions that we are going to use onto the dataset. 

One key thing is that the things in the pipeline are done in order, sometimes this can matter. There are lots of potential combinations of actions that could be put together into a pipeline, so we can't make universal rules on the order, there are a few guidelines that we can use though:
<ul>
<li> Prefetch should be last. 
</ul>

In [None]:
# Alias for tf.data's autotune setting; passed as the prefetch buffer size further down.
TPAIN = tf.data.AUTOTUNE

#### Map and (Not) Batch

Mapping works here like it works anywhere else, we can apply a function to the entire dataset. For dataprep, this is useful, as if we need to do a transformation we can build it into the pipeline here. 

Batches work like we are used to, but we won't use them now. The image_dataset_from_directory batches the data itself, so if we batch it again here we end up with batches of batches, which makes the data we give to the model 5D instead of 4 - (batch_size1), (batch_size2), (height), (width), (color_depth). If we were loading data from something that didn't batch the data already, such as from a dataframe, we could enable this here. This is an error that is very annoying, take my word for it. 

In [None]:
#train_ds = train_ds.map()
#train_ds = train_ds.batch(4)

#### Cache and Shuffle

Cache just pulls data into memory early, so there is less delay to load it. In addition, you can optionally specify a file location for the cache location - while this probably won't help us, in a server environment you may have a RAM Disk, which is exactly what it sounds like, so you could potentially cache from a regular disk to the super fast ram disk. Shuffle just randomizes the order of the data, the buffer size controls how many items are shuffled at once; here we set that size explicitly with the BUFFER constant defined earlier. 

In [None]:
# Cache the loaded elements, then shuffle them through a buffer of BUFFER
# elements. The order matters: cache comes first, shuffle second.
train_ds = train_ds.cache().shuffle(buffer_size=BUFFER)

#### Prefetch

Prefetch is what forces the system to do its data preparation work in parallel with the modelling work, so both the CPU and GPU can stay busy. We allow the autotune to control how much is prepared in advance. Note that the number of parallel calls (num_parallel_calls), which controls the number of processes to run at one time, similar to n_jobs in sklearn, is an argument of map() rather than of prefetch(); it can also be handed to the autotune when a map step is used. 

In [None]:
# Dataset.prefetch() accepts only a buffer size; num_parallel_calls belongs to
# map()/interleave() and raises a TypeError if passed here.
train_ds = train_ds.prefetch(buffer_size=TPAIN)

### Finishing Pipelines and Handling Datasets

We can finish up by mirroring the steps above on our validation dataset. 

### Managing Resources

One thing that we can do with these datasets is set the parameters to limit resource use. Resource usage can be monitored on colab by clicking the RAM/CPU icon towards the top right, on a computer you could use the activity manager, task manager, or any other program that monitors RAM usage. 
<ul>
<li> GPU Ram is exceeded - almost certainly can be addressed by the batch size. 
<li> System RAM is exceeded - this could be many things, the most likely cause is that too much data was loaded in for some operation that causes the RAM usage to spike.
    <ul>
    <li> Shuffle and prefetch options both load more data at a time the higher the limit is, we can lower them to limit RAM usage. 
    </ul>
</ul>

Unfortunately the autotune won't assure us that memory use limits don't get hit. 

#### Side Note on Memory Usage

This is one of the (likely) few times we really need to monitor RAM usage. In general, your computer is capable of swapping - moving stuff in and out of RAM and back to disk on the fly to ensure everything works. Here, swapping data to disk is so much slower that it is essentially impossible, so if we put 16.1GB in 16GB of RAM, everything dies. The most likely culprits are things that try to do something to all the data at once, it is likely that with larger datasets such actions aren't even possible. We can use the .map() function in the dataset to apply things to the dataset in a managed way. 

In [None]:
# Validation subset. The seed must be supplied and must equal the one used for
# the training subset: the loader rejects validation_split with shuffling and
# no seed, and a matching seed is what keeps the two subsets from overlapping.
val_ds = tf.keras.utils.image_dataset_from_directory(
    ROOT_DIR,
    validation_split=VAL_SPLIT,
    subset="validation",
    seed=123,
    batch_size=BATCH_SIZE,
)
val_ds = val_ds.cache()
#val_ds = val_ds.batch(4)  # left disabled: the loader already batches the data
val_ds = val_ds.shuffle(buffer_size=BUFFER)
val_ds = val_ds.prefetch(buffer_size=TPAIN)

### Pulling Data

Getting some example data out of our dataset is a little different because they aren't a basic data structure like a dataframe or an array, so we can't just say "give me item 7". We need to approach getting data from the dataset similarly to how it provides data to a fit method, we ask for some data and the dataset produces one batch for us. We can do this with the "as_numpy_iterator" method, which returns an iterator. The iterator, well, iterates over the dataset, so to get some more data we can ask it for the next() piece of data. 

In [None]:
# Grab Some Data
# Pull one element out of the pipeline as NumPy arrays; index 0 holds the
# images and index 1 the labels.
some_data = train_ds.as_numpy_iterator()
sample_data = next(some_data)

# Display the first image (cast to integers for imshow) and print its label.
plt.imshow(sample_data[0][0].astype("int"))
print(sample_data[1][0])

## Modelling

Once the data pipelines are setup, using them is the same as always. Our datasets will handle all the things that we setup above all on their own, and will provide data to the fit method as it requires it. 

Since this training process may take a while, we will also write a checkpoint callback to save the weights every time we improve the model. The wonky stuff in the file name just assigns each set of weights saved with a label of their epoch and accuracy, a common way to log multiple sets of weights. 

In [None]:
# Save weights only when validation accuracy improves, and tag each file with
# the epoch and the monitored metric. With save_best_only=False the monitor and
# mode settings are ignored and a file is written every epoch regardless.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    "logs/weights.{epoch:02d}-{val_accuracy:.2f}.hdf5",
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
    save_freq='epoch',
    verbose=2,
)

In [None]:
def build_cnn(num_classes):
    """Assemble the image classifier used in this notebook.

    Layout: a rescaling layer, four stages of two 3x3 convolutions followed by
    max pooling (64, 128, 256, 512 filters), then a dense head.

    Args:
        num_classes: Width of the final Dense layer.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    layers = tf.keras.layers
    stack = [layers.Rescaling(1./255)]  # multiply inputs by 1/255
    for filters in (64, 128, 256, 512):
        stack.append(layers.Conv2D(filters, (3,3), activation='relu'))
        stack.append(layers.Conv2D(filters, (3,3), activation='relu'))
        stack.append(layers.MaxPooling2D())
    stack.extend([
        layers.Flatten(),
        layers.Dense(512, activation='relu'),
        layers.Dropout(.3),
        layers.Dense(128, activation='relu'),
        # No activation on the output: the loss in the compile cell is set up
        # with from_logits=True.
        layers.Dense(num_classes),
    ])
    return tf.keras.Sequential(stack)


model = build_cnn(NUM_CLASSES)


In [None]:
# The model's last layer has no activation, so the loss is told to expect logits.
logit_loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=logit_loss, metrics=['accuracy'])

# Train from the dataset pipelines; the checkpoint callback is invoked by fit.
model.fit(
    train_ds,
    epochs=EPOCHS,
    validation_data=val_ds,
    callbacks=[checkpoint_callback],
)