First, we need to import the required libraries and download the dataset we will be working with.

In [0]:
# Import the standard library, then TensorFlow and TensorFlow Datasets
import os
import time

import tensorflow as tf
import tensorflow_datasets as tfds

tfds.disable_progress_bar()

In [0]:
# Show which TensorFlow release this notebook is running against
print(tf.__version__)

2.2.0-rc3


In [0]:
# Downloading the dataset: 'beans' is requested in supervised form together
# with its metadata object
datasets, info = tfds.load(name='beans', as_supervised=True, with_info=True)

# Keep handles on the training and test splits for the rest of the notebook
beans_train = datasets['train']
beans_test = datasets['test']

[1mDownloading and preparing dataset beans/0.1.0 (download: Unknown size, generated: 171.63 MiB, total: 171.63 MiB) to /root/tensorflow_datasets/beans/0.1.0...[0m




Shuffling and writing examples to /root/tensorflow_datasets/beans/0.1.0.incompleteICDDYX/beans-train.tfrecord
Shuffling and writing examples to /root/tensorflow_datasets/beans/0.1.0.incompleteICDDYX/beans-validation.tfrecord
Shuffling and writing examples to /root/tensorflow_datasets/beans/0.1.0.incompleteICDDYX/beans-test.tfrecord
[1mDataset beans downloaded and prepared to /root/tensorflow_datasets/beans/0.1.0. Subsequent calls will reuse this data.[0m


### Now, some essential steps are required before we start training. The hyperparameters need to be defined and, more importantly, we need to initialize the distribution strategy.

In [0]:
# Initializing the distributed learning algorithm.
# The resulting strategy object is reused further down for its replica count
# (to size the global batch) and for its scope() when the model is built.
strategy = tf.distribute.MirroredStrategy()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


In [0]:
# Just checking the number of devices available for distributing
# (the same num_replicas_in_sync value is used later to compute BATCH_SIZE)
print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

Number of devices: 1


In [0]:
# Defining some hyperparameters

# Size of the shuffle buffer used when the training pipeline is assembled
BUFFER_SIZE = 10000

# Every replica handles BATCH_SIZE_PER_REPLICA examples per step, so the
# global batch is that amount multiplied by the number of replicas in sync
BATCH_SIZE_PER_REPLICA = 32
BATCH_SIZE = strategy.num_replicas_in_sync * BATCH_SIZE_PER_REPLICA

# Example counts as reported by the dataset metadata
num_train_examples = info.splits['train'].num_examples
num_test_examples = info.splits['test'].num_examples

In [0]:
# Normalizing the pixel values
def scale(image, label):
  """Cast an image to float32 and divide it by 255.

  Args:
    image: Image to convert; handed to `tf.cast`.
    label: Label that accompanies the image; returned untouched.

  Returns:
    A `(scaled_image, label)` tuple.
  """
  scaled_image = tf.cast(image, tf.float32) / 255
  return scaled_image, label

In [0]:
# Batching the dataset and keeping a memory buffer for better performance
train_dataset = (
    beans_train
    .map(scale)             # normalise every image
    .cache()                # cache the scaled examples
    .shuffle(BUFFER_SIZE)   # shuffle with a BUFFER_SIZE-element buffer
    .batch(BATCH_SIZE)      # group into global batches
)

# The evaluation pipeline only scales and batches
eval_dataset = (
    beans_test
    .map(scale)
    .batch(BATCH_SIZE)
)

In [0]:
# Building our model
# Both construction and compilation happen inside strategy.scope()
with strategy.scope():
  model = tf.keras.Sequential([
      # Three convolution + max-pooling stages with 16, 32 and 64 filters
      tf.keras.layers.Conv2D(filters=16, kernel_size=3, activation='relu',
                             input_shape=(500, 500, 3)),
      tf.keras.layers.MaxPooling2D(),
      tf.keras.layers.Conv2D(filters=32, kernel_size=3, activation='relu'),
      tf.keras.layers.MaxPooling2D(),
      tf.keras.layers.Conv2D(filters=64, kernel_size=3, activation='relu'),
      tf.keras.layers.MaxPooling2D(),
      # Dense head: the last layer has 3 units and no activation, so it
      # emits raw logits (matching from_logits=True in the loss below)
      tf.keras.layers.Flatten(),
      tf.keras.layers.Dense(units=64, activation='relu'),
      tf.keras.layers.Dense(units=3)
  ])

  model.compile(
      optimizer=tf.keras.optimizers.Adam(),
      loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
      metrics=['accuracy'],
  )

In [0]:
# Define the checkpoint directory to store the checkpoints
# (a path relative to the notebook's working directory)

checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files; the literal "{epoch}" placeholder is kept in
# the path and handed to ModelCheckpoint through its `filepath` argument
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

In [0]:
# Function for decaying the learning rate.

def decay(epoch):
  """Return the learning rate to use for the given epoch index.

  Args:
    epoch: Epoch index. Values below 3 map to 1e-3, values from 3 up to
      (but not including) 7 map to 1e-4, and anything else maps to 1e-5.

  Returns:
    The learning rate as a float.
  """
  if epoch < 3:
    return 1e-3
  if epoch < 7:
    return 1e-4
  return 1e-5

In [0]:
# Callback for printing the LR at the end of each epoch.
class PrintLR(tf.keras.callbacks.Callback):
  """Keras callback that prints the optimizer's learning rate after each epoch.

  The rate is read from `self.model`, the model Keras attaches to the
  callback when training starts, rather than from a notebook-level `model`
  variable. The callback therefore reports on whichever model it is passed
  to and does not depend on a global name being defined.
  """

  def on_epoch_end(self, epoch, logs=None):
    """Print the learning rate currently held by the model's optimizer.

    Args:
      epoch: Epoch index passed in by Keras; shown as `epoch + 1` in the
        printed message.
      logs: Metrics dict supplied by Keras; not used here.
    """
    print('\nLearning rate for epoch {} is {}'.format(epoch + 1,
                                                      self.model.optimizer.lr.numpy()))


In [0]:
# Defining the callbacks
callbacks = [
    # Write TensorBoard logs under ./logs
    tf.keras.callbacks.TensorBoard(log_dir='./logs'),
    # Save weights only, using the checkpoint path template defined earlier
    tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_prefix,
        save_weights_only=True,
    ),
    # Set the learning rate from decay()
    tf.keras.callbacks.LearningRateScheduler(schedule=decay),
    # Print the learning rate at the end of each epoch
    PrintLR(),
]

In [0]:
# Training the model
# `time` is imported with the other modules in the notebook's first cell.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments during training.
start = time.perf_counter()
model.fit(train_dataset, epochs=10, callbacks=callbacks)
end = time.perf_counter()
print("Time elapsed: {}".format(end-start))

Epoch 1/10
Learning rate for epoch 1 is 0.0010000000474974513
Epoch 2/10
Learning rate for epoch 2 is 0.0010000000474974513
Epoch 3/10
Learning rate for epoch 3 is 0.0010000000474974513
Epoch 4/10
Learning rate for epoch 4 is 9.999999747378752e-05
Epoch 5/10
Learning rate for epoch 5 is 9.999999747378752e-05
Epoch 6/10
Learning rate for epoch 6 is 9.999999747378752e-05
Epoch 7/10
Learning rate for epoch 7 is 9.999999747378752e-05
Epoch 8/10
Learning rate for epoch 8 is 9.999999747378752e-06
Epoch 9/10
Learning rate for epoch 9 is 9.999999747378752e-06
Epoch 10/10
Learning rate for epoch 10 is 9.999999747378752e-06
Time elapsed: 40.08736324310303
