In [None]:
import os
# Limit CUDA to the physical GPU with index 4. This is set before TensorFlow is
# imported so that the restriction is in place when CUDA devices are discovered.
# NOTE(review): a single index exposes only one GPU, while a later cell's comment
# expects four replicas - confirm the intended device list (e.g. "0,1,2,3").
os.environ["CUDA_VISIBLE_DEVICES"]="4"

In [None]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
# Version Information
# tensorflow 2.2.0 , Cudnn7.6.5 and Cuda 10.1 , python 3.8

In [None]:
# List the physical devices (CPUs/GPUs) that TensorFlow can see on this machine
tf.config.experimental.list_physical_devices()

In [None]:
# Show the installed TensorFlow version (the notebook was written against 2.2.0)
tf.__version__

In [None]:
# Check whether this TensorFlow build was compiled with CUDA (GPU) support
tf.test.is_built_with_cuda()

In [None]:
"""
Load the dataset
CIFAR-10 contains 60000 small colour images (50000 for training and 10000 for testing),
each belonging to one of the 10 classes listed below
"""
(X_train, y_train), (X_test,y_test) = tf.keras.datasets.cifar10.load_data()

In [None]:
# Shape of the training images: (num_images, height, width, channels)
X_train.shape

In [None]:
# Shape of the training labels (one integer class id per image, stored as a 1-element row)
y_train.shape

In [None]:
# Human-readable label names; a label's integer class id is its index in this list.
classes = [
    "airplane",
    "automobile",
    "bird",
    "cat",
    "deer",
    "dog",
    "frog",
    "horse",
    "ship",
    "truck",
]

In [None]:
# Class name of the 4th training image; each label is a 1-element row, hence the trailing [0]
classes[y_train[3][0]]

In [None]:
# Peek at the first three raw (integer) training labels
y_train[:3]

In [None]:
# Shape of the test labels
y_test.shape

In [None]:
# Training image shape once more, as a reminder before preprocessing
X_train.shape

In [None]:
#Preprocessing: Scale images

# Scale pixel values from [0, 255] to [0, 1].
# Cast to float32 first: dividing the integer image arrays (uint8 per the Keras
# CIFAR-10 docs) by 255 would promote them to float64, doubling memory use and
# making Keras (float32 by default) cast every batch back down.
X_train_scaled = X_train.astype("float32") / 255.0
X_test_scaled = X_test.astype("float32") / 255.0

In [None]:
# Convert the integer class ids into one-hot vectors of length 10 (float32),
# the label format expected by the categorical cross-entropy loss.
y_train_categorical = keras.utils.to_categorical(y_train, num_classes=10, dtype='float32')
y_test_categorical = keras.utils.to_categorical(y_test, num_classes=10, dtype='float32')

In [None]:
# First five labels in their original integer form ...
y_train[0:5]

In [None]:
# ... and the same five labels after one-hot encoding, for comparison
y_train_categorical[0:5]

In [None]:
#Model building and training

def get_model():
    """Build and compile a fully connected classifier for 32x32x3 images.

    The network flattens each image, passes it through two ReLU hidden layers
    (3000 and 1000 units) and ends in a 10-unit softmax layer, one unit per class.

    Returns:
        A compiled `keras.Sequential` model using the SGD optimizer,
        categorical cross-entropy loss and the accuracy metric.
    """
    model = keras.Sequential([
        keras.layers.Flatten(input_shape=(32, 32, 3)),
        keras.layers.Dense(3000, activation='relu'),
        keras.layers.Dense(1000, activation='relu'),
        # Softmax rather than sigmoid: the labels are one-hot and mutually
        # exclusive, and categorical cross-entropy expects the outputs to form
        # a probability distribution over the classes.
        keras.layers.Dense(10, activation='softmax'),
    ])

    model.compile(optimizer='SGD',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

In [None]:
# Wrap the scaled images and one-hot labels as tf.data datasets,
# one (image, label) element per sample.
train_tf_dataset, test_tf_dataset = (
    tf.data.Dataset.from_tensor_slices(pair)
    for pair in (
        (X_train_scaled, y_train_categorical),
        (X_test_scaled, y_test_categorical),
    )
)

In [None]:
"""
Initialize a MirroredStrategy, TensorFlow's data-parallelism strategy for multi-GPU training.

The batch is split across the available GPUs.
"""
strategy = tf.distribute.MirroredStrategy()

In [None]:
# Number of model replicas kept in sync; each batch is split into this many parts.
# The author reports 4 here (four GPUs).
# NOTE(review): CUDA_VISIBLE_DEVICES="4" in the first cell exposes a single device,
# so a fresh run may report 1 instead - confirm.
strategy.num_replicas_in_sync

In [None]:
"""
Each replica processes 250 images per step, so with four replicas the total batch size is 1000.
AUTOTUNE lets TensorFlow decide at runtime how many batches to prefetch for the next iteration.
"""
BATCH_SIZE_PER_REPLICA = 250
BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync


# Use tf.data.experimental.AUTOTUNE: it is available in TensorFlow 2.2 (the version
# this notebook targets) and in later releases, whereas the shorter tf.data.AUTOTUNE
# alias was only added in TensorFlow 2.4.
train_dataset = train_tf_dataset.batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)
test_dataset = test_tf_dataset.batch(BATCH_SIZE)

In [None]:
#Measure training time on a GPU
%%timeit -n1 -r1 
with strategy.scope():
    gpu_model = get_model()
    gpu_model.fit(train_dataset, epochs=50)

In [None]:
#Measure training time on a CPU
%%timeit -n1 -r1 
with tf.device('/CPU:0'):
    cpu_model = get_model()
    cpu_model.fit(train_dataset, epochs=50)