### Basic knowledge about GPU
- Colab uses Tesla K80; if you are lucky, you can get a Tesla P100 or T4.
- Kaggle uses Tesla P100.
- Both Colab and Kaggle provide only a single CPU core (if you want to use `ImageDataGenerator`, one core is very slow).
- Performance benchmark: 1080Ti = (2 ~ 3)x K80; 1080Ti = P100
- V100 > P100 = 1080Ti > T4 > K80

### A great article comparing the specs, performance, and prices of Colab, AWS, Google Cloud, etc.
https://towardsdatascience.com/maximize-your-gpu-dollars-a9133f4e546a

### Google Cloud: $300 free credit, and how to set it up:
https://medium.com/@jamsawamsa/running-a-google-cloud-gpu-for-fast-ai-for-free-5f89c707bae6

or even $500 free credit??

https://codefresh.io/google-cloud/?utm_source=Google&utm_medium=Search&utm_campaign=KubGCredit2&gclid=Cj0KCQiAw4jvBRCJARIsAHYewPPTDCKVx9nmfxO1TKL-6a-XOg3NZamNFVsGLj_9MlclxbajHWNcDq8aAp9lEALw_wcB

# Check CPU, Memory, Disk

In [2]:
# CPU count as reported by Python's multiprocessing module
import multiprocessing
print(multiprocessing.cpu_count())

# CPU model string, filtered from the lscpu report
!lscpu |grep 'Model name'

# Number of sockets, i.e. available slots for physical processors
!lscpu | grep 'Socket(s):'

# Number of hardware threads per core
!lscpu | grep 'Thread(s) per core'

# L3 cache size line from lscpu
!lscpu | grep 'L3 cache'

# Every lscpu line mentioning MHz (clock speed)
!lscpu | grep MHz

# The MemAvailable line from /proc/meminfo
!cat /proc/meminfo | grep 'MemAvailable'

# Fourth column of `df -h /` (the "Avail" column for the root filesystem)
!df -h / | awk '{print $4}'

8
Model name:            Intel(R) Xeon(R) CPU @ 2.20GHz
Socket(s):             1
Thread(s) per core:    2
L3 cache:              56320K
CPU MHz:               2200.000
MemAvailable:   29784976 kB
Avail
74G


# Check GPU

In [6]:
# %tensorflow_version 2.x

In [3]:
# Report the installed TensorFlow version and whether it can see a GPU.
import tensorflow as tf
print(tf.__version__)
# NOTE(review): newer TensorFlow releases mark tf.test.is_gpu_available() as deprecated
# in favour of tf.config.list_physical_devices('GPU') -- confirm against the installed version.
print(tf.test.is_gpu_available())
# Prints the device name string of a GPU device (e.g. "/device:GPU:0" in the recorded output).
print(tf.test.gpu_device_name())

2.0.0
True
/device:GPU:0


In [1]:
# Install the third-party packages imported right below (GPUtil, psutil, humanize).
# NOTE(review): the installs are unpinned and use `!pip3`; `%pip install pkg==x.y.z`
# would target the kernel's own environment and make the notebook reproducible.
!pip3 install gputil
# !ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip3 install psutil
!pip3 install humanize

import psutil
import humanize
import os
import GPUtil as GPU
import json

# Report how many GPUs GPUtil can see and, if there is at least one,
# summarise host RAM, this process's memory footprint and the first GPU.
print('--------------------')
GPUs = GPU.getGPUs()
print('GPU number: {}'.format(len(GPUs)))

if GPUs:
    gpu = GPUs[0]  # only the first device is reported
    process = psutil.Process(os.getpid())
    # Two positional arguments on purpose: print() joins them with one space.
    print(
        "Gen RAM Free: " + humanize.naturalsize(psutil.virtual_memory().available),
        " | Proc size: " + humanize.naturalsize(process.memory_info().rss),
    )
    print(
        "GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(
            gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal
        )
    )
    print("GPU Name: " + gpu.name)
    # Dump every attribute GPUtil recorded for the device.
    print(json.dumps(gpu.__dict__, indent=2))

--------------------
GPU number: 1
Gen RAM Free: 30.5 GB  | Proc size: 186.0 MB
GPU RAM Free: 16280MB | Used: 0MB | Util   0% | Total 16280MB
GPU Name: Tesla P100-PCIE-16GB
{
  "memoryFree": 16280.0,
  "name": "Tesla P100-PCIE-16GB",
  "display_mode": "Enabled",
  "load": 0.0,
  "memoryTotal": 16280.0,
  "temperature": 41.0,
  "display_active": "Disabled",
  "memoryUsed": 0.0,
  "uuid": "GPU-3b3ea54c-a961-24d4-5942-133fe25ad1b9",
  "driver": "410.104",
  "memoryUtil": 0.0,
  "id": 0,
  "serial": "0324317003687"
}


In [1]:
!nvidia-smi

Tue Dec  3 06:07:57 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 410.104      Driver Version: 410.104      CUDA Version: 10.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    36W / 300W |      0MiB / 16130MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

# CNN Benchmark Test

### Model: ResNet with 17 Conv2D layers. 
### Dataset: Cifar-10

### On Colab, GPU = K80:
- w/o image augmentation: 50s/epoch
- w/  image augmentation (1 cpu): 150s/epoch

### On Colab, GPU = P100:
- w/o image augmentation: 13s/epoch
- w/  image augmentation (1 cpu): 90s/epoch

In [None]:
# %tensorflow_version 2.x

In [4]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

import os

# import tensorflow as tf
# from tensorflow.keras.datasets import cifar10
# from tensorflow.keras.models import Model, Sequential 
# from tensorflow.keras.layers import Input, Conv2D, Dense
# from tensorflow.keras.layers import MaxPooling2D, AveragePooling2D, ZeroPadding2D
# from tensorflow.keras.layers import Add, Activation, Flatten, Dropout, BatchNormalization
# from tensorflow.keras.optimizers import Adam, RMSprop
# from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, LearningRateScheduler, ReduceLROnPlateau
# from tensorflow.keras.preprocessing.image import ImageDataGenerator
# from tensorflow.keras.utils import to_categorical
# from tensorflow.keras import models
# from tensorflow.keras import backend as K

from keras.datasets import cifar10
from keras.models import Model, Sequential 
from keras.layers import Input, Conv2D, Dense
from keras.layers import MaxPooling2D, AveragePooling2D, ZeroPadding2D
from keras.layers import Add, Activation, Flatten, Dropout, BatchNormalization
from keras.optimizers import Adam, RMSprop
from keras.callbacks import TensorBoard, ModelCheckpoint, LearningRateScheduler, ReduceLROnPlateau
from keras.preprocessing.image import ImageDataGenerator
from keras.utils import to_categorical
from keras import models
from keras import backend as K

from sklearn.metrics import confusion_matrix
import seaborn as sns

# Import tensorflow here so this cell does not rely on a `tf` name left in the
# kernel by the earlier GPU-check cell (the import above is commented out).
import tensorflow as tf
print(tf.__version__)


# Load CIFAR-10 as (images, integer labels) pairs for the train and test splits.
(X_train, y_train), (X_test, y_test) = cifar10.load_data()
num_classes = 10

# Convert y_train and y_test into one-hot labels
y_train = to_categorical(y_train, num_classes)
y_test  = to_categorical(y_test, num_classes)

# Rescale X_train and X_test: cast to float32 and divide by 255
X_train = X_train.astype('float32') / 255
X_test  = X_test.astype('float32') / 255

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)


def identity_block(X, f, filters, shortcut_on, dropout_on, dropout_rates, stage, block):
    """
    Implementation of the identity block: two Conv-BatchNorm steps on the main
    path, with the (optionally dropped-out) input added back before the final ReLU.

    Arguments:
    X -- input tensor of shape (m, n_H_prev, n_W_prev, n_C_prev)
    f -- integer, side length of the square Conv kernel used by both main-path Conv layers
    filters -- python list of two integers [F1, F2], the number of filters of the first
               and second Conv layer of the main path
    shortcut_on -- boolean, add the shortcut to the main path or not
    dropout_on -- boolean, with dropout layers or not
    dropout_rates -- python list of two floats [D1, D2]; D1 is applied after the first
                     Conv/BatchNorm/ReLU of the main path, D2 is applied to the shortcut
    stage -- integer, used to name the layers, depending on their position in the network
    block -- string/character, used to name the layers, depending on their position in the network

    Returns:
    X -- output tensor of the identity block (stride 1 and 'same' padding are used
         throughout, and the last Conv layer has F2 filters)

    Note:
    When shortcut_on is True, the input channel number and the output channel
    number (filters[1]) must be the same, because the input is added unchanged.
    """

    # defining name basis
    conv_name_base = 'res' + str(stage) + block + '_branch'
    bn_name_base = 'bn' + str(stage) + block + '_branch'

    # Retrieve Filters
    F1, F2 = filters

    # Retrieve dropout_rates
    D1, D2 = dropout_rates

    # Save the input value; it is added back to the main path below.
    X_shortcut = X

    ##### MAIN PATH #####
    # First component of main path (kernel size now honours `f` instead of a fixed 3x3)
    X = Conv2D(filters=F1, kernel_size=(f, f), strides=(1,1), padding='same', name=conv_name_base+'2a')(X)
    X = BatchNormalization(name=bn_name_base+'2a')(X)
    X = Activation('relu')(X)
    if dropout_on:
        X = Dropout(D1)(X)

    # Second component of main path
    X = Conv2D(filters=F2, kernel_size=(f, f), strides=(1,1), padding='same', name=conv_name_base+'2b')(X)
    X = BatchNormalization(name=bn_name_base+'2b')(X)
    ##### MAIN PATH END #####

    ##### SHORTCUT PATH ####
    if dropout_on:
        X_shortcut = Dropout(D2)(X_shortcut)  # Shortcut with dropout performs better

    # Final step: add shortcut value to main path, and pass it through a RELU activation
    if shortcut_on:
        X = Add()([X, X_shortcut])

    X = Activation('relu')(X)

    return X


def convolutional_block(X, f, filters, stride, shortcut_on, dropout_on, dropout_rates, stage, block):
    """
    Implementation of the convolutional block: like the identity block, but the
    shortcut goes through its own 1x1 Conv layer so that its stride and channel
    count match the main path.

    Arguments:
    X -- input tensor of shape (m, n_H_prev, n_W_prev, n_C_prev)
    f -- integer, side length of the square Conv kernel used by both main-path Conv layers
    filters -- python list of two integers [F1, F2], the number of filters of the first
               and second Conv layer of the main path; the shortcut Conv also uses F2
    stride -- Integer, stride of the first main-path Conv layer and of the shortcut Conv layer
    shortcut_on -- boolean, add the shortcut to the main path or not
    dropout_on -- boolean, with dropout layers or not
    dropout_rates -- python list of two floats [D1, D2]; D1 is applied after the first
                     Conv/BatchNorm/ReLU of the main path, D2 is applied to the shortcut
    stage -- integer, used to name the layers, depending on their position in the network
    block -- string/character, used to name the layers, depending on their position in the network

    Returns:
    X -- output tensor of the convolutional block (the last main-path Conv layer has F2 filters)
    """

    # defining name basis
    conv_name_base = 'res' + str(stage) + block + '_branch'
    bn_name_base = 'bn' + str(stage) + block + '_branch'

    # Retrieve Filters
    F1, F2 = filters

    # Retrieve dropout_rates
    D1, D2 = dropout_rates

    # Save the input value; it feeds the shortcut path below.
    X_shortcut = X

    ##### MAIN PATH #####
    # First component of main path (kernel size now honours `f` instead of a fixed 3x3)
    X = Conv2D(filters=F1, kernel_size=(f, f), strides=(stride,stride), padding='same', name=conv_name_base+'2a')(X)
    X = BatchNormalization(name=bn_name_base+'2a')(X)
    X = Activation('relu')(X)
    if dropout_on:
        X = Dropout(D1)(X)

    # Second component of main path
    X = Conv2D(filters=F2, kernel_size=(f, f), strides=(1,1), padding='same', name=conv_name_base+'2b')(X)
    X = BatchNormalization(name=bn_name_base+'2b')(X)
    ##### MAIN PATH END #####

    ##### SHORTCUT PATH ####
    # 1x1 Conv with the same stride and F2 filters as the main path
    X_shortcut = Conv2D(filters=F2, kernel_size=(1,1), strides=(stride,stride), padding='valid', name=conv_name_base+'1')(X_shortcut)
    if dropout_on:
        X_shortcut = Dropout(D2)(X_shortcut)  # Shortcut with dropout performs better

    # Final step: add shortcut value to main path, and pass it through a RELU activation
    if shortcut_on:
        X = Add()([X, X_shortcut])
    X = Activation('relu')(X)

    return X


def ResNet(shortcut_on, dropout_on, dropout_rates, input_shape=(32, 32, 3), classes=10):
    """
    Assemble the residual CNN: a stem convolution, four residual stages each
    followed by 2x2 max-pooling, and a dense classification head.

    Arguments:
    shortcut_on -- boolean, forwarded to every residual block
    dropout_on -- boolean, forwarded to every residual block
    dropout_rates -- python list of float, forwarded to every residual block
    input_shape -- shape of the images of the dataset
    classes -- integer, number of units of the softmax output layer

    Returns:
    model -- a Model() instance in Keras
    """
    inputs = Input(input_shape)

    # Stem: pad by one pixel so the un-padded 3x3 conv keeps the spatial size.
    net = ZeroPadding2D((1, 1))(inputs)
    net = Conv2D(32, (3, 3), strides=(1, 1), activation='relu', name='conv0')(net)
    net = BatchNormalization(name='bn_conv0')(net)

    # Stage 1: a single identity block, then pooling.
    net = identity_block(net, 3, [32, 32], shortcut_on, dropout_on, dropout_rates, stage=1, block='a')
    net = MaxPooling2D((2, 2))(net)

    # Stages 2-4: a convolutional block widens the channels, an identity block
    # follows at the same width, then pooling.
    for stage, width in enumerate((64, 128, 256), start=2):
        net = convolutional_block(net, 3, [width, width], 1, shortcut_on, dropout_on, dropout_rates, stage=stage, block='a')
        net = identity_block(net, 3, [width, width], shortcut_on, dropout_on, dropout_rates, stage=stage, block='b')
        net = MaxPooling2D((2, 2))(net)

    # Dense head: BatchNorm -> Dense (-> Dropout for the first two layers).
    net = Flatten()(net)
    for fc_name, units, drop in (('fc1', 1024, 0.4), ('fc2', 1024, 0.4), ('fc3', 512, None)):
        net = BatchNormalization()(net)
        net = Dense(units=units, activation='relu', name=fc_name)(net)
        if drop is not None:
            net = Dropout(drop)(net)
    net = BatchNormalization()(net)

    # Output layer
    net = Dense(units=classes, activation='softmax', name='fc_out')(net)

    return Model(inputs=inputs, outputs=net, name='ResNet')


def run(cnn_model, X_train, y_train, batch_size, epochs, data_augmentation, validation_data,
        workers=8, use_multiprocessing=True):
    """
    Train cnn_model, either directly on the arrays or on batches produced by a
    real-time ImageDataGenerator augmentation pipeline.

    Arguments:
    cnn_model -- compiled model exposing fit() and fit_generator()
    X_train -- training inputs
    y_train -- training targets
    batch_size -- integer, number of samples per batch
    epochs -- integer, number of epochs to train
    data_augmentation -- boolean, train on augmented batches or on the raw arrays
    validation_data -- forwarded unchanged as the validation_data argument
    workers -- integer, forwarded to fit_generator (augmentation path only)
    use_multiprocessing -- boolean, forwarded to fit_generator (augmentation path only)

    Returns:
    history -- whatever fit() / fit_generator() returned for this training run
    """
    if not data_augmentation:
        print('Not using data augmentation.')
        return cnn_model.fit(X_train, y_train,
                             batch_size=batch_size,
                             epochs=epochs,
                             validation_data=validation_data,
                             shuffle=True)

    print('Using real-time data augmentation.')
    # This will do preprocessing and realtime data augmentation:
    datagen = ImageDataGenerator(
        featurewise_center=False,  # set input mean to 0 over the dataset
        samplewise_center=False,  # set each sample mean to 0
        featurewise_std_normalization=False,  # divide inputs by std of the dataset
        samplewise_std_normalization=False,  # divide each input by its std
        zca_whitening=False,  # apply ZCA whitening
        zca_epsilon=1e-06,  # epsilon for ZCA whitening
        rotation_range=30,  # randomly rotate images in the range (degrees, 0 to 180)
        # randomly shift images horizontally (fraction of total width)
        width_shift_range=0.2,
        # randomly shift images vertically (fraction of total height)
        height_shift_range=0.2,
        shear_range=0.2,  # set range for random shear
        zoom_range=0.2,  # set range for random zoom
        channel_shift_range=0.,  # set range for random channel shifts
        # set mode for filling points outside the input boundaries
        fill_mode='nearest',
        cval=0.,  # value used for fill_mode = "constant"
        horizontal_flip=True,  # randomly flip images
        vertical_flip=False,  # randomly flip images
        # set rescaling factor (applied before any other transformation)
        rescale=None,
        # set function that will be applied on each input
        preprocessing_function=None,
        # image data format, either "channels_first" or "channels_last"
        data_format=None,
        # fraction of images reserved for validation (strictly between 0 and 1)
        validation_split=0.0)

    # Compute quantities required for feature-wise normalization
    # (std, mean, and principal components if ZCA whitening is applied).
    # NOTE(review): all of those options are disabled above, so this call is
    # presumably redundant -- confirm before removing.
    datagen.fit(X_train)

    # Fit the model on the batches generated by datagen.flow().
    return cnn_model.fit_generator(datagen.flow(X_train, y_train, batch_size=batch_size),
                                   epochs=epochs,
                                   validation_data=validation_data,
                                   workers=workers,
                                   use_multiprocessing=use_multiprocessing)


# Model configuration
shortcut_on, dropout_on = True, True
dropout_rates = [0.3, 0.2]
opt = Adam(learning_rate=0.001)  # alternative: RMSprop(lr=0.001)

# Build and compile the network
cnn_model = ResNet(shortcut_on, dropout_on, dropout_rates)
cnn_model.compile(optimizer=opt,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

# Benchmark settings shared by both runs
batch_size = 1024
epochs = 2
validation_data = (X_test, y_test)

# First run w/o data augmentation, then with it on the same model.
# (Note: ImageDataGenerator runs on CPU, so the augmented run is slow.)
for data_augmentation in (False, True):
    run(cnn_model, X_train, y_train, batch_size, epochs, data_augmentation, validation_data)

2.0.0
(50000, 32, 32, 3)
(50000, 10)
(10000, 32, 32, 3)
(10000, 10)
Not using data augmentation.
Train on 50000 samples, validate on 10000 samples
Epoch 1/2
Epoch 2/2
Using real-time data augmentation.
Epoch 1/2
Epoch 2/2
