### CNN Case Study: VGGNet

VGGNet was developed by the Visual Geometry Group together with DeepMind. It explored the relationship between the depth of a CNN and its performance, building CNN structures that are 16 to 19 layers deep.<br>
* Simple structure: 5 convolutional blocks (each containing several conv layers), 3 fully-connected layers, and 1 softmax output; the convolutional blocks are separated by max-pooling.
* Smaller conv kernels ($3*3$): fewer parameters and more non-linear mappings.
* Smaller max-pooling $2*2$
* More channels (64 - 128 - 256 - 512 - 512), so that more features can be extracted.

#### The VGGNet structures are shown below (6 networks: A, A-LRN, B, C, D, E); each convolutional block has 1 to 4 sub-layers, and the total depth ranges from 11 to 19 layers:

![title](VGGNets.png)

#### For example, VGGNet 16 consists of an input layer, conv layers, fully-connected layers, and an output layer.

![title](VGG16.png)

    1. Input 224*224*3, 2 convolutions by 64 conv-kernels with size 3*3 and ReLU, gives post-convolution size of 224*224*64
    2. Max-pooling with 2*2, gives size 112*112*64
    
    3. 2 convolutions by 128 conv-kernels with size 3*3 and ReLU, gives 112*112*128
    4. Max-pooling with 2*2, gives size 56*56*128
    
    5. 3 convolutions by 256 conv-kernels with size 3*3 and ReLU, gives 56*56*256
    6. Max-pooling with 2*2, gives size 28*28*256
    
    7. 3 convolutions by 512 conv-kernels with size 3*3 and ReLU, gives 28*28*512
    8. Max-pooling with 2*2, gives size 14*14*512
    
    9. 3 convolutions by 512 conv-kernels with size 3*3 and ReLU, gives 14*14*512
    10. Max-pooling with 2*2, gives size 7*7*512
    
    11. Fully-connected and ReLU with 2 layers of 1*1*4096, and 1 layer of 1*1*1000
    12. Softmax to give 1000 predictions

The authors drew the following conclusions by comparing networks of all depths:
* LRN layers are of little help
* Performance improves as the network gets deeper
* $1*1$ convolution is effective, but $3*3$ works better, and larger kernels can be used to learn larger features

VGGNet was trained on 4 GeForce GTX Titan GPUs. This time we are not going to train on ImageNet; instead we evaluate the time consumed by inference (forward) and by training (forward-backward).<br>
VGGNet 16 is implemented in this study for that time-consumption evaluation.

In [2]:
from datetime import datetime
import math
import time
import tensorflow as tf

# create conv-layer, pass the params into list. 
# kh: kernel height; kw: kernel width; n_out: Channels
# dh: step height; dw: step width; p: params list
def conv_builder(input_op, name, kh, kw, n_out, dh, dw, p):
    """Build a convolution + bias + ReLU layer and record its variables.

    Args:
        input_op: input tensor; the size of its last dimension is used as
            the number of input channels.
        name: name-scope name for the layer.
        kh, kw: kernel height and width.
        n_out: number of output channels (number of kernels).
        dh, dw: stride along height and width.
        p: parameter list; the layer's kernel and bias variables are
            appended to it in place.

    Returns:
        The ReLU activation tensor of the layer.
    """
    in_channels = input_op.get_shape()[-1].value

    with tf.name_scope(name) as scope:
        # Kernel variable is named "<scope>w" and Xavier-initialised.
        weights = tf.get_variable(
            scope + "w",
            shape=[kh, kw, in_channels, n_out],
            dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer_conv2d(),
        )
        strides = (1, dh, dw, 1)
        conv_out = tf.nn.conv2d(input_op, weights, strides, padding='SAME')

        # Bias starts at zero and is trainable.
        bias = tf.Variable(
            tf.constant(0.0, shape=[n_out], dtype=tf.float32),
            trainable=True,
            name='b',
        )
        relu_out = tf.nn.relu(tf.nn.bias_add(conv_out, bias), name=scope)

        p.extend([weights, bias])

    return relu_out
    
# create fully-connected layer.
def fc_builder(input_op, name, n_out, p):
    """Build a fully-connected layer with ReLU and record its variables.

    Args:
        input_op: input tensor; the size of its last dimension is used as
            the number of input features.
        name: name-scope name for the layer.
        n_out: number of output nodes.
        p: parameter list; the layer's weight and bias variables are
            appended to it in place.

    Returns:
        The output tensor of ``tf.nn.relu_layer`` for this layer.
    """
    in_features = input_op.get_shape()[-1].value

    with tf.name_scope(name) as scope:
        # Weight variable is named "<scope>w" and Xavier-initialised.
        weights = tf.get_variable(
            scope + "w",
            shape=[in_features, n_out],
            dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer(),
        )
        # Bias starts at 0.1 for every output node.
        bias_init = tf.constant(0.1, shape=[n_out], dtype=tf.float32)
        bias = tf.Variable(bias_init, name='b')

        relu_out = tf.nn.relu_layer(input_op, weights, bias, name=scope)
        p.extend([weights, bias])

    return relu_out
    
# create max-pooling layer.
def mpool_builder(input_op, name, kh, kw, dh, dw):
    """Build a max-pooling layer with 'SAME' padding.

    Args:
        input_op: input tensor.
        name: op name for the pooling layer.
        kh, kw: pooling window height and width.
        dh, dw: stride along height and width.

    Returns:
        The tensor produced by ``tf.nn.max_pool``.
    """
    window = [1, kh, kw, 1]
    step = [1, dh, dw, 1]
    return tf.nn.max_pool(
        input_op,
        ksize=window,
        strides=step,
        padding='SAME',
        name=name,
    )


In [3]:
def inference_builder(input_op, keep_prob):
    """Build the VGGNet 16 inference graph.

    Five convolutional stages (3*3 kernels, stride 1), each followed by a
    2*2 max-pooling with stride 2, then three fully-connected layers with
    dropout after the first two, and a softmax over 1000 outputs.

    Args:
        input_op: input image tensor (the benchmark feeds 224*224*3 images).
        keep_prob: keep probability passed to both dropout layers.

    Returns:
        A tuple ``(predictions, softmax, fc8, p)``: the argmax over the
        softmax, the softmax tensor, the last fully-connected layer's output,
        and the list of all kernel/bias variables created by the layers.
    """
    params = []

    # (output channels, conv layer names, pooling layer name) per stage.
    # For a 224*224*3 input the stage outputs are 112*112*64, 56*56*128,
    # 28*28*256, 14*14*512 and 7*7*512.
    stages = (
        (64, ('conv1_1', 'conv1_2'), 'pool1'),
        (128, ('conv2_1', 'conv2_2'), 'pool2'),
        (256, ('conv3_1', 'conv3_2', 'conv3_3'), 'pool3'),
        (512, ('conv4_1', 'conv4_2', 'conv4_3'), 'pool4'),
        (512, ('conv5_1', 'conv5_2', 'conv5_3'), 'pool5'),
    )

    net = input_op
    for channels, conv_names, pool_name in stages:
        for conv_name in conv_names:
            net = conv_builder(net, name=conv_name, kh=3, kw=3,
                               n_out=channels, dh=1, dw=1, p=params)
        net = mpool_builder(net, name=pool_name, kh=2, kw=2, dw=2, dh=2)

    # Flatten height * width * channels into one vector per example.
    dims = net.get_shape()
    flat_size = dims[1].value * dims[2].value * dims[3].value
    flat = tf.reshape(net, [-1, flat_size], name="reshape1")

    # Two hidden fully-connected layers of 4096 nodes, each with dropout.
    fc6 = fc_builder(flat, name='fc6', n_out=4096, p=params)
    fc6_drop = tf.nn.dropout(fc6, keep_prob, name='fc6_drop')

    fc7 = fc_builder(fc6_drop, name='fc7', n_out=4096, p=params)
    fc7_drop = tf.nn.dropout(fc7, keep_prob, name='fc7_drop')

    # 1000 output nodes fed to softmax; argmax gives the predicted class.
    fc8 = fc_builder(fc7_drop, name='fc8', n_out=1000, p=params)
    softmax = tf.nn.softmax(fc8)
    predictions = tf.argmax(softmax, 1)

    return predictions, softmax, fc8, params

In [None]:
def run_time(session, target, feed, info):
    """Time repeated runs of ``target`` and print per-batch statistics.

    Runs ``session.run(target, feed_dict=feed)`` for 10 warm-up steps plus
    ``num_batches`` measured steps (``num_batches`` is read from the module
    globals). Every 10th step prints its duration; after the loop a single
    summary line with the mean and standard deviation is printed.

    Args:
        session: object exposing ``run(target, feed_dict=...)``.
        target: op(s) to evaluate on every step.
        feed: feed dict, used here to control the dropout keep probability.
        info: label included in the summary line.
    """
    num_steps_burn_in = 10  # warm-up steps, excluded from the statistics
    total_duration = 0.0
    total_duration_squared = 0.0

    for i in range(num_batches + num_steps_burn_in):
        start_time = time.time()
        _ = session.run(target, feed_dict=feed)  # feed_dict for drop-out probabilities control
        duration = time.time() - start_time
        if i >= num_steps_burn_in:
            if not i % 10:
                print('%s: step %d, duration = %.3f' % (datetime.now(), i - num_steps_burn_in, duration))
            total_duration += duration
            total_duration_squared += duration * duration

    # Summary statistics are computed once, after all measured steps.
    mean = total_duration / num_batches
    vari = total_duration_squared / num_batches - mean * mean
    # Round-off can push the variance marginally below zero; clamp so that
    # math.sqrt does not raise ValueError.
    stan = math.sqrt(max(vari, 0.0))

    print('%s: %s arcoss %d steps, %.3f +/- %.3f sec / batch' % (datetime.now(), info, num_batches, mean, stan))


def benchmark():
    """Build VGGNet 16 on random images and time forward and backward passes.

    Reads ``batch_size`` from the module globals. The network is built in a
    fresh graph, its variables are initialised, and ``run_time`` is called
    twice: once on the predictions (keep probability 1.0) and once on the
    gradients of an L2 loss over ``fc8`` (keep probability 0.5).
    """
    with tf.Graph().as_default():
        image_size = 224
        # Random images stand in for ImageNet data.
        images = tf.Variable(tf.random_normal([batch_size, image_size, image_size, 3],
                                              dtype=tf.float32, stddev=1e-1))

        keep_prob = tf.placeholder(tf.float32)
        predictions, softmax, fc8, p = inference_builder(images, keep_prob)  # construct VGGNet 16

        init = tf.global_variables_initializer()

        # Use the session as a context manager so it is closed (and its
        # resources released) even if a timing run raises.
        with tf.Session() as sess:
            sess.run(init)

            run_time(sess, predictions, {keep_prob: 1.0}, "Forward")  # Forward run-time

            objective = tf.nn.l2_loss(fc8)
            grad = tf.gradients(objective, p)
            run_time(sess, grad, {keep_prob: 0.5}, "Forward-backward")  # Backward run-time

# Benchmark settings, read as module globals: batch_size by benchmark(),
# num_batches by run_time().
batch_size, num_batches = 4, 100
benchmark()

VGGNet reached a 7.3% error rate in ILSVRC 2014. Even though VGGNet has more parameters than AlexNet, it needs fewer iterations to converge. This is because the deeper network and the smaller kernels act as an implicit regularization.