### CNN Case Study: AlexNet

###### Features: ReLU, Dropout, LRN, GPU...

AlexNet built on the CNN structure that LeCun proposed and strengthened it with several techniques:<br>
* Sigmoid suffers from vanishing gradients in deep networks, so ReLU is used as the activation function instead.
* Dropout randomly ignores some of the neurons in the last fully connected (FC) layers during training to reduce overfitting.
* Overlapping max-pooling is used rather than average-pooling.
* Local Response Normalization (LRN) mimics lateral inhibition, which improves the generalization of the model.
* CUDA acceleration on two GTX 580 GPUs.
* Data augmentation.

#### AlexNet structure as shown:

![title](AlexNet.png)

Below is an implementation of AlexNet. Training AlexNet on the ImageNet dataset is very time-consuming, so I am not going to use the ImageNet data here. Instead, the complete structure is built, and the speed of forward-propagation and back-propagation is measured for each single batch.

In [1]:
from datetime import datetime
import math
import time
import tensorflow as tf

# Number of images in the random input batch built by benchmark().
batch_size = 32
# Number of timed iterations per run_time() call (warm-up steps are added on top).
num_batches = 100

def report_info(x):
    """Print the op name and static shape of tensor ``x`` on one line."""
    layer_name = x.op.name
    dims = x.get_shape().as_list()
    print(layer_name, ' ', dims)

def inference(images):
    """Build the five convolutional layers of AlexNet on top of ``images``.

    The graph is conv1 -> lrn1 -> pool1 -> conv2 -> lrn2 -> pool2 -> conv3
    -> conv4 -> conv5 -> pool5. The name and shape of every conv and pooling
    output is printed through ``report_info`` as the graph is built.

    Args:
        images: input tensor fed to the first convolution; the first kernel
            is 11x11x3x64, so 3 input channels are expected.

    Returns:
        A ``(pool5, params)`` tuple: the output of the last pooling layer and
        the list of kernel/bias variables in layer order
        (``[kernel1, biases1, kernel2, biases2, ...]``).
    """
    collected = []

    def conv_relu(inputs, scope_name, kernel_shape, strides):
        # One conv -> bias-add -> ReLU stage inside its own name scope; the
        # bias vector has one entry per output channel (last kernel dim).
        with tf.name_scope(scope_name) as scope:
            weights = tf.Variable(
                tf.truncated_normal(kernel_shape, dtype=tf.float32, stddev=1e-1),
                name='weights')
            convolved = tf.nn.conv2d(inputs, weights, strides, padding='SAME')
            offsets = tf.Variable(
                tf.constant(0.0, shape=[kernel_shape[-1]], dtype=tf.float32),
                trainable=True, name='biases')
            activation = tf.nn.relu(tf.nn.bias_add(convolved, offsets), name=scope)

            report_info(activation)
            collected.extend([weights, offsets])
        return activation

    def max_pool_3x3(inputs, op_name):
        # 3x3 window with stride 2, i.e. overlapping max-pooling.
        pooled = tf.nn.max_pool(inputs, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                                padding='VALID', name=op_name)
        report_info(pooled)
        return pooled

    unit_stride = [1, 1, 1, 1]

    conv1 = conv_relu(images, 'conv1', [11, 11, 3, 64], [1, 4, 4, 1])
    lrn1 = tf.nn.lrn(conv1, 4, bias=1, alpha=0.001/9, beta=0.75, name='lrn1')
    pool1 = max_pool_3x3(lrn1, 'pool1')

    conv2 = conv_relu(pool1, 'conv2', [5, 5, 64, 192], unit_stride)
    lrn2 = tf.nn.lrn(conv2, 4, bias=1, alpha=0.001/9, beta=0.75, name='lrn2')
    pool2 = max_pool_3x3(lrn2, 'pool2')

    conv3 = conv_relu(pool2, 'conv3', [3, 3, 192, 384], unit_stride)
    conv4 = conv_relu(conv3, 'conv4', [3, 3, 384, 256], unit_stride)
    conv5 = conv_relu(conv4, 'conv5', [3, 3, 256, 256], unit_stride)

    pool5 = max_pool_3x3(conv5, 'pool5')

    return pool5, collected


The following function was designed for evaluating the running time of AlexNet.

In [2]:
def run_time(session, target, info):
    """Time repeated ``session.run(target)`` calls and print the statistics.

    The first ``num_steps_burn_in`` runs are warm-up and are excluded from the
    statistics. The following ``num_batches`` runs (module-level constant) are
    timed; progress is printed every 10th step, and one summary line with the
    mean and standard deviation of the per-batch duration is printed at the
    end.

    Args:
        session: object exposing ``run(target)`` (a TensorFlow session).
        target: the op or tensor(s) to evaluate on each step.
        info: label printed in the summary line (e.g. "Forward").
    """
    num_steps_burn_in = 10  # warm-up iterations, not included in the stats
    total_duration = 0.0
    total_duration_squared = 0.0

    for i in range(num_batches + num_steps_burn_in):
        start_time = time.time()
        session.run(target)
        duration = time.time() - start_time
        if i < num_steps_burn_in:
            continue
        if i % 10 == 0:
            print('%s: step %d, duration = %.3f' % (datetime.now(), i - num_steps_burn_in, duration))
        total_duration += duration
        total_duration_squared += duration * duration

    # Summary statistics are computed once, after all timed steps are done.
    mean = total_duration / num_batches
    vari = total_duration_squared / num_batches - mean * mean
    # Rounding can push a near-zero variance slightly negative; clamp it so
    # math.sqrt does not raise ValueError.
    stan = math.sqrt(max(vari, 0.0))

    print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' % (datetime.now(), info, num_batches, mean, stan))

Now the main function.

In [None]:
def benchmark():
    """Build the AlexNet conv stack on random input and time it.

    A fresh graph is created, ``inference`` is applied to a batch of random
    224x224x3 images, and ``run_time`` is called twice: once for the forward
    pass (evaluating ``pool5``) and once for the forward-backward pass
    (evaluating the gradients of an L2 loss on ``pool5`` with respect to all
    layer parameters).
    """
    with tf.Graph().as_default():
        image_size = 224
        # Random data stands in for ImageNet; only the speed is measured.
        images = tf.Variable(tf.random_normal([batch_size, image_size, image_size, 3], dtype=tf.float32, stddev=1e-1))
        pool5, parameters = inference(images)

        init = tf.global_variables_initializer()
        # The context manager closes the session and releases its resources,
        # even if one of the timing runs raises.
        with tf.Session() as sess:
            sess.run(init)

            run_time(sess, pool5, "Forward")
            objective = tf.nn.l2_loss(pool5)
            grad = tf.gradients(objective, parameters)
            run_time(sess, grad, "Forward-backward")

# Run the forward and forward-backward timing benchmark.
benchmark()
        

conv1   [32, 56, 56, 64]
pool1   [32, 27, 27, 64]
conv2   [32, 27, 27, 192]
pool2   [32, 13, 13, 192]
conv3   [32, 13, 13, 384]
conv4   [32, 13, 13, 256]
conv5   [32, 13, 13, 256]
pool5   [32, 6, 6, 256]


Actually training a CNN takes much more time, because it requires many iterations. AlexNet won ILSVRC 2012 with a top-5 error rate of 16.4%. <br>

It is important to mention that a large dataset like ImageNet helps keep deep learning models from overfitting. Traditional machine learning models can work with smaller datasets, but deep learning models have a large learning capacity and need large datasets to make use of it.

To summarize, CNNs have a strong ability to extract features.