In [163]:
# Import necessary packages
import tensorflow as tf
import tqdm
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [205]:
def conv_layer(input_, filter_size, out_channels, stride_size, padding_name, is_training,
               activation_fn=None, identifier=''):
    """Build a 2-D convolution followed by hand-written batch normalization.

    Args:
        input_: Input tensor; its last dimension is read as the channel count
            (the strides and moment axes used below match an NHWC layout).
        filter_size: Height and width of the square convolution kernel.
        out_channels: Number of output feature maps.
        stride_size: Stride applied along both spatial dimensions.
        padding_name: Padding mode forwarded to ``tf.nn.conv2d``.
        is_training: Boolean tensor handed to ``tf.cond``. When true the batch
            statistics are used and the moving averages are updated; otherwise
            the stored moving averages are used.
        activation_fn: Optional callable applied to the normalized output.
        identifier: Name given to the convolution op.

    Returns:
        The batch-normalized convolution output, passed through
        ``activation_fn`` when one is supplied.
    """
    channels_in = input_.shape[-1].value

    # Convolution kernel, drawn from tf.truncated_normal with its defaults.
    kernel_shape = [filter_size, filter_size, channels_in, out_channels]
    kernel = tf.Variable(tf.truncated_normal(shape=kernel_shape))

    conv_output = tf.nn.conv2d(input_, kernel,
                               strides=[1, stride_size, stride_size, 1],
                               padding=padding_name, name=identifier)
    print("conv_output shape", conv_output.shape)

    # Per-channel batch-norm parameters: learnable scale/shift plus
    # non-trainable moving statistics used at inference time.
    channel_count = conv_output.shape[-1].value
    scale = tf.Variable(tf.ones([channel_count]))
    shift = tf.Variable(tf.zeros([channel_count]))
    moving_mean = tf.Variable(tf.zeros([channel_count]), trainable=False)
    moving_var = tf.Variable(tf.ones([channel_count]), trainable=False)

    epsilon = 1e-3  # keeps the square root away from zero
    decay = 0.99    # weight of the previous moving statistic

    def _normalize(mean, variance):
        # Standardize with the given statistics, then apply scale and shift.
        centered = conv_output - mean
        return scale * (centered / tf.sqrt(variance + epsilon)) + shift

    def _training_branch():
        # Moments over batch, height and width leave one value per channel.
        batch_mean, batch_var = tf.nn.moments(conv_output, [0, 1, 2])

        update_mean = tf.assign(moving_mean, moving_mean * decay + batch_mean * (1 - decay))
        update_var = tf.assign(moving_var, moving_var * decay + batch_var * (1 - decay))

        # Tie the moving-average updates to the forward pass so they execute
        # whenever the training output is evaluated.
        with tf.control_dependencies([update_mean, update_var]):
            return _normalize(batch_mean, batch_var)

    def _inference_branch():
        return _normalize(moving_mean, moving_var)

    normalized = tf.cond(is_training, _training_branch, _inference_branch)

    if not activation_fn:
        return normalized
    return activation_fn(normalized)

In [206]:
def max_polling_2d(input_, filter_size, stride_size, padding_name, identifier):
    """Apply 2-D max pooling with a square window and equal strides.

    Args:
        input_: Tensor to pool.
        filter_size: Height and width of the pooling window.
        stride_size: Stride along both spatial dimensions.
        padding_name: Padding mode forwarded to ``tf.nn.max_pool``.
        identifier: Name given to the pooling op.

    Returns:
        The result of ``tf.nn.max_pool``.
    """
    # Batch and channel entries stay at 1; only the two middle entries vary.
    window = [1, filter_size, filter_size, 1]
    step = [1, stride_size, stride_size, 1]
    return tf.nn.max_pool(input_, ksize=window, strides=step,
                          padding=padding_name, name=identifier)

In [207]:
def identity_block(input_, is_training, filter_numbers, stage, block, filters=2):
    """Residual block whose shortcut is the unmodified input.

    The main branch is three ``conv_layer`` calls (1x1 -> ``filters`` x
    ``filters`` -> 1x1, all with stride 1). Its output is added to ``input_``
    and passed through ReLU, so the last entry of ``filter_numbers`` has to
    be compatible with the input's channel count for the addition to work.

    Args:
        input_: Input tensor, also used as the shortcut.
        is_training: Boolean tensor forwarded to every ``conv_layer``.
        filter_numbers: Three output-channel counts, one per main-branch layer.
        stage: Stage label, used only to build op names and log prefixes.
        block: Block label, used only to build op names and log prefixes.
        filters: Kernel size of the middle convolution.

    Returns:
        ReLU of (main branch output + input).
    """
    prefix = 's{!s}_{!s}'.format(stage, block)
    layer_name = prefix + '_main_branch_layer'

    first, second, third = filter_numbers
    relu = tf.nn.relu

    # (kernel size, output channels, padding, activation) for each layer;
    # the final layer stays linear because ReLU is applied after the merge.
    layer_specs = (
        (1, first, "VALID", relu),
        (filters, second, "SAME", relu),
        (1, third, "VALID", None),
    )

    branch = input_
    for position, (kernel, channels, padding, activation) in enumerate(layer_specs, start=1):
        branch = conv_layer(branch, kernel, channels, 1, padding, is_training,
                            activation, layer_name + str(position))

    # Report both shapes before merging the branch with the shortcut.
    print(prefix + "_conv_output:", branch.shape)
    print(prefix + "_short_cut:", input_.shape)
    return relu(tf.add(branch, input_))
    

In [208]:
# Smoke test for identity_block: build one block on a fresh default graph and
# run a single forward pass in training mode on random input.
tf.reset_default_graph()

with tf.Session() as test:
    # Fixed-size input; its 6 channels match the last entry of filter_numbers
    # below, so the shortcut addition inside the block lines up.
    A_prev = tf.placeholder("float", [3, 4, 4, 6])
    is_training = tf.placeholder(tf.bool, name="is_training")
    # Unseeded random input: the printed values differ from run to run.
    X = np.random.randn(3, 4, 4, 6)
    A = identity_block(A_prev, is_training, filter_numbers = [2, 4, 6], stage = 1,block = 1)
    # Variables are created while the block is built, so initialize afterwards.
    test.run(tf.global_variables_initializer())
    out = test.run([A], feed_dict={A_prev: X, is_training:True})
    print("out shape:", out[0].shape)
    print("out = " + str(out[0]))

conv_output shape (3, 4, 4, 2)
conv_output shape (3, 4, 4, 4)
conv_output shape (3, 4, 4, 6)
s1_1_conv_output: (3, 4, 4, 6)
s1_1_short_cut: (3, 4, 4, 6)
out shape: (3, 4, 4, 6)
out = [[[[1.86426163e-01 1.09220004e+00 1.49914622e+00 0.00000000e+00
    4.10130620e-01 3.02119160e+00]
   [0.00000000e+00 2.28442326e-01 0.00000000e+00 9.30261552e-01
    0.00000000e+00 0.00000000e+00]
   [0.00000000e+00 0.00000000e+00 0.00000000e+00 1.24325192e+00
    1.50421178e+00 7.57815540e-02]
   [1.37787247e+00 0.00000000e+00 0.00000000e+00 5.04302025e-01
    1.14520073e-01 1.85038030e-01]]

  [[7.90697932e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00
    0.00000000e+00 2.25848150e+00]
   [0.00000000e+00 7.33518600e-03 0.00000000e+00 8.78578186e-01
    0.00000000e+00 0.00000000e+00]
   [2.78966331e+00 6.95649505e-01 1.67922139e+00 0.00000000e+00
    8.54227364e-01 3.63426375e+00]
   [1.31497681e-01 0.00000000e+00 0.00000000e+00 1.79516482e+00
    1.79905856e+00 0.00000000e+00]]

  [[1.69496405e+00 1.

In [209]:
def convolutional_block(input_, is_training, filter_numbers, stage, block, filters=2, strides=2):
    """Residual block whose shortcut goes through its own convolution.

    The main branch is three ``conv_layer`` calls (1x1 with stride
    ``strides`` -> ``filters`` x ``filters`` with stride 1 -> 1x1 with
    stride 1). The shortcut is a single 1x1 ``conv_layer`` with stride
    ``strides`` and the same number of output channels as the last
    main-branch layer, so the two branches can be added. ReLU is applied to
    the sum.

    Args:
        input_: Input tensor, fed to both the main branch and the shortcut.
        is_training: Boolean tensor forwarded to every ``conv_layer``.
        filter_numbers: Three output-channel counts, one per main-branch layer.
        stage: Stage label, used only to build op names.
        block: Block label, used only to build op names.
        filters: Kernel size of the middle convolution.
        strides: Stride of the first main-branch layer and of the shortcut.

    Returns:
        ReLU of (main branch output + projected shortcut).
    """
    prefix = 's' + str(stage) + '_conv_block' + str(block)
    main_branch = prefix + '_main_branch_layer'
    # Leading underscore added so the shortcut op name is separated from the
    # block prefix in the same way as the main-branch names.
    short_cut_branch = prefix + '_short_cut_branch_layer'

    F1, F2, F3 = filter_numbers
    activation_fn = tf.nn.relu
    short_cut = input_

    # Main branch. The last layer has no activation; ReLU follows the merge.
    conv_output = conv_layer(input_, 1, F1, strides, "VALID", is_training, activation_fn, main_branch+'1')
    conv_output = conv_layer(conv_output, filters, F2, 1, "SAME", is_training, activation_fn, main_branch+'2')
    conv_output = conv_layer(conv_output, 1, F3, 1, "VALID", is_training, None, main_branch + '3')

    # Shortcut branch: same stride and output channels as the main branch.
    short_cut = conv_layer(short_cut, 1, F3, strides, "VALID", is_training, None, short_cut_branch + '1')

    # add conv_output and short_cut
    print("conv_output:", conv_output.shape)
    print("short_cut:", short_cut.shape)
    merge_output = tf.add(conv_output, short_cut)
    return activation_fn(merge_output)

In [210]:
# Smoke test for convolutional_block: build one block on a fresh default graph
# and run a single forward pass in training mode on random input.
tf.reset_default_graph()

with tf.Session() as test:
    A_prev = tf.placeholder("float", [3, 4, 4, 6])
    is_training = tf.placeholder(tf.bool, name="is_training")
    # Unseeded random input: the printed values differ from run to run.
    X = np.random.randn(3, 4, 4, 6)
    # Default strides=2 applies, so the 4x4 spatial size comes out as 2x2
    # (see the shapes printed below the cell).
    A = convolutional_block(A_prev, is_training, filter_numbers = [2, 4, 6], stage=1, block=1)
    # Variables are created while the block is built, so initialize afterwards.
    test.run(tf.global_variables_initializer())
    out = test.run([A], feed_dict={A_prev: X, is_training:True})
    print("out shape:", out[0].shape)
    print("out = " + str(out[0]))

conv_output shape (3, 2, 2, 2)
conv_output shape (3, 2, 2, 4)
conv_output shape (3, 2, 2, 6)
conv_output shape (3, 2, 2, 6)
conv_output: (3, 2, 2, 6)
short_cut: (3, 2, 2, 6)
out shape: (3, 2, 2, 6)
out = [[[[0.         1.6671418  0.         0.         0.         0.        ]
   [2.6019182  1.3565595  2.7631233  0.         0.         0.        ]]

  [[0.         0.         0.         1.1593063  0.         2.2558084 ]
   [0.62923217 0.         1.5977818  0.15540045 0.8058351  0.93204105]]]


 [[[0.         0.         0.         1.0328366  1.2184459  0.        ]
   [2.0531018  0.         1.6534753  0.         0.         0.22060102]]

  [[0.         2.120238   0.         0.         0.         0.        ]
   [0.         0.         0.         2.2181234  3.047166   2.1592522 ]]]


 [[[0.19914399 0.         0.250405   0.3909369  0.         0.        ]
   [0.24958289 0.5537703  0.         2.0094137  0.8390813  0.7181118 ]]

  [[0.05527666 1.4917161  0.         0.40387866 0.         0.        ]
 

## 3 - Building your ResNet model (50 layers)

You now have the necessary blocks to build a very deep ResNet. The following figure describes in detail the architecture of this neural network. "ID BLOCK" in the diagram stands for "Identity block," and "ID BLOCK x3" means you should stack 3 identity blocks together.

<img src="images/resnet_kiank.png" style="width:850px;height:150px;">
<caption><center> <u> <font color='purple'> **Figure 5** </font></u><font color='purple'> : **ResNet-50 model** </font></center></caption>

The details of this ResNet-50 model are:
- Zero-padding pads the input with a pad of (3,3)
- Stage 1:
    - The 2D Convolution has 64 filters of shape (7,7) and uses a stride of (2,2). Its name is "conv1".
    - BatchNorm is applied to the channels axis of the input.
    - MaxPooling uses a (3,3) window and a (2,2) stride.
- Stage 2:
    - The convolutional block uses three sets of filters of size [64,64,256], "f" is 3, "s" is 1 and the block is "a".
    - The 2 identity blocks use three sets of filters of size [64,64,256], "f" is 3 and the blocks are "b" and "c".
- Stage 3:
    - The convolutional block uses three sets of filters of size [128,128,512], "f" is 3, "s" is 2 and the block is "a".
    - The 3 identity blocks use three sets of filters of size [128,128,512], "f" is 3 and the blocks are "b", "c" and "d".
- Stage 4:
    - The convolutional block uses three sets of filters of size [256, 256, 1024], "f" is 3, "s" is 2 and the block is "a".
    - The 5 identity blocks use three sets of filters of size [256, 256, 1024], "f" is 3 and the blocks are "b", "c", "d", "e" and "f".
- Stage 5:
    - The convolutional block uses three sets of filters of size [512, 512, 2048], "f" is 3, "s" is 2 and the block is "a".
    - The 2 identity blocks use three sets of filters of size [512, 512, 2048], "f" is 3 and the blocks are "b" and "c".
- The 2D Average Pooling uses a window of shape (2,2) and its name is "avg_pool".
- The flatten doesn't have any hyperparameters or name.
- The Fully Connected (Dense) layer reduces its input to the number of classes using a softmax activation. Its name should be `'fc' + str(classes)`.

**Exercise**: Implement the ResNet with 50 layers described in the figure above. We have implemented Stages 1 and 2. Please implement the rest. (The syntax for implementing Stages 3-5 should be quite similar to that of Stage 2.) Make sure you follow the naming convention in the text above. 

In [280]:
class ResNet:
    """Truncated ResNet-style classifier for 28x28 single-channel images.

    Building an instance adds the whole model to the current default graph:
    stage 1 (7x7 convolution + max pooling), stages 2 and 3 of residual
    blocks, a 1024-unit dense layer and a ``class_num``-way output layer,
    plus the loss, the Adam training op and the accuracy metric. Stages 4
    and 5 are present only as commented-out code.

    Attributes set by ``__init__``:
        X, Y: Input image and one-hot label placeholders.
        is_training: Boolean placeholder that switches batch-norm behaviour.
        keep_probability: Placeholder that is currently not connected to any
            op (the dropout layer is commented out).
        logits, prediction: Raw class scores and their softmax.
        loss_op, optimizer, train_op: Cross-entropy loss and Adam update.
        correct_pred, accuracy: Per-example correctness and its mean.
    """

    def __init__(self, class_num):
        """Build the graph for a classifier with ``class_num`` output classes."""

        self.X = tf.placeholder(tf.float32, [None, 28, 28, 1])
        self.keep_probability = tf.placeholder(tf.float32, name="keep_probability")
        self.is_training = tf.placeholder(tf.bool, name="is_training")
        self.Y = tf.placeholder(tf.float32, shape=[None, class_num])

        # Stage 1
        x = conv_layer(self.X, 7, 64, 2, "VALID", self.is_training, tf.nn.relu, 'stage_1')
        x = max_polling_2d(x, 3, 2, "VALID", "stage_1_max_pooling")

        # stage 2
        x = convolutional_block(x, self.is_training, [64,64,256], 2, 1, filters=3, strides=1)
        x = identity_block(x, self.is_training, [64,64,256], 2, 2, filters=3)
        x = identity_block(x, self.is_training, [64,64,256], 2, 3, filters=3)

        # stage 3
        x = convolutional_block(x, self.is_training, [128,128,512], 3, 1, filters=3, strides=2)
        x = identity_block(x, self.is_training, [128,128,512], 3, 2, filters=3)
        x = identity_block(x, self.is_training, [128,128,512], 3, 3, filters=3)
        x = identity_block(x, self.is_training, [128,128,512], 3, 4, filters=3)

#         # stage 4
#         x = convolutional_block(x, self.is_training, [256, 256, 1024], 4, 1, filters=3, strides=2)
#         x = identity_block(x, self.is_training, [256, 256, 1024], 4, 2, filters=3)
#         x = identity_block(x, self.is_training, [256, 256, 1024], 4, 3, filters=3)
#         x = identity_block(x, self.is_training, [256, 256, 1024], 4, 4, filters=3)
#         x = identity_block(x, self.is_training, [256, 256, 1024], 4, 5, filters=3)
#         x = identity_block(x, self.is_training, [256, 256, 1024], 4, 6, filters=3)

#         # stage 5
#         x = convolutional_block(x, self.is_training, [512, 512, 2048], 5, 1, filters=3, strides=2)
#         x = identity_block(x, self.is_training, [512, 512, 2048], 5, 2, filters=3)
#         x = identity_block(x, self.is_training, [512, 512, 2048], 5, 3, filters=3)

        # Flatten the feature maps into one vector per example.
        print("input to dense layer:", x.shape)
        x = tf.reshape(x, [-1, x.shape[1].value*x.shape[2].value*x.shape[3].value])
        print("input to dense layer:", x.shape)

        dense = tf.layers.dense(inputs=x, units=1024, activation=tf.nn.relu)
#         dropout = tf.nn.dropout(dense, self.keep_probability)

        self.logits = tf.layers.dense(inputs=dense, units=class_num)
        self.prediction = tf.nn.softmax(self.logits, name="softmax_tensor")

        print("prediction shape:", self.prediction.shape)
        print("Y shape:", self.Y.shape)
        # loss (computed from the raw logits, not from the softmax output)
        self.loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.Y))

        # optimizer
        self.optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
        self.train_op = self.optimizer.minimize(self.loss_op)

        # Accuracy
        self.correct_pred = tf.equal(tf.argmax(self.prediction, 1), tf.argmax(self.Y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(self.correct_pred, tf.float32))

    def train(self, epochs=10, batch_size=128):
        """Train the model, then print accuracy on a slice of the test set.

        Reads the module-level ``mnist`` dataset object, which must exist
        before this method is called. Every 50th mini-batch the loss of that
        batch and the accuracy on the first 256 validation examples are
        printed. After the last epoch the accuracy on the first 256 test
        examples is printed. Nothing is returned, and the session (with the
        trained weights) is closed when the method exits.

        Args:
            epochs: Number of passes over the training set.
            batch_size: Number of examples per mini-batch.
        """

        test_valid_size = 256
        report_every = 50
        init = tf.global_variables_initializer()
        with tf.Session() as sess:

            sess.run(init)

            # The "+ 1" covers the final partial batch.
            num_batches = mnist.train.num_examples//batch_size + 1

            for epoch in range(epochs):

                for batch in range(num_batches):
                    batch_x, batch_y = mnist.train.next_batch(batch_size)
                    _, loss = sess.run([self.train_op, self.loss_op] , feed_dict={self.X:batch_x, self.Y:batch_y, self.is_training:True})

                    # Evaluate only when a report is due: the value is never
                    # used otherwise, and with is_training False the pass has
                    # no effect on the model.
                    if batch % report_every == 0:
                        acc = sess.run(self.accuracy, feed_dict={
                                self.X: mnist.validation.images[:test_valid_size],
                                self.Y: mnist.validation.labels[:test_valid_size], self.is_training:False})
                        # Labelled as validation accuracy because it is
                        # measured on mnist.validation, not the training batch.
                        print("batch " + str(batch) + ", Minibatch Loss= " + \
                          "{:.4f}".format(loss) + ", Validation Accuracy= " + \
                          "{:.3f}".format(acc))

            test_acc = sess.run(self.accuracy, feed_dict={
            self.X: mnist.test.images[:test_valid_size],
            self.Y: mnist.test.labels[:test_valid_size], self.is_training:False})
            print('Testing Accuracy: {}'.format(test_acc))
    
#     def test(self):
#         test_valid_size = 256
#         with tf.Session() as sess:
#             # Calculate Test Accuracy
#             print("here")
#             test_acc = sess.run(self.accuracy, feed_dict={
#             self.X: mnist.test.images[:test_valid_size],
#             self.Y: mnist.test.labels[:test_valid_size], self.is_training:False})
#             print('Testing Accuracy: {}'.format(test_acc))

In [281]:
# Load MNIST into the module-level `mnist` object that ResNet.train reads.
# one_hot=True presumably yields label vectors matching the [None, class_num]
# placeholder, and reshape=False presumably keeps images as 28x28x1 arrays to
# match the [None, 28, 28, 1] input placeholder — confirm against the loader.
from tensorflow.examples.tutorials.mnist import input_data

mnist = input_data.read_data_sets("/tmp/tensorflow/mnist/input_data", one_hot=True, reshape=False)

Extracting /tmp/tensorflow/mnist/input_data/train-images-idx3-ubyte.gz
Extracting /tmp/tensorflow/mnist/input_data/train-labels-idx1-ubyte.gz
Extracting /tmp/tensorflow/mnist/input_data/t10k-images-idx3-ubyte.gz
Extracting /tmp/tensorflow/mnist/input_data/t10k-labels-idx1-ubyte.gz


In [282]:
# Build the model graph for 10 output classes; construction prints the shape
# of every convolution output along the way.
resNet = ResNet(10)

conv_output shape (?, 11, 11, 64)
conv_output shape (?, 5, 5, 64)
conv_output shape (?, 5, 5, 64)
conv_output shape (?, 5, 5, 256)
conv_output shape (?, 5, 5, 256)
conv_output: (?, 5, 5, 256)
short_cut: (?, 5, 5, 256)
conv_output shape (?, 5, 5, 64)
conv_output shape (?, 5, 5, 64)
conv_output shape (?, 5, 5, 256)
s2_2_conv_output: (?, 5, 5, 256)
s2_2_short_cut: (?, 5, 5, 256)
conv_output shape (?, 5, 5, 64)
conv_output shape (?, 5, 5, 64)
conv_output shape (?, 5, 5, 256)
s2_3_conv_output: (?, 5, 5, 256)
s2_3_short_cut: (?, 5, 5, 256)
conv_output shape (?, 3, 3, 128)
conv_output shape (?, 3, 3, 128)
conv_output shape (?, 3, 3, 512)
conv_output shape (?, 3, 3, 512)
conv_output: (?, 3, 3, 512)
short_cut: (?, 3, 3, 512)
conv_output shape (?, 3, 3, 128)
conv_output shape (?, 3, 3, 128)
conv_output shape (?, 3, 3, 512)
s3_2_conv_output: (?, 3, 3, 512)
s3_2_short_cut: (?, 3, 3, 512)
conv_output shape (?, 3, 3, 128)
conv_output shape (?, 3, 3, 128)
conv_output shape (?, 3, 3, 512)
s3_3_conv_ou

In [283]:
# Train for three epochs; progress is printed every 50 mini-batches.
resNet.train(epochs=3)

batch 0, Minibatch Loss= 3.9124, Training Accuracy= 0.102
batch 50, Minibatch Loss= 0.9122, Training Accuracy= 0.074
batch 100, Minibatch Loss= 0.4972, Training Accuracy= 0.094
batch 150, Minibatch Loss= 0.3825, Training Accuracy= 0.191
batch 200, Minibatch Loss= 0.3823, Training Accuracy= 0.328
batch 250, Minibatch Loss= 0.2498, Training Accuracy= 0.691
batch 300, Minibatch Loss= 0.2710, Training Accuracy= 0.895
batch 350, Minibatch Loss= 0.2500, Training Accuracy= 0.918
batch 400, Minibatch Loss= 0.1698, Training Accuracy= 0.938
batch 0, Minibatch Loss= 0.0922, Training Accuracy= 0.941
batch 50, Minibatch Loss= 0.0946, Training Accuracy= 0.953
batch 100, Minibatch Loss= 0.1208, Training Accuracy= 0.941
batch 150, Minibatch Loss= 0.0478, Training Accuracy= 0.953
batch 200, Minibatch Loss= 0.1751, Training Accuracy= 0.945
batch 250, Minibatch Loss= 0.1487, Training Accuracy= 0.949
batch 300, Minibatch Loss= 0.1572, Training Accuracy= 0.953
batch 350, Minibatch Loss= 0.1936, Training Ac