# ImageNet Classification with AlexNet

In [1]:
# EE488C Special Topics in EE <Deep Learning and AlphaGo>, Fall 2016
# Information Theory & Machine Learning Lab (http://itml.kaist.ac.kr), School of EE, KAIST
# written by Jongmin Yoon 
# 2016/11/08 

import numpy as np
from numpy import random
import math
import tensorflow as tf
from PIL import Image
from scipy.ndimage import zoom
import h5py
from caffe_classes import class_names

## Load an image (cat.jpg)

In [2]:
im1 = Image.open('cat.jpg') # Load a test image 'cat.jpg' 256 x 256 
if im1.mode != 'RGB':
    im1 = im1.convert('RGB') # Convert it into an RGB image

im1 = im1.resize((256, 256), Image.ANTIALIAS)
im1 = np.asarray(im1, dtype='float32')
im1 /= 255.
print np.shape(im1)
print type(im1[1][1][1])

(256, 256, 3)
<type 'numpy.float32'>


## Cropping

In [3]:
# We obtain 10 images of size 227 x 227 by shifting and cropping the image.
crop_entry = [[0, 0], [0, 29], [29, 0], [29, 29], [14, 14]]
im1_crop = np.empty((10, 227, 227, 3), dtype=np.float32)
for k in range(5):
    im1_crop[k, :, :, :] = im1[crop_entry[k][0]:crop_entry[k][0] + 227,
                               crop_entry[k][1]:crop_entry[k][1] + 227, :]
im1_crop[5:10, :, :, :] = im1_crop[0:5, :, ::-1, :]

im1_crop = im1_crop[:, :, :, [2, 1, 0]]
im1_crop = 255. * im1_crop
print np.shape(im1_crop)
print type(im1_crop[1][1][1][1])

(10, 227, 227, 3)
<type 'numpy.float32'>


## Subtract mean

In [4]:
mean_file = np.load('ilsvrc_2012_mean.npy').mean(1).mean(1) # https://github.com/BVLC/caffe/blob/master/python/caffe/imagenet/ilsvrc_2012_mean.npy
mean_file = np.expand_dims(mean_file, axis=0)
mean_file = np.expand_dims(mean_file, axis=0)
mean_file = np.expand_dims(mean_file, axis=0)
mean_repeat = np.repeat(mean_file, 10, axis=0)
mean_repeat = np.repeat(mean_repeat, 227, axis=1)
mean_repeat = np.repeat(mean_repeat, 227, axis=2)
im1_crop = im1_crop - mean_repeat

np.save('im1_cropped.npy', im1_crop)
print np.shape(im1_crop)
print type(im1[1][1][1])

(10, 227, 227, 3)
<type 'numpy.float32'>


## Load a weight set pre-trained by AlexNet

In [5]:
net_data = np.load("bvlc_alexnet.npy").item() # data type : dictionary
for x in net_data:
    exec ("%s = %s" % (str(x) + "W", "tf.Variable(net_data[x][0])"))
    exec ("%s = %s" % (str(x) + "b", "tf.Variable(net_data[x][1])"))

In [6]:
print len(net_data) # number of convolution layers + fully-connected layers
print net_data.keys()

8
[u'fc6', u'fc7', u'fc8', u'conv3', u'conv2', u'conv1', u'conv5', u'conv4']


In [7]:
print net_data.keys()[5]

conv1


In [8]:
print net_data.values()[5]

[array([[[[ -2.83153467e-02,  -7.19704404e-02,   3.78898042e-03, ...,
           -6.75531849e-03,   1.57427334e-03,  -7.34161213e-03],
         [  8.39668140e-03,  -6.77743629e-02,  -4.38617989e-02, ...,
            3.69289308e-03,  -2.12163124e-02,   2.08328497e-02],
         [ -6.01018872e-03,  -3.69863659e-02,   3.24545279e-02, ...,
            1.04015123e-03,  -1.75325349e-02,  -1.08853113e-02]],

        [[ -6.76262565e-03,  -9.85573977e-02,   2.73596449e-03, ...,
           -4.93216841e-03,  -4.97266697e-03,  -1.09553523e-02],
         [  3.13869417e-02,  -9.95327160e-02,  -3.94608229e-02, ...,
            8.02036002e-03,  -2.56089102e-02,   1.36672202e-02],
         [  1.04832407e-02,  -6.84838146e-02,   4.58296984e-02, ...,
            5.48922690e-04,  -1.75612103e-02,  -8.77811201e-03]],

        [[  1.69464536e-02,  -1.14213303e-01,  -1.37726509e-03, ...,
           -8.87093227e-03,  -8.55328236e-03,   1.48361339e-03],
         [  2.52514854e-02,  -1.22040361e-01,  -3.4470718

## Constructing AlexNet

In [9]:
def conv(input, kernel, biases, k_h, k_w, c_o, s_h, s_w, padding="VALID",group=1):
    
    input_groups, kernel_groups = tf.split(3, group, input), tf.split(3, group, kernel)
    convolve = lambda i, k: tf.nn.conv2d(i, k, [1, s_h, s_w, 1], padding=padding)
    output_groups = [ convolve(i, k) for i, k in zip(input_groups, kernel_groups) ]
    conv = tf.concat(3, output_groups)
    
    return tf.reshape(tf.nn.bias_add(conv, biases), [-1] + conv.get_shape().as_list()[1:])


# Input Layer
x = tf.placeholder(tf.float32, shape=(None, 227, 227, 3))

# Convoulational Layer 1
conv1 = tf.nn.relu( conv(x, conv1W, conv1b, 11, 11, 96, 4, 4, padding="VALID", group=1) )
lrn1 = tf.nn.local_response_normalization(conv1, depth_radius=2, alpha=2e-5, beta=0.75, bias=1.0)
maxpool1 = tf.nn.max_pool(lrn1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='VALID')

# Convolutional Layer 2
conv2 = tf.nn.relu( conv(maxpool1, conv2W, conv2b, 5, 5, 256, 1, 1, padding="SAME", group=2) )
lrn2 = tf.nn.local_response_normalization( conv2, depth_radius=2, alpha=2e-5, beta=0.75, bias=1.0 )
maxpool2 = tf.nn.max_pool(lrn2, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='VALID')

# Convolutional Layer 3
conv3 = tf.nn.relu( conv(maxpool2, conv3W, conv3b, 3, 3, 384, 1, 1, padding="SAME", group=1))

# Convolutional Layer 4
conv4 = tf.nn.relu( conv(conv3, conv4W, conv4b, 3, 3, 384, 1, 1, padding="SAME", group=2))

# Convolutional Layer 5
conv5 = tf.nn.relu( conv(conv4, conv5W, conv5b, 3, 3, 256, 1, 1, padding="SAME", group=2))
maxpool5 = tf.nn.max_pool(conv5, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='VALID')

# Fully-Connected Layer 1
fc6 = tf.nn.relu_layer( tf.reshape(maxpool5, [-1, int(np.prod(maxpool5.get_shape()[1:]))]), fc6W, fc6b)

# Fully-Connected Layer 2
fc7 = tf.nn.relu_layer(fc6, fc7W, fc7b)

# Fully-Connected Layer 3
fc8 = tf.nn.xw_plus_b(fc7, fc8W, fc8b)

# Output Layer
y_softmax = tf.nn.softmax(fc8)
y_ = tf.reduce_mean(y_softmax, 0)

## Eevulate AlexNet

In [10]:
sess = tf.InteractiveSession()
sess.run(tf.initialize_all_variables())
output = sess.run(y_, feed_dict={x: im1_crop})

top_5 = sess.run(tf.nn.top_k(y_, 5), feed_dict={x: im1_crop})
print("Softmax\tLabel")
for k in range(5):
    print("%5.5f\t%s" % (top_5[0][k], class_names[top_5[1][k]]))

Softmax	Label
0.60385	tiger cat
0.26037	tabby, tabby cat
0.09443	Egyptian cat
0.03329	lynx, catamount
0.00271	Persian cat
