In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

# pylint: disable=missing-docstring
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import sys
import math
import numpy as np
import tensorflow as tf

sys.path.append(os.path.join(os.getcwd(), '..'))
from nets import vgg
from utils.utils import *
from utils.tf_utils import *
sys.path.append(os.path.join(os.getcwd(), '..', '002_image_classification'))
from cifar10_loader import CIFAR10_loader

slim = tf.contrib.slim
image_size = vgg.vgg_16.default_image_size

 딥 러닝이 여러 분야에서 높은 성능으로 각광을 받고 있지만, 이는 ImageNet과 같은 빅 데이터의 출현에 힘입어 이뤄진 일로 딥 러닝으로 높은 성능을 내기 위해서는 충분한 양의 학습 데이터가 필요하다. 특히, detection, segmentation 등의 task는 classification보다 고수준의 annotation이 필요하기에 충분한 데이터를 모은다는 건 굉장한 노력과 비용을 필요로 한다. 하지만, classification을 위해 ImageNet에서 학습 된 네트워크는 물체를 컴퓨터가 인식하기에 적합한 feature를 추출하므로 다른 task에서 좋은 시작점으로 사용될 수 있고, 결과적으로 상대적으로 적은 데이터로 높은 성능을 낼 수 있는 모델을 학습하는 데 큰 도움이 된다. <br><br>
 이번 튜토리얼에서는 ImageNet에서 학습된 vgg 16-layer 네트워크를 선언하고 checkpoint로부터 weight를 가져오는 방법을 공부해보자.

### Download pre-trained model (vgg-16)

In [4]:
data_dir = 'vgg_models'
data_url = 'http://download.tensorflow.org/models/vgg_16_2016_08_28.tar.gz'
maybe_download_and_extract(data_url, data_dir, 'vgg_16.ckpt')

### Create model
vgg 네트워크를 선언하고, checkpoint로부터 가져올 weight를 정해보자.
vgg 네트워크를 스스로 선언해도 되지만, 그러기 위해서는 checkpoint에 정의된 variable들과 name이 같도록 선언해야하므로 여기에서는 TF-Slim에서 제공하는 vgg 네트워크 선언 함수 (vgg.vgg_16())을 사용하여 네트워크를 정의하고, tf.contrib.slim.get_variables_to_restore()을 통해 checkpoint로부터 가져 올 weight를 정하도록 한다.

In [2]:
num_classes = 10
batch_size = 100
checkpoint_path = 'vgg_models/vgg_16.ckpt'

g = tf.Graph()

# input placeholders
images = tf.placeholder(dtype=tf.float32, shape=[batch_size, image_size, image_size, 3],
                           name='images')
labels = tf.placeholder(dtype=tf.int64, shape=[batch_size], name='labels')

# Create the model, use the default arg scope to configure the batch norm parameters.
with slim.arg_scope(vgg.vgg_arg_scope()):
    # 10 classes instead of 1001.
    logits, _ = vgg.vgg_16(images, num_classes=num_classes, is_training=True)

# Before defining remaining layers (softmax, optimizer), selecting the
# variables to be restored
exclude_layers = ['vgg_16/fc8']
#exclude_layers = ['vgg_16/fc8'] # when discaring only the last classification layer
variables_to_restore = slim.get_variables_to_restore(exclude=exclude_layers)
print('===> The list of variables to be restored:')
for i in variables_to_restore: print(i.op.name)

"""
# Below code is equivalent to slim.get_variables_to_restore()
exclusions = [scope.strip() for scope in exclude_layers]

variables_to_restore = []
for var in tf.global_variables():
    excluded = False
    for exclusion in exclusions:
        if var.op.name.startswith(exclusion):
            excluded = True
            break
    if not excluded:
        variables_to_restore.append(var)
"""


# Define the loss function
probabilities = tf.nn.softmax(logits)
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=labels, logits=logits, name='cross_entropy_per_example')

# TODO: select variables to be learned
fix_layers = ['vgg_16/conv1', 'vgg_16/conv2', 'vgg_16/conv3', 'vgg_16/conv4', 'vgg_16/conv5']
variables_to_learn = slim.get_variables_to_restore(exclude=fix_layers)
#variables_to_learn = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
print('\n===> The list of variables to be learned:')
for i in variables_to_learn: print(i.op.name)

# Specify the optimizer and create the train op:
optimizer = tf.train.AdamOptimizer(learning_rate=0.0001)
train_op = optimizer.minimize(loss, var_list=variables_to_learn)

global_step = tf.Variable(initial_value=0, name='global_step', trainable=False)

===> The list of variables to be restored:
vgg_16/conv1/conv1_1/weights
vgg_16/conv1/conv1_1/biases
vgg_16/conv1/conv1_2/weights
vgg_16/conv1/conv1_2/biases
vgg_16/conv2/conv2_1/weights
vgg_16/conv2/conv2_1/biases
vgg_16/conv2/conv2_2/weights
vgg_16/conv2/conv2_2/biases
vgg_16/conv3/conv3_1/weights
vgg_16/conv3/conv3_1/biases
vgg_16/conv3/conv3_2/weights
vgg_16/conv3/conv3_2/biases
vgg_16/conv3/conv3_3/weights
vgg_16/conv3/conv3_3/biases
vgg_16/conv4/conv4_1/weights
vgg_16/conv4/conv4_1/biases
vgg_16/conv4/conv4_2/weights
vgg_16/conv4/conv4_2/biases
vgg_16/conv4/conv4_3/weights
vgg_16/conv4/conv4_3/biases
vgg_16/conv5/conv5_1/weights
vgg_16/conv5/conv5_1/biases
vgg_16/conv5/conv5_2/weights
vgg_16/conv5/conv5_2/biases
vgg_16/conv5/conv5_3/weights
vgg_16/conv5/conv5_3/biases
vgg_16/fc6/weights
vgg_16/fc6/biases
vgg_16/fc7/weights
vgg_16/fc7/biases

===> The list of variables to be learned:
vgg_16/fc6/weights
vgg_16/fc6/biases
vgg_16/fc7/weights
vgg_16/fc7/biases
vgg_16/fc8/weights
vgg_16

In [63]:
tf.reset_default_graph()

네트워크를 선언하였으니 tf.train.Saver()를 이용하여 checkpoint로부터 weight를 읽어오자.<br>
위에서 convolutional layer들의 weight만 읽어오도록 하였으므로 아래에서 처음엔 모든 weight를 랜덤으로 초기화한 후, vgg checkpoint로부터 weight를 읽어와서 conv1_1과 fc7의 weight가 어떻게 프린트되는 지 확인해보자.

In [3]:
# Create the saver with variables to be restored
restorer = tf.train.Saver(variables_to_restore)
# Open the session
sess = tf.Session()
conv1_1_weights = tf.get_default_graph().get_tensor_by_name('vgg_16/conv1/conv1_1/weights:0')
fc7_weights = tf.get_default_graph().get_tensor_by_name('vgg_16/fc7/weights:0')

### Random initialization

In [4]:
sess.run(tf.global_variables_initializer())
rand_conv1_1 = conv1_1_weights[:,:,:,:].eval(session=sess)
rand_fc7 = fc7_weights[:,:,:,:].eval(session=sess)

print('Weight of conv1_1:')
print(rand_conv1_1[1,1,1,:10])
print('Weight of fc7:')
print(rand_fc7[0,0,0,:10])

Weight of conv1_1:
[-0.07034407  0.09699172  0.05994357  0.01191355  0.0790222  -0.05156257
  0.07133409  0.05756056  0.0888675  -0.05755238]
Weight of fc7:
[ 0.00698913 -0.00441733 -0.02221106 -0.02669755  0.01187419 -0.00269974
 -0.00629738 -0.00543988  0.00157767  0.02522648]


### Initialize the weights from vgg-16

In [5]:
restorer.restore(sess, save_path=checkpoint_path)
vgg_conv1_1 = conv1_1_weights[:,:,:,:].eval(session=sess)
vgg_fc7 = fc7_weights[:,:,:,:].eval(session=sess)

print('Weight of conv1_1:')
print(vgg_conv1_1[1,1,1,:10])
print('Weight of fc7:')
print(vgg_fc7[0,0,0,:10])

Weight of conv1_1:
[ 0.04063221  0.06581022  0.2203114  -0.42466447  0.20586449 -0.23609307
 -0.04312737 -0.10727409 -0.33554825 -0.09185937]
Weight of fc7:
[ 0.00390148 -0.00180807  0.00136159 -0.00262455 -0.00475213 -0.00220464
 -0.00268095  0.00679421 -0.00499181  0.00407928]


In [17]:
print('difference in conv1_1: ',np.sum(rand_conv1_1 - vgg_conv1_1))
print('difference in fc7: ', np.sum(rand_fc7 - vgg_fc7))

('difference in conv1_1: ', 4.4200602)
('difference in fc7: ', 0.0)


### Training the model in CIFAR-10

In [9]:
loader = CIFAR10_loader()
class_names = loader.get_class_names()

# Model parameters
init_from = '' # checkpoint path
save_path = 'cifar10_checkpoints/cifar10_cnn'
if not os.path.exists('cifar10_checkpoints'): os.makedirs('cifar10_checkpoints')
num_epochs = 10
iteration_per_epoch = int(math.floor(loader.get_num_train_examples() / batch_size))
save_checkpoint_frequency = 250
print_frequency = 30

In [11]:
# Train the model
saver = tf.train.Saver()
for ie in range(num_epochs):
    for ii in range(iteration_per_epoch):
        # Load a batch data
        batch = loader.get_batch(batch_size, 'train', (224,224))

        # Run the optimizer
        _ = sess.run([train_op], feed_dict={images:batch['images'],
                                            labels:batch['labels']})

        # Print the accuracy and loss of current batch data
        if (ii+1) % print_frequency == 0:
            batch_loss, batch_prob = sess.run([loss, probabilities], 
                                             feed_dict={images:batch['images'],
                                                        labels:batch['labels']})
            pred_labels = np.argmax(batch_prob, axis=1)
            batch_loss = np.mean(batch_loss)
            batch_acc =np.mean(np.equal(pred_labels, batch['labels']))
            print('%d Epoch %d iteration - Loss (%.3f) Accuracy (%.3f)'
                      %(ie+1, ii+1, batch_loss, batch_acc))

        # Save checkpoint
        if (ii+1) % save_checkpoint_frequency == 0:
            saver.save(sess, save_path=save_path, global_step=ie*iteration_per_epoch + ii + 1)
            print('Saved checkpoint %s_%d' % (save_path, ie*iteration_per_epoch + ii + 1))

1 Epoch 10 iteration - Loss (0.145) Accuracy (0.930)
Saved checkpoint cifar10_checkpoints/cifar10_cnn_10
1 Epoch 20 iteration - Loss (0.114) Accuracy (0.970)


KeyboardInterrupt: 

In [24]:
# Evaluate the model
iteration_per_epoch = int(math.floor(loader.get_num_test_examples() / batch_size))
checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir='cifar10_checkpoints/')
print('Last checkpoint path is %s' % (checkpoint_path))

# Load the checkpoint or initialize the variables
saver.restore(sess, save_path=checkpoint_path)
print('Model is restored from %s' % checkpoint_path)

# Evaluate the model
loader.reset()
ii = 1
num_correct = 0
num_examples = 0
while True:
    # Load a batch data
    batch = loader.get_batch(batch_size, 'test', (224,224))
    if batch['wrapped']: break

    # Compute the correct numbers
    batch_prob = sess.run(probabilities, feed_dict={images:batch['images'],
                                                      labels:batch['labels']})

    pred_labels = np.argmax(batch_prob, axis=1)
    batch_correct_num = np.sum(np.equal(pred_labels, batch['labels']))
    num_correct += batch_correct_num
    num_examples += batch_size
    
    if (ii+1) % 10 == 0:
        print('%d/%d done' % (ii+1, iteration_per_epoch))
    ii += 1
print('Test accuracy: %.2f%%' % (num_correct / num_examples * 100.0))

Last checkpoint path is cifar10_checkpoints/cifar10_cnn-10
Model is restored from cifar10_checkpoints/cifar10_cnn-10
10/100 done
20/100 done
30/100 done
40/100 done
50/100 done
60/100 done
70/100 done
80/100 done
90/100 done
100/100 done
Test accuracy: 83.45%


아래 함수는 nets/vgg.py에 정의된 vgg 16-layer 네트워크 정의 함수로 참고용으로 첨부하였음.

In [None]:
def vgg_16(inputs,
           num_classes=1000,
           is_training=True,
           dropout_keep_prob=0.5,
           spatial_squeeze=True,
           scope='vgg_16',
           fc_conv_padding='VALID'):
    """Oxford Net VGG 16-Layers version D Example.
    Note: All the fully_connected layers have been transformed to conv2d layers.
        To use in classification mode, resize input to 224x224.
    Args:
    inputs: a tensor of size [batch_size, height, width, channels].
    num_classes: number of predicted classes.
    is_training: whether or not the model is being trained.
    dropout_keep_prob: the probability that activations are kept in the dropout
      layers during training.
    spatial_squeeze: whether or not should squeeze the spatial dimensions of the
      outputs. Useful to remove unnecessary dimensions for classification.
    scope: Optional scope for the variables.
    fc_conv_padding: the type of padding to use for the fully connected layer
      that is implemented as a convolutional layer. Use 'SAME' padding if you
      are applying the network in a fully convolutional manner and want to
      get a prediction map downsampled by a factor of 32 as an output. Otherwise,
      the output prediction map will be (input / 32) - 6 in case of 'VALID' padding.
    Returns:
    the last op containing the log predictions and end_points dict.
    """
    with tf.variable_scope(scope, 'vgg_16', [inputs]) as sc:
        end_points_collection = sc.name + '_end_points'
        # Collect outputs for conv2d, fully_connected and max_pool2d.
        with slim.arg_scope([slim.conv2d, slim.fully_connected, slim.max_pool2d],
                            outputs_collections=end_points_collection):
            net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')
            net = slim.max_pool2d(net, [2, 2], scope='pool1')
            net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')
            net = slim.max_pool2d(net, [2, 2], scope='pool2')
            net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')
            net = slim.max_pool2d(net, [2, 2], scope='pool3')
            net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4')
            net = slim.max_pool2d(net, [2, 2], scope='pool4')
            net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5')
            net = slim.max_pool2d(net, [2, 2], scope='pool5')
            # Use conv2d instead of fully_connected layers.
            net = slim.conv2d(net, 4096, [7, 7], padding=fc_conv_padding, scope='fc6')
            net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
                             scope='dropout6')
            net = slim.conv2d(net, 4096, [1, 1], scope='fc7')
            net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
                             scope='dropout7')
            net = slim.conv2d(net, num_classes, [1, 1],
                            activation_fn=None,
                            normalizer_fn=None,
                            scope='fc8')
            
        # Convert end_points_collection into a end_point dict.
        end_points = slim.utils.convert_collection_to_dict(end_points_collection)
        if spatial_squeeze:
        net = tf.squeeze(net, [1, 2], name='fc8/squeezed')
        end_points[sc.name + '/fc8'] = net
        return net, end_points