In [1]:
import os
import sys
import numpy as np
import tensorflow as tf

sys.path.append(os.path.join(os.getcwd(), '..'))
from nets import vgg
from utils.utils import *
from utils.tf_utils import *

slim = tf.contrib.slim
image_size = vgg.vgg_16.default_image_size

 딥 러닝이 여러 분야에서 높은 성능으로 각광을 받고 있지만, 이는 ImageNet과 같은 빅 데이터의 출현에 힘입어 이뤄진 일로 딥 러닝으로 높은 성능을 내기 위해서는 충분한 양의 학습 데이터가 필요하다. 특히, detection, segmentation 등의 task는 classification보다 고수준의 annotation이 필요하기에 충분한 데이터를 모은다는 건 굉장한 노력과 비용을 필요로 한다. 하지만, classification을 위해 ImageNet에서 학습 된 네트워크는 물체를 컴퓨터가 인식하기에 적합한 feature를 추출하므로 다른 task에서 좋은 시작점으로 사용될 수 있고, 결과적으로 상대적으로 적은 데이터로 높은 성능을 낼 수 있는 모델을 학습하는 데 큰 도움이 된다. <br><br>
 이번 튜토리얼에서는 ImageNet에서 학습된 vgg 16-layer 네트워크를 선언하고 checkpoint로부터 weight를 가져오는 방법을 공부해보자.

### Download pre-trained model (vgg-16)

In [4]:
data_dir = 'vgg_models'
data_url = 'http://download.tensorflow.org/models/vgg_16_2016_08_28.tar.gz'
maybe_download_and_extract(data_url, data_dir, 'vgg_16.ckpt')

### Create model
vgg 네트워크를 선언하고, checkpoint로부터 가져올 weight를 정해보자.
vgg 네트워크를 스스로 선언해도 되지만, 그러기 위해서는 checkpoint에 정의된 variable들과 name이 같도록 선언해야하므로 여기에서는 TF-Slim에서 제공하는 vgg 네트워크 선언 함수 (vgg.vgg_16())을 사용하여 네트워크를 정의하고, tf.contrib.slim.get_variables_to_restore()을 통해 checkpoint로부터 가져 올 weight를 정하도록 한다.

In [2]:
num_classes = 10
batch_size = 10
checkpoint_path = 'vgg_models/vgg_16.ckpt'

g = tf.Graph()

# input placeholders
images = tf.placeholder(dtype=tf.float32, shape=[batch_size, image_size, image_size, 3],
                           name='images')
labels = tf.placeholder(dtype=tf.int64, shape=[batch_size], name='labels')

# Create the model, use the default arg scope to configure the batch norm parameters.
with slim.arg_scope(vgg.vgg_arg_scope()):
    # 10 classes instead of 1001.
    logits, _ = vgg.vgg_16(images, num_classes=num_classes, is_training=True)

# Before defining remaining layers (softmax, optimizer), selecting the
# variables to be restored
exclude_layers = ['vgg_16/fc6', 'vgg_16/fc7', 'vgg_16/fc8']
#exclude_layers = ['vgg_16/fc8'] # when discaring only the last classification layer
variables_to_restore = slim.get_variables_to_restore(exclude=exclude_layers)
print('===> The list of variables to be restored:')
for i in variables_to_restore: print(i.op.name)

"""
# Below code is equivalent to slim.get_variables_to_restore()
exclusions = [scope.strip() for scope in exclude_layers]

variables_to_restore = []
for var in tf.global_variables():
    excluded = False
    for exclusion in exclusions:
        if var.op.name.startswith(exclusion):
            excluded = True
            break
    if not excluded:
        variables_to_restore.append(var) 
"""

# Define the loss function
probabilities = tf.nn.softmax(logits)
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=labels, logits=logits, name='cross_entropy_per_example')

# Specify the optimizer and create the train op:
optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
train_op = optimizer.minimize(loss)

===> The list of variables to be restored:
vgg_16/conv1/conv1_1/weights
vgg_16/conv1/conv1_1/biases
vgg_16/conv1/conv1_2/weights
vgg_16/conv1/conv1_2/biases
vgg_16/conv2/conv2_1/weights
vgg_16/conv2/conv2_1/biases
vgg_16/conv2/conv2_2/weights
vgg_16/conv2/conv2_2/biases
vgg_16/conv3/conv3_1/weights
vgg_16/conv3/conv3_1/biases
vgg_16/conv3/conv3_2/weights
vgg_16/conv3/conv3_2/biases
vgg_16/conv3/conv3_3/weights
vgg_16/conv3/conv3_3/biases
vgg_16/conv4/conv4_1/weights
vgg_16/conv4/conv4_1/biases
vgg_16/conv4/conv4_2/weights
vgg_16/conv4/conv4_2/biases
vgg_16/conv4/conv4_3/weights
vgg_16/conv4/conv4_3/biases
vgg_16/conv5/conv5_1/weights
vgg_16/conv5/conv5_1/biases
vgg_16/conv5/conv5_2/weights
vgg_16/conv5/conv5_2/biases
vgg_16/conv5/conv5_3/weights
vgg_16/conv5/conv5_3/biases


네트워크를 선언하였으니 tf.train.Saver()를 이용하여 checkpoint로부터 weight를 읽어오자.<br>
위에서 convolutional layer들의 weight만 읽어오도록 하였으므로 아래에서 처음엔 모든 weight를 랜덤으로 초기화한 후, vgg checkpoint로부터 weight를 읽어와서 conv1_1과 fc7의 weight가 어떻게 프린트되는 지 확인해보자.

In [3]:
# Create the saver with variables to be restored
restorer = tf.train.Saver(variables_to_restore)
# Open the session
sess = tf.Session()
conv1_1_weights = tf.get_default_graph().get_tensor_by_name('vgg_16/conv1/conv1_1/weights:0')
fc7_weights = tf.get_default_graph().get_tensor_by_name('vgg_16/fc7/weights:0')

### Random initialization

In [15]:
sess.run(tf.global_variables_initializer())
rand_conv1_1 = conv1_1_weights[:,:,:,:].eval(session=sess)
rand_fc7 = fc7_weights[:,:,:,:].eval(session=sess)

print('Weight of conv1_1:')
print(rand_conv1_1[1,1,1,:10])
print('Weight of fc7:')
print(rand_fc7[0,0,0,:10])

Weight of conv1_1:
[-0.06183929 -0.03114004 -0.08775672 -0.00964408  0.0169952  -0.00359296
  0.05429178  0.01242758  0.07776228  0.08395071]
Weight of fc7:
[  1.10718589e-02   1.43526811e-02   1.79001484e-02  -2.61634178e-02
   2.50892732e-02   2.06239615e-02   5.19613735e-03  -1.34535339e-02
  -7.20545650e-05  -2.08393931e-02]


### Initialize the weights from vgg-16

In [16]:
restorer.restore(sess, save_path=checkpoint_path)
vgg_conv1_1 = conv1_1_weights[:,:,:,:].eval(session=sess)
vgg_fc7 = fc7_weights[:,:,:,:].eval(session=sess)

print('Weight of conv1_1:')
print(vgg_conv1_1[1,1,1,:10])
print('Weight of fc7:')
print(vgg_fc7[0,0,0,:10])

Weight of conv1_1:
[ 0.04063221  0.06581022  0.2203114  -0.42466447  0.20586449 -0.23609307
 -0.04312737 -0.10727409 -0.33554825 -0.09185937]
Weight of fc7:
[  1.10718589e-02   1.43526811e-02   1.79001484e-02  -2.61634178e-02
   2.50892732e-02   2.06239615e-02   5.19613735e-03  -1.34535339e-02
  -7.20545650e-05  -2.08393931e-02]


In [17]:
print('difference in conv1_1: ',np.sum(rand_conv1_1 - vgg_conv1_1))
print('difference in fc7: ', np.sum(rand_fc7 - vgg_fc7))

('difference in conv1_1: ', 4.4200602)
('difference in fc7: ', 0.0)


아래 함수는 nets/vgg.py에 정의된 vgg 16-layer 네트워크 정의 함수로 참고용으로 첨부하였음.

In [None]:
def vgg_16(inputs,
           num_classes=1000,
           is_training=True,
           dropout_keep_prob=0.5,
           spatial_squeeze=True,
           scope='vgg_16',
           fc_conv_padding='VALID'):
    """Oxford Net VGG 16-Layers version D Example.
    Note: All the fully_connected layers have been transformed to conv2d layers.
        To use in classification mode, resize input to 224x224.
    Args:
    inputs: a tensor of size [batch_size, height, width, channels].
    num_classes: number of predicted classes.
    is_training: whether or not the model is being trained.
    dropout_keep_prob: the probability that activations are kept in the dropout
      layers during training.
    spatial_squeeze: whether or not should squeeze the spatial dimensions of the
      outputs. Useful to remove unnecessary dimensions for classification.
    scope: Optional scope for the variables.
    fc_conv_padding: the type of padding to use for the fully connected layer
      that is implemented as a convolutional layer. Use 'SAME' padding if you
      are applying the network in a fully convolutional manner and want to
      get a prediction map downsampled by a factor of 32 as an output. Otherwise,
      the output prediction map will be (input / 32) - 6 in case of 'VALID' padding.
    Returns:
    the last op containing the log predictions and end_points dict.
    """
    with tf.variable_scope(scope, 'vgg_16', [inputs]) as sc:
        end_points_collection = sc.name + '_end_points'
        # Collect outputs for conv2d, fully_connected and max_pool2d.
        with slim.arg_scope([slim.conv2d, slim.fully_connected, slim.max_pool2d],
                            outputs_collections=end_points_collection):
            net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')
            net = slim.max_pool2d(net, [2, 2], scope='pool1')
            net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')
            net = slim.max_pool2d(net, [2, 2], scope='pool2')
            net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')
            net = slim.max_pool2d(net, [2, 2], scope='pool3')
            net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4')
            net = slim.max_pool2d(net, [2, 2], scope='pool4')
            net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5')
            net = slim.max_pool2d(net, [2, 2], scope='pool5')
            # Use conv2d instead of fully_connected layers.
            net = slim.conv2d(net, 4096, [7, 7], padding=fc_conv_padding, scope='fc6')
            net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
                             scope='dropout6')
            net = slim.conv2d(net, 4096, [1, 1], scope='fc7')
            net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
                             scope='dropout7')
            net = slim.conv2d(net, num_classes, [1, 1],
                            activation_fn=None,
                            normalizer_fn=None,
                            scope='fc8')
            
        # Convert end_points_collection into a end_point dict.
        end_points = slim.utils.convert_collection_to_dict(end_points_collection)
        if spatial_squeeze:
        net = tf.squeeze(net, [1, 2], name='fc8/squeezed')
        end_points[sc.name + '/fc8'] = net
        return net, end_points