# End-to-End Training

In [1]:
import os
import cv2
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

import pickle
import random

from ssd.ssd import SSD, Detector, UpSample, DownSample
from ssd.ssd_loss import MultiboxLoss
from ssd.ssd_bbox import BBoxUtility

%matplotlib inline
plt.rcParams['figure.figsize'] = (8, 8)
plt.rcParams['image.interpolation'] = 'nearest'

np.set_printoptions(suppress=True)

# config = tf.ConfigProto()
# config.gpu_options.per_process_gpu_memory_fraction = 0.9
# set_session(tf.Session(config=config))

# Notebook auto reloads code. (Ref: http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython)
% load_ext autoreload
% autoreload 2

  return f(*args, **kwds)


### Constants and prior boxes

In [2]:
# Shared constants used by the model-building cells.
# Image shape as (height, width, channels); passed to Detector and used to
# shape the image placeholder.
input_shape = (384, 384, 3)
# Class count handed to Detector, MultiboxLoss and BBoxUtility.
# Per the original note, remember that the background is counted here.
NUM_CLASSES = 4

In [7]:
# Disabled experiment: everything below is wrapped in a string literal, so
# none of it is executed. It feeds a 300x300 input and calls SSD with
# different arguments than the live cells further down.
# NOTE(review): the output recorded under this cell presumably dates from an
# earlier run when this code was live -- confirm before relying on it.
"""
tf.reset_default_graph()
## priorbox defined by ourselves
prior_tensor = tf.ones((1, 300, 300, 3))
ssd_net = SSD(input_shape, NUM_CLASSES)
_, priors = ssd_net(prior_tensor, PRIORS = True)
print(priors.shape)
"""

(?, 7308, 8)


In [7]:
# DEBUG by tensorboard
# Build a small standalone graph that produces the prior boxes and write it
# to ./logs so its structure can be inspected in TensorBoard.

g = tf.Graph()
with g.as_default():
    ## priorbox defined by ourselves
    # Constant dummy image batch. Its shape is derived from `input_shape` so it
    # cannot drift from the shape handed to the Detector below.
    input_tensor = tf.ones((1,) + tuple(input_shape))
    ssd_net = SSD()
    feats = ssd_net(input_tensor)
    detect_net = Detector(input_shape, NUM_CLASSES)
    # PRIORS=True asks the detector for the prior-box tensor.
    priors = detect_net(feats, PRIORS=True)
    print(priors.shape)

# Write the graph definition to the "logs" directory and close the writer.
tf.summary.FileWriter("logs", g).close()

(?, 11508, 8)


In [8]:
# Evaluate the prior-box tensor once inside graph `g`; the session is closed
# as soon as the value has been fetched.
with g.as_default(), tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    priors_np = sess.run(priors)

# Cache the priors on disk with singleton axes squeezed away.
# save priors tensor
with open('./data/priors384.pkl', 'wb') as priors_file:
    pickle.dump(np.squeeze(priors_np), priors_file)

In [3]:
# Read the cached prior boxes back from disk and report their shape.
# NOTE(review): presumably this lets the graph-building cells above be
# skipped once the pickle exists -- confirm.
with open('./data/priors384.pkl', 'rb') as priors_file:
    priors_np = pickle.load(priors_file)
print(priors_np.shape)

(11508, 8)


### bbox

In [4]:
# Disabled alternative: evaluate the priors in an interactive session instead
# of using the array loaded from the pickle.
#sess = tf.InteractiveSession()
#sess.run(tf.global_variables_initializer())
#priors_np = np.squeeze(sess.run(priors))
# Box utility built from the class count and the prior boxes; it is handed
# to SeqGenerator further down.
bbox_util = BBoxUtility(NUM_CLASSES, priors_np)

In [5]:
# Show the priors' shape with a leading None (unknown batch) axis prepended.
print([None, *priors_np.shape])

[None, 11508, 8]


## Conv2DLSTM

In [6]:
# Batch size passed to SeqGenerator and used for the learning-rate schedule.
batch_size = 4
# Length of the second axis of the image-sequence placeholder.
seq_length = 4

#### final end-to-end model

In [105]:
#tf.reset_default_graph()
# Build the end-to-end model (SSD features -> per-scale LSTM -> detector ->
# loss -> optimizer) inside its own tf.Graph `g`.
# NOTE: `num_train` is only assigned in the "load data" section further down,
# so on a fresh kernel that section has to be run before this cell.
g = tf.Graph()
with g.as_default():
    # Image sequences: (batch, seq_length) followed by `input_shape`.
    seq_input_ph = tf.placeholder(tf.float32, [None, seq_length] + list(input_shape), name="input_image")
    # Per-prior ground truth.
    # NOTE(review): 16 columns presumably equals 4 + NUM_CLASSES + 8 -- confirm
    # against BBoxUtility before changing NUM_CLASSES.
    gt_ph = tf.placeholder(tf.float32, [None, priors_np.shape[0], 16], name="ground_truth")

    with tf.variable_scope("ssd_net"):
        ssd_net = SSD()
        # Fold the sequence axis into the batch axis so the SSD network
        # receives plain image batches.
        ssd_input = tf.reshape(seq_input_ph, [-1,]+list(input_shape), name="reshape_seq_input")
        feats = ssd_net(ssd_input)
        #seq_feats = tf.reshape(ssd_input, [-1, seq_length]+list(input_shape), name="reshape_seq")

    # Disabled experiment: upsampling the features before the recurrent core.
    #with tf.variable_scope("up_net"):
    #    upsample_net = UpSample()
    #    feats_up, up_factors = upsample_net(feats)
    #    seq_feats_up = tf.reshape(ssd_input, [-1, seq_length]+feats_up.get_shape().as_list()[-3:], name="reshape_seq_up")

    with tf.variable_scope("core_conv_lstm"):
        pred_feats = []
        # Every feature map except the last one gets its own ConvLSTM.
        for i, feat in enumerate(feats[:-1]):
            feat_shape = feat.get_shape().as_list()[-3:]
            # Restore the sequence axis: (batch, seq_length, H, W, C).
            seq_feat = tf.reshape(feat, [-1, seq_length]+feat_shape, name="reshape_seq_{}".format(i))
            cell = tf.contrib.rnn.Conv2DLSTMCell(input_shape=feat_shape,
                                                 output_channels=128, kernel_shape=[5,5], name="convrnncell_{}".format(i))
            # No explicit initial state is needed: dynamic_rnn is given `dtype`
            # and no `initial_state`. The previously built zero state (sized
            # with the fixed batch_size) was never used.
            outputs, final_state = tf.nn.dynamic_rnn(cell, seq_feat, dtype=tf.float32, scope="rnn_{}".format(i))
            # final_state is a (c, h) pair; keep the last element.
            pred_feats.append(final_state[-1])

        # last feature vector
        # Only the trailing axis is kept here, so a plain LSTM cell is used.
        seq_feat = tf.reshape(feats[-1], [-1, seq_length]+feats[-1].get_shape().as_list()[-1:], name="last_reshape_seq")
        cell = tf.contrib.rnn.LSTMCell(128)
        outputs, final_state = tf.nn.dynamic_rnn(cell, seq_feat, dtype=tf.float32, scope="last_dense_rnn")
        pred_feats.append(final_state[-1])

    # Disabled experiment: pooling back down after the upsampling branch.
    #with tf.variable_scope("down_pooling"):
    #    down_net = DownSample()
    #    feats_down = down_net(final_state[-1], up_factors) # final_state: (c, h)

    with tf.variable_scope("final_prediction"):
        detector = Detector(input_shape, NUM_CLASSES)
        #predictions = detector(feats, PRIORS=False)
        final_predictions = detector(pred_feats, PRIORS=False)

    with tf.variable_scope("loss"):
        loss = MultiboxLoss(NUM_CLASSES, neg_pos_ratio=2.0).compute_loss(gt_ph, final_predictions)

    with tf.variable_scope("train"):
        global_step = tf.Variable(0, trainable=False)
        # Decay the learning rate by 0.9 every num_train // batch_size steps.
        # Clamp to at least 1 so a dataset smaller than one batch cannot
        # produce a zero decay period.
        decay_steps = max(1, num_train//batch_size)
        lr = tf.train.exponential_decay(3e-4, global_step, decay_steps, 0.9, staircase=True, name="lr")
        train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss, global_step=global_step)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [51]:
# Write graph `g` to the "logs" directory for TensorBoard, then close the writer.
graph_writer = tf.summary.FileWriter("logs", g)
graph_writer.close()

#### load data

In [7]:
# Read the pickled annotations for sequence 0254 from disk.
with open('./data/0254.pkl', 'rb') as annotation_file:
    annotation0254 = pickle.load(annotation_file)

In [8]:
## bounding box ground truth
# Load the annotations inside a context manager so the file handle is closed
# (the previous bare open() call was never closed).
with open('./data/0254.pkl', 'rb') as handle:
    gt = dict(pickle.load(handle))
keys = list(gt.keys())
# 80/20 train/validation split in the stored key order (no shuffling).
num_train = int(round(0.8 * len(keys)))
train_keys = keys[:num_train]
val_keys = keys[num_train:]
num_val = len(val_keys)
print(num_train)
print(num_val)

58
14


In [9]:
# NOTE(review): the wildcard import hides where SeqGenerator is defined
# (presumably ssd.preprocess); an explicit import would be clearer once it is
# confirmed that no other name from the module is relied on.
from ssd.preprocess import *
path_prefix = './data/0254img/'

# batch_size = 4
# Sequence batch generator over the train/validation keys.
gen = SeqGenerator(
    gt,
    bbox_util,
    batch_size,
    path_prefix,
    train_keys,
    val_keys,
    input_shape,
    seq_length,
)

Using TensorFlow backend.


In [10]:
# One generator per phase: generate(True) feeds training, generate(False) validation.
train_generator, val_generator = gen.generate(True), gen.generate(False)

#### train

In [11]:
# Build the end-to-end model (SSD features -> per-scale LSTM -> detector ->
# loss -> optimizer) in the default graph, after clearing it.
tf.reset_default_graph()
#g = tf.Graph()
#with g.as_default():
# Image sequences: (batch, seq_length) followed by `input_shape`.
seq_input_ph = tf.placeholder(tf.float32, [None, seq_length] + list(input_shape), name="input_image")
# Per-prior ground truth.
# NOTE(review): 16 columns presumably equals 4 + NUM_CLASSES + 8 -- confirm
# against BBoxUtility before changing NUM_CLASSES.
gt_ph = tf.placeholder(tf.float32, [None, priors_np.shape[0], 16], name="ground_truth")

with tf.variable_scope("ssd_net"):
    ssd_net = SSD()
    # Fold the sequence axis into the batch axis so the SSD network receives
    # plain image batches.
    ssd_input = tf.reshape(seq_input_ph, [-1,]+list(input_shape), name="reshape_seq_input")
    feats = ssd_net(ssd_input)
    #seq_feats = tf.reshape(ssd_input, [-1, seq_length]+list(input_shape), name="reshape_seq")

# Disabled experiment: upsampling the features before the recurrent core.
#with tf.variable_scope("up_net"):
#    upsample_net = UpSample()
#    feats_up, up_factors = upsample_net(feats)
#    seq_feats_up = tf.reshape(ssd_input, [-1, seq_length]+feats_up.get_shape().as_list()[-3:], name="reshape_seq_up")

with tf.variable_scope("core_conv_lstm"):
    pred_feats = []
    # Every feature map except the last one gets its own ConvLSTM.
    for i, feat in enumerate(feats[:-1]):
        feat_shape = feat.get_shape().as_list()[-3:]
        # Restore the sequence axis: (batch, seq_length, H, W, C).
        seq_feat = tf.reshape(feat, [-1, seq_length]+feat_shape, name="reshape_seq_{}".format(i))
        cell = tf.contrib.rnn.Conv2DLSTMCell(input_shape=feat_shape,
                                             output_channels=128, kernel_shape=[5,5], name="convrnncell_{}".format(i))
        # No explicit initial state is needed: dynamic_rnn is given `dtype`
        # and no `initial_state`. The previously built zero state (sized with
        # the fixed batch_size) was never used.
        outputs, final_state = tf.nn.dynamic_rnn(cell, seq_feat, dtype=tf.float32, scope="rnn_{}".format(i))
        # final_state is a (c, h) pair; keep the last element.
        pred_feats.append(final_state[-1])

    # last feature vector
    # Only the trailing axis is kept here, so a plain LSTM cell is used.
    seq_feat = tf.reshape(feats[-1], [-1, seq_length]+feats[-1].get_shape().as_list()[-1:], name="last_reshape_seq")
    cell = tf.contrib.rnn.LSTMCell(128)
    outputs, final_state = tf.nn.dynamic_rnn(cell, seq_feat, dtype=tf.float32, scope="last_dense_rnn")
    pred_feats.append(final_state[-1])

# Disabled experiment: pooling back down after the upsampling branch.
#with tf.variable_scope("down_pooling"):
#    down_net = DownSample()
#    feats_down = down_net(final_state[-1], up_factors) # final_state: (c, h)

with tf.variable_scope("final_prediction"):
    detector = Detector(input_shape, NUM_CLASSES)
    #predictions = detector(feats, PRIORS=False)
    final_predictions = detector(pred_feats, PRIORS=False)

with tf.variable_scope("loss"):
    loss = MultiboxLoss(NUM_CLASSES, neg_pos_ratio=2.0).compute_loss(gt_ph, final_predictions)

with tf.variable_scope("train"):
    global_step = tf.Variable(0, trainable=False)
    # Decay the learning rate by 0.9 every num_train // batch_size steps.
    # Clamp to at least 1 so a dataset smaller than one batch cannot produce
    # a zero decay period.
    decay_steps = max(1, num_train//batch_size)
    lr = tf.train.exponential_decay(3e-4, global_step, decay_steps, 0.9, staircase=True, name="lr")
    train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss, global_step=global_step)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [12]:
# Open an interactive session, initialise all variables of the graph built
# above, and create a Saver for checkpointing.
sess = tf.InteractiveSession()
init_op = tf.global_variables_initializer()
sess.run(init_op)
saver = tf.train.Saver()

In [None]:
# Train for `nb_epoch` epochs, running a validation pass after each one.
nb_epoch = 20
# Batches per phase: one more than the floor division, which is exactly how
# many batches the original `cnt > N` stopping rule processed.
train_steps = num_train // batch_size + 1
val_steps = num_val // batch_size + 1
# Mean loss of every training minibatch. It was appended to below without
# ever being created, so it is initialised here.
train_loss_hist = []

for e in range(nb_epoch):
    # training phase
    for cnt, sample in enumerate(train_generator, start=1):
        train_input, train_target = sample
        train_loss, _ = sess.run([loss, train_op], feed_dict={seq_input_ph: train_input, gt_ph: train_target})
        train_loss_hist.append(np.mean(train_loss))
        print("training {}/{}, minibatch loss: {}".format(cnt*batch_size, num_train, train_loss))
        if cnt >= train_steps:
            break

    # validation phase
    val_loss = 0.0
    for cnt, sample in enumerate(val_generator, start=1):
        # Training batches unpack as (input, target), whereas this loop used
        # to unpack three items. Taking the last two elements works for both
        # a 2-item and a 3-item batch.
        val_input, val_target = sample[-2:]
        val_loss += np.sum(sess.run(loss, feed_dict={seq_input_ph: val_input, gt_ph: val_target}))
        if cnt >= val_steps:
            break

    # print progress
    # Average the minibatch losses of this epoch only.
    verbose_loss = np.mean(train_loss_hist[-train_steps:])
    # Summed validation loss divided by the nominal number of validation samples.
    verbose_val_loss = val_loss/(val_steps*batch_size)
    print("epoch {}/{}, train loss: {}, val loss: {}".format(e+1, nb_epoch, verbose_loss, verbose_val_loss))

In [13]:
# Display the image-sequence placeholder to check its static shape and dtype.
seq_input_ph

<tf.Tensor 'input_image:0' shape=(?, 4, 384, 384, 3) dtype=float32>

In [14]:
# Display the ground-truth placeholder to check its static shape and dtype.
gt_ph

<tf.Tensor 'ground_truth:0' shape=(?, 11508, 16) dtype=float32>