Up to this point, we have been reusing layers from older, trained models, but they *still* are updated during the new round of training. They're probably near where they "should" be, assuming the new task is similar to the old one, but the gradients for these lower, reused layers are still calculated and the weights are still updated. For a very large neural network, it's a good idea to take "reuse" to its logical conclusion and freeze this weights under the confidence that they're "already where they should be." For large networks in particular, this will speed up training considerably since there are far fewer parameters to train. Freezing layers is implemented here but there won't be noticeable gains in such a small network.

In [1]:
from utils import *

# load data

from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("tmp/")

Extracting tmp/train-images-idx3-ubyte.gz
Extracting tmp/train-labels-idx1-ubyte.gz
Extracting tmp/t10k-images-idx3-ubyte.gz
Extracting tmp/t10k-labels-idx1-ubyte.gz


In [2]:
# construction

reset_graph()

n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 50
n_hidden3 = 50
n_hidden4 = 20
n_outputs = 10

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int64, shape=(None), name="y")

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name="hidden1")
    hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, name="hidden2")
    hidden3 = tf.layers.dense(hidden2, n_hidden3, activation=tf.nn.relu, name="hidden3")
    hidden4 = tf.layers.dense(hidden3, n_hidden4, activation=tf.nn.relu, name="hidden4")
    logits = tf.layers.dense(hidden4, n_outputs, name="outputs")
    
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")
    
with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")

In [3]:
learning_rate = 0.01

with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    # isolating only hidden3, hidden4, and outputs as trainable
    # hidden1 and hidden2 are frozen
    train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="hidden[34]|outputs")
    # pass trainable variables to the optimizer when making training op
    training_op = optimizer.minimize(loss, var_list=train_vars)

In [4]:
init = tf.global_variables_initializer()
new_saver = tf.train.Saver() # need to make a new saver, if using another one to load existing weights

In [5]:
n_epochs = 20
batch_size = 200

# at this point, certain layers are frozen
# still need to load the pretrained weights

# reuse 1, 2, and 3. 1 and 2 are frozen, 3 will be re-trained.
reuse_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="hidden[123]")
reuse_vars_dict = dict([(var.op.name, var) for var in reuse_vars])
restore_saver = tf.train.Saver(reuse_vars_dict)

with tf.Session() as sess:
    init.run()
    # pulls old weights in
    restore_saver.restore(sess, "savedmodels/11_07_gradientclipping.ckpt")
    
    for epoch in range(n_epochs):
        for iteration in range(mnist.train.num_examples // batch_size):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run(training_op, feed_dict={X:X_batch, y:y_batch})
        if epoch % 5 == 0 or epoch == n_epochs - 1:
            acc_train = accuracy.eval(feed_dict={X:mnist.train.images, y:mnist.train.labels})
            acc_val = accuracy.eval(feed_dict={X:mnist.validation.images, y:mnist.validation.labels})
            print(epoch, "train acc:", acc_train, "val acc:", acc_val)
    acc_test = accuracy.eval(feed_dict={X:mnist.test.images, y:mnist.test.labels})
    print("Test acc:", acc_test)
    save_path = new_saver.save(sess, "savedmodels/11_13_frozen.ckpt")

INFO:tensorflow:Restoring parameters from savedmodels/11_07_gradientclipping.ckpt
0 train acc: 0.896855 val acc: 0.8986
5 train acc: 0.953073 val acc: 0.95
10 train acc: 0.958582 val acc: 0.956
15 train acc: 0.960509 val acc: 0.958
19 train acc: 0.961727 val acc: 0.9598
Test acc: 0.9567


Note for self: if I restore the checkpoint I just made, are the frozen layers still frozen?

Answer: yes

In [7]:
reset_graph()

n_inputs = 28 * 28  # MNIST
n_hidden1 = 300 # reused
n_hidden2 = 50  # reused
n_hidden3 = 50  # reused
n_hidden4 = 20  # new!
n_outputs = 10  # new!

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int64, shape=(None), name="y")

In [8]:
with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu,
                              name="hidden1") # reused frozen
    hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu,
                              name="hidden2") # reused frozen
    hidden2_stop = tf.stop_gradient(hidden2)
    hidden3 = tf.layers.dense(hidden2_stop, n_hidden3, activation=tf.nn.relu,
                              name="hidden3") # reused, not frozen
    hidden4 = tf.layers.dense(hidden3, n_hidden4, activation=tf.nn.relu,
                              name="hidden4") # new!
    logits = tf.layers.dense(hidden4, n_outputs, name="outputs") # new!

In [9]:
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")

with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

In [17]:
reuse_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                               scope="hidden[123]") # regular expression
reuse_vars_dict = dict([(var.op.name, var) for var in reuse_vars])
restore_saver = tf.train.Saver(reuse_vars_dict) # to restore layers 1-3

init = tf.global_variables_initializer()
saver = tf.train.Saver()

with tf.Session() as sess:
    init.run()
    restore_saver.restore(sess, "savedmodels/11_13_frozen.ckpt")

    for epoch in range(n_epochs):
        print("*"*50)
        print("Epoch {}".format(epoch))

        h1 = sess.run("hidden1/kernel:0")
        h4 = sess.run("outputs/kernel:0")
        if "h1_old" in locals():
            print("h1 diff")
            print(np.sum(h1-h1_old))
        if "h4_old" in locals():
            print("h4 diff")
            print(np.sum(h4-h4_old))
        h1_old = h1
        h4_old = h4
        for iteration in range(mnist.train.num_examples // batch_size):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        accuracy_val = accuracy.eval(feed_dict={X: mnist.test.images,
                                                y: mnist.test.labels})
        print(epoch, "Test accuracy:", accuracy_val)


INFO:tensorflow:Restoring parameters from savedmodels/11_13_frozen.ckpt
**************************************************
Epoch 0
0 Test accuracy: 0.9018
**************************************************
Epoch 1
h1 diff
0.0
h4 diff
1.19209e-06
1 Test accuracy: 0.9353
**************************************************
Epoch 2
h1 diff
0.0
h4 diff
5.88596e-07
2 Test accuracy: 0.9421
**************************************************
Epoch 3
h1 diff
0.0
h4 diff
-1.85519e-06
3 Test accuracy: 0.9457
**************************************************
Epoch 4
h1 diff
0.0
h4 diff
2.99886e-07
4 Test accuracy: 0.949
**************************************************
Epoch 5
h1 diff
0.0
h4 diff
1.89431e-06
5 Test accuracy: 0.9492
**************************************************
Epoch 6
h1 diff
0.0
h4 diff
-9.94653e-07
6 Test accuracy: 0.9507
**************************************************
Epoch 7
h1 diff
0.0
h4 diff
-2.11876e-06
7 Test accuracy: 0.9518
**************************************