# Sparsifying Convolutional Neural Network using modified RVSM

Algorithm:

\begin{align*}
u^t &= H_{\gamma}(w^t)\\
w^{t+1} &= w^t - \nabla f(u^t)
\end{align*}


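where $H_{\gamma}(\cdot)$ is the hard thresholding operator, applied elementwise: $H_{\gamma}(w)_i = w_i$ if $|w_i| > \sqrt{2\gamma}$ and $0$ otherwise. In the code below $\gamma = \lambda/\beta$, the thresholding is applied to the fully connected weights `wd1`, and the update of $w$ is carried out by the Adam optimizer using the gradient evaluated at $u^t$.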

In [3]:
from __future__ import division, print_function, absolute_import

import tensorflow as tf
# tf.enable_eager_execution()
import numpy as np

# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data
# Read the MNIST data set from /tmp/data/ with one-hot encoded labels.
# `mnist.train.next_batch` and `mnist.test.images/labels` are used by the
# training and evaluation cells below.
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)


Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz


In [4]:
# Training Parameters
# Optimizer step size handed to Adam (alternative value: 0.00005).
learning_rate = 0.001

# Loop control: total number of mini-batch updates and how often to report.
num_steps = 500
display_step = 50

# Number of examples drawn per training step.
batch_size = 128

In [5]:
# Network Parameters
num_input = 784   # MNIST data input (img shape: 28*28)
num_classes = 10  # MNIST total classes (0-9 digits)
dropout = 0.75    # Dropout keep probability fed during training

# tf Graph input
X = tf.placeholder(tf.float32, [None, num_input])
Y = tf.placeholder(tf.float32, [None, num_classes])

# Dropout keep probability. It is fed explicitly through `feed_dict`
# (`dropout` while training, 1.0 while evaluating). The default of 1.0 means
# that a run which forgets to feed it evaluates the network without dropout,
# instead of silently dropping most units as a hard-coded constant of 0.2 would.
keep_prob = tf.placeholder_with_default(1.0, shape=[])

In [6]:
# Create some wrappers for simplicity
def conv2d(x, W, b, strides=1):
    """Apply a 2-D convolution, add a bias and pass the result through ReLU.

    Args:
        x: input tensor handed to `tf.nn.conv2d`.
        W: convolution filter tensor.
        b: bias added with `tf.nn.bias_add`.
        strides: stride used for both inner dimensions; the outer two
            entries of the stride vector are fixed to 1.

    Returns:
        The ReLU-activated convolution output ('SAME' padding).
    """
    stride_spec = [1, strides, strides, 1]
    convolved = tf.nn.conv2d(x, W, strides=stride_spec, padding='SAME')
    return tf.nn.relu(tf.nn.bias_add(convolved, b))


def maxpool2d(x, k=2):
    """Max-pool `x` with a k-by-k window and a matching stride of k.

    Args:
        x: input tensor handed to `tf.nn.max_pool`.
        k: size of the pooling window and of the stride in both inner
            dimensions.

    Returns:
        The pooled tensor ('SAME' padding).
    """
    window = [1, k, k, 1]
    return tf.nn.max_pool(x, ksize=window, strides=window, padding='SAME')


# Create model
def conv_net(x, weights, biases, dropout):
    """Build the two-conv-layer network and return its class logits.

    Args:
        x: batch of flattened images; reshaped here to [-1, 28, 28, 1].
        weights: dict with keys 'wc1', 'wc2', 'wd1' and 'out'.
        biases: dict with keys 'bc1', 'bc2', 'bd1' and 'out'.
        dropout: second argument passed to `tf.nn.dropout` on the fully
            connected layer.

    Returns:
        The un-normalised output of the final layer (no softmax applied).
    """
    # Turn the flat input into [Batch Size, Height, Width, Channel].
    images = tf.reshape(x, shape=[-1, 28, 28, 1])

    # Two rounds of convolution followed by 2x2 max pooling.
    pooled1 = maxpool2d(conv2d(images, weights['wc1'], biases['bc1']), k=2)
    pooled2 = maxpool2d(conv2d(pooled1, weights['wc2'], biases['bc2']), k=2)

    # Flatten to the input width expected by the fully connected weights.
    flat_dim = weights['wd1'].get_shape().as_list()[0]
    flat = tf.reshape(pooled2, [-1, flat_dim])

    # Fully connected layer with ReLU, then dropout.
    hidden = tf.nn.relu(tf.add(tf.matmul(flat, weights['wd1']), biases['bd1']))
    hidden = tf.nn.dropout(hidden, dropout)

    # Output, class prediction
    return tf.add(tf.matmul(hidden, weights['out']), biases['out'])

In [7]:
# Store layers weight & bias
weights = {
    # 5x5 conv, 1 input, 32 outputs
    'wc1': tf.Variable(tf.random_normal([5, 5, 1, 32])),
    # 5x5 conv, 32 inputs, 64 outputs
    'wc2': tf.Variable(tf.random_normal([5, 5, 32, 64])),
    # fully connected, 7*7*64 inputs, 1024 outputs
    'wd1': tf.Variable(tf.random_normal([7*7*64, 1024])),
    # 1024 inputs, 10 outputs (class prediction)
    'out': tf.Variable(tf.random_normal([1024, num_classes]))
}

# u is the sparse counterpart of weights['wd1']. It starts from the same
# initial value as w (instead of an unrelated random draw) so that the very
# first gradient evaluated at u is taken at a point tied to w. The hard
# thresholded values are written into it by the assign op built in the
# model-construction cell.
u = tf.Variable(weights['wd1'].initialized_value())

# Same network weights, except that the fully connected layer reads u.
weights_u = {
    'wc1': weights['wc1'],
    'wc2': weights['wc2'],
    'wd1': u,
    'out': weights['out']
}

biases = {
    'bc1': tf.Variable(tf.random_normal([32])),
    'bc2': tf.Variable(tf.random_normal([64])),
    'bd1': tf.Variable(tf.random_normal([1024])),
    'out': tf.Variable(tf.random_normal([num_classes]))
}


Instructions for updating:
Colocations handled automatically by placer.


In [8]:
# Define hard thresholding parameters
lamb = 0.0005
beta = 0.1
gamma = lamb/beta

# Construct model: one network reading w (weights), one reading u (weights_u).
logits = conv_net(X, weights, biases, keep_prob)
prediction = tf.nn.softmax(logits)

logits_u = conv_net(X, weights_u, biases, keep_prob)
prediction_u = tf.nn.softmax(logits_u)

# Define loss and optimizers
loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
                        logits=logits, labels=Y))

loss_op_u = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
                        logits=logits_u, labels=Y))

### AdamOptimizer with proximal descent
opt = tf.train.AdamOptimizer(learning_rate=learning_rate)

# Look the two variables up by identity instead of by their position in the
# list of trainable variables. Positional indices silently point at the wrong
# variable as soon as the creation order changes (e.g. a cell is re-run), and
# `weights_u['wd1']` keeps referring to the Variable even after the name `u`
# is rebound to the assign op below.
w = weights['wd1']
u_var = weights_u['wd1']

# Gradient of the loss evaluated at u, and the regular gradients at w.
grad_u = opt.compute_gradients(loss_op_u, var_list=[u_var])[0][0]
grad = opt.compute_gradients(loss_op)

# Compute u = H_gamma(w): keep entries with |w| > sqrt(2*gamma), zero the rest.
# After this line `u` names the assign op; evaluating it refreshes the variable.
# train_op does not depend on it, so the training loop runs it explicitly.
condition = tf.greater(tf.abs(w), np.sqrt(2*gamma))
u = u_var.assign(tf.where(condition, w, tf.zeros_like(w)))

# Feed gradients and original w to Adam: w is updated with the gradient taken
# at u. The `+ beta * (w - u)` term is not added to this gradient.
grad = [(grad_u, var) if var is w else (g, var) for g, var in grad]
train_op = opt.apply_gradients(grad)

# Evaluate model
correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



In [9]:
# Initialize save function
# NOTE(review): the Saver is created before `wbar` (defined at the end of the
# training cell), so `wbar` is presumably not part of the checkpoint — confirm.
saver = tf.train.Saver()

# Initialize the variables (i.e. assign their default value)
# This op is rebuilt later, after the evaluation graph has been added.
init = tf.global_variables_initializer()

In [13]:
# Start training
with tf.Session() as sess:

    # Run the initializer
    sess.run(init)

    report = "Step {}, Minibatch Loss= {:.4f}, Training Accuracy= {:.3f}"

    for step in range(1, num_steps+1):
        batch_x, batch_y = mnist.train.next_batch(batch_size)

        # One optimizer update on w (backprop), with dropout switched on ...
        train_feed = {X: batch_x, Y: batch_y, keep_prob: dropout}
        sess.run(train_op, feed_dict=train_feed)
        # ... followed by refreshing u from the updated w.
        sess.run(u)

        if step == 1 or step % display_step == 0:
            # Batch loss and accuracy, measured without dropout
            eval_feed = {X: batch_x, Y: batch_y, keep_prob: 1.0}
            loss, acc = sess.run([loss_op, accuracy], feed_dict=eval_feed)
            print(report.format(step, loss, acc))

    print("Optimization Finished!")

    # Accuracy on the first 256 MNIST test images
    test_feed = {X: mnist.test.images[:256],
                 Y: mnist.test.labels[:256],
                 keep_prob: 1.0}
    print("Testing Accuracy:", sess.run(accuracy, feed_dict=test_feed))

    # Snapshot the trained w into a new variable and write the checkpoint.
    wbar = tf.Variable(sess.run(w))
    saver.save(sess, 'mnist1.chkp')


Step 1, Minibatch Loss= 69629.6562, Training Accuracy= 0.094
Step 50, Minibatch Loss= 2844.1426, Training Accuracy= 0.828
Step 100, Minibatch Loss= 1643.4833, Training Accuracy= 0.891
Step 150, Minibatch Loss= 817.3546, Training Accuracy= 0.938
Step 200, Minibatch Loss= 694.1049, Training Accuracy= 0.953
Step 250, Minibatch Loss= 1189.9749, Training Accuracy= 0.922
Step 300, Minibatch Loss= 461.4323, Training Accuracy= 0.938
Step 350, Minibatch Loss= 1144.9181, Training Accuracy= 0.938
Step 400, Minibatch Loss= 571.9758, Training Accuracy= 0.945
Step 450, Minibatch Loss= 284.1655, Training Accuracy= 0.969
Step 500, Minibatch Loss= 279.8378, Training Accuracy= 0.945
Optimization Finished!
Testing Accuracy: 0.96484375


In [14]:
# Build two extra accuracy ops on top of the trained weights wbar.
#   * accuracy_u    : 'wd1' is replaced with u = threshold(wbar).
#   * accuracy_unot : sanity check; 'wd1' is replaced with the reversed
#                     threshold of wbar, where the large values are pruned off.

def _accuracy_with_wd1(wd1):
    """Return (logits, softmax, per-example hits, mean accuracy) for the
    network whose fully connected weight 'wd1' is swapped for `wd1`."""
    swapped = dict(weights, wd1=wd1)
    net_logits = conv_net(X, swapped, biases, keep_prob)
    net_probs = tf.nn.softmax(net_logits)
    hits = tf.equal(tf.argmax(net_probs, 1), tf.argmax(Y, 1))
    return net_logits, net_probs, hits, tf.reduce_mean(tf.cast(hits, tf.float32))


# Entries of wbar whose magnitude exceeds the hard threshold.
condition = tf.greater(tf.abs(wbar), np.sqrt(2*gamma))

# Evaluate with u: this is threshold(wbar)
ubar = tf.where(condition, wbar, tf.zeros_like(wbar))
logits_u, prediction_u, correct_pred_u, accuracy_u = _accuracy_with_wd1(ubar)

# Evaluate with not u: this is a reversed threshold(wbar)
unot = tf.where(condition, tf.zeros_like(wbar), wbar)
logits_unot, prediction_unot, correct_pred_unot, accuracy_unot = _accuracy_with_wd1(unot)


init = tf.global_variables_initializer()

In [15]:
# Restore the checkpoint and compare accuracy with w, u and the reversed
# threshold, then report how sparse u and unot are.
with tf.Session() as sess:
    # init covers variables that are not restored from the checkpoint.
    sess.run(init)
    saver.restore(sess, 'mnist1.chkp')

    # First 256 test images, dropout disabled.
    test_feed = {X: mnist.test.images[:256],
                 Y: mnist.test.labels[:256],
                 keep_prob: 1.0}

    # Evaluate with w
    acc = sess.run(accuracy, feed_dict=test_feed)
    print('Accuracy with w: {:.8}'.format(acc))

    # Evaluate with u
    acc_u = sess.run(accuracy_u, feed_dict=test_feed)
    print('Accuracy with u: {:.8}'.format(acc_u))

    # Evaluate with unot
    acc_unot = sess.run(accuracy_unot, feed_dict=test_feed)
    print('Accuracy with unot: {:.8}'.format(acc_unot))

    # Count zeros with NumPy on the fetched arrays. Building TF ops from the
    # fetched values would embed multi-million element constants in the graph
    # and grow it on every run of this cell.
    u_val = sess.run(u)
    unot_val = sess.run(unot)
    zeros_u = np.count_nonzero(u_val == 0)
    zeros_unot = np.count_nonzero(unot_val == 0)
    size_w = u_val.size

    sparse_u = zeros_u / size_w * 100
    sparse_unot = zeros_unot / size_w * 100

    print('Total elements of w: {}'.format(size_w))
    print('Number of zeros of u: {}'.format(zeros_u) + '. So u has {:2.2f}% sparsity'.format(sparse_u))
    print('Number of zeros of unot: {}'.format(zeros_unot) + '. So unot has {:2.2f}% sparsity'.format(sparse_unot))


Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from mnist1.chkp
Accuracy with w: 0.96484375
Accuracy with u: 0.96484375
Accuracy with unot: 0.0703125
Total elements of w: 3211264
Number of zeros of u: 255833. So u has 7.97% sparsity
Number of zeros of unot: 2955431. So unot has 92.03% sparsity


In [None]:
# Inspect wbar and count the zero entries of its thresholded version ubar.
with tf.Session() as sess:
    sess.run(init)
    print(sess.run(wbar))
    # Count with NumPy on the fetched array rather than adding new TF ops
    # (and a large constant) to the graph; printed as a float as before.
    print(float(np.count_nonzero(sess.run(ubar) == 0)))
