# Advanced ML Part II // Lecture 02 Scratch

In [1]:
# written by John P. Cunningham, for use in lecture
# continues many of the conventions set out in Wenda Zhou's excellent tf tutorial
import os

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

### Define helper functions

In [2]:
# evaluate performance on some data 
def perf_eval(y_pred, y_true):
    """Print a short classification report for predictions vs. true labels.

    Args:
        y_pred: 2-D array of per-class scores, one row per example; the
            predicted class is the argmax along axis 1.
        y_true: 2-D array of labels in the same layout; the true class is the
            argmax along axis 1 (presumably one-hot rows -- the per-class
            tally below sums the rows directly).

    Prints the predicted and true classes of the first 20 examples, the mean
    accuracy as a percentage, and the column sums of ``y_true`` restricted to
    the correctly classified rows.  Returns None.
    """
    # class decisions, computed once and reused below
    pred_class = np.argmax(y_pred, 1)
    true_class = np.argmax(y_true, 1)

    # peek at the first few examples
    print(' sample pred: {0}\n sample true: {1}'.format(pred_class[0:20], true_class[0:20]))

    # boolean mask of the rows we got right, and its mean as the accuracy
    hits = np.equal(pred_class, true_class)
    print(' mean classification accuracy: {0}%'.format(100*np.mean(hits)))

    # per-class breakdown: summing the label rows of the hits counts them by class
    hits_by_class = y_true[hits,:].sum(axis=0)
    print(' correct predictions by class: {0}'.format(hits_by_class))

In [3]:
def plot_save(x, fname='foo.png', extent=None, show=True, cmap='gray'):
    """Draw ``x`` as an image, save it under ``tmp/``, and optionally show it.

    Args:
        x: image data handed straight to ``plt.imshow``.
        fname: file name (relative to ``tmp/``) for the saved figure.
        extent: forwarded to ``plt.imshow``.
        show: when True, call ``plt.show()`` after saving.
        cmap: colormap forwarded to ``plt.imshow``.

    The figure is drawn on the current pyplot figure with the axes hidden.
    """
    plt.imshow(x, cmap=cmap, extent=extent)
    plt.axis('off')

    out_path = 'tmp/' + fname
    # savefig does not create missing directories; without this a fresh
    # checkout (no tmp/ folder yet) fails on the very first plot.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    plt.savefig(out_path, bbox_inches='tight')

    if show:
        plt.show()

### Load and sample the data

In [4]:
# get mnist data via the tutorial loader that ships with tensorflow
# (files are read from ../data/mnist; one_hot=True asks for one-hot label rows,
#  which the argmax-based helpers in this notebook operate on)
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('../data/mnist', one_hot=True)

Extracting ../data/mnist/train-images-idx3-ubyte.gz
Extracting ../data/mnist/train-labels-idx1-ubyte.gz
Extracting ../data/mnist/t10k-images-idx3-ubyte.gz
Extracting ../data/mnist/t10k-labels-idx1-ubyte.gz


In [5]:
# Keep only the first 5000 training examples (images and their labels) so that
# a full-gradient computation over this subset stays manageable.
X_train, y_train = mnist.train.images[:5000], mnist.train.labels[:5000]

In [6]:
# Row index into X_train of the example to render (an index, not a digit label).
some_digit = 1
# Each row is a flattened image: restore the 28x28 grid and save it via
# plot_save (which writes under tmp/) without displaying it.
plot_save(X_train[some_digit].reshape((28, 28)), fname='mnist_digit.png', show=False)

### Define a basic logistic regression model, as in Wenda's tf tutorial

In [7]:
# We start with our existing model code

def compute_logits(x):
    """Build the affine map ``x @ W + b`` that yields the class logits.

    The parameters are obtained through ``tf.get_variable``: ``W`` with shape
    [784, 10] and ``b`` with shape [10].  The resulting op is named 'logits'.

    Args:
        x: input tensor whose last dimension is 784.

    Returns:
        Tensor of logits with 10 columns, one per class.
    """
    weights = tf.get_variable('W', shape=[784, 10])
    bias = tf.get_variable('b', shape=[10])

    # linear part first, then the (broadcast) bias
    projected = tf.matmul(x, weights)
    return tf.add(projected, bias, name='logits')

# Note: this function is implemented in tensorflow as
# tf.nn.softmax_cross_entropy_with_logits

# We have included it here for illustration only, please don't use it.
def compute_cross_entropy(logits, y):
    """Average softmax cross-entropy between ``logits`` and target rows ``y``.

    Args:
        logits: unnormalised class scores, one row per example.
        y: target distribution per example (one-hot labels in this notebook),
            same shape as ``logits``.

    Returns:
        Scalar tensor: the mean over examples of
        ``-sum_k y[k] * log(softmax(logits)[k])``.
    """
    # Kept so the graph still exposes the predicted probabilities under the
    # op name 'y_pred'; the loss itself no longer reads from it.
    y_pred = tf.nn.softmax(logits, name='y_pred')

    # Compute log-probabilities directly instead of log(softmax(.)): once a
    # probability underflows to 0 the two-step form gives log(0) = -inf, and
    # the product with a zero label entry (0 * -inf) turns the loss into NaN.
    log_y_pred = tf.nn.log_softmax(logits)

    # Cross-entropy per example, then the average across all the examples.
    per_example = -tf.reduce_sum(y * log_y_pred, axis=[1])
    return tf.reduce_mean(per_example)

def compute_accuracy(logits, y):
    """Fraction of examples whose arg-max logit matches the arg-max of ``y``.

    Args:
        logits: per-class scores, one row per example.
        y: label rows in the same layout (true class = argmax along axis 1).

    Returns:
        Scalar float32 tensor with the mean of the per-example matches.
    """
    predicted = tf.argmax(logits, 1, name='pred_class')
    actual = tf.argmax(y, 1, name='true_class')
    # boolean matches -> 0./1. so that their mean is the accuracy
    hits = tf.equal(predicted, actual)
    return tf.reduce_mean(tf.cast(hits, tf.float32))

In [13]:
# Pick which case to run for the SGD demonstration.
# batch_size == 0 selects the full-gradient run; any other value is the
# minibatch size and becomes part of the log directory name.
batch_size = 100
dir_name = ('logs/scratch02x/full_grad' if batch_size == 0
            else 'logs/scratch02x/sgd{}'.format(batch_size))

In [14]:
# Train the logistic regression model for 101 steps, with either the full
# gradient (batch_size == 0) or minibatches, logging summaries to dir_name.
with tf.Graph().as_default():
    # We build the model here as before: placeholders -> logits -> loss / accuracy
    x = tf.placeholder(tf.float32, [None, 784], name='x')
    y = tf.placeholder(tf.float32, [None, 10], name='y')

    logits = compute_logits(x)
    loss = compute_cross_entropy(logits=logits, y=y)
    accuracy = compute_accuracy(logits, y)

    # plain gradient descent with a fixed step size of 0.5
    opt = tf.train.GradientDescentOptimizer(0.5)
    train_step = opt.minimize(loss)

    # create summary for loss and accuracy
    tf.summary.scalar('loss', loss)
    tf.summary.scalar('accuracy', accuracy)
    # create summary for logits
    tf.summary.histogram('logits', logits)
    # create summary for input image (rows of x reshaped to 28x28x1 images)
    tf.summary.image('input', tf.reshape(x, [-1, 28, 28, 1]))

    summary_op = tf.summary.merge_all()

    with tf.Session() as sess:
        summary_writer = tf.summary.FileWriter(dir_name, sess.graph)
        try:
            sess.run(tf.global_variables_initializer())

            for i in range(101):
                # choose batch
                if batch_size==0:
                    # full gradient: every step sees the whole training set
                    # (swap in X_train / y_train to use the 5000-example subset)
                    X_batch = mnist.train.images #X_train
                    y_batch = mnist.train.labels #y_train
                else:
                    batch = mnist.train.next_batch(batch_size)
                    X_batch = batch[0]
                    y_batch = batch[1]

                # now run one optimisation step and collect the summaries for it
                _ , summary = sess.run((train_step, summary_op),
                                          feed_dict={x: X_batch, y: y_batch})

                # write the summary output to file
                summary_writer.add_summary(summary, i)

                # print diagnostics
                print(".", end='', flush=True)
                if i%100 == 0:
                    # NOTE: despite the name this holds the *accuracy* on the training subset
                    train_error = sess.run(accuracy, {x: X_train, y: y_train})
                    print("\rAfter step {0:3d}, training accuracy {1:0.4f}".format(i, train_error), flush=True)
                if i%1000 == 0:
                    # NOTE: despite the name this holds the *accuracy* on the test set
                    test_error = sess.run(accuracy, {x:mnist.test.images, y:mnist.test.labels})
                    print("\rAfter step {0:3d}, test accuracy {1:0.4f}".format(i, test_error), flush=True)
        finally:
            # The writer was previously never closed.  close() flushes pending
            # events to disk and releases the event file, even if a training
            # step raises part-way through.
            summary_writer.close()


After step   0, training accuracy 0.2556
After step   0, test accuracy 0.2374
After step 100, training accuracy 0.8852............................................................


### A simple MSE example of SGD being unstable...

In [10]:
# simpler set up to demonstrate risk of SGD
# choose case to run to demonstrate SGD; 0 corresponds to full grad
batch_size = 10
decay_rate = 1
if batch_size==0:
    dir_name = 'logs/scratch02x_ex/full_grad'
else:
    dir_name = 'logs/scratch02x_ex/sgd{}_decay{}'.format(batch_size,decay_rate)

# toy data: n evenly spaced points on [-10, 10], stored as an (n, 1) column
n = 21
X = np.reshape(np.linspace(-10,10,n), [n,1])

with tf.Graph().as_default():
    # a single scalar parameter, started at -10
    w = tf.get_variable('w', shape=[1], initializer=tf.constant_initializer(-10))
    x = tf.placeholder(tf.float32, [None,1], name='x')
    # step counter incremented by minimize(); drives the learning-rate decay
    global_step = tf.Variable(0, trainable=False)

    # mean of 0.5*(x - w)^2 over the fed batch
    mse = tf.reduce_mean(0.5*(x - w)**2)
    # w has a single element, so this is just w as a scalar for the summary
    theta = tf.reduce_mean(w)

    # step size 0.5 / (1 + decay_rate * step): decay_steps is 1
    learning_rate = tf.train.inverse_time_decay(0.5, global_step, 1, decay_rate)
    opt = tf.train.GradientDescentOptimizer(learning_rate)
    train_step = opt.minimize(mse, global_step=global_step)

    tf.summary.scalar('mse', mse)
    tf.summary.scalar('theta', theta )
    tf.summary.histogram('foo', x)

    summary_op = tf.summary.merge_all()

    with tf.Session() as sess:
        summary_writer = tf.summary.FileWriter(dir_name, sess.graph)
        try:
            sess.run(tf.global_variables_initializer())
            for i in range(101):
                if batch_size==0:
                    # full gradient: use every data point
                    X_iter = X
                else:
                    # minibatch: batch_size row indices drawn uniformly (with
                    # replacement) from 0..n-1
                    batch = np.floor(np.random.rand(batch_size)*n).astype(int)
                    X_iter = X[batch]

                # lo is the batch MSE for this step, kept for interactive inspection
                _, lo, summary = sess.run((train_step, mse, summary_op), feed_dict={x: X_iter})
                # write the summary output to file
                summary_writer.add_summary(summary, i)
        finally:
            # The writer was previously never closed.  close() flushes pending
            # events to disk and releases the event file, even if a step raises.
            summary_writer.close()
