In [None]:
%matplotlib inline
import matplotlib
import seaborn as sns
matplotlib.rcParams['savefig.dpi'] = 144

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

import matplotlib.pyplot as plt

import time
from datetime import datetime, timedelta

from pylib.draw_nn import draw_neural_net_fig

In [None]:
# Shared TensorFlow session used by the cells below; reset_tf() creates it.
sess = None


def reset_vars():
    """Run the global-variables initializer in the shared session."""
    init_op = tf.global_variables_initializer()
    sess.run(init_op)


def reset_tf():
    """Close the shared session if one is open, reset the default graph,
    and open a new shared session."""
    global sess
    previous = sess
    if previous:
        previous.close()
    tf.reset_default_graph()
    sess = tf.Session()

# Deep Learning and TensorFlow

<!-- requirement: pylib/draw_nn.py -->
<!-- requirement: pylib/__init__.py -->

## TensorFlow

In previous modules we have dealt with relatively simple algorithms and relatively small amounts of data. As projects get larger and more complex, it becomes more important to consider the performance of algorithms. [TensorFlow](https://www.tensorflow.org/) is a computational framework where your data and desired transformations are represented as a graph (a similar but more flexible incarnation of the scikit-learn pipeline). This has several advantages, including optimizations from lazy evaluation and enabling parallel computing.

Let's go through an example workflow. For more details and reference, you can use:
* https://www.tensorflow.org/how_tos/index.html
* http://learningtensorflow.com/index.html

In [None]:
# Lazy evaluation

# These statements only add nodes to the graph; no session has run anything yet.
x = tf.constant([42, 212, 7, 13], name='x')
y = tf.Variable(x ** 2, name='y')
z = tf.Variable([0, 1, 2, 3], name='z')

# Prints the Variable object itself rather than its computed value.
print(y)

In [None]:
# Op that initializes every variable defined so far (y and z).
model = tf.global_variables_initializer()

with tf.Session() as session:
    for i in range(3):
        # Running the initializer again restores the variables' initial values.
        session.run(model)
        # NOTE: in TF 1.x this does not update the variable in place; it rebinds
        # the Python name `z` to a new graph node computing (previous z) + 1.
        z += 1
        print(session.run(z))
    print(session.run(y))

# Note that session.run(y) only computes the part of the graph necessary to calculate y

In [None]:
# A placeholder has no value of its own; data is supplied at run time via feed_dict.

x = tf.placeholder("float", [None, 4])  # float rows of width 4; row count left open
y = -x  # element-wise negation, added to the graph but not evaluated here

x_data = [[1, 2, 3, 4],
          [5, 6, 7, 8]]

with tf.Session() as session:
    result = session.run(y, feed_dict={x: x_data})

print(result)

In [None]:
x = tf.Variable(0., name='x')
threshold = tf.constant(5.)

# Build the comparison and the increment once, outside the loop, so the graph
# does not gain new nodes on every iteration.
below_threshold = tf.less(x, threshold)
increment = tf.assign_add(x, 1.)  # updates the variable and yields its new value

model = tf.global_variables_initializer()

with tf.Session() as session:
    session.run(model)
    # Keep incrementing until x reaches the threshold; prints 1.0 through 5.0.
    while session.run(below_threshold):
        x_value = session.run(increment)
        print(x_value)

In [None]:
# Linear model optimization
x = tf.placeholder("float")
y = tf.placeholder("float")
w = tf.Variable([1.0, 2.0], name="w")  # w[0] is the slope, w[1] the intercept

y_model = x * w[0] + w[1]

# Squared error of the single (x, y) pair fed in at each training step.
residual = y - y_model
error = tf.square(residual)

optimizer = tf.train.GradientDescentOptimizer(0.01)
train_op = optimizer.minimize(error)

model = tf.global_variables_initializer()

with tf.Session() as session:
    session.run(model)
    # One random sample from the line y = 2x + 6 per gradient step.
    for step in range(1000):
        x_value = np.random.rand()
        y_value = x_value * 2 + 6
        session.run(train_op, feed_dict={x: x_value, y: y_value})

    w_value = session.run(w)
    slope, intercept = w_value[0], w_value[1]
    print("Predicted model: {a:.3f}x + {b:.3f}".format(a=slope, b=intercept))

# Deep Neural Networks

## What is deep learning?

Deep learning is a branch of machine learning that tries to emulate the biological structure and function of the brain using artificial neural networks. These networks include: 

- Multilayer Perceptron Networks
- Convolutional Neural Networks
- Recurrent Neural Networks

Additionally, these networks are hierarchical or multilayered, enabling them to model high-level abstractions in data. For this reason, deep learning is also called **hierarchical learning**. In this notebook, we will discuss the first type of network.

There are benefits to using hierarchical models. In contrast to the performance of older machine learning algorithms, the performance of deep learning algorithms scales with the amount of data they are trained on -- the more data, the better the model. Consequently, deep learning algorithms typically outperform traditional ones. These models also have the ability to automatically extract features from data in a process called [feature learning](https://en.wikipedia.org/wiki/Feature_learning). This ability eliminates the need for a priori knowledge of the data to construct features, which is particularly useful when dealing with complex data such as images.  

Deep learning has some pretty neat applications. Not only can we classify images with a high degree of accuracy, but we can also use deep learning algorithms to [generate captions](https://research.googleblog.com/2016/09/show-and-tell-image-captioning-open.html), [summarize](https://research.googleblog.com/2016/08/text-summarization-with-tensorflow.html) and [translate](https://research.googleblog.com/2016/09/a-neural-network-for-machine.html) text, [generate audio](https://deepmind.com/blog/wavenet-generative-model-raw-audio/), and [produce art](https://github.com/lengstrom/fast-style-transfer/). 

## The perceptron
The _perceptron_ is a linear decision boundary classifier\* that trains by an iterative learning approach.

The model works as follows:

- **Input**: A data point. This point is transformed into an $n$-length "feature vector" $v$ $\in R^n$, with each element describing the value of that particular feature.
- **Output**: A classification, either -1 or 1.

As mentioned above, the basic perceptron is linear, which means we can represent our model as another $n$-length "weight vector" $w$ - in the image below with input vector $v$, we
 - compute the inner product $<v,w> := u$
 - calculate $f(u)$, where $f$ is the _activation function_ (in a perceptron, the Heaviside step function, as in the diagram below).
 
 to obtain our prediction. It may also be instructive to think of this as matrix multiplication (with $v$ a `1xn` matrix and `w^T` an `nx1` matrix)- when we chain together perceptrons for a neural net, we can in fact represent each layer of the model as a matrix.

![Perceptron](http://i.stack.imgur.com/KUvpQ.png)

\*In fact, one can use [kernel methods](https://en.wikipedia.org/wiki/Kernel_perceptron) (much like with SVMs) to attempt nonlinear classification with perceptrons.

### Activation functions

The fact that the activation function is nonlinear is crucial.  This is what keeps the whole network from just being a linear transformation.  Any non-linearity will do though, so a number of different activation functions have been proposed.  Here are a few:

The first perceptron, a single-layer neural network, designed by Frank Rosenblatt in 1957, used the **Heaviside** or **step** function.  This is essentially equivalent to using a threshold with logistic regression.  While this is fine for predicting a class, it has slope 0 almost everywhere, and therefore is unsuitable for use with gradient descent algorithms.

We have already seen the **sigmoid** function used in logistic regression.  In a sense, it smooths out the step function, allowing a usable gradient in the area near $x = 0$.  Because the function saturates at $\pm\infty$, the gradient goes to zero for large positive or negative inputs.  This can cause optimization algorithms to slow down.

The average output of a sigmoid is 0.5, but it performs best when the average input is 0.  Thus, several layers of sigmoid neurons may push themselves away from optimal behavior.  One solution to this is to use a **tanh** instead.  While the general shape is the same, its range is [-1, 1], so the output will on average be 0.

The tanh will still have trouble with saturation of the signal.  Recently, many researchers have had success with the **rectified linear unit (ReLU)**: $f(x) = \max(0, x)$.  While it might seem to combine the problems of the other functions (non-analytic points, zero derivatives, non-centered output), in practice it tends to be quite successful.

### The XOR problem

Research into artificial neurons dates to the late 1940s, but it was not until 1969 that Marvin Minsky and Seymour Papert pointed out that basic neurons were unable to reproduce the **exclusive-or** (XOR) function.  This boolean function of two boolean variables returns true if exactly one of its inputs is true:

$$ \mathrm{XOR}(0, 0) = \mathrm{XOR}(1, 1) = 0 \ \ \ \ \ \ \mathrm{XOR}(0, 1) = \mathrm{XOR}(1, 0) = 1 $$

Below, we create a related two-class classification problem, with one class clustered about (0, 0) and (1, 1), and the other about (0, 1) and (1, 0).  It would be quite easy to draw a boundary separating the two classes by hand.

In [None]:
# Four Gaussian blobs of 100 points each, centred on the corners of the unit square.
corners = [[0, 0], [1, 1], [0, 1], [1, 0]]
centers = np.repeat(corners, 100, axis=0)

np.random.seed(42)
noise = np.random.normal(0, 0.2, (400, 2))
data = noise + centers

# Class 0: the blobs at (0, 0) and (1, 1); class 1: the blobs at (0, 1) and (1, 0).
labels = np.repeat([[0], [1]], 200, axis=0)

plt.scatter(data[:, 0], data[:, 1], c=labels, cmap=plt.cm.RdYlBu)
plt.colorbar();

Below, we build a simple logistic classifier for these data.  It takes two features of input and returns a prediction for the probability of being in class 1.

In [None]:
# Diagram of the classifier built next; the list is presumably the layer sizes
# (2 inputs -> 1 output) -- confirm against pylib/draw_nn.py.
draw_neural_net_fig([2, 1])

In [None]:
reset_tf()

# Inputs: two features per point and one 0/1 label per point.
x = tf.placeholder(tf.float32, shape=[None, 2], name="features")
y_label = tf.placeholder(tf.float32, shape=[None, 1], name="labels")

# A single logistic unit; y holds the logits (pre-sigmoid activations).
W = tf.Variable(tf.zeros([2, 1]), name="weights")
b = tf.Variable(tf.zeros([1]), name="biases")
y = tf.matmul(x, W) + b

# Mean cross-entropy over the batch, minimized by plain gradient descent.
cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(logits=y, labels=y_label)
loss = tf.reduce_mean(cross_entropy)
optimizer = tf.train.GradientDescentOptimizer(0.5)
train = optimizer.minimize(loss)

# Threshold the predicted probability at 0.5 and compare with the labels.
probability = tf.nn.sigmoid(y)
predicted = tf.cast(probability > 0.5, np.float32)
is_correct = tf.equal(predicted, y_label)
accuracy = tf.reduce_mean(tf.cast(is_correct, np.float32))

Let's train it for a little. 

In [None]:
reset_vars()

# Full-batch gradient descent; report [loss, accuracy] every 30 steps.
for i in range(300):
    sess.run(train, feed_dict={x: data, y_label: labels})
    if i % 30 == 0:
        # print() with a single argument behaves the same on Python 2 and 3.
        print(sess.run([loss, accuracy],
                       feed_dict={x: data, y_label: labels}))

That's a bit odd.  The model seems to get stuck on 52% accuracy.  More notably, the entropy has barely improved at all.

Let's visualize the predictions.  Note that the color scale only covers a portion of the range.

In [None]:
# Predicted class-1 probability for every training point, plus the fitted parameters.
pred_labels = sess.run(tf.nn.sigmoid(y), feed_dict={x: data, y_label: labels})
ww, bb = sess.run([W, b])

# The colour range is narrowed to 0.45-0.55 so small differences stay visible.
plt.scatter(data[:, 0], data[:, 1], c=pred_labels, cmap=plt.cm.RdYlBu,
            vmin=0.45, vmax=0.55)

# Decision boundary p = 0.5, i.e. ww[0] * x0 + ww[1] * x1 + bb = 0, solved for x1.
xx = np.linspace(-1, 2, 100)
slope = -ww[0] / ww[1]
intercept = -bb / ww[1]
yy = slope * xx + intercept
plt.plot(xx, yy, 'k')

plt.axis((-1, 2, -1, 2))
plt.colorbar();

Now the problem becomes apparent: The model is returning nearly a 50% probability for each observation.  Therefore the entropy is stuck at just about $-\log\frac12 \approx 0.6931$.

We can draw a line at $p = 0.5$ to separate the two classes.  From the logistic function, we know this is equivalent to $x\cdot W + b = 0$.  Thus, our model is attempting to draw a straight line through the plane to separate the two classes.  (In the general case, logistic regression separates the classes with a $(n-1)$-D hyperplane in $n$-D space.)  No line can do that in this case, so the model falls back to guessing 50% for each.

We might wonder why the line wasn't chosen to separate one cluster from the other three.  This could have given us an accuracy approaching 0.75.  But remember that we are optimizing entropy, which is based on the probability estimates, not accuracy.  Because the probability grows the further we go from the threshold line, the penalty for the one cluster on the wrong side of the line would outweigh the gains from the two clusters put fully on the right side.

## Multilayer Perceptron

### Hidden Layers

Let's try to combine these artificial neurons into a more complex configuration.  We'll make a network with a single **hidden layer** of size two.  That is, we will have two logistic regressions whose outputs are not visible.  Instead, they are fed into a third, visible neuron, whose output we use.

In [None]:
# Diagram of the network built next; the list is presumably the layer sizes
# (2 inputs -> 2 hidden units -> 1 output) -- confirm against pylib/draw_nn.py.
draw_neural_net_fig([2, 2, 1])

The math behind this isn't as bad as it might seem at first.  All of the weights of the neurons in the hidden layer can be combined into a single $2\times2$ matrix $W^{(1)}$.  The final neuron's weights will be in a $2\times1$ matrix $W^{(2)}$.  The biases behave similarly.  Then our final probabilistic prediction is just

$$ p_j = f_2\bigg( f_1\left( X_{ji} W^{(1)}_{ik} + b^{(1)}_k \right) W^{(2)}_k + b^{(2)} \bigg)$$

We are using the Einstein notation: All repeated indices are implicitly summed over.  Both $f_1$ and $f_2$ represent the logistic function, which is taken to operate element-wise over tensors.

The **backpropagation** algorithm, developed by Paul Werbos in 1975, points out that we can use gradient descent (or similar algorithms) to optimize all of the parameters in these sorts of expressions.  All it takes is successive applications of the chain rule.  In fact, there's nothing special we have to do to make use of it: TensorFlow's optimizers automatically work through the successive derivatives to generate the update rules.  All we have to do is set up the calculation:

In [None]:
hidden_size = 2

# Hidden layer: `hidden_size` logistic units fed by the two input features.
# Random (seeded) initial weights, zero biases.
W1 = tf.Variable(tf.random_normal([2, hidden_size], seed=42), name="weight1")
b1 = tf.Variable(tf.zeros([hidden_size]), name="bias1")
hidden_logits = tf.matmul(x, W1) + b1
hidden = tf.nn.sigmoid(hidden_logits, name="hidden")

# Output layer: one unit reading the hidden activations; y holds its logits.
W2 = tf.Variable(tf.random_normal([hidden_size, 1], seed=24), name="weight2")
b2 = tf.Variable(tf.zeros([1]), name="bias2")
y = tf.matmul(hidden, W2) + b2

# Same loss, optimizer and accuracy definitions as for the single-unit model.
cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(logits=y, labels=y_label)
loss = tf.reduce_mean(cross_entropy)
optimizer = tf.train.GradientDescentOptimizer(0.5)
train = optimizer.minimize(loss)

probability = tf.nn.sigmoid(y)
predicted = tf.cast(probability > 0.5, np.float32)
is_correct = tf.equal(predicted, y_label)
accuracy = tf.reduce_mean(tf.cast(is_correct, np.float32))

And let's run it.  We need a few more steps to get all of the weights well-trained.

In [None]:
reset_vars()

# Full-batch gradient descent; report [loss, accuracy] every 300 steps.
for i in range(3000):
    sess.run(train, feed_dict={x: data, y_label: labels})
    if i % 300 == 0:
        # print() with a single argument behaves the same on Python 2 and 3.
        print(sess.run([loss, accuracy],
                       feed_dict={x: data, y_label: labels}))

We can verify the improved accuracy by examining our predictions.  This time, we plot the probability as a background field, so we can compare the actual labels in marker colors.

In [None]:
# Evaluate the network on a 100 x 100 grid covering [-1, 2] x [-1, 2].
# np.column_stack needs a real sequence of arrays (current NumPy rejects a
# generator), hence the list comprehension.
grid_axis = np.r_[-1:2:100j]
mesh = np.column_stack([a.reshape(-1) for a in np.meshgrid(grid_axis, grid_axis)])
ymesh = sess.run(tf.nn.sigmoid(y), feed_dict={x: mesh})

# Predicted probability as the background, true labels as marker colours.
plt.imshow(ymesh.reshape(100, 100), cmap=plt.cm.RdYlBu, origin='lower',
           extent=(-1, 2, -1, 2), vmin=0, vmax=1)
plt.scatter(data[:, 0], data[:, 1], c=labels, cmap=plt.cm.RdYlBu,
            edgecolor='w', lw=1)
plt.axis((-1, 2, -1, 2))
plt.colorbar();

How is that stripe created?  We can get some understanding by looking at the weights of the hidden layer.

In [None]:
ww1 = sess.run(W1)
# Hidden-layer activations at every point of the evaluation mesh.
hmesh = sess.run(hidden, feed_dict={x: mesh})

# One panel per hidden unit. The mesh spans [-1, 2] on both axes, so the image
# extent must use the same limits for the axis labels to be correct.
for i in range(hidden_size):
    plt.subplot(1, hidden_size, i + 1)
    plt.imshow(hmesh[:, i].reshape((100, 100)), origin='lower', cmap=plt.cm.RdYlBu,
               extent=(-1, 2, -1, 2))

Now we can see how each hidden neuron is just defining a line through the feature space.  Each line defines one side of the strip.

Note that the colors are inverted from the final probabilities.  We can understand what's going on by examining the weights in the second layer:

In [None]:
# Fetch the current values of the output-layer weights W2; as the last
# expression in the cell, the result is displayed.
sess.run(W2)

## Example: Classifying handwritten digits

In [None]:
# Load MNIST through the TensorFlow tutorial helper, using /tmp/data as the data
# directory (presumably downloading the files if absent -- confirm).
# one_hot=True: each label is a 10-element indicator vector.
mnist = input_data.read_data_sets('/tmp/data', one_hot=True)

### Minibatch processing

**Minibatch processing** trains on a "minibatch", a subset of the whole data, at a time.  This increases the accuracy of the gradient calculation, reducing the noise in each step, while still avoiding much of the duplication in the whole set.  Minibatches can often be about as fast as individual steps&mdash;the parallelism available in modern CPUs and especially GPUs is wasted if we are calculating a single row at a time.

A batch size of 100 is usually enough to smooth out the noise.  In our case, the full data is not so much larger than the batch size, so there is little difference in performance. We call 1 full iteration over the training set an **epoch**.

In [None]:
# Hyper-parameters for the MNIST example.
N_PIXELS = 28 * 28     # input features per image (28 x 28 pixels, flattened)
BATCH_SIZE = 100       # rows requested per training minibatch
LEARNING_RATE = 0.5    # intended gradient-descent step size

hidden_size = 64       # number of units in the hidden layer

### Initializing Weights and Biases

As a reminder, we want to initialize our weights with random values to break symmetry between neurons in a hidden layer. Additionally, we want to choose small values to avoid the **vanishing gradient problem**, where the weighted sum of the inputs (plus a bias) falls on the flat portion of the sigmoid curve. What is the proper scale of the weights?  Most of our activation functions have their best response for inputs of $\mathcal O(1)$.  If we have $m$ random inputs, each of $\mathcal O(1)$, we expect their sum to scale as $\sqrt m$.  Therefore, weights are often chosen randomly with a mean of zero and standard deviation of $1/\sqrt m$.

For very large layers, this gives rather small weights.  An alternative approach is to only provide $k < m$ non-zero weights when initializing neurons.  This scheme, known as **sparse initialization**, provides more diversity amongst the neurons at initialization.  It can, however, also produce very slow convergence as "incorrect" choices of non-zero weights have to be removed.

In the code below, we initialize our weights by sampling from a truncated normal distribution, where any weight more than 2 standard deviations from the mean is re-picked. We also initialize the biases to zero.

In [None]:
def initializer(shape):
    """Return a truncated-normal tensor of the given shape.

    The standard deviation is 1 / sqrt(shape[0]), i.e. it shrinks with the
    size of the first dimension.
    """
    fan_in = shape[0]
    return tf.truncated_normal(shape, stddev=fan_in ** -0.5)

In [None]:
reset_tf()

# Inputs: flattened images and one-hot labels over the 10 digit classes.
x = tf.placeholder(tf.float32, shape=[None, N_PIXELS], name="pixels")
y_label = tf.placeholder(tf.float32, shape=[None, 10], name="labels")

# Hidden layer: sigmoid units with scaled random weights and zero biases.
W1 = tf.Variable(initializer([N_PIXELS, hidden_size]), name="weights")
b1 = tf.Variable(tf.zeros([hidden_size]), name="biases")
hidden_logits = tf.matmul(x, W1) + b1
hidden = tf.nn.sigmoid(hidden_logits)

# Output layer: one logit per digit class.
W2 = tf.Variable(initializer([hidden_size, 10]), name="weights2")
b2 = tf.Variable(tf.zeros([10]), name="biases2")
y = tf.matmul(hidden, W2) + b2

In [None]:
# Mean softmax cross-entropy between the logits and the one-hot labels.
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_label)
loss = tf.reduce_mean(cross_entropy)
# Use the configured step size instead of repeating the literal 0.5.
train = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(loss)
# Fraction of rows whose highest-scoring class matches the label.
is_correct = tf.equal(tf.argmax(y, 1), tf.argmax(y_label, 1))
accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))

reset_vars()
for i in range(10000):
    batch_x, batch_y = mnist.train.next_batch(BATCH_SIZE)
    sess.run(train,
             feed_dict={x: batch_x, y_label: batch_y})
    if i % 1000 == 0:
        # Report [loss, accuracy] on the full test and training sets.
        # Single-argument print() gives the same output on Python 2 and 3.
        print("Test:  {}".format(sess.run([loss, accuracy],
                                          feed_dict={x: mnist.test.images,
                                                     y_label: mnist.test.labels})))
        print("Train: {}".format(sess.run([loss, accuracy],
                                          feed_dict={x: mnist.train.images,
                                                     y_label: mnist.train.labels})))

In [None]:
# Index of the highest-scoring class for each input row.
prediction = tf.argmax(y, 1)

def predict(idx):
    """Return the predicted class for test image `idx`.

    The image is fed as a one-row batch, so the result is a length-1 array.
    """
    image = mnist.test.images[idx]
    return sess.run(prediction, feed_dict={x: [image]})

idx = 0
actual = np.argmax(mnist.test.labels[idx])
# predict() returns a length-1 array; take its only element before %d-formatting
# rather than relying on implicit array-to-scalar conversion.
print("Predicted: %d, Actual: %d" % (predict(idx)[0], actual))
plt.imshow(mnist.test.images[idx].reshape((28, 28)), cmap=plt.cm.gray_r);

*Copyright &copy; 2017 The Data Incubator.  All rights reserved.*