# Building a Neural Network in Tensorflow
### Solving a simple XOR problem

In [0]:
import tensorflow as tf
import time

## Model 1

* This model is set up step-by-step and used as a baseline for models 2 - 4

### Setup placeholders

*   Tensorflow will automatically fill them with the data when we run the network
*   In this XOR problem, we have four different training examples and each example has two features
*   There are also four expected outputs, each with just one value (either 0 or 1)



In [0]:
# Network inputs: all four XOR examples, two features each, are fed on every run.
x_ = tf.placeholder(dtype=tf.float32, shape=(4, 2), name='x-input')
# Expected outputs: one target value per example.
y_ = tf.placeholder(dtype=tf.float32, shape=(4, 1), name='y-input')

### Setup parameters for the network

* These are called variables in Tensorflow
* Variables will be modified by Tensorflow during the training steps
* For our theta matrices, we want them initialized to random values between -1 and +1, so we use the built-in random_uniform function to do that

In [0]:
# Weights feeding the hidden layer (2 inputs -> 2 hidden units), drawn uniformly
# between -1 and 1.
Theta1 = tf.Variable(
    tf.random_uniform(shape=[2, 2], minval=-1, maxval=1),
    name="Theta1",
)
# Weights feeding the output layer (2 hidden units -> 1 output), same initialisation.
Theta2 = tf.Variable(
    tf.random_uniform(shape=[2, 1], minval=-1, maxval=1),
    name="Theta2",
)

### Setup bias nodes

*  Bias nodes are set up separately, but still as Variables
*  This lets the algorithms modify the values of the bias nodes
* This is mathematically equivalent to having a signal value of 1 and initial weights of 0 on the links from the bias nodes

In [0]:
# Trainable bias terms, one per unit in the layer they feed; both start at zero.
Bias1 = tf.Variable(initial_value=tf.zeros(shape=[2]), name="Bias1")
Bias2 = tf.Variable(initial_value=tf.zeros(shape=[1]), name="Bias2")

### Tensorflow Model

Tensorflow runs a model inside of a *session*, which it uses to maintain the state of the variables as they pass through the network

*  Matmul is Tensorflow's matrix multiplication function
*  Sigmoid is the sigmoid activation function
*  The cost function is the average over all the training examples
*  The training algorithm used is the gradient descent algorithm with a learning rate of 0.01
*  The training algorithm's objective is to minimize the cost function

In [0]:
# Hidden layer (layer 2): sigmoid of the affine transform of the inputs.
with tf.name_scope("layer2"):
    A2 = tf.sigmoid(tf.matmul(x_, Theta1) + Bias1)

# Output layer (layer 3): keep the pre-activation (Z3) around so the cost can be
# computed from the logits rather than from the already-squashed probabilities.
with tf.name_scope("layer3"):
    Z3 = tf.matmul(A2, Theta2) + Bias2
    Hypothesis = tf.sigmoid(Z3)

# Mean binary cross-entropy over the training examples.
# This is the same quantity as -(y*log(h) + (1-y)*log(1-h)) averaged over the
# batch, but evaluated from the logits: the hand-written form yields
# 0 * log(0) = NaN as soon as the sigmoid saturates to exactly 0 or 1.
with tf.name_scope("cost"):
    cost = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(labels=y_, logits=Z3))

# Plain gradient descent with a learning rate of 0.01.
with tf.name_scope("train"):
    train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cost)

# XOR truth table: inputs and their expected outputs.
XOR_X = [[0, 0], [0, 1], [1, 0], [1, 1]]
XOR_Y = [[0], [1], [1], [0]]

init = tf.global_variables_initializer()
sess = tf.Session()

# Uncomment to write the graph out for inspection in TensorBoard.
#writer = tf.summary.FileWriter("./logs/xor_logs", sess.graph)

# Assign the initial values to every Variable before training starts.
sess.run(init)

### Training steps

*   Each time the training step is executed, the values in the dictionary "feed_dict" are loaded into the placeholders 
*   As the XOR problem is simple, each epoch will contain the entire training set
*  To see what's going on inside the loop, just print the values of the Variables

In [6]:
# Train Model 1 on the full XOR set and log the network state periodically.
NUM_STEPS = 100000   # total number of training steps
LOG_EVERY = 1000     # print the network state every LOG_EVERY steps

# The whole training set is fed on every step, so the feed dict never changes.
feed = {x_: XOR_X, y_: XOR_Y}

# time.clock() was removed in Python 3.8; perf_counter() is the supported
# high-resolution timer for measuring elapsed time.
t_start = time.perf_counter()
for i in range(NUM_STEPS):
    sess.run(train_step, feed_dict=feed)
    if i % LOG_EVERY == 0:
        # Fetch everything in a single run call instead of six separate ones;
        # no training step happens in between, so the values are the same.
        hyp_val, theta1_val, bias1_val, theta2_val, bias2_val, cost_val = sess.run(
            [Hypothesis, Theta1, Bias1, Theta2, Bias2, cost], feed_dict=feed)
        print('Epoch ', i)
        print('Hypothesis ', hyp_val)
        print('Theta1 ', theta1_val)
        print('Bias1 ', bias1_val)
        print('Theta2 ', theta2_val)
        print('Bias2 ', bias2_val)
        print('cost ', cost_val)
t_end = time.perf_counter()
print('Elapsed time ', t_end - t_start)

Epoch  0
Hypothesis  [[0.43824816]
 [0.3980671 ]
 [0.4864669 ]
 [0.44081098]]
Theta1  [[ 0.75088984 -0.59059376]
 [ 0.39410627  0.9964763 ]]
Bias1  [ 6.3200285e-05 -4.6624002e-05]
Theta2  [[ 0.38002536]
 [-0.877795  ]]
Bias2  [0.00059361]
cost  0.699921
Epoch  1000
Hypothesis  [[0.49284208]
 [0.46040598]
 [0.5420136 ]
 [0.50363797]]
Theta1  [[ 0.766465   -0.57335395]
 [ 0.44432032  1.0140898 ]]
Bias1  [0.0284321  0.04139753]
Theta2  [[ 0.47723562]
 [-0.78650117]]
Bias2  [0.1307458]
cost  0.6918734
Epoch  2000
Hypothesis  [[0.49301705]
 [0.46258795]
 [0.5422611 ]
 [0.504325  ]]
Theta1  [[ 0.7806317  -0.55485165]
 [ 0.4934432   1.0464029 ]]
Bias1  [0.04629835 0.10360561]
Theta2  [[ 0.49484852]
 [-0.7767083 ]]
Bias2  [0.1273696]
cost  0.6910097
Epoch  3000
Hypothesis  [[0.49211153]
 [0.46353695]
 [0.5424705 ]
 [0.50463533]]
Theta1  [[ 0.7988815  -0.54680574]
 [ 0.5464299   1.0815223 ]]
Bias1  [0.06534696 0.16392256]
Theta2  [[ 0.51686794]
 [-0.7757771 ]]
Bias2  [0.12117802]
cost  0.690111

## Model 2

Substitute the hyperbolic tangent (tanh) activation function for the sigmoid activation in layer 2

In [7]:
# Model 2: tanh hidden layer, sigmoid output, gradient descent optimizer.

# Placeholders for the four XOR examples and their expected outputs.
x_ = tf.placeholder(tf.float32, shape=[4, 2], name='x-input')
y_ = tf.placeholder(tf.float32, shape=[4, 1], name='y-input')

# Weights start as uniform random values between -1 and 1; biases start at zero.
Theta1 = tf.Variable(tf.random_uniform([2, 2], -1, 1), name="Theta1")
Theta2 = tf.Variable(tf.random_uniform([2, 1], -1, 1), name="Theta2")

Bias1 = tf.Variable(tf.zeros([2]), name="Bias1")
Bias2 = tf.Variable(tf.zeros([1]), name="Bias2")

# Hidden layer (layer 2): tanh activation.
with tf.name_scope("layer2"):
    A2 = tf.tanh(tf.matmul(x_, Theta1) + Bias1)

# Output layer (layer 3): keep the pre-activation (Z3) so the cost can be
# computed from the logits.
with tf.name_scope("layer3"):
    Z3 = tf.matmul(A2, Theta2) + Bias2
    Hypothesis = tf.sigmoid(Z3)

# Mean binary cross-entropy over the training examples, evaluated from the
# logits. The hand-written -(y*log(h) + (1-y)*log(1-h)) form produces
# 0 * log(0) = NaN once the sigmoid saturates to exactly 0 or 1.
with tf.name_scope("cost"):
    cost = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(labels=y_, logits=Z3))

with tf.name_scope("train"):
    train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cost)

# XOR truth table: inputs and their expected outputs.
XOR_X = [[0, 0], [0, 1], [1, 0], [1, 1]]
XOR_Y = [[0], [1], [1], [0]]

init = tf.global_variables_initializer()
sess = tf.Session()

# Uncomment to write the graph out for inspection in TensorBoard.
#writer = tf.summary.FileWriter("./logs/xor_logs", sess.graph)

sess.run(init)

NUM_STEPS = 100000   # total number of training steps
LOG_EVERY = 1000     # print the network state every LOG_EVERY steps

# The whole training set is fed on every step, so the feed dict never changes.
feed = {x_: XOR_X, y_: XOR_Y}

# time.clock() was removed in Python 3.8; perf_counter() is the supported
# high-resolution timer for measuring elapsed time.
t_start = time.perf_counter()
for i in range(NUM_STEPS):
    sess.run(train_step, feed_dict=feed)
    if i % LOG_EVERY == 0:
        # One run call fetches every value that gets printed; no training step
        # happens in between, so this matches fetching them one at a time.
        hyp_val, theta1_val, bias1_val, theta2_val, bias2_val, cost_val = sess.run(
            [Hypothesis, Theta1, Bias1, Theta2, Bias2, cost], feed_dict=feed)
        print('Epoch ', i)
        print('Hypothesis ', hyp_val)
        print('Theta1 ', theta1_val)
        print('Bias1 ', bias1_val)
        print('Theta2 ', theta2_val)
        print('Bias2 ', bias2_val)
        print('cost ', cost_val)
t_end = time.perf_counter()
print('Elapsed time ', t_end - t_start)

Epoch  0
Hypothesis  [[0.49984598]
 [0.5847971 ]
 [0.5301093 ]
 [0.5923024 ]]
Theta1  [[ 0.44006395  0.1013131 ]
 [ 0.818389   -0.22679135]]
Bias1  [-4.6198442e-05  2.1032672e-04]
Theta2  [[ 0.38464987]
 [-0.37569836]]
Bias2  [-0.00051929]
cost  0.6903078
Epoch  1000
Hypothesis  [[0.43973786]
 [0.54971534]
 [0.50675505]
 [0.55955416]]
Theta1  [[ 0.7096743   0.1781341 ]
 [ 0.94072783 -0.11885826]]
Bias1  [0.01005977 0.09657937]
Theta2  [[ 0.54627657]
 [-0.36203325]]
Bias2  [-0.21286458]
cost  0.6693501
Epoch  2000
Hypothesis  [[0.38000372]
 [0.57151604]
 [0.53289914]
 [0.58267474]]
Theta1  [[ 1.0552616   0.2753368 ]
 [ 1.1653717  -0.03517294]]
Bias1  [-0.06056732  0.16268046]
Theta2  [[ 0.8847472 ]
 [-0.41493675]]
Bias2  [-0.36909828]
cost  0.63520426
Epoch  3000
Hypothesis  [[0.2867806 ]
 [0.60667664]
 [0.5624128 ]
 [0.6062222 ]]
Theta1  [[1.434809   0.4039786 ]
 [1.4660288  0.03358523]]
Bias1  [-0.2114637  0.2018269]
Theta2  [[ 1.2875129 ]
 [-0.55133176]]
Bias2  [-0.5330094]
cost  0.5

## Model 3

*   Substitute the Adam Optimizer for the Gradient Descent Optimizer training algorithm 
*   Keep the tanh activation for layer 2

In [8]:
# Model 3: tanh hidden layer, sigmoid output, Adam optimizer.

# Placeholders for the four XOR examples and their expected outputs.
x_ = tf.placeholder(tf.float32, shape=[4, 2], name='x-input')
y_ = tf.placeholder(tf.float32, shape=[4, 1], name='y-input')

# Weights start as uniform random values between -1 and 1; biases start at zero.
Theta1 = tf.Variable(tf.random_uniform([2, 2], -1, 1), name="Theta1")
Theta2 = tf.Variable(tf.random_uniform([2, 1], -1, 1), name="Theta2")

Bias1 = tf.Variable(tf.zeros([2]), name="Bias1")
Bias2 = tf.Variable(tf.zeros([1]), name="Bias2")

# Hidden layer (layer 2): tanh activation.
with tf.name_scope("layer2"):
    A2 = tf.tanh(tf.matmul(x_, Theta1) + Bias1)

# Output layer (layer 3): keep the pre-activation (Z3) so the cost can be
# computed from the logits.
with tf.name_scope("layer3"):
    Z3 = tf.matmul(A2, Theta2) + Bias2
    Hypothesis = tf.sigmoid(Z3)

# Mean binary cross-entropy over the training examples, evaluated from the
# logits. The hand-written -(y*log(h) + (1-y)*log(1-h)) form produces
# 0 * log(0) = NaN once the sigmoid saturates to exactly 0 or 1.
with tf.name_scope("cost"):
    cost = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(labels=y_, logits=Z3))

with tf.name_scope("train"):
    train_step = tf.train.AdamOptimizer(0.01).minimize(cost)

# XOR truth table: inputs and their expected outputs.
XOR_X = [[0, 0], [0, 1], [1, 0], [1, 1]]
XOR_Y = [[0], [1], [1], [0]]

init = tf.global_variables_initializer()
sess = tf.Session()

# Uncomment to write the graph out for inspection in TensorBoard.
#writer = tf.summary.FileWriter("./logs/xor_logs", sess.graph)

sess.run(init)

NUM_STEPS = 100000   # total number of training steps
LOG_EVERY = 1000     # print the network state every LOG_EVERY steps

# The whole training set is fed on every step, so the feed dict never changes.
feed = {x_: XOR_X, y_: XOR_Y}

# time.clock() was removed in Python 3.8; perf_counter() is the supported
# high-resolution timer for measuring elapsed time.
t_start = time.perf_counter()
for i in range(NUM_STEPS):
    sess.run(train_step, feed_dict=feed)
    if i % LOG_EVERY == 0:
        # One run call fetches every value that gets printed; no training step
        # happens in between, so this matches fetching them one at a time.
        hyp_val, theta1_val, bias1_val, theta2_val, bias2_val, cost_val = sess.run(
            [Hypothesis, Theta1, Bias1, Theta2, Bias2, cost], feed_dict=feed)
        print('Epoch ', i)
        print('Hypothesis ', hyp_val)
        print('Theta1 ', theta1_val)
        print('Bias1 ', bias1_val)
        print('Theta2 ', theta2_val)
        print('Bias2 ', bias2_val)
        print('cost ', cost_val)
t_end = time.perf_counter()
print('Elapsed time ', t_end - t_start)

Epoch  0
Hypothesis  [[0.49516347]
 [0.6471397 ]
 [0.3932538 ]
 [0.5562207 ]]
Theta1  [[-0.9014865  -0.22460361]
 [ 0.94062406  0.7037214 ]]
Bias1  [-0.00999995 -0.00999985]
Theta2  [[0.42385703]
 [0.5108524 ]]
Bias2  [-0.00999989]
cost  0.7161104
Epoch  1000
Hypothesis  [[0.00289546]
 [0.99711835]
 [0.49889576]
 [0.50116134]]
Theta1  [[-4.739171   4.366158 ]
 [ 2.646672   2.8392463]]
Bias1  [-1.683154 -0.76683 ]
Theta2  [[3.4141288]
 [3.6901593]]
Bias2  [-0.2749523]
cost  0.34915403
Epoch  2000
Hypothesis  [[7.5966050e-04]
 [9.9919826e-01]
 [4.9970758e-01]
 [5.0034696e-01]]
Theta1  [[-5.556321   5.180324 ]
 [ 2.8504236  3.1153576]]
Bias1  [-1.7813692 -0.8594941]
Theta2  [[4.0368967]
 [4.366113 ]]
Bias2  [-0.3288474]
cost  0.3472839
Epoch  3000
Hypothesis  [[3.1823188e-04]
 [9.9965549e-01]
 [4.9987724e-01]
 [5.0015336e-01]]
Theta1  [[-6.0854707  5.7021127]
 [ 2.972627   3.2692668]]
Bias1  [-1.8416506  -0.91292655]
Theta2  [[4.448776 ]
 [4.8015203]]
Bias2  [-0.3525721]
cost  0.3468774
E

## Model 4

* Return to using the sigmoid activation function for layer 2
*  Continue using the Adam Optimizer learning algorithm

In [9]:
# Model 4: sigmoid hidden layer, sigmoid output, Adam optimizer.

# Placeholders for the four XOR examples and their expected outputs.
x_ = tf.placeholder(tf.float32, shape=[4, 2], name='x-input')
y_ = tf.placeholder(tf.float32, shape=[4, 1], name='y-input')

# Weights start as uniform random values between -1 and 1; biases start at zero.
Theta1 = tf.Variable(tf.random_uniform([2, 2], -1, 1), name="Theta1")
Theta2 = tf.Variable(tf.random_uniform([2, 1], -1, 1), name="Theta2")

Bias1 = tf.Variable(tf.zeros([2]), name="Bias1")
Bias2 = tf.Variable(tf.zeros([1]), name="Bias2")

# Hidden layer (layer 2): sigmoid activation.
with tf.name_scope("layer2"):
    A2 = tf.sigmoid(tf.matmul(x_, Theta1) + Bias1)

# Output layer (layer 3): keep the pre-activation (Z3) so the cost can be
# computed from the logits.
with tf.name_scope("layer3"):
    Z3 = tf.matmul(A2, Theta2) + Bias2
    Hypothesis = tf.sigmoid(Z3)

# Mean binary cross-entropy over the training examples, evaluated from the
# logits. The hand-written -(y*log(h) + (1-y)*log(1-h)) form produces
# 0 * log(0) = NaN once the sigmoid saturates to exactly 0 or 1.
with tf.name_scope("cost"):
    cost = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(labels=y_, logits=Z3))

with tf.name_scope("train"):
    train_step = tf.train.AdamOptimizer(0.01).minimize(cost)

# XOR truth table: inputs and their expected outputs.
XOR_X = [[0, 0], [0, 1], [1, 0], [1, 1]]
XOR_Y = [[0], [1], [1], [0]]

init = tf.global_variables_initializer()
sess = tf.Session()

# Uncomment to write the graph out for inspection in TensorBoard.
#writer = tf.summary.FileWriter("./logs/xor_logs", sess.graph)

sess.run(init)

NUM_STEPS = 100000   # total number of training steps
LOG_EVERY = 1000     # print the network state every LOG_EVERY steps

# The whole training set is fed on every step, so the feed dict never changes.
feed = {x_: XOR_X, y_: XOR_Y}

# time.clock() was removed in Python 3.8; perf_counter() is the supported
# high-resolution timer for measuring elapsed time.
t_start = time.perf_counter()
for i in range(NUM_STEPS):
    sess.run(train_step, feed_dict=feed)
    if i % LOG_EVERY == 0:
        # One run call fetches every value that gets printed; no training step
        # happens in between, so this matches fetching them one at a time.
        hyp_val, theta1_val, bias1_val, theta2_val, bias2_val, cost_val = sess.run(
            [Hypothesis, Theta1, Bias1, Theta2, Bias2, cost], feed_dict=feed)
        print('Epoch ', i)
        print('Hypothesis ', hyp_val)
        print('Theta1 ', theta1_val)
        print('Bias1 ', bias1_val)
        print('Theta2 ', theta2_val)
        print('Bias2 ', bias2_val)
        print('cost ', cost_val)
t_end = time.perf_counter()
print('Elapsed time ', t_end - t_start)

Epoch  0
Hypothesis  [[0.6909979 ]
 [0.64672303]
 [0.7223264 ]
 [0.6819148 ]]
Theta1  [[ 0.91154903  0.01759423]
 [-0.40655136 -0.5681985 ]]
Bias1  [-0.00999991 -0.00999993]
Theta2  [[0.68796396]
 [0.9498017 ]]
Bias2  [-0.00999998]
cost  0.77023965
Epoch  1000
Hypothesis  [[0.03956076]
 [0.93799907]
 [0.9632074 ]
 [0.03746817]]
Theta1  [[ 7.3222656  6.666449 ]
 [-7.656102  -6.5904126]]
Bias1  [-3.8925066  3.2577145]
Theta2  [[ 7.0580506]
 [-6.512936 ]]
Bias2  [2.9410083]
cost  0.04501138
Epoch  2000
Hypothesis  [[0.00925676]
 [0.98634505]
 [0.9908342 ]
 [0.00885829]]
Theta1  [[ 8.398478  8.125467]
 [-8.780868 -8.086592]]
Bias1  [-4.375178   4.0587482]
Theta2  [[ 9.810077]
 [-9.398983]]
Bias2  [4.4443774]
cost  0.010288686
Epoch  3000
Hypothesis  [[0.00391291]
 [0.99427384]
 [0.996082  ]
 [0.00376104]]
Theta1  [[ 8.949761  8.792685]
 [-9.34984  -8.763931]]
Bias1  [-4.6386366  4.407253 ]
Theta2  [[ 11.47301 ]
 [-11.080085]]
Bias2  [5.297216]
cost  0.004339262
Epoch  4000
Hypothesis  [[0.

## Summary
*  Model 2 proves to be the most accurate network with a cost  of ~0.003 
*  Using tanh for the layer 2 activation function creates an advantage because the layer 2 nodes are not restricted to an output range between 0 and 1 
*  With tanh, the layer can better handle weight and bias inputs with negative values 
*  The output from a layer using tanh will be between -1 and 1. The sigmoid function left in layer 3 can work well with values in this range when trying to produce a binary output. 
*  Tanh is a superior activation in most hidden layers of neural networks. 