In [40]:
import tensorflow as tf
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
from datetime import datetime

# Background:
- This notebook is just for playing around a bit with tensorflow to retrain that muscle memory I had from working with TF at The Data Incubator

# Basics of sessions 

In [2]:
#create variables in tensorflow
x = tf.Variable(3,name='x')
y = tf.Variable(4,name='y')

#create the computation graph
f = x*x*y + y + 2

#create a tf session

sess = tf.Session()

#initialize session variables
sess.run(x.initializer)
sess.run(y.initializer)
result = sess.run(f)
print(result)
#close the session
sess.close()

42


In [3]:
#We can avoid repeating sess.run:
with tf.Session() as sess:
    #with block sets sess as default session
    
    #below becomes equivalent to tf.get_default_session().run(x.initializer)
    x.initializer.run()  
    y.initializer.run()
    #below becomes equivalent to tf.get_default_session().run(f)
    result = f.eval()
#session is automatically closed at end of with block

In [4]:
#use global variables initializer instead of doing all by hand
init = tf.global_variables_initializer() #prep an init node

with tf.Session() as sess:
    
    init.run()
    results = f.eval()

In [5]:
#alternately can use an interactive session so we don't need a with block, but we need to manually close the session
sess = tf.InteractiveSession()
init.run()
result = f.eval()
print(result)

42


In [6]:
sess.close()

# Graph management

In [7]:
#add node to default graph (this is automatic behavior)
x1 = tf.Variable(1)
x1.graph is tf.get_default_graph()

True

In [8]:
#but we can make and manage independent graphs
graph = tf.Graph()

with graph.as_default():
    x2 = tf.Variable(2)

In [9]:
x2.graph is graph

True

In [10]:
x2.graph is tf.get_default_graph()

False

In [11]:
#we can reset the default graph to clean it up
tf.reset_default_graph()

In [12]:
#Node values are dropped between graph runs
w = tf.constant(3)
x = w + 2
y = x + 5
z = x * 3

with tf.Session() as sess:
    
    print(y.eval()) #tf sees it need x and hence w so it first evaluates w then x then evaluate y
    print(z.eval()) #tf needs x and w again, and will reevaluate w then x then evalute z. x and w will be evaluated 2x
    
#All node values are dropped between graph runs except variable values which are maintained across graphs (constants 
#are not variables).

10
15


In [13]:
#we can evaluate y and z in the same graph though:
with tf.Session() as sess:
    y_val, z_val = sess.run([y,z])
    print(y_val)
    print(z_val)
    
#Variable states in single process tensor flow are not shared across multiple sessions even with the same graph.
#each graph 

10
15


# Computation examples
## Solve normal equations:
given a linear model $y(\mathbf{x},\mathbf{\theta}) = \mathbf{x}\cdot\mathbf{\theta}$ where $\mathbf{\theta} = [\theta_1, \theta_2, ... , \theta_n]$ and $\mathbf{x} = [x_1, x_2,... ,x_n]$ is a feature vector of length n. Given a dataset with k observations and n features organzied into a matrix $X \in \mathbb{R}^{kxn}$ and k target values $\mathbf{y}_{target}$ supoose we wish to find $\hat{\theta}$ which minimizes the mean square error given by

$\begin{equation}
MSE(\theta) \propto \sum_{i = 1} ^k ||\mathbf{y}(X,\theta) - \mathbf{y}_{target}|| ^2
\end{equation}$

where $\mathbf{y}(X,\theta) = X \cdot \theta$ is a matrix of shape $kx1$. Let's minimize the MSE with respect to $\theta$

$\begin{equation}
||\mathbf{y}(X,\theta) - \mathbf{y}_{target}|| * \partial_\theta{\mathbf{y}(X,\mathbf{\theta})} = 0
\end{equation}$

so

$\begin{equation}
||\mathbf{y}(X,\theta) - \mathbf{y}_{target}|| = 0
\end{equation}$

or

$X\mathbf{\theta} = \mathbf{y}_{target} \rightarrow X^tX\mathbf{\theta} = X^t\mathbf{y}_{target}
\rightarrow \mathbf{\theta} = (X^tX)^{-1}X^t\mathbf{y}_{target}$


In [14]:
housing = fetch_california_housing()
m, n = housing.data.shape
housing_data_plus_bias = np.c_[np.ones((m,1)), housing.data]

X = tf.constant(housing_data_plus_bias, dtype = tf.float32, name = 'X')
y = tf.constant(housing.target.reshape(-1,1), dtype = tf.float32, name = 'y')
XT = tf.transpose(X)
theta = tf.matmul(tf.matmul(tf.matrix_inverse(tf.matmul(XT,X)),XT),y)

with tf.Session() as sess:
    
    theta_value = theta.eval()
    
    print(theta_value)

[[-3.7185181e+01]
 [ 4.3633747e-01]
 [ 9.3952334e-03]
 [-1.0711310e-01]
 [ 6.4479220e-01]
 [-4.0338000e-06]
 [-3.7813708e-03]
 [-4.2348403e-01]
 [-4.3721911e-01]]


## Batch gradient descent
This is a numerical approach to minimizing a cost function like MSE. For MSE of the linear regression problem the gradient can be found analytically as

$\begin{equation}
\nabla_\theta MSE(\theta) = \frac{2}{k} X^t(X\mathbf{\theta} - \mathbf{y}_{target})
\end{equation}$

If we take a small step opposite the direction of the gradient, we move decrease the MSE.

$\theta^{(i+1)} = \theta^{(i)} - \eta \nabla_\theta MSE(\theta)$

$\eta$ here is the learning rate and determines the step size in proportion to the gradient size.

In [15]:
n_epochs = 1000
learning_rate = 0.01

scaler = StandardScaler()
scaled_housing = scaler.fit_transform(housing.data)
scaled_housing_data_plus_bias = np.c_[np.ones((m,1)),scaled_housing]

X = tf.constant(scaled_housing_data_plus_bias, dtype= tf.float32, name = 'X')
y = tf.constant(housing.target.reshape(-1,1), dtype = tf.float32, name = 'y')
theta = tf.Variable(tf.random_uniform([n+1,1],-1.0,1.0),name='theta')
y_pred = tf.matmul(X,theta,name='predictions')
error = y_pred - y
mse = tf.reduce_mean(tf.square(error),name='mse')
gradients = 2/m * tf.matmul(tf.transpose(X),error)
training_op = tf.assign(theta, theta - learning_rate * gradients)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    
    sess.run(init)
    
    for epoch in range(n_epochs):
        
        if epoch % 100 == 0:
            
            print('Epoch', epoch, 'MSE=',mse.eval())
            
        sess.run(training_op)
        
    best_theta = theta.eval()

Epoch 0 MSE= 6.0908103
Epoch 100 MSE= 0.77572846
Epoch 200 MSE= 0.62502444
Epoch 300 MSE= 0.59593636
Epoch 400 MSE= 0.5784452
Epoch 500 MSE= 0.5656278
Epoch 600 MSE= 0.556019
Epoch 700 MSE= 0.5487716
Epoch 800 MSE= 0.5432795
Epoch 900 MSE= 0.5390978


In [16]:
best_theta

array([[ 2.0685523 ],
       [ 0.8826072 ],
       [ 0.15565088],
       [-0.31520402],
       [ 0.32459393],
       [ 0.00823164],
       [-0.04357724],
       [-0.58113176],
       [-0.55551445]], dtype=float32)

In [17]:
#Let's implement it pretending we didn't know the form of the gradient 

n_epochs = 1000
learning_rate = 0.01

scaler = StandardScaler()
scaled_housing = scaler.fit_transform(housing.data)
scaled_housing_data_plus_bias = np.c_[np.ones((m,1)),scaled_housing]

X = tf.constant(scaled_housing_data_plus_bias, dtype= tf.float32, name = 'X')
y = tf.constant(housing.target.reshape(-1,1), dtype = tf.float32, name = 'y')
theta = tf.Variable(tf.random_uniform([n+1,1],-1.0,1.0),name='theta')
y_pred = tf.matmul(X,theta,name='predictions')
error = y_pred - y
mse = tf.reduce_mean(tf.square(error),name='mse')
gradients = tf.gradients(mse,[theta])[0]
training_op = tf.assign(theta, theta - learning_rate * gradients)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    
    sess.run(init)
    
    for epoch in range(n_epochs):
        
        if epoch % 100 == 0:
            
            print('Epoch', epoch, 'MSE=',mse.eval())
            
        sess.run(training_op)
        
    best_theta = theta.eval()

Epoch 0 MSE= 13.762668
Epoch 100 MSE= 0.81799835
Epoch 200 MSE= 0.6006053
Epoch 300 MSE= 0.5770048
Epoch 400 MSE= 0.56378
Epoch 500 MSE= 0.5541626
Epoch 600 MSE= 0.5470061
Epoch 700 MSE= 0.54165065
Epoch 800 MSE= 0.5376267
Epoch 900 MSE= 0.53458995


## The above gradient method implements Reverse-mode autodiff 
- Reverse mode auto diff requires $n_{outputs} + 1$ graph traversals gives high accuracy and supports arbitrary code for functions.

Reverse autodiff proceedure:

1) Traverse the graph in the forward direction to calculate the node values.

2) For each output value, traverse the graph backwards making use of the chain rule to build up the derivatives with respect to the inputs via the chain rule. 

## Example:

Suppose $f(x,y) = x^2y + y + 2$ let's consider calculating the derivative at $f(3,4)$. Our graph would have a multiplication node that takes the input node x (value $ = 3$ node $= n_1$) twice resulting in (value $= 9$ node $= n_4$) and combines that output at another node with input y (value $= 4$ node $ = n_2$) to obtain (value $= 36$ node $= n_5$). $n_2$ also combines with a constant node (value $= 2$ node $ = n_3$) at an addition node to obtain (value $= 6$ node $=n_6$). Finally $n_5$ and $n_6$ are combined in an addition node (value $= 42$ node $= n_7$) to give the function $f(x,y)$. Note the values were optained with a single forward pass through the graph.

We may compute the partial derivative with respect to x as follows:

$\begin{equation}
\partial_x f(x,y) = \partial_{n_7} f(x,y) \partial_{x} n_7 =  \partial_{x} n_7\\ 
f(x,y) = n_7\\
\partial_{x} n_7 = \partial_{n_5} n_7 \partial_{x} n_5 + \partial_{n_6} n_7 \partial_{x} n_6 =  \partial_{x} n_5 +  \partial_{x} n_6\\
n_7 = n_5 + n_6\\
\partial_{x} n_6 = \partial_{n_2} n_6 \partial_{x} n_2 + \partial_{n_3} n_6 \partial_{x} n_3= \partial_{x} n_2 + \partial_{x} n_3\\
n_6 = n_2 + n_3\\
\partial_{x} n_3 = 0\\
n_3 = 2\\
\partial_{x} n_2 = 0\\
n_2 = y\\
\partial_{x} n_5 = \partial_{n_2} n_5 \partial_{x} n_2 + \partial_{n_4} n_5 \partial_{x} n_4= n_4 \partial_{x} n_2 + n_2 \partial_{x} n_4\\
n_5 = n_2 n_4\\
\partial_{x} n_4 = \partial_{n_1} n_4 \partial_{x} n_1 = 2 n_1 \partial_{x} n_1 = 6\\
n_1 = x \rightarrow 3\\
\partial_{x} n_2 = 0\\
n_2 = y\\
\end{equation}$

So $\partial_x f(x,y)|_{x=3,y=4} = 1 *(4*6) = 24$.

- This approach is very flexible and accurate and can even handle functions not differentiable everywhere as long as we try to compute it at a differentiable point of the function.
- Note: if implementing a new operation in tensorflow, in order to make it compatible with autodiff, you must provide a function that builds a subgraph to compute 

## Optimizers
- TF also includes some out of the box optimizers so that we don't have to write e.g. our own gradient descent
- all we have to do is swap out the optimizer. (Note I use momentum optimizer below, and it converges a bit faster)

In [18]:
#using an optimizer instead of our own gradient descent

n_epochs = 1000
learning_rate = 0.01

X = tf.constant(scaled_housing_data_plus_bias, dtype=tf.float32, name = 'X')
y = tf.constant(housing.target.reshape(-1,1),dtype=tf.float32,name='y')
theta = tf.Variable(tf.random_uniform([n+1,1],-1.0,1.0),name='theta')
y_pred = tf.matmul(X,theta,name='predictions')
error = y_pred - y
mse = tf.reduce_mean(tf.square(error),name='mse')
optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum = 0.9)
training_op = optimizer.minimize(mse)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    
    sess.run(init)
    
    for epoch in range(n_epochs):
        
        if epoch % 100 == 0:
            
            print('Epoch', epoch, 'MSE', mse.eval())
            
        sess.run(training_op)
        
    best_theta = theta.eval()
        
        

Epoch 0 MSE 10.774582
Epoch 100 MSE 0.5247632
Epoch 200 MSE 0.52432555
Epoch 300 MSE 0.5243211
Epoch 400 MSE 0.524321
Epoch 500 MSE 0.52432096
Epoch 600 MSE 0.52432096
Epoch 700 MSE 0.5243209
Epoch 800 MSE 0.5243209
Epoch 900 MSE 0.5243209


In [19]:
# placeholder example
A = tf.placeholder(tf.float32, shape = (None,3))
B = A + 5

with tf.Session() as sess:
    
    B_val_1 = B.eval(feed_dict = {A:[[1,2,3]]})
    B_val_2 = B.eval(feed_dict = {A:[[4,5,6],[7,8,9]]})

In [20]:
print(B_val_1)

[[6. 7. 8.]]


In [21]:
print(B_val_2)

[[ 9. 10. 11.]
 [12. 13. 14.]]


## implement minibatch 
works the same as batch gradient descent but uses only a random subset of the data for each batch. Runs many small batches over fewer epochs

In [22]:
n_epochs = 10
learning_rate = 0.01

X = tf.placeholder(tf.float32, shape = (None, n+1), name = 'X')
y = tf.placeholder(tf.float32, shape = (None,1),name = 'y')
theta = tf.Variable(tf.random_uniform([n+1,1],-1.0,1.0, seed=42), name = 'theta')
y_pred = tf.matmul(X,theta, name = 'predictions')
error = y - y_pred
mse = tf.reduce_mean(tf.square(error, name = 'mse'))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(mse)

init = tf.global_variables_initializer()

In [23]:
batch_size = 100
n_batches = int(np.ceil(m/batch_size))

In [24]:
def fetch_batch(epoch, batch_index, batch_size):
    
    np.random.seed(epoch*batch_index)
    
    idx = np.random.randint(m,size = batch_size)
    
    X_batch = scaled_housing_data_plus_bias[idx]
    y_batch = housing.target[idx].reshape(-1,1)

    return X_batch,y_batch

In [25]:
with tf.Session() as sess:
    
    sess.run(init)
    
    for epoch in range(n_epochs):
        
        #if epoch % 100 == 0:
            
        #        print('Epoch', epoch, 'MSE', mse.eval())
        
        for batch_index in range(n_batches):
                  
            X_batch, y_batch = fetch_batch(epoch,batch_index,batch_size)
            sess.run(training_op,feed_dict = {X: X_batch, y: y_batch})
    
    best_theta = theta.eval()

In [26]:
best_theta

array([[ 0.9045429 ],
       [ 0.35481548],
       [ 0.5906365 ],
       [ 0.51156354],
       [-0.04808879],
       [ 0.26202965],
       [-0.62795925],
       [-0.7713845 ],
       [-0.32755637]], dtype=float32)

## Saving and Restoring Models

In [34]:
# We save by creating a saver node at the end of the construction phase.
# Then call the save method in the session when we want to save the model.

batch_size =100
n_epochs = 10
n_batches = int(np.ceil(m/batch_size))
learning_rate = 0.01

X = tf.placeholder(tf.float32,shape = (None,n+1),name = 'X')
y = tf.placeholder(tf.float32,shape = (None,1), name = 'y')
theta = tf.Variable(tf.random_uniform([n+1,1],-1.0,1.0, seed = 42),name = 'theta')
y_pred = tf.matmul(X,theta)
error = y-y_pred
mse = tf.reduce_mean(tf.square(error),name = 'mse')
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(mse)

init = tf.global_variables_initializer()

saver = tf.train.Saver()

with tf.Session() as sess:
    
    sess.run(init)
    
    for epoch in range(n_epochs):
        
        for batch_index in range(n_batches):
            
            X_batch, y_batch = fetch_batch(epoch,batch_index,batch_size)
            
            sess.run(train_op,feed_dict={X:X_batch,y:y_batch})
        
        if epoch % 100 == 2:
            
            save_path = saver.save(sess,'./tmp/my_model.ckpt')
            
    best_theta = theta.eval()
    save_path = saver.save(sess, './tmp/my_model_final.ckpt')

In [39]:
best_theta

array([[ 2.0802064 ],
       [ 0.81800103],
       [ 0.12630619],
       [-0.2501979 ],
       [ 0.2781734 ],
       [-0.01142241],
       [-0.01247775],
       [-0.906074  ],
       [-0.8746359 ]], dtype=float32)

In [37]:
# Lets load the model back in
with tf.Session() as sess:
    
    saver.restore(sess,'./tmp/my_model_final.ckpt')
    
    loaded_theta = theta.eval()

INFO:tensorflow:Restoring parameters from ./tmp/my_model_final.ckpt


In [38]:
loaded_theta

array([[ 2.0802064 ],
       [ 0.81800103],
       [ 0.12630619],
       [-0.2501979 ],
       [ 0.2781734 ],
       [-0.01142241],
       [-0.01247775],
       [-0.906074  ],
       [-0.8746359 ]], dtype=float32)

In [43]:
tf.reset_default_graph()

now = datetime.utcnow().strftime('%Y%m%d%H%M%S')
root_logdir = './tf_logs'
logdir = '{}/run-{}'.format(root_logdir,now)

n_epochs = 10
batch_size = 100
learning_rate = 0.01
n_batches = int(np.ceil(m/batch_size))

X = tf.placeholder(tf.float32,shape=(None,n+1),name='X')
y = tf.placeholder(tf.float32,shape=(None,1),name='y')
theta = tf.Variable(tf.random_uniform([n+1,1],-1.0,1.0,seed = 42),name='theta')
y_pred = tf.matmul(X,theta,name='predictions')
error = y - y_pred
mse = tf.reduce_mean(tf.square(error),name='mse')
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(mse)
mse_summary = tf.summary.scalar('MSE',mse)
file_writer = tf.summary.FileWriter(logdir,tf.get_default_graph())

init = tf.global_variables_initializer()
saver = tf.train.Saver()

with tf.Session() as sess:
    
    sess.run(init)
    
    for epoch in range(n_epochs):
        
        for batch_index in range(n_batches):
            
            X_batch, y_batch = fetch_batch(epoch,batch_index,batch_size)
            
            if batch_index % 10 == 0:
                
                summary_str = mse_summary.eval(feed_dict={X:X_batch,y:y_batch})
                
                step = epoch*n_batches + batch_index
                
                file_writer.add_summary(summary_str, step)
            
            sess.run(train_op,feed_dict={X:X_batch,y:y_batch})
            
        
    best_theta = theta.eval()
    

file_writer.close()

Using name scopes to declutter graphs

In [47]:
tf.reset_default_graph()

now = datetime.utcnow().strftime('%Y%m%d%H%M%S')
root_logdir = './tf_logs'
logdir = '{}/run-{}'.format(root_logdir,now)

n_epochs = 10
batch_size = 100
learning_rate = 0.01
n_batches = int(np.ceil(m/batch_size))

X = tf.placeholder(tf.float32,shape=(None,n+1),name='X')
y = tf.placeholder(tf.float32,shape=(None,1),name='y')
theta = tf.Variable(tf.random_uniform([n+1,1],-1.0,1.0,seed = 42),name='theta')
y_pred = tf.matmul(X,theta,name='predictions')
with tf.name_scope('loss') as scope:
    error = y - y_pred
    mse = tf.reduce_mean(tf.square(error),name='mse')
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(mse)
mse_summary = tf.summary.scalar('MSE',mse)
file_writer = tf.summary.FileWriter(logdir,tf.get_default_graph())

init = tf.global_variables_initializer()
saver = tf.train.Saver()

with tf.Session() as sess:
    
    sess.run(init)
    
    for epoch in range(n_epochs):
        
        for batch_index in range(n_batches):
            
            X_batch, y_batch = fetch_batch(epoch,batch_index,batch_size)
            
            if batch_index % 10 == 0:
                
                summary_str = mse_summary.eval(feed_dict={X:X_batch,y:y_batch})
                
                step = epoch*n_batches + batch_index
                
                file_writer.add_summary(summary_str, step)
            
            sess.run(train_op,feed_dict={X:X_batch,y:y_batch})
            
        
    best_theta = theta.eval()
    

file_writer.close()

In [63]:
now = datetime.utcnow().strftime('%Y%m%d%H%M%S')
root_logdir = './tf_logs'
logdir = '{}/run-{}'.format(root_logdir,now)
tf.reset_default_graph()
def relu(X):
    with tf.name_scope('relu'):
        w_shape = (int(X.get_shape()[1]),1)
        w = tf.Variable(tf.random_normal(w_shape),name = 'weights')
        b = tf.Variable(0.0, name='bias')
        z = tf.add(tf.matmul(X,w),b, name ='z')
        return tf.maximum(z,0,name='relu')
    
n_features = 3
X=tf.placeholder(tf.float32,shape=(None,n_features),name='X')
relus = [relu(X) for i in range(5)]
output = tf.add_n(relus)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    
    sess.run(init)
    
    sess.run(output,feed_dict={X:[[1,2,3]]})
    
    tf.summary.FileWriter(logdir,tf.get_default_graph())

In [68]:
now = datetime.utcnow().strftime('%Y%m%d%H%M%S')
root_logdir = './tf_logs'
logdir = '{}/run-{}'.format(root_logdir,now)
tf.reset_default_graph()
def relu(X):

    w_shape = (int(X.get_shape()[1]),1)
    w = tf.Variable(tf.random_normal(w_shape),name = 'weights')
    b = tf.Variable(0.0, name='bias')
    z = tf.add(tf.matmul(X,w),b, name ='z')
    return tf.maximum(z,0,name='relu')

n_features = 3
X=tf.placeholder(tf.float32,shape=(None,n_features),name='X')
relus = [relu(X) for i in range(5)]
output = tf.add_n(relus)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    
    sess.run(init)
    
    final_output = sess.run(output,feed_dict={X:[[1,2,3]]})
    
    tf.summary.FileWriter(logdir,tf.get_default_graph())
                    

In [69]:
final_output

array([[9.165289]], dtype=float32)