## ResourceExhaustedError Case

In [1]:
import sys
# Put an extra directory at the front of the module search path.
# NOTE(review): this absolute path is machine-specific; presumably it is where
# the local `input_data` module imported later in this notebook lives —
# confirm, and prefer a configurable/relative location.
sys.path.insert(0, "/work/04233/sw33286/AIDA-SCRIPTS")

In [2]:
import numpy as np

import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
import input_data
# Read the MNIST archives from the 'MNIST_data' directory via the local
# `input_data` helper. one_hot=True presumably yields one-hot label vectors —
# TODO confirm against input_data.
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
# Number of training examples; the training loop uses it to derive the number
# of mini-batches per epoch and to normalise the reported cost.
n_samples = mnist.train.num_examples

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


In [4]:
def xavier_init(fan_in, fan_out, constant=1):
    """Build a Xavier-style uniform initial value for a weight matrix.

    Args:
        fan_in: first dimension of the matrix (number of input units).
        fan_out: second dimension of the matrix (number of output units).
        constant: multiplier applied to the sampling bound (default 1).

    Returns:
        The float32 tensor produced by `tf.random_uniform` for shape
        (fan_in, fan_out), sampled between -bound and +bound where
        bound = constant * sqrt(6 / (fan_in + fan_out)).
    """
    # https://stackoverflow.com/questions/33640581/how-to-do-xavier-initialization-on-tensorflow
    # The interval is symmetric around zero, so one bound describes both ends.
    bound = constant * np.sqrt(6.0 / (fan_in + fan_out))
    weight_shape = (fan_in, fan_out)
    return tf.random_uniform(weight_shape,
                             minval=-bound,
                             maxval=bound,
                             dtype=tf.float32)

In [7]:
# Build the variational autoencoder graph, its loss and optimizer, and
# initialise all variables in a fresh interactive session.

# Release the session left over from a previous run of this cell, if any.
# tf.reset_default_graph() must not be called while a session is still active,
# and an InteractiveSession that is never closed keeps holding its resources.
try:
    sess.close()
except NameError:
    pass  # first run on a fresh kernel: there is no earlier session to close

tf.reset_default_graph()

# Seed both NumPy and the (new) default graph so runs are repeatable.
np.random.seed(0)
tf.set_random_seed(0)

network_architecture = \
    dict(n_hidden_recog_1=500, # 1st layer encoder neurons
         n_hidden_recog_2=500, # 2nd layer encoder neurons
         n_hidden_gener_1=500, # 1st layer decoder neurons
         n_hidden_gener_2=500, # 2nd layer decoder neurons
         n_input=784, # MNIST data input (img shape: 28*28)
         n_z=20)  # dimensionality of latent space
transfer_fct = tf.nn.softplus
learning_rate = 1e-3
batch_size = 100

## NB ## this is not an efficient way to construct network

# allow_growth makes TensorFlow claim GPU memory on demand instead of
# reserving it all up front.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.InteractiveSession(config=config)


def _affine(inputs, weights, biases):
    """Return the op computing `inputs @ weights + biases`."""
    return tf.add(tf.matmul(inputs, weights), biases)


# self._create_network()

# Input batch; the leading (batch) dimension is left unspecified.
x = tf.placeholder(tf.float32, [None, network_architecture["n_input"]])

# .. self._initialize_weights(**self.network_architecture)

n_hidden_recog_1 = network_architecture['n_hidden_recog_1']
n_hidden_recog_2 = network_architecture['n_hidden_recog_2']
n_hidden_gener_1 = network_architecture['n_hidden_gener_1']
n_hidden_gener_2 = network_architecture['n_hidden_gener_2']
n_input = network_architecture['n_input']
n_z = network_architecture['n_z']

# Weights start from xavier_init samples, biases start at zero.
network_weights = dict()
network_weights['weights_recog'] = {
    'h1': tf.Variable(xavier_init(n_input, n_hidden_recog_1)),
    'h2': tf.Variable(xavier_init(n_hidden_recog_1, n_hidden_recog_2)),
    'out_mean': tf.Variable(xavier_init(n_hidden_recog_2, n_z)),
    'out_log_sigma': tf.Variable(xavier_init(n_hidden_recog_2, n_z))}
network_weights['biases_recog'] = {
    'b1': tf.Variable(tf.zeros([n_hidden_recog_1], dtype=tf.float32)),
    'b2': tf.Variable(tf.zeros([n_hidden_recog_2], dtype=tf.float32)),
    'out_mean': tf.Variable(tf.zeros([n_z], dtype=tf.float32)),
    'out_log_sigma': tf.Variable(tf.zeros([n_z], dtype=tf.float32))}
# NOTE: the generator's 'out_log_sigma' weight and bias are created here but
# are not used anywhere in this cell.
network_weights['weights_gener'] = {
    'h1': tf.Variable(xavier_init(n_z, n_hidden_gener_1)),
    'h2': tf.Variable(xavier_init(n_hidden_gener_1, n_hidden_gener_2)),
    'out_mean': tf.Variable(xavier_init(n_hidden_gener_2, n_input)),
    'out_log_sigma': tf.Variable(xavier_init(n_hidden_gener_2, n_input))}
network_weights['biases_gener'] = {
    'b1': tf.Variable(tf.zeros([n_hidden_gener_1], dtype=tf.float32)),
    'b2': tf.Variable(tf.zeros([n_hidden_gener_2], dtype=tf.float32)),
    'out_mean': tf.Variable(tf.zeros([n_input], dtype=tf.float32)),
    'out_log_sigma': tf.Variable(tf.zeros([n_input], dtype=tf.float32))}

# .. self._recognition_network(network_weights["weights_recog"], network_weights["biases_recog"])
# i.e. q(z|x)

recog_w = network_weights['weights_recog']
recog_b = network_weights['biases_recog']
recog_layer1 = transfer_fct(_affine(x, recog_w['h1'], recog_b['b1']))
recog_layer2 = transfer_fct(_affine(recog_layer1, recog_w['h2'], recog_b['b2']))
z_mean = _affine(recog_layer2, recog_w['out_mean'],
                 recog_b['out_mean']) # [batch, n_z]
z_log_sigma_sq = _affine(recog_layer2, recog_w['out_log_sigma'],
                         recog_b['out_log_sigma']) # [batch, n_z]

# Reparameterisation: z = mean + stddev * eps with eps ~ N(0, 1).
# The noise takes its shape from z_mean at run time, so the graph accepts any
# batch size fed to `x` rather than only `batch_size` rows.
eps = tf.random_normal(tf.shape(z_mean), 0, 1, dtype=tf.float32)
z = tf.add(z_mean, tf.multiply(tf.sqrt(tf.exp(z_log_sigma_sq)), eps))
    # exp{log_sigma_sq} = sigma_sq
    # then sqrt to get stddev

# ... self.x_reconstr_mean = self._generator_network(network_weights["weights_gener"], network_weights["biases_gener"])
# i.e. p(x|z)

gener_w = network_weights['weights_gener']
gener_b = network_weights['biases_gener']
gener_layer1 = transfer_fct(_affine(z, gener_w['h1'], gener_b['b1']))
gener_layer2 = transfer_fct(_affine(gener_layer1, gener_w['h2'], gener_b['b2']))
# Sigmoid keeps every reconstructed pixel value in (0, 1).
x_reconstr_mean = tf.nn.sigmoid(_affine(gener_layer2, gener_w['out_mean'],
                                        gener_b['out_mean']))

# self._create_loss_optimizer()

# Per-example binary cross-entropy between input and reconstruction, summed
# over pixels; the 1e-10 offsets avoid log(0).
reconstr_loss = -tf.reduce_sum(x * tf.log(1e-10 + x_reconstr_mean) +
                               (1-x) * tf.log(1e-10 + 1 - x_reconstr_mean), 1)
    # neg-log-prob of reconstruction (nats required for reconstructing input)
# Per-example KL divergence between the diagonal Gaussian q(z|x) defined by
# (z_mean, z_log_sigma_sq) and a standard normal, summed over latent dims.
latent_loss = -0.5 * tf.reduce_sum(1 + z_log_sigma_sq - tf.square(z_mean) - tf.exp(z_log_sigma_sq), 1)
# Total objective: both terms summed over the batch (not averaged), so its
# magnitude grows with the number of rows fed to `x`.
cost = tf.reduce_sum(reconstr_loss + latent_loss)
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

sess.run(tf.global_variables_initializer())

ResourceExhaustedError: OOM when allocating tensor with shape[784,500]
	 [[Node: random_uniform/mul = Mul[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/gpu:0"](random_uniform/RandomUniform, random_uniform/sub)]]

Caused by op 'random_uniform/mul', defined at:
  File "/opt/apps/gcc4_9/python3/3.5.2/lib/python3.5/runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
  File "/opt/apps/gcc4_9/python3/3.5.2/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/opt/apps/gcc4_9/python3/3.5.2/lib/python3.5/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/opt/apps/gcc4_9/python3/3.5.2/lib/python3.5/site-packages/traitlets/config/application.py", line 596, in launch_instance
    app.start()
  File "/opt/apps/gcc4_9/python3/3.5.2/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "/opt/apps/gcc4_9/python3/3.5.2/lib/python3.5/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/opt/apps/gcc4_9/python3/3.5.2/lib/python3.5/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/opt/apps/gcc4_9/python3/3.5.2/lib/python3.5/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/opt/apps/gcc4_9/python3/3.5.2/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/opt/apps/gcc4_9/python3/3.5.2/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/opt/apps/gcc4_9/python3/3.5.2/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/opt/apps/gcc4_9/python3/3.5.2/lib/python3.5/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/opt/apps/gcc4_9/python3/3.5.2/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/opt/apps/gcc4_9/python3/3.5.2/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "/opt/apps/gcc4_9/python3/3.5.2/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/opt/apps/gcc4_9/python3/3.5.2/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/opt/apps/gcc4_9/python3/3.5.2/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/opt/apps/gcc4_9/python3/3.5.2/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/opt/apps/gcc4_9/python3/3.5.2/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):
  File "/opt/apps/gcc4_9/python3/3.5.2/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-7-6ef96df71245>", line 37, in <module>
    'h1': tf.Variable(xavier_init(n_input, n_hidden_recog_1)),
  File "<ipython-input-4-007b28eaac23>", line 8, in xavier_init
    dtype=tf.float32)
  File "/opt/apps/gcc4_9/cuda8_0/cudnn5_1/python3_5/tensorflow-gpu/1.0.0/lib/python3.5/site-packages/tensorflow/python/ops/random_ops.py", line 246, in random_uniform
    return math_ops.add(rnd * (maxval - minval), minval, name=name)
  File "/opt/apps/gcc4_9/cuda8_0/cudnn5_1/python3_5/tensorflow-gpu/1.0.0/lib/python3.5/site-packages/tensorflow/python/ops/math_ops.py", line 884, in binary_op_wrapper
    return func(x, y, name=name)
  File "/opt/apps/gcc4_9/cuda8_0/cudnn5_1/python3_5/tensorflow-gpu/1.0.0/lib/python3.5/site-packages/tensorflow/python/ops/math_ops.py", line 1105, in _mul_dispatch
    return gen_math_ops._mul(x, y, name=name)
  File "/opt/apps/gcc4_9/cuda8_0/cudnn5_1/python3_5/tensorflow-gpu/1.0.0/lib/python3.5/site-packages/tensorflow/python/ops/gen_math_ops.py", line 1625, in _mul
    result = _op_def_lib.apply_op("Mul", x=x, y=y, name=name)
  File "/opt/apps/gcc4_9/cuda8_0/cudnn5_1/python3_5/tensorflow-gpu/1.0.0/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 763, in apply_op
    op_def=op_def)
  File "/opt/apps/gcc4_9/cuda8_0/cudnn5_1/python3_5/tensorflow-gpu/1.0.0/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 2395, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/opt/apps/gcc4_9/cuda8_0/cudnn5_1/python3_5/tensorflow-gpu/1.0.0/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1264, in __init__
    self._traceback = _extract_stack()

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[784,500]
	 [[Node: random_uniform/mul = Mul[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/gpu:0"](random_uniform/RandomUniform, random_uniform/sub)]]


In [6]:
%%time

training_epochs = 10
display_step = 5

# The number of mini-batches per epoch depends only on the dataset size and
# the batch size, so it is worked out once before the epoch loop.
total_batch = int(n_samples / batch_size)

for epoch in range(training_epochs):
    avg_cost = 0.
    for step in range(total_batch):
        # [batch_size, mnist-img-dim], [batch_size, mnist-lb-dim]; labels are discarded
        batch_xs, _ = mnist.train.next_batch(batch_size)
        # Run one optimizer update and fetch the cost of this mini-batch.
        opt, c = sess.run([optimizer, cost], feed_dict={x:batch_xs})
        # Weight each batch cost by batch_size / n_samples and accumulate.
        avg_cost = avg_cost + c / n_samples * batch_size
    # Report only on epochs whose zero-based index is a multiple of display_step.
    if epoch % display_step != 0:
        continue
    epoch_label = '%04d' % (epoch+1)
    cost_label = '{:.9f}'.format(avg_cost)
    print('Epoch:', epoch_label, 'cost=', cost_label)

Epoch: 0004 cost= 11147.837139560
Epoch: 0009 cost= 10498.969240057
CPU times: user 28.6 s, sys: 2.67 s, total: 31.2 s
Wall time: 24.9 s
