In [5]:
import tensorflow as tf
import numpy as np
import math
import timeit
import matplotlib.pyplot as plt
%matplotlib inline
from cnn import Network
from datetime import datetime
import os
import config
# os.environ['CUDA_VISIBLE_DEVICES'] = '2'

In [3]:
from cs231n.data_utils import load_CIFAR10

def get_CIFAR10_data(num_training=49000, num_validation=1000, num_test=10000):
    """
    Read CIFAR-10 from disk and return mean-centred train/val/test splits.

    The first `num_training` training samples become the training split, the
    following `num_validation` training samples become the validation split,
    and the first `num_test` test samples become the test split. The mean
    image of the training split is subtracted from all three splits.

    Returns:
        (X_train, y_train, X_val, y_val, X_test, y_test)
    """
    # Raw arrays exactly as stored on disk.
    raw_X_train, raw_y_train, raw_X_test, raw_y_test = load_CIFAR10(
        'cs231n/datasets/cifar-10-batches-py')

    # Index ranges selecting each split; indexing with them yields copies,
    # so the in-place centring below never touches the raw arrays.
    train_idx = range(num_training)
    val_idx = range(num_training, num_training + num_validation)
    test_idx = range(num_test)

    X_train, y_train = raw_X_train[train_idx], raw_y_train[train_idx]
    X_val, y_val = raw_X_train[val_idx], raw_y_train[val_idx]
    X_test, y_test = raw_X_test[test_idx], raw_y_test[test_idx]

    # Centre every split on the mean image of the training split only.
    mean_image = np.mean(X_train, axis=0)
    for split in (X_train, X_val, X_test):
        split -= mean_image

    return X_train, y_train, X_val, y_val, X_test, y_test


# Load the preprocessed splits once; later cells read these module-level arrays.
X_train, y_train, X_val, y_val, X_test, y_test = get_CIFAR10_data()

# Report the shape of every split, one line per array.
print('\n'.join(
    '{} {}'.format(caption, split.shape)
    for caption, split in (
        ('Train data shape: ', X_train),
        ('Train labels shape: ', y_train),
        ('Validation data shape: ', X_val),
        ('Validation labels shape: ', y_val),
        ('Test data shape: ', X_test),
        ('Test labels shape: ', y_test),
    )
))

Train data shape:  (49000, 32, 32, 3)
Train labels shape:  (49000,)
Validation data shape:  (1000, 32, 32, 3)
Validation labels shape:  (1000,)
Test data shape:  (10000, 32, 32, 3)
Test labels shape:  (10000,)


In [6]:
# Flags read by main():
#   checkpoint - when not None, main() restores a model from that directory
#                under 'checkpoints' instead of initializing fresh variables.
#   network    - passed straight to Network(...) in main().
# NOTE(review): re-running this cell in the same kernel presumably fails
# because each flag would be registered twice - confirm for the installed
# TensorFlow version.
FLAGS = tf.flags.FLAGS
tf.flags.DEFINE_string('checkpoint', None, 'Whether use a pre-trained checkpoint, default None.')
tf.flags.DEFINE_string('network', 'CNN', 'Network type seleted from BP, RBF and CNN, default CNN.')

In [7]:
def main():
    """
    Train the network selected by FLAGS.network, evaluate it, and save it.

    Reads the module-level arrays X_train, y_train, X_val, y_val, X_test and
    y_test, plus config.num_epochs, config.batch_size and config.keep_prob.

    Workflow:
      1. Resolve the checkpoint directory: the one named by FLAGS.checkpoint
         (restore mode) or a new timestamped directory under 'checkpoints'.
      2. Build the Network, create a session, and either restore the latest
         checkpoint or initialize all variables.
      3. For each epoch, shuffle the training set, run one optimizer step per
         mini-batch, then measure accuracy on the validation set.
      4. Measure accuracy on the test set and save the model.

    Raises:
        IOError: FLAGS.checkpoint is set but no checkpoint exists in the
            resolved directory.
    """
    current_time = datetime.now().strftime('%Y%m%d-%H%M')
    checkpoint_dir = 'checkpoints'
    if FLAGS.checkpoint is not None:
        # str.lstrip() removes a *set of characters*, not a prefix, so it
        # would also eat leading letters of the run name (e.g. 'this').
        # Strip the literal 'checkpoints/' prefix instead.
        checkpoint_name = FLAGS.checkpoint
        prefix = checkpoint_dir + '/'
        if checkpoint_name.startswith(prefix):
            checkpoint_name = checkpoint_name[len(prefix):]
        checkpoint_path = os.path.join(checkpoint_dir, checkpoint_name)
    else:
        checkpoint_path = os.path.join(checkpoint_dir, current_time)
        try:
            os.makedirs(checkpoint_path, exist_ok=True)
        except OSError:
            # Best effort: report and carry on; saving will fail later if the
            # directory really is unusable.
            print('Unable to make checkpoints direction: %s' % checkpoint_path)
    model_save_path = os.path.join(checkpoint_path, 'model.ckpt')

    nn = Network(FLAGS.network)

    # Created after the network so it covers every variable the graph defines.
    saver = tf.train.Saver()
    print('Build session.')
    tfconfig = tf.ConfigProto()
    tfconfig.gpu_options.allow_growth = True
    sess = tf.Session(config=tfconfig)
    train_writer = None
    test_writer = None

    try:
        if FLAGS.checkpoint is not None:
            print('Restore from pre-trained model.')
            latest_checkpoint = tf.train.latest_checkpoint(checkpoint_path)
            if latest_checkpoint is None:
                raise IOError('No checkpoint found in: %s' % checkpoint_path)
            # Restore into the graph that was just built. Importing the meta
            # graph here would add a second copy of the model and leave the
            # variables used by `nn` uninitialized.
            saver.restore(sess, latest_checkpoint)
            # Local variables are not stored in checkpoints.
            sess.run(tf.local_variables_initializer())
            # Checkpoints are written as '<path>/model.ckpt-<step>'; take the
            # text after the last dash so dashes in the directory name are
            # harmless.
            step = int(latest_checkpoint.rsplit('-', 1)[-1])
        else:
            print('Initialize.')
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            step = 0

        train_writer = tf.summary.FileWriter('logs/train@' + current_time, sess.graph)
        # Receives the held-out (validation) summaries written once per epoch.
        test_writer = tf.summary.FileWriter('logs/test@' + current_time, sess.graph)
        summary_op = tf.summary.merge_all()

        print('Start training:')
        train_len = len(y_train)
        for epoch in range(config.num_epochs):
            # Fresh shuffle every epoch.
            permutation = np.random.permutation(train_len)
            X_train_data = X_train[permutation]
            y_train_data = y_train[permutation]

            # Slicing past the end is safe, so the final (possibly shorter)
            # batch needs no clipping and no sample is dropped.
            for batch_start in range(0, train_len, config.batch_size):
                batch_stop = batch_start + config.batch_size
                X_train_batch = X_train_data[batch_start:batch_stop]
                y_train_batch = y_train_data[batch_start:batch_stop]

                loss, _, train_accuracy, summary = sess.run(
                    [nn.loss, nn.optimizer, nn.accuracy, summary_op],
                    {nn.X_inputs: X_train_batch, nn.y_inputs: y_train_batch,
                     nn.keep_prob: config.keep_prob, nn.training: True})
                print('>> At step %i: loss = %.2f, train accuracy = %.3f%%'
                      % (step, loss, train_accuracy * 100))
                train_writer.add_summary(summary, step)
                step += 1

            accuracy, summary = sess.run(
                [nn.accuracy, summary_op],
                {nn.X_inputs: X_val, nn.y_inputs: y_val, nn.keep_prob: 1.0, nn.training: False})
            print('For epoch %i: valid accuracy = %.2f%%\n' % (epoch, accuracy * 100))
            test_writer.add_summary(summary, epoch)

        accuracy = sess.run(
            nn.accuracy,
            {nn.X_inputs: X_test, nn.y_inputs: y_test, nn.keep_prob: 1.0, nn.training: False})
        # Parenthesize: '%' binds tighter than '*', so without the parentheses
        # the formatted string would be repeated 100 times.
        print('Test accuracy = %.2f%%\n' % (accuracy * 100))

        save_path = saver.save(sess, model_save_path, global_step=step)
        print('Model saved in file: %s' % save_path)
    finally:
        # Release the session and flush event files even if training fails.
        if train_writer is not None:
            train_writer.close()
        if test_writer is not None:
            test_writer.close()
        sess.close()

In [8]:
# Run the full train / evaluate / save pipeline when this code executes as the
# top-level module.
if __name__ == '__main__':
    main()

Build session.
Initialize.


ResourceExhaustedError: OOM when allocating tensor with shape[32768,10]
	 [[Node: dense/kernel/Adam/Assign = Assign[T=DT_FLOAT, _class=["loc:@dense/kernel"], use_locking=true, validate_shape=true, _device="/job:localhost/replica:0/task:0/gpu:0"](dense/kernel/Adam, dense/kernel/Adam/Initializer/zeros)]]

Caused by op 'dense/kernel/Adam/Assign', defined at:
  File "/home1/wyk/conda/envs/python3/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home1/wyk/conda/envs/python3/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/traitlets/config/application.py", line 592, in launch_instance
    app.start()
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 405, in start
    ioloop.IOLoop.instance().start()
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/zmq/eventloop/ioloop.py", line 162, in start
    super(ZMQIOLoop, self).start()
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/tornado/ioloop.py", line 883, in start
    handler_func(fd_obj, events)
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 260, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 212, in dispatch_shell
    handler(stream, idents, msg)
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 370, in execute_request
    user_expressions, allow_stdin)
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 175, in do_execute
    shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2902, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3006, in run_ast_nodes
    if self.run_code(code, result):
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3066, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-8-2436fc2ab63a>", line 2, in <module>
    main()
  File "<ipython-input-7-260422b8c5dd>", line 14, in main
    nn = Network(FLAGS.network)
  File "/home1/wyk/work/cs231n-assignment2/cnn.py", line 23, in __init__
    self.loss, self.optimizer = self.optimize(self.logits, self.labels)
  File "/home1/wyk/work/cs231n-assignment2/cnn.py", line 71, in optimize
    self.learning_rate, config.beta1, config.beta2).minimize(loss, global_step=self.global_step)
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/tensorflow/python/training/optimizer.py", line 325, in minimize
    name=name)
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/tensorflow/python/training/optimizer.py", line 446, in apply_gradients
    self._create_slots([_get_variable_for(v) for v in var_list])
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/tensorflow/python/training/adam.py", line 132, in _create_slots
    self._zeros_slot(v, "m", self._name)
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/tensorflow/python/training/optimizer.py", line 766, in _zeros_slot
    named_slots[_var_key(var)] = slot_creator.create_zeros_slot(var, op_name)
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/tensorflow/python/training/slot_creator.py", line 174, in create_zeros_slot
    colocate_with_primary=colocate_with_primary)
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/tensorflow/python/training/slot_creator.py", line 146, in create_slot_with_initializer
    dtype)
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/tensorflow/python/training/slot_creator.py", line 66, in _create_slot_var
    validate_shape=validate_shape)
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 1065, in get_variable
    use_resource=use_resource, custom_getter=custom_getter)
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 962, in get_variable
    use_resource=use_resource, custom_getter=custom_getter)
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 367, in get_variable
    validate_shape=validate_shape, use_resource=use_resource)
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 352, in _true_getter
    use_resource=use_resource)
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 725, in _get_single_variable
    validate_shape=validate_shape)
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/tensorflow/python/ops/variables.py", line 199, in __init__
    expected_shape=expected_shape)
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/tensorflow/python/ops/variables.py", line 320, in _init_from_args
    validate_shape=validate_shape).op
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/tensorflow/python/ops/state_ops.py", line 274, in assign
    validate_shape=validate_shape)
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/tensorflow/python/ops/gen_state_ops.py", line 43, in assign
    use_locking=use_locking, name=name)
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
    op_def=op_def)
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 2630, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/home1/wyk/conda/envs/python3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1204, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[32768,10]
	 [[Node: dense/kernel/Adam/Assign = Assign[T=DT_FLOAT, _class=["loc:@dense/kernel"], use_locking=true, validate_shape=true, _device="/job:localhost/replica:0/task:0/gpu:0"](dense/kernel/Adam, dense/kernel/Adam/Initializer/zeros)]]
