# 第5章 MNIST数字识别问题

## 5.1 MNIST数据处理

- TensorFlow对MNIST数据集做了封装

In [1]:
from tensorflow.examples.tutorials.mnist import input_data

mnist = input_data.read_data_sets("./", one_hot=True)

print "Training data size: ", mnist.train.num_examples

print "Validating data size: ", mnist.validation.num_examples

print "Testing data size: ", mnist.test.num_examples

print "Example training data: ", mnist.train.images[0]

print "Example training data label: ", mnist.train.labels[0]

Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
Instructions for updating:
Please write your own downloading logic.
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting ./train-images-idx3-ubyte.gz
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting ./train-labels-idx1-ubyte.gz
Instructions for updating:
Please use tf.one_hot on tensors.
Extracting ./t10k-images-idx3-ubyte.gz
Extracting ./t10k-labels-idx1-ubyte.gz
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
Training data size:  55000
Validating data size:  5000
Testing data size:  10000
Example training data:  [0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.        

- 处理后的每一张图片是一个长度为784的一维数组，这个数组中的元素对应了图片像素矩阵中的每一个数字（28 * 28=784），因为神经网络的输入是一个特征向量，所以在此把一张二维图像的像素矩阵放到一个一维数组中可以方便TensorFlow将图片的像素矩阵提供给神经网络的输入层
- 为了方便使用随机梯度下降，`input_data.read_data_sets`函数生成的类还提供了`mnist.train.next_batch`函数，它可以从所有的训练数据中读取一小部分作为一个训练batch

In [2]:
batch_size = 100
xs, ys = mnist.train.next_batch(batch_size)
print "X shape: ", xs.shape
print "Y shape: ", ys.shape

X shape:  (100, 784)
Y shape:  (100, 10)


## 5.2 神经网络模型训练及不同模型结果对比

### 5.2.1 TensorFlow训练神经网络

In [3]:
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

# MNIST数据集相关的常数
INPUT_NODE = 784 # 输入层节点数
OUTPUT_NODE = 10 # 输出层节点数

# 配置神经网络的参数
LAYER1_NODE = 500            # 隐藏层节点数（这里我们仅使用一个隐藏层）
BATCH_SIZE = 100             # 一个训练batch中的训练数据个数
LEARNING_RATE_BASE = 0.8     # 基础的学习率
LEARNING_RATE_DECAY = 0.99   # 学习率的衰减率
REGULARIZATION_RATE = 0.0001 # 正则项的系数
TRAINING_STEPS = 3000       # 训练轮数
#TRAINING_STEPS = 30000       # 训练轮数
MOVING_AVERAGE_DECAY = 0.99  # 滑动平均衰减

# 定义前向传播函数，采用ReLU激活函数
def inference(input_tensor, avg_class, weights1, biases1, weights2, biases2):
    # 区分是否有滑动平均
    if avg_class == None:
        layer1 = tf.nn.relu(tf.matmul(input_tensor, weights1) + biases1)
        return tf.matmul(layer1, weights2) + biases2
    else:
        layer1 = tf.nn.relu(tf.matmul(input_tensor, avg_class.average(weights1)) + avg_class.average(biases1))
        return tf.matmul(layer1, avg_class.average(weights2)) + avg_class.average(biases2)

# 模型训练
def train(mnist):
    x = tf.placeholder(tf.float32, [None, INPUT_NODE], name='x-input')
    y_ = tf.placeholder(tf.float32, [None, OUTPUT_NODE], name='y-input')
    
    # 生成隐藏层参数
    weights1 = tf.Variable(tf.truncated_normal([INPUT_NODE, LAYER1_NODE], stddev=0.1))
    biases1 = tf.Variable(tf.constant(0.1, shape=[LAYER1_NODE]))
    # 生成输出层参数
    weights2 = tf.Variable(tf.truncated_normal([LAYER1_NODE, OUTPUT_NODE], stddev=0.1))
    biases2 = tf.Variable(tf.constant(0.1, shape=[OUTPUT_NODE]))
    
    # 计算在当前参数下神经网络前向传播的结果
    y = inference(x, None, weights1, biases1, weights2, biases2)
    
    # 定义存储训练轮数的变量，并指定为不可训练的参数
    global_step = tf.Variable(0, trainable=False)
    
    # 初始化滑动平均类
    variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
    
    # 在所有代表神经网络参数的变量上使用滑动平均，tf.trainable_variables返回的就是图上集合GraphKeys.TRAINABLE_VARIABLES中的元素，
    # 这个集合的元素就是所有没有指定trainable=False的参数
    variables_averages_op = variable_averages.apply(tf.trainable_variables())
    
    # 计算使用了滑动平均之后的前向传播结果；第4章介绍过滑动平均不会改变变量本身的取值，而是会维护一个影子变量来记录其滑动平均值，
    # 所以当需要这个滑动平均值时，需要明确调用average函数
    average_y = inference(x, variable_averages, weights1, biases1, weights2, biases2)
    
    # 计算交叉熵损失函数；标准答案是一个长度为10的一维数组，而该函数需要提供的是一个正确答案的数字，
    # 所以需要使用tf.argmax函数来得到正确答案对应的类别编号
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y, labels=tf.argmax(y_, 1))
    
    # 计算在当前batch中所有样例的交叉熵平均值
    cross_entropy_mean = tf.reduce_mean(cross_entropy)
    
    # 计算L2正则化损失函数
    regularizer = tf.contrib.layers.l2_regularizer(REGULARIZATION_RATE)
    
    # 计算模型的正则化损失，一般只计算神经网络边上权重的正则化损失，而不使用偏置项
    regularization = regularizer(weights1) + regularizer(weights2)
    
    # 总损失等于交叉熵损失和正则化损失的和
    loss = cross_entropy_mean + regularization
    
    # 设置指数衰减的学习率
    learning_rate = tf.train.exponential_decay(LEARNING_RATE_BASE, global_step, mnist.train.num_examples / BATCH_SIZE, 
                                              LEARNING_RATE_DECAY)
    
    # 使用梯度下降来优化损失函数
    train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
    
    # 在训练神经网络模型时，每过一遍数据既需要通过反向传播来更新神经网络中的参数，又要更新每一个参数的滑动平均值，为了一次完成多个操作，
    # TensorFlow提供了tf.control_dependencies和tf.group两种机制
    with tf.control_dependencies([train_step, variables_averages_op]):
        train_op = tf.no_op(name='train')
    
    # 计算模型在一组数据上的正确率
    correct_prediction = tf.equal(tf.argmax(average_y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    
    # 初始化会话并开始训练过程
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        # 准备验证数据，一般在神经网络的训练过程中会通过验证数据来大致判断停止的条件和评判训练的效果
        validate_feed = {x: mnist.validation.images, y_: mnist.validation.labels}
        
        # 准备测试数据
        test_feed = {x: mnist.test.images, y_: mnist.test.labels}
        
        # 迭代地训练神经网络
        for i in range(TRAINING_STEPS):
            # 每1000轮输出一次在验证数据集上的测试结果
            if i % 1000 == 0:
                # 计算滑动平均模型在验证数据上的结果，因为MNIST数据集比较小，所以一次可以处理所有的验证数据，而不用划分为更小的batch
                validate_acc = sess.run(accuracy, feed_dict=validate_feed)
                print "After %d training step(s), validation accuracy using average model is %g" % (i, validate_acc)

            # 产生这一轮使用的一个batch的训练数据，并运行训练过程
            xs, ys = mnist.train.next_batch(BATCH_SIZE)
            sess.run(train_op, feed_dict={x: xs, y_: ys})
        
        # 训练结束后，在测试数据上检测神经网络模型的最终正确率
        test_acc = sess.run(accuracy, feed_dict=test_feed)
        print "After %d training step(s), test accuracy using average model is %g" % (TRAINING_STEPS, test_acc)
        
def main(argv=None):
    mnist = input_data.read_data_sets("./", one_hot=True)
    train(mnist)

# TensorFlow提供的一个主程序入口，tf.app.run会调用上面定义的main函数
if __name__ == "__main__":
    tf.app.run()

Extracting ./train-images-idx3-ubyte.gz
Extracting ./train-labels-idx1-ubyte.gz
Extracting ./t10k-images-idx3-ubyte.gz
Extracting ./t10k-labels-idx1-ubyte.gz
Instructions for updating:
Colocations handled automatically by placer.
After 0 training step(s), validation accuracy using average model is 0.0728
After 1000 training step(s), validation accuracy using average model is 0.9766
After 2000 training step(s), validation accuracy using average model is 0.9806
After 3000 training step(s), test accuracy using average model is 0.9831


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


## 5.3 变量管理

- 第4章介绍了通过tf.Variable函数来创建一个变量，除了tf.Variable函数，TensorFlow还提供了tf.get_variable函数来创建或者获取变量，当后者用于创建变量时，它和tf.Variable的功能基本等价

In [4]:
import tensorflow as tf
# 下面两个定义等价
v = tf.get_variable("v", shape=[1], initializer=tf.constant_initializer(1.0)) # 还可以使用别的initializer
v = tf.Variable(tf.constant(1.0, shape=[1]), name='v')

- 对于tf.get_variable函数，变量名称是一个必填的参数，不能创建同名参数
- 通过tf.variable_scope函数可以控制tf.get_variable函数获取已经创建过的变量

In [5]:
# 在名为foo的命名空间内创建名字为var的变量
with tf.variable_scope("foo"):
    v = tf.get_variable("var", [1], initializer=tf.constant_initializer(1.0))

# 因为在命名空间foo中已经存在名为var的变量，所以以下代码报错
#with tf.variable_scope("foo"):
#    v = tf.get_variable("var", [1])
    
# 将参数reuse设置为True，这样tf.get_variable函数将直接获取已经声明的变量
with tf.variable_scope("foo", reuse=True):
    v1 = tf.get_variable("var", [1])
    print v == v1
    
# 将参数reuse设置为True时，tf.variable_score将只能获取已经创建过的变量，因为在命名空间bar中还没有创建变量var，所以以下代码报错
#with tf.Variable_scope("bar", reuse=True):
#    v = tf.get_variable("var", [1])

True


- 以上例子说明，当tf.variable_scope函数使用参数reuse=True生成上下文管理器时，这个上下文管理器内所有的tf.get_variable函数会直接获取已经创建的变量，如果变量不存在，则报错；相反，如果tf.variable_scope函数使用参数reuse=None或reuse=False创建上下文管理器，tf.get_variable操作将创建新的变量，如果同名变量已存在，则报错

- tf.variable_scope是可以嵌套的，下面的例子说明了嵌套时reuse参数的取值如何确定

In [6]:
import tensorflow as tf

with tf.variable_scope("root"):
    print tf.get_variable_scope().reuse
    
    with tf.variable_scope("foo", reuse=True):
        print tf.get_variable_scope().reuse
        
        with tf.variable_scope("bar"): # 不指定reuse，这时reuse取值保持和外层一致
            print tf.get_variable_scope().reuse
    print tf.get_variable_scope().reuse

False
True
True
False


- tf.variable_scope生成的上下文管理器也会创建一个TensorFlow中的命名空间，在命名空间内创建的变量名称都会带上这个命名空间名作为前缀

In [7]:
import tensorflow as tf

v1 = tf.get_variable("var1", [1])
print v1.name

with tf.variable_scope("foo"):
    v2 = tf.get_variable("v", [1])
    print v2.name
    
with tf.variable_scope("foo"):
    with tf.variable_scope("bar"):
        v3 = tf.get_variable("v", [1])
        print v3.name
        
    v4 = tf.get_variable("v1", [1])
    print v4.name
    
with tf.variable_scope("", reuse=True):
    v5 = tf.get_variable("foo/bar/v", [1])
    print v5 ==v3
    v6 = tf.get_variable("foo/v1", [1])
    print v6 == v4

var1:0
foo/v:0
foo/bar/v:0
foo/v1:0
True
True


- 通过tf.variable_scope和tf.get_variable函数，以下代码对之前定义的计算前向传播结果的函数做了一些改进

In [8]:
def inference(input_tensor, reuse=False):
    # 定义第一层神经网络的变量和前向传播过程
    with tf.variable_scope('layer1', reuse=reuse):
        # 根据传进来的reuse来判断是创建新变量还是使用已经创建好的；第一个构造网络时需要创建新的变量，
        # 以后每次调用这个函数都直接使用reuse=True就不需要每次将变量传进来了
        weights = tf.get_variable("weights", [INPUT_NODE, LAYER1_NODE], 
                                  initializer=tf.truncated_normal_initializer(stddev=0.1))
        biases = tf.get_variable("biases", [LAYER1_NODE], initializer=tf.constant_initializer(0.0))
        layer1 = tf.nn.relu(tf.matmul(input_tensors, weights) + biases)
        
    # 类似地定义第二层神经网络的变量和前向传播过程
    with tf.variable_scope('layer2', reuse=reuse):
        weights = tf.get_variable("weights", [LAYER1_NODE, OUTPUT_NODE], 
                                 initializer=tf.truncated_normal_initializer(stddev=0.1))
        biases = tf.get_variable("biases", [OUTPUT_NODE], initializer=tf.constant_initializer(0.0))
        layer2 = tf.matmul(layer1, weights) + biases
        
    return layer2

## 5.4 TensorFlow模型持久化

### 5.4.1 持久化代码实现

- TensorFlow提供了一个非常简单的API来保存和还原一个神经网络模型：tf.train.Saver类

In [9]:
import tensorflow as tf

v1 = tf.Variable(tf.constant(1.0, shape=[1]), name='v1')
v2 = tf.Variable(tf.constant(2.0, shape=[1]), name='v2')
result = v1 + v2

init_op = tf.global_variables_initializer()
saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(init_op)
    saver.save(sess, './model.ckpt')

- 加载已保存的TensorFlow模型

In [1]:
import tensorflow as tf

v1 = tf.Variable(tf.constant(1.0, shape=[1]), name='v1')
v2 = tf.Variable(tf.constant(2.0, shape=[1]), name='v2')
result = v1 + v2

init_op = tf.global_variables_initializer()
saver = tf.train.Saver()

with tf.Session() as sess:
    saver.restore(sess, './model.ckpt')
    print sess.run(result)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from ./model.ckpt
[3.]


- 如果不希望重复定义图上的运算，也可以直接加载已经持久化的图

In [1]:
import tensorflow as tf

saver = tf.train.import_meta_graph("./model.ckpt.meta")

with tf.Session() as sess:
    saver.restore(sess, './model.ckpt')
    print sess.run(tf.get_default_graph().get_tensor_by_name("add:0"))

Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from ./model.ckpt


InvalidArgumentError: You must feed a value for placeholder tensor 'x-input' with dtype float and shape [?,784]
	 [[node x-input (defined at <ipython-input-1-89e2628dd6a1>:3) ]]

Caused by op u'x-input', defined at:
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 162, in _run_module_as_main
    "__main__", fname, loader, pkg_name)
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 72, in _run_code
    exec code in run_globals
  File "/Library/Python/2.7/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/Library/Python/2.7/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/Library/Python/2.7/site-packages/ipykernel/kernelapp.py", line 486, in start
    self.io_loop.start()
  File "/Library/Python/2.7/site-packages/tornado/ioloop.py", line 1064, in start
    handler_func(fd_obj, events)
  File "/Library/Python/2.7/site-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/Library/Python/2.7/site-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "/Library/Python/2.7/site-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "/Library/Python/2.7/site-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "/Library/Python/2.7/site-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/Library/Python/2.7/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/Library/Python/2.7/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/Library/Python/2.7/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/Library/Python/2.7/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/Library/Python/2.7/site-packages/ipykernel/zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/Library/Python/2.7/site-packages/IPython/core/interactiveshell.py", line 2714, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/Library/Python/2.7/site-packages/IPython/core/interactiveshell.py", line 2818, in run_ast_nodes
    if self.run_code(code, result):
  File "/Library/Python/2.7/site-packages/IPython/core/interactiveshell.py", line 2878, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-1-89e2628dd6a1>", line 3, in <module>
    saver = tf.train.import_meta_graph("./model.ckpt.meta")
  File "/Library/Python/2.7/site-packages/tensorflow/python/training/saver.py", line 1435, in import_meta_graph
    meta_graph_or_file, clear_devices, import_scope, **kwargs)[0]
  File "/Library/Python/2.7/site-packages/tensorflow/python/training/saver.py", line 1457, in _import_meta_graph_with_return_elements
    **kwargs))
  File "/Library/Python/2.7/site-packages/tensorflow/python/framework/meta_graph.py", line 806, in import_scoped_meta_graph_with_return_elements
    return_elements=return_elements)
  File "/Library/Python/2.7/site-packages/tensorflow/python/util/deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
  File "/Library/Python/2.7/site-packages/tensorflow/python/framework/importer.py", line 442, in import_graph_def
    _ProcessNewOps(graph)
  File "/Library/Python/2.7/site-packages/tensorflow/python/framework/importer.py", line 235, in _ProcessNewOps
    for new_op in graph._add_new_tf_operations(compute_devices=False):  # pylint: disable=protected-access
  File "/Library/Python/2.7/site-packages/tensorflow/python/framework/ops.py", line 3433, in _add_new_tf_operations
    for c_op in c_api_util.new_tf_operations(self)
  File "/Library/Python/2.7/site-packages/tensorflow/python/framework/ops.py", line 3325, in _create_op_from_tf_operation
    ret = Operation(c_op, self)
  File "/Library/Python/2.7/site-packages/tensorflow/python/framework/ops.py", line 1801, in __init__
    self._traceback = tf_stack.extract_stack()

InvalidArgumentError (see above for traceback): You must feed a value for placeholder tensor 'x-input' with dtype float and shape [?,784]
	 [[node x-input (defined at <ipython-input-1-89e2628dd6a1>:3) ]]


- 还可以在保存或者加载时给变量重命名

In [1]:
import tensorflow as tf

v1 = tf.Variable(tf.constant(1.0, shape=[1]), name='other-v1')
v2 = tf.Variable(tf.constant(2.0, shape=[1]), name='other-v2')
result = v1 + v2

saver = tf.train.Saver({"v1": v1, "v2": v2})

with tf.Session() as sess:
    saver.save(sess, './model.ckpt')
    print sess.run(result)

Instructions for updating:
Colocations handled automatically by placer.


FailedPreconditionError: Attempting to use uninitialized value other-v1
	 [[node save/SaveV2 (defined at <ipython-input-1-d19e87786a31>:7) ]]

Caused by op u'save/SaveV2', defined at:
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 162, in _run_module_as_main
    "__main__", fname, loader, pkg_name)
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 72, in _run_code
    exec code in run_globals
  File "/Library/Python/2.7/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/Library/Python/2.7/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/Library/Python/2.7/site-packages/ipykernel/kernelapp.py", line 486, in start
    self.io_loop.start()
  File "/Library/Python/2.7/site-packages/tornado/ioloop.py", line 1064, in start
    handler_func(fd_obj, events)
  File "/Library/Python/2.7/site-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/Library/Python/2.7/site-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "/Library/Python/2.7/site-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "/Library/Python/2.7/site-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "/Library/Python/2.7/site-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/Library/Python/2.7/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/Library/Python/2.7/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/Library/Python/2.7/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/Library/Python/2.7/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/Library/Python/2.7/site-packages/ipykernel/zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/Library/Python/2.7/site-packages/IPython/core/interactiveshell.py", line 2714, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/Library/Python/2.7/site-packages/IPython/core/interactiveshell.py", line 2818, in run_ast_nodes
    if self.run_code(code, result):
  File "/Library/Python/2.7/site-packages/IPython/core/interactiveshell.py", line 2878, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-1-d19e87786a31>", line 7, in <module>
    saver = tf.train.Saver({"v1": v1, "v2": v2})
  File "/Library/Python/2.7/site-packages/tensorflow/python/training/saver.py", line 832, in __init__
    self.build()
  File "/Library/Python/2.7/site-packages/tensorflow/python/training/saver.py", line 844, in build
    self._build(self._filename, build_save=True, build_restore=True)
  File "/Library/Python/2.7/site-packages/tensorflow/python/training/saver.py", line 881, in _build
    build_save=build_save, build_restore=build_restore)
  File "/Library/Python/2.7/site-packages/tensorflow/python/training/saver.py", line 510, in _build_internal
    save_tensor = self._AddSaveOps(filename_tensor, saveables)
  File "/Library/Python/2.7/site-packages/tensorflow/python/training/saver.py", line 210, in _AddSaveOps
    save = self.save_op(filename_tensor, saveables)
  File "/Library/Python/2.7/site-packages/tensorflow/python/training/saver.py", line 124, in save_op
    tensors)
  File "/Library/Python/2.7/site-packages/tensorflow/python/ops/gen_io_ops.py", line 1807, in save_v2
    name=name)
  File "/Library/Python/2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 788, in _apply_op_helper
    op_def=op_def)
  File "/Library/Python/2.7/site-packages/tensorflow/python/util/deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
  File "/Library/Python/2.7/site-packages/tensorflow/python/framework/ops.py", line 3300, in create_op
    op_def=op_def)
  File "/Library/Python/2.7/site-packages/tensorflow/python/framework/ops.py", line 1801, in __init__
    self._traceback = tf_stack.extract_stack()

FailedPreconditionError (see above for traceback): Attempting to use uninitialized value other-v1
	 [[node save/SaveV2 (defined at <ipython-input-1-d19e87786a31>:7) ]]


- 加载时对变量重命名的主要目的之一是方便使用变量的滑动平均值

In [1]:
import tensorflow as tf

v = tf.Variable(0, dtype=tf.float32, name="v")
for variables in tf.global_variables():
    print variables.name
    
ema = tf.train.ExponentialMovingAverage(0.99)
maintain_averages_op = ema.apply(tf.global_variables())

for variables in tf.global_variables():
    print variables.name
    
saver = tf.train.Saver()
with tf.Session() as sess:
    init_op = tf.global_variables_initializer()
    sess.run(init_op)
    
    sess.run(tf.assign(v, 10))
    sess.run(maintain_averages_op)
    saver.save(sess, "./ema.ckpt")
    print sess.run([v, ema.average(v)])

Instructions for updating:
Colocations handled automatically by placer.
v:0
v:0
v/ExponentialMovingAverage:0
[10.0, 0.099999905]


In [2]:
import tensorflow as tf

v = tf.Variable(0, dtype=tf.float32, name='v')
# 通过变量重命名将原来变量v的滑动平均值直接赋值给v
saver = tf.train.Saver({"v/ExponentialMovingAverage": v})
with tf.Session() as sess:
    saver.restore(sess, './ema.ckpt')
    print sess.run(v) # 输出0.099999905，这个值就是原来模型中变量v的滑动平均值

Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from ./ema.ckpt
0.099999905


- 为了方便加载时重命名滑动平均变量，tf.train.ExponentialMovingAverage类提供了variables_to_restore函数来生成tf.train.Saver类所需要的变量重命名字典

In [1]:
import tensorflow as tf

v = tf.Variable(0, dtype=tf.float32, name='v')
ema = tf.train.ExponentialMovingAverage(0.99)

print ema.variables_to_restore()

saver = tf.train.Saver(ema.variables_to_restore())

with tf.Session() as sess:
    saver.restore(sess, "./ema.ckpt")
    print sess.run(v) # 输出0.099999905，这个值就是原来模型中变量v的滑动平均值

Instructions for updating:
Colocations handled automatically by placer.
{u'v/ExponentialMovingAverage': <tf.Variable 'v:0' shape=() dtype=float32_ref>}
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from ./ema.ckpt
0.099999905


- 使用tf.train.Saver会保存运行TensorFlow程序所需的全部信息，然而有时并不需要某些信息；比如在测试或离线预测时，只需要知道如何从神经网络的输入层经过前向传播计算得到输出层即可，而不需要类似变量初始化、模型保存等辅助节点的信息；TensorFlow提供了convert_variables_to_constants函数，通过这个函数可以将计算图中的变量及其取值通过常量方式保存

In [1]:
import tensorflow as tf
from tensorflow.python.framework import graph_util

v1 = tf.Variable(tf.constant(1.0, shape=[1]), name="v1")
v2 = tf.Variable(tf.constant(2.0, shape=[1]), name="v2")
result = v1 + v2

init_op = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init_op)
    # 导出当前计算图的GraphDef部分，只需要这一部分就可以完成从输入层到输出层的计算过程
    graph_def = tf.get_default_graph().as_graph_def()
    
    # 将图中的变量及其取值转化为常量，同时将图中不必要的节点去掉；最后一个参数['add']给出了需要保存的节点名称
    output_graph_def = graph_util.convert_variables_to_constants(sess, graph_def, ['add'])
    with tf.gfile.GFile("./combined_model.pb", "wb") as f:
        f.write(output_graph_def.SerializeToString())

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.compat.v1.graph_util.convert_variables_to_constants
Instructions for updating:
Use tf.compat.v1.graph_util.extract_sub_graph
INFO:tensorflow:Froze 2 variables.
INFO:tensorflow:Converted 2 variables to const ops.


- 通过以下程度可以直接计算定义的加法运算的结果，当只需要得到计算图中某个节点的取值时，这提供了一个更加方便的方法

In [1]:
import tensorflow as tf
from tensorflow.python.platform import gfile

with tf.Session() as sess:
    model_filename = "./combined_model.pb"
    with gfile.FastGFile(model_filename, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    
    result = tf.import_graph_def(graph_def, return_elements=["add:0"])
    print sess.run(result)

Instructions for updating:
Use tf.gfile.GFile.
[array([3.], dtype=float32)]


### 5.4.2 持久化原理及数据格式

- 上一节介绍了当调用saver.save函数时，TensorFlow程序会自动生成4个文件，TensorFlow模型的持久化就是通过这4个文件完成的
- TensorFlow通过元图（MetaGraph）来记录计算图中节点的信息以及运行计算图中节点所需要的元数据；TensorFlow中元图是由MetaGraphDef Protocol Buffer定义的，MetaGraphDef中的内容就构成了TensorFlow持久化时的第一个文件，也即`model.ckpt.meta`
- 除了持久化TensorFlow计算图的结构，持久化TensorFlow中变量的取值也是非常重要的一部分，`model.ckpt.index`和`model.ckpt.data-****-of-****`文件就保存了所有变量的取值；其中model.ckpt.data文件是通过SSTable格式存储的，可以大致理解为就是一个（key, value）列表
- 最后一个文件的名字是固定的，叫checkpoint，它维护了由一个tf.train.Saver类持久化的所有TensorFlow模型文件的文件名，当某个保存的TensorFlow模型文件被删除时，这个模型所对应的文件名也会从checkpoint文件中删除

## 5.5 TensorFlow最佳实践样例程序

- 本节将提供重构之后的程序来解决MNIST问题，重构之后的代码将会被拆成3个程序，第一个是mnist_inference.py，它定义了前向传播的过程以及神经网络中的参数，第二个是mnist_train.py，它定义了神经网络的训练过程，第三个是mnist_eval.py，它定义了测试过程

In [1]:
import tensorflow as tf

# 定义神经网络结构相关的参数
INPUT_NODE = 784
OUTPUT_NODE = 10
LAYER1_NODE = 500

# 通过tf.get_variable函数来获取变量，在训练神经网络时会创建这些变量，在测试时会通过保存的模型加载这些变量的取值
def get_weight_variable(shape, regularizer):
    weights = tf.get_variable("weights", shape, initializer=tf.truncated_normal_initializer(stddev=0.1))
    
    # 当给出了正则化生成函数时，将当前变量的正则化损失加入名字为losses的集合；在这里使用了add_to_collection函数将一个张量加入一个集合，
    # 而这个集合的名称为losses，这是自定义的集合，不在TensorFlow自动管理的集合列表中
    if regularizer != None:
        tf.add_to_collection('losses', regularizer(weights))
    return weights

# 定义神经网络的前向传播过程
def inference(input_tensor, regularizer):
    # 声明第一层神经网络的变量并完成前向传播过程
    with tf.variable_scope('layer1'):
        weights = get_weight_variable([INPUT_NODE, LAYER1_NODE], regularizer)
        biases = tf.get_variable("biases", [LAYER1_NODE], initializer=tf.constant_initializer(0.0))
        layer1 = tf.nn.relu(tf.matmul(input_tensor, weights) + biases)
    
    # 类似的声明第二层神经网络的变量并完成前向传播过程
    with tf.variable_scope('layer2'):
        weights = get_weight_variable([LAYER1_NODE, OUTPUT_NODE], regularizer)
        biases = tf.get_variable("biases", [OUTPUT_NODE], initializer=tf.constant_initializer(0.0))
        layer2 = tf.matmul(layer1, weights) + biases
        
    return layer2

In [3]:
import os
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

# 加载mnist_inference.py中定义的常量和前向传播的函数
#import mnist_inference

# 配置神经网络的参数
BATCH_SIZE = 100
LEARNING_RATE_BASE = 0.8
LEARNING_RATE_DECAY = 0.99
REGULARAZTION_RATE = 0.0001
TRAINING_STEPS = 30000
MOVING_AVERAGE_DECAY = 0.99
# 模型保存的路径和文件名
MODEL_SAVE_PATH = "./"
MODEL_NAME = "mnist_model.ckpt"

def train(mnist):
    # 定义输入输出placeholder
    x = tf.placeholder(tf.float32, [None, INPUT_NODE], name='x-input')
    y_ = tf.placeholder(tf.float32, [None, OUTPUT_NODE], name='y-input')
    regularizer = tf.contrib.layers.l2_regularizer(REGULARAZTION_RATE)
    # 直接使用mnist_inference.py中定义的前向传播过程
    y = inference(x, regularizer)
    global_step = tf.Variable(0, trainable=False)
    
    # 定义损失函数、学习率、滑动平均操作以及训练过程
    variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
    variable_averages_op = variable_averages.apply(tf.trainable_variables())
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y, labels=tf.argmax(y_, 1))
    cross_entropy_mean = tf.reduce_mean(cross_entropy)
    loss = cross_entropy_mean + tf.add_n(tf.get_collection('losses'))
    learning_rate = tf.train.exponential_decay(LEARNING_RATE_BASE, global_step, mnist.train.num_examples / BATCH_SIZE,
                                              LEARNING_RATE_DECAY)
    train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
    with tf.control_dependencies([train_step, variable_averages_op]):
        train_op = tf.no_op(name='train')
        
    # 初始化TensorFlow持久化类
    saver = tf.train.Saver()
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        
        # 在训练过程中不再测试模型在验证数据集上的表现，验证和测试的过程将会有一个独立的程序来完成
        for i in range(TRAINING_STEPS):
            xs, ys = mnist.train.next_batch(BATCH_SIZE)
            _, loss_value, step = sess.run([train_op, loss, global_step], feed_dict={x: xs, y_: ys})
            
            # 每1000轮保存一次模型
            if i % 1000 == 0:
                # 输出当前的训练情况，这里只输出了模型在当前训练batch上的损失函数大小
                print "After %d training step(s), loss on training batch is %g." % (step, loss_value)
                # 保存当前的模型，注意这里给出了global_step参数，这样可以让每个被保存模型的文件名末尾加上训练的轮数
                saver.save(sess, os.path.join(MODEL_SAVE_PATH, MODEL_NAME), global_step=global_step)
                
def main(argv=None):
    mnist = input_data.read_data_sets("./", one_hot=True)
    train(mnist)
    
if __name__ == "__main__":
    # TensorFlow提供的一个主程序入口，tf.app.run会调用上面定义的main函数
    #tf.app.run()
    pass

In [11]:
import time
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

#import mnist_inference
#import mnist_train

# 每10秒加载一次最新的模型，并在测试集上测试最新模型的正确率
EVAL_INTERVAL_SECS = 10

def evaluate(mnist):
    with tf.Graph().as_default() as g:
        # 定义输入输出的格式
        x = tf.placeholder(tf.float32, [None, INPUT_NODE], name='x-input')
        y_ = tf.placeholder(tf.float32, [None, OUTPUT_NODE], name='y-inpit')
        validate_feed = {x: mnist.validation.images, y_: mnist.validation.labels}
        
        # 直接通过调用封装好的函数来计算前向传播的结果，因为测试时不关注正则化损失的值，所以这里用于计算正则化损失的函数被设置为None
        y = inference(x, None)
        
        # 使用前向传播的结果计算正确率，如果需要对未知的样例进行分类，那么使用tf.argmax(y, 1)就可以得到输入样例的预测类别了
        correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        
        # 通过变量重命名的方式来加载模型，这样在前向传播的过程中就不需要调用求滑动平均的函数来获取平均值了
        variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY)
        variables_to_restore = variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)
        
        # 每隔EVAL_INTERVAL_SECS秒调用一次计算正确率的过程以检测训练过程中正确率的变化
        while True:
            with tf.Session() as sess:
                # 通过checkpoint文件自动找到目录中最新模型的文件名
                ckpt = tf.train.get_checkpoint_state(MODEL_SAVE_PATH)
                if ckpt and ckpt.model_checkpoint_path:
                    # 加载模型
                    saver.restore(sess, ckpt.model_checkpoint_path)
                    # 通过文件名得到模型保存时迭代的轮数
                    global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
                    accuracy_score = sess.run(accuracy, feed_dict=validate_feed)
                    print "After %s training step(s), validation accuracy = %g" % (global_step, accuracy_score)
                else:
                    print "No checkpoint file found"
                    return
            time.sleep(EVAL_INTERVAL_SECS)
            
def main(arg=None):
    mnist = input_data.read_data_sets("./", one_hot=True)
    evaluate(mnist)
    
if __name__ == "__main__":
    tf.app.run()

Extracting ./train-images-idx3-ubyte.gz
Extracting ./train-labels-idx1-ubyte.gz
Extracting ./t10k-images-idx3-ubyte.gz
Extracting ./t10k-labels-idx1-ubyte.gz
INFO:tensorflow:Restoring parameters from ./mnist_model.ckpt-29001
After 29001 training step(s), validation accuracy = 0.9852
INFO:tensorflow:Restoring parameters from ./mnist_model.ckpt-29001
After 29001 training step(s), validation accuracy = 0.9852
INFO:tensorflow:Restoring parameters from ./mnist_model.ckpt-29001
After 29001 training step(s), validation accuracy = 0.9852


KeyboardInterrupt: 