<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#NN-(Neural-Network)-神经网络" data-toc-modified-id="NN-(Neural-Network)-神经网络-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>NN (Neural Network) 神经网络</a></span></li><li><span><a href="#损失函数(loss)" data-toc-modified-id="损失函数(loss)-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>损失函数(loss)</a></span></li><li><span><a href="#学习率(learning_rate)" data-toc-modified-id="学习率(learning_rate)-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>学习率(learning_rate)</a></span></li><li><span><a href="#滑动平均(影子值)" data-toc-modified-id="滑动平均(影子值)-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>滑动平均(影子值)</a></span></li><li><span><a href="#正则化" data-toc-modified-id="正则化-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>正则化</a></span></li><li><span><a href="#神经网络搭建套路" data-toc-modified-id="神经网络搭建套路-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>神经网络搭建套路</a></span></li><li><span><a href="#手写数字识别示例(MNIST)" data-toc-modified-id="手写数字识别示例(MNIST)-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>手写数字识别示例(MNIST)</a></span></li></ul></div>

## NN (Neural Network) 神经网络
> NN复杂度： 常用NN层数和NN参数的个数表示
>> * 层数 = 隐藏层的层数 + 1个输出层 (输入层不纳入计算)
>> * 总参数 = 总W + 总b  
>>> 每一层的 `W` 个数等于 `上一层神经元个数` x `本层神经元个数`    
>>> 每一层的 `b` 个数等于 `本层神经元个数`

## 损失函数(loss)
[Link1](https://blog.csdn.net/marsjhao/article/details/72630147)

> 预测值(y)与已知答案(y_)的差距  

> NN优化的目标: loss最小化
>> 1. mse (Mean Squared Error) 均方误差
>>> loss_mse = tf.reduce_mean(tf.square(y_ - y))
>> 2. 自定义
>>> loss = tf.reduce_sum(tf.where(tf.greater(y, y_), arg1*(y - y_), arg2*(y_ - y)))
>> 3. ce (Cross Entropy) 交叉熵 -- `表征两个概率分布之间的距离`
>>> ce = -tf.reduce_mean(y_ * tf.log(tf.clip_by_value(y, 1e-12, 1.0)))   
>>>> tf.clip_by_value()函数可将一个tensor的元素数值限制在指定范围内，这样可防止一些错误运算，起到数值检查作用

> TensorFlow 提供了集成交叉熵函数  
>>> * ce = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y, labels=tf.argmax(y_, 1)))

In [None]:
#coding:utf-8

"""
  '均方误差' 损失函数
"""

#0导入模块，生成数据集
import tensorflow as tf
import numpy as np
BATCH_SIZE = 8
SEED = 23455

# Synthetic regression data: 32 samples with 2 features each. The target is
# x1 + x2 plus a small noise term drawn from rdm.rand()/10 - 0.05.
rdm = np.random.RandomState(SEED)
X = rdm.rand(32, 2)
Y_ = [[x1 + x2 + (rdm.rand() / 10.0 - 0.05)] for (x1, x2) in X]

# 1. Define the network input, parameters and output (forward pass).
x = tf.placeholder(tf.float32, shape=(None, 2))
y_ = tf.placeholder(tf.float32, shape=(None, 1))
w1 = tf.Variable(tf.random_normal([2, 1], stddev=1, seed=1))
y = tf.matmul(x, w1)

# 2. Define the loss and the back-propagation method:
#    mean squared error minimised with plain gradient descent.
loss_mse = tf.reduce_mean(tf.square(y_ - y))
train_step = tf.train.GradientDescentOptimizer(0.001).minimize(loss_mse)

# 3. Create a session and train for STEPS iterations.
with tf.Session() as sess:
    init_op = tf.global_variables_initializer()
    sess.run(init_op)
    STEPS = 20000
    for i in range(STEPS):
        # Cycle through the 32 samples one mini-batch at a time.
        start = (i * BATCH_SIZE) % 32
        end = start + BATCH_SIZE
        sess.run(train_step, feed_dict={x: X[start:end], y_: Y_[start:end]})
        if i % 500 == 0:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("After %d training steps, w1 is: " % (i))
            print("%s \n" % (sess.run(w1),))
    print("Final w1 is: \n%s" % (sess.run(w1),))

In [None]:
#coding:utf-8

"""
  '自定义' 损失函数
"""

import tensorflow as tf
import numpy as np
BATCH_SIZE = 8
SEED = 23455
COST = 1     # cost of one unit of yogurt: 1 yuan (lost when over-predicting)
PROFIT = 9   # profit of one unit of yogurt: 9 yuan (lost when under-predicting)
# Under-prediction is the more expensive mistake, so the trained model
# should lean towards predicting more.

# Synthetic regression data: 32 samples with 2 features each. The target is
# x1 + x2 plus a small noise term drawn from rdm.rand()/10 - 0.05.
rdm = np.random.RandomState(SEED)
X = rdm.rand(32, 2)
Y = [[x1 + x2 + (rdm.rand() / 10.0 - 0.05)] for (x1, x2) in X]

# 1. Define the network input, parameters and output (forward pass).
x = tf.placeholder(tf.float32, shape=(None, 2))
y_ = tf.placeholder(tf.float32, shape=(None, 1))
w1 = tf.Variable(tf.random_normal([2, 1], stddev=1, seed=1))
y = tf.matmul(x, w1)

# 2. Define the loss and the back-propagation method.
# Piecewise loss: each unit predicted above the answer costs COST, each unit
# predicted below the answer costs PROFIT.
loss = tf.reduce_sum(tf.where(tf.greater(y, y_), (y - y_) * COST, (y_ - y) * PROFIT))
train_step = tf.train.GradientDescentOptimizer(0.001).minimize(loss)

# 3. Create a session and train for STEPS iterations.
with tf.Session() as sess:
    init_op = tf.global_variables_initializer()
    sess.run(init_op)
    STEPS = 3000
    for i in range(STEPS):
        # Cycle through the 32 samples one mini-batch at a time.
        start = (i * BATCH_SIZE) % 32
        end = start + BATCH_SIZE
        sess.run(train_step, feed_dict={x: X[start:end], y_: Y[start:end]})
        if i % 500 == 0:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("After %d training steps, w1 is: " % (i))
            print("%s \n" % (sess.run(w1),))
    print("Final w1 is: \n%s" % (sess.run(w1),))


In [None]:
#coding:utf-8

"""
  '交叉熵' 损失函数
"""

import tensorflow as tf
import numpy as np
BATCH_SIZE = 8
SEED = 23455

rdm = np.random.RandomState(SEED)
X = rdm.rand(32, 2)
# Cross entropy compares class distributions, so the noisy sum x1 + x2 is
# turned into a two-class one-hot label: [1, 0] when the sum is below 1.0,
# [0, 1] otherwise. (With a single output column the softmax is always 1 and
# the loss is identically 0, so nothing would ever be trained.)
Y = [[1.0, 0.0] if x1 + x2 + (rdm.rand() / 10.0 - 0.05) < 1.0 else [0.0, 1.0]
     for (x1, x2) in X]

# 1. Define the network input, parameters and output (forward pass).
# One logit per class; no bias term, to keep the demo focused on the loss.
x = tf.placeholder(tf.float32, shape=(None, 2))
y_ = tf.placeholder(tf.float32, shape=(None, 2))
w1 = tf.Variable(tf.random_normal([2, 2], stddev=1, seed=1))
y = tf.matmul(x, w1)

# 2. Define the loss and the back-propagation method.
# Notes on the two softmax cross-entropy helpers
# (see https://blog.csdn.net/m0_37041325/article/details/77043598):
# 1. tf.nn.softmax_cross_entropy_with_logits takes dense labels: one
#    probability vector per sample, e.g. the one-hot vector [0, 0, 1].
# 2. tf.nn.sparse_softmax_cross_entropy_with_logits takes sparse labels: one
#    integer class index per sample, e.g. 2 for the third of three classes
#    (indices start at 0).
# 3. One-hot labels are therefore converted with tf.argmax before being
#    passed to the sparse variant.
ce = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y, labels=tf.argmax(y_, 1))
loss = tf.reduce_mean(ce)
train_step = tf.train.GradientDescentOptimizer(0.001).minimize(loss)

# 3. Create a session and train for STEPS iterations.
with tf.Session() as sess:
    init_op = tf.global_variables_initializer()
    sess.run(init_op)
    STEPS = 10000
    for i in range(STEPS):
        # Cycle through the 32 samples one mini-batch at a time.
        start = (i * BATCH_SIZE) % 32
        end = start + BATCH_SIZE
        sess.run(train_step, feed_dict={x: X[start:end], y_: Y[start:end]})
        if i % 500 == 0:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("After %d training steps, w1 is: " % (i))
            print("%s \n" % (sess.run(w1),))
    print("Final w1 is: \n%s" % (sess.run(w1),))


In [None]:
# -*- coding: utf-8 -*-

"""
  'sparse_softmax_cross_entropy_with_logits' 函数示例
"""

import tensorflow as tf

# Example 1: one sample, four classes.
# y2 holds the one-hot label, y_2 the logits, y_2_2 the class index derived
# from the one-hot label, and c2 the resulting per-sample cross entropy.
y2 = tf.convert_to_tensor([[0, 0, 1, 0]], dtype=tf.int64)
y_2 = tf.convert_to_tensor([[-2.6, -1.7, 3.2, 0.1]], dtype=tf.float32)
y_2_2 = tf.argmax(y2, 1)
c2 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y_2, labels=y_2_2)

# Example 2: two samples, four classes.
# The class indices must come from the one-hot labels y3, not from the
# logits y_3; taking argmax of the logits would score the prediction against
# itself.
y3 = tf.convert_to_tensor([[0, 0, 1, 0], [0, 0, 1, 0]], dtype=tf.int64)
y_3 = tf.convert_to_tensor([[-2.6, -1.7, -3.2, 0.1], [-2.6, -1.7, 3.2, 0.1]], dtype=tf.float32)
y_3_3 = tf.argmax(y3, 1)
c3 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y_3, labels=y_3_3)


with tf.Session() as sess:
    # Each evaluated value is stored under the tensor's name plus a trailing
    # underscore, so the tensors themselves stay available.
    # Single-argument print(...) behaves the same on Python 2 and 3.

    y2_ = sess.run(y2)
    print("%s %s" % (y2_.shape, y2_))          # (1, 4) [[0 0 1 0]]

    y_2_ = sess.run(y_2)
    print("%s %s" % (y_2_.shape, y_2_))        # (1, 4) [[-2.6 -1.7  3.2  0.1]]

    y_2_2_ = sess.run(y_2_2)
    print("%s %s" % (y_2_2_.shape, y_2_2_))    # (1,) [2]

    c2_ = sess.run(c2)
    print("%s %s" % (c2_.shape, c2_))          # (1,) [0.05403664]

    y3_ = sess.run(y3)
    print("%s %s" % (y3_.shape, y3_))

    y_3_ = sess.run(y_3)
    print("%s %s" % (y_3_.shape, y_3_))

    y_3_3_ = sess.run(y_3_3)
    print("%s %s" % (y_3_3_.shape, y_3_3_))

    c3_ = sess.run(c3)
    print("%s %s" % (c3_.shape, c3_))

## 学习率(learning_rate)
[Link1](https://www.imooc.com/article/details/id/27808)

> 在神经网络训练过程中，参数每次更新的幅度

> 学习率设置大了会震荡不收敛，学习率设置小了收敛速度慢  
>> tf.train.GradientDescentOptimizer(learning_rate)  

> 指数衰减计算公式
>> decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps)
>>> * decayed_learning_rate: 每一轮优化时使用的学习率
>>> * learning_rate: 超参数，事先设定(预估)的初始学习率
>>> * decay_rate: 超参数，衰减系数
>>> * decay_steps: 衰减速度 (即迭代多少次进行衰减)
>>> * 一般来说，初始学习率、衰减系数和衰减速度都是根据经验设置的

> 指数衰减学习率
>> * global_step = tf.Variable(0, trainable=False)
>>> 迭代次数初始值为0
>> * learning_rate = tf.train.exponential_decay(learning_rate, global_step, decay_steps, decay_rate, staircase)
>>> * 参数decay_steps=100时，即表示100轮迭代后进行一次衰减
>>> * 参数staircase=True时，global_step/decay_steps会被转化为整数，这使得学习率呈阶梯型下降；若为False时，则是连续型下降
>> * tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
>>> 使用指数衰减的学习率，在minimize函数中传入global_step，它将自动更新，learning_rate也随即被更新


In [None]:
#coding:utf-8
#设损失函数 loss=(w+1)^2, 令w初值是常数5。反向传播就是求最优w，即求最小loss对应的w值
#使用指数衰减的学习率，在迭代初期得到较高的下降速度，可以在较小的训练轮数下取得更优的收敛度。
import tensorflow as tf

LEARNING_RATE_BASE = 0.1    # initial learning rate
LEARNING_RATE_DECAY = 0.99  # learning-rate decay factor
LEARNING_RATE_STEP = 1      # number of batches between learning-rate updates;
                            # usually total_samples / BATCH_SIZE

# Counter of how many batches have been run; starts at 0 and is not trained.
global_step = tf.Variable(0, trainable=False)
# Exponentially decaying learning rate (staircase: decays in discrete steps).
learning_rate = tf.train.exponential_decay(LEARNING_RATE_BASE,
                                           global_step,
                                           LEARNING_RATE_STEP,
                                           LEARNING_RATE_DECAY,
                                           staircase=True)
# Parameter to optimise, initial value 5.
w = tf.Variable(tf.constant(5, dtype=tf.float32))
# Loss: (w + 1)^2, minimised at w = -1.
loss = tf.square(w + 1)
# Back-propagation; passing global_step makes minimize() increment it on
# every step, which in turn updates learning_rate.
train_step = tf.train.GradientDescentOptimizer(learning_rate)\
                                            .minimize(loss, global_step=global_step)
# Create a session and train for 40 iterations.
with tf.Session() as sess:
    init_op = tf.global_variables_initializer()
    sess.run(init_op)
    for i in range(40):
        sess.run(train_step)
        # Nothing is updated by these fetches, so reading them in one call
        # returns the same post-step values as four separate calls.
        learning_rate_val, global_step_val, w_val, loss_val = sess.run(
            [learning_rate, global_step, w, loss])
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("After %s steps: global_step is %f, w is %f, learning rate is %f, loss is %f" % (i, global_step_val, w_val, learning_rate_val, loss_val))

## 滑动平均(影子值)
[Link1](https://blog.csdn.net/lanchunhui/article/details/70803060)

> 记录了每个参数一段时间内过往值的平均，增加了模型的泛化性   
> 滑动平均是针对所有参数：`W`和`b` 

> 滑动平均计算公式
>> * 影子 = 衰减率 x 影子 + (1 - 衰减率) × 参数(W)     （影子初值 = 参数初值）   
>> * 衰减率 = min{ MOVING_AVERAGE_DECAY, (1+轮数)/(10+轮数)}

> 滑动平均函数
>> * global_step = tf.Variable(0, trainable=False)
>> * ema = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
>> * ema_op = ema.apply(tf.trainable_variables())

In [None]:
#coding:utf-8
import tensorflow as tf

# 1. Define the variable and the moving-average helper.
# w1 is a float32 variable starting at 0; the demo keeps changing w1 while
# the moving average maintains a "shadow" copy of it.
w1 = tf.Variable(0, dtype=tf.float32)
# Iteration counter handed to the moving average as num_updates; starts at 0
# and is not trainable.
global_step = tf.Variable(0, trainable=False)
# Moving-average helper with decay 0.99 and the current step global_step.
MOVING_AVERAGE_DECAY = 0.99
ema = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
# ema.apply takes the list of variables to track; every sess.run(ema_op)
# updates the shadow value of each of them. tf.trainable_variables() collects
# all trainable parameters automatically (here that is just [w1]).
ema_op = ema.apply(tf.trainable_variables())

# 2. Watch how the values change across iterations.
with tf.Session() as sess:
    init_op = tf.global_variables_initializer()
    sess.run(init_op)
    # ema.average(w1) returns the shadow value of w1; several nodes are
    # fetched at once by passing them as a list to sess.run.
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("current global_step: %s" % (sess.run(global_step),))
    print("current w1 %s" % (sess.run([w1, ema.average(w1)]),))

    # Set w1 to 1 and update the shadow value once.
    sess.run(tf.assign(w1, 1))
    sess.run(ema_op)
    print("current global_step: %s" % (sess.run(global_step),))
    print("current w1 %s" % (sess.run([w1, ema.average(w1)]),))

    # Simulate step 100 with w1 changed to 10. global_step then stays at 100
    # while every sess.run(ema_op) moves the shadow value closer to w1.
    sess.run(tf.assign(global_step, 100))
    sess.run(tf.assign(w1, 10))
    for _ in range(7):
        sess.run(ema_op)
        print("current global_step: %s" % (sess.run(global_step),))
        print("current w1: %s" % (sess.run([w1, ema.average(w1)]),))
    
"""
运行结果：
current global_step: 0
current w1 [0.0, 0.0]
current global_step: 0
current w1 [1.0, 0.9]
current global_step: 100
current w1: [10.0, 1.6445453]
current global_step: 100
current w1: [10.0, 2.3281732]
current global_step: 100
current w1: [10.0, 2.955868]
current global_step: 100
current w1: [10.0, 3.532206]
current global_step: 100
current w1: [10.0, 4.061389]
current global_step: 100
current w1: [10.0, 4.547275]
current global_step: 100
current w1: [10.0, 4.9934072]
"""

## 正则化
[Link1](https://blog.csdn.net/u012436149/article/details/70264257)
[Link2](https://blog.csdn.net/u011012422/article/details/72808898?utm_source=itdadao&utm_medium=referral)

> 正则化在损失函数中引入模型复杂度指标，利用给`W`加权值，弱化了训练数据的噪声(一般不正则化参数`b`)

> 正则化计算公式
>> loss = loss(y 与 y_) + REGULARIZER * loss(w)
>>> * loss(y 与 y_): 模型中所有参数的损失函数, 如: 交叉熵、均方误差
>>> * REGULARIZER: 超参数，给出参数`w`在总loss中的比例，即正则化的权重
>>> * loss(w): 需要正则化的参数   

> L1 正则化
>> loss(w) = tf.contrib.layers.l1_regularizer(scale)(w)
>>> * scale: 超参数，正则项的系数

> L2 正则化
>> loss(w) = tf.contrib.layers.l2_regularizer(scale)(w)
>>> * scale: 超参数，正则项的系数

> 多种正则化组合
>> loss(w) = tf.contrib.layers.sum_regularizer(regularizer_list)(w)
>>> 返回一个可以执行多种(个)正则化的函数.意思是,创建一个正则化方法,这个方法是多个正则化方法的混合体
>>> * regularizer_list: 正则化方法的列表


In [None]:
# -*- coding: utf-8 -*-

import tensorflow as tf
import numpy as np

def get_weights(shape, regularizer):
    """Create a weight variable and register its L2 penalty.

    shape:       shape of the weight tensor, passed to tf.random_normal.
    regularizer: scale passed to tf.contrib.layers.l2_regularizer.

    The penalty tensor is added to the 'losses' collection; the variable
    itself is returned.
    """
    initial_values = tf.random_normal(shape)
    var = tf.Variable(initial_values, dtype=tf.float32)
    l2_penalty = tf.contrib.layers.l2_regularizer(regularizer)(var)
    tf.add_to_collection('losses', l2_penalty)
    return var

# Inputs: two features per sample, one target value per sample.
x = tf.placeholder(tf.float32, shape=(None, 2))
y_ = tf.placeholder(tf.float32, shape=(None, 1))
batch_size = 8
# Number of nodes in each layer, input layer first.
layer_dimension = [2, 10, 10, 10, 1]
n_layers = len(layer_dimension)
# cur_lay is the output of the most recently built layer; it starts as the input.
cur_lay = x
in_dimension = layer_dimension[0]

# Build the fully connected layers one after another. get_weights registers
# the L2 penalty of every weight matrix in the 'losses' collection.
for i in range(1, n_layers):
    out_dimension = layer_dimension[i]
    weights = get_weights([in_dimension, out_dimension], 0.001)
    bias = tf.Variable(tf.constant(0.1, shape=[out_dimension]))
    pre_activation = tf.matmul(cur_lay, weights) + bias
    cur_lay = tf.nn.relu(pre_activation)
    # The output width of this layer is the input width of the next one.
    in_dimension = out_dimension

# Total loss = mean squared error of the last layer + all registered penalties.
mess_loss = tf.reduce_mean(tf.square(y_ - cur_lay))
tf.add_to_collection('losses', mess_loss)
loss = tf.add_n(tf.get_collection('losses'))


## 神经网络搭建套路

In [None]:
#coding:utf-8

import tensorflow as tf

"""
    forward.py
    前向传播就是搭建神经网络，设计网络结构
"""

#定义神经网络的输入、参数和输出，定义前向传播过程 
def get_weight(shape, regularizer):
    """Create a normally initialised weight variable with an L2 penalty.

    shape:       shape of the weight tensor.
    regularizer: scale passed to tf.contrib.layers.l2_regularizer.

    The penalty is added to the 'losses' collection and the variable is
    returned.
    """
    initial_values = tf.random_normal(shape)
    w = tf.Variable(initial_values, dtype=tf.float32)
    l2_penalty = tf.contrib.layers.l2_regularizer(regularizer)(w)
    tf.add_to_collection('losses', l2_penalty)
    return w

def get_bias(shape):
    """Create a bias variable of the given shape, filled with 0.01."""
    initial_values = tf.constant(0.01, shape=shape)
    return tf.Variable(initial_values)

def forward(x, regularizer):
    """Build the forward pass: 2 inputs -> 11 ReLU units -> 1 linear output.

    x:           input tensor fed to the first matmul.
    regularizer: L2 scale forwarded to get_weight for both weight matrices.

    Returns the output tensor of the last layer (no activation applied).
    """
    # Hidden layer with ReLU activation.
    hidden_w = get_weight([2, 11], regularizer)
    hidden_b = get_bias([11])
    hidden_out = tf.nn.relu(tf.matmul(x, hidden_w) + hidden_b)

    # Output layer, left linear.
    out_w = get_weight([11, 1], regularizer)
    out_b = get_bias([1])
    return tf.matmul(hidden_out, out_w) + out_b

In [None]:
#coding:utf-8

"""
    backward.py
    反向传播就是训练网络，优化网络参数
"""

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import forward

STEPS = 40000                # number of training iterations
BATCH_SIZE = 30              # samples per mini-batch
LEARNING_RATE_BASE = 0.001   # initial learning rate
LEARNING_RATE_DECAY = 0.999  # learning-rate decay factor
REGULARIZER = 0.01           # L2 regularisation scale passed to forward.forward
# Decay of the parameter moving average. backward() referenced this name
# without defining it; 0.99 matches the value used by the other examples.
MOVING_AVERAGE_DECAY = 0.99

def backward():
    """Train the network from forward.py and plot its decision boundary.

    Builds the graph (forward pass, decaying learning rate, regularised MSE
    loss, Adam optimiser, moving average of the trainable variables), trains
    for STEPS iterations while printing the loss every 2000 iterations, then
    evaluates the network on a grid and draws the 0.5 contour over a scatter
    plot of the data.
    """
    x = tf.placeholder(tf.float32, shape=(None, 2))
    y_ = tf.placeholder(tf.float32, shape=(None, 1))

    # NOTE(review): the data set is elided in these notes and must be filled
    # in before running. Judging by the code below, X is presumably a
    # (300, 2) array, Y_ a (300, 1) array of targets and Y_c the matching
    # point colours - confirm against the original data generator.
    X, Y_, Y_c = [...], [...], [...]

    # `forward` is the imported module; the network builder is forward.forward.
    y = forward.forward(x, REGULARIZER)

    global_step = tf.Variable(0, trainable=False)

    # Exponentially decaying learning rate, decayed once per pass over the
    # 300 samples (300 // BATCH_SIZE batches).
    learning_rate = tf.train.exponential_decay(
        LEARNING_RATE_BASE,
        global_step,
        300 // BATCH_SIZE,
        LEARNING_RATE_DECAY,
        staircase=True)

    # Loss: mean squared error. The network has a single output unit, and a
    # softmax cross entropy over one logit is identically zero, so it cannot
    # be used here. MSE also matches the 0.5 contour drawn below.
    loss_mse = tf.reduce_mean(tf.square(y - y_))

    # Add the L2 penalties registered in the 'losses' collection.
    loss_total = loss_mse + tf.add_n(tf.get_collection('losses'))

    # Back-propagation on the regularised loss.
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss_total,
                                                                global_step=global_step)

    # Moving average of all trainable variables; train_op runs the optimiser
    # step and the moving-average update together.
    ema = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
    ema_op = ema.apply(tf.trainable_variables())
    with tf.control_dependencies([train_step, ema_op]):
        train_op = tf.no_op(name='train')

    with tf.Session() as sess:
        init_op = tf.global_variables_initializer()
        sess.run(init_op)
        for i in range(STEPS):
            start = (i * BATCH_SIZE) % 300
            end = start + BATCH_SIZE
            # Run train_op (not train_step) so the moving average is updated too.
            sess.run(train_op, feed_dict={x: X[start:end], y_: Y_[start:end]})
            if i % 2000 == 0:
                loss_v = sess.run(loss_total, feed_dict={x: X, y_: Y_})
                # Single-argument print(...) behaves the same on Python 2 and 3.
                print("After %d steps, loss is: %f" % (i, loss_v))

        # Evaluate the trained network on a dense grid over [-3, 3) x [-3, 3).
        xx, yy = np.mgrid[-3:3:.01, -3:3:.01]
        grid = np.c_[xx.ravel(), yy.ravel()]
        probs = sess.run(y, feed_dict={x: grid})
        probs = probs.reshape(xx.shape)

    plt.scatter(X[:, 0], X[:, 1], c=np.squeeze(Y_c))
    plt.contour(xx, yy, probs, levels=[.5])
    plt.show()

if __name__ == '__main__':
    backward()


## 手写数字识别示例(MNIST)

In [None]:
import tensorflow as tf

"""
    filename: mnist_forward.py
    定义神经网络模型
"""

# Layer widths used by forward() below.
# Width of the input layer (784 = 28 * 28, presumably one flattened MNIST image - confirm).
INPUT_NODE = 784
# Width of the output layer (presumably one logit per digit class - confirm).
OUTPUT_NODE = 10
# Width of the single hidden layer.
LAYER1_NODE = 500

def get_weight(shape, regularizer):
    """Create a weight variable and optionally register its L2 penalty.

    shape:       shape of the weight tensor; values are drawn from a
                 truncated normal distribution with stddev 0.1.
    regularizer: scale passed to tf.contrib.layers.l2_regularizer, or None
                 to skip regularisation.

    When a regularizer is given, the penalty is added to the 'losses'
    collection. The variable is returned in both cases.
    """
    w = tf.Variable(tf.truncated_normal(shape, stddev=0.1))
    # Identity comparison is the correct test for None.
    if regularizer is not None:
        tf.add_to_collection('losses', tf.contrib.layers.l2_regularizer(regularizer)(w))
    return w


def get_bias(shape):
    """Create a bias variable of the given shape, initialised to zeros."""
    return tf.Variable(tf.zeros(shape))

def forward(x, regularizer):
    """Build the forward pass: INPUT_NODE -> LAYER1_NODE (ReLU) -> OUTPUT_NODE.

    x:           input tensor fed to the first matmul.
    regularizer: L2 scale forwarded to get_weight (None disables it).

    Returns the raw output of the last layer.
    """
    # Hidden layer with ReLU activation.
    hidden_w = get_weight([INPUT_NODE, LAYER1_NODE], regularizer)
    hidden_b = get_bias([LAYER1_NODE])
    hidden_out = tf.nn.relu(tf.matmul(x, hidden_w) + hidden_b)

    # Output layer. No activation is applied here because the softmax used
    # later on already acts as one.
    out_w = get_weight([LAYER1_NODE, OUTPUT_NODE], regularizer)
    out_b = get_bias([OUTPUT_NODE])
    logits = tf.matmul(hidden_out, out_w) + out_b
    return logits


In [None]:
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import mnist_forward
import os

"""
    filename: mnist_backward.py
    训练神经网络
"""

# Training hyper-parameters and checkpoint location used by backward().
# Number of samples fetched per training step.
BATCH_SIZE = 200
# Initial learning rate of the exponential decay schedule.
LEARNING_RATE_BASE = 0.1
# Decay factor of the exponential decay schedule.
LEARNING_RATE_DECAY = 0.99
# L2 regularisation scale passed to mnist_forward.forward.
REGULARIZER = 0.0001
# Number of training iterations.
STEPS = 50000
# Decay of the moving average kept for the trainable variables.
MOVING_AVERAGE_DECAY = 0.99
# Directory checkpoints are read from and written to.
MODEL_SAVE_PATH="./model/"
# File-name prefix of the saved checkpoints.
MODEL_NAME="mnist_model"


def backward(mnist):
    """Build the training graph and train the MNIST network.

    mnist: data set object; this function reads mnist.train.num_examples and
           calls mnist.train.next_batch(BATCH_SIZE).

    Resumes from the latest checkpoint in MODEL_SAVE_PATH when one exists,
    then runs STEPS training iterations. Every 1000 iterations it prints the
    loss of the current batch and saves a checkpoint.

    NOTE(review): the variables are created without explicit names, and
    mnist_test.py rebuilds the graph and restores them from these
    checkpoints - presumably the creation order here must stay unchanged;
    confirm before reordering.
    """

    # Inputs, forward pass and the (non-trainable) step counter.
    x = tf.placeholder(tf.float32, [None, mnist_forward.INPUT_NODE])
    y_ = tf.placeholder(tf.float32, [None, mnist_forward.OUTPUT_NODE])
    y = mnist_forward.forward(x, REGULARIZER)
    global_step = tf.Variable(0, trainable=False)

    # Loss: mean softmax cross entropy (class index taken from y_ with
    # argmax) plus everything registered in the 'losses' collection.
    ce = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y, labels=tf.argmax(y_, 1))
    cem = tf.reduce_mean(ce)
    loss = cem + tf.add_n(tf.get_collection('losses'))

    # Learning rate decays once every num_examples / BATCH_SIZE steps.
    learning_rate = tf.train.exponential_decay(
        LEARNING_RATE_BASE,
        global_step,
        mnist.train.num_examples / BATCH_SIZE, 
        LEARNING_RATE_DECAY,
        staircase=True)

    # minimize() increments global_step on every training step.
    train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)

    # Moving average of all trainable variables; train_op runs the optimiser
    # step and the moving-average update together.
    ema = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
    ema_op = ema.apply(tf.trainable_variables())
    with tf.control_dependencies([train_step, ema_op]):
        train_op = tf.no_op(name='train')

    saver = tf.train.Saver()

    with tf.Session() as sess:
        init_op = tf.global_variables_initializer()
        sess.run(init_op)

        # Resume training from the latest checkpoint, if there is one.
        ckpt = tf.train.get_checkpoint_state(MODEL_SAVE_PATH)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        
        for i in range(STEPS):
            xs, ys = mnist.train.next_batch(BATCH_SIZE)
            _, loss_value, step = sess.run([train_op, loss, global_step], feed_dict={x: xs, y_: ys})
            if i % 1000 == 0:
                # `step` is the fetched global_step, not the loop index i.
                print("After %d training step(s), loss on training batch is %g." % (step, loss_value))
                saver.save(sess, os.path.join(MODEL_SAVE_PATH, MODEL_NAME), global_step=global_step)

def main():
    """Load the MNIST data from ./data/ with one-hot labels and train on it."""
    backward(input_data.read_data_sets("./data/", one_hot=True))

if __name__ == '__main__':
    main()

In [None]:
#coding:utf-8
import time
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import mnist_forward
import mnist_backward
# Seconds test() sleeps between two evaluations of the latest checkpoint.
TEST_INTERVAL_SECS = 5

"""
    filename: mnist_test.py
    神经网络测试
"""

def test(mnist):
    """Repeatedly evaluate the latest checkpoint on the MNIST test set.

    mnist: data set object; this function reads mnist.test.images and
           mnist.test.labels.

    Rebuilds the network without regularisation, restores the
    moving-average values of the variables from the newest checkpoint in
    mnist_backward.MODEL_SAVE_PATH and prints the test accuracy, then sleeps
    TEST_INTERVAL_SECS seconds and starts over. Returns as soon as no
    checkpoint can be found.
    """
    with tf.Graph().as_default() as g:
        x = tf.placeholder(tf.float32, [None, mnist_forward.INPUT_NODE])
        y_ = tf.placeholder(tf.float32, [None, mnist_forward.OUTPUT_NODE])
        y = mnist_forward.forward(x, None)

        # Restore each variable from its moving-average (shadow) value.
        ema = tf.train.ExponentialMovingAverage(mnist_backward.MOVING_AVERAGE_DECAY)
        saver = tf.train.Saver(ema.variables_to_restore())

        # Accuracy: share of samples whose predicted class equals the label.
        predicted_class = tf.argmax(y, 1)
        labelled_class = tf.argmax(y_, 1)
        hits = tf.cast(tf.equal(predicted_class, labelled_class), tf.float32)
        accuracy = tf.reduce_mean(hits)

        test_feed = {x: mnist.test.images, y_: mnist.test.labels}
        while True:
            with tf.Session() as sess:
                ckpt = tf.train.get_checkpoint_state(mnist_backward.MODEL_SAVE_PATH)
                if not (ckpt and ckpt.model_checkpoint_path):
                    print('No checkpoint file found')
                    return
                saver.restore(sess, ckpt.model_checkpoint_path)
                # The step count is the suffix after the last '-' of the
                # checkpoint file name.
                checkpoint_file = ckpt.model_checkpoint_path.split('/')[-1]
                step_label = checkpoint_file.split('-')[-1]
                accuracy_score = sess.run(accuracy, feed_dict=test_feed)
                print("After %s training step(s), test accuracy = %g" % (step_label, accuracy_score))
            time.sleep(TEST_INTERVAL_SECS)

def main():
    """Load the MNIST data from ./data/ with one-hot labels and evaluate on it."""
    test(input_data.read_data_sets("./data/", one_hot=True))

if __name__ == '__main__':
    main()
