In [1]:
#batchnorm主要是让收敛变快，但对acc影响不大
from mxnet import nd
def pure_batch_norm(X, gamma, beta, eps=1e-5):
    """Batch-normalize ``X`` using statistics of the current batch only.

    Only 2-D and 4-D inputs are accepted:

    * 2-D input is treated as ``(batch, num_features)``; every feature is
      normalized across the batch axis.
    * 4-D input is treated as ``(batch, channel, height, width)``; channels
      may follow different distributions, so each channel is normalized
      separately, pooling over the batch and both spatial axes (pixels of
      one channel are assumed to share a distribution).

    Args:
        X: input array, rank 2 or 4.
        gamma: per-feature / per-channel scale; reshaped to the shape of the
            computed mean so that it broadcasts against ``X``.
        beta: per-feature / per-channel shift; reshaped like ``gamma``.
        eps: small constant added to the variance before the square root.

    Returns:
        ``gamma * X_hat + beta`` where ``X_hat`` is the normalized input.

    Raises:
        AssertionError: if ``X`` is neither 2-D nor 4-D.
    """
    rank = len(X.shape)
    assert rank in (2, 4)
    if rank == 4:
        # keepdims=True keeps the statistics 4-D so they broadcast against X.
        channel_axes = (0, 2, 3)
        mean = X.mean(axis=channel_axes, keepdims=True)
        centered = X - mean
        variance = (centered ** 2).mean(axis=channel_axes, keepdims=True)
    else:
        # Per-feature mean and (biased) variance over the batch axis.
        mean = X.mean(axis=0)
        centered = X - mean
        variance = (centered ** 2).mean(axis=0)
    # Normalize.
    X_hat = centered / nd.sqrt(variance + eps)
    # Scale and shift.
    scale = gamma.reshape(mean.shape)
    shift = beta.reshape(mean.shape)
    return scale * X_hat + shift

In [2]:
# Small 3x2 example input holding the values 0..5 (3 samples, 2 features).
A = nd.arange(6).reshape((3,2))
A


[[ 0.  1.]
 [ 2.  3.]
 [ 4.  5.]]
<NDArray 3x2 @cpu(0)>

In [3]:
# Normalize each column of A across the batch; gamma=1 and beta=0 leave the
# normalized values unscaled and unshifted.
pure_batch_norm(A, gamma=nd.array([1,1]), beta=nd.array([0,0]))


[[-1.22474265 -1.22474265]
 [ 0.          0.        ]
 [ 1.22474265  1.22474265]]
<NDArray 3x2 @cpu(0)>

In [60]:
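# At test time, the per-batch mean and variance used during training must be
# replaced by the mean and variance of the whole training set.
# Computing those exactly is expensive when the training set is very large, so
# they are approximated with a moving average: recent batches receive larger
# weights, older batches smaller weights, and the stored running statistics are
# overwritten in place on every training step.
# Batch normalization that also supports test-time (inference) use.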
def batch_norm(X, gamma, beta, is_training, moving_mean, moving_variance,
               eps=1e-5, moving_momentum=0.9):
    """Batch normalization with running statistics for inference.

    In training mode the input is normalized with the statistics of the
    current batch, and ``moving_mean`` / ``moving_variance`` are updated
    **in place** as an exponential moving average. In inference mode the
    stored running statistics are used instead of the batch statistics.

    Args:
        X: input array, either 2-D (normalized per column over axis 0) or
            4-D (normalized per channel over axes 0, 2 and 3).
        gamma: scale parameter; reshaped to the shape of the batch mean.
        beta: shift parameter; reshaped to the shape of the batch mean.
        is_training: if true, use batch statistics and update the running
            statistics; otherwise use the running statistics.
        moving_mean: running mean, updated in place when training.
        moving_variance: running variance, updated in place when training.
        eps: small constant added to the variance before the square root.
        moving_momentum: weight kept from the previous running value; the
            new batch statistic contributes ``1 - moving_momentum``.

    Returns:
        ``gamma * X_hat + beta`` where ``X_hat`` is the normalized input.

    Raises:
        AssertionError: if ``X`` is neither 2-D nor 4-D.
    """
    assert len(X.shape) in (2, 4)
    if len(X.shape) == 2:
        mean = X.mean(axis=0)
        # Fixed: the axis was the undefined name `o` instead of the integer 0,
        # which raised NameError for every 2-D input.
        variance = ((X - mean) ** 2).mean(axis=0)
    else:
        # keepdims=True keeps the statistics 4-D so they broadcast against X.
        mean = X.mean(axis=(0, 2, 3), keepdims=True)
        variance = ((X - mean) ** 2).mean(axis=(0, 2, 3), keepdims=True)
        # Reshape the running statistics to the broadcastable 4-D shape. The
        # in-place writes below rely on reshape returning a view of the
        # caller's array, so that the caller's buffers get updated.
        moving_mean = moving_mean.reshape(mean.shape)
        moving_variance = moving_variance.reshape(variance.shape)

    if is_training:
        # Normalize with the statistics of the current batch.
        X_hat = (X - mean) / nd.sqrt(variance + eps)
        # Update the running (global) mean and variance in place.
        moving_mean[:] = moving_momentum * moving_mean + (
            1.0 - moving_momentum) * mean
        moving_variance[:] = moving_momentum * moving_variance + (
            1.0 - moving_momentum) * variance
    else:
        # Inference: normalize with the running mean and variance.
        X_hat = (X - moving_mean) / nd.sqrt(moving_variance + eps)

    # Scale and shift.
    return gamma.reshape(mean.shape) * X_hat + beta.reshape(mean.shape)

In [61]:
import sys
sys.path.append('..')
import utils
ctx = utils.try_gpu()
from mxnet import ndarray as nd
# Define the model parameters.
# Standard deviation used for every randomly initialized parameter.
weight_scale = 0.01

# First convolutional layer: 20 output channels, 5x5 kernel, 1 input channel.
c1 = 20
w1 = nd.random.normal(shape=(c1, 1, 5, 5), scale=weight_scale, ctx=ctx)
b1 = nd.zeros(c1, ctx=ctx)

# First batch-norm layer: one scale/shift and one running statistic per channel.
gamma1 = nd.random.normal(shape=c1, scale=weight_scale, ctx=ctx)
beta1 = nd.random.normal(shape=c1, scale=weight_scale, ctx=ctx)
moving_mean1 = nd.zeros(c1, ctx=ctx)
moving_variance1 = nd.zeros(c1, ctx=ctx)

# Second convolutional layer: 50 output channels, 3x3 kernel.
c2 = 50
w2 = nd.random.normal(shape=(c2, c1, 3, 3), scale=weight_scale, ctx=ctx)
b2 = nd.zeros(c2, ctx=ctx)

# Second batch-norm layer.
gamma2 = nd.random.normal(shape=c2, scale=weight_scale, ctx=ctx)
beta2 = nd.random.normal(shape=c2, scale=weight_scale, ctx=ctx)
moving_mean2 = nd.zeros(c2, ctx=ctx)
moving_variance2 = nd.zeros(c2, ctx=ctx)

# First dense layer: output dimension 128.
# NOTE(review): 1250 equals c2 * 5 * 5, presumably the flattened size of the
# second conv block for 28x28 inputs -- confirm against the data loader.
o3 = 128
w3 = nd.random.normal(shape=(1250, o3), scale=weight_scale, ctx=ctx)
b3 = nd.zeros(o3, ctx=ctx)

# Second dense layer: output dimension 10.
# Fixed: these lines referenced the undefined names W3 / W4 (the variables
# are w3 / w4) and used the legacy nd.random_normal spelling, unlike the
# nd.random.normal used everywhere else in this cell.
w4 = nd.random.normal(shape=(w3.shape[1], 10), scale=weight_scale, ctx=ctx)
b4 = nd.zeros(w4.shape[1], ctx=ctx)

# Note: the moving_* running statistics are deliberately left out of this
# list; they are not updated by gradient descent.
params = [w1, b1, gamma1, beta1,
          w2, b2, gamma2, beta2,
          w3, b3, w4, b4]

# Allocate gradient buffers for every trainable parameter.
for param in params:
    param.attach_grad()

In [62]:


#define function
def net(X, is_training=False, verbose=False):
    """Forward pass: two conv + batch-norm blocks followed by two dense layers.

    Uses the module-level parameters ``w1..w4``, ``b1..b4``, ``gamma*``,
    ``beta*`` and the running statistics ``moving_mean*`` /
    ``moving_variance*``.

    Args:
        X: input batch; it is copied to the context of ``w1`` first.
        is_training: forwarded to ``batch_norm``. When true, batch statistics
            are used and the running statistics are updated in place.
        verbose: if true, print the shape of each stage and the final output.

    Returns:
        The output of the last dense layer (no softmax is applied here).
    """
    X = X.as_in_context(w1.context)

    # First convolutional block: conv -> batch norm -> relu -> 2x2 max pool.
    h1_conv = nd.Convolution(
        data=X, weight=w1, bias=b1, kernel=w1.shape[2:], num_filter=c1)
    h1_bn = batch_norm(h1_conv, gamma1, beta1, is_training,
                       moving_mean1, moving_variance1)
    h1_activation = nd.relu(h1_bn)
    h1 = nd.Pooling(
        data=h1_activation, pool_type="max", kernel=(2,2), stride=(2,2))

    # Second convolutional block, same structure, then flatten for the dense layers.
    h2_conv = nd.Convolution(
        data=h1, weight=w2, bias=b2, kernel=w2.shape[2:], num_filter=c2)
    h2_bn = batch_norm(h2_conv, gamma2, beta2, is_training,
                       moving_mean2, moving_variance2)
    h2_activation = nd.relu(h2_bn)
    h2 = nd.Pooling(data=h2_activation, pool_type="max", kernel=(2,2), stride=(2,2))
    h2 = nd.flatten(h2)

    # First fully connected layer.
    # Fixed: this used the undefined name W3; the weight matrix is w3.
    h3_linear = nd.dot(h2, w3) + b3
    h3 = nd.relu(h3_linear)

    # Second fully connected layer (output scores).
    h4_linear = nd.dot(h3, w4) + b4

    if verbose:
        print('1st conv block:', h1.shape)
        print('2nd conv block:', h2.shape)
        print('1st dense:', h3.shape)
        print('2nd dense:', h4_linear.shape)
        print('output:', h4_linear)
    return h4_linear

In [None]:
from mxnet import autograd
from mxnet import gluon

# Training configuration.
batch_size = 256
learning_rate = 0.2

train_data, test_data = utils.load_data_fashion_mnist(batch_size)
softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()

for epoch in range(5):
    # Running sums over the batches of this epoch.
    train_loss, train_acc = 0., 0.
    for data, label in train_data:
        label = label.as_in_context(ctx)
        # Record the forward pass so gradients can be computed; batch norm
        # runs in training mode here.
        with autograd.record():
            output = net(data, is_training=True)
            loss = softmax_cross_entropy(output, label)
        loss.backward()
        # NOTE(review): the step size is divided by the batch size, presumably
        # because the loss is not averaged over the batch -- confirm against
        # utils.SGD.
        utils.SGD(params, learning_rate / batch_size)

        train_loss += nd.mean(loss).asscalar()
        train_acc += utils.accuracy(output, label)

    # Evaluate with net's default is_training=False.
    test_acc = utils.evaluate_accuracy(test_data, net)
    num_batches = len(train_data)
    print("Epoch %d. Loss: %f, Train acc %f, Test acc %f" % (
        epoch, train_loss / num_batches, train_acc / num_batches, test_acc))