## yi = gamma*xi+beta   -> BN γ,β(xi)

In [1]:
from mxnet import nd


  from ._conv import register_converters as _register_converters
  import OpenSSL.SSL


In [2]:
def pure_batch_norm(X, gamma,beta,esp = 1e-5):
    assert len(X.shape) in (2,4)# 维度在（2，4）里
    #fc: batch_size * feature
    if len(X.shape) == 2:
        mean = X.mean(axis = 0)
        variance = ((X - mean)**2).mean(axis = 0)
    #conv-2d: batch_size*channel*h*w
    else:
        mean = X.mean(axis=(0,2,3),keepdims=True)
        variance = ((X-mean)**2).mean(axis = (0,2,3),keepdims = True)
    # normilazition
    X_hat = (X - mean) / nd.sqrt(variance + esp)
    return gamma.reshape(mean.shape) * X_hat + beta.reshape(mean.shape)

In [3]:
a = nd.arange(6).reshape((3,2))
a


[[0. 1.]
 [2. 3.]
 [4. 5.]]
<NDArray 3x2 @cpu(0)>

In [4]:
pure_batch_norm(a,gamma=nd.array([1,1]),beta=nd.array([0,0]))


[[-1.2247427 -1.2247427]
 [ 0.         0.       ]
 [ 1.2247427  1.2247427]]
<NDArray 3x2 @cpu(0)>

In [5]:
b = nd.arange(18).reshape((1,2,3,3))
b



[[[[ 0.  1.  2.]
   [ 3.  4.  5.]
   [ 6.  7.  8.]]

  [[ 9. 10. 11.]
   [12. 13. 14.]
   [15. 16. 17.]]]]
<NDArray 1x2x3x3 @cpu(0)>

In [6]:
bb = b.mean(axis=(0,2,3),keepdims = True)
bb


[[[[ 4.]]

  [[13.]]]]
<NDArray 1x2x1x1 @cpu(0)>

In [7]:
(b - ((b-bb)**2).mean(axis=(0,2,3),keepdims=True))



[[[[-6.6666665 -5.6666665 -4.6666665]
   [-3.6666665 -2.6666665 -1.6666665]
   [-0.6666665  0.3333335  1.3333335]]

  [[ 2.3333335  3.3333335  4.3333335]
   [ 5.3333335  6.3333335  7.3333335]
   [ 8.333334   9.333334  10.333334 ]]]]
<NDArray 1x2x3x3 @cpu(0)>

In [51]:
def batch_norm(X, gamma, beta, is_training, moving_mean, moving_variance,
               eps = 1e-5, moving_momentum = 0.9):
    assert len(X.shape) in (2, 4)
    # 全连接: batch_size x feature
    if len(X.shape) == 2:
        # 每个输入维度在样本上的平均和方差
        mean = X.mean(axis=0)
        variance = ((X - mean)**2).mean(axis=0)
    # 2D卷积: batch_size x channel x height x width
    else:
        # 对每个通道算均值和方差，需要保持4D形状使得可以正确的广播
        mean = X.mean(axis=(0,2,3), keepdims=True)
        variance = ((X - mean)**2).mean(axis=(0,2,3), keepdims=True)
        # 变形使得可以正确的广播
        moving_mean = moving_mean.reshape(mean.shape)
        moving_variance = moving_variance.reshape(mean.shape)

    # 均一化
    if is_training:
        X_hat = (X - mean) / nd.sqrt(variance + eps)
        #!!! 更新全局的均值和方差
        moving_mean[:] = moving_momentum * moving_mean + (
            1.0 - moving_momentum) * mean
        moving_variance[:] = moving_momentum * moving_variance + (
            1.0 - moving_momentum) * variance
    else:
        #!!! 测试阶段使用全局的均值和方差
        X_hat = (X - moving_mean) / nd.sqrt(moving_variance + eps)

    # 拉升和偏移
    return gamma.reshape(mean.shape) * X_hat + beta.reshape(mean.shape)

In [52]:
import utils1
import sys
sys.path.append('.')
ctx = utils1.try_gpu()
ctx

gpu(0)

In [53]:
weight_scale = .01

# 输出通道 = 20, 卷积核 = (5,5)
c1 = 20
W1 = nd.random.normal(shape=(c1,1,5,5), scale=weight_scale, ctx=ctx)
b1 = nd.zeros(c1, ctx)

# 第1层批量归一化
gamma1 = nd.random.normal(shape=c1, scale=weight_scale, ctx=ctx)
beta1 = nd.random.normal(shape=c1, scale=weight_scale, ctx=ctx)
moving_mean1 = nd.zeros(c1, ctx=ctx)
moving_variance1 = nd.zeros(c1, ctx=ctx)

# 输出通道 = 50, 卷积核 = (3,3)
c2 = 50
W2 = nd.random_normal(shape=(c2,c1,3,3), scale=weight_scale, ctx=ctx)
b2 = nd.zeros(c2, ctx=ctx)

# 第2层批量归一化
gamma2 = nd.random.normal(shape=c2, scale=weight_scale, ctx=ctx)
beta2 = nd.random.normal(shape=c2, scale=weight_scale, ctx=ctx)
moving_mean2 = nd.zeros(c2, ctx=ctx)
moving_variance2 = nd.zeros(c2, ctx=ctx)

# 输出维度 = 128
o3 = 128
W3 = nd.random.normal(shape=(1250, o3), scale=weight_scale, ctx=ctx)
b3 = nd.zeros(o3, ctx=ctx)

# 输出维度 = 10
W4 = nd.random_normal(shape=(W3.shape[1], 10), scale=weight_scale, ctx=ctx)
b4 = nd.zeros(W4.shape[1], ctx=ctx)

# 注意这里moving_*是不需要更新的
params = [W1, b1, gamma1, beta1,
          W2, b2, gamma2, beta2,
          W3, b3, W4, b4]

for param in params:
    param.attach_grad()

In [54]:
def net(X, is_training=False,verbose=False):
    X = X.as_in_context(w1.context)
    #conv1
    h1_conv = nd.Convolution(data=X,weight=w1,bias=b1,
                             kernel=w1.shape[2:],num_filter = c1)
    h1_bn = batch_norm(h1_conv,gamma1,beta1,is_training,
                       moving_mean1,moving_var1)
    h1_activation = nd.relu(h1_bn)
    h1 = nd.Pooling(data=h1_activation, pool_type="max",
                    kernel=(2,2),stride=(2,2))
    #conv2
    h2_conv = nd.Convolution(data=h1, weight=w2, bias=b2, 
                             kernel=w2.shape[2:],num_filter = c2)
    h2_bn = batch_norm(h2_conv,gamma2,beta2,is_training,
                       moving_mean2,moving_var2)
    h2_activation = nd.relu(h2_bn)
    h2 = nd.Pooling(data=h2_activation,pool_type="max",
                   kernel=(2,2),stride=(2,2))
    #fc1
    h3_linear = nd.dot(h2,w3) + b3
    h3 = nd.relu(h3_linear)
    #fc2
    h4_linear = nd.dot(h3,w4) + b4
    h4 = nd.relu(h4_linear)
    if verbose:
        print('1st conv block:', h1.shape)
        print('2nd conv block:', h2.shape)
        print('1st dense:', h3.shape)
        print('2nd dense:', h4_linear.shape)
        print('output:', h4_linear)
    return h4_linear

In [55]:

from mxnet import autograd
from mxnet import gluon

batch_size = 256
train_data, test_data = utils1.load_data_fashion_mnist(batch_size)

softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()

learning_rate = 0.2

for epoch in range(5):
    train_loss = 0.
    train_acc = 0.
    for data, label in train_data:
        label = label.as_in_context(ctx)
        with autograd.record():
            output = net(data, is_training=True)
            loss = softmax_cross_entropy(output, label)
        loss.backward()
        utils1.SGD(params, learning_rate/batch_size)

        train_loss += nd.mean(loss).asscalar()
        train_acc += utils.accuracy(output, label)

    test_acc = utils.evaluate_accuracy(test_data, net, ctx)
    print("Epoch %d. Loss: %f, Train acc %f, Test acc %f" % (
        epoch, train_loss / len(train_data), train_acc / len(train_data), test_acc))

  label = np.fromstring(fin.read(), dtype=np.uint8).astype(np.int32)
  data = np.fromstring(fin.read(), dtype=np.uint8)


MXNetError: [17:53:02] src/imperative/./imperative_utils.h:55: Check failed: inputs[i]->ctx().dev_mask() == ctx.dev_mask() (2 vs. 1) Operator Convolution require all inputs live on the same context. But the first argument is on cpu(0) while the 3-th argument is on gpu(0)

Stack trace returned 10 entries:
[bt] (0) /home/wk/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x28965c) [0x7f93a10e465c]
[bt] (1) /home/wk/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x24cf87c) [0x7f93a332a87c]
[bt] (2) /home/wk/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x24c1bce) [0x7f93a331cbce]
[bt] (3) /home/wk/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x24026bb) [0x7f93a325d6bb]
[bt] (4) /home/wk/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(MXImperativeInvokeEx+0x63) [0x7f93a325dbf3]
[bt] (5) /home/wk/anaconda3/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) [0x7f93ede11ec0]
[bt] (6) /home/wk/anaconda3/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call+0x22d) [0x7f93ede1187d]
[bt] (7) /home/wk/anaconda3/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(_ctypes_callproc+0x2ce) [0x7f93ee02682e]
[bt] (8) /home/wk/anaconda3/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(+0x12265) [0x7f93ee027265]
[bt] (9) /home/wk/anaconda3/bin/python(_PyObject_FastCallDict+0x8b) [0x7f93f45ed54b]
