# MSRA初始化
“Xavier” 是一种相对不错的初始化方法，在博文[“深度学习——Xavier初始化方法”](https://blog.csdn.net/shuzfan/article/details/51338178)中有介绍。但是，Xavier推导的时候假设激活函数是线性的，显然我们目前常用的ReLU和PReLU并不满足这一条件。

只考虑输入个数时，[MSRA](https://blog.csdn.net/shuzfan/article/details/51347572) 初始化是一个均值为 `0`、方差为 `2/n` 的高斯分布（其中 `n` 为该层的输入个数）：

In [2]:
import sys
# Hard-coded local directory added to the import path; presumably this is
# where the project-local modules imported below (xint, imagex, frameworkx)
# live — TODO confirm and adjust per machine.
sys.path.append('E:/xinlib')
import numpy as np
import xint
import imagex

import mxnet as mx
from mxnet import gluon, autograd, nd, image
from mxnet.gluon import nn
from frameworkx import Residual, Inception, InceptionTranspose
# Project-local training helper used by the training cell further down.
# NOTE(review): the meaning of the two string arguments is defined in xint
# and is not visible from this file.
T = xint.Trainer('cifar10', 'ker2')

Using CNTK backend
  args, _, _, _ = getargspec(function_or_class) if isfunction(function_or_class) else getargspec(function_or_class.__init__)


In [16]:
def vgg_block(num_convs, n1_1, n2_1, n2_3, n3_1, n3_5, n4_1):
    """Build a VGG-style stage that uses Inception as its basic unit.

    The stage holds ``num_convs`` Inception blocks, each constructed with the
    same six arguments (``n1_1`` .. ``n4_1``, passed through unchanged), and
    ends with a 2x2 max-pooling layer of stride 2.

    Returns:
        The assembled ``nn.HybridSequential``.
    """
    inception_args = (n1_1, n2_1, n2_3, n3_1, n3_5, n4_1)
    stage = nn.HybridSequential()
    for _ in range(num_convs):
        unit = Inception(*inception_args)
        stage.add(unit)
    pool = nn.MaxPool2D(pool_size=2, strides=2)
    stage.add(pool)
    return stage

def vgg_stack(architecture):
    """Chain one ``vgg_block`` per entry of ``architecture``.

    Each entry must unpack into exactly seven values:
    ``(num_convs, n1_1, n2_1, n2_3, n3_1, n3_5, n4_1)``.

    Returns:
        An ``nn.HybridSequential`` containing the blocks in entry order.
    """
    stack = nn.HybridSequential()
    for spec in architecture:
        num_convs, n1_1, n2_1, n2_3, n3_1, n3_5, n4_1 = spec
        stage = vgg_block(num_convs, n1_1, n2_1, n2_3, n3_1, n3_5, n4_1)
        stack.add(stage)
    return stack

def vgg_block_transpose(num_convs, n1_1, n2_1, n2_3, n3_1, n3_5, n4_1):
    """Build a stage that uses InceptionTranspose as its basic unit.

    The stage holds ``num_convs`` InceptionTranspose blocks, each constructed
    with the same six arguments, and ends with a transposed convolution
    (kernel size 3, stride 2) whose channel count is the sum of those six
    arguments.

    Returns:
        The assembled ``nn.HybridSequential``.
    """
    widths = (n1_1, n2_1, n2_3, n3_1, n3_5, n4_1)
    # Computed up front, before any block is created, as in the original flow.
    channels = sum(widths)
    stage = nn.HybridSequential()
    for _ in range(num_convs):
        unit = InceptionTranspose(*widths)
        stage.add(unit)
    upsample = nn.Conv2DTranspose(channels, kernel_size=3, strides=2)
    stage.add(upsample)
    return stage

def vgg_stack_transpose(architecture):
    """Chain one ``vgg_block_transpose`` per entry of ``architecture``.

    Each entry must unpack into exactly seven values:
    ``(num_convs, n1_1, n2_1, n2_3, n3_1, n3_5, n4_1)``.

    Returns:
        An ``nn.HybridSequential`` containing the blocks in entry order.
    """
    stack = nn.HybridSequential()
    for spec in architecture:
        num_convs, n1_1, n2_1, n2_3, n3_1, n3_5, n4_1 = spec
        stage = vgg_block_transpose(
            num_convs, n1_1, n2_1, n2_3, n3_1, n3_5, n4_1
        )
        stack.add(stage)
    return stack

class NIN(Residual):
    """Residual block preceded by an Inception stack and its transposed stack.

    ``self.net`` runs ``vgg_stack`` followed by ``vgg_stack_transpose``, both
    built from ``self.architecture``. The forward pass then applies
    ``bn1``/``conv1``/``bn2``/``conv2`` (and ``conv3`` on the shortcut when
    ``same_shape`` is false).

    NOTE(review): ``conv1``, ``conv2``, ``conv3``, ``bn1``, ``bn2`` and
    ``same_shape`` are not defined in this class; presumably they are set by
    ``Residual.__init__`` — confirm against frameworkx.
    """

    def __init__(self, n_classes, same_shape=False, **kwargs):
        super().__init__(n_classes, same_shape, **kwargs)
        # One row per stage: (num_convs, n1_1, n2_1, n2_3, n3_1, n3_5, n4_1).
        self.architecture = (
            [2, 64, 96, 128, 16, 32, 32],
            [2, 128, 128, 192, 32, 96, 64],
        )
        # Children are created inside the outermost Sequential's name scope.
        self.net = nn.HybridSequential()
        with self.net.name_scope():
            self.net.add(vgg_stack(self.architecture))
            self.net.add(vgg_stack_transpose(self.architecture))

    def hybrid_forward(self, F, x):
        """Run ``self.net`` and then the pre-activation residual path."""
        x = self.net(x)
        out = self.bn1(x)
        out = self.conv1(F.relu(out))
        out = self.bn2(out)
        out = self.conv2(F.relu(out))
        # The shortcut goes through conv3 unless same_shape is set.
        shortcut = x if self.same_shape else self.conv3(x)
        return F.relu(out + shortcut)
    
n_out = 10
n_classes = 10
# The NIN block is created before entering net's name scope.
nin = NIN(n_classes)
net = nn.HybridSequential()
with net.name_scope():
    # NIN block -> global average pooling -> flatten.
    net.add(nin)
    net.add(nn.GlobalAvgPool2D())
    net.add(nn.Flatten())

In [17]:
# Device to run on, as chosen by the project-local helper.
ctx = xint.try_gpu()
# Alternative initialization variants, kept for reference:
#net.initialize(ctx=ctx, init=mx.init.Xavier())
# net.initialize(ctx=ctx)
#net.collect_params().initialize(mx.init.Normal(sigma=1), force_reinit=True, ctx=ctx)
#net.hybridize()
# Initialize all parameters on ctx with MXNet's MSRAPrelu initializer.
net.initialize(ctx=ctx, init=mx.init.MSRAPrelu())

# Softmax cross-entropy loss, optimized with AdaDelta (rho=0.9999).
loss = gluon.loss.SoftmaxCrossEntropyLoss()
trainer = gluon.Trainer(net.collect_params(), 'adadelta', {'rho': 0.9999})
# Alternative optimizer configuration:
#trainer = gluon.Trainer(net.collect_params(), 'rmsprop', {'learning_rate': 0.03, 'gamma1': 0.9})
#trainer = gluon.Trainer(net.collect_params(), 'rmsprop', {'learning_rate': 0.03, 'gamma1': 0.9})

In [18]:
# Smoke-test the forward pass with a random batch of shape (100, 3, 32, 32).
# The batch is placed on `ctx`, the same context the network was initialized
# on, instead of a hard-coded mx.gpu(0); otherwise this cell fails whenever
# xint.try_gpu() returned anything other than GPU 0.
# NOTE(review): assumes xint.try_gpu() returns a single mx.Context — confirm.
x = nd.random_normal(shape=(100, 3, 32, 32)).as_in_context(ctx)
net(x)


[[ 0.43572974  0.01842342  1.19811356  1.88724983  0.63832462  0.07957593
   0.186563    0.49870616  1.03893507  0.23245613]
 [ 0.45598537  0.0168838   1.20165849  1.81312335  0.66502327  0.07044569
   0.19124216  0.51440835  1.05111003  0.24925552]
 [ 0.44281268  0.01399717  1.17827344  1.82457972  0.68310964  0.07071031
   0.18550892  0.46888238  1.08558524  0.2566174 ]
 [ 0.42326882  0.02013076  1.21423066  1.82286823  0.63406026  0.07462503
   0.19891061  0.47036207  1.03869963  0.24199761]
 [ 0.3828238   0.01104463  1.18613124  1.88961053  0.65691543  0.07851652
   0.18865846  0.47797817  1.06256902  0.25104168]
 [ 0.42719051  0.01636644  1.26936889  1.9372834   0.67934418  0.08098178
   0.20867838  0.47962594  1.07578588  0.26137882]
 [ 0.42646083  0.01904831  1.15397274  1.76943862  0.66254437  0.05828552
   0.19082645  0.46244949  1.03061473  0.22139737]
 [ 0.47132403  0.01518754  1.21396124  1.86598229  0.67295557  0.07046261
   0.18220125  0.46621826  1.08331347  0.25238636]

In [None]:
# Train for one epoch with mini-batches of 64.
# NOTE(review): the meaning of the trailing arguments (True, 32, 28) is
# defined by xint.Trainer.train and is not visible from this file.
num_epochs, batch_size = 1, 64
T.train(net, loss, trainer, num_epochs, batch_size, True, 32, 28)

# [【CV知识学习】神经网络梯度与归一化问题总结+highway network、ResNet的思考](http://www.cnblogs.com/jie-dcai/p/5803220.html)

可以看到，当网络加深时，训练误差反而上升了；而加入 highway 之后，这个问题得到了缓解。一般来说，深度网络训练困难是由于梯度回流受阻，浅层网络可能因此得不到调整；我个人猜测的另一个原因是：回流的信息经过网络之后已经变形了，很可能出现了类似 internal covariate shift 的问题。Highway Network 受 LSTM 启发，增加了一个门函数，让网络的输出由两部分组成：网络的直接输入，以及输入经过变形后的部分。