# MSRA初始化
“Xavier” 是一种相对不错的初始化方法，在博文[“深度学习——Xavier初始化方法”](https://blog.csdn.net/shuzfan/article/details/51338178)中有介绍。但是，Xavier推导的时候假设激活函数是线性的，显然我们目前常用的ReLU和PReLU并不满足这一条件。

只考虑输入个数时，[MSRA](https://blog.csdn.net/shuzfan/article/details/51347572) 初始化是一个均值为 `0`、方差为 `2/n` 的高斯分布（其中 `n` 为该层的输入个数）：

$$W \sim \mathcal{N}\left(0,\ \frac{2}{n}\right)$$

In [2]:
import sys
sys.path.append('E:/xinlib')
import numpy as np
import xint
from data import imagex

import mxnet as mx
from mxnet import gluon, autograd, nd, image
from mxnet.gluon import nn
from frameworkx import Residual, Inception, InceptionTranspose
# Training helper from the local `xint` library. The two arguments are
# presumably the dataset name and a run/output tag — confirm against
# xint.Trainer.
T = xint.Trainer('cifar10', 'ker2')

In [29]:
def vgg_block(num_convs, n1_1, n2_1, n2_3, n3_1, n3_5, n4_1):
    """Build a VGG-style block that uses Inception as its basic unit.

    The block consists of `num_convs` Inception units, each constructed with
    the same six width arguments, followed by one max-pooling layer with a
    pool size of 2 and a stride of 2.

    Args:
        num_convs: Number of Inception units placed before the pooling layer.
        n1_1, n2_1, n2_3, n3_1, n3_5, n4_1: Forwarded positionally, in this
            order, to ``Inception``; their meaning is defined by that class.

    Returns:
        An ``nn.HybridSequential`` holding the units and the pooling layer.
    """
    block = nn.HybridSequential()
    widths = (n1_1, n2_1, n2_3, n3_1, n3_5, n4_1)
    units = [Inception(*widths) for _ in range(num_convs)]
    # The pooling layer is created last, after every Inception unit.
    pool = nn.MaxPool2D(pool_size=2, strides=2)
    for layer in units + [pool]:
        block.add(layer)
    return block

def vgg_stack(architecture):
    """Chain one ``vgg_block`` per row of `architecture`.

    Args:
        architecture: Iterable of 7-element rows. Each row is unpacked as
            ``(num_convs, n1_1, n2_1, n2_3, n3_1, n3_5, n4_1)`` and passed to
            ``vgg_block`` in that order.

    Returns:
        An ``nn.HybridSequential`` containing the blocks in row order.
    """
    stack = nn.HybridSequential()
    for row in architecture:
        # Explicit unpacking: a row that does not have exactly seven entries
        # fails right here.
        depth, w1, w2, w3, w4, w5, w6 = row
        stage = vgg_block(depth, w1, w2, w3, w4, w5, w6)
        stack.add(stage)
    return stack

def vgg_block_transpose(num_convs, n1_1, n2_1, n2_3, n3_1, n3_5, n4_1):
    """Build a block of InceptionTranspose units ended by a transposed conv.

    InceptionTranspose is the basic unit: `num_convs` of them are created with
    the same six width arguments, then a single ``nn.Conv2DTranspose`` with
    kernel size 3 and stride 2 is appended.

    Args:
        num_convs: Number of InceptionTranspose units before the final layer.
        n1_1, n2_1, n2_3, n3_1, n3_5, n4_1: Forwarded positionally, in this
            order, to ``InceptionTranspose``; their meaning is defined by
            that class.

    Returns:
        An ``nn.HybridSequential`` holding the units and the transposed
        convolution.
    """
    block = nn.HybridSequential()
    widths = (n1_1, n2_1, n2_3, n3_1, n3_5, n4_1)
    # Channel count of the final layer is the sum of all six arguments.
    # NOTE(review): this sum includes n2_1 and n3_1 as well — confirm that it
    # matches the channel count InceptionTranspose actually produces.
    channels = sum(widths)
    units = [InceptionTranspose(*widths) for _ in range(num_convs)]
    upsample = nn.Conv2DTranspose(channels, kernel_size=3, strides=2)
    for layer in units + [upsample]:
        block.add(layer)
    return block

def vgg_stack_transpose(architecture):
    """Chain one ``vgg_block_transpose`` per row of `architecture`.

    Args:
        architecture: Iterable of 7-element rows. Each row is unpacked as
            ``(num_convs, n1_1, n2_1, n2_3, n3_1, n3_5, n4_1)`` and passed to
            ``vgg_block_transpose`` in that order.

    Returns:
        An ``nn.HybridSequential`` containing the blocks in row order.
    """
    stack = nn.HybridSequential()
    for row in architecture:
        # Explicit unpacking: a row that does not have exactly seven entries
        # fails right here.
        depth, w1, w2, w3, w4, w5, w6 = row
        stage = vgg_block_transpose(depth, w1, w2, w3, w4, w5, w6)
        stack.add(stage)
    return stack

class NIN(Residual):
    """Residual block preceded by a stack of Inception-based VGG blocks.

    The input is first passed through ``self.net`` (built by ``vgg_stack``
    from ``self.architecture``); the result then goes through ``bn1``,
    ``conv1``, ``bn2`` and ``conv2``, plus ``conv3`` on the shortcut when
    ``self.same_shape`` is false. Those layers and ``same_shape`` are not
    created in this class; presumably ``Residual.__init__`` sets them up —
    verify against that class.
    """

    def __init__(self, n_classes, same_shape=False, **kwargs):
        """Initialise the parent block and build the feature stack.

        Args:
            n_classes: Passed as the first positional argument to
                ``Residual.__init__``.
            same_shape: Passed as the second positional argument to
                ``Residual.__init__``.
            **kwargs: Forwarded unchanged to ``Residual.__init__``.
        """
        super().__init__(n_classes, same_shape, **kwargs)
        # Each row is (num_convs, n1_1, n2_1, n2_3, n3_1, n3_5, n4_1), the
        # layout that vgg_stack unpacks.
        self.architecture = (
            [2, 64, 96, 128, 16, 32, 32],
            [2, 128, 128, 192, 32, 96, 64],
        )
        # The name scope is opened on the outermost Sequential.
        self.net = nn.HybridSequential()
        with self.net.name_scope():
            self.net.add(vgg_stack(self.architecture))

    def hybrid_forward(self, F, x):
        """Run the feature stack, then the two BN-ReLU-conv stages and shortcut.

        Args:
            F: Operator namespace supplied by Gluon; only ``F.relu`` is used.
            x: Input fed to ``self.net``.

        Returns:
            ``F.relu`` of the main branch plus the shortcut branch.
        """
        features = self.net(x)

        # Main branch: BN -> ReLU -> conv, applied twice.
        branch = self.bn1(features)
        branch = self.conv1(F.relu(branch))
        branch = self.bn2(branch)
        branch = self.conv2(F.relu(branch))

        # Shortcut: used as-is when same_shape is true, otherwise sent
        # through conv3.
        shortcut = features if self.same_shape else self.conv3(features)
        return F.relu(branch + shortcut)
    
# Both counts are 10; n_out is not referenced by the code in this section.
n_out = n_classes = 10
nin = NIN(n_classes)

# Full model: the NIN block, then global average pooling, then flattening.
net = nn.HybridSequential()
with net.name_scope():
    net.add(nin)
    net.add(nn.GlobalAvgPool2D())
    net.add(nn.Flatten())

In [30]:
# Device context returned by the local helper; presumably a GPU when one is
# available — confirm against xint.try_gpu.
ctx = xint.try_gpu()
# Alternative initialisations that were tried and are left disabled:
#net.initialize(ctx=ctx, init=mx.init.Xavier())
# net.initialize(ctx=ctx)
#net.collect_params().initialize(mx.init.Normal(sigma=1), force_reinit=True, ctx=ctx)

# Active choice: initialise the parameters with MXNet's MSRAPrelu initializer.
net.initialize(ctx=ctx, init=mx.init.MSRAPrelu())
# Hybridization is left disabled.
#net.hybridize()
loss = gluon.loss.SoftmaxCrossEntropyLoss()
# AdaDelta optimizer with rho set explicitly to 0.9999.
trainer = gluon.Trainer(net.collect_params(), 'adadelta', {'rho': 0.9999})
# Disabled alternative optimizer configuration (RMSProp):
#trainer = gluon.Trainer(net.collect_params(), 'rmsprop', {'learning_rate': 0.03, 'gamma1': 0.9})

In [25]:
# Rebinds the module-level T, this time with 'ker' as the second argument.
# NOTE(review): the second argument is presumably a run/output tag — confirm
# against xint.Trainer.
T = xint.Trainer('cifar10', 'ker')

In [31]:
batch_size = 64
# Start training with the model, loss and optimizer defined above.
# NOTE(review): 50 is presumably the number of epochs — confirm against
# xint.Trainer.train.
T.train(net, loss, trainer, 50, batch_size)

No image augmentation  used on the gpu(0)


KeyboardInterrupt: 

# [【CV知识学习】神经网络梯度与归一化问题总结+highway network、ResNet的思考](http://www.cnblogs.com/jie-dcai/p/5803220.html)

可以看到，当网络加深时，训练误差反而上升了；而加入 highway 之后，这个问题得到了缓解。一般来说，深度网络训练困难是由于梯度回流受阻：浅层网络可能无法得到有效的调整；我个人猜测的另一个原因是，回流的信息经过多层网络之后已经变形，很可能出现了类似 internal covariate shift 的问题。Highway Network 受 LSTM 启发，增加了一个门函数，让网络的输出由两部分组成：网络的直接输入，以及输入经过变换后的部分。

In [32]:
from mxnet.gluon.model_zoo import vision as models

In [None]:
# Instantiate the model-zoo Inception v3 network. Assigning the bare class
# (without the call) would leave `net` bound to a class object rather than a
# network whose layers can be printed or run.
net = models.Inception3()

Inception3(
  (features): HybridSequential(
    (0): HybridSequential(
      (0): Conv2D(None -> 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
      (1): BatchNorm(axis=1, eps=0.001, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=None)
      (2): Activation(relu)
    )
    (1): HybridSequential(
      (0): Conv2D(None -> 32, kernel_size=(3, 3), stride=(1, 1), bias=False)
      (1): BatchNorm(axis=1, eps=0.001, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=None)
      (2): Activation(relu)
    )
    (2): HybridSequential(
      (0): Conv2D(None -> 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (1): BatchNorm(axis=1, eps=0.001, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=None)
      (2): Activation(relu)
    )
    (3): MaxPool2D(size=(3, 3), stride=(2, 2), padding=(0, 0), ceil_mode=False)
    (4): HybridSequential(
      (0): Conv2D(None -> 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
   