In [32]:
import torch.nn as nn
import torch
import numpy as np

In [2]:
class MLP(nn.Module):
    def __init__(self,neural_num,layers):
        super(MLP,self).__init__()
        self.linear = nn.ModuleList([nn.Linear(neural_num,neural_num,bias=False) for i in range(layers)])
        self.neural_num = neural_num
    def forward(self,x):
        for (i,linear) in enumerate(self.linear):
            x =linear(x)
        return x
    def initialize(self):
        for m in self.modules():
            if isinstance(m,nn.Linear):
                nn.init.normal_(m.weight.data)

In [3]:
layer_nums = 100
neurAL_nums = 256
batch_size= 16

In [4]:
net = MLP(neural_num=neurAL_nums,layers=layer_nums)

In [5]:
net.initialize()

In [12]:
inputs = torch.randn(batch_size,neurAL_nums)

In [7]:
output = net(inputs)

In [8]:
print(output)

tensor([[nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        ...,
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan]], grad_fn=<MmBackward>)


可以发现数据超出了可表示的范围（要么非常大，要么非常小）  
现在我们观察一下数据什么时候出现了nan

In [27]:
class MLP(nn.Module):
    def __init__(self,neural_num,layers):
        super(MLP,self).__init__()
        self.linear = nn.ModuleList([nn.Linear(neural_num,neural_num,bias=False) for i in range(layers)])
        self.neural_num = neural_num
    def forward(self,x):
        for (i,linear) in enumerate(self.linear):
            x =linear(x)
            print("layers:{}, std:{}".format(i,x.std()))
            if torch.isnan(x.std()):
                print("output is nan in {} layers".format(i))
                break
        return x
    def initialize(self):
        for m in self.modules():
            if isinstance(m,nn.Linear):
                nn.init.normal_(m.weight.data)

In [28]:
net2 = MLP(neurAL_nums,layer_nums)

In [29]:
net2.initialize()

In [30]:
output = net2(inputs)

layers:0, std:16.083585739135742
layers:1, std:261.1745910644531
layers:2, std:4165.44287109375
layers:3, std:67348.359375
layers:4, std:1060944.75
layers:5, std:16473645.0
layers:6, std:268507744.0
layers:7, std:4413952512.0
layers:8, std:68022620160.0
layers:9, std:1075242074112.0
layers:10, std:17344401244160.0
layers:11, std:280017523179520.0
layers:12, std:4513735213318144.0
layers:13, std:7.149678727318733e+16
layers:14, std:1.1240694261408072e+18
layers:15, std:1.816435760185947e+19
layers:16, std:2.926854037534136e+20
layers:17, std:4.750459374395229e+21
layers:18, std:7.5197882000499345e+22
layers:19, std:1.1979136178057828e+24
layers:20, std:1.927403442855524e+25
layers:21, std:3.0146831310132793e+26
layers:22, std:4.7524355611535073e+27
layers:23, std:7.439961362102737e+28
layers:24, std:1.1938434877627082e+30
layers:25, std:1.8699478403222863e+31
layers:26, std:3.011275605592508e+32
layers:27, std:4.7634230544147416e+33
layers:28, std:7.726996420132194e+34
layers:29, std:1.

In [31]:
print(output)

tensor([[        inf, -2.2689e+38,         inf,  ...,  7.6598e+37,
          6.8875e+37, -4.8236e+36],
        [-1.0201e+38, -1.2558e+38, -2.1341e+38,  ...,  9.9613e+37,
          6.0173e+37,        -inf],
        [-2.2543e+38,        -inf, -1.0540e+38,  ...,         inf,
          7.9858e+37, -1.6063e+37],
        ...,
        [ 2.1819e+38,         inf,  2.3744e+38,  ...,        -inf,
          9.7533e+37,  9.7264e+37],
        [-9.7689e+37, -2.3595e+38,         inf,  ...,         inf,
          1.3836e+38,        -inf],
        [       -inf, -1.1059e+38,        -inf,  ...,         inf,
         -1.5182e+38, -1.5844e+38]], grad_fn=<MmBackward>)


可以看到网络权值是一个逐渐变大的过程，在31层就变到了 无法表示的了

$$D(H_1) = sum(1-n) (D(x_i)*D(w_{1i})$$

要使得D不变必须D(w) = 1/n =>std(w) = (1/n)^1/2

In [37]:
class MLP(nn.Module):
    def __init__(self,neural_num,layers):
        super(MLP,self).__init__()
        self.linear = nn.ModuleList([nn.Linear(neural_num,neural_num,bias=False) for i in range(layers)])
        self.neural_num = neural_num
    def forward(self,x):
        for (i,linear) in enumerate(self.linear):
            x =linear(x)
            
            print("layers:{}, std:{}".format(i,x.std()))
            if torch.isnan(x.std()):
                print("output is nan in {} layers".format(i))
                break
        return x
    def initialize(self):
        for m in self.modules():
            if isinstance(m,nn.Linear):
                nn.init.normal_(m.weight.data,std=np.sqrt(1/self.neural_num))

In [38]:
net3 = MLP(neurAL_nums,layer_nums)

In [39]:
net3.initialize()

In [40]:
output = net3(inputs)

layers:0, std:1.0056896209716797
layers:1, std:0.9921356439590454
layers:2, std:1.0027211904525757
layers:3, std:0.9852660298347473
layers:4, std:0.9625363349914551
layers:5, std:0.9498919248580933
layers:6, std:0.9279433488845825
layers:7, std:0.9370994567871094
layers:8, std:0.9171197414398193
layers:9, std:0.9329935908317566
layers:10, std:0.9441112875938416
layers:11, std:0.9663146138191223
layers:12, std:0.9664993286132812
layers:13, std:0.954406201839447
layers:14, std:0.9583677053451538
layers:15, std:0.9696717262268066
layers:16, std:0.9842697978019714
layers:17, std:0.9867438673973083
layers:18, std:0.961632251739502
layers:19, std:0.9636610746383667
layers:20, std:0.9645311832427979
layers:21, std:0.9377244114875793
layers:22, std:0.9541034698486328
layers:23, std:0.9769448637962341
layers:24, std:0.96211177110672
layers:25, std:0.9813463091850281
layers:26, std:0.9384353756904602
layers:27, std:0.9408388137817383
layers:28, std:0.9674381613731384
layers:29, std:0.95398819446

In [41]:
output

tensor([[ 1.2775, -0.5735, -1.9407,  ...,  1.6885,  0.9244, -0.4549],
        [ 1.4726, -0.3046, -1.1130,  ...,  0.3368,  0.3650,  0.5948],
        [ 0.7473, -0.3004,  0.2890,  ...,  0.8805,  1.0801,  0.2106],
        ...,
        [ 1.2075,  0.0306, -0.1541,  ..., -2.4691, -1.0989,  1.5947],
        [ 0.7168,  0.3330, -0.6735,  ...,  0.0168,  0.4514, -0.5064],
        [-0.9210, -0.5308,  1.4379,  ..., -0.5287, -0.4071, -0.3429]],
       grad_fn=<MmBackward>)

我们来看看梯度消失的例子  
我们在线性层后面加上激活函数

In [42]:
class MLP(nn.Module):
    def __init__(self,neural_num,layers):
        super(MLP,self).__init__()
        self.linear = nn.ModuleList([nn.Linear(neural_num,neural_num,bias=False) for i in range(layers)])
        self.neural_num = neural_num
    def forward(self,x):
        for (i,linear) in enumerate(self.linear):
            x =linear(x)
            x = torch.tanh(x)
            print("layers:{}, std:{}".format(i,x.std()))
            if torch.isnan(x.std()):
                print("output is nan in {} layers".format(i))
                break
        return x
    def initialize(self):
        for m in self.modules():
            if isinstance(m,nn.Linear):
                nn.init.normal_(m.weight.data,std=np.sqrt(1/self.neural_num))

In [43]:
net_ = MLP(neurAL_nums,layer_nums)

In [44]:
net_.initialize()

In [45]:
output = net_(inputs)

layers:0, std:0.6284132599830627
layers:1, std:0.4860682189464569
layers:2, std:0.4059627950191498
layers:3, std:0.35384225845336914
layers:4, std:0.3184562027454376
layers:5, std:0.29324835538864136
layers:6, std:0.27135857939720154
layers:7, std:0.2467942088842392
layers:8, std:0.23449578881263733
layers:9, std:0.21716250479221344
layers:10, std:0.2077142596244812
layers:11, std:0.2040049433708191
layers:12, std:0.20062442123889923
layers:13, std:0.1967809498310089
layers:14, std:0.1906975954771042
layers:15, std:0.18707753717899323
layers:16, std:0.18422092497348785
layers:17, std:0.17498335242271423
layers:18, std:0.17414061725139618
layers:19, std:0.1663168966770172
layers:20, std:0.15726089477539062
layers:21, std:0.15164928138256073
layers:22, std:0.14775006473064423
layers:23, std:0.14356309175491333
layers:24, std:0.14003361761569977
layers:25, std:0.13919799029827118
layers:26, std:0.1360105574131012
layers:27, std:0.1325470507144928
layers:28, std:0.13525713980197906
layers:

可以看到方差越来越小，那么权重数据也会越来越小，最终会导致梯度的消失

## 常见的初始化方法

Xavier初始化
方差一致性：保持数据尺度维持在适当范围，通常方差为1  
激活函数：饱和函数，如sigmoid,Tanh

$$n_i*D(w) = 1$$  
$$n_{i+1}*D(w) = 1$$   
权重的方差，输入层神经元个数$$n_i$$  
$$n_{i+1}$$输出层神经元个数

=》$$D(w)=2/n_i+n_{i+1}$$

In [48]:
class MLP(nn.Module):
    def __init__(self,neural_num,layers):
        super(MLP,self).__init__()
        self.linear = nn.ModuleList([nn.Linear(neural_num,neural_num,bias=False) for i in range(layers)])
        self.neural_num = neural_num
    def forward(self,x):
        for (i,linear) in enumerate(self.linear):
            x =linear(x)
            x = torch.tanh(x)
            print("layers:{}, std:{}".format(i,x.std()))
            if torch.isnan(x.std()):
                print("output is nan in {} layers".format(i))
                break
        return x
    def initialize(self):
        for m in self.modules():
            if isinstance(m,nn.Linear):
                a = np.sqrt(6/(self.neural_num+self.neural_num))
                tanh_gain = nn.init.calculate_gain('tanh')
                a*=tanh_gain
                nn.init.uniform(m.weight.data,-a,a)

In [49]:
net_xver = MLP(neurAL_nums,layer_nums)

In [50]:
net_xver.initialize()



In [51]:
output = net_xver(inputs)

layers:0, std:0.7615704536437988
layers:1, std:0.7009486556053162
layers:2, std:0.6695532202720642
layers:3, std:0.6599892973899841
layers:4, std:0.6508842706680298
layers:5, std:0.6599737405776978
layers:6, std:0.6529982089996338
layers:7, std:0.6635789275169373
layers:8, std:0.6567519307136536
layers:9, std:0.6483654975891113
layers:10, std:0.6481558680534363
layers:11, std:0.6499091386795044
layers:12, std:0.6511178612709045
layers:13, std:0.6461487412452698
layers:14, std:0.6440340876579285
layers:15, std:0.651336669921875
layers:16, std:0.656968355178833
layers:17, std:0.6554034948348999
layers:18, std:0.6466189026832581
layers:19, std:0.6502505540847778
layers:20, std:0.6422507166862488
layers:21, std:0.6446830630302429
layers:22, std:0.6496829390525818
layers:23, std:0.6572242975234985
layers:24, std:0.6441609263420105
layers:25, std:0.641246497631073
layers:26, std:0.6543492674827576
layers:27, std:0.6472375392913818
layers:28, std:0.6486672163009644
layers:29, std:0.6516335010

可以发现经过了初始化后权重值分布在0.64-0.65左右  
pytorch里内置函数有相关的函数

In [73]:
class MLP(nn.Module):
    def __init__(self,neural_num,layers):
        super(MLP,self).__init__()
        self.linear = nn.ModuleList([nn.Linear(neural_num,neural_num,bias=False) for i in range(layers)])
        self.neural_num = neural_num
    def forward(self,x):
        for (i,linear) in enumerate(self.linear):
            x =linear(x)
            x= torch.tanh(x)
            print("layers:{}, std:{}".format(i,x.std()))
            if torch.isnan(x.std()):
                print("output is nan in {} layers".format(i))
                break
        return x
    def initialize(self):
        for m in self.modules():
            if isinstance(m,nn.Linear):
                tanh_gain = nn.init.calculate_gain('tanh')
                nn.init.xavier_uniform(m.weight.data,gain=tanh_gain)

In [56]:
net_xver1= MLP(neurAL_nums,layer_nums)
net_xver.initialize()
output = net_xver(inputs)

layers:0, std:0.7612550854682922
layers:1, std:0.6903542280197144
layers:2, std:0.662228524684906
layers:3, std:0.6574481129646301
layers:4, std:0.648288905620575
layers:5, std:0.6484530568122864
layers:6, std:0.6495718359947205
layers:7, std:0.6528733372688293
layers:8, std:0.6482165455818176
layers:9, std:0.6567924618721008
layers:10, std:0.6460853219032288
layers:11, std:0.6438324451446533
layers:12, std:0.6438490748405457
layers:13, std:0.6451663374900818
layers:14, std:0.6498647928237915
layers:15, std:0.6491470336914062
layers:16, std:0.6414947509765625
layers:17, std:0.6499527096748352
layers:18, std:0.6527339220046997
layers:19, std:0.6521665453910828
layers:20, std:0.6528139114379883
layers:21, std:0.6483110785484314
layers:22, std:0.6438817381858826
layers:23, std:0.6520076990127563
layers:24, std:0.6498242020606995
layers:25, std:0.6567388772964478
layers:26, std:0.6621300578117371
layers:27, std:0.6573572754859924
layers:28, std:0.6543906331062317
layers:29, std:0.650356888



这种初始化方法不适用relu

Kaiming初始化
* 方差一致性：保持数据尺度维持在恰当范围，通常方差为1
* 机会函数：ReLU，及其变种

In [75]:
class MLP2(nn.Module):
    def __init__(self,neural_num,layers):
        super(MLP2,self).__init__()
        self.linear = nn.ModuleList([nn.Linear(neural_num,neural_num,bias=False) for i in range(layers)])
        self.neural_num = neural_num
    def forward(self,x):
        for (i,linear) in enumerate(self.linear):
            x =linear(x)
            x= torch.relu(x)
            print("layers:{}, std:{}".format(i,x.std()))
            if torch.isnan(x.std()):
                print("output is nan in {} layers".format(i))
                break
        return x
    def initialize(self):
        for m in self.modules():
            if isinstance(m,nn.Linear):
                nn.init.kaiming_normal(m.weight.data)

In [76]:
net_xver1= MLP2(neurAL_nums,layer_nums)
net_xver1.initialize()
output = net_xver(inputs)



layers:0, std:0.7595686912536621
layers:1, std:0.6903108954429626
layers:2, std:0.6655719876289368
layers:3, std:0.6573150753974915
layers:4, std:0.656446635723114
layers:5, std:0.6504065990447998
layers:6, std:0.6504220366477966
layers:7, std:0.6609078049659729
layers:8, std:0.6630220413208008
layers:9, std:0.6533516645431519
layers:10, std:0.6591590642929077
layers:11, std:0.6552177667617798
layers:12, std:0.6570743322372437
layers:13, std:0.6541009545326233
layers:14, std:0.6521506905555725
layers:15, std:0.6581734418869019
layers:16, std:0.6517672538757324
layers:17, std:0.640683650970459
layers:18, std:0.6404054164886475
layers:19, std:0.6370103359222412
layers:20, std:0.6513935923576355
layers:21, std:0.6522902250289917
layers:22, std:0.6489609479904175
layers:23, std:0.6537424325942993
layers:24, std:0.6475028991699219
layers:25, std:0.6530891060829163
layers:26, std:0.6552497744560242
layers:27, std:0.652744710445404
layers:28, std:0.648811936378479
layers:29, std:0.64106792211

十种初始化方法
* Xavier均匀分布
* Xavier标准正态分布
* Kaiming均匀分布
* Kaiming标准正态分布
* 均匀分布
* 正态分布
* 常数分布
* 正交矩阵初始化
* 单位矩阵初始化
* 稀疏矩阵初始化

nn.init.calculate_gain()的计算

In [79]:
x = torch.randn(100000)

In [80]:
out = torch.tanh(x)

In [81]:
gain = x.std()/out.std()

In [82]:
tanh_gain = nn.init.calculate_gain('tanh')

In [84]:
print('gain:{},tan_gain{}'.format(gain,tanh_gain))

gain:1.5971213579177856,tan_gain1.6666666666666667
