## 4.2 模型参数的访问、初始化和共享

In [1]:
from mxnet import init, nd
from mxnet.gluon import nn

net = nn.Sequential()
net.add(nn.Dense(256, activation='relu'))
net.add(nn.Dense(10))
net.initialize() # 使用默认的初始化方式

X = nd.random_uniform(shape=(2, 20))
Y = net(X) # 前向计算

### 4.2.1 访问模型参数

In [2]:
net[0].params, type(net[0].params)

(dense0_ (
   Parameter dense0_weight (shape=(256, 20), dtype=float32)
   Parameter dense0_bias (shape=(256,), dtype=float32)
 ),
 mxnet.gluon.parameter.ParameterDict)

In [3]:
net[0].params['dense0_weight'], net[0].weight

(Parameter dense0_weight (shape=(256, 20), dtype=float32),
 Parameter dense0_weight (shape=(256, 20), dtype=float32))

In [4]:
net[0].weight.data()


[[ 0.06700657 -0.00369488  0.0418822  ... -0.05517294 -0.01194733
  -0.00369594]
 [-0.03296221 -0.04391347  0.03839272 ...  0.05636378  0.02545484
  -0.007007  ]
 [-0.0196689   0.01582889 -0.00881553 ...  0.01509629 -0.01908049
  -0.02449339]
 ...
 [ 0.00010955  0.0439323  -0.04911506 ...  0.06975312  0.0449558
  -0.03283203]
 [ 0.04106557  0.05671307 -0.00066976 ...  0.06387014 -0.01292654
   0.00974177]
 [ 0.00297424 -0.0281784  -0.06881659 ... -0.04047417  0.00457048
   0.05696651]]
<NDArray 256x20 @cpu(0)>

In [5]:
net[0].weight.grad()


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
<NDArray 256x20 @cpu(0)>

In [6]:
net[1].bias.data()


[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
<NDArray 10 @cpu(0)>

In [7]:
net.collect_params()

sequential0_ (
  Parameter dense0_weight (shape=(256, 20), dtype=float32)
  Parameter dense0_bias (shape=(256,), dtype=float32)
  Parameter dense1_weight (shape=(10, 256), dtype=float32)
  Parameter dense1_bias (shape=(10,), dtype=float32)
)

In [8]:
net.collect_params('.*weight')

sequential0_ (
  Parameter dense0_weight (shape=(256, 20), dtype=float32)
  Parameter dense1_weight (shape=(10, 256), dtype=float32)
)

### 4.2.2 初始化模型参数

In [13]:
# 非首次对模型初始化需要指定force_reinit为真
net.initialize(init=init.Normal(sigma=0.01), force_reinit=True)
net[1].bias.data()


[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
<NDArray 10 @cpu(0)>

In [15]:
net.initialize(init=init.Constant(1), force_reinit=True)
net[1].weight.data()


[[1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]
 ...
 [1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]]
<NDArray 10x256 @cpu(0)>

In [16]:
net[0].weight.initialize(init=init.Xavier(), force_reinit=True)
net[0].weight.data()


[[-0.05409864 -0.08726439 -0.01566513 ... -0.03656729  0.08232553
  -0.07605065]
 [ 0.08716483  0.05442163 -0.04584546 ... -0.00090979  0.02000669
  -0.09666421]
 [ 0.03573087 -0.12235439 -0.00053681 ... -0.06940991 -0.12993081
   0.02227372]
 ...
 [ 0.12988895 -0.09096066 -0.10962926 ...  0.00642923  0.07101448
  -0.03116116]
 [-0.0411308   0.03395168 -0.10716853 ... -0.1137625  -0.01520985
   0.14004415]
 [-0.13609144  0.09102303  0.08592838 ...  0.08537236 -0.03737452
  -0.01966473]]
<NDArray 256x20 @cpu(0)>

### 4.2.3 自定义初始化方法

In [17]:
class MyInit(init.Initializer):
    def _init_weight(self, name, data):
        print("Init", name, data.shape)
        data[:] = nd.random_uniform(low=-10, high=10, shape=data.shape)
        data *= data.abs() >= 5
        
net.initialize(MyInit(), force_reinit=True)
net[0].weight.data()

Init dense0_weight (256, 20)
Init dense1_weight (10, 256)



[[-7.597596   8.738558  -0.        ...  0.         0.         0.       ]
 [-0.        -0.        -0.        ...  0.         6.5372314  0.       ]
 [ 0.        -5.1193514 -9.228855  ...  5.1330748  6.4004498 -0.       ]
 ...
 [-6.693775   5.5894766  6.431896  ... -6.90129    7.337467   8.161442 ]
 [ 0.        -0.         5.381055  ... -5.77409   -6.1074495 -6.5115404]
 [-0.        -0.         9.539856  ... -0.         5.7548885 -6.135581 ]]
<NDArray 256x20 @cpu(0)>

In [18]:
net[0].weight.set_data(net[0].weight.data() + 1)
net[0].weight.data()


[[-6.597596   9.738558   1.        ...  1.         1.         1.       ]
 [ 1.         1.         1.        ...  1.         7.5372314  1.       ]
 [ 1.        -4.1193514 -8.228855  ...  6.1330748  7.4004498  1.       ]
 ...
 [-5.693775   6.5894766  7.431896  ... -5.90129    8.337467   9.161442 ]
 [ 1.         1.         6.381055  ... -4.77409   -5.1074495 -5.5115404]
 [ 1.         1.        10.539856  ...  1.         6.7548885 -5.135581 ]]
<NDArray 256x20 @cpu(0)>

### 4.2.4 共享模型参数

- 如果不同层使用同一份参数，那么他们在前向计算和反向传播时都会共享相同的参数。

In [20]:
net = nn.Sequential()
shared = nn.Dense(8, activation='relu')
net.add(nn.Dense(8, activation='relu'),
        shared,
        nn.Dense(8, activation='relu', params=shared.params),
        nn.Dense(10))
net.initialize()

X = nd.random_uniform(shape=(2, 20))
net(X)

net[1].weight.data() == net[2].weight.data()


[[1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]]
<NDArray 8x8 @cpu(0)>

## 小结

- 有多种方法来访问、初始化和共享模型参数。
- 可以自定义初始化方法。

## 练习

- 查阅有关init模块的MXNet⽂档，了解不同的参数初始化⽅法。

MXNet的init模块的API:https://mxnet.apache.org/api/python/docs/api/optimizer/index.html#

- 尝试在net.initialize()后、 net(X)前访问模型参数，观察模型参数的形状。

In [32]:
net = nn.Sequential()
shared = nn.Dense(8, activation='relu')
net.add(nn.Dense(8, activation='relu'),
        shared,
        nn.Dense(8, activation='relu', params=shared.params),
        nn.Dense(10))
net.initialize()
net[0].weight
# X = nd.random_uniform(shape=(2, 20))
# net(X)

Parameter dense55_weight (shape=(8, 0), dtype=float32)

- 构造⼀个含共享参数层的多层感知机并训练。在训练过程中，观察每⼀层的模型参数和梯度。

In [1]:
# 导入所需的包和模块
import d2lzh as d2l
from mxnet import gluon, init
from mxnet.gluon import loss as gloss, nn

# 定义模型
net = nn.Sequential()
shared = nn.Dense(256, activation='relu')
net.add(nn.Dense(256, activation='relu'),
        shared,
        nn.Dense(256, activation='relu', params=shared.params),
        nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))

# 训练模型
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
loss = gloss.SoftmaxCrossEntropyLoss()
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.5})
num_epochs = 5
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, None, None, trainer)
for i in range(4):
    print(net[i].params, net[i].weight.grad(), net[1].bias.grad())

epoch 1, loss 1.8528, train acc 0.268, test acc 0.659
epoch 2, loss 0.7423, train acc 0.701, test acc 0.783
epoch 3, loss 0.5415, train acc 0.794, test acc 0.834
epoch 4, loss 0.4628, train acc 0.826, test acc 0.842
epoch 5, loss 0.5414, train acc 0.803, test acc 0.839
dense1_ (
  Parameter dense1_weight (shape=(256, 784), dtype=float32)
  Parameter dense1_bias (shape=(256,), dtype=float32)
) 
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
<NDArray 256x784 @cpu(0)> 
[-1.01586655e-01 -3.36172462e-01  8.15797746e-01 -1.70786291e-01
  1.05365917e-01  3.65516782e-01  0.00000000e+00 -4.03391048e-02
  0.00000000e+00 -6.40927494e-01 -4.95604336e-01 -2.51778930e-01
  5.67175709e-02 -4.33350094e-02  2.07868636e-01  9.57349315e-02
 -9.84310880e-02  3.97335917e-01  7.31140822e-02  3.08109432e-01
 -7.86992610e-02  1.57338917e-01  1.08392507e-01  5.28969616e-03
 -5.01695156e-01  8.76252294e-

net[1]和net[2]共享参数，则net[1]和net[2]的weight和bias相同，梯度也相同。