In [1]:
import torch
from torch import nn

# A small MLP: 4 input features -> 8 hidden units (ReLU) -> 1 output
net = nn.Sequential(
    nn.Linear(4, 8),
    nn.ReLU(),
    nn.Linear(8, 1),
)
# A random mini-batch of two examples with four features each
X = torch.rand(2, 4)
net(X)

tensor([[ 0.0139],
        [-0.0007]], grad_fn=<AddmmBackward0>)

In [15]:
# All parameters of the network, keyed by '<layer index>.<parameter name>'
net.state_dict()

OrderedDict([('0.weight',
              tensor([[ 0.2216,  0.3265,  0.1412,  0.3516],
                      [-0.3525,  0.3829, -0.2661, -0.3142],
                      [ 0.2388, -0.0058, -0.1823,  0.1125],
                      [ 0.2487,  0.2992,  0.2207,  0.2781],
                      [ 0.3993,  0.2927, -0.3483,  0.4125],
                      [ 0.4552, -0.2721, -0.2813, -0.2578],
                      [-0.3861,  0.1351, -0.0594, -0.4489],
                      [ 0.0987,  0.2688,  0.1397,  0.3739]])),
             ('0.bias',
              tensor([-0.4767,  0.4710, -0.1095,  0.0568,  0.1350,  0.0746, -0.3600, -0.0675])),
             ('2.weight',
              tensor([[ 0.1536, -0.1802,  0.3496, -0.0966,  0.3245,  0.1018, -0.2024, -0.1008]])),
             ('2.bias', tensor([0.0393]))])

In [5]:
# Each layer exposes its own state_dict; the keys here carry no layer-index prefix
net[0].state_dict(), net[2].state_dict()

(OrderedDict([('weight',
               tensor([[ 0.2216,  0.3265,  0.1412,  0.3516],
                       [-0.3525,  0.3829, -0.2661, -0.3142],
                       [ 0.2388, -0.0058, -0.1823,  0.1125],
                       [ 0.2487,  0.2992,  0.2207,  0.2781],
                       [ 0.3993,  0.2927, -0.3483,  0.4125],
                       [ 0.4552, -0.2721, -0.2813, -0.2578],
                       [-0.3861,  0.1351, -0.0594, -0.4489],
                       [ 0.0987,  0.2688,  0.1397,  0.3739]])),
              ('bias',
               tensor([-0.4767,  0.4710, -0.1095,  0.0568,  0.1350,  0.0746, -0.3600, -0.0675]))]),
 OrderedDict([('weight',
               tensor([[ 0.1536, -0.1802,  0.3496, -0.0966,  0.3245,  0.1018, -0.2024, -0.1008]])),
              ('bias', tensor([0.0393]))]))

In [8]:
# A layer's bias is an nn.Parameter instance
type(net[2].bias)

torch.nn.parameter.Parameter

In [9]:
# The Parameter object itself (tracks gradients) next to the raw tensor it wraps (.data)
net[2].bias, net[2].bias.data

(Parameter containing:
 tensor([0.0393], requires_grad=True),
 tensor([0.0393]))

In [11]:
# No backward pass has been run on this network so far, so the gradient is still None
type(net[2].weight.grad)

NoneType

In [12]:
# Show a (name, shape) pair for every parameter of the first layer
print(*((key, tensor.shape) for key, tensor in net[0].named_parameters()))

('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))


In [13]:
# Show a (name, shape) pair for every parameter of the whole network
print(*((key, tensor.shape) for key, tensor in net.named_parameters()))

('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))


In [14]:
# Another way to reach a parameter: look it up by its name in the state_dict
net.state_dict()['2.bias'].data

tensor([0.0393])

In [16]:
def block1():
    """Build one 4 -> 8 -> 4 MLP block with a ReLU after each linear layer.

    Returns:
        nn.Sequential: a freshly initialised block; every call creates new,
        independent parameters.
    """
    layers = [
        nn.Linear(4, 8),
        nn.ReLU(),
        nn.Linear(8, 4),
        nn.ReLU(),
    ]
    return nn.Sequential(*layers)

def block2():
    """Stack four independent ``block1`` sub-networks in one ``nn.Sequential``.

    Each child is registered under the name ``'block <i>'`` for i in 0..3,
    so those names appear in ``print(model)`` and in ``state_dict`` keys.

    Returns:
        nn.Sequential: the container holding the four nested blocks.
    """
    stacked = nn.Sequential()
    child_names = [f'block {idx}' for idx in range(4)]
    for child_name in child_names:
        # Nest a freshly built block here
        stacked.add_module(child_name, block1())
    return stacked

# Combine the nested stack from block2() with a final 4 -> 1 linear layer
rgnet = nn.Sequential(block2(), nn.Linear(4, 1))
rgnet(X)

tensor([[0.4933],
        [0.4933]], grad_fn=<AddmmBackward0>)

In [17]:
# Printing the model shows how the modules are nested and the name of every child
print(rgnet)

Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)


In [20]:
# Keys of nested parameters are the dot-joined path of child names, e.g. '0.block 0.0.weight'
rgnet.state_dict()

OrderedDict([('0.block 0.0.weight',
              tensor([[-0.2623, -0.2728,  0.3414, -0.2960],
                      [ 0.2970,  0.0123,  0.3117, -0.1636],
                      [-0.4141,  0.4542,  0.0652, -0.3922],
                      [ 0.2934,  0.3611, -0.3066, -0.1822],
                      [ 0.0828,  0.1259, -0.4821, -0.3836],
                      [ 0.4870,  0.4986, -0.0606, -0.1171],
                      [-0.0689,  0.0359, -0.3333,  0.2903],
                      [ 0.4719,  0.0272,  0.0932, -0.2046]])),
             ('0.block 0.0.bias',
              tensor([ 0.4610,  0.2921, -0.1457, -0.3237, -0.2987,  0.2898,  0.2954,  0.2206])),
             ('0.block 0.2.weight',
              tensor([[ 0.2650,  0.0945, -0.3423, -0.0866,  0.3205, -0.0406, -0.2645, -0.2238],
                      [-0.1642,  0.2953, -0.0012,  0.1344,  0.3211,  0.1662, -0.1640,  0.1403],
                      [-0.2716,  0.0441, -0.2046,  0.0907,  0.2011,  0.0404,  0.1750, -0.0143],
                      [ 0.

可以像访问数组一样访问嵌套的网络模块

In [21]:
# rgnet[0] is the nested stack, [1] its second child ('block 1'), [0] that block's first Linear layer
rgnet[0][1][0].bias.data

tensor([-0.4143, -0.1830,  0.0185, -0.0303,  0.0978, -0.4395,  0.4984,  0.3374])

参数绑定（参数共享）

In [22]:
# The shared layer needs a name of its own so that its parameters can be referenced
shared = nn.Linear(8, 8)
# The same `shared` module object is placed at positions 2 and 4
net = nn.Sequential(
    nn.Linear(4, 8), nn.ReLU(),
    shared, nn.ReLU(),
    shared, nn.ReLU(),
    nn.Linear(8, 1),
)
net(X)

tensor([[0.1679],
        [0.1747]], grad_fn=<AddmmBackward0>)

In [23]:
# Check whether the parameters of the two layers hold the same values
print(net[2].weight.data[0] == net[4].weight.data[0])

tensor([True, True, True, True, True, True, True, True])


In [27]:
# Read a single entry (row 0, column 0) of the weight matrix at position 2
net[2].weight.data[0,0]

tensor(-0.0559)

In [28]:
# Make sure the two layers are actually the same object, not merely equal in value:
# write through net[2] and confirm the change is visible through net[4].
# (Comparing again without modifying anything would also pass for two independent
# layers that happen to hold copied weights, so it would prove nothing.)
net[2].weight.data[0, 0] = 100
print(net[2].weight.data[0] == net[4].weight.data[0])

tensor([True, True, True, True, True, True, True, True])


由于 net[2] 和 net[4] 共享同一组参数，反向传播时这两层产生的梯度会累加到同一个参数上。

共享参数通常可以节省内存，并在以下方面具有特定的好处：

对于图像识别中的CNN，共享参数使网络能够在图像中的任何位置（而不是仅在某个区域中）查找给定的特征。
对于RNN，它在序列的各个时间步之间共享参数，因此可以很好地泛化到不同序列长度的样本。
对于自动编码器，编码器和解码器共享参数。在具有线性激活的单层自动编码器中，共享权重会在权重矩阵的不同隐藏单元之间强制正交。