In [3]:
# access and modify parameters in neural network
# ----------------------------------------------
# a. access target layer parameters
# b. access all layers' parameters
# c. access parameters of complicated (nested) block layers

import torch
from torch import nn

# Small MLP: 4 input features -> 8 hidden units (ReLU) -> 1 output.
net = nn.Sequential(
    nn.Linear(in_features=4, out_features=8),
    nn.ReLU(),
    nn.Linear(in_features=8, out_features=1),
)
# Random mini-batch: 2 samples with 4 features each.
X = torch.rand((2, 4))
# Show the input first, then the forward-pass output, one per line.
print(X, net(X), sep='\n')

tensor([[0.7380, 0.8025, 0.0534, 0.8484],
        [0.0476, 0.0154, 0.0187, 0.8408]])
tensor([[-0.1628],
        [ 0.1541]], grad_fn=<AddmmBackward0>)


In [18]:
# a.1. print target layer parameters:
# an nn.Sequential can be indexed like a list, so net[2] is its third module
#   state_dict() -> ordered mapping from parameter name to its tensor
#   parameters() -> a generator over the parameters, so printing it shows only
#                   the generator object, not the values
print(net[2].state_dict(), net[2].parameters(), sep='\n')

OrderedDict([('weight', tensor([[ 0.1779,  0.1888,  0.1514,  0.1997,  0.3381,  0.0964,  0.0334, -0.3471]])), ('bias', tensor([-0.0466]))])
<generator object Module.parameters at 0x0000024BE3F8EAB0>


In [19]:
# a.2. access target layer parameters:
# net[2] is the third module (the output layer)
print(type(net[2].bias))                            # the bias is an nn.Parameter
# the Parameter itself (values plus requires_grad), then just its data tensor
print(net[2].bias, net[2].bias.data, sep='\n')
# same pair of views for the weight
print(net[2].weight, net[2].weight.data, sep='\n')

<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([-0.0466], requires_grad=True)
tensor([-0.0466])
Parameter containing:
tensor([[ 0.1779,  0.1888,  0.1514,  0.1997,  0.3381,  0.0964,  0.0334, -0.3471]],
       requires_grad=True)
tensor([[ 0.1779,  0.1888,  0.1514,  0.1997,  0.3381,  0.0964,  0.0334, -0.3471]])


In [7]:
# a.3. access target layer gradients:
# No backward pass has been run in this notebook so far, so .grad is still None.
# Test identity with `is None`: `== None` would be dispatched to the tensor's
# own comparison operator once a gradient tensor exists.
print(net[2].weight.grad is None)
print(net[2].bias.grad is None)

True
True


In [20]:
# b. access all layers' parameters
# named_parameters() already yields (name, parameter) tuples, so the generator
# can be unpacked straight into print() without building a list first.
print(*net[0].named_parameters())  # a single layer: 'weight', 'bias'
print(*net.named_parameters())     # the whole network: '0.weight', '0.bias', ...

('weight', Parameter containing:
tensor([[-0.3416,  0.1208, -0.4908,  0.3961],
        [-0.2082, -0.4059, -0.3134,  0.0500],
        [-0.2749, -0.1384,  0.4878,  0.1137],
        [ 0.3319,  0.1241, -0.0963, -0.0970],
        [-0.0232,  0.0821,  0.3725,  0.0660],
        [-0.3434,  0.2402,  0.3705,  0.3277],
        [-0.4538, -0.3022,  0.2685, -0.4972],
        [ 0.4379,  0.3505,  0.0551, -0.1680]], requires_grad=True)) ('bias', Parameter containing:
tensor([-0.1668,  0.4347,  0.1727, -0.1870,  0.1206, -0.3694, -0.2640,  0.1490],
       requires_grad=True))
('0.weight', Parameter containing:
tensor([[-0.3416,  0.1208, -0.4908,  0.3961],
        [-0.2082, -0.4059, -0.3134,  0.0500],
        [-0.2749, -0.1384,  0.4878,  0.1137],
        [ 0.3319,  0.1241, -0.0963, -0.0970],
        [-0.0232,  0.0821,  0.3725,  0.0660],
        [-0.3434,  0.2402,  0.3705,  0.3277],
        [-0.4538, -0.3022,  0.2685, -0.4972],
        [ 0.4379,  0.3505,  0.0551, -0.1680]], requires_grad=True)) ('0.bias', P

In [16]:
# c. block parameters
def block1():
    """Build one MLP block: Linear(4, 8) -> ReLU -> Linear(8, 4) -> ReLU.

    Returns:
        A newly constructed nn.Sequential; each call creates fresh layers.
    """
    layers = [nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 4), nn.ReLU()]
    return nn.Sequential(*layers)

def block2():
    """Stack four independent block1() instances inside one nn.Sequential.

    The children are registered under the names 'block 0' ... 'block 3'.

    Returns:
        The nn.Sequential containing the four nested blocks.
    """
    stacked = nn.Sequential()
    for idx in range(4):
        # nest a fresh block here
        stacked.add_module(f'block {idx}', block1())
    return stacked

# Put the nested block stack in front of a final Linear(4, 1) output layer.
rgnet = nn.Sequential(block2(), nn.Linear(4, 1))
print(rgnet)  # the printed repr shows how the nested blocks are laid out

# access the parameters of any part of the blocks by indexing one level at a time:
# rgnet[0] -> the block2() stack, [1] -> its second block, [0] -> that block's first Linear
print('network[0][1][0] bias:\n', rgnet[0][1][0].bias.data)

Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)
network[0][1][0] bias:
 tensor([-0.0535,  0.3259,  0.0235,  0.1311, -0.4188, -0.3510, -0.0011, -0.0601])


In [26]:
# initialization
# --------------

# built-in initialization
def init_params(m):
    """Initialize an nn.Linear in place: weights ~ N(0, 1), bias set to zero.

    Meant to be passed to ``Module.apply``; any module that is not exactly an
    nn.Linear is left untouched.

    Args:
        m: the module to (possibly) initialize.
    """
    # Exact type match (not isinstance), so subclasses of nn.Linear are still
    # skipped exactly as before.
    if type(m) is nn.Linear:
        nn.init.normal_(m.weight, mean=0.0, std=1.0)
        # nn.Linear(..., bias=False) has bias set to None; zeros_(None) would raise.
        if m.bias is not None:
            nn.init.zeros_(m.bias)

# Module.apply runs init_params over the modules of net; only the nn.Linear
# layers are changed by it.
net.apply(init_params)
# First layer after initialization: normally distributed weights, all-zero bias.
print('normal and zero:\n', net[0].weight.data, net[0].bias.data)

def init_xavier(m):
    """Apply Xavier-uniform initialization (gain 1.0) to an nn.Linear's weight.

    The bias is not modified, and modules of any other type are ignored.

    Args:
        m: the module to (possibly) initialize in place.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.xavier_uniform_(m.weight, gain=1.0)
def init_42(m):
    """Fill the weight of an nn.Linear with the constant 42.

    The bias is not modified, and modules of any other type are ignored.

    Args:
        m: the module to (possibly) initialize in place.
    """
    is_plain_linear = type(m) is nn.Linear
    if is_plain_linear:
        nn.init.constant_(m.weight, val=42)

# apply() also works on a single layer, so each layer can get its own initializer.
net[0].apply(init_xavier)
net[2].apply(init_42)
# Neither initializer touches the bias, so it keeps the zeros set by init_params.
# Parameters can be indexed like a list ([n]); because .data is not used here,
# the printed slices carry a grad_fn.
print('xavier and zero:\n', net[0].weight[0], net[0].bias[0])  # first row of the weight, first bias entry
print('constants and zero:\n', net[2].weight[0], net[2].bias[0])

normal and zero:
 tensor([[ 0.4853,  0.7469,  1.0303,  0.7938],
        [-0.9744, -0.2733,  1.3704,  0.2528],
        [ 0.3121, -1.1431, -0.0885,  0.6332],
        [-0.6108,  1.3921,  1.0519, -1.4050],
        [-0.0691, -0.6566,  0.2556,  0.1244],
        [-0.6161,  0.5901, -3.3487,  1.2075],
        [-0.4881,  1.7317,  0.8372,  0.7963],
        [ 0.8471, -1.2255, -0.5072,  0.1774]]) tensor([0., 0., 0., 0., 0., 0., 0., 0.])
xavier and zero:
 tensor([ 0.4672,  0.0905, -0.2210,  0.4433], grad_fn=<SelectBackward0>) tensor(0., grad_fn=<SelectBackward0>)
constants and zero:
 tensor([42., 42., 42., 42., 42., 42., 42., 42.], grad_fn=<SelectBackward0>) tensor(0., grad_fn=<SelectBackward0>)


In [27]:
# custom initialization
def init_custom(m):
    """Custom initializer for nn.Linear weights.

    Draws every weight from U(-10, 10), then zeroes the entries whose absolute
    value is not greater than 5. The bias is not modified, and modules of any
    other type are ignored.

    Args:
        m: the module to (possibly) initialize in place.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.uniform_(m.weight, a=-10, b=10)
    # Multiplying by the boolean mask keeps large-magnitude entries and turns
    # the rest into (signed) zeros.
    keep_mask = m.weight.data.abs() > 5
    m.weight.data *= keep_mask
# Re-initialize the Linear layers of net with the custom rule.
net.apply(init_custom)
# First two rows of the first layer's weight; entries that did not pass the
# magnitude threshold show up as zeros.
print(net[0].weight[:2])

# shared parameters' initialization
# The same nn.Linear instance is placed at two positions (indices 2 and 4).
shared = nn.Linear(8, 8)
net = nn.Sequential(
    nn.Linear(4, 8), nn.ReLU(),
    shared, nn.ReLU(),
    shared, nn.ReLU(),
    nn.Linear(8, 1),
)
net(X)
# check whether reusing one layer object also shares its parameters
print(net[2].weight.data[0] == net[4].weight.data[0])  # yes
net[2].weight.data[0, 0] = 100
# still equal after modifying only net[2], which means both positions use the
# very same parameters rather than copies
print(net[2].weight.data[0] == net[4].weight.data[0])

tensor([[-7.7351, -0.0000, -9.6666,  0.0000],
        [ 0.0000, -0.0000, -0.0000, -5.6922]], grad_fn=<SliceBackward0>)
tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])


In [29]:
# lazy initialization
import torch
from torch import nn
from d2l import torch as d2l

# LazyLinear: only the output width is given when the layer is constructed
net = nn.Sequential(
    nn.LazyLinear(256),
    nn.ReLU(),
    nn.LazyLinear(10),
)
# UninitializedParameter means the network has not yet received an input from
# which the weight shape could be determined.
print(net[0].weight)

X = torch.rand((2, 20))
print(net(X))
# After the first forward pass the weight has a concrete shape that matches
# the feature dimension of X.
print(net[0].weight.shape)

# custom way of lazy initialization
@d2l.add_to_class(d2l.Module)  #@save
def apply_init(self, inputs, init=None):
    """Run one forward pass on `inputs`, then optionally apply `init`.

    Args:
        inputs: iterable of positional arguments unpacked into self.forward.
        init: optional callable handed to self.net.apply; skipped when None.
    """
    # Forward pass first -- presumably so that lazily-shaped parameters exist
    # before the initializer runs (TODO confirm against d2l.Module).
    self.forward(*inputs)
    if init is None:
        return
    self.net.apply(init)

<UninitializedParameter>
tensor([[-0.0591,  0.2954,  0.1913, -0.0738, -0.3221, -0.1195,  0.1166, -0.0271,
         -0.2651,  0.3326],
        [-0.0241,  0.3241,  0.1102, -0.2235, -0.3253, -0.1758,  0.0292,  0.0679,
         -0.4163,  0.2853]], grad_fn=<AddmmBackward0>)
torch.Size([256, 20])
