In [1]:
import torch
from torch import nn
from torch.nn import functional as F

net = nn.Sequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))

X = torch.rand(2, 20)
net(X)

tensor([[ 0.2427, -0.0005, -0.2458, -0.1095,  0.0162,  0.2877, -0.1015, -0.1225,
         -0.1616, -0.1791],
        [ 0.2828, -0.0549, -0.1405, -0.1782, -0.0882,  0.1786,  0.1114,  0.0106,
         -0.1208, -0.0368]], grad_fn=<AddmmBackward>)

In [2]:
class MLP(nn.Module):
    # 用模型参数声明层。这里，我们声明两个全连接的层
    def __init__(self):
        # 调用MLP的父类Module的构造函数来执行必要的初始化。
        # 这样，在类实例化时也可以指定其他函数参数，例如模型参数params（稍后将介绍）
        super().__init__()
        self.hidden = nn.Linear(20, 256)  # 隐藏层
        self.out = nn.Linear(256, 10)  # 输出层

    # 定义模型的前向传播，即如何根据输入X返回所需的模型输出
    def forward(self, X):
        # 注意，这里我们使用ReLU的函数版本，其在nn.functional模块中定义。
        return self.out(F.relu(self.hidden(X)))

In [3]:
net = MLP()
net(X)

tensor([[-0.0198,  0.0459,  0.1738, -0.0156, -0.4127, -0.0670,  0.1724, -0.2412,
          0.1769,  0.2368],
        [-0.1408,  0.0531,  0.2118, -0.0666, -0.1938, -0.0516,  0.2522, -0.1206,
          0.0828,  0.2390]], grad_fn=<AddmmBackward>)

In [4]:
class MySequential(nn.Module):
    def __init__(self, *args):
        super().__init__()
        for idx, module in enumerate(args):
            # 这里，module是Module子类的一个实例。我们把它保存在'Module'类的成员
            # 变量_modules中。_module的类型是OrderedDict
            self._modules[str(idx)] = module

    def forward(self, X):
        # OrderedDict保证了按照成员添加的顺序遍历它们
        for block in self._modules.values():
            X = block(X)
        return X

In [5]:
net = MySequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))
net(X)

tensor([[-0.2436,  0.0218, -0.0961,  0.0979,  0.1101, -0.1238, -0.1497, -0.2335,
         -0.0102,  0.1970],
        [-0.0772, -0.1552,  0.0198,  0.0034,  0.0354, -0.1417, -0.2012, -0.4164,
          0.0451,  0.1354]], grad_fn=<AddmmBackward>)

In [9]:
# 自定义模块操作更灵活

class FixedHiddenMLP(nn.Module):
    def __init__(self):
        super().__init__()
        # 不计算梯度的随机权重参数。因此其在训练期间保持不变
        self.rand_weight = torch.rand((20, 20), requires_grad=False)
        self.linear = nn.Linear(20, 20)

    def forward(self, X):
        X = self.linear(X)
        # 使用创建的常量参数以及relu和mm函数
        X = F.relu(torch.mm(X, self.rand_weight) + 1)
        # 复用全连接层。这相当于两个全连接层共享参数
        X = self.linear(X)
        # 控制流
        while X.abs().sum() > 1:
            X /= 2
        return X.sum()

In [10]:
net = FixedHiddenMLP()
net(X)

tensor(0.1130, grad_fn=<SumBackward0>)

In [11]:
# 多个模块可以嵌套混合使用

class NestMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(20, 64), nn.ReLU(),
                                 nn.Linear(64, 32), nn.ReLU())
        self.linear = nn.Linear(32, 16)

    def forward(self, X):
        return self.linear(self.net(X))

chimera = nn.Sequential(NestMLP(), nn.Linear(16, 20), FixedHiddenMLP())
chimera(X)

tensor(-0.1164, grad_fn=<SumBackward0>)

In [12]:
# 参数管理

net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1))
X = torch.rand(size=(2, 4))
net(X)

tensor([[0.0893],
        [0.0274]], grad_fn=<AddmmBackward>)

In [13]:
# 该层的所有参数
net[2].state_dict()

OrderedDict([('weight',
              tensor([[ 0.0730,  0.0628,  0.0229, -0.1762,  0.0835, -0.1155,  0.2991,  0.2808]])),
             ('bias', tensor([-0.0406]))])

In [18]:
# 网络所有参数
net.state_dict()

OrderedDict([('0.weight',
              tensor([[ 0.3914, -0.2723,  0.0888, -0.2920],
                      [-0.3500,  0.4990, -0.1258, -0.0966],
                      [-0.1789,  0.2291, -0.3311,  0.2726],
                      [-0.3333, -0.4423,  0.0957,  0.4735],
                      [ 0.2622,  0.1003, -0.1422,  0.2371],
                      [-0.1317, -0.1202, -0.0838,  0.0759],
                      [-0.3861, -0.4981, -0.1332,  0.0391],
                      [ 0.0991, -0.3183, -0.1360,  0.0078]])),
             ('0.bias',
              tensor([ 0.4013, -0.3551, -0.1708,  0.2553,  0.3602, -0.2720, -0.0299,  0.2385])),
             ('2.weight',
              tensor([[ 0.0730,  0.0628,  0.0229, -0.1762,  0.0835, -0.1155,  0.2991,  0.2808]])),
             ('2.bias', tensor([-0.0406]))])

In [14]:
print(type(net[2].bias))
print(net[2].bias)
print(net[2].bias.data)

<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([-0.0406], requires_grad=True)
tensor([-0.0406])


In [15]:
# 未调用反向传播，梯度不会有值
net[2].weight.grad == None

True

In [16]:
# 访问所有参数
print(*[(name, param.shape) for name, param in net[0].named_parameters()])
print(*[(name, param.shape) for name, param in net.named_parameters()])

('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))
('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))


In [17]:
net.state_dict()['2.bias'].data

tensor([-0.0406])

In [19]:
# 动态向nn.Sequential中加入模块

def block1():
    return nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
                         nn.Linear(8, 4), nn.ReLU())

def block2():
    net = nn.Sequential()
    for i in range(4):
        # 在这里嵌套
        net.add_module(f'block {i}', block1())
    return net

rgnet = nn.Sequential(block2(), nn.Linear(4, 1))
rgnet(X)

tensor([[-0.3122],
        [-0.3122]], grad_fn=<AddmmBackward>)

In [20]:
print(rgnet)

Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)


In [21]:
rgnet[0][1][0].bias.data

tensor([-0.1167,  0.3710,  0.0191, -0.3747,  0.4938,  0.1697, -0.1938, -0.2020])

In [23]:
# 参数初始化

def init_normal(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, mean=0, std=0.01)
        nn.init.zeros_(m.bias)
net.apply(init_normal)
net[0].weight.data, net[0].bias.data

(tensor([[-0.0049, -0.0066,  0.0075,  0.0073],
         [-0.0011,  0.0018,  0.0120,  0.0078],
         [ 0.0059, -0.0048,  0.0061,  0.0150],
         [-0.0042,  0.0213, -0.0051, -0.0015],
         [-0.0109,  0.0079,  0.0167, -0.0061],
         [-0.0046,  0.0196,  0.0049, -0.0226],
         [-0.0202,  0.0152, -0.0074,  0.0022],
         [ 0.0078,  0.0028, -0.0125,  0.0017]]),
 tensor([0., 0., 0., 0., 0., 0., 0., 0.]))

In [25]:
# 初始化为常数

def init_constant(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 1)
        nn.init.zeros_(m.bias)
net.apply(init_constant)
net[0].weight.data, net[0].bias.data

(tensor([[1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]]),
 tensor([0., 0., 0., 0., 0., 0., 0., 0.]))

In [27]:
# 网络中指定层的单独初始化参数

def init_xavier(m):
    if type(m) == nn.Linear:
        nn.init.xavier_uniform_(m.weight)
def init_42(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 42)

net[0].apply(init_xavier)
net[2].apply(init_42)
print(net[0].weight.data)
print(net[2].weight.data)

tensor([[ 0.3309, -0.0655, -0.1799, -0.0633],
        [ 0.2601,  0.6311,  0.3851, -0.4932],
        [-0.1594,  0.4241, -0.4309,  0.6867],
        [ 0.3079,  0.5221,  0.5911,  0.4392],
        [ 0.6417, -0.1838, -0.3501, -0.4903],
        [-0.6397,  0.4902,  0.0234, -0.1906],
        [-0.0527,  0.3439, -0.5242,  0.5829],
        [ 0.4646,  0.4087,  0.6094, -0.5477]])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


In [29]:
# 自定义初始化规则

def my_init(m):
    if type(m) == nn.Linear:
        print("Init", *[(name, param.shape)
                        for name, param in m.named_parameters()][0])
        nn.init.uniform_(m.weight, -10, 10)
        # 权重元素绝对值<5则置零
        m.weight.data *= m.weight.data.abs() >= 5

net.apply(my_init)
net[0].weight[:2]

Init weight torch.Size([8, 4])
Init weight torch.Size([1, 8])


tensor([[-0.0000, -9.1590,  9.7635, -6.0593],
        [-0.0000, -0.0000, -5.2422, -0.0000]], grad_fn=<SliceBackward>)

In [30]:
# 直接操作参数

net[0].weight.data[:] += 1
net[0].weight.data[0, 0] = 42
net[0].weight.data[0]

tensor([42.0000, -8.1590, 10.7635, -5.0593])

In [31]:
# 网络中不同层可以使用同一个模块的实例

# 我们需要给共享层一个名称，以便可以引用它的参数
shared = nn.Linear(8, 8)
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
                    shared, nn.ReLU(),
                    shared, nn.ReLU(),
                    nn.Linear(8, 1))
net(X)
# 检查参数是否相同
print(net[2].weight.data[0] == net[4].weight.data[0])
net[2].weight.data[0, 0] = 100
# 确保它们实际上是同一个对象，而不只是有相同的值
print(net[2].weight.data[0] == net[4].weight.data[0])

tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])


In [32]:
# 参数延后初始化，只定义网络结构，根据第一次数据通过网络时来初始化参数shape

net = nn.Sequential(nn.LazyLinear(256), nn.ReLU(), nn.LazyLinear(10))
# print(net[0].weight)  # 尚未初始化
print(net)

X = torch.rand(2, 20)
net(X)
print(net)

Sequential(
  (0): LazyLinear(in_features=0, out_features=256, bias=True)
  (1): ReLU()
  (2): LazyLinear(in_features=0, out_features=10, bias=True)
)
Sequential(
  (0): Linear(in_features=20, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=10, bias=True)
)




In [33]:
# 也可以部分层懒加载初始化

net = nn.Sequential(
    nn.Linear(20, 256), nn.ReLU(),
    nn.LazyLinear(128), nn.ReLU(),
    nn.LazyLinear(10)
)
print(net[0].weight)
print(net[2].weight)
net(X)
print(net[2].weight)

Parameter containing:
tensor([[ 0.1842,  0.0873,  0.0748,  ..., -0.0542, -0.0116,  0.1804],
        [-0.0760,  0.0801,  0.1409,  ...,  0.1480,  0.1653,  0.0989],
        [-0.0180,  0.0515, -0.1150,  ..., -0.0843, -0.0087, -0.0167],
        ...,
        [-0.1395, -0.0039,  0.1032,  ..., -0.0524, -0.1442,  0.1947],
        [ 0.2077,  0.0606,  0.1595,  ..., -0.2087,  0.0648,  0.0958],
        [-0.0535,  0.0391, -0.0310,  ..., -0.1945, -0.1404, -0.1751]],
       requires_grad=True)
Uninitialized parameter
Parameter containing:
tensor([[ 0.0455,  0.0405,  0.0437,  ..., -0.0257, -0.0159,  0.0440],
        [-0.0106, -0.0344,  0.0430,  ...,  0.0305, -0.0150,  0.0491],
        [-0.0599,  0.0076, -0.0182,  ...,  0.0189,  0.0352,  0.0554],
        ...,
        [-0.0374,  0.0552, -0.0018,  ..., -0.0215,  0.0163,  0.0550],
        [-0.0027, -0.0379, -0.0472,  ..., -0.0491,  0.0010, -0.0462],
        [ 0.0214, -0.0379, -0.0601,  ..., -0.0131, -0.0385,  0.0087]],
       requires_grad=True)


In [34]:
# 自定义层
# 定义一个现在在深度学习框架中还不存在的层

In [35]:
# 不带参数

class CenteredLayer(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, X):
        return X - X.mean()
    
layer = CenteredLayer()
layer(torch.FloatTensor([1, 2, 3, 4, 5]))

tensor([-2., -1.,  0.,  1.,  2.])

In [37]:
net = nn.Sequential(nn.Linear(8, 128), CenteredLayer())
Y = net(torch.rand(4, 8))
# 均值为0，浮点数近似0
Y.mean()

tensor(5.5879e-09, grad_fn=<MeanBackward0>)

In [38]:
# 携带参数的层

class MyLinear(nn.Module):
    def __init__(self, in_units, units):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(in_units, units))
        self.bias = nn.Parameter(torch.randn(units,))
    def forward(self, X):
        linear = torch.matmul(X, self.weight.data) + self.bias.data
        return F.relu(linear)

In [39]:
linear = MyLinear(5, 3)
linear.weight

Parameter containing:
tensor([[-0.8914,  0.3395, -0.4893],
        [-1.1501,  1.6723, -0.3684],
        [-0.2409, -1.0638,  2.2540],
        [ 1.0139,  0.4018, -0.1868],
        [-0.1129, -0.8322, -1.8574]], requires_grad=True)

In [41]:
# 实例对象直接调用，内部实现中父类会自动在__call__方法中调用子类重载的forward方法
linear(torch.rand(2, 5))

tensor([[0.0000, 0.0000, 0.0000],
        [0.0000, 1.1081, 0.0000]])

In [42]:
net = nn.Sequential(MyLinear(64, 8), MyLinear(8, 1))
net(torch.rand(2, 64))

tensor([[4.9793],
        [0.0000]])

In [45]:
# 读写文件
# 加载和保存张量
from pathlib import Path
save_file = Path(r'../data/temp_save')

x = torch.arange(4)
torch.save(x, save_file.as_posix())

In [46]:
y = torch.zeros(4)
torch.save([x, y], save_file.as_posix())
x2, y2 = torch.load(save_file.as_posix())
(x2, y2)

(tensor([0, 1, 2, 3]), tensor([0., 0., 0., 0.]))

In [47]:
mydict = {'x': x, 'y': y}
torch.save(mydict, save_file.as_posix())
mydict2 = torch.load(save_file.as_posix())
mydict2

{'x': tensor([0, 1, 2, 3]), 'y': tensor([0., 0., 0., 0.])}

In [48]:
# 加载和保存模型参数

class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(20, 256)
        self.output = nn.Linear(256, 10)

    def forward(self, x):
        return self.output(F.relu(self.hidden(x)))

net = MLP()
X = torch.randn(size=(2, 20))
Y = net(X)

In [49]:
torch.save(net.state_dict(), save_file.as_posix())

In [54]:
# 加载模型参数state_dict
clone_MLP = MLP()
clone_MLP.load_state_dict(torch.load(save_file.as_posix()))
clone_MLP.eval()

MLP(
  (hidden): Linear(in_features=20, out_features=256, bias=True)
  (output): Linear(in_features=256, out_features=10, bias=True)
)

In [55]:
Y_clone = clone_MLP(X)
Y_clone == Y

tensor([[True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True]])