层和块

In [1]:
import torch
from torch import nn
from torch.nn import functional as F

net = nn.Sequential(nn.Linear(20,256),nn.ReLU(),nn.Linear(256,20))
X = torch.rand(2,20)
net(X)

tensor([[-0.0095, -0.2507, -0.1657, -0.2215, -0.0809, -0.2350,  0.0215,  0.1097,
          0.1820,  0.0255,  0.1924,  0.0737, -0.0146,  0.0341, -0.1640,  0.1673,
          0.0681,  0.3388, -0.1606, -0.1217],
        [-0.0370, -0.2171, -0.1008, -0.3321, -0.0828, -0.2430,  0.1802,  0.1039,
         -0.0426,  0.2624,  0.1158, -0.0192, -0.0720,  0.1906, -0.1109, -0.0510,
          0.1033,  0.1217, -0.1282, -0.1730]], grad_fn=<AddmmBackward0>)

In [2]:
#自定义块
class MLP(nn.Module):
    def __init__(self):
        super().__init__()        
        self.hidden = nn.Linear(20,256)
        self.out = nn.Linear(256,10)
    
    def forward(self,X):
        '''前向计算'''
        return self.out(F.relu(self.hidden(X)))

In [3]:
net = MLP()
net(X)

tensor([[-0.1587, -0.1480, -0.1731,  0.0841, -0.3282, -0.1045,  0.0688,  0.0384,
         -0.0055,  0.1775],
        [-0.0175, -0.0286, -0.0803,  0.1117, -0.1632, -0.0053,  0.0393, -0.0191,
         -0.0250,  0.0587]], grad_fn=<AddmmBackward0>)

In [4]:
#顺序块
class MySequential(nn.Module):
    def __init__(self,*args):
        super().__init__()
        for layer in args:
            self._modules[layer] = layer
            
    def forward(self,X):
        for layer in self._modules.values():
            X = layer(X)
        return X

net = MySequential(nn.Linear(20,256),nn.ReLU(),nn.Linear(256,10))
net(X)

tensor([[-0.0947, -0.1717,  0.2374,  0.0767,  0.2126,  0.0640, -0.1982, -0.0414,
          0.0391,  0.0869],
        [ 0.0328, -0.2523,  0.3707, -0.0991,  0.3434,  0.1125, -0.1522,  0.0098,
         -0.0897,  0.0461]], grad_fn=<AddmmBackward0>)

In [5]:
class FixedHiddenMLP(nn.Module):
    def __init__(self):
        super().__init__()
        #不计算梯度的随机权重参数，因此其在训练期间保持不变
        self.rand_weight = torch.rand((20,20),requires_grad=True)
        self.linear = nn.Linear(20,20)
        
    def forward(self,X):
        X = self.linear(X)
        #使用创建的常量参数以及relu和mm函数
        X = F.relu(torch.mm(X,self.rand_weight) + 1)
        #复用全连接层，这相当于两个全连接层共享参数
        X = self.linear(X)
        #控制流
        while X.abs().sum() > 1:
            X /= 2
        return X.sum()

In [6]:
net = FixedHiddenMLP()
net(X)

tensor(-0.0872, grad_fn=<SumBackward0>)

In [7]:
#混合搭配各种组合块
class NestMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(20,64),nn.ReLU(),
                                nn.Linear(64,32),nn.ReLU())
        self.linear = nn.Linear(32,16)
        
    def forward(self,X):
        return self.linear(self.net(X))

chimera = nn.Sequential(NestMLP(),nn.Linear(16,20),nn.ReLU())
chimera(X)

tensor([[0.2553, 0.0915, 0.0000, 0.0000, 0.0636, 0.0000, 0.2931, 0.1515, 0.0405,
         0.0000, 0.0000, 0.0000, 0.0225, 0.0000, 0.0000, 0.0627, 0.0000, 0.0000,
         0.0000, 0.2936],
        [0.2807, 0.1030, 0.0000, 0.0000, 0.0680, 0.0000, 0.3305, 0.1605, 0.0277,
         0.0000, 0.0000, 0.0000, 0.0402, 0.0000, 0.0000, 0.0884, 0.0000, 0.0000,
         0.0000, 0.3104]], grad_fn=<ReluBackward0>)

参数管理
--访问参数，用于调试、诊断和可视化
--参数初始化
--在不同模组间共享参数

In [8]:
import torch
from torch import nn

net = nn.Sequential(nn.Linear(4,8),nn.ReLU(),nn.Linear(8,1))
X = torch.rand(size=(2,4))
net(X)

tensor([[0.2582],
        [0.3373]], grad_fn=<AddmmBackward0>)

参数访问

In [9]:
#参数访问
print(net[2].state_dict())

OrderedDict([('weight', tensor([[ 0.0095, -0.2743, -0.0914, -0.0427,  0.0271,  0.2213,  0.2706,  0.2373]])), ('bias', tensor([0.2871]))])


In [10]:
#目标参数
print(type(net[2].bias))
print(net[2].bias)
print(net[2].bias.data)

<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([0.2871], requires_grad=True)
tensor([0.2871])


In [11]:
print(type(net[0].bias))
print(net[0].bias)
print(net[0].bias.data)

<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([ 0.4901, -0.4835,  0.3613, -0.0267, -0.4491,  0.1956, -0.2597, -0.1758],
       requires_grad=True)
tensor([ 0.4901, -0.4835,  0.3613, -0.0267, -0.4491,  0.1956, -0.2597, -0.1758])


In [12]:
#参数是复合的对象，包含值，梯度和额外信息
#上述过程中还未调用反向传播，所以参数的梯度处于初始状态
net[2].weight.grad == None

True

In [13]:
#一次性访问所有参数
print(*[(name,param.shape) for name,param in net[0].named_parameters()])
print(*[(name,param.shape) for name,param in net.named_parameters()])

('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))
('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))


In [14]:
net.state_dict()['2.bias'].data

tensor([0.2871])

In [15]:
#从嵌套块收集参数
def block1():
    return nn.Sequential(nn.Linear(4,8),nn.ReLU(),nn.Linear(8,4),nn.ReLU())

def block2():
    net = nn.Sequential()
    #在这里嵌套
    for i in range(4):
        net.add_module(f'block{i}',block1())
    return net

rgnet = nn.Sequential(block2(),nn.Linear(4,1))
rgnet(X)

tensor([[-0.2222],
        [-0.2219]], grad_fn=<AddmmBackward0>)

In [16]:
print(rgnet)

Sequential(
  (0): Sequential(
    (block0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)


参数初始化

In [17]:
#内置初始化
def init_normal(m):
    '''正态分布初始化'''
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight,mean=0,std=0.01) #均值为0，方差为0.01
        nn.init.zeros_(m.bias)
net.apply(init_normal)
net[0].weight.data[0],net[0].bias.data[0]

(tensor([ 0.0098,  0.0016,  0.0185, -0.0122]), tensor(0.))

In [18]:
 def init_constant(m):
        '''初始化为常数'''
        if type(m) == nn.Linear:
            nn.init.constant_(m.weight,1)
            nn.init.zeros_(m.bias)
net.apply(init_constant)
net[0].weight.data[0],net[0].bias.data[0]

(tensor([1., 1., 1., 1.]), tensor(0.))

In [19]:
#对某些块应用不同的初始化方法
def xavier(m):
    if type(m) == nn.Linear:
        nn.init.xavier_uniform_(m.weight)
        
def init_22(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight,22)

net[0].apply(xavier)
net[2].apply(init_22)
print(net[0].weight.data)
print(net[2].weight.data)

tensor([[ 0.1244,  0.0347, -0.6821,  0.7056],
        [-0.0490, -0.6878,  0.0857, -0.5249],
        [ 0.3420,  0.4452, -0.5306,  0.6095],
        [ 0.4234, -0.2931, -0.6981, -0.4768],
        [ 0.0982,  0.3584,  0.0026,  0.5649],
        [ 0.0263, -0.2315,  0.5587,  0.6546],
        [-0.4286, -0.1850, -0.6635, -0.1223],
        [ 0.1545,  0.6119, -0.4049, -0.4272]])
tensor([[22., 22., 22., 22., 22., 22., 22., 22.]])


In [20]:
#自定义初始化
def my_init(m):
    if type(m) == nn.Linear:
        print(
            "Init",
            *[{name,param.shape} for name,param in m.named_parameters()][0])
        nn.init.uniform_(m.weight,-10,10)
        m.weight.data *= m.weight.data.abs() >= 5

net.apply(my_init)
net[0].weight.data[:2]

Init torch.Size([8, 4]) weight
Init torch.Size([1, 8]) weight


tensor([[ 0.0000,  7.7449,  0.0000, -0.0000],
        [-8.1785,  8.1991, -5.6476, -0.0000]])

In [21]:
#参数绑定
shared = nn.Linear(8,8)
net = nn.Sequential(nn.Linear(4,8),nn.ReLU(),shared,
                    nn.ReLU(),shared,nn.ReLU(),nn.Linear(8,1))
net(X)
#2,4隐藏层共享权重--检查参数是否相同
print(net[2].weight.data[0] == net[4].weight.data[0])
net[2].weight.data[0,0] = 100
#实际上他们是同一个对象
print(net[2].weight.data[0] == net[4].weight.data[0])

tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])


自定义层 

In [22]:
#不带参数的层
import torch
import torch.nn.functional as F
from torch import nn

class CenteredLayer(nn.Module):
    def __init__(self):
        super().__init__()
        
    def forward(self,X):
        return X - X.mean()

In [23]:
#自行提供数据
layer = CenteredLayer()
layer(torch.FloatTensor([1,2,3,4,5]))

tensor([-2., -1.,  0.,  1.,  2.])

In [24]:
#组合到更复杂的模型
net = nn.Sequential(nn.Linear(8,128),CenteredLayer())

In [27]:
Y = net(torch.rand(4,8))
Y.mean()

tensor(-3.7253e-09, grad_fn=<MeanBackward0>)

In [30]:
#带参数的层
class MyLinear(nn.Module):
    def __init__(self,in_units,units):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(in_units,units))
        self.bias = nn.Parameter(torch.randn(units,))
    def forward(self,X):
        linear = torch.matmul(X,self.weight.data) + self.bias.data
        return F.relu(linear)

In [31]:
linear = MyLinear(5,3)
linear.weight

Parameter containing:
tensor([[-0.0399, -1.0321, -1.9512],
        [-0.2620, -0.3568, -0.3937],
        [ 1.1930, -0.8624, -0.1014],
        [-1.0170,  1.5000,  0.0909],
        [-0.7188,  1.8407,  0.0103]], requires_grad=True)

In [32]:
linear.bias

Parameter containing:
tensor([-0.4638, -1.0623,  0.4178], requires_grad=True)

In [33]:
linear(torch.rand(2,5))

tensor([[0., 0., 0.],
        [0., 0., 0.]])

In [38]:
#还可以使用自定义构建模型，就像使用内置的全连接层一样使用自定义层
net = nn.Sequential(MyLinear(64,8),MyLinear(8,1))
net(torch.rand(2,64))

tensor([[4.3124],
        [5.0040]])

读写文件

In [39]:
#加载和保存张量
import torch
from torch import nn
from torch.nn import functional as F

X= torch.arange(4)
torch.save(X,'X-file')

In [41]:
y = torch.load("X-file")
y

tensor([0, 1, 2, 3])

In [43]:
#存储一个张量列表，然后把他们读回内存
y = torch.ones(4)
torch.save([X,y],'X-file')
x2,y2 = torch.load('X-file')
(x2,y2)

(tensor([0, 1, 2, 3]), tensor([1., 1., 1., 1.]))

In [44]:
#写入或读取从字符串映射到张量的字典
mydict = {'X':X,'y':y}
torch.save(mydict,'mydict')
mydict2 = torch.load('mydict')
mydict2

{'X': tensor([0, 1, 2, 3]), 'y': tensor([1., 1., 1., 1.])}

In [46]:
#加载和保存模型参数
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(20,256)
        self.output = nn.Linear(256,10)
        
    def forward(self,X):
        return self.output(F.relu(self.hidden(X)))
    
net = MLP()
X = torch.randn(size=(2,20))
Y = net(X)

In [47]:
Y

tensor([[ 0.1554,  0.1887, -0.1701, -0.2092, -0.6236, -0.0218,  0.1294,  0.1631,
          0.0543,  0.0417],
        [ 0.1640,  0.2294,  0.0881,  0.0970, -0.4318, -0.2058,  0.0543, -0.4190,
          0.1111, -0.0534]], grad_fn=<AddmmBackward0>)

In [48]:
#将模型参数存储，放置于'mlp.params'
torch.save(net.state_dict(),'mlp.params')

In [62]:
#为了恢复模型，实例化一个原始多层感知机的备份，不初始化参数，直接读取备份参数
clone = MLP()
clone.load_state_dict(torch.load('mlp.params'))
clone.eval()

MLP(
  (hidden): Linear(in_features=20, out_features=256, bias=True)
  (output): Linear(in_features=256, out_features=10, bias=True)
)

In [63]:
Y_clone = clone(X)
Y_clone == Y

tensor([[True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True]])

(device(type='cpu'), device(type='cuda'), device(type='cuda', index=1))

0