## 3.1 模型构造

In [1]:
import torch
from torch import nn
class MLP(nn.Module):
    # 声明带有模型参数的层，这里声明了两个全连接层
    def __init__(self, **kwargs):
        # 调用MLP父类Module的构造函数来进行初始化
        # 这样在构造实例时还可以指定其他函数参数
        # 如3.2节将介绍的模型参数params
        super(MLP, self).__init__(**kwargs)
        self.hidden = nn.Linear(784, 256)
        self.act = nn.ReLU()
        self.output = nn.Linear(256, 10)
    # 定义模型的前向计算，即如何根据输入x计算返回所需要的模型输出
    def forward(self, x):
        a = self.act(self.hidden(x))
        return self.output(a)

In [2]:
X = torch.randn(2, 784)
net = MLP()
print(net)
net(X)

MLP(
  (hidden): Linear(in_features=784, out_features=256, bias=True)
  (act): ReLU()
  (output): Linear(in_features=256, out_features=10, bias=True)
)


tensor([[-0.1166,  0.3709,  0.1625, -0.0634,  0.0378, -0.0738, -0.3726,  0.2768,
         -0.0705, -0.1382],
        [-0.0873,  0.2201, -0.0406, -0.0802, -0.0960,  0.2466, -0.3343,  0.0729,
          0.0937, -0.4173]], grad_fn=<AddmmBackward>)

In [3]:
class MySequential(nn.Module):
    from collections import OrderedDict
    def __init__(self, *args):
        super(MySequential, self).__init__()
        # 如果传入的是一个OrderedDict
        if len(args)==1 and isinstance(args[0], OrderedDict):
            for key, module in args[0].items():
                # add_module方法会将module添加进self._modules(一个OrderedDict)
                self.add_module(key, module)
        # 传入的是一些Module
        else:
            for idx, module in enumerate(args):
                self.add_module(str(idx), module)
    def forward(self, input):
        # self._modules返回一个OrderedDict，保证会按照成员添加时的顺序遍历成员
        for module in self._modules.values():
            input = module(input)
        return input

In [4]:
net = MySequential(
    nn.Linear(784, 256), 
    nn.ReLU(), 
    nn.Linear(256, 10)
)
print(net)
net(X)

MySequential(
  (0): Linear(in_features=784, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=10, bias=True)
)


tensor([[-0.1801, -0.2176,  0.0798, -0.1621, -0.0156, -0.0083,  0.2796,  0.2647,
          0.0342,  0.3056],
        [ 0.1345, -0.5100, -0.0496,  0.3354,  0.0183, -0.5262, -0.3022, -0.0921,
          0.2138,  0.2389]], grad_fn=<AddmmBackward>)

In [5]:
class FancyMLP(nn.Module):
    def __init__(self, **kwargs):
        super(FancyMLP, self).__init__(**kwargs)
        # 不可训练参数（常数参数）
        self.rand_weight = torch.rand((20, 20), requires_grad=False)
        self.linear = nn.Linear(20, 20)
    def forward(self, x):
        x = self.linear(x)
        # 使用创建的常数参数，以及nn.functional中的relu函数和mm函数
        x = nn.functional.relu(torch.mm(x, self.rand_weight.data) + 1)
        # 复用全连接层，等价于两个全连接层共享参数
        x = self.linear(x)
        # 控制流，这里我们需要调用item函数来返回标量进行比较
        while x.norm().item() > 1:
            x /= 2
        if x.norm().item() < 0.8:
            x *= 10
        return x.sum()

In [6]:
X = torch.rand(2, 20)
net = FancyMLP()
print(net)
net(X)

FancyMLP(
  (linear): Linear(in_features=20, out_features=20, bias=True)
)


tensor(-2.4393, grad_fn=<SumBackward0>)

In [7]:
class NestMLP(nn.Module):
    def __init__(self, **kwargs):
        super(NestMLP, self).__init__(**kwargs)
        self.net = nn.Sequential(nn.Linear(40, 30), nn.ReLU())
    def forward(self, x):
        return self.net(x)
net = nn.Sequential(NestMLP(), nn.Linear(30, 20), FancyMLP())
X = torch.rand(2, 40)
print(net)
net(X)

Sequential(
  (0): NestMLP(
    (net): Sequential(
      (0): Linear(in_features=40, out_features=30, bias=True)
      (1): ReLU()
    )
  )
  (1): Linear(in_features=30, out_features=20, bias=True)
  (2): FancyMLP(
    (linear): Linear(in_features=20, out_features=20, bias=True)
  )
)


tensor(-4.2659, grad_fn=<SumBackward0>)

## 3.2 模型参数的访问、初始化和共享

In [8]:
net = nn.Sequential(nn.Linear(20, 256), 
                   nn.ReLU(), 
                   nn.Linear(256, 10))
print(net)
X = torch.rand(2, 20)
Y = net(X)

Sequential(
  (0): Linear(in_features=20, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=10, bias=True)
)


In [9]:
for name, param in net[0].named_parameters():
    print(name, param.size(), type(param))

weight torch.Size([256, 20]) <class 'torch.nn.parameter.Parameter'>
bias torch.Size([256]) <class 'torch.nn.parameter.Parameter'>


In [10]:
weight_0 = list(net[0].parameters())[0]
type(weight_0.data[0][0]), weight_0.data.size()

(torch.Tensor, torch.Size([256, 20]))

In [11]:
param = dict(net[0].named_parameters())

In [12]:
type(param['weight']), param['weight'].size()

(torch.nn.parameter.Parameter, torch.Size([256, 20]))

In [13]:
type(param['bias']), param['bias'].size()

(torch.nn.parameter.Parameter, torch.Size([256]))

In [14]:
weight_0.data

tensor([[ 0.1043, -0.1395,  0.0299,  ...,  0.1190,  0.1244,  0.1795],
        [-0.0209, -0.2027,  0.1324,  ...,  0.0212, -0.0165, -0.0586],
        [ 0.1829, -0.0290,  0.0041,  ..., -0.1630,  0.1055, -0.1026],
        ...,
        [-0.1275,  0.2187, -0.0203,  ...,  0.0015, -0.1478, -0.0467],
        [-0.0304,  0.1645, -0.1152,  ..., -0.1894,  0.1787, -0.1475],
        [ 0.1251, -0.0461,  0.1948,  ..., -0.1353,  0.0808,  0.1132]])

In [15]:
print(weight_0.grad)

None


In [16]:
bias_1 = list(net[2].parameters())[1]
bias_1.data

tensor([ 0.0488,  0.0115,  0.0571, -0.0447,  0.0059,  0.0128,  0.0396,  0.0289,
         0.0044,  0.0107])

In [17]:
print(net)

Sequential(
  (0): Linear(in_features=20, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=10, bias=True)
)


In [18]:
from torch.nn import init
for name, param in net.named_parameters():
    if 'weight' in name:
        init.normal_(param, mean=0, std=0.01)
        print(name, param.data)

0.weight tensor([[ 0.0061,  0.0113, -0.0155,  ...,  0.0133, -0.0025, -0.0187],
        [-0.0063,  0.0111,  0.0108,  ..., -0.0122, -0.0177,  0.0070],
        [-0.0170, -0.0057,  0.0131,  ..., -0.0080,  0.0020, -0.0108],
        ...,
        [-0.0081,  0.0010, -0.0115,  ..., -0.0056, -0.0139,  0.0035],
        [ 0.0054,  0.0214, -0.0071,  ...,  0.0031,  0.0036,  0.0052],
        [ 0.0078, -0.0092,  0.0023,  ...,  0.0007, -0.0032,  0.0060]])
2.weight tensor([[ 4.4451e-03,  2.8794e-03,  1.5734e-02,  ..., -3.9530e-03,
          5.8971e-03, -7.4768e-03],
        [-2.2786e-03,  9.6592e-03, -3.5578e-03,  ..., -1.5789e-02,
         -4.1393e-03, -2.8530e-05],
        [ 3.6222e-03, -3.7768e-03, -2.5516e-03,  ...,  9.3914e-03,
         -1.3788e-03, -1.5163e-02],
        ...,
        [ 6.2915e-03, -6.8922e-04,  4.3681e-03,  ..., -1.0601e-02,
          1.3362e-02,  8.1113e-05],
        [-6.7424e-03, -2.1240e-02, -1.2532e-02,  ...,  5.3504e-03,
          1.3282e-04, -1.0524e-02],
        [-1.0599e-02

In [19]:
for name, param in net.named_parameters():
    if 'bias' in name:
        init.constant_(param, val=0)
        print(name, param.data)

0.bias tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0

In [20]:
def xavier(m):
    if type(m)==nn.Linear:
        torch.nn.init.xavier_uniform_(m.weight)
def init_42(m):
    if type(m)==nn.Linear:
        torch.nn.init.constant_(m.weight, 42)
net[0].apply(xavier)
net[2].apply(init_42)
print(net[0].weight.data[0])
print(net[2].weight.data)

tensor([ 0.0490,  0.0922,  0.0252,  0.0345, -0.0375,  0.0889,  0.0686,  0.0595,
         0.0388, -0.0629, -0.0648, -0.0676,  0.0073, -0.0999,  0.0053, -0.0298,
         0.1312, -0.0437, -0.0759,  0.1100])
tensor([[42., 42., 42.,  ..., 42., 42., 42.],
        [42., 42., 42.,  ..., 42., 42., 42.],
        [42., 42., 42.,  ..., 42., 42., 42.],
        ...,
        [42., 42., 42.,  ..., 42., 42., 42.],
        [42., 42., 42.,  ..., 42., 42., 42.],
        [42., 42., 42.,  ..., 42., 42., 42.]])


In [21]:
def init_weight_(tensor):
    with torch.no_grad():
        tensor.uniform_(-10, 10)
        tensor *= (tensor.abs() >= 5).float()
for name, param in net.named_parameters():
    if 'weight' in name:
        init_weight_(param)
        print(name, param.data)

0.weight tensor([[ 0.0000, -0.0000, -9.0268,  ..., -0.0000, -6.2936, -6.7501],
        [ 0.0000, -7.2599, -6.1810,  ..., -0.0000,  0.0000,  0.0000],
        [ 6.3452, -6.5682, -9.5816,  ..., -7.2323,  0.0000,  7.0041],
        ...,
        [-7.9436,  5.7349, -0.0000,  ...,  0.0000,  0.0000, -0.0000],
        [ 0.0000, -0.0000, -0.0000,  ..., -0.0000,  0.0000,  0.0000],
        [ 0.0000,  7.5755,  9.9136,  ...,  7.4412,  0.0000,  0.0000]])
2.weight tensor([[ 0.0000, -0.0000,  7.1459,  ..., -7.5689,  9.1732,  0.0000],
        [ 0.0000,  0.0000, -5.6028,  ..., -5.5531, -9.9678, -9.8330],
        [ 0.0000,  0.0000,  0.0000,  ...,  7.7888,  0.0000, -7.8483],
        ...,
        [ 8.7079, -0.0000,  0.0000,  ..., -0.0000, -9.1407,  0.0000],
        [-7.5157,  0.0000, -0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 7.9777,  0.0000,  0.0000,  ...,  9.2163, -0.0000, -0.0000]])


In [22]:
for name, param in net.named_parameters():
    if 'bias' in name:
        param.data += 1
        print(name, param.data)

0.bias tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1

In [23]:
linear = nn.Linear(1, 1, bias=False)
net = nn.Sequential(linear, linear)
print(net)
for name, param in net.named_parameters():
    init.constant_(param, val=3)
    print(name, param.data)

Sequential(
  (0): Linear(in_features=1, out_features=1, bias=False)
  (1): Linear(in_features=1, out_features=1, bias=False)
)
0.weight tensor([[3.]])


In [24]:
print(id(net[0]) == id(net[1]))
print(id(net[0].weight) == id(net[1].weight))

True
True


In [25]:
x = torch.ones(1, 1)
y = net(x).sum()
print(y)
y.backward()
# 单次梯度是3，两次所以就是6
print(net[0].weight.grad)

tensor(9., grad_fn=<SumBackward0>)
tensor([[6.]])


In [26]:
class CenteredLayer(nn.Module):
    def __init__(self, **kwargs):
        super(CenteredLayer, self).__init__(**kwargs)
    def forward(self, x):
        return x - x.mean()

In [27]:
layer = CenteredLayer()
layer(torch.tensor([1, 2, 3, 4, 5], dtype=torch.float))

tensor([-2., -1.,  0.,  1.,  2.])

In [28]:
net = nn.Sequential(nn.Linear(8, 128), CenteredLayer())

In [29]:
y = net(torch.rand(4, 8))
y.mean().item()

-9.313225746154785e-10

In [30]:
class MyDense(nn.Module):
    def __init__(self):
        super(MyDense, self).__init__()
        self.params = nn.ParameterList([nn.Parameter(torch.randn(4, 4)) for i in range(3)])
        self.params.append(nn.Parameter(torch.randn(4, 1)))
    def forward(self, x):
        for i in range(len(self.params)):
            x = torch.mm(x, self.params[i])
        return x
net = MyDense()
print(net)

MyDense(
  (params): ParameterList(
      (0): Parameter containing: [torch.FloatTensor of size 4x4]
      (1): Parameter containing: [torch.FloatTensor of size 4x4]
      (2): Parameter containing: [torch.FloatTensor of size 4x4]
      (3): Parameter containing: [torch.FloatTensor of size 4x1]
  )
)


In [31]:
class MyDictDense(nn.Module):
    def __init__(self):
        super(MyDictDense, self).__init__()
        self.params = nn.ParameterDict({
            'linear1': nn.Parameter(torch.randn(4, 4)), 
            'linear2': nn.Parameter(torch.randn(4, 1))
        })
        # 新增
        self.params.update({'linear3': nn.Parameter(torch.randn(4, 2))})
    def forward(self, x, choice='linear1'):
        return torch.mm(x, self.params[choice])
net = MyDictDense()
print(net)

MyDictDense(
  (params): ParameterDict(
      (linear1): Parameter containing: [torch.FloatTensor of size 4x4]
      (linear2): Parameter containing: [torch.FloatTensor of size 4x1]
      (linear3): Parameter containing: [torch.FloatTensor of size 4x2]
  )
)


In [32]:
x = torch.ones(1, 4)
print(net(x, 'linear1'))
print(net(x, 'linear2'))
print(net(x, 'linear3'))

tensor([[-2.3688, -0.6960, -0.5341,  2.3085]], grad_fn=<MmBackward>)
tensor([[-1.9920]], grad_fn=<MmBackward>)
tensor([[0.4029, 0.7226]], grad_fn=<MmBackward>)


In [33]:
net = nn.Sequential(
    MyDictDense(), 
    MyDense()
)
print(net)
print(net(x))

Sequential(
  (0): MyDictDense(
    (params): ParameterDict(
        (linear1): Parameter containing: [torch.FloatTensor of size 4x4]
        (linear2): Parameter containing: [torch.FloatTensor of size 4x1]
        (linear3): Parameter containing: [torch.FloatTensor of size 4x2]
    )
  )
  (1): MyDense(
    (params): ParameterList(
        (0): Parameter containing: [torch.FloatTensor of size 4x4]
        (1): Parameter containing: [torch.FloatTensor of size 4x4]
        (2): Parameter containing: [torch.FloatTensor of size 4x4]
        (3): Parameter containing: [torch.FloatTensor of size 4x1]
    )
  )
)
tensor([[3.5523]], grad_fn=<MmBackward>)


In [34]:
x = torch.ones(3)
torch.save(x, 'x.pt')

In [35]:
x2 = torch.load('x.pt')
x2

tensor([1., 1., 1.])

In [36]:
y = torch.zeros(4)
torch.save([x, y], 'xy.pt')
xy_list = torch.load('xy.pt')
xy_list

[tensor([1., 1., 1.]), tensor([0., 0., 0., 0.])]

In [37]:
torch.save({'x': x, 'y': y}, 'xy_dict.pt')
xy = torch.load('xy_dict.pt')
xy

{'x': tensor([1., 1., 1.]), 'y': tensor([0., 0., 0., 0.])}

In [38]:
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.hidden = nn.Linear(3, 2)
        self.act = nn.ReLU()
        self.output = nn.Linear(2, 1)
    def forward(self, x):
        a = self.act(self.hidden(x))
        return self.output(a)
net = MLP()
torch.save(net, 'model.bin')

  "type " + obj.__name__ + ". It won't be checked "


In [39]:
net2 = torch.load('model.bin')

In [40]:
Y2 = net2(x)
Y = net(x)
Y2 == Y

tensor([1], dtype=torch.uint8)

In [41]:
torch.device('cpu'), torch.cuda.device('cuda'), torch.cuda.device('cuda:1')

(device(type='cpu'),
 <torch.cuda.device at 0x7f2cba460e10>,
 <torch.cuda.device at 0x7f2d792a1710>)

In [42]:
x = torch.tensor([1, 2, 3])
x.device

device(type='cpu')

In [45]:
a = torch.tensor([1, 2, 3], device=torch.device('cuda'))
a

tensor([1, 2, 3], device='cuda:0')

In [46]:
B = torch.rand(2, 3, device=torch.device('cuda:1'))
B

tensor([[0.0251, 0.4416, 0.1046],
        [0.5686, 0.7297, 0.8858]], device='cuda:1')

In [48]:
Z = x.cuda(1)
print(x)
print(Z)

tensor([1, 2, 3])
tensor([1, 2, 3], device='cuda:1')


In [49]:
Z + x

RuntimeError: expected backend CUDA and dtype Long but got backend CPU and dtype Long

In [65]:
torch.exp((Z+2).float())*x.float().cuda(1)

tensor([ 20.0855, 109.1963, 445.2395], device='cuda:1')

In [67]:
net = nn.Linear(3, 1)
net.cuda()
net.weight.data.device

device(type='cuda', index=0)

In [73]:
net(Z.view(1, 3).float().cuda())

tensor([[1.4004]], device='cuda:0', grad_fn=<AddmmBackward>)

In [75]:
net.weight.data

tensor([[ 0.2655, -0.1501,  0.5458]], device='cuda:0')