## Chapter 6 Builder's Guide

In [2]:
import torch
from torch import nn
from torch.nn import functional as F

In [4]:
net = nn.Sequential(nn.LazyLinear(256), nn.ReLU(), nn.LazyLinear(10))
X = torch.rand(2, 20)
net(X).shape



torch.Size([2, 10])

In [5]:
class MLP(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.hidden = nn.LazyLinear(256)
        self.out = nn.LazyLinear(10)
        
    def forward(self, X):
        return self.out(F.relu(self.hidden(X)))
    
    

In [7]:
net = MLP()
X = torch.rand(4, 30)
net(X).shape

torch.Size([4, 10])

In [10]:
# Define my own version Sequentical

class MySequential(nn.Module):
    
    def __init__(self, *args):
        super().__init__()
        for idx, module in enumerate(args):
            self.add_module(str(idx), module)
            
    
    def forward(self, X):
        for module in self.children():
            X = module(X)
        return X

In [11]:
net = MySequential(nn.LazyLinear(256), nn.ReLU(), nn.LazyLinear(10))
net(X).shape

torch.Size([4, 10])

In [22]:
# Define FixedHiddenMLP

class FixedHiddenMLP(nn.Module):
    
    def __init__(self):
        super().__init__()
        
        self.rand_weight = torch.rand((30, 20))
        self.linear = nn.LazyLinear(20)
        
    def forward(self, X):
        X = self.linear(X)
        X = F.relu(X @ self.rand_weight + 1)
        
        X = self.linear(X)
        
        while X.abs().sum() > 1:
            X /= 2
        
        return X.sum()
    

In [24]:
net = FixedHiddenMLP()
net(X)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (4x20 and 30x20)

In [27]:

class NestMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.LazyLinear(64), nn.ReLU(),
                                nn.LazyLinear(32), nn.ReLU())
        self.linear = nn.LazyLinear(16)
        
    def forward(self, X):
        return self.linear(self.net(X))



In [28]:
chimera = nn.Sequential(NestMLP(), nn.LazyLinear(20))
chimera(X)

tensor([[-2.5163e-01, -2.2268e-01, -1.3237e-01, -2.6621e-01, -1.1483e-04,
          2.3322e-01, -1.9979e-02,  2.0555e-01,  1.2910e-01,  7.8092e-02,
          3.2890e-01,  1.0668e-01,  8.7630e-02, -1.8570e-01,  1.0008e-01,
          1.2530e-01,  1.1027e-01,  2.1349e-01,  1.0495e-01,  1.4753e-01],
        [-2.7562e-01, -2.3336e-01, -1.2803e-01, -2.4663e-01,  7.9789e-03,
          2.4696e-01, -2.5395e-02,  1.7698e-01,  1.2999e-01,  9.8150e-02,
          3.3663e-01,  1.1383e-01,  6.8412e-02, -2.2596e-01,  1.2068e-01,
          1.3152e-01,  7.6674e-02,  2.2618e-01,  7.1008e-02,  1.6124e-01],
        [-2.4077e-01, -2.1682e-01, -1.5097e-01, -2.6622e-01,  1.1464e-02,
          2.8073e-01, -5.9976e-02,  2.1258e-01,  8.9256e-02,  6.7808e-02,
          3.4041e-01,  9.3593e-02,  1.0705e-01, -1.9967e-01,  9.5986e-02,
          1.3439e-01,  1.0999e-01,  2.0902e-01,  1.2148e-01,  1.6793e-01],
        [-2.7319e-01, -2.3678e-01, -1.1238e-01, -2.4536e-01,  1.2746e-04,
          2.5838e-01, -3.8424e-02, 

In [37]:
# Try the paramter management

net = nn.Sequential(nn.LazyLinear(8), nn.ReLU(), nn.LazyLinear(24), nn.ReLU(), nn.LazyLinear(1))

X = torch.rand(size=(2, 4))

net(X).shape

torch.Size([2, 1])

In [42]:
net[4].state_dict()

OrderedDict([('weight',
              tensor([[-0.0692,  0.1466,  0.1105, -0.0545,  0.0843,  0.1547, -0.0919,  0.1267,
                       -0.0845, -0.1510,  0.1591,  0.1269,  0.0244,  0.0706,  0.1843, -0.0009,
                        0.0536,  0.1240, -0.1626,  0.1561,  0.1958, -0.0400, -0.0165, -0.0105]])),
             ('bias', tensor([-0.1214]))])

In [43]:
[(name, param.shape) for name, param in net.named_parameters()]

[('0.weight', torch.Size([8, 4])),
 ('0.bias', torch.Size([8])),
 ('2.weight', torch.Size([24, 8])),
 ('2.bias', torch.Size([24])),
 ('4.weight', torch.Size([1, 24])),
 ('4.bias', torch.Size([1]))]

In [45]:
# Shared Layer

shared = nn.LazyLinear(8)

net = nn.Sequential(nn.LazyLinear(8), nn.ReLU(), 
                    shared, nn.ReLU(), 
                    shared, nn.ReLU(), 
                    nn.LazyLinear(1))

net(X)

print(net[2].weight.data[0] == net[4].weight.data[0])

net[2].weight.data[0, 0] = 100

print(net[2].weight.data[0] == net[4].weight.data[0])

tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])


In [49]:
# Try to initialize the parameters

net = nn.Sequential(nn.LazyLinear(8), nn.ReLU(),
                nn.LazyLinear(1))

X = torch.rand(size=(100, 4))

net(X).shape

torch.Size([100, 1])

In [51]:
# Initialize all weight parameters as Gaussian random

def init_normal(module):
    if type(module) == nn.Linear:
        nn.init.normal_(module.weight, mean=0, std=0.01)
        nn.init.zeros_(module.bias)
        
net.apply(init_normal)

net[2].weight, net[2].bias

(Parameter containing:
 tensor([[ 0.0068, -0.0014,  0.0037, -0.0002,  0.0206, -0.0163,  0.0103, -0.0114]],
        requires_grad=True),
 Parameter containing:
 tensor([0.], requires_grad=True))

In [52]:
def init_xavier(module):
    if type(module) == nn.Linear:
        nn.init.xavier_uniform_(module.weight)
        
def init_42(module):
    if type(module) == nn.Linear:
        nn.init.constant_(module.weight, 42)
    

In [54]:
net[0].apply(init_xavier)
net[2].apply(init_42)

print(net[0].weight.data[0])
print(net[2].weight.data)

tensor([-0.4842,  0.2634,  0.2337, -0.5954])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


In [56]:
def my_init(module):
    if type(module) == nn.Linear:
        print("Init", *[(name, param.shape)
                        for name, param in module.named_parameters()][0])
        nn.init.uniform_(module.weight, -10, 10)
        module.weight.data *= module.weight.data.abs() >= 5

net.apply(my_init)
net[0].weight[:2]

Init weight torch.Size([8, 4])
Init weight torch.Size([1, 8])


tensor([[-5.6492, -9.6307, -0.0000, -5.0267],
        [-0.0000,  9.0222, -0.0000,  0.0000]], grad_fn=<SliceBackward0>)

In [57]:
net[0].weight.data[:] += 1
net[0].weight.data[0, 0] = 42
net[0].weight.data[0]


tensor([42.0000, -8.6307,  1.0000, -4.0267])

* Chapter 6.5 Custom Layers

In [61]:
import torch
from torch import nn
from torch.nn import functional as F
from d2l import torch as d2l


In [67]:
class CenteredLayer(nn.Module):
    def __init__(self):
        super().__init__()
        
    def forward(self, X):
        return X - X.mean()

In [64]:
layer = CenteredLayer()
layer(torch.tensor([1.0, 2, 3, 4, 5]))

tensor([-2., -1.,  0.,  1.,  2.])

In [68]:
net = nn.Sequential(nn.LazyLinear(128), CenteredLayer())

Y = net(torch.rand(4, 8))

Y.mean()

tensor(1.0245e-08, grad_fn=<MeanBackward0>)

In [69]:
# Layer with parameters

class MyLinear(nn.Module):
    
    def __init__(self, in_units, units):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(in_units, units))
        self.bias = nn.Parameter(torch.randn(units,))
        
    def forward(self, X):
        linear = torch.matmul(X, self.weight.data) + self.bias.data
        return F.relu(linear)
    

In [70]:
linear = MyLinear(5, 3)
linear.weight

Parameter containing:
tensor([[-0.2311, -1.0672,  0.9563],
        [-0.3222, -0.7091,  3.1810],
        [-0.2466,  0.2823,  1.1680],
        [-0.3492, -0.9549, -0.4932],
        [-0.4948,  0.8325, -1.7294]], requires_grad=True)

In [71]:
linear(torch.rand(2, 5))

tensor([[0.0000, 0.0000, 2.0850],
        [0.0000, 0.0000, 0.0000]])

In [74]:
net = nn.Sequential(MyLinear(64, 8), MyLinear(8, 1))
net(torch.rand(2, 64))

tensor([[2.7982],
        [2.9903]])

* Chapter 6.6 File I/O

In [75]:
# Loading and saving tensors

x = torch.arange(4)
torch.save(x, '../data/x-file')

In [77]:
x2 = torch.load('../data/x-file')
x2

tensor([0, 1, 2, 3])

In [78]:
y = torch.zeros(4)
torch.save([x, y], '../data/x-file')

x2, y2 = torch.load('../data/x-file')

x2,y2

(tensor([0, 1, 2, 3]), tensor([0., 0., 0., 0.]))

In [79]:
mydict = {'x': x, 'y':y}
torch.save(mydict, '../data/mydict')

mydict2 = torch.load('../data/mydict')
mydict2

{'x': tensor([0, 1, 2, 3]), 'y': tensor([0., 0., 0., 0.])}

In [80]:
# Load and save model parameters

class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden = nn.LazyLinear(256)
        self.output = nn.LazyLinear(10)
        
    def forward(self, X):
        return self.output(F.relu(self.hidden(X)))
    


In [81]:
net = MLP()

X = torch.randn(size=(2, 20))
Y = net(X)

In [82]:
torch.save(net.state_dict(), '../data/mlp.params')

In [83]:
clone = MLP()

clone.load_state_dict(torch.load('../data/mlp.params'))

clone.eval()

MLP(
  (hidden): LazyLinear(in_features=0, out_features=256, bias=True)
  (output): LazyLinear(in_features=0, out_features=10, bias=True)
)

In [84]:
Y_clone = clone(X)
Y_clone == Y

tensor([[True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True]])

* Chapter 6.7 GPUs

In [85]:
import torch
from torch import nn
from d2l import torch as d2l

In [92]:
def cpu():
    return torch.device('cpu')

def gpu(i=0):
    return torch.device(f'cuda:{i}')

cpu(), gpu(), gpu(2)

(device(type='cpu'),
 device(type='cuda', index=0),
 device(type='cuda', index=2))

In [88]:
def num_gpus():
    return torch.cuda.device_count()

num_gpus()

1

In [91]:
def try_gpu(i=0):
    if num_gpus() >= i + 1:
        return gpu(i)
    return cpu()

def try_all_gpus():
    return [gpu[i] for i in range(num_gpus())]

try_gpu(), try_gpu(10), try_all_gpus()

TypeError: 'function' object is not subscriptable

In [94]:
x = torch.tensor([1, 2, 3])
x.device

device(type='cpu')

In [95]:
X = torch.ones(2, 3, device=try_gpu())
X

tensor([[1., 1., 1.],
        [1., 1., 1.]], device='cuda:0')

In [97]:
Y = torch.rand(2, 3, device=try_gpu(0))
Y

tensor([[0.1497, 0.5114, 0.1653],
        [0.6972, 0.9573, 0.7402]], device='cuda:0')

In [98]:
# NN on GPU

net = nn.Sequential(nn.LazyLinear(1))
net = net.to(device=try_gpu())

In [99]:
net(X)

tensor([[-0.4935],
        [-0.4935]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [100]:
net[0].weight.data.device

device(type='cuda', index=0)

In [106]:
def try_gpu(i=0):  #@save
    """如果存在，则返回gpu(i)，否则返回cpu()"""
    if torch.cuda.device_count() >= i + 1:
        return torch.device(f'cuda:{i}')
    return torch.device('cpu')

def try_all_gpus():  #@save
    """返回所有可用的GPU，如果没有GPU，则返回[cpu(),]"""
    devices = [torch.device(f'cuda:{i}')
             for i in range(torch.cuda.device_count())]
    return devices if devices else [torch.device('cpu')]

try_gpu(), try_gpu(10), try_all_gpus()


(device(type='cuda', index=0),
 device(type='cpu'),
 [device(type='cuda', index=0)])