In [1]:
import torch
from torch import nn
from torch.nn import functional as F

In [2]:
net = nn.Sequential(nn.LazyLinear(256), nn.ReLU(), nn.LazyLinear(10))



In [3]:
net

Sequential(
  (0): LazyLinear(in_features=0, out_features=256, bias=True)
  (1): ReLU()
  (2): LazyLinear(in_features=0, out_features=10, bias=True)
)

In [4]:
X = torch.rand(2, 20)
print(X)
net(X).shape

tensor([[0.8817, 0.3688, 0.3752, 0.2252, 0.1093, 0.8586, 0.4690, 0.0190, 0.9606,
         0.6247, 0.6362, 0.0418, 0.3044, 0.2760, 0.6413, 0.1630, 0.5387, 0.3126,
         0.2692, 0.2251],
        [0.8075, 0.2585, 0.6105, 0.4128, 0.5629, 0.2252, 0.4192, 0.0558, 0.4248,
         0.2762, 0.8086, 0.0114, 0.8259, 0.2301, 0.8942, 0.2410, 0.1797, 0.3384,
         0.8826, 0.7303]])


torch.Size([2, 10])

In [5]:
class MLP(nn.Module):
    def __init__(self):
        # Call the constructor of the parent class nn.Module to perform
        # the necessary initialization
        super().__init__()
        self.hidden = nn.LazyLinear(256)
        self.out = nn.LazyLinear(10)

    # Define the forward propagation of the model, that is, how to return the
    # required model output based on the input X
    def forward(self, X):
        return self.out(F.relu(self.hidden(X)))

In [6]:
Y = torch.rand(2, 25)
print(Y)
net = MLP()
print(net(Y).shape)
print(net)

tensor([[0.7688, 0.5314, 0.1237, 0.9664, 0.5719, 0.7690, 0.1723, 0.6260, 0.6166,
         0.5428, 0.6944, 0.9658, 0.7470, 0.1695, 0.7031, 0.5386, 0.7105, 0.7451,
         0.3555, 0.8344, 0.5305, 0.0286, 0.9068, 0.7443, 0.6961],
        [0.0632, 0.5445, 0.0839, 0.5033, 0.2544, 0.8736, 0.1211, 0.6875, 0.4681,
         0.9824, 0.9614, 0.5796, 0.3575, 0.7056, 0.7241, 0.1791, 0.0068, 0.4748,
         0.0834, 0.4085, 0.7043, 0.5200, 0.1780, 0.7000, 0.5709]])
torch.Size([2, 10])
MLP(
  (hidden): Linear(in_features=25, out_features=256, bias=True)
  (out): Linear(in_features=256, out_features=10, bias=True)
)


In [7]:
print(net(Y))

tensor([[ 0.0201, -0.1414,  0.2681,  0.0479, -0.1967, -0.1659,  0.2092, -0.0777,
          0.0075, -0.0500],
        [-0.0559, -0.0453,  0.1192,  0.0237, -0.2137, -0.1102,  0.1383, -0.0716,
         -0.0948, -0.0787]], grad_fn=<AddmmBackward0>)


In [8]:
class MySequential(nn.Module):
    def __init__(self, *args):
        super().__init__()
        for idx, module in enumerate(args):
            self.add_module(str(idx), module)

    def forward(self, X):
        for module in self.children():
            X = module(X)
        return X

In [9]:
net = MySequential(nn.LazyLinear(256), nn.ReLU(), nn.LazyLinear(10))
net(X).shape

torch.Size([2, 10])

In [10]:
print(net(X))

tensor([[-0.1928,  0.0456, -0.0861, -0.0051, -0.0765,  0.0018,  0.0587,  0.0114,
         -0.0007,  0.0992],
        [-0.0761, -0.0198, -0.1515,  0.1032, -0.0381, -0.0875,  0.0886, -0.0143,
          0.0666,  0.0400]], grad_fn=<AddmmBackward0>)


In [11]:
class FixedHiddenMLP(nn.Module):
    def __init__(self):
        super().__init__()
        # Random weight parameters that will not compute gradients and
        # therefore keep constant during training
        self.rand_weight = torch.rand((20, 20))
        self.linear = nn.LazyLinear(20)

    def forward(self, X):
        print(X)
        X = self.linear(X)
        print(X)
        X = F.relu(X @ self.rand_weight + 1)
        print(X)
        # Reuse the fully connected layer. This is equivalent to sharing
        # parameters with two fully connected layers
        X = self.linear(X)
        print(X)
        # Control flow
        while X.abs().sum() > 1:
            X /= 2
        return X.sum()

In [12]:
net = FixedHiddenMLP()
net(X)

tensor([[0.8817, 0.3688, 0.3752, 0.2252, 0.1093, 0.8586, 0.4690, 0.0190, 0.9606,
         0.6247, 0.6362, 0.0418, 0.3044, 0.2760, 0.6413, 0.1630, 0.5387, 0.3126,
         0.2692, 0.2251],
        [0.8075, 0.2585, 0.6105, 0.4128, 0.5629, 0.2252, 0.4192, 0.0558, 0.4248,
         0.2762, 0.8086, 0.0114, 0.8259, 0.2301, 0.8942, 0.2410, 0.1797, 0.3384,
         0.8826, 0.7303]])
tensor([[-0.3100,  0.0275, -0.6039, -0.0274,  0.2495,  0.2649, -0.1968, -0.2168,
          0.0144, -0.5568, -0.2003,  0.2115,  0.1632,  0.0292, -0.0314, -0.1499,
         -0.5893,  0.0174, -0.4951,  0.3001],
        [-0.2813,  0.2979, -0.1368, -0.3075,  0.1596,  0.2175, -0.0255, -0.1071,
          0.3821, -0.5584, -0.2568,  0.3668, -0.0024,  0.1399, -0.2076, -0.1234,
         -0.2040, -0.2344, -0.4771,  0.2246]], grad_fn=<AddmmBackward0>)
tensor([[0.2466, 0.0000, 0.0129, 0.3691, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.2000, 0.0000, 0.0000, 0.0513, 0.0000, 0.0000, 0.4310,
         0.0000, 0

tensor(-0.1113, grad_fn=<SumBackward0>)

In [13]:
class NestMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.LazyLinear(64), nn.ReLU(),
                                 nn.LazyLinear(32), nn.ReLU())
        self.linear = nn.LazyLinear(16)

    def forward(self, X):
        return self.linear(self.net(X))

chimera = nn.Sequential(NestMLP(), nn.LazyLinear(20), FixedHiddenMLP())
chimera(X)

tensor([[-0.0140,  0.0736,  0.2891,  0.0496,  0.0806,  0.1230,  0.2501, -0.0491,
         -0.0154, -0.2941, -0.1843, -0.1660, -0.2542, -0.0298, -0.2364,  0.0124,
          0.1949, -0.0061, -0.0244, -0.1414],
        [ 0.0081,  0.0728,  0.2695,  0.0779,  0.0841,  0.1317,  0.2501, -0.0522,
         -0.0035, -0.2925, -0.1736, -0.1789, -0.2213, -0.0490, -0.2263,  0.0443,
          0.1886, -0.0492, -0.0268, -0.1922]], grad_fn=<AddmmBackward0>)
tensor([[ 0.2367, -0.3587,  0.0816,  0.1174,  0.0903,  0.1712, -0.1872,  0.0227,
         -0.2261,  0.2275,  0.0128,  0.1494, -0.0195,  0.1676, -0.1837, -0.1147,
         -0.1716, -0.2612,  0.0226,  0.1904],
        [ 0.2267, -0.3543,  0.0996,  0.1073,  0.0904,  0.1563, -0.1823,  0.0049,
         -0.2353,  0.2284,  0.0499,  0.1433, -0.0146,  0.1615, -0.1841, -0.0986,
         -0.1527, -0.2615,  0.0238,  0.1937]], grad_fn=<AddmmBackward0>)
tensor([[1.2011, 0.5513, 0.8431, 0.6450, 1.0896, 1.2798, 0.7021, 1.0921, 0.8170,
         0.8490, 0.6853, 0.7226, 

tensor(0.0173, grad_fn=<SumBackward0>)

In [14]:
import torch
from torch import nn

In [15]:
net = nn.Sequential(nn.LazyLinear(8),
                    nn.ReLU(),
                    nn.LazyLinear(1))

X = torch.rand(size=(2, 4))
net(X).shape

torch.Size([2, 1])

In [16]:
print(net(X)), print(len(net))

tensor([[-0.2592],
        [-0.2325]], grad_fn=<AddmmBackward0>)
3


(None, None)

In [17]:
print(net)

Sequential(
  (0): Linear(in_features=4, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=1, bias=True)
)


In [18]:
net[2].state_dict()

OrderedDict([('weight',
              tensor([[-0.1059, -0.0964, -0.2920, -0.0033, -0.1329, -0.0126, -0.0365, -0.3188]])),
             ('bias', tensor([-0.0178]))])

In [19]:
net[1].state_dict()

OrderedDict()

In [20]:
net[0].state_dict()

OrderedDict([('weight',
              tensor([[ 0.1818,  0.2196,  0.1701,  0.3125],
                      [-0.4634,  0.0150,  0.1587,  0.2880],
                      [ 0.3853,  0.2453, -0.2673,  0.0608],
                      [ 0.4209, -0.2034,  0.3777, -0.0469],
                      [ 0.1038, -0.4133, -0.2384, -0.0516],
                      [-0.1722, -0.3798, -0.0452,  0.0014],
                      [-0.1569, -0.3686,  0.3151, -0.4350],
                      [-0.4617, -0.2278, -0.0748, -0.2721]])),
             ('bias',
              tensor([-0.2453, -0.0472,  0.4108,  0.2393, -0.2408, -0.1498, -0.1127,  0.1278]))])

In [21]:
type(net[2].bias), net[2].bias.data

(torch.nn.parameter.Parameter, tensor([-0.0178]))

In [22]:
net[0].bias.data

tensor([-0.2453, -0.0472,  0.4108,  0.2393, -0.2408, -0.1498, -0.1127,  0.1278])

In [23]:
net[2].weight.grad == None

True

In [24]:
type(net[2].weight)

torch.nn.parameter.Parameter

In [25]:
net[2].weight.data

tensor([[-0.1059, -0.0964, -0.2920, -0.0033, -0.1329, -0.0126, -0.0365, -0.3188]])

In [26]:
[(name, param.shape) for name, param in net.named_parameters()]

[('0.weight', torch.Size([8, 4])),
 ('0.bias', torch.Size([8])),
 ('2.weight', torch.Size([1, 8])),
 ('2.bias', torch.Size([1]))]

In [27]:
# We need to give the shared layer a name so that we can refer to its
# parameters
shared = nn.LazyLinear(8)
net = nn.Sequential(nn.LazyLinear(8), nn.ReLU(),
                    shared, nn.ReLU(),
                    shared, nn.ReLU(),
                    nn.LazyLinear(1))

net(X)
# Check whether the parameters are the same
print(net[2].weight.data[0] == net[4].weight.data[0])
net[2].weight.data[0, 0] = 100
# Make sure that they are actually the same object rather than just having the
# same value
print(net[2].weight.data[0] == net[4].weight.data[0])

tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])


In [28]:
import torch
from torch import nn
net = nn.Sequential(nn.LazyLinear(8), nn.ReLU(), nn.LazyLinear(1))
X = torch.rand(size=(2, 4))
net(X).shape

torch.Size([2, 1])

In [29]:
print(net(X))

tensor([[-0.3194],
        [-0.2610]], grad_fn=<AddmmBackward0>)


In [30]:
net[0].weight.data[0], net[0].bias.data[0]

(tensor([ 0.1549,  0.1693, -0.1878, -0.4363]), tensor(0.4064))

In [31]:
def init_normal(module):
    if type(module) == nn.Linear:
        nn.init.normal_(module.weight, mean=0, std=0.01)
        nn.init.zeros_(module.bias)

net.apply(init_normal)
net[0].weight.data[0], net[0].bias.data[0]

(tensor([-0.0184, -0.0021, -0.0104,  0.0021]), tensor(0.))

In [32]:
print(net(X))

tensor([[1.7155e-05],
        [1.5661e-05]], grad_fn=<AddmmBackward0>)


In [33]:
def init_constant(module):
    if type(module) == nn.Linear:
        nn.init.constant_(module.weight, 1)
        nn.init.zeros_(module.bias)

net.apply(init_constant)
net[0].weight.data[0], net[0].bias.data[0]

(tensor([1., 1., 1., 1.]), tensor(0.))

In [34]:
print(net(X))

tensor([[12.8312],
        [ 8.9793]], grad_fn=<AddmmBackward0>)


In [35]:
def init_xavier(module):
    if type(module) == nn.Linear:
        nn.init.xavier_uniform_(module.weight)

def init_42(module):
    if type(module) == nn.Linear:
        nn.init.constant_(module.weight, 42)

net[0].apply(init_xavier)
net[2].apply(init_42)
print(net[0].weight.data[0])
print(net[2].weight.data)

tensor([ 0.6647, -0.2535,  0.5426,  0.0472])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


In [36]:
print(net(X))

tensor([[87.2439],
        [63.3128]], grad_fn=<AddmmBackward0>)


In [37]:
print(net)

Sequential(
  (0): Linear(in_features=4, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=1, bias=True)
)


In [38]:
def my_init(module):
    if type(module) == nn.Linear:
        print("Init", *[(name, param.shape)
                        for name, param in module.named_parameters()][0])
        nn.init.uniform_(module.weight, -10, 10)
        module.weight.data *= module.weight.data.abs() >= 5

net.apply(my_init)
net[0].weight[:2]

Init weight torch.Size([8, 4])
Init weight torch.Size([1, 8])


tensor([[-0.0000, -9.0183,  6.3348, -0.0000],
        [ 7.1091,  0.0000,  7.9747, -0.0000]], grad_fn=<SliceBackward0>)

In [39]:
net[0].weight

Parameter containing:
tensor([[-0.0000, -9.0183,  6.3348, -0.0000],
        [ 7.1091,  0.0000,  7.9747, -0.0000],
        [-6.8346, -8.6194, -0.0000, -5.0848],
        [-0.0000, -0.0000,  7.7123, -0.0000],
        [-0.0000, -5.8656, -0.0000, -0.0000],
        [ 0.0000, -9.1871, -0.0000, -5.8814],
        [-9.4074,  8.7232,  7.2690, -5.4035],
        [-6.6960, -8.1710,  0.0000,  5.1862]], requires_grad=True)

In [40]:
net[0].weight.data[:] += 1
net[0].weight.data[0, 0] = 42
net[0].weight.data[0]

tensor([42.0000, -8.0183,  7.3348,  1.0000])

In [41]:
net[0].weight

Parameter containing:
tensor([[42.0000, -8.0183,  7.3348,  1.0000],
        [ 8.1091,  1.0000,  8.9747,  1.0000],
        [-5.8346, -7.6194,  1.0000, -4.0848],
        [ 1.0000,  1.0000,  8.7123,  1.0000],
        [ 1.0000, -4.8656,  1.0000,  1.0000],
        [ 1.0000, -8.1871,  1.0000, -4.8814],
        [-8.4074,  9.7232,  8.2690, -4.4035],
        [-5.6960, -7.1710,  1.0000,  6.1862]], requires_grad=True)

In [42]:
#Lazy Initialization
import torch
from torch import nn
from d2l import torch as d2l

net = nn.Sequential(nn.LazyLinear(256), nn.ReLU(), nn.LazyLinear(10))
net[0].weight
#net[0].weight.shape



<UninitializedParameter>

In [43]:
X = torch.rand(2, 20)
net(X)

net[0].weight.shape

torch.Size([256, 20])

In [44]:
net[0].weight

Parameter containing:
tensor([[ 0.0131, -0.2014, -0.0057,  ...,  0.0915,  0.0526, -0.0537],
        [ 0.0898,  0.0993,  0.0033,  ..., -0.1568, -0.1023, -0.0168],
        [-0.1921,  0.1482,  0.1427,  ..., -0.0340, -0.1235, -0.0430],
        ...,
        [-0.1826,  0.1816, -0.0626,  ..., -0.1748,  0.0444, -0.1504],
        [-0.1900, -0.0791,  0.1566,  ..., -0.0520, -0.1659, -0.1598],
        [ 0.0927, -0.1618,  0.1594,  ..., -0.0795,  0.0193,  0.1490]],
       requires_grad=True)

In [45]:
@d2l.add_to_class(d2l.Module)  #@save
def apply_init(self, inputs, init=None):
    self.forward(*inputs)
    if init is not None:
        self.net.apply(init)

In [46]:
#Custom Layers
import torch
from torch import nn
from torch.nn import functional as F
from d2l import torch as d2l
class CenteredLayer(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, X):
        return X - X.mean()

layer = CenteredLayer()
X=torch.tensor([1.0, 2, 3, 4, 5])
print(X.mean())
layer(X)

tensor(3.)


tensor([-2., -1.,  0.,  1.,  2.])

In [47]:
net = nn.Sequential(nn.LazyLinear(128), CenteredLayer())
Y = net(torch.rand(4, 8))
Y.mean()

tensor(-7.4506e-09, grad_fn=<MeanBackward0>)

In [48]:
class MyLinear(nn.Module):
    def __init__(self, in_units, units):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(in_units, units))
        self.bias = nn.Parameter(torch.randn(units,))

    def forward(self, X):
        linear = torch.matmul(X, self.weight.data) + self.bias.data
        return F.relu(linear)

In [49]:
linear = MyLinear(5, 3)
linear.weight

Parameter containing:
tensor([[ 0.2973,  0.2372, -0.2901],
        [ 0.3082, -1.9233, -0.9046],
        [-0.5008, -0.2955, -0.3863],
        [ 1.4239, -0.0408,  1.3466],
        [-0.6026, -1.2162,  0.3429]], requires_grad=True)

In [50]:
linear(torch.rand(2, 5))

tensor([[0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.3829]])

In [51]:
linear(X)

tensor([0.6947, 0.0000, 3.6955])

In [52]:
net = nn.Sequential(MyLinear(64, 8), MyLinear(8, 1))
net(torch.rand(2, 64))

tensor([[0.],
        [0.]])

In [53]:
# File I/O
import torch
from torch import nn
from torch.nn import functional as F

In [54]:
x=torch.rand(2, 10)
print(x)
torch.save(x, 'x-file')

tensor([[0.9242, 0.4807, 0.2747, 0.1142, 0.5130, 0.5309, 0.3431, 0.9387, 0.2909,
         0.5965],
        [0.1903, 0.7803, 0.1685, 0.5773, 0.6234, 0.1553, 0.3422, 0.3461, 0.6596,
         0.7240]])


In [55]:
x2 = torch.load('x-file')
x2

tensor([[0.9242, 0.4807, 0.2747, 0.1142, 0.5130, 0.5309, 0.3431, 0.9387, 0.2909,
         0.5965],
        [0.1903, 0.7803, 0.1685, 0.5773, 0.6234, 0.1553, 0.3422, 0.3461, 0.6596,
         0.7240]])

In [56]:
y = torch.zeros(4)
torch.save([x, y],'x-files')
x2, y2 = torch.load('x-files')
(x2, y2)

(tensor([[0.9242, 0.4807, 0.2747, 0.1142, 0.5130, 0.5309, 0.3431, 0.9387, 0.2909,
          0.5965],
         [0.1903, 0.7803, 0.1685, 0.5773, 0.6234, 0.1553, 0.3422, 0.3461, 0.6596,
          0.7240]]),
 tensor([0., 0., 0., 0.]))

In [57]:
mydict = {'x': x, 'y': y}
torch.save(mydict, 'mydict')
mydict2 = torch.load('mydict')
mydict2

{'x': tensor([[0.9242, 0.4807, 0.2747, 0.1142, 0.5130, 0.5309, 0.3431, 0.9387, 0.2909,
          0.5965],
         [0.1903, 0.7803, 0.1685, 0.5773, 0.6234, 0.1553, 0.3422, 0.3461, 0.6596,
          0.7240]]),
 'y': tensor([0., 0., 0., 0.])}

In [58]:
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden = nn.LazyLinear(256)
        self.output = nn.LazyLinear(10)

    def forward(self, x):
        return self.output(F.relu(self.hidden(x)))

net = MLP()
X = torch.randn(size=(2, 20))
Y = net(X)
print(X)
print(Y)
print(net)

tensor([[ 1.0439, -0.0724, -0.5377,  1.8004,  0.8488,  0.8118,  0.4142,  0.0340,
         -1.2557,  0.0288,  0.4713,  0.3631,  1.9804, -1.0211,  1.3738,  0.6628,
         -0.0835,  0.4163,  1.5785,  0.2228],
        [-0.7743, -0.4233,  0.5989,  1.4304,  0.7714,  1.2853, -0.3664, -0.5093,
          1.2706, -0.0946, -0.0398,  2.2254, -0.8985,  0.2970,  0.3621,  2.1783,
         -0.7502,  0.2974, -0.6587,  0.4540]])
tensor([[ 0.0313,  0.1500,  0.0627,  0.1421, -0.1367, -0.5144, -0.0518,  0.2252,
          0.2646, -0.2093],
        [-0.0700, -0.0063, -0.3096, -0.0241, -0.0530, -0.1781, -0.4343,  0.0396,
          0.2136, -0.0968]], grad_fn=<AddmmBackward0>)
MLP(
  (hidden): Linear(in_features=20, out_features=256, bias=True)
  (output): Linear(in_features=256, out_features=10, bias=True)
)


In [59]:
torch.save(net.state_dict(), 'mlp.params')

In [60]:
clone = MLP()
clone.load_state_dict(torch.load('mlp.params'))
clone.eval()

MLP(
  (hidden): LazyLinear(in_features=0, out_features=256, bias=True)
  (output): LazyLinear(in_features=0, out_features=10, bias=True)
)

In [61]:
Y_clone = clone(X)
print(Y_clone)

tensor([[ 0.0313,  0.1500,  0.0627,  0.1421, -0.1367, -0.5144, -0.0518,  0.2252,
          0.2646, -0.2093],
        [-0.0700, -0.0063, -0.3096, -0.0241, -0.0530, -0.1781, -0.4343,  0.0396,
          0.2136, -0.0968]], grad_fn=<AddmmBackward0>)


In [62]:
Y_clone == Y

tensor([[True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True]])

In [63]:
def cpu():  #@save
    """Get the CPU device."""
    return torch.device('cpu')

def gpu(i=0):  #@save
    """Get a GPU device."""
    return torch.device(f'cuda:{i}')

cpu(), gpu(), gpu(1)

(device(type='cpu'),
 device(type='cuda', index=0),
 device(type='cuda', index=1))

In [64]:
def num_gpus():  #@save
    """Get the number of available GPUs."""
    return torch.cuda.device_count()

num_gpus()

0

In [65]:
def try_gpu(i=0):  #@save
    """Return gpu(i) if exists, otherwise return cpu()."""
    if num_gpus() >= i + 1:
        return gpu(i)
    return cpu()

def try_all_gpus():  #@save
    """Return all available GPUs, or [cpu(),] if no GPU exists."""
    return [gpu(i) for i in range(num_gpus())]

try_gpu()

device(type='cpu')

In [66]:
try_gpu(10)

device(type='cpu')

In [67]:
try_all_gpus()

[]

In [68]:
x = torch.tensor([1, 2, 3])
x.device

device(type='cpu')

In [69]:
!nvidia-smi

'nvidia-smi' is not recognized as an internal or external command,
operable program or batch file.
