In [1]:
import torch
from torch import nn

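By default, PyTorch initializes both the weights and the biases of a linear layer by drawing from a uniform distribution whose range is computed from the layer's input dimension: $U(-1/\sqrt{n_\text{in}}, 1/\sqrt{n_\text{in}})$, where $n_\text{in}$ is the number of input features.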

In [6]:
# Two-layer MLP: 8 hidden units with a ReLU, then a single output unit.
# The LazyLinear layers take their input width from the first batch passed through.
net = nn.Sequential(
    nn.LazyLinear(8),
    nn.ReLU(),
    nn.LazyLinear(1),
)
# Random batch of 2 examples with 4 features each.
X = torch.rand(2, 4)
# Run one forward pass and display the output shape.
net(X).shape

torch.Size([2, 1])

## Built-in Initializers

In [7]:
# The code below initializes all weight parameters as Gaussian random variables with standard deviation 0.01 and all bias parameters to zero.

def init_normal(module):
    """Initialize an ``nn.Linear`` layer with small Gaussian weights and a zero bias.

    Written to be passed to ``nn.Module.apply``. Weights are drawn from a
    normal distribution with mean 0 and standard deviation 0.01; the bias,
    when the layer has one, is set to zero.

    Args:
        module: Any ``nn.Module``. Only modules whose type is exactly
            ``nn.Linear`` are modified; everything else is left untouched.
    """
    if type(module) == nn.Linear:
        nn.init.normal_(module.weight, mean=0, std=0.01)
        # A layer created with ``bias=False`` stores ``None`` as its bias, and
        # ``nn.init.zeros_(None)`` raises, so only reset a bias that exists.
        if module.bias is not None:
            nn.init.zeros_(module.bias)

# Module.apply calls init_normal on every submodule of net and on net itself.
net.apply(init_normal)
# Display the first row of the first layer's weight and the first entry of its bias.
net[0].weight.data[0], net[0].bias.data[0]

(tensor([-0.0187,  0.0097, -0.0047,  0.0013]), tensor(0.))

In [8]:
# Initialize all weight parameters to a given constant value (say, 1) and all bias parameters to zero.
def init_constant(module):
    """Initialize an ``nn.Linear`` layer with all-ones weights and a zero bias.

    Written to be passed to ``nn.Module.apply``. Every weight entry is set to
    the constant 1; the bias, when the layer has one, is set to zero.

    Args:
        module: Any ``nn.Module``. Only modules whose type is exactly
            ``nn.Linear`` are modified; everything else is left untouched.
    """
    if type(module) == nn.Linear:
        nn.init.constant_(module.weight, 1)
        # A layer created with ``bias=False`` stores ``None`` as its bias, and
        # ``nn.init.zeros_(None)`` raises, so only reset a bias that exists.
        if module.bias is not None:
            nn.init.zeros_(module.bias)

# Module.apply calls init_constant on every submodule of net and on net itself.
net.apply(init_constant)
# Display the first row of the first layer's weight and the first entry of its bias.
net[0].weight.data[0], net[0].bias.data[0]

(tensor([1., 1., 1., 1.]), tensor(0.))

We can also apply different initializers for certain blocks. For example, below we initialize the first layer with the Xavier initializer and initialize the second layer to a constant value of 42.



In [12]:
def xavier(m):
    """Re-draw the weight of an ``nn.Linear`` layer with Xavier uniform initialization.

    Modules whose type is not exactly ``nn.Linear`` are ignored, and the bias
    is not modified.
    """
    if type(m) != nn.Linear:
        return
    nn.init.xavier_uniform_(m.weight)

def init_42(m):
    """Fill the weight of an ``nn.Linear`` layer with the constant 42.

    Modules whose type is not exactly ``nn.Linear`` are ignored, and the bias
    is not modified.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.constant_(m.weight, 42)

# Use a different initializer per layer: index 0 is the first linear layer and
# index 2 the second (index 1 is the ReLU in between).
net[0].apply(xavier)
net[2].apply(init_42)
# Print the first weight row of layer 0 and the whole weight matrix of layer 2.
print(net[0].weight.data[0])
print(net[2].weight.data)

tensor([-0.1234, -0.7032, -0.1797, -0.6096])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


## Custom Initialization

In [13]:
def my_init(module):
    """Custom initializer for ``nn.Linear`` layers.

    Prints the name and shape of the layer's first registered parameter,
    fills the weight with values drawn uniformly between -10 and 10, and then
    zeroes every entry whose magnitude is below 5. Modules whose type is not
    exactly ``nn.Linear`` are skipped.
    """
    if type(module) != nn.Linear:
        return
    # Only the first (name, parameter) pair is reported.
    first_name, first_param = next(module.named_parameters())
    print("Init", first_name, first_param.shape)
    nn.init.uniform_(module.weight, -10, 10)
    # Multiplying by the boolean mask keeps entries with |w| >= 5 and zeroes the rest.
    keep_mask = module.weight.data.abs() >= 5
    module.weight.data *= keep_mask

# Module.apply calls my_init on every submodule of net and on net itself;
# my_init prints one "Init" line for each linear layer it touches.
net.apply(my_init)
# Display the first two rows of the first layer's weight.
net[0].weight[:2]

Init weight torch.Size([8, 4])
Init weight torch.Size([1, 8])


tensor([[-0.0000,  0.0000, -0.0000,  0.0000],
        [-9.5509, -0.0000,  0.0000, -0.0000]], grad_fn=<SliceBackward0>)

In [14]:
# Parameters can also be set directly by writing to the weight's `.data` tensor.
# Add 1 to every entry of the first layer's weight, in place.
net[0].weight.data[:] += 1
# Overwrite the single entry at row 0, column 0.
net[0].weight.data[0, 0] = 42
# Display the first row to confirm both changes.
net[0].weight.data[0]

tensor([42.,  1.,  1.,  1.])