# define layer

In [13]:
import torch.nn as nn
import torch

In [14]:
class MyDense(nn.Module):
    """Chain of matrix products whose weights are kept in an ``nn.ParameterList``.

    The list holds four 4x4 matrices followed by a single 4x1 matrix, each
    filled by ``torch.randn``.
    """

    def __init__(self):
        super().__init__()
        square_weights = [nn.Parameter(torch.randn(4, 4)) for _ in range(4)]
        self.params = nn.ParameterList(square_weights)
        # The trailing 4x1 matrix reduces the last dimension to a single column.
        self.params.append(nn.Parameter(torch.randn(4, 1)))

    def forward(self, x):
        """Right-multiply ``x`` by every stored weight, in list order."""
        out = x
        for weight in self.params:
            out = torch.mm(out, weight)
        return out


In [15]:
class MyDictDense(nn.Module):
    """Layer that stores several named weight matrices in an ``nn.ParameterDict``.

    ``forward`` multiplies the input by whichever matrix ``choice`` names.
    """

    def __init__(self):
        super().__init__()
        initial_weights = {
            'linear1': nn.Parameter(torch.randn(4, 4)),
            'linear2': nn.Parameter(torch.randn(4, 1)),
        }
        self.params = nn.ParameterDict(initial_weights)
        # Entries can also be added after construction.
        self.params.update({'linear3': nn.Parameter(torch.randn(4, 2))})

    def forward(self, x, choice='linear1'):
        """Return ``x @ params[choice]``; ``choice`` defaults to ``'linear1'``."""
        weight = self.params[choice]
        return torch.mm(x, weight)

In [16]:
# Instantiate the dictionary-backed layer.
net1 = MyDictDense()
# print(net1)

# Show the parameter names together with their (name, tensor) pairs.
print(net1.params.keys(),net1.params.items())

x = torch.ones(1, 4)
# Alternative weight selections (uncomment one to try it):
# y = net1(x, 'linear1')
# y = net1(x, 'linear2')
# 'linear3' is the 4x2 matrix, so a (1, 4) input yields a (1, 2) output.
y = net1(x, 'linear3')
# y = net1(x)
print(f'x: {x.shape}')
print(f'y: {y.shape}')

odict_keys(['linear1', 'linear2', 'linear3']) odict_items([('linear1', Parameter containing:
tensor([[-1.2834,  0.5038,  0.4842,  1.1676],
        [ 0.9334, -0.4258, -1.0969,  0.0597],
        [ 0.4293, -0.0191, -1.2807,  0.6863],
        [ 1.5512,  0.9490, -0.8362, -2.3339]], requires_grad=True)), ('linear2', Parameter containing:
tensor([[-0.4712],
        [ 0.8981],
        [-0.1326],
        [ 0.2472]], requires_grad=True)), ('linear3', Parameter containing:
tensor([[ 0.0543, -1.3508],
        [-1.2271, -0.2308],
        [ 0.4370,  0.8450],
        [ 1.1112,  1.0325]], requires_grad=True))])
x: torch.Size([1, 4])
y: torch.Size([1, 2])


In [19]:
layer1 = MyDense()
layer2 = MyDictDense()

# MyDictDense runs first; nn.Sequential passes only the tensor, so its default
# 'linear1' (4x4) weight is used and the (1, 4) width is kept for MyDense.
net = nn.Sequential(layer2,layer1)
print(net)

# Create the input before its first use so this cell does not depend on an
# `x` left behind by an earlier cell, and run the forward pass only once.
x = torch.ones(1, 4)
y = net(x)
print(y)
print(f'x: {x.shape}')
print(f'y: {y.shape}')

Sequential(
  (0): MyDictDense(
    (params): ParameterDict(
        (linear1): Parameter containing: [torch.FloatTensor of size 4x4]
        (linear2): Parameter containing: [torch.FloatTensor of size 4x1]
        (linear3): Parameter containing: [torch.FloatTensor of size 4x2]
    )
  )
  (1): MyDense(
    (params): ParameterList(
        (0): Parameter containing: [torch.FloatTensor of size 4x4]
        (1): Parameter containing: [torch.FloatTensor of size 4x4]
        (2): Parameter containing: [torch.FloatTensor of size 4x4]
        (3): Parameter containing: [torch.FloatTensor of size 4x4]
        (4): Parameter containing: [torch.FloatTensor of size 4x1]
    )
  )
)
tensor([[30.9840]], grad_fn=<MmBackward>)
x: torch.Size([1, 4])
y: torch.Size([1, 1])


refs:

    https://www.cnblogs.com/sdu20112013/p/12144843.html

In [59]:
# Inherit from Function
class LinearFunction(torch.autograd.Function):
    """Custom autograd function computing ``input.mm(weight.t())`` plus an optional bias.

    Call it through ``LinearFunction.apply(input, weight, bias)``.
    """

    # Note that both forward and backward are @staticmethods
    @staticmethod
    def forward(ctx, input, weight, bias=None):
        """Compute the affine map and stash the tensors backward needs.

        Args:
            ctx: autograd context object.
            input: 2-D tensor; multiplied by ``weight.t()``.
            weight: 2-D tensor whose transpose is right-multiplied onto ``input``.
            bias: optional 1-D tensor added to every row of the product.

        Returns:
            The 2-D output tensor.
        """
        # ctx plays a role similar to `self`: what is stored on it here can be
        # read back in backward.
        ctx.save_for_backward(input, weight, bias)
        output = input.mm(weight.t())
        if bias is not None:
            output += bias.unsqueeze(0).expand_as(output)
        return output

    # This function has only a single output, so it gets only one gradient
    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradients w.r.t. ``input``, ``weight`` and ``bias`` (in that order).

        A gradient is ``None`` when the corresponding input does not need one
        or, for ``bias``, when no bias was supplied.
        """
        # Unpack saved_tensors and initialize all gradients w.r.t. inputs to
        # None. Additional trailing Nones are ignored, so the return statement
        # stays simple even when the function has optional inputs.
        input, weight, bias = ctx.saved_tensors
        grad_input = grad_weight = grad_bias = None

        # These needs_input_grad checks are optional and only there to improve
        # efficiency. Returning gradients for inputs that don't require them
        # is not an error.
        if ctx.needs_input_grad[0]:
            grad_input = grad_output.mm(weight)
        if ctx.needs_input_grad[1]:
            grad_weight = grad_output.t().mm(input)
        if bias is not None and ctx.needs_input_grad[2]:
            # Summing over dim 0 already gives the bias's 1-D shape. A further
            # squeeze(0) would collapse a size-1 bias gradient to a 0-d tensor,
            # which autograd rejects as incompatible with the bias shape.
            grad_bias = grad_output.sum(0)

        return grad_input, grad_weight, grad_bias

# # Calling the custom autograd function
# linear = LinearFunction.apply(*args)  # forward pass
# linear.backward()  # backward pass
# linear.grad_fn.apply(*args)  # backward pass

In [60]:
class Linear(nn.Module):
    """Linear layer whose forward pass is delegated to ``LinearFunction``.

    Args:
        input_features: size of the second dimension of ``weight``.
        output_features: size of the first dimension of ``weight`` and length
            of ``bias``.
        bias: when true, a bias parameter is created; otherwise ``bias`` is
            registered as ``None``.
    """

    def __init__(self, input_features, output_features, bias=True):
        super(Linear, self).__init__()
        self.input_features = input_features
        self.output_features = output_features

        # nn.Parameter is a special kind of tensor that is automatically
        # registered as the Module's parameter once it is assigned as an
        # attribute. Important: Parameters require gradients by default.
        # Parameters and buffers need to be registered, or they won't appear
        # in .parameters() (doesn't apply to buffers) and won't be converted
        # when e.g. .cuda() is called. Use .register_buffer() for buffers.
        self.weight = nn.Parameter(torch.empty(output_features, input_features))
        if bias:
            self.bias = nn.Parameter(torch.empty(output_features))
        else:
            # You should always register all possible parameters, but the
            # optional ones can be None if you want.
            self.register_parameter('bias', None)

        # Not a very smart way to initialize weights
        nn.init.uniform_(self.weight, -0.1, 0.1)
        # `bias` is a bool flag and is never None, so check the registered
        # parameter instead; otherwise bias=False would dereference None here.
        if self.bias is not None:
            nn.init.uniform_(self.bias, -0.1, 0.1)

    def forward(self, input):
        """Apply ``LinearFunction`` to ``input`` with this layer's weight and bias."""
        return LinearFunction.apply(input, self.weight, self.bias)

In [57]:
class MulConstant(torch.autograd.Function):
    """Autograd function that multiplies a tensor by a non-tensor constant."""

    @staticmethod
    def forward(ctx, tensor, constant):
        """Return ``tensor * constant`` and remember the constant for backward."""
        # The context object carries arbitrary state over to backward.
        ctx.constant = constant
        scaled = tensor * constant
        return scaled

    @staticmethod
    def backward(ctx, grad_output):
        """Return one gradient per forward argument.

        The non-tensor ``constant`` argument gets ``None``.
        """
        grad_tensor = grad_output * ctx.constant
        return grad_tensor, None

In [51]:
class Exp(torch.autograd.Function):
    """Autograd function computing the element-wise exponential."""

    @staticmethod
    def forward(ctx, i):
        """Return ``exp(i)`` and save that output for the backward pass."""
        result = torch.exp(i)
        ctx.save_for_backward(result)
        return result

    @staticmethod
    def backward(ctx, grad_output):
        """Multiply the incoming gradient by the saved forward output."""
        (result,) = ctx.saved_tensors
        grad_i = grad_output * result
        return grad_i

# Use it by calling the apply method:
# NOTE(review): `x` is not defined in this cell; presumably it is the tensor
# left over from an earlier cell — confirm before running on a fresh kernel.
y = Exp.apply(x)


In [50]:
import torch
 
class MyReLU(torch.autograd.Function):
    """
    Custom autograd function implemented by subclassing
    torch.autograd.Function and providing the forward and backward passes
    that operate on tensors.
    """
    @staticmethod
    def forward(ctx, x):
        """
        Receive a context object and the input tensor, and return the output
        tensor. The context object caches the input so the backward pass can
        use it.
        """
        ctx.save_for_backward(x)
        return torch.clamp(x, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Receive the context object and the gradient of the loss with respect
        to the forward output. Retrieve the cached input and return the
        gradient of the loss with respect to that input.
        """
        (x,) = ctx.saved_tensors
        # Zero the gradient wherever the input was strictly negative; the
        # incoming gradient tensor itself is left unmodified.
        return grad_output.masked_fill(x < 0, 0)
 
 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# N is the batch size; D_in is the input dimension;
# H is the hidden dimension; D_out is the output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random input and target tensors.
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Randomly initialised weight tensors.
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute the prediction with tensor operations, using the
    # custom ReLU through MyReLU.apply.
    y_pred = MyReLU.apply(torch.mm(x, w1)).mm(w2)

    # Compute and print the loss.
    loss = (y_pred - y).pow(2).sum()
    print(f'{t} loss: {loss.item()}')

    # Let autograd run the backward pass.
    loss.backward()

    with torch.no_grad():
        # Gradient-descent update of the weights, done in place.
        w1.sub_(learning_rate * w1.grad)
        w2.sub_(learning_rate * w2.grad)

        # Manually zero the gradients after the update.
        w1.grad.zero_()
        w2.grad.zero_()

0 loss: 33542972.0
1 loss: 31547180.0
2 loss: 33672980.0
3 loss: 34038432.0
4 loss: 29089642.0
5 loss: 19962214.0
6 loss: 11322884.0
7 loss: 5815654.5
8 loss: 3044255.5
9 loss: 1767338.0
10 loss: 1165894.625
11 loss: 854570.5625
12 loss: 670973.9375
13 loss: 548338.5
14 loss: 458422.625
15 loss: 388474.0
16 loss: 332230.46875
17 loss: 285971.09375
18 loss: 247433.40625
19 loss: 215083.984375
20 loss: 187737.6875
21 loss: 164504.6875
22 loss: 144625.28125
23 loss: 127528.296875
24 loss: 112762.0703125
25 loss: 99960.0859375
26 loss: 88839.421875
27 loss: 79138.3984375
28 loss: 70649.90625
29 loss: 63204.328125
30 loss: 56657.921875
31 loss: 50881.29296875
32 loss: 45774.421875
33 loss: 41249.96484375
34 loss: 37226.05078125
35 loss: 33642.36328125
36 loss: 30448.8203125
37 loss: 27595.29296875
38 loss: 25041.794921875
39 loss: 22752.044921875
40 loss: 20697.447265625
41 loss: 18850.796875
42 loss: 17187.794921875
43 loss: 15688.626953125
44 loss: 14334.4189453125
45 loss: 13109.50097656

In [61]:
import torch              
from torchstat import stat           
import torchvision.models as models      
# Instantiate torchvision's VGG-11 and report torchstat's per-layer statistics for it.
net = models.vgg11()           
stat(net,(3,224,224))    # (3, 224, 224) is the input image size

[MAdd]: AdaptiveAvgPool2d is not supported!
[Flops]: AdaptiveAvgPool2d is not supported!
[Memory]: AdaptiveAvgPool2d is not supported!
[MAdd]: Dropout is not supported!
[Flops]: Dropout is not supported!
[Memory]: Dropout is not supported!
[MAdd]: Dropout is not supported!
[Flops]: Dropout is not supported!
[Memory]: Dropout is not supported!
        module name  input shape output shape       params memory(MB)              MAdd            Flops   MemRead(B)  MemWrite(B) duration[%]    MemR+W(B)
0        features.0    3 224 224   64 224 224       1792.0      12.25     173,408,256.0     89,915,392.0     609280.0   12845056.0       7.73%   13454336.0
1        features.1   64 224 224   64 224 224          0.0      12.25       3,211,264.0      3,211,264.0   12845056.0   12845056.0       0.86%   25690112.0
2        features.2   64 224 224   64 112 112          0.0       3.06       2,408,448.0      3,211,264.0   12845056.0    3211264.0      11.99%   16056320.0
3        features.3   64 112 11

In [70]:
import torch
 
class MyReLUF(torch.autograd.Function):
    """
    Custom autograd function implemented by subclassing
    torch.autograd.Function and providing the forward and backward passes
    that operate on tensors.
    """
    @staticmethod
    def forward(ctx, x):
        """
        Receive a context object and the input tensor, and return the output
        tensor. The context object caches the input so the backward pass can
        use it.
        """
        ctx.save_for_backward(x)
        activated = x.clamp(min=0)
        return activated

    @staticmethod
    def backward(ctx, grad_output):
        """
        Receive the context object and the gradient of the loss with respect
        to the forward output. Retrieve the cached input and return the
        gradient of the loss with respect to that input.
        """
        x = ctx.saved_tensors[0]
        # Pass the gradient through except where the input was strictly
        # negative, where it becomes zero.
        blocked = x < 0
        return torch.where(blocked, torch.zeros_like(grad_output), grad_output)

In [71]:
class Relu(nn.Module):
    """Two-weight network computing ``MyReLUF(input.mm(w1)).mm(w2)``.

    Args:
        input_features: number of rows of ``w1``.
        output_features: number of columns of ``w2``.
        hidden_features: number of columns of ``w1`` / rows of ``w2``.
            Defaults to 100, the value that used to be hard-coded.
        device: device on which the weights are created. ``None`` selects
            CUDA when available and the CPU otherwise, so the layer no
            longer depends on a module-level ``device`` variable.
    """

    def __init__(self, input_features, output_features, hidden_features=100, device=None):
        super(Relu, self).__init__()
        self.input_features = input_features
        self.output_features = output_features
        self.hidden_features = hidden_features

        if device is None:
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Weight shapes follow the constructor arguments rather than fixed
        # 1000/100/10 sizes, so the layer works for any input/output width.
        self.w1 = nn.Parameter(torch.empty(input_features, hidden_features, device=device))
        self.w2 = nn.Parameter(torch.empty(hidden_features, output_features, device=device))

        # Not a very smart way to initialize weights
        nn.init.uniform_(self.w1, -0.1, 0.1)
        nn.init.uniform_(self.w2, -0.1, 0.1)

    def forward(self, input):
        """Return ``MyReLUF.apply(input.mm(w1)).mm(w2)``."""
        return MyReLUF.apply(input.mm(self.w1)).mm(self.w2)

In [86]:
def step(model, lr):
    """Apply one plain gradient-descent update to ``model.w1`` and ``model.w2``.

    Args:
        model: object exposing tensors ``w1`` and ``w2`` whose ``.grad`` has
            already been populated by a backward pass.
        lr: learning rate that scales the gradients.

    Each weight is updated in place and its gradient is then zeroed.
    """
    with torch.no_grad():
        for weight in (model.w1, model.w2):
            # Use the `lr` argument rather than a global learning rate.
            weight.sub_(lr * weight.grad)

            # Manually zero the gradient after the update.
            weight.grad.zero_()
 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# N is the batch size; D_in is the input dimension;
# H is the hidden dimension; D_out is the output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random input and target tensors.
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

relu = Relu(D_in, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass through the Relu module.
    y_pred = relu(x)

    # Compute and print the loss.
    loss = torch.sum((y_pred - y) ** 2)
    print(f'{t} loss: {loss.item()}')

    # Let autograd run the backward pass.
    loss.backward()

    # Update the weights and clear their gradients.
    step(relu, learning_rate)



0 loss: 972.7138061523438
1 loss: 969.7625122070312
2 loss: 966.84326171875
3 loss: 963.9549560546875
4 loss: 961.096923828125
5 loss: 958.2689819335938
6 loss: 955.470458984375
7 loss: 952.7012939453125
8 loss: 949.960693359375
9 loss: 947.2482299804688
10 loss: 944.5635986328125
11 loss: 941.906494140625
12 loss: 939.2763061523438
13 loss: 936.6727294921875
14 loss: 934.0953369140625
15 loss: 931.543701171875
16 loss: 929.017578125
17 loss: 926.516357421875
18 loss: 924.0399780273438
19 loss: 921.587890625
20 loss: 919.1597900390625
21 loss: 916.7552490234375
22 loss: 914.3740234375
23 loss: 912.0156860351562
24 loss: 909.6799926757812
25 loss: 907.3665771484375
26 loss: 905.0767822265625
27 loss: 902.809814453125
28 loss: 900.5640258789062
29 loss: 898.3392944335938
30 loss: 896.13525390625
31 loss: 893.95166015625
32 loss: 891.7882690429688
33 loss: 889.644775390625
34 loss: 887.520751953125
35 loss: 885.4163208007812
36 loss: 883.3316040039062
37 loss: 881.265625
38 loss: 879.2179

497 loss: 495.0888366699219
498 loss: 494.60443115234375
499 loss: 494.12066650390625
