## 自动求导

In [1]:
import torch

# Two-layer fully connected network fitted with hand-written gradient descent;
# autograd supplies the gradients of the loss with respect to w1 and w2.

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# N rows of data; D_in, H and D_out are the input, hidden and output widths.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets (created without requires_grad).
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Random initial weights, flagged so autograd tracks operations on them.
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

learning_rate = 1e-6

for t in range(500):
    # Forward pass: matrix multiply, clamp at zero (ReLU), matrix multiply.
    hidden = torch.mm(x, w1)
    activated = torch.clamp(hidden, min=0)
    y_pred = torch.mm(activated, w2)

    # Loss is the sum of squared differences between prediction and target.
    residual = y_pred - y
    loss = (residual ** 2).sum()
    print(t, loss.item())

    # Backward pass: fills in w1.grad and w2.grad.
    loss.backward()

    # Apply the step under no_grad so the update is not recorded by autograd.
    with torch.no_grad():
        for weight in (w1, w2):
            weight.sub_(learning_rate * weight.grad)
            # Gradients accumulate across backward() calls, so reset them
            # before the next iteration.
            weight.grad.zero_()

0 31440342.0
1 27452468.0
2 26819936.0
3 25209656.0
4 21104984.0
5 15058158.0
6 9461866.0
7 5486890.0
8 3185259.5
9 1948458.125
10 1296818.5
11 936671.0
12 722125.375
13 582047.25
14 482759.90625
15 407832.9375
16 348772.4375
17 300845.625
18 261246.265625
19 228034.5625
20 199952.0
21 176049.859375
22 155560.3125
23 137911.140625
24 122658.6328125
25 109411.8984375
26 97868.3671875
27 87767.875
28 78902.609375
29 71100.15625
30 64230.3984375
31 58161.8125
32 52776.875
33 47989.64453125
34 43722.3515625
35 39909.78125
36 36497.21875
37 33434.078125
38 30680.16796875
39 28201.01171875
40 25964.59765625
41 23942.923828125
42 22111.654296875
43 20449.8046875
44 18939.154296875
45 17564.32421875
46 16311.6171875
47 15166.9638671875
48 14120.43359375
49 13160.75390625
50 12280.7080078125
51 11472.7578125
52 10729.89453125
53 10045.849609375
54 9415.00390625
55 8832.7509765625
56 8295.3525390625
57 7798.986328125
58 7339.2822265625
59 6912.791015625
60 6516.76123046875
61 6148.857421875
62 5

## PyTorch:定义自己的自动求导函数

In [2]:
import torch

class MyReLU(torch.autograd.Function):
    """ReLU written as a custom autograd function.

    ``forward`` clamps the input at zero; ``backward`` lets the incoming
    gradient through everywhere except where the saved input was negative.
    """

    @staticmethod
    def forward(ctx, x):
        """Return ``x`` with every entry below zero replaced by zero.

        The input is saved on ``ctx`` so that ``backward`` can tell which
        entries were negative.
        """
        ctx.save_for_backward(x)
        return torch.clamp(x, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient with respect to the input of ``forward``.

        Entries whose saved input was strictly negative receive zero; all
        other entries receive the corresponding ``grad_output`` value.
        """
        (saved_input,) = ctx.saved_tensors
        # masked_fill is out-of-place, so grad_output itself is not modified.
        return grad_output.masked_fill(saved_input < 0, 0)

# Same two-layer network and training loop as a plain autograd version, but
# the nonlinearity is applied through the custom MyReLU autograd function.

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# N rows of data; D_in, H and D_out are the input, hidden and output widths.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets (created without requires_grad).
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Random initial weights, flagged so autograd tracks operations on them.
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass; custom Functions are invoked through .apply.
    hidden = torch.mm(x, w1)
    activated = MyReLU.apply(hidden)
    y_pred = torch.mm(activated, w2)

    # Loss is the sum of squared differences between prediction and target.
    residual = y_pred - y
    loss = (residual ** 2).sum()
    print(t, loss.item())

    # Backward pass: fills in w1.grad and w2.grad.
    loss.backward()

    # Apply the step under no_grad so the update is not recorded by autograd.
    with torch.no_grad():
        for weight in (w1, w2):
            weight.sub_(learning_rate * weight.grad)
            # Gradients accumulate across backward() calls, so reset them
            # before the next iteration.
            weight.grad.zero_()



0 40886292.0
1 41634112.0
2 44990652.0
3 40513464.0
4 27291956.0
5 13578350.0
6 5996828.0
7 2926498.0
8 1776930.875
9 1281799.75
10 1013809.4375
11 836733.125
12 705442.75
13 602228.875
14 518601.40625
15 449679.9375
16 392205.1875
17 343827.875
18 302828.0625
19 267878.375
20 237953.359375
21 212168.625
22 189826.71875
23 170345.125
24 153319.6875
25 138366.03125
26 125188.359375
27 113536.453125
28 103199.1875
29 94003.296875
30 85801.25
31 78458.28125
32 71868.1875
33 65941.4453125
34 60593.375
35 55759.23828125
36 51381.38671875
37 47405.08984375
38 43790.4609375
39 40498.65625
40 37493.83203125
41 34746.9921875
42 32232.537109375
43 29929.51171875
44 27817.46875
45 25875.82421875
46 24091.421875
47 22447.6484375
48 20931.611328125
49 19531.53125
50 18237.62890625
51 17039.525390625
52 15929.5888671875
53 14900.623046875
54 13945.8203125
55 13058.923828125
56 12234.408203125
57 11467.62109375
58 10754.0849609375
59 10089.705078125
60 9470.234375
61 8892.484375
62 8353.5
63 7850.347