## 热身：使用NumPy

In [1]:
import numpy as np

# Problem dimensions: batch size, input width, hidden width, output width.
N = 64
D_in = 1000
H = 100
D_out = 10

# Random inputs and targets for the toy regression problem.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialised weights of the two linear layers (no bias terms).
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum-of-squares loss; the residual is reused by the backward pass.
    residual = y_pred - y
    loss = np.square(residual).sum()
    print(t, loss)

    # Backward pass: chain rule applied by hand, from the output back to w1.
    grad_y_pred = 2.0 * residual
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU blocks the gradient wherever its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Plain gradient-descent step, applied to the weights in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 39251874.28301956
1 41481904.01847876
2 43348658.806900516
3 36986324.88563499
4 23553180.803784695
5 11528133.634589946
6 5209566.775475856
7 2661043.147497397
8 1667409.9515936247
9 1217378.0202648796
10 963139.5634019359
11 790621.8010414017
12 661037.3066726672
13 558588.7005491505
14 475612.897296955
15 407416.5473793872
16 350812.3553617229
17 303501.34015191963
18 263695.29522887326
19 229964.818682477
20 201310.42163565717
21 176833.19914960559
22 155811.23246767142
23 137694.99734791194
24 122019.00718754192
25 108397.64122282076
26 96521.87440541675
27 86136.28851922514
28 77027.55062616873
29 69018.08971669685
30 61964.597458074524
31 55737.90345093975
32 50224.07582692853
33 45335.775785820806
34 40991.50765979118
35 37121.34396918966
36 33665.39860678689
37 30576.309747028557
38 27807.371162603802
39 25321.191811085475
40 23085.39493857504
41 21072.255972164814
42 19256.587103295955
43 17616.72816455707
44 16134.227151007744
45 14791.36706885587
46 13575.430413381579
47 

## PyTorch：张量

In [2]:
import torch
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Problem dimensions: batch size, input width, hidden width, output width.
N = 64
D_in = 1000
H = 100
D_out = 10

# Random inputs and targets, created directly on the chosen device.
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Randomly initialised weights of the two linear layers (no bias terms).
w1 = torch.randn(D_in, H, device=device)
w2 = torch.randn(H, D_out, device=device)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU (clamp at zero) -> linear.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squares loss; .item() extracts the Python scalar for printing.
    residual = y_pred - y
    loss = residual.pow(2).sum()
    print(t, loss.item())

    # Backward pass: gradients are derived by hand, no autograd involved.
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU blocks the gradient wherever its input was negative;
    # masked_fill returns a new tensor, leaving grad_h_relu untouched.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Plain gradient-descent step, applied to the weights in place.
    w1.sub_(learning_rate * grad_w1)
    w2.sub_(learning_rate * grad_w2)

0 30290262.0
1 29985456.0
2 35632884.0
3 41318124.0
4 40094268.0
5 28988882.0
6 15525630.0
7 6843661.0
8 3118273.5
9 1717997.25
10 1163820.625
11 897176.1875
12 736283.6875
13 621521.125
14 532254.8125
15 459780.8125
16 399739.09375
17 349328.4375
18 306611.4375
19 270185.53125
20 238938.203125
21 212017.828125
22 188711.078125
23 168422.296875
24 150691.71875
25 135146.234375
26 121474.8203125
27 109409.015625
28 98741.296875
29 89280.0078125
30 80902.5625
31 73426.9375
32 66739.28125
33 60748.046875
34 55367.03125
35 50525.61328125
36 46162.4140625
37 42222.48046875
38 38660.73046875
39 35435.57421875
40 32510.19921875
41 29852.404296875
42 27436.1640625
43 25236.734375
44 23232.5078125
45 21403.982421875
46 19733.30078125
47 18205.673828125
48 16806.41796875
49 15525.51953125
50 14350.3642578125
51 13272.408203125
52 12282.10546875
53 11371.720703125
54 10534.154296875
55 9763.302734375
56 9053.025390625
57 8397.861328125
58 7793.87841796875
59 7237.01904296875
60 6722.7890625
61 62