## numpy

In [2]:
import numpy as np

In [3]:
# N: batch_size
# D_in: 输入维度
# H: 隐藏层维度
# D_out：输出维度
N, D_in, H, D_out = 64, 1000, 100, 10

# 构建输入和输出
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # 前向传播
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)
    
    loss = np.square(y_pred - y).sum()
    print(t, loss)
    
    # 反向传播
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)
    
    # 更新权重
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 36927244.52565097
1 39274022.52133037
2 45322000.55199043
3 44314529.098308146
4 32561716.73789096
5 16850919.658171445
6 7211765.625839885
7 3253736.4292549915
8 1856324.0249217204
9 1303939.2121019082
10 1025277.0536080117
11 846909.9419894724
12 715250.1063467949
13 611060.6932857104
14 525891.3259280995
15 455211.8949179965
16 395972.4639367478
17 346066.01742046175
18 303607.3193307778
19 267297.10358322394
20 236118.68097903393
21 209206.45752432384
22 185896.98048280913
23 165627.3188279107
24 147934.12270104728
25 132451.22613443004
26 118841.5211315639
27 106863.8998476188
28 96273.38968940811
29 86895.1715144286
30 78576.34941073816
31 71175.36040798957
32 64577.044053798345
33 58676.82007485769
34 53393.5364321815
35 48654.244900637015
36 44394.332741380174
37 40563.94250346747
38 37115.31656789435
39 33998.411524749536
40 31176.87360622029
41 28618.767914567055
42 26297.1456164247
43 24188.240745252566
44 22267.90893632749
45 20518.475045863852
46 18923.313760920086
47 17

437 2.963224888567537e-05
438 2.826743992402189e-05
439 2.6965729601345455e-05
440 2.5724827405130263e-05
441 2.454065589324997e-05
442 2.3411373760854945e-05
443 2.2334327681901077e-05
444 2.1306835416830673e-05
445 2.0327277075796047e-05
446 1.9392425985219948e-05
447 1.85008099290996e-05
448 1.765042119914604e-05
449 1.683920352551182e-05
450 1.606569490993434e-05
451 1.5327535100492468e-05
452 1.462346243143019e-05
453 1.3952016403862279e-05
454 1.3311260528483247e-05
455 1.2700301797238364e-05
456 1.2117415192117021e-05
457 1.156129939467877e-05
458 1.1030969824423617e-05
459 1.0524824245215056e-05
460 1.004211708717306e-05
461 9.581805766091923e-06
462 9.142463335593027e-06
463 8.723445317435919e-06
464 8.323601346226696e-06
465 7.94223049351678e-06
466 7.578433950668874e-06
467 7.23128901292497e-06
468 6.900189492425342e-06
469 6.584315557527243e-06
470 6.2828656441682274e-06
471 5.99535526716676e-06
472 5.7209908421019705e-06
473 5.459293916187515e-06
474 5.209598925323869e-06


## tensor

In [9]:
import torch

N, D_in, H, D_out = 64, 1000, 100, 10

dtype = torch.float
device = torch.device('cpu')
if torch.cuda.is_available():
    device = torch.device('cuda')

x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # 前向传播
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)
    
    loss = (y_pred - y).pow(2).sum().item()
    if t % 100 == 99:
        print(t, loss)
    
    # 反向传播
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)
    
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

99 981.139404296875
199 9.21352767944336
299 0.140591561794281
399 0.0031710811890661716
499 0.00023623727611266077


## autograd

In [13]:
import torch

N, D_in, H, D_out = 64, 1000, 100, 10

dtype = torch.float
device = torch.device('cpu')
if torch.cuda.is_available():
    device = torch.device('cuda')

x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    y_pred = x.mm(w1).clamp(min=0).mm(w2)
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item())
    
    loss.backward()
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        w1.grad.zero_()
        w2.grad.zero_()

99 542.5300903320312
199 3.427515983581543
299 0.03581170737743378
399 0.0007118225912563503
499 8.890406752470881e-05
