## numpy

In [2]:
import numpy as np

In [3]:
# Network dimensions:
#   N     - batch size
#   D_in  - input dimension
#   H     - hidden-layer dimension
#   D_out - output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialised weights of the two linear layers (no bias terms).
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum-of-squares loss over the whole batch.
    residual = y_pred - y
    loss = np.sum(np.square(residual))
    print(t, loss)

    # Backward pass: chain rule written out by hand, from output to input.
    grad_y_pred = 2.0 * residual
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # The ReLU blocks the gradient wherever its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Gradient-descent update of both weight matrices.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 36927244.52565097
1 39274022.52133037
2 45322000.55199043
3 44314529.098308146
4 32561716.73789096
5 16850919.658171445
6 7211765.625839885
7 3253736.4292549915
8 1856324.0249217204
9 1303939.2121019082
10 1025277.0536080117
11 846909.9419894724
12 715250.1063467949
13 611060.6932857104
14 525891.3259280995
15 455211.8949179965
16 395972.4639367478
17 346066.01742046175
18 303607.3193307778
19 267297.10358322394
20 236118.68097903393
21 209206.45752432384
22 185896.98048280913
23 165627.3188279107
24 147934.12270104728
25 132451.22613443004
26 118841.5211315639
27 106863.8998476188
28 96273.38968940811
29 86895.1715144286
30 78576.34941073816
31 71175.36040798957
32 64577.044053798345
33 58676.82007485769
34 53393.5364321815
35 48654.244900637015
36 44394.332741380174
37 40563.94250346747
38 37115.31656789435
39 33998.411524749536
40 31176.87360622029
41 28618.767914567055
42 26297.1456164247
43 24188.240745252566
44 22267.90893632749
45 20518.475045863852
46 18923.313760920086
47 17

437 2.963224888567537e-05
438 2.826743992402189e-05
439 2.6965729601345455e-05
440 2.5724827405130263e-05
441 2.454065589324997e-05
442 2.3411373760854945e-05
443 2.2334327681901077e-05
444 2.1306835416830673e-05
445 2.0327277075796047e-05
446 1.9392425985219948e-05
447 1.85008099290996e-05
448 1.765042119914604e-05
449 1.683920352551182e-05
450 1.606569490993434e-05
451 1.5327535100492468e-05
452 1.462346243143019e-05
453 1.3952016403862279e-05
454 1.3311260528483247e-05
455 1.2700301797238364e-05
456 1.2117415192117021e-05
457 1.156129939467877e-05
458 1.1030969824423617e-05
459 1.0524824245215056e-05
460 1.004211708717306e-05
461 9.581805766091923e-06
462 9.142463335593027e-06
463 8.723445317435919e-06
464 8.323601346226696e-06
465 7.94223049351678e-06
466 7.578433950668874e-06
467 7.23128901292497e-06
468 6.900189492425342e-06
469 6.584315557527243e-06
470 6.2828656441682274e-06
471 5.99535526716676e-06
472 5.7209908421019705e-06
473 5.459293916187515e-06
474 5.209598925323869e-06


## tensor

In [9]:
import torch

# Batch size, input dimension, hidden dimension, output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

dtype = torch.float
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Random inputs and targets.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialised weights of the two linear layers (no bias terms).
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squares loss, converted to a Python float for printing.
    residual = y_pred - y
    loss = residual.pow(2).sum().item()
    # Report on the last step of every block of 100 iterations.
    if (t + 1) % 100 == 0:
        print(t, loss)

    # Backward pass: chain rule written out by hand, from output to input.
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # The ReLU blocks the gradient wherever its input was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Gradient-descent update of both weight matrices.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

99 981.139404296875
199 9.21352767944336
299 0.140591561794281
399 0.0031710811890661716
499 0.00023623727611266077


## autograd

In [4]:
import torch

# Batch size, input dimension, hidden dimension, output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

dtype = torch.float
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Random inputs and targets; these do not require gradients.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weights are created with requires_grad=True so autograd tracks them.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear, then the sum-of-squares loss.
    hidden = torch.clamp(torch.mm(x, w1), min=0)
    y_pred = torch.mm(hidden, w2)
    loss = torch.sum((y_pred - y).pow(2))
    print(t, loss.item())

    # Backward pass: fills w1.grad and w2.grad.
    loss.backward()

    # Update the parameters without recording the update in the graph.
    with torch.no_grad():
        for weight in (w1, w2):
            weight.sub_(learning_rate * weight.grad)
            # Reset the gradient so the next backward() does not accumulate into it.
            weight.grad.zero_()

0 39830248.0
1 41486928.0
2 16085545.0
3 19109456.0
4 22625164.0
5 4882352.5
6 19386894.0
7 17435704.0
8 5866794.0
9 1779476.125
10 2475355.75
11 3562235.0
12 3578970.25
13 2684801.5
14 1702981.25
15 1092437.625
16 796163.8125
17 615677.6875
18 492541.9375
19 411756.9375
20 387586.15625
21 370316.90625
22 329063.6875
23 287027.25
24 272474.75
25 273074.625
26 254139.890625
27 211765.515625
28 183986.21875
29 176667.59375
30 183827.25
31 194530.15625
32 188717.21875
33 207088.75
34 208038.4375
35 199122.078125
36 166840.03125
37 174591.03125
38 176597.484375
39 140398.609375
40 138868.796875
41 157865.65625
42 137590.78125
43 120121.234375
44 115022.4765625
45 114746.0625
46 128687.8203125
47 122513.5
48 124296.359375
49 129335.0390625
50 152576.4375
51 237963.09375
52 360334.5625
53 651817.75
54 1130345.0
55 2289449.75
56 5810840.0
57 15986164.0
58 34141536.0
59 13199038.0
60 9175166.0
61 7107295.0
62 9574194.0
63 6107208.0
64 2266863.0
65 3982544.0
66 1618833.5
67 1429752.25
68 824029

## torch.nn

In [20]:
import torch

# Batch size, input dimension, hidden dimension, output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

dtype = torch.float
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cpu')
if torch.cuda.is_available():
    device = torch.device('cuda')

# Random inputs and targets.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Two-layer network: linear -> ReLU -> linear.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out)
)
# Put the model on the same device (and dtype) as the data. An unconditional
# .cuda() would raise on machines without CUDA and would ignore the CPU
# fallback selected above.
model = model.to(device=device, dtype=dtype)

# Weight initialisation: redraw both weight matrices with normal_'s default arguments.
torch.nn.init.normal_(model[0].weight)
torch.nn.init.normal_(model[2].weight)

learning_rate = 1e-6
for t in range(500):
    # Forward pass and sum-of-squares loss.
    y_pred = model(x)
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Clear the gradients before backward() so they do not accumulate.
    model.zero_grad()
    loss.backward()

    # Plain gradient-descent update, kept out of the autograd graph.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

0 39398904.0
1 44871072.0
2 56508696.0
3 57669480.0
4 38943396.0
5 15959992.0
6 5306344.0
7 2383199.75
8 1580726.75
9 1241528.75
10 1026383.25
11 863871.25
12 734585.375
13 629300.9375
14 542591.125
15 470506.875
16 410119.0625
17 359139.71875
18 315801.5
19 278757.65625
20 247007.328125
21 219690.390625
22 196032.625
23 175410.15625
24 157365.21875
25 141536.34375
26 127570.234375
27 115242.46875
28 104309.140625
29 94579.34375
30 85899.40625
31 78138.3515625
32 71183.6875
33 64939.48046875
34 59321.42578125
35 54260.1875
36 49686.6015625
37 45545.06640625
38 41792.71875
39 38386.83984375
40 35292.203125
41 32474.310546875
42 29905.400390625
43 27561.626953125
44 25423.38671875
45 23472.640625
46 21694.51171875
47 20065.08984375
48 18570.34765625
49 17198.462890625
50 15937.8056640625
51 14779.119140625
52 13711.990234375
53 12728.072265625
54 11821.01953125
55 10984.294921875
56 10211.517578125
57 9497.142578125
58 8836.6953125
59 8225.83984375
60 7660.16796875
61 7136.1796875
62 665

372 0.000624257605522871
373 0.0006050699739716947
374 0.0005861574900336564
375 0.0005680497852154076
376 0.000550935510545969
377 0.0005352014559321105
378 0.000517883338034153
379 0.0005020039388909936
380 0.00048727431567385793
381 0.00047284673200920224
382 0.0004592192708514631
383 0.00044580019311979413
384 0.00043341959826648235
385 0.0004203605931252241
386 0.00040844816248863935
387 0.0003971424011979252
388 0.0003857869887724519
389 0.00037541543133556843
390 0.00036533427191898227
391 0.0003554329741746187
392 0.0003461419837549329
393 0.0003362229326739907
394 0.0003274981281720102
395 0.0003184568486176431
396 0.0003095910360570997
397 0.00030174170387908816
398 0.0002937008976005018
399 0.0002862890833057463
400 0.00027842726558446884
401 0.0002713867579586804
402 0.00026485530543141067
403 0.00025839023874141276
404 0.0002516908571124077
405 0.00024563720216974616
406 0.00023938405502121896
407 0.0002333812153665349
408 0.00022796605480834842
409 0.0002222139883087948
4

In [14]:
# Display the model: the notebook shows the repr of the cell's last expression.
model

Sequential(
  (0): Linear(in_features=1000, out_features=100, bias=True)
  (1): ReLU()
  (2): Linear(in_features=100, out_features=10, bias=True)
)

## torch.optim

In [18]:
import torch

# Batch size, input dimension, hidden dimension, output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

dtype = torch.float
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cpu')
if torch.cuda.is_available():
    device = torch.device('cuda')

# Random inputs and targets.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Two-layer network: linear -> ReLU -> linear.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out)
)
# Put the model on the same device (and dtype) as the data. An unconditional
# .cuda() would raise on machines without CUDA and would ignore the CPU
# fallback selected above.
model = model.to(device=device, dtype=dtype)

# No explicit weight re-initialisation here: the Linear layers keep their
# default initialisation.

learning_rate = 1e-4
# Summed (not averaged) squared error over all elements.
loss_fn = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for t in range(500):
    # Forward pass and loss.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Clear the gradients before backward() so they do not accumulate.
    optimizer.zero_grad()
    loss.backward()
    # Let the optimizer update the parameters.
    optimizer.step()

0 725.2850341796875
1 706.8709106445312
2 688.9794921875
3 671.631591796875
4 654.6723022460938
5 638.1760864257812
6 622.1928100585938
7 606.7594604492188
8 591.7572631835938
9 577.149658203125
10 562.9713134765625
11 549.3047485351562
12 536.007080078125
13 523.0706787109375
14 510.5286560058594
15 498.3242492675781
16 486.4298095703125
17 474.8252258300781
18 463.4679260253906
19 452.3950500488281
20 441.64910888671875
21 431.2353515625
22 421.1253967285156
23 411.29803466796875
24 401.751953125
25 392.43963623046875
26 383.3364562988281
27 374.423828125
28 365.7315979003906
29 357.25457763671875
30 349.00958251953125
31 340.9651184082031
32 333.1305236816406
33 325.48388671875
34 318.01959228515625
35 310.7190856933594
36 303.5880126953125
37 296.61761474609375
38 289.79486083984375
39 283.10675048828125
40 276.5562438964844
41 270.13446044921875
42 263.8420715332031
43 257.6810302734375
44 251.6356201171875
45 245.72610473632812
46 239.94232177734375
47 234.28160095214844
48 228.7

356 0.00016047673125285655
357 0.00015123935008887202
358 0.00014255348651204258
359 0.00013434984430205077
360 0.00012662755034398288
361 0.00011935496149817482
362 0.00011250180978095159
363 0.00010604473209241405
364 9.996045992011204e-05
365 9.422607399756089e-05
366 8.88237773324363e-05
367 8.373542368644848e-05
368 7.894089503679425e-05
369 7.441878551617265e-05
370 7.015842129476368e-05
371 6.614468293264508e-05
372 6.236454646568745e-05
373 5.879883610759862e-05
374 5.543696170207113e-05
375 5.227121073403396e-05
376 4.928819907945581e-05
377 4.647619425668381e-05
378 4.382453698781319e-05
379 4.13287416449748e-05
380 3.897082569892518e-05
381 3.6749508581124246e-05
382 3.466182897682302e-05
383 3.268208456574939e-05
384 3.082420153077692e-05
385 2.906956615333911e-05
386 2.741575190157164e-05
387 2.5856605134322308e-05
388 2.4387085431953892e-05
389 2.2999762222752906e-05
390 2.169427534681745e-05
391 2.0461638996494003e-05
392 1.9300177882541902e-05
393 1.820378929551225e-05


In [22]:
import torch

# Sizes used below: N rows per batch, D_in input columns, D_out target columns;
# H is passed on as the hidden size.
N, D_in, H, D_out = 64, 1000, 100, 10

dtype = torch.float
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Random inputs and targets, created directly on the selected device.
x = torch.randn((N, D_in), device=device, dtype=dtype)
y = torch.randn((N, D_out), device=device, dtype=dtype)

class TwoLayerNet(torch.nn.Module):
    """Two linear layers with a ReLU-style clamp between them."""

    def __init__(self, D_in, H, D_out):
        """Create the two linear layers.

        Args:
            D_in: number of input features of the first layer.
            H: number of hidden units (output of the first layer, input of the second).
            D_out: number of output features of the second layer.
        """
        super().__init__()
        # Modules that own parameters are created in __init__.
        self.linear1 = torch.nn.Linear(in_features=D_in, out_features=H)
        self.linear2 = torch.nn.Linear(in_features=H, out_features=D_out)

    def forward(self, x):
        """Return linear2(clamp(linear1(x), min=0))."""
        hidden = self.linear1(x)
        activated = torch.clamp(hidden, min=0)
        return self.linear2(activated)

model = TwoLayerNet(D_in, H, D_out)
# Put the model on the device selected earlier in this cell, i.e. the one x and
# y were created on. An unconditional .cuda() would raise on machines without
# CUDA and would ignore the CPU fallback.
model = model.to(device)

learning_rate = 1e-4
# Summed (not averaged) squared error over all elements.
loss_fn = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for t in range(500):
    # Forward pass and loss.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Clear the gradients before backward() so they do not accumulate.
    optimizer.zero_grad()
    loss.backward()
    # Let the optimizer update the parameters.
    optimizer.step()

0 646.222900390625
1 629.8126831054688
2 613.9668579101562
3 598.5718994140625
4 583.5912475585938
5 569.0726928710938
6 554.918701171875
7 541.2702026367188
8 528.0866088867188
9 515.3579711914062
10 502.98443603515625
11 490.9180603027344
12 479.25738525390625
13 467.9801940917969
14 456.9896545410156
15 446.35015869140625
16 436.0289001464844
17 426.05810546875
18 416.3665771484375
19 406.96417236328125
20 397.8769836425781
21 389.1207580566406
22 380.602783203125
23 372.2799987792969
24 364.18353271484375
25 356.29986572265625
26 348.5525817871094
27 340.9803771972656
28 333.5666809082031
29 326.2987976074219
30 319.18719482421875
31 312.2203063964844
32 305.4586181640625
33 298.87408447265625
34 292.44744873046875
35 286.1553039550781
36 279.9931640625
37 273.9505615234375
38 268.0096435546875
39 262.2069396972656
40 256.5260925292969
41 250.923828125
42 245.42245483398438
43 240.02468872070312
44 234.7237091064453
45 229.50900268554688
46 224.3854522705078
47 219.35552978515625
4

355 0.0001834159338613972
356 0.00017267667863052338
357 0.0001625441072974354
358 0.00015298539074137807
359 0.00014397763879969716
360 0.00013548512652050704
361 0.00012747812434099615
362 0.00011993492807960138
363 0.00011282241030130535
364 0.00010611692414386198
365 9.980745380744338e-05
366 9.385679004481062e-05
367 8.8251028500963e-05
368 8.297542080981657e-05
369 7.80054833739996e-05
370 7.332249515457079e-05
371 6.891584052937105e-05
372 6.476210546679795e-05
373 6.08533518970944e-05
374 5.7177210692316294e-05
375 5.371588849811815e-05
376 5.046038131695241e-05
377 4.739152791444212e-05
378 4.450618871487677e-05
379 4.17918199673295e-05
380 3.92396432289388e-05
381 3.6839024687651545e-05
382 3.458041828707792e-05
383 3.2457905035698786e-05
384 3.0460596462944523e-05
385 2.8584294341271743e-05
386 2.6818213882506825e-05
387 2.5161256417050026e-05
388 2.3604226953466423e-05
389 2.213869447587058e-05
390 2.0762336134794168e-05
391 1.9470178813207895e-05
392 1.825482104322873e-05


In [23]:
# Display the model: the notebook shows the repr of the cell's last expression.
model

TwoLayerNet(
  (linear1): Linear(in_features=1000, out_features=100, bias=True)
  (linear2): Linear(in_features=100, out_features=10, bias=True)
)