# Warm-up: numpy

In [1]:
import numpy as np

# Two-layer fully connected ReLU network fitted to random data, with the
# forward and backward passes written out by hand in plain numpy.
# x holds N samples of dimension D_in, the hidden layer has H units and
# the targets y have dimension D_out.
N, D_in, H, D_out = 64, 1000, 100, 10

# Seeded generator: without a fixed seed every run draws different data and
# initial weights, so the printed losses can never be reproduced.
rng = np.random.default_rng(0)

# Random input and target data.
x = rng.standard_normal((N, D_in))
y = rng.standard_normal((N, D_out))

# Randomly initialised weights of the two linear layers.
w1 = rng.standard_normal((D_in, H))
w2 = rng.standard_normal((H, D_out))

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)

    # Loss is the sum of squared errors over all samples and outputs.
    loss = np.square(y_pred - y).sum()
    print(t, loss)

    # Backward pass: apply the chain rule layer by layer, starting from
    # d(loss)/d(y_pred) = 2 * (y_pred - y).
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0  # ReLU passes no gradient where its input was negative
    grad_w1 = x.T.dot(grad_h)

    # Plain gradient-descent step, updating the weights in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 25238468.96175484
1 16843072.006499216
2 12873946.9568136
3 10729920.761106938
4 9342436.790480155
5 8259923.690570804
6 7219635.74978728
7 6194434.431912036
8 5164873.132608691
9 4207834.3597256355
10 3347324.480498531
11 2627627.7025865302
12 2040342.686117937
13 1581755.5288110157
14 1227031.28103017
15 958640.0085005475
16 755493.4298014969
17 602960.2779918243
18 487330.85850459756
19 399427.7347184724
20 331751.05998999823
21 279182.36222174665
22 237761.87918454097
23 204728.7049745048
24 178024.59962924424
25 156146.69722410614
26 137983.78102229588
27 122736.42461935188
28 109789.02234610188
29 98695.34257335303
30 89104.55102347568
31 80749.6421941465
32 73419.04066318982
33 66951.00540828591
34 61223.12292713462
35 56123.137614668114
36 51558.93350039655
37 47459.185718776374
38 43767.73318247953
39 40428.43936571274
40 37400.209627116856
41 34648.400961596286
42 32140.940887448047
43 29851.585535299084
44 27759.143679222918
45 25842.04430747885
46 24082.34613744948
47 224

427 0.0018906777893392757
428 0.001817905313495031
429 0.0017478761639213745
430 0.001680499767722996
431 0.0016157614935574946
432 0.0015534910390145916
433 0.0014936355237114172
434 0.0014360954184829515
435 0.0013808111953172945
436 0.001327707481623784
437 0.001276592167131334
438 0.0012274345290395714
439 0.0011801673063530733
440 0.001134733639433126
441 0.0010910689081875839
442 0.001049076484775216
443 0.001008768367900866
444 0.0009699742727042291
445 0.0009326626270948499
446 0.0008967895707251777
447 0.0008622999281455521
448 0.0008291300910874961
449 0.0007972428198695137
450 0.0007666129599897735
451 0.0007371829449432876
452 0.0007088521445652868
453 0.0006816050028728376
454 0.0006554157626989305
455 0.0006302295102752537
456 0.0006060371607822072
457 0.000582750315468384
458 0.0005604099233838081
459 0.000538887459559452
460 0.000518202142708918
461 0.0004983024534407
462 0.00047917236863188493
463 0.0004607782834281125
464 0.00044309045013928364
465 0.00042610364091007

# PyTorch: Tensors

In [2]:
import torch

# Same two-layer ReLU network as a hand-written forward/backward pass,
# this time on torch tensors (no autograd is used here).
dtype = torch.float
device = torch.device("cpu")  # every tensor below is created on this device

# x holds N samples of dimension D_in, the hidden layer has H units and
# the targets y have dimension D_out.
N, D_in, H, D_out = 64, 1000, 100, 10

# Fix the RNG seed: without it every run draws different data and initial
# weights, so the printed losses can never be reproduced.
torch.manual_seed(0)

# Random input and target data.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialised weights of the two linear layers.
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # Sum of squared errors; .item() converts the 0-dim tensor to a Python float.
    loss = (y_pred - y).pow(2).sum().item()
    print(t, loss)

    # Backward pass: apply the chain rule layer by layer, starting from
    # d(loss)/d(y_pred) = 2 * (y_pred - y).
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0  # ReLU passes no gradient where its input was negative
    grad_w1 = x.t().mm(grad_h)

    # Plain gradient-descent step, updating the weights in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 41025824.0
1 45804540.0
2 52434772.0
3 47876240.0
4 30639646.0
5 13561927.0
6 5297458.0
7 2491413.25
8 1569271.625
9 1182325.25
10 959903.625
11 802769.8125
12 681027.5
13 582889.75
14 502356.75
15 435529.71875
16 379535.125
17 332355.03125
18 292328.75
19 258150.90625
20 228809.578125
21 203463.953125
22 181525.8125
23 162432.078125
24 145764.21875
25 131147.78125
26 118283.578125
27 106917.328125
28 96834.96875
29 87870.3203125
30 79874.0234375
31 72741.953125
32 66363.2734375
33 60646.31640625
34 55501.4609375
35 50865.15625
36 46677.08203125
37 42885.234375
38 39447.73828125
39 36332.98828125
40 33501.6015625
41 30922.888671875
42 28569.81640625
43 26422.064453125
44 24460.509765625
45 22662.830078125
46 21014.197265625
47 19500.900390625
48 18110.89453125
49 16834.103515625
50 15658.6123046875
51 14575.8857421875
52 13576.97265625
53 12654.109375
54 11801.2392578125
55 11012.375
56 10281.982421875
57 9605.65625
58 8978.962890625
59 8397.2294921875
60 7857.12890625
61 7355.287109

482 0.00031639085500501096
483 0.000310271221678704
484 0.00030421995325013995
485 0.00029861292568966746
486 0.0002921300183515996
487 0.0002863892586901784
488 0.00028073310386389494
489 0.00027574069099500775
490 0.0002700286277104169
491 0.0002654347044881433
492 0.0002599174913484603
493 0.0002550958306528628
494 0.00025106844259426
495 0.00024687551194801927
496 0.00024152096011675894
497 0.00023785994562786072
498 0.00023227109340950847
499 0.00022819239529781044


# Autograd
## PyTorch: Tensors and autograd