In [1]:
## Tensors

# Warm-up: numpy

import numpy as np

# Problem size: batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialised weights for the two linear layers.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6

for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum-of-squares loss between prediction and target.
    residual = y_pred - y
    loss = np.square(residual).sum()
    print(t, loss)

    # Backward pass: hand-derived gradients of the loss w.r.t. w1 and w2.
    grad_y_pred = 2.0 * residual
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU blocks the gradient wherever its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Plain gradient-descent step, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 29386185.277615204
1 24381573.409779944
2 25347963.732733175
3 27993518.4545618
4 28920077.629678715
5 25443378.075138606
6 18423414.02838054
7 11090798.992130037
8 6015693.40875072
9 3225656.1741571603
10 1866027.385132644
11 1208473.5397490165
12 872602.684634529
13 681600.0300202263
14 559284.197147436
15 472248.5719116131
16 405480.3301759525
17 351780.36309921386
18 307324.3475098302
19 269957.1908675149
20 238155.92630745727
21 210868.78955307748
22 187325.74505879727
23 166928.04030038006
24 149192.52423766395
25 133693.78255496506
26 120109.28542858441
27 108147.30499314092
28 97587.80603628694
29 88229.80417631645
30 79909.00501023611
31 72490.78003627344
32 65867.24975736217
33 59938.208864740984
34 54623.40075619484
35 49850.04411650407
36 45553.72245913825
37 41685.495232037865
38 38190.7638489604
39 35027.50258234168
40 32161.303076656663
41 29560.656395945796
42 27197.097485598868
43 25049.671892238148
44 23093.14985355165
45 21309.40389143237
46 19680.90623156406
47 18

382 0.00044499040864380297
383 0.0004243132207365896
384 0.0004045869546672503
385 0.00038577914918837763
386 0.00036784555867350237
387 0.0003507520581663224
388 0.00033446020836517225
389 0.0003189259508928115
390 0.00030411205413780583
391 0.00028998788256106634
392 0.00027651974998133824
393 0.0002636776849767081
394 0.00025143824584022436
395 0.0002397704337120709
396 0.00022864392414439239
397 0.00021803264602790598
398 0.0002079140231852273
399 0.00019826661964315856
400 0.00018906925625520113
401 0.00018029736943610071
402 0.00017193381140431246
403 0.00016396139240700794
404 0.00015635931759890398
405 0.00014910929689037624
406 0.00014219869530683214
407 0.00013560876111204102
408 0.0001293245471454016
409 0.0001233312617787396
410 0.00011761748159442877
411 0.000112168522489053
412 0.0001069729862754315
413 0.00010201714448422388
414 9.72908427154389e-05
415 9.278441726393486e-05
416 8.848812830255753e-05
417 8.439312917344122e-05
418 8.048828202083088e-05
419 7.6762022191918

In [2]:
# PyTorch: Tensors

import torch

dtype = torch.float
device = torch.device("cpu")
# To run on the GPU instead, use: device = torch.device("cuda:0")

# Problem size: batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Every tensor below is created on the same device with the same dtype.
tensor_opts = dict(device=device, dtype=dtype)

# Random inputs and regression targets.
x = torch.randn(N, D_in, **tensor_opts)
y = torch.randn(N, D_out, **tensor_opts)

# Randomly initialised weights for the two linear layers.
w1 = torch.randn(D_in, H, **tensor_opts)
w2 = torch.randn(H, D_out, **tensor_opts)

learning_rate = 1e-6

for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = torch.mm(x, w1)
    h_relu = h.clamp(min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squares loss, converted to a Python float for printing.
    residual = y_pred - y
    loss = residual.pow(2).sum().item()
    print(t, loss)

    # Backward pass: hand-derived gradients of the loss w.r.t. w1 and w2.
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU blocks the gradient wherever its input was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Plain gradient-descent step, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 28553870.0
1 23810144.0
2 25571608.0
3 30020836.0
4 33172264.0
5 30683462.0
6 22299642.0
7 12690911.0
8 6218472.5
9 2999072.25
10 1617806.625
11 1024041.25
12 742991.875
13 587003.5
14 485545.25
15 411146.0625
16 352579.4375
17 304753.75
18 264918.625
19 231360.015625
20 202849.8125
21 178496.59375
22 157600.59375
23 139541.46875
24 123893.1875
25 110281.203125
26 98408.859375
27 88013.84375
28 78886.53125
29 70849.3984375
30 63764.37890625
31 57498.5546875
32 51933.62890625
33 46987.17578125
34 42578.21875
35 38640.671875
36 35116.55078125
37 31956.21484375
38 29119.970703125
39 26570.09765625
40 24272.873046875
41 22198.53515625
42 20324.7578125
43 18627.994140625
44 17089.822265625
45 15694.2109375
46 14426.2578125
47 13272.6708984375
48 12221.73046875
49 11263.482421875
50 10388.7978515625
51 9589.109375
52 8857.62890625
53 8187.74560546875
54 7574.51416015625
55 7010.68896484375
56 6492.96484375
57 6017.349609375
58 5580.0390625
59 5177.4775390625
60 4806.5498046875
61 4464.5761

In [1]:
## Autograd

# PyTorch: Tensors and autograd

import torch

dtype = torch.float
device = torch.device("cpu")
# To run on the GPU instead, use: device = torch.device("cuda:0")

# Problem size: batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and targets are plain data: no gradients are tracked for them.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# requires_grad=True asks autograd to record operations on the weights so
# that loss.backward() can populate w1.grad and w2.grad.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6

for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    hidden = x.mm(w1).clamp(min=0)
    y_pred = hidden.mm(w2)

    # Sum-of-squares loss, kept as a tensor so it can be differentiated.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Autograd computes d(loss)/d(w1) and d(loss)/d(w2).
    loss.backward()

    # The update itself must not be recorded in the autograd graph.
    with torch.no_grad():
        for weight in (w1, w2):
            weight.sub_(learning_rate * weight.grad)
            # Gradients accumulate across backward() calls, so clear them.
            weight.grad.zero_()

0 24393210.0
1 19898516.0
2 18379532.0
3 17470066.0
4 16011955.0
5 13579842.0
6 10549851.0
7 7573131.0
8 5145821.5
9 3406325.75
10 2257453.0
11 1532198.0
12 1079197.875
13 793616.5
14 608630.5625
15 484265.8125
16 396838.1875
17 332705.90625
18 283746.875
19 245074.21875
20 213691.0
21 187684.96875
22 165800.4375
23 147156.484375
24 131124.78125
25 117227.359375
26 105108.2109375
27 94495.5
28 85172.8203125
29 76948.5546875
30 69673.703125
31 63216.09375
32 57467.58203125
33 52333.7109375
34 47738.1015625
35 43614.3828125
36 39907.14453125
37 36566.74609375
38 33552.40625
39 30826.810546875
40 28358.455078125
41 26119.1328125
42 24081.162109375
43 22228.734375
44 20543.515625
45 19005.30078125
46 17599.392578125
47 16312.9267578125
48 15134.0634765625
49 14053.376953125
50 13060.455078125
51 12147.380859375
52 11306.744140625
53 10531.6640625
54 9816.30859375
55 9155.69140625
56 8544.599609375
57 7979.20654296875
58 7455.421875
59 6969.9541015625
60 6519.42529296875
61 6101.3193359375


In [2]:
# PyTorch: Defining new autograd functions

import torch

class MyReLU(torch.autograd.Function):
    """Custom autograd Function implementing ReLU with a hand-written backward.

    Invoke it through ``MyReLU.apply(tensor)``.
    """

    @staticmethod
    def forward(ctx, input):
        """Return ``input`` with negative entries replaced by zero.

        The input is stashed on ``ctx`` because the backward pass needs it
        to know which entries were negative.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient of the loss with respect to the forward input.

        The incoming gradient is copied through unchanged wherever the saved
        input was >= 0 and set to zero wherever it was negative.
        """
        (saved_input,) = ctx.saved_tensors
        # masked_fill returns a new tensor, so grad_output is not modified.
        return grad_output.masked_fill(saved_input < 0, 0)
    
dtype = torch.float
device = torch.device("cpu")
# To run on the GPU instead, use: device = torch.device("cuda:0")

# Problem size: batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and targets are plain data: no gradients are tracked for them.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weights are tracked by autograd so backward() fills in their .grad.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6

# Custom Functions are invoked through their .apply attribute; the alias does
# not change between iterations, so it is bound once up front.
relu = MyReLU.apply

for t in range(500):
    # Forward pass: linear -> custom ReLU -> linear.
    hidden = relu(x.mm(w1))
    y_pred = hidden.mm(w2)

    # Sum-of-squares loss, kept as a tensor so it can be differentiated.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Autograd calls MyReLU.backward as part of this.
    loss.backward()

    # The update itself must not be recorded in the autograd graph.
    with torch.no_grad():
        for weight in (w1, w2):
            weight.sub_(learning_rate * weight.grad)
            # Gradients accumulate across backward() calls, so clear them.
            weight.grad.zero_()

0 37723876.0
1 32243950.0
2 28160962.0
3 21920772.0
4 14836189.0
5 8902282.0
6 5163921.5
7 3096202.0
8 2019411.375
9 1436042.25
10 1095044.5
11 875098.0625
12 720367.9375
13 604013.875
14 512762.03125
15 438884.46875
16 378224.03125
17 327719.84375
18 285301.5
19 249401.0625
20 218793.6875
21 192584.265625
22 170059.21875
23 150581.484375
24 133688.84375
25 118973.3828125
26 106125.6171875
27 94872.59375
28 84982.796875
29 76265.0390625
30 68564.5546875
31 61742.04296875
32 55684.265625
33 50294.90234375
34 45495.06640625
35 41206.91015625
36 37368.7890625
37 33927.42578125
38 30837.134765625
39 28056.791015625
40 25554.408203125
41 23297.744140625
42 21260.12890625
43 19416.998046875
44 17749.619140625
45 16239.9130859375
46 14872.1640625
47 13628.6875
48 12497.5361328125
49 11468.67578125
50 10532.3544921875
51 9679.4765625
52 8900.16015625
53 8188.888671875
54 7538.59716796875
55 6943.76806640625
56 6399.40869140625
57 5900.87939453125
58 5443.67041015625
59 5024.2626953125
60 4639.

452 4.3030708184232935e-05
453 4.2497635149629787e-05
454 4.177588925813325e-05
455 4.145426646573469e-05
456 4.082079976797104e-05
457 4.042177170049399e-05
458 3.980812834925018e-05
459 3.9169666706584394e-05
460 3.871975059155375e-05
461 3.837568874587305e-05
462 3.7761012208648026e-05
463 3.727358125615865e-05
464 3.675563493743539e-05
465 3.636349720181897e-05
466 3.575021764845587e-05
467 3.54553631041199e-05
468 3.487420690362342e-05
469 3.4456657886039466e-05
470 3.397001273697242e-05
471 3.36980156134814e-05
472 3.3204018109245226e-05
473 3.27296438626945e-05
474 3.2404019293608144e-05
475 3.202541483915411e-05
476 3.16886835207697e-05
477 3.136873783660121e-05
478 3.114940409432165e-05
479 3.0646853701910004e-05
480 3.052971078432165e-05
481 3.0099201467237435e-05
482 2.9913186153862625e-05
483 2.965663225040771e-05
484 2.933678297267761e-05
485 2.912984200520441e-05
486 2.886545735236723e-05
487 2.8504315196187235e-05
488 2.8213038604008034e-05
489 2.7786476493929513e-05
490

In [None]:
# TensorFlow: Static Graphs

import tensorflow as tf
import numpy as np


# Problem size: batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# --- Graph definition: nothing below is computed until sess.run() ---------

# Placeholders are fed with concrete data at run time; the leading dimension
# is left as None in the declared shape.
x = tf.placeholder(tf.float32, shape=(None, D_in))
y = tf.placeholder(tf.float32, shape=(None, D_out))

# Variables keep their values across sess.run() calls.
w1 = tf.Variable(tf.random_normal((D_in, H)))
w2 = tf.Variable(tf.random_normal((H, D_out)))

# Forward pass: linear -> ReLU (element-wise max with zero) -> linear.
h = tf.matmul(x, w1)
h_relu = tf.maximum(h, tf.zeros(1))
y_pred = tf.matmul(h_relu, w2)

# Sum-of-squares loss.
loss = tf.reduce_sum((y - y_pred) ** 2.0)

# Symbolic gradients of the loss with respect to both weight matrices.
grad_w1, grad_w2 = tf.gradients(loss, [w1, w2])

# Gradient-descent updates expressed as assign ops; they only take effect
# when the ops are fetched in sess.run().
learning_rate = 1e-6
new_w1 = w1.assign(w1 - learning_rate * grad_w1)
new_w2 = w2.assign(w2 - learning_rate * grad_w2)

# --- Graph execution -------------------------------------------------------

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # Concrete numpy data bound to the placeholders on every step.
    x_value = np.random.randn(N, D_in)
    y_value = np.random.randn(N, D_out)
    feed = {x: x_value, y: y_value}

    # Fetching the assign ops alongside the loss is what applies the update.
    fetches = [loss, new_w1, new_w2]
    for _ in range(500):
        loss_value, _, _ = sess.run(fetches, feed_dict=feed)
        print(loss_value)

In [1]:
## nn module

# PyTorch: nn

import torch


# Problem size: batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Two linear layers with a ReLU in between; each Linear owns its own
# weight and bias parameters.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# Summed (not averaged) squared error. reduction="sum" is the supported
# spelling of the deprecated size_average=False.
loss_fn = torch.nn.MSELoss(reduction="sum")

learning_rate = 1e-4
for t in range(500):
    # Forward pass through the whole model.
    y_pred = model(x)

    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Gradients accumulate across backward() calls, so clear them first.
    model.zero_grad()

    # Populate .grad on every parameter of the model.
    loss.backward()

    # Manual gradient-descent step; it must not be recorded by autograd.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

0 700.7489624023438
1 647.357421875
2 600.7996215820312
3 560.2442016601562
4 524.2097778320312
5 492.04022216796875
6 462.7208557128906
7 436.01507568359375
8 411.5037841796875
9 388.8995056152344
10 367.64849853515625
11 347.6166687011719
12 328.6358947753906
13 310.6544189453125
14 293.6636657714844
15 277.517578125
16 262.1627502441406
17 247.5381622314453
18 233.57264709472656
19 220.19517517089844
20 207.4617919921875
21 195.34657287597656
22 183.8236083984375
23 172.86669921875
24 162.44992065429688
25 152.57144165039062
26 143.204345703125
27 134.3265838623047
28 125.95684814453125
29 118.05594635009766
30 110.60294342041016
31 103.59131622314453
32 96.97369384765625
33 90.75634002685547
34 84.92403411865234
35 79.45568084716797
36 74.32743072509766
37 69.51313781738281
38 65.00189971923828
39 60.78141403198242
40 56.82994842529297
41 53.13576126098633
42 49.68157196044922
43 46.4616584777832
44 43.45791244506836
45 40.653778076171875
46 38.04478073120117
47 35.60920333862305
4

In [1]:
# PyTorch: optim

import torch


# Problem size: batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Two linear layers with a ReLU in between.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# Summed (not averaged) squared error. reduction="sum" is the supported
# spelling of the deprecated size_average=False.
loss_fn = torch.nn.MSELoss(reduction="sum")

learning_rate = 1e-4
# The optimizer is handed the model's parameters and updates them in step().
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass through the whole model.
    y_pred = model(x)

    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Gradients accumulate across backward() calls, so clear them first.
    optimizer.zero_grad()

    # Populate .grad on every parameter of the model.
    loss.backward()

    # Apply the Adam update to every parameter.
    optimizer.step()

0 589.2819213867188
1 573.3584594726562
2 557.8797607421875
3 542.8906860351562
4 528.3540649414062
5 514.3221435546875
6 500.7491149902344
7 487.79608154296875
8 475.3086853027344
9 463.2677917480469
10 451.5050048828125
11 440.1197814941406
12 429.098876953125
13 418.3712158203125
14 407.9158935546875
15 397.77581787109375
16 387.8565979003906
17 378.1832275390625
18 368.7757263183594
19 359.7100524902344
20 350.89935302734375
21 342.3202819824219
22 333.9433898925781
23 325.764404296875
24 317.77935791015625
25 310.00592041015625
26 302.4096374511719
27 294.9933166503906
28 287.77435302734375
29 280.71539306640625
30 273.8501892089844
31 267.2042541503906
32 260.7480773925781
33 254.45346069335938
34 248.3216552734375
35 242.3748321533203
36 236.58099365234375
37 230.94631958007812
38 225.452392578125
39 220.0713653564453
40 214.7965850830078
41 209.64073181152344
42 204.61807250976562
43 199.71868896484375
44 194.90594482421875
45 190.19398498535156
46 185.5951690673828
47 181.0863

438 8.166056204572669e-07
439 7.708489420110709e-07
440 7.276319138327381e-07
441 6.872818403280689e-07
442 6.486739607680647e-07
443 6.124359401837864e-07
444 5.781882350675005e-07
445 5.454751885736187e-07
446 5.147951469552936e-07
447 4.85955069962074e-07
448 4.5861943931413407e-07
449 4.32583306064771e-07
450 4.0816820501277107e-07
451 3.853167527267942e-07
452 3.632268885667145e-07
453 3.4260739312230726e-07
454 3.229742731036822e-07
455 3.0470715728370124e-07
456 2.8756031156262907e-07
457 2.710236230996088e-07
458 2.556564027145214e-07
459 2.4104477347464126e-07
460 2.2743103045286261e-07
461 2.141314041637088e-07
462 2.019852445300785e-07
463 1.9057003441957931e-07
464 1.79551619794438e-07
465 1.6899524268865207e-07
466 1.5950745080317574e-07
467 1.5013915799499955e-07
468 1.416343735627379e-07
469 1.3341079352358065e-07
470 1.2562883000555303e-07
471 1.1848956660287513e-07
472 1.1161203872234182e-07
473 1.0508230730010837e-07
474 9.875570583517401e-08
475 9.320591232153674e-08

In [3]:
# PyTorch: Custom nn Modules

import torch


class TwoLayerNet(torch.nn.Module):
    """Two linear layers with a ReLU-style clamp in between."""

    def __init__(self, D_in, H, D_out):
        """Create the two linear layers.

        Args:
            D_in: number of input features of the first layer.
            H: number of hidden features between the two layers.
            D_out: number of output features of the second layer.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """Apply linear1, clamp negatives to zero, then apply linear2."""
        hidden = torch.clamp(self.linear1(x), min=0)
        return self.linear2(hidden)
    
# Problem size: batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = TwoLayerNet(D_in, H, D_out)

# Summed (not averaged) squared error. reduction="sum" is the supported
# spelling of the deprecated size_average=False.
criterion = torch.nn.MSELoss(reduction="sum")
# Plain stochastic gradient descent over all of the model's parameters.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
for t in range(500):
    # Forward pass through the custom module.
    y_pred = model(x)

    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Clear accumulated gradients, backpropagate, then update the parameters.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 604.2501220703125
1 561.86328125
2 524.5908813476562
3 491.9476623535156
4 463.1071472167969
5 436.72003173828125
6 412.4529113769531
7 390.1387939453125
8 369.47979736328125
9 350.2677001953125
10 332.143798828125
11 315.17681884765625
12 299.19195556640625
13 284.0091247558594
14 269.6421813964844
15 256.0260009765625
16 243.0230255126953
17 230.65936279296875
18 218.90084838867188
19 207.71484375
20 197.05064392089844
21 186.8916778564453
22 177.2196502685547
23 167.98228454589844
24 159.16815185546875
25 150.7909393310547
26 142.80313110351562
27 135.1962890625
28 127.92610931396484
29 121.0017318725586
30 114.40604400634766
31 108.12629699707031
32 102.16450500488281
33 96.49870300292969
34 91.12225341796875
35 86.03184509277344
36 81.1997299194336
37 76.61700439453125
38 72.29312133789062
39 68.20164489746094
40 64.33855438232422
41 60.68056869506836
42 57.231632232666016
43 53.974300384521484
44 50.90089797973633
45 48.0051155090332
46 45.274559020996094
47 42.6924934387207
48

In [5]:
# PyTorch: Control Flow + Weight Sharing

import random
import torch


class DynamicNet(torch.nn.Module):
    """Network whose depth is chosen at random on every forward pass.

    One ``middle_linear`` layer is reused for every hidden step, so all
    hidden steps share the same weights.
    """

    def __init__(self, D_in, H, D_out):
        """Create the input, shared middle, and output linear layers.

        Args:
            D_in: number of input features.
            H: number of hidden features (the middle layer maps H -> H).
            D_out: number of output features.
        """
        super().__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """Run the input layer, 0 to 3 shared hidden steps, then the output layer.

        The number of hidden steps is drawn with ``random.randint(0, 3)`` on
        each call; every step is followed by a clamp of negatives to zero.
        """
        hidden = torch.clamp(self.input_linear(x), min=0)
        repeats = random.randint(0, 3)
        for _ in range(repeats):
            # Same module on every pass: this is the weight sharing.
            hidden = torch.clamp(self.middle_linear(hidden), min=0)
        return self.output_linear(hidden)
    
# Problem size: batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = DynamicNet(D_in, H, D_out)

# Summed (not averaged) squared error. The name must be `criterion` because
# the loop below uses it; the earlier misspelling only ran when a `criterion`
# from another cell was still alive in the kernel. reduction="sum" is the
# supported spelling of the deprecated size_average=False.
criterion = torch.nn.MSELoss(reduction="sum")
# SGD with momentum over all of the model's parameters.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
for t in range(500):
    # Forward pass; the model picks its own depth on every call.
    y_pred = model(x)

    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Clear accumulated gradients, backpropagate, then update the parameters.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 597.8226928710938
1 596.164306640625
2 638.4486694335938
3 608.0631713867188
4 587.9027099609375
5 589.6530151367188
6 590.8796997070312
7 566.471923828125
8 588.7586669921875
9 577.8988037109375
10 575.964599609375
11 525.9031982421875
12 585.696533203125
13 585.0036010742188
14 405.91796875
15 583.5460815429688
16 564.4199829101562
17 337.3154602050781
18 474.47821044921875
19 270.4401550292969
20 578.6642456054688
21 549.1363525390625
22 174.41534423828125
23 423.47430419921875
24 125.30274963378906
25 391.5503845214844
26 88.29550170898438
27 73.43832397460938
28 59.8542366027832
29 556.13525390625
30 298.8168029785156
31 275.7334289550781
32 458.5106506347656
33 72.72035217285156
34 197.39535522460938
35 173.93968200683594
36 149.89788818359375
37 461.4508056640625
38 127.04894256591797
39 116.1816177368164
40 97.67300415039062
41 368.4410095214844
42 240.96116638183594
43 309.318115234375
44 189.52256774902344
45 234.33847045898438
46 199.8353271484375
47 168.9393310546875
48 7

472 0.8493508696556091
473 0.7718315720558167
474 0.172787144780159
475 0.7475428581237793
476 0.7682396769523621
477 0.17503702640533447
478 0.6794538497924805
479 1.1440181732177734
480 1.013344407081604
481 0.14376141130924225
482 0.17260237038135529
483 0.7129086256027222
484 0.2038899064064026
485 1.1513720750808716
486 1.0244065523147583
487 0.5338542461395264
488 0.16853995621204376
489 0.16548241674900055
490 0.14502158761024475
491 0.5999506711959839
492 0.7799789309501648
493 0.07754373550415039
494 0.5445480346679688
495 0.8472185730934143
496 0.4963778257369995
497 0.776951789855957
498 0.20566990971565247
499 0.40270671248435974
