In [1]:
import numpy as np

# Network dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic regression problem: standard-normal inputs and targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Both weight matrices start out as standard-normal noise.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # The loss is the sum of squared errors over the batch; report it each step.
    residual = y_pred - y
    loss = np.square(residual).sum()
    print(t, loss)

    # Backward pass: chain rule written out by hand, from the loss back to w1.
    grad_y_pred = 2.0 * residual
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU passes no gradient wherever its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Gradient-descent step, applied to the weights in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 29742693.404789023
1 24200587.86243423
2 21585942.981861822
3 18918900.11652121
4 15327085.806363292
5 11332985.811212962
6 7753783.418669596
7 5078926.292870736
8 3307818.688685353
9 2205083.4261343298
10 1534302.3698504241
11 1121138.463077792
12 858716.398449783
13 684291.8013320686
14 562459.3503348267
15 473152.0093900049
16 404710.4978673364
17 350463.77096806926
18 306205.3167741669
19 269360.4411053555
20 238193.96106634787
21 211569.9919219577
22 188614.82306740413
23 168684.06200370676
24 151313.74825350696
25 136092.51809378437
26 122700.46140236588
27 110877.22335515832
28 100399.66915295462
29 91092.2001328825
30 82812.10590038166
31 75426.26459750024
32 68814.02660193398
33 62883.8635688995
34 57554.610624344976
35 52755.45544328415
36 48423.62530846175
37 44513.03130954297
38 40975.828627623094
39 37766.68993756043
40 34849.81103455392
41 32197.232931199447
42 29780.926543084875
43 27577.291248560494
44 25563.98231865008
45 23720.84049842956
46 22031.64313163033
47 204

429 0.009788243300146751
430 0.009475828522915898
431 0.009173392909212676
432 0.008880620689306127
433 0.008597222956064356
434 0.008322977555322521
435 0.008057420600306772
436 0.007800345272043836
437 0.007551465216270835
438 0.007310540750839103
439 0.007077317196776554
440 0.0068515539728517694
441 0.0066329856947065485
442 0.006421404272861923
443 0.006216587496812948
444 0.006018325055628403
445 0.005826400907748171
446 0.005640585210565518
447 0.00546072067036389
448 0.005286587398374565
449 0.005118017002521037
450 0.004954820065657472
451 0.004796844128907597
452 0.00464390318576181
453 0.0044958558940919584
454 0.004352538851251579
455 0.004213802162284697
456 0.004079516336610924
457 0.00394948213209936
458 0.003823610743075745
459 0.003701734711251424
460 0.003583749150226132
461 0.0034695309132458697
462 0.0033589644086430997
463 0.003251924465237291
464 0.003148300758303831
465 0.0030479856075255546
466 0.002950860182847258
467 0.002856835628732377
468 0.0027658121879736

In [4]:
import torch


# Every tensor below is single precision and is created on this device.
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0")  # swap this line in to run on a GPU

# Network dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic regression problem: standard-normal inputs and targets.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Both weight matrices start out as standard-normal noise.
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU (clamp at zero) -> linear.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum of squared errors, pulled out as a Python float for printing.
    residual = y_pred - y
    loss = residual.pow(2).sum().item()
    print(t, loss)

    # Backward pass: chain rule written out by hand, from the loss back to w1.
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # The clamp passes no gradient wherever its input was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Gradient-descent step, applied to the weights in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 30598046.0
1 27520068.0
2 28807726.0
3 29446630.0
4 26298672.0
5 19275880.0
6 11756910.0
7 6357585.0
8 3391766.0
9 1937764.75
10 1237253.75
11 878543.25
12 674937.125
13 544997.875
14 453220.5625
15 383605.4375
16 328247.15625
17 282916.53125
18 245190.40625
19 213451.78125
20 186581.5
21 163668.75
22 143992.71875
23 127069.625
24 112440.796875
25 99722.6875
26 88635.953125
27 78943.8125
28 70444.171875
29 62969.515625
30 56386.2265625
31 50570.5625
32 45429.66796875
33 40868.87890625
34 36816.203125
35 33208.47265625
36 29990.76953125
37 27119.345703125
38 24549.4609375
39 22245.99609375
40 20179.26171875
41 18322.43359375
42 16652.3671875
43 15148.9921875
44 13792.9326171875
45 12568.80859375
46 11463.240234375
47 10465.587890625
48 9561.9599609375
49 8742.9912109375
50 7999.8798828125
51 7324.9052734375
52 6711.39404296875
53 6153.03466796875
54 5644.34375
55 5180.8330078125
56 4758.203125
57 4372.63037109375
58 4020.2900390625
59 3698.395263671875
60 3403.84228515625
61 3134.3112

377 7.522724627051502e-05
378 7.359888695646077e-05
379 7.230320625239983e-05
380 7.094382453942671e-05
381 6.94628179189749e-05
382 6.855846004327759e-05
383 6.696760829072446e-05
384 6.585325172636658e-05
385 6.442397716455162e-05
386 6.332462362479419e-05
387 6.224087701411918e-05
388 6.103510895627551e-05
389 6.008254786138423e-05
390 5.8849589549936354e-05
391 5.8150610129814595e-05
392 5.709750621463172e-05
393 5.614052861346863e-05
394 5.500709085026756e-05
395 5.4344345699064434e-05
396 5.3423595090862364e-05
397 5.2528299420373514e-05
398 5.189163493923843e-05
399 5.081737253931351e-05
400 5.017189323552884e-05
401 4.958142380928621e-05
402 4.854674989474006e-05
403 4.789692320628092e-05
404 4.691073263529688e-05
405 4.644472574000247e-05
406 4.573693513520993e-05
407 4.511207953328267e-05
408 4.43023927800823e-05
409 4.374530180939473e-05
410 4.3003186874557287e-05
411 4.241718124831095e-05
412 4.182677002972923e-05
413 4.128791260882281e-05
414 4.080848884768784e-05
415 4.01

In [5]:
# -*- coding: utf-8 -*-
import torch

# Every tensor below is single precision and is created on this device.
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0")  # swap this line in to run on a GPU

# Network dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and targets. requires_grad is left at its default of False, so the
# backward pass does not compute gradients with respect to these Tensors.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weights. requires_grad=True asks autograd to accumulate the gradient of the
# loss with respect to each of them into its .grad attribute.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU (clamp at zero) -> linear. No backward
    # bookkeeping is done by hand; autograd records these operations itself.
    h_relu = torch.clamp(torch.mm(x, w1), min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum of squared errors. The result is a 0-dimensional Tensor, and
    # loss.item() extracts the Python scalar it holds.
    loss = torch.sum(torch.pow(y_pred - y, 2))
    print(t, loss.item())

    # Backpropagate: afterwards w1.grad and w2.grad hold the gradient of the
    # loss with respect to w1 and w2 respectively.
    loss.backward()

    # Gradient-descent step. The weights have requires_grad=True, so the
    # in-place update runs under torch.no_grad() to keep it out of the graph.
    # (torch.optim.SGD would perform the same update.)
    with torch.no_grad():
        for weight in (w1, w2):
            weight -= learning_rate * weight.grad

            # backward() accumulates into .grad, so clear it for the next step.
            weight.grad.zero_()

0 25122816.0
1 18165072.0
2 14046324.0
3 11012541.0
4 8556373.0
5 6529447.0
6 4913071.5
7 3665459.0
8 2736786.75
9 2058878.75
10 1569006.75
11 1214361.5
12 956257.5
13 766340.75
14 624406.0625
15 516351.25
16 432586.8125
17 366477.78125
18 313478.53125
19 270331.65625
20 234764.0625
21 205100.09375
22 180105.453125
23 158869.34375
24 140702.203125
25 125060.71875
26 111505.3359375
27 99724.75
28 89436.328125
29 80416.3125
30 72472.171875
31 65454.4453125
32 59230.06640625
33 53699.07421875
34 48775.45703125
35 44382.59375
36 40450.06640625
37 36927.55859375
38 33768.50390625
39 30924.818359375
40 28360.939453125
41 26043.701171875
42 23946.9296875
43 22047.8515625
44 20325.57421875
45 18759.2421875
46 17332.171875
47 16034.0703125
48 14849.453125
49 13766.5703125
50 12775.357421875
51 11867.18359375
52 11033.73046875
53 10268.1787109375
54 9563.880859375
55 8915.736328125
56 8318.369140625
57 7768.02685546875
58 7259.93212890625
59 6790.29541015625
60 6355.5732421875
61 5952.9482421875

393 0.01242360845208168
394 0.012018254026770592
395 0.011623957194387913
396 0.011241976171731949
397 0.010876486077904701
398 0.010524122044444084
399 0.010178305208683014
400 0.009846679866313934
401 0.009530464187264442
402 0.00922110490500927
403 0.008923443034291267
404 0.008633538149297237
405 0.008354675956070423
406 0.008083795197308064
407 0.00782761536538601
408 0.007574477698653936
409 0.007333463989198208
410 0.007097137626260519
411 0.006870051380246878
412 0.006652187090367079
413 0.006441907957196236
414 0.006231214385479689
415 0.006036771927028894
416 0.005848174449056387
417 0.00565897673368454
418 0.005482226610183716
419 0.005312113557010889
420 0.005144928582012653
421 0.0049871779046952724
422 0.004830086603760719
423 0.004676770884543657
424 0.0045323967933654785
425 0.004391192924231291
426 0.004256180487573147
427 0.0041268873028457165
428 0.0039967563934624195
429 0.0038757158908993006
430 0.003752953140065074
431 0.0036408137530088425
432 0.00352634373120963

## PyTorch: Defining New autograd Functions

In [7]:
import torch


class MyReLU(torch.autograd.Function):
    """
    A hand-written ReLU expressed as a custom autograd Function.

    Subclasses of torch.autograd.Function supply a static forward and a static
    backward, both operating on Tensors. The Function is invoked through
    MyReLU.apply rather than by instantiating the class.
    """

    @staticmethod
    def forward(ctx, input):
        """
        Compute the ReLU of ``input`` by clamping it at zero from below.

        ctx is the context object shared with backward. The input Tensor is
        stashed on it with ctx.save_for_backward so that backward can tell
        which elements were negative.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Turn the gradient of the loss with respect to the output
        (``grad_output``) into the gradient with respect to the input.

        The gradient is passed through unchanged except where the saved input
        was negative; those positions receive zero. A new Tensor is returned
        and ``grad_output`` itself is left untouched.
        """
        (saved_input,) = ctx.saved_tensors
        return grad_output.masked_fill(saved_input < 0, 0)


dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights. requires_grad=True makes autograd
# accumulate the gradient of the loss into w1.grad and w2.grad.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

# To apply our Function, we use the Function.apply method, aliased as 'relu'.
# The alias does not change between iterations, so it is bound once here
# rather than being re-created on every pass through the training loop.
relu = MyReLU.apply

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations; we compute
    # ReLU using our custom autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print loss (sum of squared errors over the batch).
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass; this calls MyReLU.backward
    # for the ReLU step and fills in w1.grad and w2.grad.
    loss.backward()

    # Update weights using gradient descent. The update runs under
    # torch.no_grad() so the in-place change is not tracked by autograd.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights, because
        # backward() accumulates into .grad instead of overwriting it.
        w1.grad.zero_()
        w2.grad.zero_()

0 32162964.0
1 32212562.0
2 38893408.0
3 44593312.0
4 41192448.0
5 27214270.0
6 13102309.0
7 5350409.5
8 2409013.0
9 1375248.625
10 964914.25
11 756712.8125
12 623551.625
13 525434.25
14 448019.75
15 385044.53125
16 332972.84375
17 289479.40625
18 252827.6875
19 221774.265625
20 195356.28125
21 172744.609375
22 153268.46875
23 136412.046875
24 121760.6953125
25 108994.4765625
26 97821.015625
27 87991.7421875
28 79328.2734375
29 71660.0546875
30 64851.6171875
31 58795.27734375
32 53392.0625
33 48572.3984375
34 44261.5546875
35 40391.40234375
36 36910.3671875
37 33772.48046875
38 30940.748046875
39 28381.978515625
40 26064.541015625
41 23966.353515625
42 22065.173828125
43 20336.642578125
44 18764.42578125
45 17330.3984375
46 16020.5751953125
47 14823.0107421875
48 13727.15625
49 12723.166015625
50 11802.203125
51 10956.7666015625
52 10179.984375
53 9465.1044921875
54 8806.8642578125
55 8200.345703125
56 7641.1298828125
57 7124.5263671875
58 6646.8603515625
59 6205.16650390625
60 5796.24

397 0.001281964941881597
398 0.001240330166183412
399 0.0011998952832072973
400 0.0011641669552773237
401 0.0011282701743766665
402 0.0010941632790490985
403 0.0010613000486046076
404 0.0010302530135959387
405 0.0009994155261665583
406 0.0009693116880953312
407 0.000941159320063889
408 0.0009141755290329456
409 0.0008880543173290789
410 0.0008607065537944436
411 0.0008361950749531388
412 0.0008123600855469704
413 0.0007888997788541019
414 0.0007679773843847215
415 0.0007467156974598765
416 0.0007246905006468296
417 0.0007059205090627074
418 0.000686315936036408
419 0.0006676242337562144
420 0.000648807268589735
421 0.0006311272736638784
422 0.0006135433213785291
423 0.0005963529692962766
424 0.000581452448386699
425 0.0005663521587848663
426 0.0005519157275557518
427 0.000535777595359832
428 0.0005225377972237766
429 0.0005094644729979336
430 0.0004967546556144953
431 0.00048422307008877397
432 0.0004717700358014554
433 0.00045949112973175943
434 0.0004485345561988652
435 0.00043834693