#### PyTorch provides two main features:
- An n-dimensional Tensor, similar to numpy but can run on GPUs
- Automatic differentiation for building and training neural networks

# Tensors
## Warm-up: numpy

Use numpy to fit a two-layer network to random data by manually implementing the forward and backward passes through the network using numpy operations:

In [4]:
# -*- coding: utf-8 -*-
import numpy as np

# Problem dimensions:
#   N     -- number of samples in the batch
#   D_in  -- size of each input vector
#   H     -- width of the hidden layer
#   D_out -- size of each output vector
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Random starting weights for the two layers.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6

for t in range(500):
    # Forward pass: linear layer -> ReLU -> linear layer.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum-of-squared-errors loss, reported every step.
    loss = np.sum(np.square(y_pred - y))
    print(t, loss)

    # Backward pass, written out by hand with the chain rule.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # No gradient flows through units whose pre-activation was negative.
    grad_h = np.where(h < 0, 0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Plain gradient-descent step, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 33023360.8578
1 26808786.9957
2 23505154.6783
3 19947747.9246
4 15569382.8697
5 11063084.5197
6 7331815.00211
7 4714426.09576
8 3063719.34836
9 2070963.66684
10 1475009.28155
11 1106461.34594
12 866862.5103
13 701889.254985
14 582133.445905
15 490885.414448
16 418712.86513
17 360205.277745
18 311931.888013
19 271561.522851
20 237441.344578
21 208379.370798
22 183485.466467
23 162059.485086
24 143550.134583
25 127476.066594
26 113477.538087
27 101264.300262
28 90575.3319654
29 81171.6671278
30 72868.6847571
31 65523.0484417
32 59008.7011413
33 53220.420918
34 48067.1045168
35 43469.8471608
36 39360.4341161
37 35682.6644953
38 32388.9691572
39 29439.7135233
40 26786.6273342
41 24397.2776248
42 22242.7435219
43 20298.6723436
44 18540.0016005
45 16947.8959854
46 15504.8563601
47 14195.9941686
48 13007.6573301
49 11927.6235664
50 10944.9841352
51 10050.5404042
52 9235.42869162
53 8491.74049456
54 7812.58771421
55 7192.25124815
56 6625.14390366
57 6106.33143056
58 5631.59968392
59 5196.480

453 3.36767662383e-06
454 3.20636941503e-06
455 3.05281967612e-06
456 2.90661987661e-06
457 2.76742628605e-06
458 2.63491287813e-06
459 2.50876527113e-06
460 2.38866688885e-06
461 2.27431708904e-06
462 2.16546976071e-06
463 2.06181892281e-06
464 1.96314265964e-06
465 1.869194371e-06
466 1.77974587053e-06
467 1.69458685885e-06
468 1.61350392041e-06
469 1.5363185679e-06
470 1.46282672893e-06
471 1.39284884333e-06
472 1.32622531612e-06
473 1.26279284618e-06
474 1.20240199391e-06
475 1.14489925762e-06
476 1.09016184939e-06
477 1.03803942719e-06
478 9.88410066061e-07
479 9.41157469923e-07
480 8.9616408597e-07
481 8.53325865314e-07
482 8.12537009214e-07
483 7.73703845209e-07
484 7.36729014907e-07
485 7.01521557066e-07
486 6.67999357405e-07
487 6.36081620324e-07
488 6.05692438179e-07
489 5.7675586362e-07
490 5.49249094307e-07
491 5.23016247782e-07
492 4.98034414141e-07
493 4.7424953673e-07
494 4.51601393067e-07
495 4.30034942141e-07
496 4.0949884768e-07
497 3.89945690008e-07
498 3.71328311209

## PyTorch: Tensors
* supports GPU, could be 50x faster than CPU
* fundamental: Tensor == n-d array, can use GPU

Use PyTorch Tensors to fit a two-layer network to random data. Like the numpy example above we need to manually implement the forward and backward passes through the network:

In [5]:
# -*- coding: utf-8 -*-

import torch


dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU

# Problem dimensions: batch size, input size, hidden width, output size.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in).type(dtype)
y = torch.randn(N, D_out).type(dtype)

# Random starting weights for the two layers.
w1 = torch.randn(D_in, H).type(dtype)
w2 = torch.randn(H, D_out).type(dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear layer -> ReLU -> linear layer.
    # torch.mm is the 2-d matrix product (numpy would use .dot here).
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squared-errors loss, reported every step.
    loss = torch.sum((y_pred - y) ** 2)
    print(t, loss)

    # Backward pass, written out by hand with the chain rule.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # Out-of-place masked fill: a fresh tensor equal to grad_h_relu except
    # that entries whose pre-activation was negative are set to zero.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Plain gradient-descent step, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 27529051.3991107
1 23201723.75478793
2 24210397.279147554
3 27057892.170833558
4 28366565.99698246
5 25482777.401514173
6 18667952.979622602
7 11325710.570364475
8 6101191.186830822
9 3245389.2671052366
10 1858580.7114679264
11 1198181.513323991
12 864074.6056752391
13 675672.3199238554
14 555117.1736837835
15 469200.18844648823
16 403110.1808186706
17 349807.31771053723
18 305604.9620912669
19 268355.9020139451
20 236625.32458330295
21 209415.3845059016
22 185952.54319962068
23 165605.64966819793
24 147882.55546068278
25 132422.55853032065
26 118861.70962858235
27 106922.55684585136
28 96380.89647943489
29 87056.443295467
30 78788.9501763439
31 71441.75890196319
32 64887.89451515148
33 59043.36793374957
34 53808.338288482366
35 49109.36654624378
36 44886.16672094271
37 41079.28133509832
38 37644.62632847726
39 34541.248849467884
40 31729.501324434095
41 29178.44631307908
42 26860.94317858701
43 24753.47081968088
44 22832.943097959524
45 21081.039035419883
46 19481.29564991752
47 180

410 0.0005261271929042355
411 0.0005115855324839824
412 0.000497028643066777
413 0.0004828187578233556
414 0.0004690604534833831
415 0.0004564346731112945
416 0.00044365334909701737
417 0.0004319903280735615
418 0.0004204847128789213
419 0.00040880866763849666
420 0.00039755275419256497
421 0.00038718930941566976
422 0.0003769911332704434
423 0.00036712978121089535
424 0.00035748333897084317
425 0.00034804254603937324
426 0.0003396197216798402
427 0.00033048227328408286
428 0.00032237200489965145
429 0.00031492317418896354
430 0.00030673227495492394
431 0.00029921157159928935
432 0.00029181397951749677
433 0.0002844011915252853
434 0.00027827867139113005
435 0.00027070247992738716
436 0.00026474483880295496
437 0.00025881211591288755
438 0.0002534697000579067
439 0.00024726942195824053
440 0.0002405008926058133
441 0.0002355990152190235
442 0.00023049445682504777
443 0.00022487609710236667
444 0.0002193536520789796
445 0.000214219390782866
446 0.00020965853002574697
447 0.0002046008898

# Autograd
## PyTorch: Variables and autograd
* Even though we can implement the backward pass manually, doing so gets very complex when the network is large/deep.
* Automatic differentiation can automate the computation of backprop in neural nets. In PyTorch this functionality is provided by the **autograd** package.
* The forward pass builds the computational graph; the nodes in the graph are Tensors wrapped by Variable objects.
* If **x** is a Variable then **x.data** is a Tensor, and **x.grad** is another Variable holding the gradient with respect to **x** (e.g. the gradient of the loss with respect to **x**).

Use PyTorch Variables and autograd to implement our two-layer network; now we no longer need to manually implement the backward pass through the network:

In [6]:
# -*- coding: utf-8 -*-

import torch
from torch.autograd import Variable

dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs, and wrap them in Variables.
# Setting requires_grad=False indicates that we do not need to compute gradients
# with respect to these Variables during the backward pass.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# Create random Tensors for weights, and wrap them in Variables.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Variables during the backward pass.
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations on Variables; these
    # are exactly the same operations we used to compute the forward pass using
    # Tensors, but we do not need to keep references to intermediate values since
    # we are not implementing the backward pass by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute and print loss using operations on Variables.
    # The summed loss holds a single value. On current PyTorch releases it is
    # zero-dimensional, so indexing it with [0] raises an error; loss.item()
    # extracts the value as a plain Python number instead.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Variables with requires_grad=True.
    # After this call w1.grad and w2.grad will hold the gradient of the loss
    # with respect to w1 and w2 respectively.
    loss.backward()

    # Update weights using gradient descent. Going through .data modifies the
    # underlying Tensors directly, so the update itself is not recorded in the
    # autograd graph.
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data

    # Manually zero the gradients after updating weights; backward()
    # accumulates into .grad rather than overwriting it.
    w1.grad.data.zero_()
    w2.grad.data.zero_()

0 29073602.0
1 28479960.0
2 32340818.0
3 35698616.0
4 33308496.0
5 24395394.0
6 13858985.0
7 6776146.0
8 3308680.75
9 1844814.0
10 1215062.25
11 908901.625
12 731039.125
13 609870.625
14 518197.96875
15 444915.15625
16 384555.71875
17 334144.40625
18 291600.46875
19 255471.21875
20 224566.90625
21 198055.5
22 175194.875
23 155388.21875
24 138171.0
25 123180.796875
26 110067.3828125
27 98556.421875
28 88433.9921875
29 79503.203125
30 71593.203125
31 64576.1875
32 58341.80078125
33 52785.71484375
34 47824.6953125
35 43388.90234375
36 39415.53125
37 35848.140625
38 32642.33984375
39 29756.935546875
40 27154.67578125
41 24804.826171875
42 22682.08203125
43 20761.10546875
44 19020.091796875
45 17440.87890625
46 16005.8896484375
47 14700.423828125
48 13511.8037109375
49 12429.16015625
50 11441.251953125
51 10539.021484375
52 9715.7119140625
53 8962.6669921875
54 8272.8349609375
55 7640.833984375
56 7061.32470703125
57 6529.38330078125
58 6040.75830078125
59 5591.830078125
60 5179.10595703125

390 0.0002664442581590265
391 0.0002592254604678601
392 0.0002527102187741548
393 0.00024539994774386287
394 0.0002394897019257769
395 0.00023387007240671664
396 0.00022790637740399688
397 0.00022181219537742436
398 0.0002164457691833377
399 0.00021168627426959574
400 0.00020638499699998647
401 0.00020172873337287456
402 0.0001975027989828959
403 0.00019231418264098465
404 0.00018872026703320444
405 0.00018360430840402842
406 0.00017973121430259198
407 0.00017564870358910412
408 0.00017150858184322715
409 0.00016789126675575972
410 0.000163983553647995
411 0.00016047662938944995
412 0.00015641209029126912
413 0.00015298151993192732
414 0.00014982582069933414
415 0.00014633579121436924
416 0.00014346484385896474
417 0.0001411061966791749
418 0.00013779971050098538
419 0.00013483234215527773
420 0.00013192022743169218
421 0.0001293190725846216
422 0.0001270240027224645
423 0.00012475177936721593
424 0.00012244522804394364
425 0.00011968660692218691
426 0.00011752227874239907
427 0.000115

## PyTorch: Defining new autograd functions
* In PyTorch we can easily define our own autograd operator by defining a subclass of **torch.autograd.Function** and implementing the **forward** and **backward** functions. We can then use our new autograd operator by calling it like a function (in current PyTorch releases, through the class's **apply** method), passing Variables containing input data.

custom ReLU nonlinearity:

In [7]:
# -*- coding: utf-8 -*-
import torch
from torch.autograd import Variable

class MyReLU(torch.autograd.Function):
    """
    Custom autograd Function implementing the ReLU nonlinearity.

    We can implement our own custom autograd Functions by subclassing
    torch.autograd.Function and implementing the forward and backward passes
    which operate on Tensors.

    Current PyTorch releases require forward and backward to be static
    methods that receive a context object, and the Function to be invoked
    through ``MyReLU.apply(...)``; calling an instance of the class raises a
    RuntimeError there.
    """

    @staticmethod
    def forward(ctx, input):
        """
        In the forward pass we receive a Tensor containing the input and return a
        Tensor containing the output. ctx is a context object on which arbitrary
        Tensors can be cached for the backward pass via ctx.save_for_backward.
        """
        ctx.save_for_backward(input)
        return input.clamp(min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        In the backward pass we receive a Tensor containing the gradient of the loss
        with respect to the output, and we need to compute the gradient of the loss
        with respect to the input.
        """
        input, = ctx.saved_tensors
        # Work on a copy so the incoming gradient tensor is not modified.
        grad_input = grad_output.clone()
        # No gradient flows where the forward input was negative.
        grad_input[input < 0] = 0
        return grad_input


dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# Create random Tensors for weights, and wrap them in Variables.
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

# Function.apply is the entry point for a custom autograd Function; it does
# not change between iterations, so bind it once outside the loop.
relu = MyReLU.apply

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations on Variables; we compute
    # ReLU using our custom autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print loss. loss.item() extracts the single loss value as a
    # Python number; indexing the loss with [0] fails on current PyTorch.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass.
    loss.backward()

    # Update weights using gradient descent
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data

    # Manually zero the gradients after updating weights
    w1.grad.data.zero_()
    w2.grad.data.zero_()

0 49918252.0
1 57483868.0
2 64731772.0
3 53089312.0
4 27205142.0
5 9592026.0
6 3673673.25
7 2109813.5
8 1568958.625
9 1274429.125
10 1065063.5
11 902542.5
12 772068.375
13 665533.375
14 577559.4375
15 504265.03125
16 442619.625
17 390510.46875
18 346062.78125
19 307957.09375
20 275063.65625
21 246500.640625
22 221588.78125
23 199788.390625
24 180623.6875
25 163653.84375
26 148620.34375
27 135259.625
28 123341.53125
29 112682.4140625
30 103123.046875
31 94528.640625
32 86787.484375
33 79792.390625
34 73462.3828125
35 67721.0546875
36 62500.28515625
37 57747.43359375
38 53418.2265625
39 49476.6328125
40 45871.328125
41 42572.62109375
42 39543.42578125
43 36765.8515625
44 34215.14453125
45 31866.732421875
46 29701.310546875
47 27704.478515625
48 25860.650390625
49 24157.2890625
50 22582.705078125
51 21125.7734375
52 19776.5390625
53 18528.779296875
54 17369.32421875
55 16291.7578125
56 15288.62109375
57 14354.5673828125
58 13484.0302734375
59 12672.115234375
60 11914.7626953125
61 11207.5

451 0.0008389236172661185
452 0.0008159077842719853
453 0.000795031781308353
454 0.0007752072415314615
455 0.0007537520723417401
456 0.0007347926148213446
457 0.0007154716295190156
458 0.0006978800520300865
459 0.0006803463329561055
460 0.0006626990507356822
461 0.0006466035847552121
462 0.0006304223788902164
463 0.0006139189354144037
464 0.0005993674858473241
465 0.0005849653971381485
466 0.0005703171482309699
467 0.0005561173893511295
468 0.0005433133337646723
469 0.0005302052013576031
470 0.0005182893364690244
471 0.0005064245196990669
472 0.0004942824598401785
473 0.000482528266729787
474 0.00047152943443506956
475 0.000460772163933143
476 0.00045044979196973145
477 0.00044065582915209234
478 0.00043110986007377505
479 0.0004208179307170212
480 0.0004123118706047535
481 0.00040353808435611427
482 0.0003940205497201532
483 0.0003854474052786827
484 0.0003772824420593679
485 0.0003701024397742003
486 0.0003621745272539556
487 0.0003539932076819241
488 0.00034652664908207953
489 0.000

## TensorFlow: Static Graphs
* Biggest difference: TensorFlow’s computational graphs are **static** and PyTorch uses **dynamic computational graphs**
* TensorFlow defines the computational graph once and then executes the same graph over and over again, so the graph can be optimized up front.
* In PyTorch each forward pass defines a new computational graph, so imperative flow control can be used to perform a different computation for each input.

Example: use TensorFlow to fit a simple two-layer net:

In [10]:
# -*- coding: utf-8 -*-
import tensorflow as tf
import numpy as np

# Fit a two-layer network to random data with a TensorFlow static graph.
# NOTE(review): tf.placeholder / tf.Session / tf.random_normal look like the
# TensorFlow 1.x graph-mode API -- confirm the installed version before running.

# First we set up the computational graph:

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create placeholders for the input and target data; these will be filled
# with real data when we execute the graph. The leading None leaves the
# first (batch) dimension unspecified.
x = tf.placeholder(tf.float32, shape=(None, D_in))
y = tf.placeholder(tf.float32, shape=(None, D_out))

# Create Variables for the weights and initialize them with random data.
# A TensorFlow Variable persists its value across executions of the graph.
w1 = tf.Variable(tf.random_normal((D_in, H)))
w2 = tf.Variable(tf.random_normal((H, D_out)))

# Forward pass: Compute the predicted y using operations on TensorFlow Tensors.
# Note that this code does not actually perform any numeric operations; it
# merely sets up the computational graph that we will later execute.
h = tf.matmul(x, w1)
# Elementwise maximum against zero, i.e. the ReLU nonlinearity.
h_relu = tf.maximum(h, tf.zeros(1))
y_pred = tf.matmul(h_relu, w2)

# Compute loss (sum of squared errors) using operations on TensorFlow Tensors
loss = tf.reduce_sum((y - y_pred) ** 2.0)

# Compute gradient of the loss with respect to w1 and w2.
grad_w1, grad_w2 = tf.gradients(loss, [w1, w2])

# Update the weights using gradient descent. To actually update the weights
# we need to evaluate new_w1 and new_w2 when executing the graph. Note that
# in TensorFlow the act of updating the value of the weights is part of
# the computational graph; in PyTorch this happens outside the computational
# graph.
learning_rate = 1e-6
new_w1 = w1.assign(w1 - learning_rate * grad_w1)
new_w2 = w2.assign(w2 - learning_rate * grad_w2)

# Now we have built our computational graph, so we enter a TensorFlow session to
# actually execute the graph.
with tf.Session() as sess:
    # Run the graph once to initialize the Variables w1 and w2.
    sess.run(tf.global_variables_initializer())

    # Create numpy arrays holding the actual data for the inputs x and targets
    # y
    x_value = np.random.randn(N, D_in)
    y_value = np.random.randn(N, D_out)
    for t in range(500):
        # Execute the graph many times. Each time it executes we want to bind
        # x_value to x and y_value to y, specified with the feed_dict argument.
        # Each time we execute the graph we want to compute the values for loss,
        # new_w1, and new_w2; the values of these Tensors are returned as numpy
        # arrays. Fetching new_w1 and new_w2 is what triggers the weight
        # update; their returned values are discarded here.
        loss_value, _, _ = sess.run([loss, new_w1, new_w2],
                                    feed_dict={x: x_value, y: y_value})
        print(t, loss_value)

0 2.66181e+07
1 2.24606e+07
2 2.42139e+07
3 2.85657e+07
4 3.1959e+07
5 3.02183e+07
6 2.27189e+07
7 1.34061e+07
8 6.7413e+06
9 3.2576e+06
10 1.71717e+06
11 1.05055e+06
12 741431.0
13 576547.0
14 473700.0
15 400702.0
16 344398.0
17 298799.0
18 260929.0
19 228967.0
20 201733.0
21 178369.0
22 158201.0
23 140748.0
24 125616.0
25 112395.0
26 100805.0
27 90618.6
28 81633.0
29 73694.8
30 66659.7
31 60405.1
32 54830.1
33 49852.7
34 45394.2
35 41394.9
36 37799.6
37 34563.9
38 31645.4
39 29009.1
40 26622.2
41 24459.5
42 22495.9
43 20710.9
44 19086.0
45 17605.4
46 16253.7
47 15019.0
48 13889.4
49 12855.4
50 11907.3
51 11037.0
52 10237.5
53 9502.67
54 8826.27
55 8203.08
56 7628.58
57 7098.66
58 6609.3
59 6157.03
60 5738.78
61 5351.61
62 4992.97
63 4660.68
64 4352.78
65 4067.16
66 3801.93
67 3555.42
68 3326.27
69 3113.13
70 2914.76
71 2730.02
72 2557.94
73 2397.49
74 2247.86
75 2108.28
76 1978.0
77 1856.36
78 1742.7
79 1636.48
80 1537.23
81 1444.38
82 1357.48
83 1276.16
84 1200.01
85 1128.68
86 1061

# nn module
## PyTorch: nn package
* defines a set of Modules, roughly equivalent to neural network layers; a Module receives input Variables and computes output Variables, and can contain learnable parameters
* defines a set of useful loss functions

Use the nn package to implement our two-layer network:

In [12]:
# -*- coding: utf-8 -*-
import torch
from torch.autograd import Variable

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. Each Linear Module computes output from input using a
# linear function, and holds internal Variables for its weight and bias.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# The nn package also contains definitions of popular loss functions; in this
# case we will use Mean Squared Error (MSE) as our loss function.
# reduction='sum' adds up the squared errors instead of averaging them; it is
# the current spelling of the deprecated size_average=False argument.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model. Module objects
    # override the __call__ operator so you can call them like functions. When
    # doing so you pass a Variable of input data to the Module and it produces
    # a Variable of output data.
    y_pred = model(x)

    # Compute and print loss. We pass Variables containing the predicted and true
    # values of y, and the loss function returns a Variable containing the
    # loss. loss.item() extracts the single loss value as a Python number;
    # indexing the loss with [0] fails on current PyTorch releases.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Zero the gradients before running the backward pass.
    model.zero_grad()

    # Backward pass: compute gradient of the loss with respect to all the learnable
    # parameters of the model. Internally, the parameters of each Module are stored
    # in Variables with requires_grad=True, so this call will compute gradients for
    # all learnable parameters in the model.
    loss.backward()

    # Update the weights using gradient descent. Each parameter is a Variable, so
    # we can access its data and gradients like we did before.
    for param in model.parameters():
        param.data -= learning_rate * param.grad.data

0 673.5753173828125
1 623.371337890625
2 580.822509765625
3 543.7760620117188
4 510.8729553222656
5 481.2712097167969
6 454.4088134765625
7 429.8927001953125
8 407.43304443359375
9 386.560791015625
10 367.0504150390625
11 348.75042724609375
12 331.3560791015625
13 314.6939697265625
14 298.7685852050781
15 283.49249267578125
16 268.8531188964844
17 254.77113342285156
18 241.2061309814453
19 228.2128143310547
20 215.77352905273438
21 203.88978576660156
22 192.4931640625
23 181.56732177734375
24 171.12686157226562
25 161.1765594482422
26 151.67501831054688
27 142.62509155273438
28 134.0227813720703
29 125.84313201904297
30 118.08403015136719
31 110.74441528320312
32 103.8026123046875
33 97.25028228759766
34 91.05146789550781
35 85.20780944824219
36 79.7103500366211
37 74.5466079711914
38 69.70275115966797
39 65.15086364746094
40 60.87061309814453
41 56.86510467529297
42 53.10206985473633
43 49.578922271728516
44 46.28231430053711
45 43.201210021972656
46 40.33061218261719
47 37.6467819213

393 0.00010492549336049706
394 0.00010195337381446734
395 9.906492778100073e-05
396 9.626061364542693e-05
397 9.353803034173325e-05
398 9.089201194001362e-05
399 8.831846207613125e-05
400 8.582609007135034e-05
401 8.339720807271078e-05
402 8.104298467515036e-05
403 7.875354640418664e-05
404 7.65293079894036e-05
405 7.437126623699442e-05
406 7.227464084280655e-05
407 7.023582293186337e-05
408 6.825465970905498e-05
409 6.632896838709712e-05
410 6.445969484047964e-05
411 6.264368130359799e-05
412 6.0879661759827286e-05
413 5.916395457461476e-05
414 5.7497090892866254e-05
415 5.588157364400104e-05
416 5.430770761449821e-05
417 5.2779931138502434e-05
418 5.1297050958964974e-05
419 4.985547639080323e-05
420 4.8454025090904906e-05
421 4.70926497655455e-05
422 4.576976425596513e-05
423 4.4484731915872544e-05
424 4.3235770135652274e-05
425 4.202170384814963e-05
426 4.084243846591562e-05
427 3.9696446037851274e-05
428 3.858202035189606e-05
429 3.750108226086013e-05
430 3.644903335953131e-05
431 

## PyTorch: optim
* provides more sophisticated optimizers (methods of updating parameters) than plain SGD, such as AdaGrad, RMSProp, Adam, etc.
* abstracts the idea of an optimization algorithm and provides implementations of commonly used optimization algorithms

Use the nn package to define our model as before, but we will optimize the model using the Adam algorithm provided by the optim package:

In [13]:
# -*- coding: utf-8 -*-
import torch
from torch.autograd import Variable

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

# Use the nn package to define our model and loss function.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
# reduction='sum' adds up the squared errors instead of averaging them; it is
# the current spelling of the deprecated size_average=False argument.
loss_fn = torch.nn.MSELoss(reduction='sum')

# Use the optim package to define an Optimizer that will update the weights of
# the model for us. Here we will use Adam; the optim package contains many other
# optimization algorithms. The first argument to the Adam constructor tells the
# optimizer which Variables it should update.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model(x)

    # Compute and print loss. loss.item() extracts the single loss value as a
    # Python number; indexing the loss with [0] fails on current PyTorch
    # releases.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the variables it will update (which are the learnable weights
    # of the model)
    optimizer.zero_grad()

    # Backward pass: compute gradient of the loss with respect to model
    # parameters
    loss.backward()

    # Calling the step function on an Optimizer makes an update to its
    # parameters
    optimizer.step()

0 587.8131713867188
1 572.5282592773438
2 557.6829833984375
3 543.2288818359375
4 529.1452026367188
5 515.46728515625
6 502.19384765625
7 489.3094482421875
8 476.8896789550781
9 464.7651062011719
10 452.905517578125
11 441.4320373535156
12 430.325927734375
13 419.6915588378906
14 409.37701416015625
15 399.3348693847656
16 389.5597229003906
17 380.0503234863281
18 370.76910400390625
19 361.7469787597656
20 352.94891357421875
21 344.3811950683594
22 336.01544189453125
23 327.8770751953125
24 319.94146728515625
25 312.2323913574219
26 304.7126159667969
27 297.4097595214844
28 290.2928161621094
29 283.36083984375
30 276.60772705078125
31 270.0397644042969
32 263.6568298339844
33 257.43133544921875
34 251.37673950195312
35 245.45806884765625
36 239.6923065185547
37 234.0599365234375
38 228.53729248046875
39 223.1271514892578
40 217.83273315429688
41 212.66546630859375
42 207.63412475585938
43 202.7109375
44 197.90380859375
45 193.20632934570312
46 188.63417053222656
47 184.1605987548828
48 

361 0.000147759317769669
362 0.00013823043263982981
363 0.00012929200602229685
364 0.00012091075768694282
365 0.00011304629151709378
366 0.00010567568097030744
367 9.876848343992606e-05
368 9.229083661921322e-05
369 8.622728637419641e-05
370 8.054196950979531e-05
371 7.521802035626024e-05
372 7.023430953267962e-05
373 6.556555308634415e-05
374 6.119487807154655e-05
375 5.7107095926767215e-05
376 5.327934559318237e-05
377 4.9700047384249046e-05
378 4.635205550584942e-05
379 4.321999222156592e-05
380 4.0291772165801376e-05
381 3.755453144549392e-05
382 3.499812009977177e-05
383 3.260761877754703e-05
384 3.0374380003195256e-05
385 2.8287597160669975e-05
386 2.634107841004152e-05
387 2.4520890292478725e-05
388 2.282353125337977e-05
389 2.1240393834887072e-05
390 1.9761881048907526e-05
391 1.8383532733423635e-05
392 1.7097341697080992e-05
393 1.5896832337602973e-05
394 1.4778247532376554e-05
395 1.3736801520281006e-05
396 1.2764936400344595e-05
397 1.1859971891681198e-05
398 1.1015698873961

## PyTorch: Custom nn Modules
* allows us to define our own Modules by subclassing **nn.Module** and defining a **forward** method which receives input Variables and produces output Variables using other Modules or other autograd operations on Variables

Implement our two-layer network as a custom Module subclass:

In [14]:
# -*- coding: utf-8 -*-
import torch
from torch.autograd import Variable


class TwoLayerNet(torch.nn.Module):
    """Two affine layers with a ReLU in between: Linear -> ReLU -> Linear."""

    def __init__(self, D_in, H, D_out):
        """
        Create the two nn.Linear layers and store them as attributes.

        D_in is the input feature size, H the hidden size and D_out the
        output feature size.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Run the input batch x through both layers and return the prediction.

        Any autograd-aware operation may be used here in addition to the
        layers created in the constructor.
        """
        hidden = self.linear1(x)
        # Clamping from below at zero is the ReLU non-linearity.
        activated = hidden.clamp(min=0)
        return self.linear2(activated)


# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs. Since PyTorch 0.4 Tensors
# take part in autograd directly, so the deprecated Variable wrapper is not
# needed; requires_grad defaults to False for both tensors.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Construct our model by instantiating the class defined above
model = TwoLayerNet(D_in, H, D_out)

# Construct our loss function and an Optimizer. The call to model.parameters()
# in the SGD constructor will contain the learnable parameters of the two
# nn.Linear modules which are members of the model.
# reduction='sum' replaces the deprecated size_average=False: the squared
# errors are summed rather than averaged.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
for t in range(500):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss. The loss is a 0-dim tensor, so it cannot be
    # indexed with [0]; .item() extracts its value as a Python float.
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 664.4738159179688
1 616.2555541992188
2 574.6283569335938
3 537.9423217773438
4 505.2079162597656
5 475.970703125
6 449.4344482421875
7 425.1484069824219
8 402.5427551269531
9 381.5075378417969
10 361.8284606933594
11 343.3133239746094
12 325.8405456542969
13 309.2971496582031
14 293.62939453125
15 278.6930236816406
16 264.4184265136719
17 250.83546447753906
18 237.83282470703125
19 225.36656188964844
20 213.44911193847656
21 202.03787231445312
22 191.126708984375
23 180.70193481445312
24 170.75584411621094
25 161.24832153320312
26 152.18408203125
27 143.5303192138672
28 135.2935791015625
29 127.44318389892578
30 119.999755859375
31 112.9679946899414
32 106.31851959228516
33 100.02416229248047
34 94.06138610839844
35 88.4030990600586
36 83.07069396972656
37 78.0440673828125
38 73.30974578857422
39 68.8549575805664
40 64.66197967529297
41 60.717498779296875
42 57.008697509765625
43 53.51830291748047
44 50.24296951293945
45 47.16770935058594
46 44.28645706176758
47 41.57908248901367
48

386 0.00011839065700769424
387 0.00011515800724737346
388 0.00011201603774679825
389 0.00010896213643718511
390 0.00010599340021144599
391 0.00010310734069207683
392 0.00010030072007793933
393 9.757321822689846e-05
394 9.49216409935616e-05
395 9.234408935299143e-05
396 8.984209125628695e-05
397 8.740436169318855e-05
398 8.503578283125535e-05
399 8.273185812868178e-05
400 8.049386815400794e-05
401 7.83176947152242e-05
402 7.620024553034455e-05
403 7.414363790303469e-05
404 7.21409305697307e-05
405 7.019642362138256e-05
406 6.830548954894766e-05
407 6.646430119872093e-05
408 6.46771295578219e-05
409 6.293590558925644e-05
410 6.124396168161184e-05
411 5.9599351516226307e-05
412 5.799803329864517e-05
413 5.644171324092895e-05
414 5.492764103109948e-05
415 5.345481986296363e-05
416 5.202262764214538e-05
417 5.062988930149004e-05
418 4.9275444325758144e-05
419 4.7957619244698435e-05
420 4.667797838919796e-05
421 4.5431199396261945e-05
422 4.421942139742896e-05
423 4.3039352021878585e-05
424 

## PyTorch: Control Flow + Weight Sharing
* As an example of dynamic graphs and weight sharing, we implement a very strange model: a fully-connected ReLU network that on each forward pass chooses a random number between 1 and 4 and uses that many hidden layers, reusing the same weights multiple times to compute the innermost hidden layers.
* For this model we can use normal Python flow control to implement the loop, and we can implement weight sharing among the innermost layers by simply reusing the same Module multiple times when defining the forward pass.

We can easily implement this model as a Module subclass:

In [15]:
# -*- coding: utf-8 -*-
import random
import torch
from torch.autograd import Variable


class DynamicNet(torch.nn.Module):
    """ReLU network whose depth is re-drawn at random on every forward pass."""

    def __init__(self, D_in, H, D_out):
        """
        Create the three nn.Linear layers used by forward: an input layer
        (D_in -> H), a reusable middle layer (H -> H) and an output layer
        (H -> D_out).
        """
        super().__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Compute the prediction for x using a randomly chosen number of
        hidden layers.

        After the input layer, middle_linear is applied 0, 1, 2 or 3 times
        (drawn uniformly with random.randint), each application followed by
        a ReLU. Every application reuses the same module, so those layers
        share their weights.

        The computation graph is built afresh on each call, which is why
        ordinary Python loops and conditionals can shape the network here.
        """
        hidden = self.input_linear(x).clamp(min=0)
        # One draw per call decides how many times the shared layer runs.
        extra_layers = random.randint(0, 3)
        for _ in range(extra_layers):
            pre_activation = self.middle_linear(hidden)
            hidden = pre_activation.clamp(min=0)
        return self.output_linear(hidden)


# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs. Since PyTorch 0.4 Tensors
# take part in autograd directly, so the deprecated Variable wrapper is not
# needed; requires_grad defaults to False for both tensors.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Construct our model by instantiating the class defined above
model = DynamicNet(D_in, H, D_out)

# Construct our loss function and an Optimizer. Training this strange model with
# vanilla stochastic gradient descent is tough, so we use momentum.
# reduction='sum' replaces the deprecated size_average=False: the squared
# errors are summed rather than averaged.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
for t in range(500):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss. The loss is a 0-dim tensor, so it cannot be
    # indexed with [0]; .item() extracts its value as a Python float.
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 698.8958740234375
1 638.7724609375
2 638.3764038085938
3 638.4556884765625
4 636.538818359375
5 637.0535278320312
6 629.5477294921875
7 616.265625
8 627.1588745117188
9 624.3635864257812
10 454.09234619140625
11 617.3840942382812
12 399.64398193359375
13 630.551025390625
14 540.14111328125
15 628.05322265625
16 626.0320434570312
17 251.7554473876953
18 221.6058807373047
19 479.45538330078125
20 615.026123046875
21 439.69793701171875
22 603.9712524414062
23 595.0504150390625
24 100.81912231445312
25 92.21809387207031
26 495.6492004394531
27 300.2763977050781
28 523.4839477539062
29 418.6372985839844
30 381.87615966796875
31 342.1514892578125
32 305.9565124511719
33 218.6727752685547
34 264.4963073730469
35 212.4693145751953
36 188.542236328125
37 195.5040740966797
38 287.46063232421875
39 129.8793182373047
40 123.5234603881836
41 253.15399169921875
42 153.32861328125
43 240.34671020507812
44 232.5124053955078
45 217.864990234375
46 102.38078308105469
47 74.54974365234375
48 245.307464

422 0.2238084226846695
423 0.7137449383735657
424 0.5866928100585938
425 0.1912330985069275
426 0.5782556533813477
427 0.15091459453105927
428 0.3905407786369324
429 0.4135490655899048
430 0.4005129635334015
431 0.29384300112724304
432 0.26517152786254883
433 0.30361688137054443
434 0.1766175478696823
435 0.5910953283309937
436 0.2801344692707062
437 0.233175590634346
438 0.30570632219314575
439 0.3494766652584076
440 1.1939435005187988
441 0.23447565734386444
442 1.0021846294403076
443 0.32167384028434753
444 0.30373722314834595
445 0.19673146307468414
446 0.217478409409523
447 0.20449163019657135
448 0.14779803156852722
449 0.7914077639579773
450 0.19958871603012085
451 0.27250996232032776
452 0.1675376445055008
453 0.23824439942836761
454 0.14941273629665375
455 0.0910712257027626
456 0.4829808473587036
457 0.25371673703193665
458 0.42504170536994934
459 0.2253745198249817
460 0.7587385177612305
461 0.7250651717185974
462 0.07752228528261185
463 0.14384187757968903
464 0.40638136863