Tutorial URL: https://pytorch.org/tutorials/beginner/pytorch_with_examples.html#examples-download

# Learning PyTorch with Examples 

In [28]:
# Preamble 

import numpy as np
import torch

## 1. Tensors

### 1.1 Warm up: numpy

In [27]:
# Warm-up exercise: a small network with its forward pass, backward pass and
# weight update all written by hand in plain numpy.
#
# The running example is a fully-connected ReLU network with a single hidden
# layer, trained with gradient descent to fit random data by minimizing the
# squared Euclidean distance between the network output and the true output.

# N: number of samples, D_in: input size, H: hidden size, D_out: output size
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random targets
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialized weights of the two layers
W1 = np.random.randn(D_in, H)
W2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for iteration in range(500):
    # Forward pass (the forward pass ends at the loss)
    z = np.matmul(x, W1)
    h = np.maximum(z, 0)  # ReLU(z)
    y_pred = np.matmul(h, W2)

    loss = np.square(y - y_pred).sum()
    print("iteration {0}, loss {1}".format(iteration, loss))

    # Backward pass: apply the chain rule by hand, from the loss back to W1
    dy_pred = 2 * (y_pred - y)
    dW2 = np.matmul(h.T, dy_pred)
    dh = np.matmul(dy_pred, W2.T)
    dz = np.copy(dh)
    # ReLU derivative: the gradient only flows through units that were active.
    # h = max(z, 0), so h <= 0 selects exactly the entries where z <= 0.
    dz[h <= 0] = 0
    dW1 = np.matmul(x.T, dz)

    # Update weights (plain gradient descent step)
    W2 -= learning_rate * dW2
    W1 -= learning_rate * dW1
    

iteration 0, loss 31417111.294516183
iteration 1, loss 24247502.40470317
iteration 2, loss 20005988.921914797
iteration 3, loss 16117189.756072668
iteration 4, loss 12222566.806160655
iteration 5, loss 8681132.341945734
iteration 6, loss 5921160.999809977
iteration 7, loss 3985456.805415063
iteration 8, loss 2726648.7923619077
iteration 9, loss 1930456.40365782
iteration 10, loss 1426214.1303353019
iteration 11, loss 1097778.2426994792
iteration 12, loss 875346.4834761431
iteration 13, loss 717348.0116823423
iteration 14, loss 600032.4888708794
iteration 15, loss 509547.3710711617
iteration 16, loss 437700.77640174585
iteration 17, loss 379195.22779171984
iteration 18, loss 330728.7218008532
iteration 19, loss 290039.7601245702
iteration 20, loss 255541.8900700019
iteration 21, loss 226044.82380652192
iteration 22, loss 200671.51937150437
iteration 23, loss 178722.92196428566
iteration 24, loss 159637.15994853375
iteration 25, loss 142965.61337117775
iteration 26, loss 128348.635157358

iteration 352, loss 0.003868875353187718
iteration 353, loss 0.0036995710574081997
iteration 354, loss 0.0035376384170539244
iteration 355, loss 0.0033829389284771335
iteration 356, loss 0.003235042829697705
iteration 357, loss 0.0030936065876860396
iteration 358, loss 0.002958297165910378
iteration 359, loss 0.002829001107230333
iteration 360, loss 0.0027053299807272497
iteration 361, loss 0.0025871272592269757
iteration 362, loss 0.002474054226402662
iteration 363, loss 0.0023659566557337618
iteration 364, loss 0.0022626143673307834
iteration 365, loss 0.0021638290809715605
iteration 366, loss 0.002069328816086304
iteration 367, loss 0.0019789974363336846
iteration 368, loss 0.0018926229600031183
iteration 369, loss 0.0018100243847241808
iteration 370, loss 0.0017310129561394045
iteration 371, loss 0.0016554593838388513
iteration 372, loss 0.001583257497483751
iteration 373, loss 0.0015142134514090394
iteration 374, loss 0.0014481686737121775
iteration 375, loss 0.0013849925289979
it

### 1.2 PyTorch: Tensors

In [39]:
# The same two-layer ReLU network as in the numpy warm-up, re-implemented
# with torch tensors. Gradients are still derived and applied by hand.

device = torch.device("cpu")
#device = torch.device("cuda")  # uncomment to run on the GPU instead
datatype = torch.float

# N: number of samples, D_in: input size, H: hidden size, D_out: output size
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random targets
x = torch.randn(N, D_in, dtype=datatype, device=device)
y = torch.randn(N, D_out, dtype=datatype, device=device)

# Randomly initialized weights of the two layers
W1 = torch.randn(D_in, H, dtype=datatype, device=device)
W2 = torch.randn(H, D_out, dtype=datatype, device=device)

learning_rate = 1e-6
for iteration in range(500):
    # Forward pass (the forward pass ends at the loss)
    z = x.mm(W1)
    h = z.clamp(min=0)  # ReLU(z)
    y_pred = h.mm(W2)

    # .item() turns the one-element loss tensor into a plain Python number
    loss = (y - y_pred).pow(2).sum().item()
    print("iteration {0}, loss {1}".format(iteration, loss))

    # Backward pass: apply the chain rule by hand, from the loss back to W1
    dy_pred = 2 * (y_pred - y)
    dW2 = h.t().mm(dy_pred)
    dh = dy_pred.mm(W2.t())
    dz = dh.clone()
    # ReLU derivative: the gradient only flows through units that were active.
    # h = clamp(z, min=0), so h <= 0 selects exactly the entries where z <= 0.
    dz[h <= 0] = 0
    dW1 = x.t().mm(dz)

    # Update weights (plain gradient descent step)
    W2 -= learning_rate * dW2
    W1 -= learning_rate * dW1
    

iteration 0, loss 29840098.0
iteration 1, loss 25132306.0
iteration 2, loss 23104618.0
iteration 3, loss 20635870.0
iteration 4, loss 16945792.0
iteration 5, loss 12462604.0
iteration 6, loss 8392902.0
iteration 7, loss 5364406.5
iteration 8, loss 3420446.5
iteration 9, loss 2250130.5
iteration 10, loss 1561521.375
iteration 11, loss 1147531.125
iteration 12, loss 887931.75
iteration 13, loss 715402.6875
iteration 14, loss 593580.25
iteration 15, loss 502751.4375
iteration 16, loss 432092.25
iteration 17, loss 375228.375
iteration 18, loss 328332.15625
iteration 19, loss 289108.84375
iteration 20, loss 255893.953125
iteration 21, loss 227396.875
iteration 22, loss 202823.640625
iteration 23, loss 181465.765625
iteration 24, loss 162798.328125
iteration 25, loss 146422.6875
iteration 26, loss 132015.71875
iteration 27, loss 119299.4453125
iteration 28, loss 108034.640625
iteration 29, loss 98019.734375
iteration 30, loss 89102.0390625
iteration 31, loss 81140.578125
iteration 32, loss 7

iteration 410, loss 0.02181771583855152
iteration 411, loss 0.021132128313183784
iteration 412, loss 0.020472481846809387
iteration 413, loss 0.01981993019580841
iteration 414, loss 0.019197581335902214
iteration 415, loss 0.018587276339530945
iteration 416, loss 0.017997348681092262
iteration 417, loss 0.017429586499929428
iteration 418, loss 0.016886014491319656
iteration 419, loss 0.016340669244527817
iteration 420, loss 0.015828918665647507
iteration 421, loss 0.01533582340925932
iteration 422, loss 0.014862775802612305
iteration 423, loss 0.014388865791261196
iteration 424, loss 0.01393766701221466
iteration 425, loss 0.013503475114703178
iteration 426, loss 0.013088960200548172
iteration 427, loss 0.012679187580943108
iteration 428, loss 0.012285293079912663
iteration 429, loss 0.011903454549610615
iteration 430, loss 0.01152788195759058
iteration 431, loss 0.011171871796250343
iteration 432, loss 0.010821240022778511
iteration 433, loss 0.010487817227840424
iteration 434, loss 0

## 2. Autograd

### 2.1 PyTorch: Tensors and autograd

In [43]:
# The same two-layer ReLU network again, but the backward pass is no longer
# written by hand: the weights are created with requires_grad=True and
# loss.backward() fills in their .grad attributes.

device = torch.device("cpu")
#device = torch.device("cuda:0")  # uncomment to run on the GPU instead
datatype = torch.float

# N: number of samples, D_in: input size, H: hidden size, D_out: output size
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random targets (no gradients needed for these)
x = torch.randn(N, D_in, dtype=datatype, device=device)
y = torch.randn(N, D_out, dtype=datatype, device=device)

# Randomly initialized weights; requires_grad=True asks autograd to track them
W1 = torch.randn(D_in, H, dtype=datatype, device=device, requires_grad=True)
W2 = torch.randn(H, D_out, dtype=datatype, device=device, requires_grad=True)

learning_rate = 1e-6
for iteration in range(500):
    # Forward pass (the forward pass ends at the loss)
    z = x.mm(W1)
    h = z.clamp(min=0)  # ReLU(z)
    y_pred = h.mm(W2)

    # loss stays a tensor here so that backward() can be called on it
    loss = (y - y_pred).pow(2).sum()
    print("iteration {0}, loss {1}".format(iteration, loss.item()))

    # Backward pass: populates W1.grad and W2.grad
    loss.backward()

    # The weight update itself must not be tracked by autograd
    with torch.no_grad():
        W2 -= learning_rate * W2.grad
        W1 -= learning_rate * W1.grad

        # Reset the gradients in place so the next backward() starts from zero
        W2.grad.zero_()
        W1.grad.zero_()

iteration 0, loss 27654396.0
iteration 1, loss 26002932.0
iteration 2, loss 29281282.0
iteration 3, loss 32977422.0
iteration 4, loss 32428502.0
iteration 5, loss 25427372.0
iteration 6, loss 15606912.0
iteration 7, loss 7964821.0
iteration 8, loss 3845618.25
iteration 9, loss 1998753.5
iteration 10, loss 1202566.75
iteration 11, loss 835005.25
iteration 12, loss 640811.3125
iteration 13, loss 520708.25
iteration 14, loss 436216.8125
iteration 15, loss 371594.84375
iteration 16, loss 319785.875
iteration 17, loss 277117.78125
iteration 18, loss 241437.71875
iteration 19, loss 211293.390625
iteration 20, loss 185650.703125
iteration 21, loss 163716.859375
iteration 22, loss 144910.6875
iteration 23, loss 128678.3359375
iteration 24, loss 114584.5234375
iteration 25, loss 102303.171875
iteration 26, loss 91561.8203125
iteration 27, loss 82137.9375
iteration 28, loss 73853.71875
iteration 29, loss 66542.03125
iteration 30, loss 60065.23046875
iteration 31, loss 54323.29296875
iteration 32

iteration 412, loss 0.00015402314602397382
iteration 413, loss 0.0001506778207840398
iteration 414, loss 0.00014717770682182163
iteration 415, loss 0.00014395145990420133
iteration 416, loss 0.00014177484263200313
iteration 417, loss 0.00013860299077350646
iteration 418, loss 0.00013573697651736438
iteration 419, loss 0.0001326702331425622
iteration 420, loss 0.00012978589802514762
iteration 421, loss 0.00012688440619967878
iteration 422, loss 0.00012456234253477305
iteration 423, loss 0.0001222957653226331
iteration 424, loss 0.00011981152783846483
iteration 425, loss 0.0001175985235022381
iteration 426, loss 0.00011515052756294608
iteration 427, loss 0.00011268934758845717
iteration 428, loss 0.00011098255345132202
iteration 429, loss 0.00010870154801523313
iteration 430, loss 0.00010653772187652066
iteration 431, loss 0.00010487579129403457
iteration 432, loss 0.00010236288653686643
iteration 433, loss 0.0001006933453027159
iteration 434, loss 9.874009265331551e-05
iteration 435, lo

### 2.2 PyTorch: Defining new autograd functions

In [48]:
class MyReLU(torch.autograd.Function):
    """ReLU written as a custom autograd function.

    ``forward`` computes the activation and stores it on the context;
    ``backward`` uses the stored activation to decide where the incoming
    gradient is passed through and where it is zeroed.
    """

    @staticmethod
    def forward(ctx, input):
        """Return ``input`` clamped below at zero and save that result."""
        activated = torch.clamp(input, min=0)
        ctx.save_for_backward(activated)
        return activated

    @staticmethod
    def backward(ctx, grad_output):
        """Return a copy of ``grad_output`` zeroed where the activation is <= 0."""
        (activated,) = ctx.saved_tensors
        inactive = activated <= 0
        # masked_fill is out-of-place, so grad_output itself is left untouched
        return grad_output.masked_fill(inactive, 0)
    
# Train the same two-layer network with autograd, using the custom MyReLU
# autograd function as the hidden-layer activation.

device = torch.device("cpu")
#device = torch.device("cuda:0")  # uncomment to run on the GPU instead
datatype = torch.float

# N: number of samples, D_in: input size, H: hidden size, D_out: output size
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random targets (no gradients needed for these)
x = torch.randn(N, D_in, dtype=datatype, device=device)
y = torch.randn(N, D_out, dtype=datatype, device=device)

# Randomly initialized weights; requires_grad=True asks autograd to track them
W1 = torch.randn(D_in, H, dtype=datatype, device=device, requires_grad=True)
W2 = torch.randn(H, D_out, dtype=datatype, device=device, requires_grad=True)

# Custom autograd functions are invoked through .apply; the alias does not
# change between iterations, so bind it once outside the loop.
relu = MyReLU.apply

learning_rate = 1e-6
for iteration in range(500):
    # Forward pass (the forward pass ends at the loss)
    z = x.mm(W1)
    h = relu(z)  # ReLU(z)
    y_pred = h.mm(W2)

    # loss stays a tensor here so that backward() can be called on it
    loss = (y - y_pred).pow(2).sum()
    print("iteration {0}, loss {1}".format(iteration, loss.item()))

    # Backward pass: populates W1.grad and W2.grad
    loss.backward()

    # The weight update itself must not be tracked by autograd
    with torch.no_grad():
        W2 -= learning_rate * W2.grad
        W1 -= learning_rate * W1.grad

        # Reset the gradients in place so the next backward() starts from zero
        W2.grad.zero_()
        W1.grad.zero_()


forward
iteration 0, loss 30022618.0
backward
forward
iteration 1, loss 22213114.0
backward
forward
iteration 2, loss 19139246.0
backward
forward
iteration 3, loss 17307442.0
backward
forward
iteration 4, loss 15276606.0
backward
forward
iteration 5, loss 12612155.0
backward
forward
iteration 6, loss 9635052.0
backward
forward
iteration 7, loss 6877531.0
backward
forward
iteration 8, loss 4696401.0
backward
forward
iteration 9, loss 3153943.0
backward
forward
iteration 10, loss 2133609.25
backward
forward
iteration 11, loss 1481988.625
backward
forward
iteration 12, loss 1067844.0
backward
forward
iteration 13, loss 801169.25
backward
forward
iteration 14, loss 623969.0625
backward
forward
iteration 15, loss 501846.4375
backward
forward
iteration 16, loss 414046.6875
backward
forward
iteration 17, loss 348356.8125
backward
forward
iteration 18, loss 297296.125
backward
forward
iteration 19, loss 256429.3125
backward
forward
iteration 20, loss 222918.453125
backward
forward
iteration 21

iteration 208, loss 0.24402643740177155
backward
forward
iteration 209, loss 0.22987858951091766
backward
forward
iteration 210, loss 0.21658504009246826
backward
forward
iteration 211, loss 0.20405171811580658
backward
forward
iteration 212, loss 0.19222751259803772
backward
forward
iteration 213, loss 0.18113575875759125
backward
forward
iteration 214, loss 0.17070727050304413
backward
forward
iteration 215, loss 0.1608772873878479
backward
forward
iteration 216, loss 0.15155600011348724
backward
forward
iteration 217, loss 0.14279896020889282
backward
forward
iteration 218, loss 0.13458096981048584
backward
forward
iteration 219, loss 0.12685002386569977
backward
forward
iteration 220, loss 0.11957568675279617
backward
forward
iteration 221, loss 0.11264494806528091
backward
forward
iteration 222, loss 0.1062159314751625
backward
forward
iteration 223, loss 0.10008803009986877
backward
forward
iteration 224, loss 0.09433910995721817
backward
forward
iteration 225, loss 0.08893819898

backward
forward
iteration 428, loss 4.316696504247375e-05
backward
forward
iteration 429, loss 4.232087667332962e-05
backward
forward
iteration 430, loss 4.16866059822496e-05
backward
forward
iteration 431, loss 4.123483449802734e-05
backward
forward
iteration 432, loss 4.047932452522218e-05
backward
forward
iteration 433, loss 3.9842248952481896e-05
backward
forward
iteration 434, loss 3.942669718526304e-05
backward
forward
iteration 435, loss 3.888586434186436e-05
backward
forward
iteration 436, loss 3.839027704088949e-05
backward
forward
iteration 437, loss 3.787412788369693e-05
backward
forward
iteration 438, loss 3.748030212591402e-05
backward
forward
iteration 439, loss 3.668001954792999e-05
backward
forward
iteration 440, loss 3.620635834522545e-05
backward
forward
iteration 441, loss 3.583406942198053e-05
backward
forward
iteration 442, loss 3.554209979483858e-05
backward
forward
iteration 443, loss 3.4972879802808166e-05
backward
forward
iteration 444, loss 3.449690848356113e