Tutorial URL: https://pytorch.org/tutorials/beginner/pytorch_with_examples.html#examples-download

# Learning PyTorch with Examples 

In [97]:
# Preamble 

import numpy as np
import torch
import random

# Run on the first CUDA GPU when one is usable, otherwise fall back to the
# CPU so that every cell below still works on a machine without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Floating-point dtype used for every tensor created in this notebook.
datatype = torch.float

## 1. Tensors

### 1.1 Warm up: numpy

In [98]:
# Warm-up: a two-layer fully-connected ReLU network written with nothing but
# numpy, including a hand-derived backward pass.
#
# The network has a single hidden layer and is fitted to random data with
# plain gradient descent, minimising the squared Euclidean distance between
# the network output and the target.

# Batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialised weights of the two linear layers.
W1 = np.random.randn(D_in, H)
W2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for iteration in range(500):
    # Forward pass: linear -> ReLU -> linear, ending at the scalar loss.
    z = x @ W1
    h = np.maximum(z, 0)
    y_pred = h @ W2

    loss = np.square(y - y_pred).sum()
    print("iteration {0}, loss {1}".format(iteration, loss))

    # Backward pass: apply the chain rule by hand, from the loss back to the
    # two weight matrices.
    grad_y_pred = 2 * (y_pred - y)
    grad_W2 = h.T @ grad_y_pred
    grad_h = grad_y_pred @ W2.T
    # ReLU passes the gradient through only where its output is positive.
    grad_z = np.where(h <= 0, 0.0, grad_h)
    grad_W1 = x.T @ grad_z

    # Gradient-descent update of the weights (in place).
    W2 -= learning_rate * grad_W2
    W1 -= learning_rate * grad_W1
    

iteration 0, loss 46103921.24431324
iteration 1, loss 55335319.40049319
iteration 2, loss 64856416.68490945
iteration 3, loss 56328213.97213833
iteration 4, loss 30249422.78934653
iteration 5, loss 10635371.68525007
iteration 6, loss 3797737.360140309
iteration 7, loss 2065809.7523898603
iteration 8, loss 1506268.13934818
iteration 9, loss 1212992.608010401
iteration 10, loss 1006364.8889980961
iteration 11, loss 845641.5291087683
iteration 12, loss 716724.4384063184
iteration 13, loss 611770.0729005195
iteration 14, loss 525327.9287492852
iteration 15, loss 453639.09952636133
iteration 16, loss 393640.84285191656
iteration 17, loss 343196.92708472384
iteration 18, loss 300501.17207822256
iteration 19, loss 264173.5857252701
iteration 20, loss 233045.55894569898
iteration 21, loss 206274.50088595736
iteration 22, loss 183197.1375950476
iteration 23, loss 163167.06907626428
iteration 24, loss 145735.27821881382
iteration 25, loss 130510.27222634724
iteration 26, loss 117160.8232068207
i

iteration 363, loss 0.0004967799457679184
iteration 364, loss 0.00047405988826575705
iteration 365, loss 0.00045239069052041916
iteration 366, loss 0.00043171936892629186
iteration 367, loss 0.00041199824130047364
iteration 368, loss 0.0003931882880835756
iteration 369, loss 0.00037524460574672974
iteration 370, loss 0.0003581265908069001
iteration 371, loss 0.00034179921744813366
iteration 372, loss 0.0003262247682720681
iteration 373, loss 0.00031135739268338706
iteration 374, loss 0.00029717310405100346
iteration 375, loss 0.0002836395529747798
iteration 376, loss 0.00027072693385004324
iteration 377, loss 0.0002584070009918406
iteration 378, loss 0.00024665285800572976
iteration 379, loss 0.00023543671765784212
iteration 380, loss 0.00022474216982300982
iteration 381, loss 0.0002145293607731752
iteration 382, loss 0.00020478454062717276
iteration 383, loss 0.0001954867437069086
iteration 384, loss 0.00018661214244178854
iteration 385, loss 0.00017814984613522846
iteration 386, loss

### 1.2 Pytorch: tensors

In [99]:
# The same two-layer network, now on torch tensors created on `device`.
# Gradients are still derived and applied by hand; autograd is not used here.

N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets.
x = torch.randn(N, D_in, dtype=datatype, device=device)
y = torch.randn(N, D_out, dtype=datatype, device=device)

# Randomly initialised weights of the two linear layers.
W1 = torch.randn(D_in, H, dtype=datatype, device=device)
W2 = torch.randn(H, D_out, dtype=datatype, device=device)

learning_rate = 1e-6
for iteration in range(500):
    # Forward pass: linear -> ReLU -> linear, ending at the scalar loss.
    z = torch.mm(x, W1)
    h = torch.clamp(z, min=0)
    y_pred = torch.mm(h, W2)

    # .item() turns the one-element loss tensor into a plain Python number.
    loss = (y - y_pred).pow(2).sum().item()
    print("iteration {0}, loss {1}".format(iteration, loss))

    # Backward pass: chain rule by hand, from the loss back to the weights.
    grad_y_pred = 2 * (y_pred - y)
    grad_W2 = torch.mm(h.t(), grad_y_pred)
    grad_h = torch.mm(grad_y_pred, W2.t())
    # ReLU passes the gradient through only where its output is positive.
    grad_z = grad_h.masked_fill(h <= 0, 0)
    grad_W1 = torch.mm(x.t(), grad_z)

    # Gradient-descent update of the weights (in place).
    W2 -= learning_rate * grad_W2
    W1 -= learning_rate * grad_W1
    

iteration 0, loss 26021376.0
iteration 1, loss 22584786.0
iteration 2, loss 23596730.0
iteration 3, loss 25752796.0
iteration 4, loss 26340710.0
iteration 5, loss 23204560.0
iteration 6, loss 17169966.0
iteration 7, loss 10711865.0
iteration 8, loss 6015136.0
iteration 9, loss 3280649.5
iteration 10, loss 1872432.875
iteration 11, loss 1166895.875
iteration 12, loss 804179.25
iteration 13, loss 603233.375
iteration 14, loss 481085.71875
iteration 15, loss 399086.03125
iteration 16, loss 339233.5
iteration 17, loss 292814.34375
iteration 18, loss 255302.78125
iteration 19, loss 224220.34375
iteration 20, loss 198039.953125
iteration 21, loss 175718.421875
iteration 22, loss 156523.4375
iteration 23, loss 139903.5
iteration 24, loss 125466.1015625
iteration 25, loss 112852.5234375
iteration 26, loss 101801.0703125
iteration 27, loss 92082.0703125
iteration 28, loss 83487.4609375
iteration 29, loss 75852.46875
iteration 30, loss 69059.671875
iteration 31, loss 63027.48828125
iteration 32,

iteration 327, loss 0.1559273898601532
iteration 328, loss 0.15061362087726593
iteration 329, loss 0.14551061391830444
iteration 330, loss 0.14060448110103607
iteration 331, loss 0.13584403693675995
iteration 332, loss 0.13124772906303406
iteration 333, loss 0.12679897248744965
iteration 334, loss 0.12249643355607986
iteration 335, loss 0.11837176978588104
iteration 336, loss 0.11436454951763153
iteration 337, loss 0.11048740148544312
iteration 338, loss 0.10675722360610962
iteration 339, loss 0.10313135385513306
iteration 340, loss 0.09967608004808426
iteration 341, loss 0.09629126638174057
iteration 342, loss 0.09302114695310593
iteration 343, loss 0.08989352732896805
iteration 344, loss 0.08687429130077362
iteration 345, loss 0.08393412083387375
iteration 346, loss 0.08110698312520981
iteration 347, loss 0.07838107645511627
iteration 348, loss 0.07575645297765732
iteration 349, loss 0.07321449369192123
iteration 350, loss 0.07072597742080688
iteration 351, loss 0.06834151595830917
i

## 2. Autograd

### 2.1 PyTorch: Tensors and autograd

In [100]:
# The same two-layer network, with the backward pass delegated to autograd:
# the weights are created with requires_grad=True and loss.backward() fills
# in their .grad attributes.

N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets (no gradients needed for these).
x = torch.randn(N, D_in, dtype=datatype, device=device)
y = torch.randn(N, D_out, dtype=datatype, device=device)

# Trainable weights, tracked by autograd.
W1 = torch.randn(D_in, H, dtype=datatype, device=device, requires_grad=True)
W2 = torch.randn(H, D_out, dtype=datatype, device=device, requires_grad=True)

learning_rate = 1e-6
for iteration in range(500):
    # Forward pass: linear -> ReLU -> linear, ending at the scalar loss.
    z = torch.mm(x, W1)
    h = torch.clamp(z, min=0)
    y_pred = torch.mm(h, W2)

    loss = (y - y_pred).pow(2).sum()
    print("iteration {0}, loss {1}".format(iteration, loss.item()))

    # Backward pass: autograd computes d(loss)/dW1 and d(loss)/dW2.
    loss.backward()

    # The update itself must not be recorded by autograd.
    with torch.no_grad():
        for weight in (W2, W1):
            weight -= learning_rate * weight.grad
            # backward() accumulates into .grad, so reset it for the next pass.
            weight.grad.zero_()

iteration 0, loss 31663520.0
iteration 1, loss 27991946.0
iteration 2, loss 28222632.0
iteration 3, loss 27743500.0
iteration 4, loss 23999806.0
iteration 5, loss 17425660.0
iteration 6, loss 10814874.0
iteration 7, loss 6106819.0
iteration 8, loss 3432409.25
iteration 9, loss 2053820.5
iteration 10, loss 1353646.875
iteration 11, loss 977970.6875
iteration 12, loss 756797.8125
iteration 13, loss 612187.25
iteration 14, loss 508853.96875
iteration 15, loss 430070.71875
iteration 16, loss 367380.75
iteration 17, loss 316221.15625
iteration 18, loss 273791.03125
iteration 19, loss 238221.375
iteration 20, loss 208145.65625
iteration 21, loss 182562.703125
iteration 22, loss 160715.484375
iteration 23, loss 141991.375
iteration 24, loss 125828.609375
iteration 25, loss 111829.5234375
iteration 26, loss 99659.0859375
iteration 27, loss 89026.8515625
iteration 28, loss 79714.890625
iteration 29, loss 71538.0859375
iteration 30, loss 64334.84765625
iteration 31, loss 57972.1328125
iteration 

iteration 308, loss 0.006953850854188204
iteration 309, loss 0.006643050350248814
iteration 310, loss 0.006347417365759611
iteration 311, loss 0.006067201029509306
iteration 312, loss 0.005804762244224548
iteration 313, loss 0.005549834109842777
iteration 314, loss 0.0053060175850987434
iteration 315, loss 0.005072643514722586
iteration 316, loss 0.004850757773965597
iteration 317, loss 0.0046408153139054775
iteration 318, loss 0.004441520664840937
iteration 319, loss 0.004253089893609285
iteration 320, loss 0.0040689678862690926
iteration 321, loss 0.003894977970048785
iteration 322, loss 0.0037302093114703894
iteration 323, loss 0.0035708374343812466
iteration 324, loss 0.003420985070988536
iteration 325, loss 0.0032768857199698687
iteration 326, loss 0.003140539862215519
iteration 327, loss 0.003007710911333561
iteration 328, loss 0.0028847353532910347
iteration 329, loss 0.0027661144267767668
iteration 330, loss 0.0026554809883236885
iteration 331, loss 0.002545753261074424
iterati

### 2.2 PyTorch: Defining new autograd functions

In [101]:
class MyReLU(torch.autograd.Function):
    """ReLU written as a custom autograd function.

    Call it through ``MyReLU.apply(tensor)``.
    """

    @staticmethod
    def forward(ctx, input):
        """Clamp ``input`` below at zero and keep the result for ``backward``."""
        activated = torch.clamp(input, min=0)
        ctx.save_for_backward(activated)
        return activated

    @staticmethod
    def backward(ctx, grad_output):
        """Return ``grad_output`` with entries zeroed where the output was <= 0."""
        (activated,) = ctx.saved_tensors
        # masked_fill returns a new tensor, so grad_output itself is untouched.
        return grad_output.masked_fill(activated <= 0, 0)

# Train the two-layer network (here with bias terms) using the custom
# MyReLU autograd function as the hidden activation.

N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets.
x = torch.randn(N, D_in, dtype=datatype, device=device)
y = torch.randn(N, D_out, dtype=datatype, device=device)

# Trainable weights and biases, tracked by autograd.
W1 = torch.randn(D_in, H, dtype=datatype, device=device, requires_grad=True)
W2 = torch.randn(H, D_out, dtype=datatype, device=device, requires_grad=True)
b1 = torch.randn(H, dtype=datatype, device=device, requires_grad=True)
b2 = torch.randn(D_out, dtype=datatype, device=device, requires_grad=True)

# Custom autograd functions are invoked through their .apply attribute.
relu = MyReLU.apply

learning_rate = 1e-6
for iteration in range(500):
    # Forward pass: affine -> MyReLU -> affine, ending at the scalar loss.
    z = torch.mm(x, W1) + b1
    h = relu(z)
    y_pred = torch.mm(h, W2) + b2

    loss = (y - y_pred).pow(2).sum()
    print("iteration {0}, loss {1}".format(iteration, loss.item()))

    # Backward pass: autograd calls MyReLU.backward for the hidden layer.
    loss.backward()

    # The update itself must not be recorded by autograd.
    with torch.no_grad():
        for param in (W2, W1, b2, b1):
            param -= learning_rate * param.grad
            # backward() accumulates into .grad, so reset it for the next pass.
            param.grad.zero_()


iteration 0, loss 26212858.0
iteration 1, loss 21896194.0
iteration 2, loss 21858198.0
iteration 3, loss 23005750.0
iteration 4, loss 22921930.0
iteration 5, loss 20259130.0
iteration 6, loss 15311140.0
iteration 7, loss 10057202.0
iteration 8, loss 5992724.0
iteration 9, loss 3475756.5
iteration 10, loss 2079067.0
iteration 11, loss 1336380.75
iteration 12, loss 934223.4375
iteration 13, loss 703958.4375
iteration 14, loss 561053.625
iteration 15, loss 464512.1875
iteration 16, loss 394344.03125
iteration 17, loss 340167.8125
iteration 18, loss 296615.0
iteration 19, loss 260641.484375
iteration 20, loss 230350.4375
iteration 21, loss 204525.59375
iteration 22, loss 182298.71875
iteration 23, loss 163086.953125
iteration 24, loss 146331.53125
iteration 25, loss 131647.234375
iteration 26, loss 118729.5625
iteration 27, loss 107315.609375
iteration 28, loss 97202.078125
iteration 29, loss 88205.28125
iteration 30, loss 80184.515625
iteration 31, loss 73013.125
iteration 32, loss 66587.

iteration 246, loss 0.22584763169288635
iteration 247, loss 0.21440009772777557
iteration 248, loss 0.20351772010326385
iteration 249, loss 0.19321097433567047
iteration 250, loss 0.1834266483783722
iteration 251, loss 0.17413833737373352
iteration 252, loss 0.16531012952327728
iteration 253, loss 0.15696129202842712
iteration 254, loss 0.14899130165576935
iteration 255, loss 0.14145053923130035
iteration 256, loss 0.13432398438453674
iteration 257, loss 0.12749695777893066
iteration 258, loss 0.12107862532138824
iteration 259, loss 0.1149253249168396
iteration 260, loss 0.10912556201219559
iteration 261, loss 0.10361149162054062
iteration 262, loss 0.09838104993104935
iteration 263, loss 0.09342491626739502
iteration 264, loss 0.08871405571699142
iteration 265, loss 0.0842457115650177
iteration 266, loss 0.07999363541603088
iteration 267, loss 0.07596572488546371
iteration 268, loss 0.07211719453334808
iteration 269, loss 0.06848961114883423
iteration 270, loss 0.0650365799665451
iter

iteration 461, loss 6.669329013675451e-05
iteration 462, loss 6.569574179593474e-05
iteration 463, loss 6.437947740778327e-05
iteration 464, loss 6.343035056488588e-05
iteration 465, loss 6.227393896551803e-05
iteration 466, loss 6.124771607574075e-05
iteration 467, loss 6.0271864640526474e-05
iteration 468, loss 5.928164318902418e-05
iteration 469, loss 5.827002314617857e-05
iteration 470, loss 5.722991409129463e-05
iteration 471, loss 5.644732300424948e-05
iteration 472, loss 5.552920265472494e-05
iteration 473, loss 5.4650012316415086e-05
iteration 474, loss 5.386986958910711e-05
iteration 475, loss 5.295498704072088e-05
iteration 476, loss 5.2064689953112975e-05
iteration 477, loss 5.124606832396239e-05
iteration 478, loss 5.049395622336306e-05
iteration 479, loss 4.980440644430928e-05
iteration 480, loss 4.898387487628497e-05
iteration 481, loss 4.8342320951633155e-05
iteration 482, loss 4.7586450818926096e-05
iteration 483, loss 4.6991899580461904e-05
iteration 484, loss 4.621305

### 2.3 TensorFlow: Static Graphs

Not implementing this part of the tutorial because it only contrasts PyTorch autograd with TensorFlow.

The key point to remember is that in TensorFlow we first build a static graph and then open a TensorFlow session to run it.

Ref: https://pytorch.org/tutorials/beginner/pytorch_with_examples.html#tensorflow-static-graphs

## 3. nn module

### 3.1 Pytorch: nn

In [102]:
# The same two-layer network built from torch.nn layers and trained with a
# hand-written gradient-descent update over model.parameters().
#
# NOTE:
# The learning rate is 1e-4 here, not the 1e-6 used in the earlier examples;
# with 1e-6 the loss decreases, just far too slowly.
# Most likely cause (assumption, verify against the torch.nn.Linear docs):
# the earlier examples draw their weights from a standard normal, which gives
# an initial loss in the tens of millions, whereas torch.nn.Linear uses its
# own, much smaller default initialisation, giving an initial loss in the
# hundreds. Smaller activations mean smaller gradients, so a larger step is
# needed for the same rate of progress.

N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets.
x = torch.randn(N, D_in, dtype=datatype, device=device)
y = torch.randn(N, D_out, dtype=datatype, device=device)

# .to(device) handles both CPU and CUDA, so no device-type branch is needed.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
).to(device)

# Summed (not averaged) squared error, matching the earlier examples.
# reduction='sum' replaces the deprecated size_average=False.
loss_function = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for iteration in range(500):
    # Forward pass (ends at the scalar loss).
    y_pred = model(x)

    loss = loss_function(y_pred, y)
    print("iteration {0}, loss {1}".format(iteration, loss.item()))

    # backward() accumulates gradients, so clear the old ones first.
    model.zero_grad()

    # Backward pass: fills param.grad for every parameter of the model.
    loss.backward()

    # Plain gradient-descent step; must not be recorded by autograd.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad
        

iteration 0, loss 722.8011474609375
iteration 1, loss 667.1231689453125
iteration 2, loss 619.0684814453125
iteration 3, loss 576.9715576171875
iteration 4, loss 539.89404296875
iteration 5, loss 506.85028076171875
iteration 6, loss 476.9220886230469
iteration 7, loss 449.794189453125
iteration 8, loss 424.9374084472656
iteration 9, loss 401.92108154296875
iteration 10, loss 380.38409423828125
iteration 11, loss 360.1080322265625
iteration 12, loss 341.0399475097656
iteration 13, loss 322.756103515625
iteration 14, loss 305.42596435546875
iteration 15, loss 289.02850341796875
iteration 16, loss 273.399169921875
iteration 17, loss 258.46533203125
iteration 18, loss 244.1697540283203
iteration 19, loss 230.5232696533203
iteration 20, loss 217.54598999023438
iteration 21, loss 205.13197326660156
iteration 22, loss 193.27008056640625
iteration 23, loss 181.9656982421875
iteration 24, loss 171.16929626464844
iteration 25, loss 160.922119140625
iteration 26, loss 151.18582153320312
iteration

iteration 225, loss 0.024444621056318283
iteration 226, loss 0.02372988685965538
iteration 227, loss 0.023036973550915718
iteration 228, loss 0.022365542128682137
iteration 229, loss 0.02171441726386547
iteration 230, loss 0.021082963794469833
iteration 231, loss 0.020471083000302315
iteration 232, loss 0.019878428429365158
iteration 233, loss 0.01930391415953636
iteration 234, loss 0.01874655857682228
iteration 235, loss 0.018206072971224785
iteration 236, loss 0.01768212951719761
iteration 237, loss 0.017173517495393753
iteration 238, loss 0.016680622473359108
iteration 239, loss 0.016202203929424286
iteration 240, loss 0.015738341957330704
iteration 241, loss 0.015288172289729118
iteration 242, loss 0.014851509593427181
iteration 243, loss 0.014427940361201763
iteration 244, loss 0.014016978442668915
iteration 245, loss 0.013618243858218193
iteration 246, loss 0.01323151495307684
iteration 247, loss 0.012856277637183666
iteration 248, loss 0.0124918632209301
iteration 249, loss 0.01

iteration 449, loss 5.777354090241715e-05
iteration 450, loss 5.6318040151381865e-05
iteration 451, loss 5.4898366215638816e-05
iteration 452, loss 5.3513795137405396e-05
iteration 453, loss 5.216568388277665e-05
iteration 454, loss 5.084961230750196e-05
iteration 455, loss 4.9568516260478646e-05
iteration 456, loss 4.8323447117581964e-05
iteration 457, loss 4.7105553676374257e-05
iteration 458, loss 4.592053301166743e-05
iteration 459, loss 4.476350295590237e-05
iteration 460, loss 4.363871266832575e-05
iteration 461, loss 4.2541920265648514e-05
iteration 462, loss 4.147242725593969e-05
iteration 463, loss 4.0430924855172634e-05
iteration 464, loss 3.941497561754659e-05
iteration 465, loss 3.8425343518611044e-05
iteration 466, loss 3.7459820305230096e-05
iteration 467, loss 3.6521632864605635e-05
iteration 468, loss 3.5604309232439846e-05
iteration 469, loss 3.4710305044427514e-05
iteration 470, loss 3.384103183634579e-05
iteration 471, loss 3.299372474430129e-05
iteration 472, loss 3

### 3.2 Pytorch: optim

In [103]:
# The same torch.nn model, with the parameter update delegated to a
# torch.optim optimizer (Adam) instead of a hand-written loop.

N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets.
x = torch.randn(N, D_in, dtype=datatype, device=device)
y = torch.randn(N, D_out, dtype=datatype, device=device)

# .to(device) handles both CPU and CUDA, so no device-type branch is needed.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
).to(device)

# Summed (not averaged) squared error.
# reduction='sum' replaces the deprecated size_average=False.
loss_function = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for iteration in range(500):
    # Forward pass (ends at the scalar loss).
    y_pred = model(x)

    loss = loss_function(y_pred, y)
    print("iteration {0}, loss {1}".format(iteration, loss.item()))

    # backward() accumulates gradients, so clear the old ones first. The
    # optimizer was built from model.parameters(), so this covers the model.
    optimizer.zero_grad()

    # Backward pass: fills .grad for every parameter of the model.
    loss.backward()

    # Let the optimizer apply its update rule to the parameters.
    optimizer.step()
        

iteration 0, loss 781.8439331054688
iteration 1, loss 763.1244506835938
iteration 2, loss 744.964111328125
iteration 3, loss 727.3046875
iteration 4, loss 710.059814453125
iteration 5, loss 693.2385864257812
iteration 6, loss 676.8621826171875
iteration 7, loss 660.990478515625
iteration 8, loss 645.5855712890625
iteration 9, loss 630.6854858398438
iteration 10, loss 616.1856689453125
iteration 11, loss 602.0438232421875
iteration 12, loss 588.224853515625
iteration 13, loss 574.8394775390625
iteration 14, loss 561.8868408203125
iteration 15, loss 549.3050537109375
iteration 16, loss 537.050048828125
iteration 17, loss 525.126708984375
iteration 18, loss 513.48095703125
iteration 19, loss 502.1620178222656
iteration 20, loss 491.121337890625
iteration 21, loss 480.3432922363281
iteration 22, loss 469.953369140625
iteration 23, loss 459.876953125
iteration 24, loss 450.0588684082031
iteration 25, loss 440.4822692871094
iteration 26, loss 431.1605529785156
iteration 27, loss 422.05969238

iteration 266, loss 0.08973495662212372
iteration 267, loss 0.08569058775901794
iteration 268, loss 0.08182896673679352
iteration 269, loss 0.0781402736902237
iteration 270, loss 0.07461853325366974
iteration 271, loss 0.07125508785247803
iteration 272, loss 0.06804170459508896
iteration 273, loss 0.0649736300110817
iteration 274, loss 0.062043141573667526
iteration 275, loss 0.059244997799396515
iteration 276, loss 0.05657241493463516
iteration 277, loss 0.05402098968625069
iteration 278, loss 0.05158504843711853
iteration 279, loss 0.04925819858908653
iteration 280, loss 0.0470360592007637
iteration 281, loss 0.04491424560546875
iteration 282, loss 0.04288914427161217
iteration 283, loss 0.04095599800348282
iteration 284, loss 0.03910956159234047
iteration 285, loss 0.037346579134464264
iteration 286, loss 0.03566347062587738
iteration 287, loss 0.034055862575769424
iteration 288, loss 0.03252062574028969
iteration 289, loss 0.031055081635713577
iteration 290, loss 0.0296551138162612

In [104]:
class TwoLayerNet(torch.nn.Module):
    """Fully-connected network: linear -> ReLU -> linear."""

    def __init__(self, D_in, H, D_out):
        """Create the two linear layers.

        D_in, H and D_out are the input, hidden and output widths.
        """
        super().__init__()
        self.Linear1 = torch.nn.Linear(D_in, H)
        self.Linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """Return the network output for the input tensor ``x``."""
        # Clamping below at zero is the ReLU non-linearity.
        hidden = torch.clamp(self.Linear1(x), min=0)
        return self.Linear2(hidden)

# Train the custom TwoLayerNet module on random data with Adam.

N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets.
x = torch.randn(N, D_in, dtype=datatype, device=device)
y = torch.randn(N, D_out, dtype=datatype, device=device)

# .to(device) handles both CPU and CUDA, so no device-type branch is needed.
model = TwoLayerNet(D_in, H, D_out).to(device)

# Summed (not averaged) squared error.
# reduction='sum' replaces the deprecated size_average=False.
loss_function = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for iteration in range(500):
    # Forward pass (ends at the scalar loss).
    y_pred = model(x)

    loss = loss_function(y_pred, y)
    print("iteration {0}, loss {1}".format(iteration, loss.item()))

    # backward() accumulates gradients, so clear the old ones first. The
    # optimizer was built from model.parameters(), so this covers the model.
    optimizer.zero_grad()

    # Backward pass: fills .grad for every parameter of the model.
    loss.backward()

    # Let the optimizer apply its update rule to the parameters.
    optimizer.step()

iteration 0, loss 645.8909912109375
iteration 1, loss 629.0782470703125
iteration 2, loss 612.7051391601562
iteration 3, loss 596.8748779296875
iteration 4, loss 581.6117553710938
iteration 5, loss 566.8129272460938
iteration 6, loss 552.460205078125
iteration 7, loss 538.5153198242188
iteration 8, loss 525.016845703125
iteration 9, loss 511.91925048828125
iteration 10, loss 499.3203125
iteration 11, loss 487.110107421875
iteration 12, loss 475.3283386230469
iteration 13, loss 463.8554992675781
iteration 14, loss 452.7047119140625
iteration 15, loss 441.9375
iteration 16, loss 431.5498046875
iteration 17, loss 421.43499755859375
iteration 18, loss 411.5952453613281
iteration 19, loss 402.0108642578125
iteration 20, loss 392.67840576171875
iteration 21, loss 383.5556945800781
iteration 22, loss 374.69439697265625
iteration 23, loss 366.0718994140625
iteration 24, loss 357.6633605957031
iteration 25, loss 349.4852294921875
iteration 26, loss 341.5057067871094
iteration 27, loss 333.72509

iteration 219, loss 0.5458111763000488
iteration 220, loss 0.5201559662818909
iteration 221, loss 0.49562501907348633
iteration 222, loss 0.4721716046333313
iteration 223, loss 0.4497354030609131
iteration 224, loss 0.4282647371292114
iteration 225, loss 0.4077549874782562
iteration 226, loss 0.3881549835205078
iteration 227, loss 0.3694046139717102
iteration 228, loss 0.35150229930877686
iteration 229, loss 0.334400475025177
iteration 230, loss 0.31806686520576477
iteration 231, loss 0.3024771511554718
iteration 232, loss 0.28759247064590454
iteration 233, loss 0.2733842432498932
iteration 234, loss 0.2598424553871155
iteration 235, loss 0.24691565334796906
iteration 236, loss 0.23459258675575256
iteration 237, loss 0.22282052040100098
iteration 238, loss 0.2116117775440216
iteration 239, loss 0.20092982053756714
iteration 240, loss 0.19073930382728577
iteration 241, loss 0.1810360550880432
iteration 242, loss 0.17179328203201294
iteration 243, loss 0.16299059987068176
iteration 244, 

iteration 463, loss 3.3503834373505015e-08
iteration 464, loss 3.083783894908265e-08
iteration 465, loss 2.8345011671149223e-08
iteration 466, loss 2.6158872401538247e-08
iteration 467, loss 2.4055118785781815e-08
iteration 468, loss 2.2112679687325e-08
iteration 469, loss 2.0387425081480615e-08
iteration 470, loss 1.878484745532205e-08
iteration 471, loss 1.728317755578246e-08
iteration 472, loss 1.592715470621897e-08
iteration 473, loss 1.462845666821977e-08
iteration 474, loss 1.3472472915054823e-08
iteration 475, loss 1.2395792836628061e-08
iteration 476, loss 1.1396334542723707e-08
iteration 477, loss 1.0503438119258135e-08
iteration 478, loss 9.655490629256747e-09
iteration 479, loss 8.874819101833964e-09
iteration 480, loss 8.168589360479928e-09
iteration 481, loss 7.516428368603556e-09
iteration 482, loss 6.932586504859728e-09
iteration 483, loss 6.384199835451909e-09
iteration 484, loss 5.910928635444179e-09
iteration 485, loss 5.464299235313774e-09
iteration 486, loss 5.02578

In [105]:
class DynamicNet(torch.nn.Module):
    """Network whose depth is chosen at random on every forward pass.

    The single ``middle`` layer is applied 0 to 3 times, so its weights are
    shared between all of the hidden-to-hidden steps.
    """

    def __init__(self, D_in, H, D_out):
        """Create the input, shared middle and output linear layers.

        D_in, H and D_out are the input, hidden and output widths.
        """
        super().__init__()
        self.Linear1 = torch.nn.Linear(D_in, H)
        self.middle = torch.nn.Linear(H, H)
        self.Linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """Return the network output for the input tensor ``x``."""
        hidden = torch.clamp(self.Linear1(x), min=0)

        # random.randint is inclusive at both ends: 0, 1, 2 or 3 repetitions.
        repeats = random.randint(0, 3)
        for _ in range(repeats):
            hidden = torch.clamp(self.middle(hidden), min=0)

        return self.Linear2(hidden)

# Train DynamicNet on random data with SGD plus momentum. The network picks
# a random depth on every forward pass, so the printed loss is expected to
# jump around from one iteration to the next.

N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets.
x = torch.randn(N, D_in, dtype=datatype, device=device)
y = torch.randn(N, D_out, dtype=datatype, device=device)

# .to(device) handles both CPU and CUDA, so no device-type branch is needed.
model = DynamicNet(D_in, H, D_out).to(device)

# Summed (not averaged) squared error.
# reduction='sum' replaces the deprecated size_average=False.
loss_function = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

for iteration in range(500):
    # Forward pass (ends at the scalar loss).
    y_pred = model(x)

    loss = loss_function(y_pred, y)
    print("iteration {0}, loss {1}".format(iteration, loss.item()))

    # backward() accumulates gradients, so clear the old ones first. The
    # optimizer was built from model.parameters(), so this covers the model.
    optimizer.zero_grad()

    # Backward pass: fills .grad for the parameters used in this pass.
    loss.backward()

    # Let the optimizer apply its update rule to the parameters.
    optimizer.step()

iteration 0, loss 665.3807373046875
iteration 1, loss 667.4822998046875
iteration 2, loss 662.0380859375
iteration 3, loss 659.9161376953125
iteration 4, loss 656.7757568359375
iteration 5, loss 652.5364990234375
iteration 6, loss 658.068115234375
iteration 7, loss 690.323974609375
iteration 8, loss 644.5076904296875
iteration 9, loss 638.4566650390625
iteration 10, loss 628.17724609375
iteration 11, loss 653.489501953125
iteration 12, loss 633.30126953125
iteration 13, loss 630.4693603515625
iteration 14, loss 386.815185546875
iteration 15, loss 650.8760986328125
iteration 16, loss 649.807373046875
iteration 17, loss 295.4185791015625
iteration 18, loss 646.57666015625
iteration 19, loss 644.215087890625
iteration 20, loss 208.1729736328125
iteration 21, loss 599.1480712890625
iteration 22, loss 633.9720458984375
iteration 23, loss 628.6323852539062
iteration 24, loss 621.5452880859375
iteration 25, loss 110.98944091796875
iteration 26, loss 603.543701171875
iteration 27, loss 542.518

iteration 235, loss 1.269221305847168
iteration 236, loss 1.016634225845337
iteration 237, loss 1.99981689453125
iteration 238, loss 0.96180260181427
iteration 239, loss 1.0563240051269531
iteration 240, loss 1.7325334548950195
iteration 241, loss 1.9191982746124268
iteration 242, loss 1.783538579940796
iteration 243, loss 1.5374025106430054
iteration 244, loss 1.0737732648849487
iteration 245, loss 1.053750991821289
iteration 246, loss 0.8736110925674438
iteration 247, loss 0.8216944932937622
iteration 248, loss 0.8604941368103027
iteration 249, loss 0.8533264398574829
iteration 250, loss 0.7644381523132324
iteration 251, loss 0.7355989217758179
iteration 252, loss 0.828278660774231
iteration 253, loss 0.7325918078422546
iteration 254, loss 0.5133733153343201
iteration 255, loss 0.5911728143692017
iteration 256, loss 3.1362500190734863
iteration 257, loss 0.6509977579116821
iteration 258, loss 0.3626253604888916
iteration 259, loss 0.7949477434158325
iteration 260, loss 0.647560417652

iteration 459, loss 0.11623502522706985
iteration 460, loss 0.5585395693778992
iteration 461, loss 0.48920246958732605
iteration 462, loss 0.07855023443698883
iteration 463, loss 0.1149032860994339
iteration 464, loss 0.40988779067993164
iteration 465, loss 0.33608633279800415
iteration 466, loss 0.32324522733688354
iteration 467, loss 0.2836441397666931
iteration 468, loss 0.1806631088256836
iteration 469, loss 0.2867644727230072
iteration 470, loss 0.2633781433105469
iteration 471, loss 0.21497085690498352
iteration 472, loss 0.164661705493927
iteration 473, loss 0.31206971406936646
iteration 474, loss 0.32596883177757263
iteration 475, loss 0.9420532584190369
iteration 476, loss 0.28692981600761414
iteration 477, loss 0.2793562412261963
iteration 478, loss 0.89931321144104
iteration 479, loss 0.16659320890903473
iteration 480, loss 0.2636495530605316
iteration 481, loss 0.1808192878961563
iteration 482, loss 0.20606039464473724
iteration 483, loss 0.24417173862457275
iteration 484, 