# Warm-up: NumPy

In [5]:
import numpy as np

# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and regression targets, both drawn from a standard normal.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# The two weight matrices of the network, also standard-normal.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0.0)
    y_pred = np.dot(h_relu, w2)

    # Sum-of-squared-errors loss, reported at every step.
    residual = y_pred - y
    loss = (residual * residual).sum()
    print(t, loss)

    # Backward pass: apply the chain rule by hand, from the loss back to
    # each weight matrix.
    grad_y_pred = 2 * residual
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU passes no gradient where its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_w1 = np.dot(x.T, grad_h)

    # Plain gradient-descent step, performed in place.
    w2 -= learning_rate * grad_w2
    w1 -= learning_rate * grad_w1


(0, 38986362.36737524)
(1, 39330040.744057506)
(2, 38925501.36495319)
(3, 32186778.727717999)
(4, 21039231.002708167)
(5, 11176063.04426901)
(6, 5527932.4716059044)
(7, 2929469.1691435063)
(8, 1805188.2310959275)
(9, 1278991.8949941718)
(10, 991650.02844230831)
(11, 807182.63237080898)
(12, 673975.69335974683)
(13, 570733.89422376081)
(14, 487679.14820602641)
(15, 419621.63509858458)
(16, 363158.46859233873)
(17, 315765.61307142826)
(18, 275689.30494595319)
(19, 241680.26648459106)
(20, 212606.85676483117)
(21, 187625.91647919672)
(22, 166072.62220773631)
(23, 147415.62453362439)
(24, 131197.98314753879)
(25, 117040.18026530847)
(26, 104645.25855273375)
(27, 93761.087578807332)
(28, 84172.452287953405)
(29, 75707.426874519995)
(30, 68218.645690902922)
(31, 61573.231239813205)
(32, 55664.971301907601)
(33, 50401.009077001174)
(34, 45699.968649760536)
(35, 41497.742489660988)
(36, 37739.137191866248)
(37, 34368.064297334102)
(38, 31335.734517819619)
(39, 28603.626366762688)
(40, 26138.64

(382, 2.7911584453167494e-05)
(383, 2.6438734430353694e-05)
(384, 2.5043501020518683e-05)
(385, 2.372277994738514e-05)
(386, 2.247173917313053e-05)
(387, 2.1286811387426955e-05)
(388, 2.0166381154723543e-05)
(389, 1.9103829617068345e-05)
(390, 1.8097480525487195e-05)
(391, 1.7144396692237284e-05)
(392, 1.6241681733917469e-05)
(393, 1.5386648641674042e-05)
(394, 1.4577952262712341e-05)
(395, 1.3810926454825134e-05)
(396, 1.3084301387213423e-05)
(397, 1.2396342927289505e-05)
(398, 1.1744677906558208e-05)
(399, 1.1127217297148403e-05)
(400, 1.0543183335504973e-05)
(401, 9.9894017814038916e-06)
(402, 9.4646935685842001e-06)
(403, 8.9676344179719704e-06)
(404, 8.4968009712715265e-06)
(405, 8.0508349554948052e-06)
(406, 7.6288405474734808e-06)
(407, 7.2287039556487146e-06)
(408, 6.8495203925433002e-06)
(409, 6.4903500593183539e-06)
(410, 6.1501557755689158e-06)
(411, 5.8277913115824798e-06)
(412, 5.5225692417032491e-06)
(413, 5.2333871739356484e-06)
(414, 4.9592697249603778e-06)
(415, 4.6995

# PyTorch: Tensors

In [9]:
import torch
# CPU single-precision tensor type used for every tensor below.
dtype = torch.FloatTensor

# Batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets.
x = torch.randn(N, D_in).type(dtype)
y = torch.randn(N, D_out).type(dtype)

# Random initial weights for the two linear layers.
w1 = torch.randn(D_in, H).type(dtype)
w2 = torch.randn(H, D_out).type(dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU (clamp at zero) -> linear.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squared-errors loss, reported at every step.
    residual = y_pred - y
    loss = residual.pow(2).sum()
    print(t, loss)

    # Backward pass written out by hand (no autograd in this cell).
    grad_y_pred = 2 * residual
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU passes no gradient where its input was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Plain gradient-descent step, performed in place.
    w2 -= learning_rate * grad_w2
    w1 -= learning_rate * grad_w1

(0, 28007829.46771729)
(1, 25044559.105969045)
(2, 28095745.912068486)
(3, 32879384.62703055)
(4, 34965164.57981189)
(5, 29948337.325250022)
(6, 20033735.8296204)
(7, 10560891.974687815)
(8, 5037777.591617608)
(9, 2490579.099401131)
(10, 1430984.3919850849)
(11, 967477.4871667605)
(12, 736978.9488108801)
(13, 600637.9946418897)
(14, 506692.65865491284)
(15, 435112.58854095393)
(16, 377475.5383432496)
(17, 329724.96695839614)
(18, 289522.6902755012)
(19, 255312.7408567321)
(20, 226012.36107113864)
(21, 200777.32697501406)
(22, 178925.73370084394)
(23, 159928.06049280148)
(24, 143368.41395951418)
(25, 128855.91909694811)
(26, 116118.77278241445)
(27, 104862.08115945845)
(28, 94891.02779679684)
(29, 86036.82664754543)
(30, 78149.96946994904)
(31, 71108.12424173063)
(32, 64808.45681314006)
(33, 59160.16002062881)
(34, 54080.55166363601)
(35, 49506.354277112376)
(36, 45378.31555845843)
(37, 41645.36112189356)
(38, 38264.57205716582)
(39, 35197.85284848527)
(40, 32412.25873647601)
(41, 29876

# PyTorch: Autograd

In [10]:
import torch
from torch.autograd import Variable

# CPU single-precision tensor type used for every tensor below.
dtype = torch.FloatTensor

# Batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and targets are plain tensors; requires_grad defaults to False, so
# autograd does not compute gradients with respect to them.
x = torch.randn(N, D_in).type(dtype)
y = torch.randn(N, D_out).type(dtype)

# Weights are leaf tensors with requires_grad=True so that loss.backward()
# populates w1.grad and w2.grad.
w1 = torch.randn(D_in, H).type(dtype).requires_grad_()
w2 = torch.randn(H, D_out).type(dtype).requires_grad_()

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU (clamp at zero) -> linear. Intermediate
    # values are not kept because autograd handles the backward pass.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Sum-of-squared-errors loss. The result is a 0-dim tensor, so indexing it
    # with [0] raises IndexError on current PyTorch; .item() extracts the
    # Python float.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Backward pass: fills w1.grad and w2.grad.
    loss.backward()

    # The parameter update and gradient reset must not be recorded in the
    # autograd graph, hence torch.no_grad().
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Gradients accumulate across backward() calls; clear them for the
        # next iteration.
        w1.grad.zero_()
        w2.grad.zero_()


(0, 31730022.0)
(1, 27815458.0)
(2, 30755278.0)
(3, 34956920.0)
(4, 35131272.0)
(5, 27757618.0)
(6, 16939354.0)
(7, 8352010.5)
(8, 3923795.75)
(9, 2011998.25)
(10, 1220968.875)
(11, 859437.1875)
(12, 665399.9375)
(13, 541787.3125)
(14, 452658.25)
(15, 383601.375)
(16, 327963.625)
(17, 282167.15625)
(18, 243952.046875)
(19, 211772.796875)
(20, 184505.28125)
(21, 161278.265625)
(22, 141413.921875)
(23, 124350.1953125)
(24, 109652.6015625)
(25, 96918.34375)
(26, 85885.2578125)
(27, 76282.7734375)
(28, 67895.125)
(29, 60544.78125)
(30, 54093.6015625)
(31, 48415.1953125)
(32, 43409.2890625)
(33, 38978.48828125)
(34, 35052.3515625)
(35, 31568.33984375)
(36, 28465.779296875)
(37, 25702.705078125)
(38, 23235.1015625)
(39, 21028.3984375)
(40, 19052.419921875)
(41, 17285.595703125)
(42, 15703.666015625)
(43, 14280.0)
(44, 12997.76171875)
(45, 11841.228515625)
(46, 10797.232421875)
(47, 9853.5390625)
(48, 8999.3818359375)
(49, 8225.4794921875)
(50, 7523.90576171875)
(51, 6887.32080078125)
(52, 63

(338, 0.0005457549705170095)
(339, 0.0005260838661342859)
(340, 0.0005089634214527905)
(341, 0.000492334773298353)
(342, 0.0004749481158796698)
(343, 0.000459071685327217)
(344, 0.0004441225028131157)
(345, 0.0004314403922762722)
(346, 0.00041716566192917526)
(347, 0.00040449219522997737)
(348, 0.00039178706356324255)
(349, 0.00037846859777346253)
(350, 0.00036638774327002466)
(351, 0.00035502284299582243)
(352, 0.0003448361239861697)
(353, 0.0003343290591146797)
(354, 0.00032534374622628093)
(355, 0.0003157648607157171)
(356, 0.0003063408948946744)
(357, 0.000297821854474023)
(358, 0.00028901867335662246)
(359, 0.0002805046387948096)
(360, 0.00027366820722818375)
(361, 0.0002649143279995769)
(362, 0.00025787443155422807)
(363, 0.00025016992003656924)
(364, 0.00024350982857868075)
(365, 0.00023717456497251987)
(366, 0.00023104243155103177)
(367, 0.0002258180029457435)
(368, 0.00021930929506197572)
(369, 0.0002131040528183803)
(370, 0.0002070568734779954)
(371, 0.00020245049381628633)
(

# PyTorch: Defining new autograd functions

In [21]:
import torch
from torch.autograd import Variable

class MyReLU(torch.autograd.Function):
    """Custom autograd function implementing ReLU.

    Written in the static-method style: ``forward`` and ``backward`` take a
    context object ``ctx`` instead of ``self``, and the function is invoked
    through ``MyReLU.apply(tensor)``. Calling an instance of a Function
    subclass (the legacy style) raises an error on current PyTorch.
    """

    @staticmethod
    def forward(ctx, input):
        """Return ``input`` clamped below at zero.

        The input is stashed on ``ctx`` because ``backward`` needs it to know
        where the gradient must be blocked.
        """
        ctx.save_for_backward(input)
        return input.clamp(min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient of the loss with respect to the input.

        ``grad_output`` is the gradient with respect to the output; it is
        passed through unchanged except where the saved input was negative,
        where it is set to zero.
        """
        input, = ctx.saved_tensors
        # Clone so the incoming gradient tensor is not modified in place.
        grad_input = grad_output.clone()
        grad_input[input < 0] = 0
        return grad_input

# CPU single-precision tensor type used for every tensor below.
dtype = torch.FloatTensor

# Batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and targets: plain tensors, not tracked by autograd.
x = torch.randn(N, D_in).type(dtype)
y = torch.randn(N, D_out).type(dtype)

# Weights: leaf tensors whose gradients are populated by loss.backward().
w1 = torch.randn(D_in, H).type(dtype).requires_grad_()
w2 = torch.randn(H, D_out).type(dtype).requires_grad_()

# Function.apply is the entry point for custom autograd functions; it does
# not change between iterations, so bind it once outside the loop.
relu = MyReLU.apply

learning_rate = 1e-6
for t in range(500):
    # Forward pass using the custom ReLU between the two matrix products.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Sum-of-squared-errors loss; .item() extracts the Python float from the
    # 0-dim tensor (indexing it with [0] raises IndexError).
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Backward pass: runs MyReLU.backward as part of the graph.
    loss.backward()

    # Update and gradient reset must not be recorded by autograd.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Gradients accumulate across backward() calls; clear them.
        w1.grad.zero_()
        w2.grad.zero_()


(0, 25245832.0)
(1, 19660758.0)
(2, 16933136.0)
(3, 14903144.0)
(4, 12814985.0)
(5, 10450025.0)
(6, 8096123.0)
(7, 5969415.0)
(8, 4281296.0)
(9, 3021698.5)
(10, 2142891.5)
(11, 1539071.375)
(12, 1130975.25)
(13, 852759.0)
(14, 661374.9375)
(15, 526513.8125)
(16, 429264.21875)
(17, 357109.5)
(18, 302123.0)
(19, 259118.109375)
(20, 224713.578125)
(21, 196626.28125)
(22, 173295.71875)
(23, 153632.796875)
(24, 136866.046875)
(25, 122437.515625)
(26, 109913.484375)
(27, 98973.7265625)
(28, 89366.9921875)
(29, 80889.703125)
(30, 73382.1484375)
(31, 66714.6953125)
(32, 60773.1875)
(33, 55469.62890625)
(34, 50719.92578125)
(35, 46446.58984375)
(36, 42603.37109375)
(37, 39132.23046875)
(38, 35994.60546875)
(39, 33149.52734375)
(40, 30564.1640625)
(41, 28213.103515625)
(42, 26070.044921875)
(43, 24112.958984375)
(44, 22323.814453125)
(45, 20685.98828125)
(46, 19184.830078125)
(47, 17806.603515625)
(48, 16540.3359375)
(49, 15376.08984375)
(50, 14303.220703125)
(51, 13314.4267578125)
(52, 12402.23

(380, 0.0007653833017684519)
(381, 0.0007401083130389452)
(382, 0.0007157154614105821)
(383, 0.0006930098752491176)
(384, 0.0006705291452817619)
(385, 0.0006479655276052654)
(386, 0.000626409484539181)
(387, 0.0006068003131076694)
(388, 0.0005880823591724038)
(389, 0.0005695121944881976)
(390, 0.0005514355143532157)
(391, 0.0005341683281585574)
(392, 0.0005164106842130423)
(393, 0.0005026987055316567)
(394, 0.0004879590414930135)
(395, 0.00047222350258380175)
(396, 0.00045821312232874334)
(397, 0.0004445304803084582)
(398, 0.00043066643411293626)
(399, 0.0004193939676042646)
(400, 0.0004076658806297928)
(401, 0.0003962367190979421)
(402, 0.0003847795887850225)
(403, 0.0003741444961633533)
(404, 0.0003628054400905967)
(405, 0.0003519348974805325)
(406, 0.00034277880331501365)
(407, 0.000334023789037019)
(408, 0.00032445701071992517)
(409, 0.0003152413701172918)
(410, 0.00030761086964048445)
(411, 0.0002986263716593385)
(412, 0.00029074324993416667)
(413, 0.000283257249975577)
(414, 0.00

# nn module

In [26]:
import torch
from torch.autograd import Variable

# Batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets (plain tensors; no gradients needed).
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Two-layer network assembled from nn building blocks. Each Linear layer
# owns its weight and bias parameters.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# Summed (not averaged) squared error. reduction='sum' is the current
# spelling of the deprecated size_average=False.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
    # Forward pass and loss.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    # loss is a 0-dim tensor; .item() extracts the Python float (indexing it
    # with [0] raises IndexError).
    print(t, loss.item())

    # Clear gradients left over from the previous iteration, then
    # backpropagate.
    model.zero_grad()
    loss.backward()

    # Manual gradient-descent step over every parameter of the model; the
    # update itself must not be recorded by autograd.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad


(0, 730.5633544921875)
(1, 672.9207153320312)
(2, 624.10546875)
(3, 581.7683715820312)
(4, 544.5946655273438)
(5, 511.5589294433594)
(6, 481.8892822265625)
(7, 454.7994689941406)
(8, 429.915771484375)
(9, 406.9998474121094)
(10, 385.4917907714844)
(11, 365.3119812011719)
(12, 346.3130187988281)
(13, 328.2848205566406)
(14, 311.1324157714844)
(15, 294.7282409667969)
(16, 279.0507507324219)
(17, 264.04736328125)
(18, 249.75721740722656)
(19, 236.03733825683594)
(20, 222.95559692382812)
(21, 210.43373107910156)
(22, 198.49270629882812)
(23, 187.12380981445312)
(24, 176.29823303222656)
(25, 165.97860717773438)
(26, 156.2010498046875)
(27, 146.93223571777344)
(28, 138.13050842285156)
(29, 129.78221130371094)
(30, 121.89208221435547)
(31, 114.45233917236328)
(32, 107.44840240478516)
(33, 100.85572052001953)
(34, 94.65180206298828)
(35, 88.81632995605469)
(36, 83.34194946289062)
(37, 78.20233154296875)
(38, 73.39117431640625)
(39, 68.88695526123047)
(40, 64.66492462158203)
(41, 60.72470474243

(352, 0.00024653278524056077)
(353, 0.0002387069835094735)
(354, 0.0002311355492565781)
(355, 0.00022379470465239137)
(356, 0.0002166977501474321)
(357, 0.00020983432477805763)
(358, 0.00020319079339969903)
(359, 0.00019675481598824263)
(360, 0.00019053200958296657)
(361, 0.00018450600327923894)
(362, 0.00017866575217340142)
(363, 0.00017301902698818594)
(364, 0.0001675548410275951)
(365, 0.00016226961452048272)
(366, 0.0001571547909406945)
(367, 0.0001521954545751214)
(368, 0.00014739389007445425)
(369, 0.00014274877321440727)
(370, 0.0001382517657475546)
(371, 0.00013390443928074092)
(372, 0.00012968509690836072)
(373, 0.00012560620962176472)
(374, 0.00012165808584541082)
(375, 0.00011783840454882011)
(376, 0.00011413810716476291)
(377, 0.0001105606061173603)
(378, 0.00010708235640777275)
(379, 0.00010372293036198243)
(380, 0.00010047249816125259)
(381, 9.732401667861268e-05)
(382, 9.427309123566374e-05)
(383, 9.131967090070248e-05)
(384, 8.846150012686849e-05)
(385, 8.56907645356841

In [29]:
import torch
from torch.autograd import Variable

# Batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets (plain tensors; no gradients needed).
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Two-layer network assembled from nn building blocks.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# Summed (not averaged) squared error. reduction='sum' is the current
# spelling of the deprecated size_average=False.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
# The optimizer is handed the model's parameters and updates them on step().
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass and loss.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    # loss is a 0-dim tensor; .item() extracts the Python float (indexing it
    # with [0] raises IndexError).
    print(t, loss.item())

    # Clear old gradients, backpropagate, then let Adam update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


(0, 711.1709594726562)
(1, 694.0101318359375)
(2, 677.3471069335938)
(3, 661.1732788085938)
(4, 645.435546875)
(5, 630.02734375)
(6, 615.035888671875)
(7, 600.5211791992188)
(8, 586.44580078125)
(9, 572.840576171875)
(10, 559.5697021484375)
(11, 546.6441650390625)
(12, 534.0801391601562)
(13, 521.9061889648438)
(14, 510.00567626953125)
(15, 498.5125732421875)
(16, 487.4322814941406)
(17, 476.6235656738281)
(18, 466.06134033203125)
(19, 455.7318115234375)
(20, 445.6510314941406)
(21, 435.8425598144531)
(22, 426.255615234375)
(23, 416.9244384765625)
(24, 407.85284423828125)
(25, 399.0084228515625)
(26, 390.3917541503906)
(27, 381.9842224121094)
(28, 373.82318115234375)
(29, 365.81890869140625)
(30, 357.97601318359375)
(31, 350.29315185546875)
(32, 342.79193115234375)
(33, 335.4647521972656)
(34, 328.3249206542969)
(35, 321.3110656738281)
(36, 314.44390869140625)
(37, 307.7537536621094)
(38, 301.20452880859375)
(39, 294.76593017578125)
(40, 288.4307861328125)
(41, 282.20489501953125)
(42,

(352, 0.000980472774244845)
(353, 0.000929699104744941)
(354, 0.0008814677712507546)
(355, 0.0008356510079465806)
(356, 0.0007921638316474855)
(357, 0.000750879873521626)
(358, 0.0007117059431038797)
(359, 0.0006744691054336727)
(360, 0.0006391283241100609)
(361, 0.0006056174752302468)
(362, 0.0005738078034482896)
(363, 0.0005436266656033695)
(364, 0.0005149595672264695)
(365, 0.00048777335905469954)
(366, 0.0004619702813215554)
(367, 0.00043748828466050327)
(368, 0.0004142879042774439)
(369, 0.000392253918107599)
(370, 0.0003713658661581576)
(371, 0.0003515679854899645)
(372, 0.0003327817830722779)
(373, 0.00031497233430854976)
(374, 0.0002980835852213204)
(375, 0.00028207703144289553)
(376, 0.00026690028607845306)
(377, 0.0002525221789255738)
(378, 0.00023889071599114686)
(379, 0.00022598479699809104)
(380, 0.00021373998606577516)
(381, 0.00020213219977449626)
(382, 0.00019115087343379855)
(383, 0.00018073941464535892)
(384, 0.00017088190361391753)
(385, 0.00016154161130543798)
(386,

# Custom nn Modules

In [34]:
import torch
from torch.autograd import Variable

class TwoLayerNet(torch.nn.Module):
    """Fully connected network: Linear -> ReLU -> Linear."""

    def __init__(self, D_in, H, D_out):
        """Create the two linear layers.

        D_in, H and D_out are the input, hidden and output feature sizes.
        """
        super(TwoLayerNet, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """Map input ``x`` to a prediction.

        The first layer's output is clamped below at zero (ReLU) before
        being passed to the second layer.
        """
        hidden = torch.clamp(self.linear1(x), min=0)
        return self.linear2(hidden)

# Batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets (plain tensors; no gradients needed).
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = TwoLayerNet(D_in, H, D_out)

# Summed (not averaged) squared error. reduction='sum' is the current
# spelling of the deprecated size_average=False.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
for t in range(500):
    # Forward pass and loss.
    y_pred = model(x)
    loss = criterion(y_pred, y)
    # loss is a 0-dim tensor; .item() extracts the Python float (indexing it
    # with [0] raises IndexError).
    print(t, loss.item())

    # Clear old gradients, backpropagate, then take an SGD step.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

(0, 676.0763549804688)
(1, 621.9150390625)
(2, 575.5523071289062)
(3, 534.9147338867188)
(4, 499.2742004394531)
(5, 467.7075500488281)
(6, 439.2510986328125)
(7, 413.10791015625)
(8, 388.96856689453125)
(9, 366.5581359863281)
(10, 345.6530456542969)
(11, 326.0626525878906)
(12, 307.6127014160156)
(13, 290.2270812988281)
(14, 273.7981262207031)
(15, 258.2350158691406)
(16, 243.4474334716797)
(17, 229.4440155029297)
(18, 216.16891479492188)
(19, 203.5167236328125)
(20, 191.51776123046875)
(21, 180.09994506835938)
(22, 169.2792510986328)
(23, 158.9963836669922)
(24, 149.22824096679688)
(25, 140.00611877441406)
(26, 131.27401733398438)
(27, 123.01776885986328)
(28, 115.22220611572266)
(29, 107.89202880859375)
(30, 100.998046875)
(31, 94.5133285522461)
(32, 88.4368896484375)
(33, 82.72191619873047)
(34, 77.37397766113281)
(35, 72.34632110595703)
(36, 67.63880920410156)
(37, 63.232505798339844)
(38, 59.11809158325195)
(39, 55.27109146118164)
(40, 51.685813903808594)
(41, 48.332977294921875)


(459, 5.894109563087113e-06)
(460, 5.7245351854362525e-06)
(461, 5.562000296777114e-06)
(462, 5.402866008807905e-06)
(463, 5.248493380349828e-06)
(464, 5.097852408653125e-06)
(465, 4.953360075887758e-06)
(466, 4.812246970686829e-06)
(467, 4.6747491069254465e-06)
(468, 4.5421174945659e-06)
(469, 4.413220267451834e-06)
(470, 4.287156571081141e-06)
(471, 4.16698821936734e-06)
(472, 4.0478125811205246e-06)
(473, 3.932875642931322e-06)
(474, 3.8217608562263194e-06)
(475, 3.713818159667426e-06)
(476, 3.6081851249036845e-06)
(477, 3.506150733301183e-06)
(478, 3.4070135370711796e-06)
(479, 3.310021611468983e-06)
(480, 3.2166481105377898e-06)
(481, 3.125898729194887e-06)
(482, 3.038237537111854e-06)
(483, 2.951395345007768e-06)
(484, 2.8680592549790163e-06)
(485, 2.7874327770405216e-06)
(486, 2.708780357352225e-06)
(487, 2.632668383739656e-06)
(488, 2.558781488914974e-06)
(489, 2.486708353899303e-06)
(490, 2.4162818590411916e-06)
(491, 2.34894855566381e-06)
(492, 2.2828505734651117e-06)
(493, 2

# PyTorch: Control flow and weight sharing

In [37]:
import random
import torch
from torch.autograd import Variable

class DynamicNet(torch.nn.Module):
    """Network whose depth is chosen at random on every forward pass.

    The same ``middle_linear`` layer is applied between zero and three times,
    so its weights are shared across all of those applications.
    """

    def __init__(self, D_in, H, D_out):
        """Create the input, (shared) middle and output linear layers."""
        super(DynamicNet, self).__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """Map input ``x`` to a prediction through a randomly deep network.

        Every linear layer except the last is followed by a clamp at zero
        (ReLU).
        """
        hidden = torch.clamp(self.input_linear(x), min=0)
        # Draw the number of middle-layer applications once per call.
        remaining = random.randint(0, 3)
        while remaining > 0:
            hidden = torch.clamp(self.middle_linear(hidden), min=0)
            remaining -= 1
        return self.output_linear(hidden)

# Batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets (plain tensors; no gradients needed).
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = DynamicNet(D_in, H, D_out)

# Summed (not averaged) squared error. reduction='sum' is the current
# spelling of the deprecated size_average=False.
criterion = torch.nn.MSELoss(reduction='sum')
# SGD with momentum.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
for t in range(500):
    # Forward pass and loss.
    y_pred = model(x)
    loss = criterion(y_pred, y)
    # loss is a 0-dim tensor; .item() extracts the Python float (indexing it
    # with [0] raises IndexError).
    print(t, loss.item())

    # Clear old gradients, backpropagate, then take an optimizer step.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

(0, 655.9651489257812)
(1, 630.1058349609375)
(2, 614.493408203125)
(3, 608.3214111328125)
(4, 609.5244750976562)
(5, 479.3810119628906)
(6, 606.03271484375)
(7, 602.8109741210938)
(8, 378.9876708984375)
(9, 341.65826416015625)
(10, 594.1591186523438)
(11, 546.1640625)
(12, 587.2954711914062)
(13, 527.1539916992188)
(14, 594.138671875)
(15, 495.5043029785156)
(16, 173.0673065185547)
(17, 151.74720764160156)
(18, 431.459716796875)
(19, 404.2012939453125)
(20, 529.502685546875)
(21, 513.1823120117188)
(22, 92.0029296875)
(23, 470.692626953125)
(24, 444.1133117675781)
(25, 507.60858154296875)
(26, 222.3856964111328)
(27, 203.6152801513672)
(28, 425.277099609375)
(29, 295.9630126953125)
(30, 360.4263000488281)
(31, 218.7469024658203)
(32, 188.86070251464844)
(33, 288.022216796875)
(34, 100.0836181640625)
(35, 71.46471405029297)
(36, 48.1985969543457)
(37, 33.319183349609375)
(38, 264.7107238769531)
(39, 194.0807647705078)
(40, 197.9416046142578)
(41, 150.7322235107422)
(42, 148.48110961914

(357, 0.8609052300453186)
(358, 0.78895103931427)
(359, 0.7365065813064575)
(360, 0.5993508100509644)
(361, 0.7910258769989014)
(362, 0.15316878259181976)
(363, 0.22681356966495514)
(364, 1.1316311359405518)
(365, 1.1681677103042603)
(366, 0.7377431988716125)
(367, 0.5642670392990112)
(368, 0.8003160953521729)
(369, 0.8176155090332031)
(370, 0.5821180939674377)
(371, 0.7655678391456604)
(372, 0.2349797487258911)
(373, 0.8141429424285889)
(374, 0.16246455907821655)
(375, 0.1352093666791916)
(376, 0.10409339517354965)
(377, 0.7716462016105652)
(378, 0.7879191637039185)
(379, 0.5711411833763123)
(380, 0.5950185060501099)
(381, 0.3217116594314575)
(382, 0.6263071298599243)
(383, 0.5220919847488403)
(384, 0.5817577838897705)
(385, 0.1425199955701828)
(386, 0.10114362835884094)
(387, 0.08002573996782303)
(388, 0.08356775343418121)
(389, 0.09368228912353516)
(390, 1.312788724899292)
(391, 0.055565524846315384)
(392, 0.5186153054237366)
(393, 1.0046272277832031)
(394, 0.9385887980461121)
(395,