In [None]:
%matplotlib inline


PyTorch: Tensors
----------------

A fully-connected ReLU network with one hidden layer and no biases, trained to
predict y from x by minimizing squared Euclidean distance.

This implementation uses PyTorch tensors to manually compute the forward pass,
loss, and backward pass.

A PyTorch Tensor is conceptually the same as a numpy array: a generic
n-dimensional array to be used for arbitrary numeric computation. Tensors can
also track computational graphs and gradients, but this example does not use
any of that; it treats Tensors purely as numeric arrays.

The biggest difference between a numpy array and a PyTorch Tensor is that
a PyTorch Tensor can run on either CPU or GPU. To run operations on the GPU,
create the Tensor on a CUDA device (for example by passing
``device=torch.device("cuda:0")``), as the ``device`` variable below does.



In [2]:
import torch


dtype = torch.float
device = torch.device("cpu")
#device = torch.device("cuda:0") # Uncomment this to run on GPU

# Seed the global random number generator so that the random data and weights
# created below -- and therefore the printed losses -- are repeatable when the
# notebook is re-run from a fresh kernel on the same machine and PyTorch build.
SEED = 0
torch.manual_seed(SEED)

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = torch.randn(N, D_in, device=device, dtype=dtype)   # inputs,  shape (N, D_in)
y = torch.randn(N, D_out, device=device, dtype=dtype)  # targets, shape (N, D_out)

# Randomly initialize weights (no bias terms in this network)
w1 = torch.randn(D_in, H, device=device, dtype=dtype)   # input  -> hidden
w2 = torch.randn(H, D_out, device=device, dtype=dtype)  # hidden -> output

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y
    h = x.mm(w1)               # hidden pre-activation, shape (N, H)
    h_relu = h.clamp(min=0)    # ReLU: negative entries become 0
    y_pred = h_relu.mm(w2)     # predictions, shape (N, D_out)

    # Compute and print loss (sum of squared errors); .item() converts the
    # 0-dim tensor into a plain Python float.
    loss = (y_pred - y).pow(2).sum().item()
    print(t, loss)

    # Backprop by hand: compute gradients of the loss with respect to w1 and
    # w2 by applying the chain rule backwards through the forward pass.
    grad_y_pred = 2.0 * (y_pred - y)          # d(loss)/d(y_pred)
    grad_w2 = h_relu.t().mm(grad_y_pred)      # d(loss)/d(w2), shape (H, D_out)
    grad_h_relu = grad_y_pred.mm(w2.t())      # d(loss)/d(h_relu), shape (N, H)
    # ReLU passes gradient only where its input was not negative; clone first
    # so the masking below does not modify grad_h_relu in place.
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)                # d(loss)/d(w1), shape (D_in, H)

    # Update weights using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

(0, 29718010.0)
(1, 23124498.0)
(2, 21245900.0)
(3, 20435218.0)
(4, 18944624.0)
(5, 16079093.0)
(6, 12329544.0)
(7, 8603896.0)
(8, 5646710.0)
(9, 3608432.25)
(10, 2331985.25)
(11, 1561815.25)
(12, 1100489.375)
(13, 817215.875)
(14, 636013.125)
(15, 513838.46875)
(16, 426681.71875)
(17, 361299.90625)
(18, 310192.875)
(19, 268970.96875)
(20, 234962.828125)
(21, 206435.296875)
(22, 182203.375)
(23, 161451.640625)
(24, 143531.875)
(25, 127973.7265625)
(26, 114409.4921875)
(27, 102548.5390625)
(28, 92140.53125)
(29, 82958.9609375)
(30, 74836.40625)
(31, 67632.25)
(32, 61233.4296875)
(33, 55534.00390625)
(34, 50442.85546875)
(35, 45885.89453125)
(36, 41801.19921875)
(37, 38132.88671875)
(38, 34829.06640625)
(39, 31848.9453125)
(40, 29158.599609375)
(41, 26724.734375)
(42, 24518.154296875)
(43, 22515.10546875)
(44, 20694.56640625)
(45, 19037.87890625)
(46, 17528.892578125)
(47, 16152.6640625)
(48, 14895.916015625)
(49, 13750.0185546875)
(50, 12701.326171875)
(51, 11740.4521484375)
(52, 10859.

(411, 0.0003046701895073056)
(412, 0.0002964297018479556)
(413, 0.0002896718215197325)
(414, 0.00028256591758690774)
(415, 0.0002763792290352285)
(416, 0.0002702175115700811)
(417, 0.00026265886845067143)
(418, 0.0002572580415289849)
(419, 0.00025102810468524694)
(420, 0.0002448788727633655)
(421, 0.00023956134100444615)
(422, 0.00023397729091811925)
(423, 0.00022859744785819203)
(424, 0.00022345957404468209)
(425, 0.00021863482834305614)
(426, 0.00021413002104964107)
(427, 0.00020906623103655875)
(428, 0.00020428132847882807)
(429, 0.00019950440037064254)
(430, 0.00019547883130144328)
(431, 0.00019182401592843235)
(432, 0.0001870947307907045)
(433, 0.000182921823579818)
(434, 0.00017894267512019724)
(435, 0.000174925837200135)
(436, 0.00017209304496645927)
(437, 0.00016796549607533962)
(438, 0.0001653602230362594)
(439, 0.00016160414088517427)
(440, 0.00015785083814989775)
(441, 0.000154903216753155)
(442, 0.00015163284842856228)
(443, 0.0001487803820054978)
(444, 0.000145506186527200