In [0]:
# Install Pytorch.
from os import path
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())

accelerator = 'cu80' if path.exists('/opt/bin/nvidia-smi') else 'cpu'

!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.3.0.post4-{platform}-linux_x86_64.whl torchvision
import torch

In [0]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline


PyTorch: Tensors
----------------

A fully-connected ReLU network with one hidden layer and no biases, trained to
predict y from x by minimizing squared Euclidean distance.

This implementation uses PyTorch tensors to manually compute the forward pass,
loss, and backward pass.

A PyTorch Tensor is basically the same as a numpy array: it does not know
anything about deep learning or computational graphs or gradients, and is just
a generic n-dimensional array to be used for arbitrary numeric computation.

The biggest difference between a numpy array and a PyTorch Tensor is that
a PyTorch Tensor can run on either CPU or GPU. To run operations on the GPU,
just cast the Tensor to a cuda datatype (for example, torch.cuda.FloatTensor).



In [4]:
# Train a two-layer fully-connected ReLU network (no biases) on random data,
# computing the forward pass, loss, and gradients by hand with plain Tensors.

# Use the CUDA tensor type when a GPU is usable; otherwise fall back to the
# CPU tensor type so this cell does not crash on a CPU-only runtime.
dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor

# N: batch size, D_in: input dim, H: hidden dim, D_out: output dim
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = torch.randn(N, D_in).type(dtype)
y = torch.randn(N, D_out).type(dtype)

# Randomly initialize weights
w1 = torch.randn(D_in, H).type(dtype)
w2 = torch.randn(H, D_out).type(dtype)

learning_rate = 1e-6

for t in range(500):
    # Forward pass: y_pred = relu(x . w1) . w2
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # Loss is the sum of squared errors over the whole batch.
    loss = (y_pred - y).pow(2).sum()
    # float() prints a plain number whether .sum() yields a Python scalar
    # or a 0-dim tensor.
    print(t, float(loss))

    # Backprop to compute gradients of the loss with respect to w1 and w2
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    # ReLU passes gradient only where its input was non-negative; clone first
    # so grad_h_relu is not modified by the in-place masking.
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # Update weights in place using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 24425122.0
1 18589966.0
2 18383284.0
3 21087264.0
4 24708448.0
5 26909268.0
6 25183740.0
7 19585784.0
8 12592527.0
9 7100060.0
10 3792593.0
11 2091465.0
12 1258218.375
13 844982.8125
14 625636.875
15 497134.1875
16 413073.4375
17 352516.875
18 305679.75
19 267661.1875
20 235838.875
21 208736.484375
22 185398.375
23 165145.625
24 147463.546875
25 131974.84375
26 118342.8515625
27 106322.984375
28 95712.03125
29 86311.3984375
30 77959.0625
31 70543.0859375
32 63921.26171875
33 57997.50390625
34 52690.33984375
35 47934.20703125
36 43660.8515625
37 39814.12890625
38 36345.91796875
39 33214.1953125
40 30381.9921875
41 27816.69921875
42 25492.015625
43 23383.451171875
44 21467.970703125
45 19728.16015625
46 18144.03515625
47 16701.9296875
48 15387.021484375
49 14186.6787109375
50 13089.7919921875
51 12085.3828125
52 11165.9091796875
53 10322.412109375
54 9548.865234375
55 8838.34765625
56 8185.765625
57 7585.7451171875
58 7033.21484375
59 6524.78369140625
60 6056.19775390625
61 5624.436523