# Learning PyTorch with Examples
Link: https://pytorch.org/tutorials/beginner/pytorch_with_examples.html

## Implement network using numpy

In [3]:
# -*- coding: utf-8 -*-
import numpy as np


# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Weights of the two linear layers, drawn from a standard normal.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6

for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = x @ w1
    h_relu = np.maximum(h, 0)
    y_pred = h_relu @ w2

    # Sum-of-squared-errors loss, printed on every iteration.
    residual = y_pred - y
    loss = np.square(residual).sum()
    print(t, loss)

    # Backward pass: apply the chain rule by hand, layer by layer.
    grad_y_pred = 2.0 * residual
    grad_w2 = h_relu.T @ grad_y_pred
    grad_h_relu = grad_y_pred @ w2.T
    # ReLU passes no gradient where its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = x.T @ grad_h

    # Plain gradient-descent step, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 25716667.52609746
1 19208177.704846527
2 16925943.95623912
3 16272030.967289142
4 15878929.02195695
5 14850644.977473713
6 12929828.898457605
7 10337548.982875062
8 7657868.410944873
9 5338349.85868625
10 3600140.0721950885
11 2400177.7480116785
12 1619074.7050179483
13 1121304.188425892
14 805270.9911233713
15 601757.1061896046
16 467371.89580057084
17 375469.57068296394
18 310064.3187146545
19 261644.77971179516
20 224405.71628490213
21 194787.32017049214
22 170621.17864025338
23 150457.99786424398
24 133372.89996470942
25 118703.46856779745
26 106082.03748094616
27 95063.85310081358
28 85413.35493955816
29 76908.65095178381
30 69380.54256433577
31 62696.69779072444
32 56741.915902059
33 51427.93130881807
34 46668.81535363033
35 42402.53191379833
36 38570.702339576565
37 35124.07463997612
38 32019.279518322994
39 29214.585801878013
40 26680.011543587676
41 24388.38941146013
42 22312.872962570338
43 20430.893371042137
44 18722.376326000613
45 17169.115307704094
46 15755.34615591324


482 5.578399992979778e-08
483 5.2745261823670356e-08
484 4.9871505277556315e-08
485 4.7153797343733485e-08
486 4.4584651921765476e-08
487 4.2156641242048154e-08
488 3.986021536097982e-08
489 3.768851781885028e-08
490 3.563530462338084e-08
491 3.3694232212118826e-08
492 3.1859245484362996e-08
493 3.012477742050367e-08
494 2.848399562587672e-08
495 2.6932562237083417e-08
496 2.546603510441858e-08
497 2.407923792464567e-08
498 2.2767986085518317e-08
499 2.152837651267707e-08


## PyTorch Tensors

In [2]:
import torch

dtype = torch.float
device = torch.device("cpu")

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random input and target tensors.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialised weights for the two linear layers.
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squared-errors loss as a Python float; report every 100th step.
    residual = y_pred - y
    loss = residual.pow(2).sum().item()
    if (t + 1) % 100 == 0:
        print(t, loss)

    # Backprop by hand: gradients of the loss with respect to w1 and w2.
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU passes no gradient where its input was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Update the weights in place with gradient descent.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

99 492.3892822265625
199 2.3734118938446045
299 0.016925575211644173
399 0.00035549444146454334
499 4.6073964767856523e-05


## PyTorch: Tensors and autograd

In [4]:
import torch

# Define dtype and device in this cell so it runs on its own instead of
# depending on variables left behind by an earlier cell.
dtype = torch.float
device = torch.device("cpu")

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs.
# requires_grad is left at its default (False), so autograd does not compute
# gradients with respect to these Tensors during the backward pass.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for the weights.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations on Tensors. These are
    # the same operations as in a hand-written forward pass, but we do not
    # need to keep references to intermediate values because we are not
    # implementing the backward pass by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute and print the loss using operations on Tensors.
    # loss is a 0-dimensional (scalar) Tensor;
    # loss.item() gets the Python number held in it.
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item())

    # Use autograd to compute the backward pass. This call computes the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call w1.grad and w2.grad are Tensors holding the gradient
    # of the loss with respect to w1 and w2 respectively.
    loss.backward()

    # Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because the weights have requires_grad=True, but we don't need to track
    # the update itself in autograd.
    # You can also use torch.optim.SGD to achieve this.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating the weights; otherwise
        # the next backward() call would accumulate into them.
        w1.grad.zero_()
        w2.grad.zero_()

99 776.7689208984375
199 5.674274444580078
299 0.06827440112829208
399 0.0012478386051952839
499 0.00010865596414078027


## PyTorch: Defining new autograd functions

In [None]:
# Module names are case-sensitive: the package is `torch`, not `Torch`.
import torch

class MyReLU(torch.autograd.Function):
    """
    We can implement our own custom autograd Functions by subclassing
    torch.autograd.Function and implementing the forawrd and backward passes
    which operate on Tensors.
    """
    
    @staticmethod
    def forward(ctx, input):
        """
        In the forward pass we receive a Tensor containg the input and return
        a Tensor containg the output. ctx is a context object that can be used
        to stash information for backward computation. You can cache arbitrary
        objects for use in the backward pass using the ctx.save_for_backward method.
        """
        ctx.save_for_backward(input)
        return input.clamp(min=0)
    
    