# 1. Implement a two-layer network with numpy

In [5]:
# 1. generate a 2-layer network with numpy
import numpy as np

# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic inputs and regression targets, sampled from a standard normal.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Weights of the two bias-free linear layers, sampled from a standard normal.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for step in range(10):
    # Forward pass: linear -> ReLU -> linear.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum-of-squared-errors loss over the whole batch.
    residual = y_pred - y
    loss = np.square(residual).sum()
    print(step, loss)

    # Backward pass: apply the chain rule by hand, starting at the output.
    grad_y_pred = 2.0 * residual
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU blocks the gradient wherever its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Gradient-descent step, applied to the weight arrays in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 32897865.442742407
1 31483252.777447995
2 30956974.294953004
3 26765111.66923166
4 19388711.159411073
5 11658098.506877884
6 6393968.990223771
7 3494521.3523915857
8 2086107.5829685542
9 1392892.8611802687


# 

# 2. Implement a two-layer network with Pytorch

In [18]:
# 2. implement a two-layer network with Pytorch
import torch

# Choose where the tensors live: CPU here; swap to the commented line for GPU.
device = torch.device('cpu')
# device = torch.device('cuda')

# N: batch size, D_in: input features, H: hidden units, D_out: output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic inputs and regression targets, sampled from a standard normal.
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Weights of the two bias-free linear layers, sampled from a standard normal.
w1 = torch.randn(D_in, H, device=device)
w2 = torch.randn(H, D_out, device=device)

learning_rate = 1e-6
for step in range(10):
    # Forward pass: linear -> ReLU -> linear.
    h = torch.mm(x, w1)
    # clamp(min=0) bounds every element from below at 0, i.e. it is a ReLU.
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squared-errors loss over the whole batch.
    residual = y_pred - y
    loss = residual.pow(2).sum()
    # Tensor.item() returns the value of a one-element Tensor as a plain
    # Python number (a float here).
    print(step, loss.item())

    # Backward pass: apply the chain rule by hand, starting at the output.
    grad_y_pred = 2.0 * residual
    # Tensor.t() transposes a 2-D tensor.
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU blocks the gradient wherever its input was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Gradient-descent step, applied to the weight tensors in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 26278576.0
1 24012896.0
2 27376562.0
3 32710462.0
4 35686628.0
5 31511208.0
6 21584854.0
7 11561235.0
8 5491053.5
9 2666031.75


# 3. Autograd in Pytorch
When the forward pass is executed, a computational graph is built. Nodes in the graph are Tensors, and edges are functions that produce output Tensors from input Tensors.
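To make autograd track a Tensor, create it with `requires_grad=True`, e.g. `a = torch.tensor([1., 2., 3.], requires_grad=True)` (only floating-point Tensors can require gradients). After calling `backward()` on a scalar result, `a.grad` will be a Tensor that holds the gradient of that result with respect to `a`.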

In [11]:
# 3. implement a two-layer network with auto-grad
import torch

# Choose where the tensors live: CPU here; swap to the commented line for GPU.
device = torch.device('cpu')
# device = torch.device('cuda')

# N: batch size, D_in: input features, H: hidden units, D_out: output features.
N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# requires_grad=True tells autograd to record operations on these tensors so
# that gradients with respect to them can be computed by backward().
w1 = torch.randn(D_in, H, requires_grad=True, device=device)
w2 = torch.randn(H, D_out, requires_grad=True, device=device)

learning_rate = 1e-6
for step in range(100):
    # Forward pass: linear -> ReLU -> linear. No intermediate values need to
    # be kept by hand; autograd records what it needs.
    hidden = torch.clamp(torch.mm(x, w1), min=0)
    y_pred = torch.mm(hidden, w2)

    loss = (y_pred - y).pow(2).sum()
    print(step, loss.item())

    # backward() computes the gradient of loss for every tensor created with
    # requires_grad=True; afterwards w1.grad and w2.grad hold those gradients.
    loss.backward()

    # The weight update itself must not be recorded in the computational
    # graph, so it runs under torch.no_grad().
    with torch.no_grad():
        for weight in (w1, w2):
            weight -= learning_rate * weight.grad
            # Gradients accumulate across backward() calls and are never
            # cleared automatically, so reset them after each update.
            weight.grad.zero_()



0 33327216.0
1 29294718.0
2 26060116.0
3 21083236.0
4 15052104.0
5 9628452.0
6 5851244.5
7 3593964.75
8 2341618.5
9 1645527.125
10 1239974.0
11 985208.1875
12 810859.625
13 682920.1875
14 583901.1875
15 504268.9375
16 438884.5
17 384362.8125
18 338230.40625
19 298880.625
20 265072.5625
21 235857.40625
22 210491.65625
23 188371.78125
24 169008.140625
25 152022.046875
26 137076.0
27 123855.9375
28 112125.4765625
29 101694.328125
30 92398.9140625
31 84096.4296875
32 76657.0078125
33 69980.421875
34 63974.2578125
35 58561.03515625
36 53676.01953125
37 49272.2734375
38 45284.71484375
39 41666.07421875
40 38376.41796875
41 35383.7734375
42 32658.07421875
43 30170.47265625
44 27900.673828125
45 25824.5859375
46 23922.69140625
47 22177.962890625
48 20577.484375
49 19106.224609375
50 17753.818359375
51 16508.3359375
52 15360.6318359375
53 14302.2265625
54 13326.380859375
55 12425.537109375
56 11592.2373046875
57 10821.3125
58 10107.544921875
59 9445.5732421875
60 8831.779296875
61 8262.04394531

# 4. Define custom autograd functions

In [14]:
# 4. define modified autograd functions
import torch


class MyReLU(torch.autograd.Function):
    """
    Hand-written ReLU expressed as a custom autograd Function.

    Subclassing torch.autograd.Function and providing static forward and
    backward methods that operate on Tensors lets autograd differentiate
    through this operation. Invoke it with MyReLU.apply(tensor).
    """

    @staticmethod
    def forward(ctx, x):
        """
        Compute max(x, 0) element-wise.

        ctx is the context object shared with backward; the input Tensor is
        stored on it because backward needs to know where it was negative.
        Returns a Tensor containing the output.
        """
        ctx.save_for_backward(x)
        return torch.clamp(x, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Propagate the gradient through the ReLU.

        grad_output is the gradient of the loss with respect to the output of
        forward. The returned Tensor is the gradient of the loss with respect
        to the input of forward: a copy of grad_output with zeros wherever the
        saved input was negative.
        """
        (saved_input,) = ctx.saved_tensors
        # masked_fill returns a new Tensor, so grad_output itself is untouched.
        return grad_output.masked_fill(saved_input < 0, 0)


# Choose where the tensors live: CPU here; swap to the commented line for GPU.
device = torch.device('cpu')
# device = torch.device('cuda') # Uncomment this to run on GPU

# N: batch size, D_in: input features, H: hidden units, D_out: output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic inputs and regression targets.
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Trainable weights; requires_grad=True makes autograd track them.
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

learning_rate = 1e-6
for step in range(10):
    # Forward pass: the hidden activation goes through the custom ReLU, which
    # is invoked via MyReLU.apply rather than by instantiating the class.
    hidden = MyReLU.apply(torch.mm(x, w1))
    y_pred = torch.mm(hidden, w2)

    # Sum-of-squared-errors loss over the whole batch.
    loss = (y_pred - y).pow(2).sum()
    print(step, loss.item())

    # Autograd computes the gradients, calling MyReLU.backward along the way.
    loss.backward()

    with torch.no_grad():
        for weight in (w1, w2):
            # Gradient-descent step, applied in place.
            weight -= learning_rate * weight.grad
            # Gradients accumulate, so clear them before the next iteration.
            weight.grad.zero_()


0 37900088.0
1 35574084.0
2 35044824.0
3 30612100.0
4 21881946.0
5 12847150.0
6 6787024.0
7 3643665.75
8 2178955.75
9 1478969.5


# 5. Pytorch.nn

In [15]:
# 5. Implement a two-layer network with torch.nn module
import torch

# Choose where the tensors live: CPU here; swap to the commented line for GPU.
device = torch.device('cpu')
# device = torch.device('cuda')

# N: batch size, D_in: input features, H: hidden units, D_out: output features.
N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# nn.Sequential is a module that contains other modules and applies them in
# order to produce its output. Each Linear module computes an affine function
# of its input and owns its weight and bias parameters.
# After building the model, .to(device) moves its parameters to the CPU/GPU.
model = torch.nn.Sequential(torch.nn.Linear(D_in, H),
                            torch.nn.ReLU(),
                            torch.nn.Linear(H, D_out)
                            ).to(device)

# `reduction` controls how the per-element squared errors are combined:
# 'sum' adds them up, while the more common default, 'mean', averages them.
loss_func = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-6

for t in range(10):
    y_pred = model(x)
    # Loss modules are called as loss(input, target): prediction first, then
    # the ground-truth target.
    loss = loss_func(y_pred, y)
    print(t, loss.item())

    # Gradients accumulate across backward() calls, so clear them before
    # running back-propagation.
    model.zero_grad()

    # Compute gradients of the loss for every parameter of the model.
    loss.backward()

    # Gradient-descent step on all parameters. Under torch.no_grad() the
    # in-place update is not recorded by autograd, so there is no need to go
    # through the legacy `.data` attribute.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad


0 684.2965087890625
1 683.763916015625
2 683.2320556640625
3 682.7007446289062
4 682.1700439453125
5 681.639892578125
6 681.1102905273438
7 680.5814819335938
8 680.0530395507812
9 679.5254516601562


# 6. Pytorch:optim

In [16]:
# 6. optim in Pytorch. Use to replace manually update parameter
import torch

# N: batch size, D_in: input features, H: hidden units, D_out: output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Use the nn package to define our model and loss function.
model = torch.nn.Sequential(
          torch.nn.Linear(D_in, H),
          torch.nn.ReLU(),
          torch.nn.Linear(H, D_out),
        )
# 'mean' averages the squared error over all elements. It replaces the
# deprecated spelling 'elementwise_mean', which only triggers a warning.
loss_fn = torch.nn.MSELoss(reduction='mean')

# Use the optim package to define an Optimizer that will update the weights of
# the model for us. Here we will use Adam; the optim package contains many other
# optimization algorithms. The first argument to the Adam constructor tells the
# optimizer which Tensors it should update.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for i in range(10):
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(i, loss.item())

    # Clear the gradients of all parameters managed by the optimizer;
    # otherwise backward() would add to the values from the last iteration.
    optimizer.zero_grad()
    loss.backward()
    # Perform one optimization step, updating the parameters in place.
    optimizer.step()

0 1.115718960762024
1 1.0883523225784302
2 1.0618207454681396
3 1.03631591796875
4 1.0116201639175415
5 0.9877586960792542
6 0.9646045565605164
7 0.942121148109436
8 0.9202743172645569
9 0.899020791053772


In [None]:
# 7. custom nn module

In [None]:
# 7. Custom nn Modules
import torch


class TwoLayerNet(torch.nn.Module):
    """
    Fully connected network: Linear -> ReLU -> Linear.

    The constructor declares the layers (and thereby the trainable
    parameters); forward describes how an input batch is turned into a
    prediction.
    """

    def __init__(self, D_in, H, D_out):
        """
        Create the two linear layers.

        D_in is the number of input features, H the number of hidden units
        and D_out the number of output features.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """Return the prediction for the input batch x."""
        # Clamping from below at 0 acts as the ReLU non-linearity.
        hidden = torch.clamp(self.linear1(x), min=0)
        return self.linear2(hidden)


# N: batch size, D_in: input features, H: hidden units, D_out: output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# create random Tensors for input and output
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)
model = TwoLayerNet(D_in, H, D_out)

# 'mean' averages the squared error over all elements. It replaces the
# deprecated spelling 'elementwise_mean', which only triggers a warning.
loss_fn = torch.nn.MSELoss(reduction='mean')
# Adam with its default hyper-parameters updates every parameter of the model.
optim = torch.optim.Adam(model.parameters())
for i in range(10):
    y_pred = model(x)

    # compute and print the loss
    loss = loss_fn(y_pred, y)
    print(i, loss.item())

    # Clear accumulated gradients, back-propagate, then take one Adam step.
    optim.zero_grad()
    loss.backward()
    optim.step()