In [1]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Fix every source of randomness up front so that the notebook is reproducible.
SEED = 515

# Seed Python's `random`, numpy's global RNG and PyTorch with the same value.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)
del _seed_rng

# Ask cuDNN to restrict itself to deterministic algorithms.
torch.backends.cudnn.deterministic = True

# Warm-up: `numpy`
Implement a two-layer fully-connected ReLU network using only `numpy`, computing both the forward and the backward pass by hand. 

The chain rule for matrix gradients.  
$$
\begin{aligned}
z = f(Y), Y = AX + B \rightarrow \frac{\partial z}{\partial X} = A^T \frac{\partial z}{\partial Y} \\
z = f(Y), Y = XA + B \rightarrow \frac{\partial z}{\partial X} = \frac{\partial z}{\partial Y} A^T 
\end{aligned}
$$

In [2]:
# N: batch size
# D_in: input dimension
# H: hidden dimension
# D_out: output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Random input batch and random regression targets
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialised weights of the two (bias-free) linear layers
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # (1) Forward pass: linear -> ReLU -> linear
    h = np.dot(x, w1)               # (N, H)
    h_relu = np.maximum(h, 0)       # (N, H)
    y_pred = np.dot(h_relu, w2)     # (N, D_out)

    # (2) Sum-of-squares loss, reported every 50 steps
    loss = np.square(y_pred - y).sum()
    if (t + 1) % 50 == 0:
        print(t + 1, loss)

    # (3) Backward propagation: apply the chain rule by hand
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU passes the gradient through only where its input was not negative
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # (4) Gradient-descent update of the weights (in place)
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

50 17779.934445622137
100 1272.2658109026818
150 197.74240228243002
200 37.54745337570657
250 7.718365241910927
300 1.6499618175849955
350 0.3603497852045676
400 0.07973804354714127
450 0.017789189447301703
500 0.003991372345085601


# PyTorch: Tensors

In [3]:
device = torch.device("cpu")

# N: batch size
# D_in: input dimension
# H: hidden dimension
# D_out: output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Random input batch and random regression targets
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Randomly initialised weights of the two (bias-free) linear layers
w1 = torch.randn(D_in, H, device=device)
w2 = torch.randn(H, D_out, device=device)

learning_rate = 1e-6
for t in range(500):
    # (1) Forward pass: linear -> ReLU -> linear
    h = torch.mm(x, w1)                 # (N, H)
    h_relu = torch.clamp(h, min=0)      # (N, H)
    y_pred = torch.mm(h_relu, w2)       # (N, D_out)

    # (2) Sum-of-squares loss, reported every 50 steps
    loss = (y_pred - y).pow(2).sum()
    if (t + 1) % 50 == 0:
        print(t + 1, loss.item())

    # (3) Backward propagation: apply the chain rule by hand
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU passes the gradient through only where its input was not negative
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # (4) Gradient-descent update of the weights (in place)
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

50 11755.205078125
100 310.06280517578125
150 13.410354614257812
200 0.683499276638031
250 0.037457942962646484
300 0.002346301218494773
350 0.00029403201187960804
400 7.899075717432424e-05
450 3.370572812855244e-05
500 1.897680886031594e-05


# PyTorch: Autograd

In [4]:
device = torch.device("cpu")

# N: batch size
# D_in: input dimension
# H: hidden dimension
# D_out: output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Random input batch and random regression targets (no gradients needed)
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Weights are leaf tensors tracked by autograd
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # (1) Forward pass: linear -> ReLU -> linear, recorded by autograd
    y_pred = torch.mm(torch.clamp(torch.mm(x, w1), min=0), w2)

    # (2) Sum-of-squares loss, reported every 50 steps
    loss = (y_pred - y).pow(2).sum()
    if (t + 1) % 50 == 0:
        print(t + 1, loss.item())

    # (3) Backward propagation: autograd fills w1.grad and w2.grad
    loss.backward()

    # (4) Update weights outside of the autograd graph
    with torch.no_grad():
        w1.sub_(learning_rate * w1.grad)
        w2.sub_(learning_rate * w2.grad)

        # Gradients accumulate across backward() calls, so reset them
        w1.grad.zero_()
        w2.grad.zero_()

50 12863.048828125
100 437.13348388671875
150 23.839937210083008
200 1.5797489881515503
250 0.1170041561126709
300 0.009401821531355381
350 0.001031187130138278
400 0.00022921152412891388
450 8.458275988232344e-05
500 4.188505045021884e-05


# PyTorch: Defining new autograd functions

In [5]:
class MyReLU(torch.autograd.Function):
    """ReLU written as a custom autograd Function with a hand-coded backward pass."""

    @staticmethod
    def forward(ctx, input):
        """
        Receive: A Tensor containing the input
        Return: A Tensor containing the input with negative entries clamped to 0
        ctx is a context object used to pass information to the backward
        computation; the input tensor is stored on it with ctx.save_for_backward
        so that backward can see which entries were negative.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Receive: A Tensor containing the gradient of the loss w.r.t the output
        Return: A Tensor containing the gradient of the loss w.r.t the input
        The incoming gradient is copied and zeroed wherever the saved input was
        negative; everywhere else it passes through unchanged.
        """
        (saved_input,) = ctx.saved_tensors
        return grad_output.masked_fill(saved_input < 0, 0)

In [6]:
device = torch.device("cpu")

# N: batch size
# D_in: input dimension
# H: hidden dimension
# D_out: output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Random input batch and random regression targets (no gradients needed)
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Weights are leaf tensors tracked by autograd
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

learning_rate = 1e-6
# Custom Functions are invoked through their `apply` method
relu = MyReLU.apply

for t in range(500):
    # (1) Forward pass: linear -> custom ReLU -> linear
    y_pred = torch.mm(relu(torch.mm(x, w1)), w2)

    # (2) Sum-of-squares loss, reported every 50 steps
    loss = (y_pred - y).pow(2).sum()
    if (t + 1) % 50 == 0:
        print(t + 1, loss.item())

    # (3) Backward propagation: autograd calls MyReLU.backward for the ReLU step
    loss.backward()

    # (4) Update weights outside of the autograd graph
    with torch.no_grad():
        w1.sub_(learning_rate * w1.grad)
        w2.sub_(learning_rate * w2.grad)

        # Gradients accumulate across backward() calls, so reset them
        w1.grad.zero_()
        w2.grad.zero_()

50 15177.96484375
100 493.72174072265625
150 27.658872604370117
200 1.9257608652114868
250 0.149516761302948
300 0.01241301279515028
350 0.001337741850875318
400 0.0002710820408537984
450 9.37502336455509e-05
500 4.5355332986218855e-05


# PyTorch: `nn` module
Raw tensors and autograd are too low-level to build large neural networks conveniently.  
The `nn` module provides a higher-level abstraction: it defines a set of Modules, which are roughly equivalent to neural network layers.  

In [7]:
device = torch.device("cpu")

# N: batch size
# D_in: input dimension
# H: hidden dimension
# D_out: output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# The batch (x, y) is the one created in the previous section. Make sure it
# lives on `device`; `.to` returns the tensor itself when it is already there,
# so this is a no-op for the CPU default.
x, y = x.to(device), y.to(device)

# Define the network with nn module and place its parameters on `device`
model = nn.Sequential(
    nn.Linear(D_in, H), 
    nn.ReLU(),
    nn.Linear(H, D_out),
).to(device)

# Define the loss with nn module (summed rather than averaged squared error)
loss_fn = nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
    # (1) Forward pass
    y_pred = model(x)

    # (2) Calculate loss
    loss = loss_fn(y_pred, y)
    if (t+1) % 50 == 0:
        print(t+1, loss.item())

    # (3) Backward propagation
    # Clear the gradients left over from the previous step before accumulating new ones
    model.zero_grad()
    loss.backward()

    # (4) Update weights by plain gradient descent, outside of the autograd graph
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

50 31.37506103515625
100 2.320072650909424
150 0.3068027198314667
200 0.054306309670209885
250 0.012054412625730038
300 0.0030662568751722574
350 0.0008425252162851393
400 0.00024181004846468568
450 7.122044917196035e-05
500 2.1321982785593718e-05


# PyTorch: `optim` module
The `optim` module abstracts the idea of an optimization algorithm and provides implementations of commonly used optimization algorithms.

In [8]:
device = torch.device("cpu")

# N: batch size
# D_in: input dimension
# H: hidden dimension
# D_out: output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# The batch (x, y) is the one created in an earlier section. Make sure it
# lives on `device`; `.to` returns the tensor itself when it is already there,
# so this is a no-op for the CPU default.
x, y = x.to(device), y.to(device)

# Define the network with nn module and place its parameters on `device`
model = nn.Sequential(
    nn.Linear(D_in, H), 
    nn.ReLU(),
    nn.Linear(H, D_out),
).to(device)

# Define the loss with nn module (summed rather than averaged squared error)
loss_fn = nn.MSELoss(reduction='sum')

learning_rate = 1e-4
# The optimizer is created after the model has been moved, so it holds the
# parameters that are actually used in the forward pass
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # (1) Forward pass
    y_pred = model(x)

    # (2) Calculate loss
    loss = loss_fn(y_pred, y)
    if (t+1) % 50 == 0:
        print(t+1, loss.item())

    # (3) Backward propagation
    # Clear the gradients left over from the previous step before accumulating new ones
    optimizer.zero_grad()
    loss.backward()

    # (4) Update weights
    optimizer.step()

50 203.43124389648438
100 50.17519760131836
150 8.437718391418457
200 1.1158555746078491
250 0.11447425931692123
300 0.010102280415594578
350 0.0008775893948040903
400 6.843827577540651e-05
450 4.252479357091943e-06
500 1.991742948348474e-07


# PyTorch: Custom `nn` Modules

In [9]:
class TwoLayerNet(nn.Module):
    """Two linear layers with a ReLU non-linearity in between."""

    def __init__(self, D_in, H, D_out):
        """
        D_in: input dimension
        H: hidden dimension
        D_out: output dimension
        """
        super().__init__()
        self.fc1 = nn.Linear(D_in, H)
        self.fc2 = nn.Linear(H, D_out)

    def forward(self, x):
        """Map x through fc1, a ReLU, and fc2, and return the result."""
        hidden = self.fc1(x)
        activated = F.relu(hidden)
        return self.fc2(activated)

In [10]:
device = torch.device("cpu")

# N: batch size
# D_in: input dimension
# H: hidden dimension
# D_out: output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# The batch (x, y) is the one created in an earlier section. Make sure it
# lives on `device`; `.to` returns the tensor itself when it is already there,
# so this is a no-op for the CPU default.
x, y = x.to(device), y.to(device)

# Define the network with the custom nn module and place its parameters on `device`
model = TwoLayerNet(D_in, H, D_out).to(device)

# Define the loss with nn module (summed rather than averaged squared error)
loss_fn = nn.MSELoss(reduction='sum')

learning_rate = 1e-4
# The optimizer is created after the model has been moved, so it holds the
# parameters that are actually used in the forward pass
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # (1) Forward pass
    y_pred = model(x)

    # (2) Calculate loss
    loss = loss_fn(y_pred, y)
    if (t+1) % 50 == 0:
        print(t+1, loss.item())

    # (3) Backward propagation
    # Clear the gradients left over from the previous step before accumulating new ones
    optimizer.zero_grad()
    loss.backward()

    # (4) Update weights
    optimizer.step()

50 174.67047119140625
100 34.094120025634766
150 3.6951241493225098
200 0.27947697043418884
250 0.023027854040265083
300 0.002166505204513669
350 0.00018708972493186593
400 1.2912669262732379e-05
450 6.738098363712197e-07
500 2.577219149202392e-08


# PyTorch: Control Flow + Weight Sharing
As an example of dynamic graphs and weight sharing, we implement a very strange model: a fully-connected ReLU network that, after its input layer, chooses a random number between 1 and 4 on each forward pass and applies one shared hidden-to-hidden layer that many times, reusing the same weights to compute all of the innermost hidden layers.

For this model we can use normal Python flow control to implement the loop, and we can implement weight sharing among the innermost layers by simply reusing the same Module multiple times when defining the forward pass.

In [11]:
class DynamicNet(nn.Module):
    """
    Fully-connected ReLU network whose depth is drawn at random on every
    forward pass; the inner hidden layers all share the weights of `fch`.
    """

    def __init__(self, D_in, H, D_out):
        """
        D_in: input dimension
        H: hidden dimension
        D_out: output dimension
        """
        super().__init__()
        self.fc1 = nn.Linear(D_in, H)
        self.fch = nn.Linear(H, H)
        self.fc2 = nn.Linear(H, D_out)

    def forward(self, x):
        """
        Apply fc1 + ReLU, then the shared fch + ReLU between 1 and 4 times
        (count drawn from numpy's global RNG), then fc2.
        """
        hidden = F.relu(self.fc1(x))
        # randint's upper bound is exclusive, so the count is one of 1, 2, 3, 4
        num_shared = np.random.randint(1, 5)
        for _ in range(num_shared):
            hidden = F.relu(self.fch(hidden))
        return self.fc2(hidden)

In [12]:
device = torch.device("cpu")

# N: batch size
# D_in: input dimension
# H: hidden dimension
# D_out: output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# The batch (x, y) is the one created in an earlier section. Make sure it
# lives on `device`; `.to` returns the tensor itself when it is already there,
# so this is a no-op for the CPU default.
x, y = x.to(device), y.to(device)

# Define the network with the custom nn module and place its parameters on `device`
model = DynamicNet(D_in, H, D_out).to(device)

# Define the loss with nn module (summed rather than averaged squared error)
loss_fn = nn.MSELoss(reduction='sum')

learning_rate = 1e-4
# The optimizer is created after the model has been moved, so it holds the
# parameters that are actually used in the forward pass
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # (1) Forward pass (the number of shared hidden layers is re-drawn on every call)
    y_pred = model(x)

    # (2) Calculate loss
    loss = loss_fn(y_pred, y)
    if (t+1) % 50 == 0:
        print(t+1, loss.item())

    # (3) Backward propagation
    # Clear the gradients left over from the previous step before accumulating new ones
    optimizer.zero_grad()
    loss.backward()

    # (4) Update weights
    optimizer.step()

50 587.51953125
100 527.52197265625
150 492.1304016113281
200 256.559814453125
250 223.74461364746094
300 48.736873626708984
350 31.47231674194336
400 32.43431091308594
450 35.45283508300781
500 34.850074768066406
