In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Warm-up: numpy
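Implement a two-layer fully-connected ReLU network in NumPy, computing both the forward pass and the gradients by hand. The backward pass relies on the following matrix-calculus identities: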

$
\begin{aligned}
z = f(Y), Y = AX + B \rightarrow \frac{\partial z}{\partial X} = A^T \frac{\partial z}{\partial Y} \\
z = f(Y), Y = XA + B \rightarrow \frac{\partial z}{\partial X} = \frac{\partial z}{\partial Y} A^T 
\end{aligned}
$

In [2]:
# Problem sizes:
#   N     - batch size
#   D_in  - input dimension
#   H     - hidden dimension
#   D_out - output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialised weights of the two bias-free linear layers.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    step = t + 1

    # (1) Forward pass: linear -> ReLU -> linear
    h = x @ w1                    # (N, H)
    h_relu = np.maximum(h, 0)     # (N, H)
    y_pred = h_relu @ w2          # (N, D_out)

    # (2) Sum-of-squares loss, reported every 50th step
    residual = y_pred - y
    loss = np.square(residual).sum()
    if step % 50 == 0:
        print(step, loss)

    # (3) Backward propagation: apply the chain rule layer by layer
    grad_y_pred = 2.0 * residual
    grad_w2 = h_relu.T @ grad_y_pred
    grad_h_relu = grad_y_pred @ w2.T
    # The ReLU blocks the gradient wherever its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = x.T @ grad_h

    # (4) Gradient-descent update of the weights, in place
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

50 17771.563001788007
100 824.3449816524
150 63.10522474756195
200 5.874290871420839
250 0.6119854133533504
300 0.06949911230053367
350 0.008514900118536343
400 0.0011171972723749862
450 0.0001556328372400992
500 2.2789354646465557e-05


# PyTorch: Tensors

In [3]:
device = torch.device("cpu")

# Problem sizes:
#   N     - batch size
#   D_in  - input dimension
#   H     - hidden dimension
#   D_out - output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Randomly initialised weights; gradients are derived by hand below,
# so autograd tracking is not requested.
w1 = torch.randn(D_in, H, device=device)
w2 = torch.randn(H, D_out, device=device)

learning_rate = 1e-6
for t in range(500):
    step = t + 1

    # (1) Forward pass: linear -> ReLU -> linear
    h = x @ w1                          # (N, H)
    h_relu = torch.clamp(h, min=0)      # (N, H)
    y_pred = h_relu @ w2                # (N, D_out)

    # (2) Sum-of-squares loss, reported every 50th step
    residual = y_pred - y
    loss = residual.pow(2).sum()
    if step % 50 == 0:
        print(step, loss.item())

    # (3) Backward propagation: apply the chain rule layer by layer
    grad_y_pred = 2.0 * residual
    grad_w2 = h_relu.t() @ grad_y_pred
    grad_h_relu = grad_y_pred @ w2.t()
    # The ReLU blocks the gradient wherever its input was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = x.t() @ grad_h

    # (4) Gradient-descent update of the weights, in place
    w1.sub_(learning_rate * grad_w1)
    w2.sub_(learning_rate * grad_w2)

50 14957.1787109375
100 520.1260375976562
150 28.716014862060547
200 1.8750921487808228
250 0.13260208070278168
300 0.009954098612070084
350 0.0010043686488643289
400 0.0002048783644568175
450 7.241935963975266e-05
500 3.6382560210768133e-05


# PyTorch: Autograd

In [4]:
device = torch.device("cpu")

# Problem sizes:
#   N     - batch size
#   D_in  - input dimension
#   H     - hidden dimension
#   D_out - output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# requires_grad=True makes autograd record operations on the weights so
# that loss.backward() can populate w1.grad and w2.grad.
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    step = t + 1

    # (1) Forward pass: linear -> ReLU -> linear
    hidden = torch.clamp(x @ w1, min=0)
    y_pred = hidden @ w2

    # (2) Sum-of-squares loss, reported every 50th step
    loss = (y_pred - y).pow(2).sum()
    if step % 50 == 0:
        print(step, loss.item())

    # (3) Backward propagation: autograd fills in the .grad attributes
    loss.backward()

    # (4) Update the weights. The update itself must not be recorded by
    # autograd, and the gradients are cleared afterwards because
    # backward() accumulates into .grad.
    with torch.no_grad():
        for weight in (w1, w2):
            weight.sub_(learning_rate * weight.grad)
            weight.grad.zero_()

50 11510.7138671875
100 541.0838623046875
150 43.1258544921875
200 4.282898426055908
250 0.4824184775352478
300 0.058715153485536575
350 0.007716975640505552
400 0.0012684206012636423
450 0.000316200457746163
500 0.00011679269664455205


# PyTorch: Defining new autograd functions

In [5]:
class MyReLU(torch.autograd.Function):
    """ReLU implemented as a custom autograd Function with an explicit backward pass."""

    @staticmethod
    def forward(ctx, input):
        """
        Compute the ReLU of ``input``.

        Receive: A Tensor containing the input
        Return: A Tensor containing the input with values below zero clamped to zero
        ctx is the context object shared with ``backward``; the input tensor
        is stored on it via ``ctx.save_for_backward`` so the backward pass can
        tell where the input was negative.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Propagate the gradient through the ReLU.

        Receive: A Tensor containing the gradient of the loss w.r.t the output
        Return: A Tensor containing the gradient of the loss w.r.t the input
        """
        (saved_input,) = ctx.saved_tensors
        # The gradient passes through unchanged except where the saved input
        # was strictly negative, where it is zeroed. A new tensor is returned;
        # grad_output itself is not modified.
        return grad_output.masked_fill(saved_input < 0, 0)

In [6]:
device = torch.device("cpu")

# Problem sizes:
#   N     - batch size
#   D_in  - input dimension
#   H     - hidden dimension
#   D_out - output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Weights tracked by autograd.
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

learning_rate = 1e-6
# Custom autograd Functions are invoked through their .apply method.
relu = MyReLU.apply

for t in range(500):
    step = t + 1

    # (1) Forward pass: linear -> custom ReLU -> linear
    hidden = relu(x @ w1)
    y_pred = hidden @ w2

    # (2) Sum-of-squares loss, reported every 50th step
    loss = (y_pred - y).pow(2).sum()
    if step % 50 == 0:
        print(step, loss.item())

    # (3) Backward propagation (runs MyReLU.backward for the activation)
    loss.backward()

    # (4) Update the weights without recording the update in the graph,
    # then clear the gradients because backward() accumulates into .grad.
    with torch.no_grad():
        for weight in (w1, w2):
            weight.sub_(learning_rate * weight.grad)
            weight.grad.zero_()

50 20575.8984375
100 1150.6260986328125
150 99.48345184326172
200 10.097822189331055
250 1.1162739992141724
300 0.1297990083694458
350 0.01577002741396427
400 0.002236555563285947
450 0.0004819182795472443
500 0.00015847866598051041


# PyTorch: nn module
Raw tensors and autograd are relatively low-level building blocks for large neural networks.  

The `nn` package works at a higher level: it defines a set of Modules, each roughly equivalent to a neural-network layer, together with common loss functions.  

In [7]:
device = torch.device("cpu")

# N: batch size
# D_in: input dimension
# H: hidden dimension
# D_out: output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Create the training data in this cell so that it always matches the sizes
# declared above instead of depending on tensors left over from an earlier
# cell.
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Define the network with nn module and place it on the same device as the data
model = nn.Sequential(
    nn.Linear(D_in, H),
    nn.ReLU(),
    nn.Linear(H, D_out),
).to(device)

# Define the loss with nn module (summed, not averaged, squared error)
loss_fn = nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
    # (1) Forward pass
    y_pred = model(x)

    # (2) Calculate loss
    loss = loss_fn(y_pred, y)
    if (t+1) % 50 == 0:
        print(t+1, loss.item())

    # (3) Backward propagation
    # Clear the gradients first: backward() accumulates into .grad.
    model.zero_grad()
    loss.backward()

    # (4) Update weights
    # Plain gradient descent; no_grad keeps the update out of the graph.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

50 29.64038848876953
100 2.567214250564575
150 0.3545261025428772
200 0.05738692358136177
250 0.01023820973932743
300 0.0019655346404761076
350 0.00040638132486492395
400 9.055338887264952e-05
450 2.1870999262318946e-05
500 5.810691163787851e-06


# PyTorch: optim module
The `optim` package abstracts the idea of an optimization algorithm and provides implementations of commonly used optimizers, such as Adam, which is used below.

In [8]:
device = torch.device("cpu")

# N: batch size
# D_in: input dimension
# H: hidden dimension
# D_out: output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Create the training data in this cell so that it always matches the sizes
# declared above instead of depending on tensors left over from an earlier
# cell.
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Define the network with nn module and place it on the same device as the data
model = nn.Sequential(
    nn.Linear(D_in, H),
    nn.ReLU(),
    nn.Linear(H, D_out),
).to(device)

# Define the loss with nn module (summed, not averaged, squared error)
loss_fn = nn.MSELoss(reduction='sum')

learning_rate = 1e-4
# The optimizer owns the parameter update; here it is Adam.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # (1) Forward pass
    y_pred = model(x)

    # (2) Calculate loss
    loss = loss_fn(y_pred, y)
    if (t+1) % 50 == 0:
        print(t+1, loss.item())

    # (3) Backward propagation
    # Clear the gradients first: backward() accumulates into .grad.
    optimizer.zero_grad()
    loss.backward()

    # (4) Update weights
    optimizer.step()

50 166.0666961669922
100 36.015262603759766
150 4.174113750457764
200 0.2797119915485382
250 0.014573981054127216
300 0.0006866778712719679
350 2.770915125438478e-05
400 8.530561217412469e-07
450 1.8590624151215707e-08
500 4.704743261640942e-10


# PyTorch: Custom nn Modules

In [9]:
class TwoLayerNet(nn.Module):
    """Fully-connected network with one hidden layer: Linear -> ReLU -> Linear."""

    def __init__(self, D_in, H, D_out):
        """
        Create the two linear layers.

        D_in: input dimension
        H: hidden dimension
        D_out: output dimension
        """
        super().__init__()
        self.fc1 = nn.Linear(D_in, H)
        self.fc2 = nn.Linear(H, D_out)

    def forward(self, x):
        """Return the network output for the input tensor ``x``."""
        hidden = self.fc1(x)
        activated = F.relu(hidden)
        y_pred = self.fc2(activated)
        return y_pred

In [10]:
device = torch.device("cpu")

# N: batch size
# D_in: input dimension
# H: hidden dimension
# D_out: output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Create the training data in this cell so that it always matches the sizes
# declared above instead of depending on tensors left over from an earlier
# cell.
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Define the network with the custom nn.Module and place it on the same
# device as the data
model = TwoLayerNet(D_in, H, D_out).to(device)

# Define the loss with nn module (summed, not averaged, squared error)
loss_fn = nn.MSELoss(reduction='sum')

learning_rate = 1e-4
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # (1) Forward pass
    y_pred = model(x)

    # (2) Calculate loss
    loss = loss_fn(y_pred, y)
    if (t+1) % 50 == 0:
        print(t+1, loss.item())

    # (3) Backward propagation
    # Clear the gradients first: backward() accumulates into .grad.
    optimizer.zero_grad()
    loss.backward()

    # (4) Update weights
    optimizer.step()

50 184.05323791503906
100 44.91203689575195
150 6.023375034332275
200 0.5573031902313232
250 0.04588571563363075
300 0.003909107763320208
350 0.0002986715699080378
400 1.7970227418118156e-05
450 8.331199978783843e-07
500 3.285331828806193e-08


# PyTorch: Control Flow + Weight Sharing
As an example of dynamic graphs and weight sharing, we implement a very strange model: a fully-connected ReLU network that on each forward pass chooses a random number between 1 and 4 and, after the input layer, applies that many additional hidden layers, reusing the same weights multiple times to compute these innermost hidden layers.

For this model we can use normal Python flow control to implement the loop, and we can implement weight sharing among the innermost layers by simply reusing the same Module multiple times when defining the forward pass.

In [11]:
class DynamicNet(nn.Module):
    """ReLU network whose depth is re-drawn at random on every forward pass.

    A single hidden-to-hidden layer (``fch``) is applied a random number of
    times, so all of those applications share the same weights.
    """

    def __init__(self, D_in, H, D_out):
        """
        Create the input, shared hidden, and output linear layers.

        D_in: input dimension
        H: hidden dimension
        D_out: output dimension
        """
        super().__init__()
        self.fc1 = nn.Linear(D_in, H)
        self.fch = nn.Linear(H, H)
        self.fc2 = nn.Linear(H, D_out)

    def forward(self, x):
        """Return the network output for ``x`` using a freshly drawn depth."""
        h_relu = F.relu(self.fc1(x))
        # randint's upper bound is exclusive: the shared layer runs 1 to 4 times.
        remaining = np.random.randint(1, 5)
        while remaining > 0:
            h_relu = F.relu(self.fch(h_relu))
            remaining -= 1
        return self.fc2(h_relu)

In [12]:
device = torch.device("cpu")

# N: batch size
# D_in: input dimension
# H: hidden dimension
# D_out: output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Create the training data in this cell so that it always matches the sizes
# declared above instead of depending on tensors left over from an earlier
# cell.
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Define the network with the custom nn.Module and place it on the same
# device as the data
model = DynamicNet(D_in, H, D_out).to(device)

# Define the loss with nn module (summed, not averaged, squared error)
loss_fn = nn.MSELoss(reduction='sum')

learning_rate = 1e-4
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # (1) Forward pass
    y_pred = model(x)

    # (2) Calculate loss
    loss = loss_fn(y_pred, y)
    if (t+1) % 50 == 0:
        print(t+1, loss.item())

    # (3) Backward propagation
    # Clear the gradients first: backward() accumulates into .grad.
    optimizer.zero_grad()
    loss.backward()

    # (4) Update weights
    optimizer.step()

50 508.156005859375
100 461.31329345703125
150 223.98587036132812
200 244.3576202392578
250 273.8345031738281
300 189.361572265625
350 108.08002471923828
400 69.9246826171875
450 34.45793151855469
500 41.158103942871094
