### Building a neural network in PyTorch

In [40]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchsummary import torchsummary

In [2]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # 1 input image channel, 6 output channels, 3x3 kernel convolution
        # kernel
        self.conv1 = nn.Conv2d(1, 6, 3)
        self.conv2 = nn.Conv2d(6, 16, 3)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * 6 * 6, 120)  # 6*6 from image dimension
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the size is a square you can only specify a single number
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = x.view(-1, self.num_flat_features(x)) # its like a flatten of tensorflow
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features


net = Net()
print(net)

Net(
  (conv1): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=576, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


In [3]:
params = list(net.parameters())  # list out all parameters
print(len(params))
print(params[0].size())  # conv1's .weight

10
torch.Size([6, 1, 3, 3])


In [4]:
input = torch.randn(1, 1, 32, 32) # (batch, C, W, H)
out = net(input)
print(out)

tensor([[ 0.0709, -0.1327,  0.0413,  0.0124,  0.1347, -0.0714,  0.0825, -0.1766,
         -0.0303, -0.0630]], grad_fn=<AddmmBackward>)


In [5]:
# Zero the gradient buffers of all parameters and backprops with random gradients:

In [6]:
net.zero_grad() # Sets gradients of all model parameters to zero.

In [7]:
out.backward(torch.randn(1, 10))

In [8]:
# loss function

In [9]:
target = torch.randn(10)  # a dummy target, for example
target = target.view(1, -1)  # make it the same shape as output
criterion = nn.MSELoss()
output = net(input)

In [10]:
loss = criterion(output, target)

In [11]:
print(loss)

tensor(0.9218, grad_fn=<MseLossBackward>)


In [12]:
print(loss.grad_fn)  # MSELoss
print(loss.grad_fn.next_functions[0][0])  # Linear
print(loss.grad_fn.next_functions[0][0].next_functions[0][0])  # ReLU

<MseLossBackward object at 0x7f9ee3b03ca0>
<AddmmBackward object at 0x7f9ee3b03e20>
<AccumulateGrad object at 0x7f9ee3b03ca0>


In [13]:
# Backprop

In [14]:
net.zero_grad()
print('conv1.weight.grad before backward') # shape = torch.Size([6, 1, 3, 3])
print(net.conv1.weight.grad)

print('conv1.bias.grad before backward') # shape = torch.Size(6)
print(net.conv1.bias.grad)

loss.backward() # differentiation

print('conv1.weight.grad after backward')
print(net.conv1.weight.grad)

print('conv1.bias.grad after backward')
print(net.conv1.bias.grad)


conv1.weight.grad before backward
tensor([[[[0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.]]],


        [[[0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.]]],


        [[[0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.]]],


        [[[0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.]]],


        [[[0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.]]],


        [[[0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.]]]])
conv1.bias.grad before backward
tensor([0., 0., 0., 0., 0., 0.])
conv1.weight.grad after backward
tensor([[[[-0.0040, -0.0152,  0.0113],
          [-0.0132,  0.0205,  0.0135],
          [-0.0046,  0.0016, -0.0245]]],


        [[[ 0.0287,  0.0030,  0.0032],
          [-0.0012, -0.0031, -0.0042],
          [ 0.0184, -0.0012,  0.0096]]],


        [[[-0.0196,  0.0057, -0.0162],
          [ 0.0186, -0.0214, -0.0004],
          [ 0.0118, -0.0118,  0.0106]]],


        [[[-0.0131,  0.0203,  0.0040],
          [

In [15]:
# update weights

In [16]:
lr = 0.01
for f in net.parameters():
    f.data.sub_(f.grad.data * lr)

In [17]:
# create your optimizer
optimizer = optim.SGD(net.parameters(), lr=0.01)

# in your training loop:
optimizer.zero_grad()   # zero the gradient buffers
output = net(input)
loss = criterion(output, target)
loss.backward()
optimizer.step()    # Does the update

In [18]:
print(net.conv1.weight) # updated weight of conv1
print(net.conv1.bias) # updated bias of conv1

Parameter containing:
tensor([[[[-0.2092, -0.0955,  0.1885],
          [-0.0214,  0.0495,  0.0254],
          [ 0.3304,  0.3249,  0.1404]]],


        [[[-0.1057, -0.1083, -0.1745],
          [ 0.2324, -0.0814,  0.1427],
          [-0.1062, -0.1345, -0.3218]]],


        [[[-0.0556,  0.0269,  0.2000],
          [-0.2282,  0.2636, -0.1187],
          [ 0.1590,  0.2832,  0.0198]]],


        [[[ 0.1554,  0.0443,  0.1690],
          [-0.3220,  0.1124, -0.1000],
          [ 0.3041,  0.1204,  0.0597]]],


        [[[-0.0791,  0.0231, -0.2085],
          [ 0.1027, -0.0681, -0.2438],
          [-0.1563,  0.3104, -0.2443]]],


        [[[-0.1176,  0.0856, -0.1144],
          [ 0.1699, -0.1369, -0.0251],
          [-0.1727, -0.2576, -0.1505]]]], requires_grad=True)
Parameter containing:
tensor([ 0.2389,  0.2400, -0.1282, -0.2157, -0.1651,  0.2641],
       requires_grad=True)


### Building the neural network in numpy

In [19]:
import numpy as np

In [20]:
# N is batch size; D_in is input dimension;
# D_h is hidden dimension; D_out is output dimension.

In [21]:
# building 2 layer feed forward neural net.

In [22]:
N, D_in, D_h, D_out = 64, 1000, 100, 10

In [23]:
# creating random input and output data

In [24]:
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

In [25]:
# randomly initializing weights

In [26]:
w1 = np.random.randn(D_in, D_h)
w2 = np.random.randn(D_h, D_out)

In [27]:
lr = 1e-6

In [28]:
for t in range(500):
    # Forward pass: computing y_pred
    z = x.dot(w1) # y = W1*X + b
    z_relu = np.maximum(z, 0)
    y_pred = z_relu.dot(w2) # y_pred = W2 * y + b

    # Compute loss
    loss = np.square(y_pred - y).sum()
    print(loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = z_relu.T.dot(grad_y_pred)
    grad_z_relu = grad_y_pred.dot(w2.T)
    grad_z = grad_z_relu.copy()
    grad_z[z < 0] = 0
    grad_w1 = x.T.dot(grad_z)


    # Update weights
    w1 -= lr * grad_w1
    w2 -= lr * grad_w2



37067542.94334827
43728517.089867316
59126110.69765696
67254317.19113758
49397725.25204694
20873509.814084142
6062621.714529593
2285472.5861908975
1412596.1988003482
1091760.8552572448
896502.1479915063
750111.1832361736
633943.9076387167
539960.2805066684
462928.8663222154
399239.7176547616
346186.17626899324
301709.9327622862
264169.4418756169
232231.65043946548
204936.75837619934
181487.7097800699
161246.19820831198
143697.06599834925
128409.82265639835
115059.02494414602
103344.59104947116
93033.56441196322
83930.1275097398
75874.253405867
68723.84906732837
62382.933880540804
56735.541619886164
51690.352692052154
47169.669786673054
43105.44137648971
39445.57971878331
36145.148349210926
33160.958445420285
30459.140074202645
28009.63564975006
25784.816425275014
23760.91119835395
21917.519721046658
20244.42616576638
18715.512125802394
17318.444525317424
16039.76420740183
14866.94306886348
13790.790595850713
12802.000996799185
11892.972241578555
11055.369646936333
10283.464341451594
95

### PyTorch: Tensors and autograd
Thankfully, we can use automatic differentiation to automate the computation of backward passes in neural networks. The autograd package in PyTorch provides exactly this functionality. When using autograd, the forward pass of your network will define a computational graph; nodes in the graph will be Tensors, and edges will be functions that produce output Tensors from input Tensors. Backpropagating through this graph then allows you to easily compute gradients.

In [29]:
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

In [30]:
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

In [31]:

# Create random Tensors to hold input and outputs.
# Setting requires_grad=False indicates that we do not need to compute gradients
# with respect to these Tensors during the backward pass.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

In [32]:
# Create random Tensors for weights.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

In [35]:
learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations on Tensors; these
    # are exactly the same operations we used to compute the forward pass using
    # Tensors, but we do not need to keep references to intermediate values since
    # we are not implementing the backward pass by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2) # y1 = w1.dot(X) => y1_relu = max(y1, 0) => y2 = y1_relu.dot(w2)

    # Compute and print loss using operations on Tensors.
    # Now loss is a Tensor of shape (1,)
    # loss.item() gets the scalar value held in the loss.
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item())

    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call w1.grad and w2.grad will be Tensors holding the gradient
    # of the loss with respect to w1 and w2 respectively.
    loss.backward()

    # Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because weights have requires_grad=True, but we don't need to track this
    # in autograd.
    # An alternative way is to operate on weight.data and weight.grad.data.
    # Recall that tensor.data gives a tensor that shares the storage with
    # tensor, but doesn't track history.
    # You can also use torch.optim.SGD to achieve this.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights
        w1.grad.zero_()
        w2.grad.zero_()

99 2664.96484375
199 92.51250457763672
299 4.903074264526367
399 0.28263676166534424
499 0.01693444326519966


### PyTorch: nn

In [36]:
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 100, 1000, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

In [37]:
# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. Each Linear Module computes output from input using a
# linear function, and holds internal Tensors for its weight and bias.

In [38]:
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H), # hidden layer
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out) # output layer
)

In [39]:
print(model)

Sequential(
  (0): Linear(in_features=100, out_features=1000, bias=True)
  (1): ReLU()
  (2): Linear(in_features=1000, out_features=10, bias=True)
)


In [48]:
torchsummary.summary(model, input_size=(100,))  # -1 represents batch

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                 [-1, 1000]         101,000
              ReLU-2                 [-1, 1000]               0
            Linear-3                   [-1, 10]          10,010
Total params: 111,010
Trainable params: 111,010
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.02
Params size (MB): 0.42
Estimated Total Size (MB): 0.44
----------------------------------------------------------------


In [49]:
# The nn package also contains definitions of popular loss functions; in this
# case we will use Mean Squared Error (MSE) as our loss function.
loss_fn = torch.nn.MSELoss(reduction='sum')

In [53]:
learning_rate = 1e-4
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model. Module objects
    # override the __call__ operator so you can call them like functions. When
    # doing so you pass a Tensor of input data to the Module and it produces
    # a Tensor of output data.
    y_pred = model(x)

    # Compute and print loss. We pass Tensors containing the predicted and true
    # values of y, and the loss function returns a Tensor containing the
    # loss.
    loss = loss_fn(y_pred, y)
    if t % 100 == 99:
        print(t, loss.item())

    # Zero the gradients before running the backward pass.
    model.zero_grad()

    # Backward pass: compute gradient of the loss with respect to all the learnable
    # parameters of the model. Internally, the parameters of each Module are stored
    # in Tensors with requires_grad=True, so this call will compute gradients for
    # all learnable parameters in the model.
    loss.backward()

    # Update the weights using gradient descent. Each parameter is a Tensor, so
    # we can access its gradients like we did before.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad # w1, b1 => w2, b2

99 6.714045430733506e-10
199 5.065459163233754e-10
299 4.0454908867282313e-10
399 3.326021957850145e-10
499 2.824403211310056e-10


### PyTorch: optim

In [54]:
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model(x)

    # Compute and print loss.
    loss = loss_fn(y_pred, y)
    if t % 100 == 99:
        print(t, loss.item())

    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the variables it will update (which are the learnable
    # weights of the model). This is because by default, gradients are
    # accumulated in buffers( i.e, not overwritten) whenever .backward()
    # is called. Checkout docs of torch.autograd.backward for more details.
    optimizer.zero_grad()

    # Backward pass: compute gradient of the loss with respect to model
    # parameters
    loss.backward()

    # Calling the step function on an Optimizer makes an update to its
    # parameters
    optimizer.step()

99 2.0839140688622138e-06
199 1.0924018634117516e-10
299 1.5969028183127065e-11
399 1.9460138361848678e-11
499 0.0018441888969391584


### PyTorch: Custom nn Modules
Sometimes you will want to specify models that are more complex than a sequence of existing Modules; for these cases you can define your own Modules by subclassing nn.Module and defining a forward which receives input Tensors and produces output Tensors using other modules or other autograd operations on Tensors.

In [56]:
class Net(nn.Module):
    def __init__(self, D_in, H, D_out):
        super(Net, self).__init__()
        self.linear1 = nn.Linear(D_in, H)
        self.linear2 = nn.Linear(H, D_out)
    def forward(self, x):
        """
        In the forward function we accept a Tensor of input data and we must return
        a Tensor of output data. We can use Modules defined in the constructor as
        well as arbitrary operators on Tensors.
        """
        h_relu = self.linear1(x).clamp(min=0)
        y_pred = self.linear2(h_relu)
        return y_pred   


In [57]:
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Construct our model by instantiating the class defined above
model = Net(D_in, H, D_out)

In [58]:
print(model)

Net(
  (linear1): Linear(in_features=1000, out_features=100, bias=True)
  (linear2): Linear(in_features=100, out_features=10, bias=True)
)


In [61]:
loss_obj  = nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

In [62]:
for t in range(500):
    # Forward pass
    y_pred = model(x)

    # compute the loss
    loss = loss_obj(y_pred, y)
    if t % 100 == 99:
        print(t, loss.item())
    
    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

99 3.2087807655334473
199 0.07874482870101929
299 0.0032931994646787643
399 0.00017603253945708275
499 1.0847301382455043e-05
