In [1]:
import torch
import torch.nn as nn
import torchvision.models as models
import torch.nn.functional as F
import numpy as np

## nn.Module
* Base class for all neural network modules
* 只要在nn.Module的子類中定義了forward函數，backward函數就會被自動實現（利用Autograd）
* nn.Conv2d 本身也是 nn.Module 的子類別（此時我們可以先不用理解 nn.Conv2d 做了什麼，只需了解其包含一些參數與操作）

In [2]:
class Model(nn.Module):
    """Minimal two-layer convolutional network used to explore ``nn.Module``.

    Both layers are unpadded 5x5 convolutions with stride 1, each followed by
    a ReLU. The first layer expects a single input channel.
    """

    def __init__(self):
        super().__init__()
        # Assigning a layer to an attribute registers it as a submodule; the
        # attribute name becomes the prefix shown by named_parameters().
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=20, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=20, out_channels=20, kernel_size=5)

    def forward(self, x):
        """Apply conv1 -> ReLU -> conv2 -> ReLU to ``x`` and return the result."""
        hidden = F.relu(self.conv1(x))
        activated = F.relu(self.conv2(hidden))
        return activated

In [3]:
# Build an instance of the Model class defined in the previous cell.
model = Model()

### 實踐 forward propagation 
* 為什麼不應該直接call model.forward : https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690

In [4]:
# Random input of shape (1, 1, 124, 124); the single channel matches conv1's in_channels=1.
input_ = torch.randn(1,1,124,124)
# Call the module object itself rather than model.forward(...) directly.
output = model(input_)

### 查看 model 底下的 modules

#### .modules

* model.modules 遞迴的列出所有的 modules

In [5]:
# modules() walks the tree recursively: the model itself is yielded first,
# followed by each nested submodule.
for submodule in model.modules():
    print(submodule)

Model(
  (conv1): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(20, 20, kernel_size=(5, 5), stride=(1, 1))
)
Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
Conv2d(20, 20, kernel_size=(5, 5), stride=(1, 1))


#### .children

* model.children 只列出第一層的子 modules

In [6]:
# children() yields only the direct submodules, not the model itself.
for child in model.children():
    print(child)

Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
Conv2d(20, 20, kernel_size=(5, 5), stride=(1, 1))


### 查看 model 內的 parameters (torch.nn.parameter.Parameter)

#### .named_parameters
* named_parameters會列出每個nn.Module底下parameters 的名字,數值
* 同時可以查看 requires_grad是否開啟(for backpropagation)

In [7]:
# Each item pairs a dotted parameter name with its Parameter; show whether
# the parameter is flagged for gradient computation.
for param_name, tensor in model.named_parameters():
    print(param_name, tensor.requires_grad)

conv1.weight True
conv1.bias True
conv2.weight True
conv2.bias True


#### .parameters
* 不會印出名字

In [8]:
# parameters() yields the Parameter objects only, without their names.
for weight in model.parameters():
    print(type(weight), weight.shape, weight.requires_grad)

<class 'torch.nn.parameter.Parameter'> torch.Size([20, 1, 5, 5]) True
<class 'torch.nn.parameter.Parameter'> torch.Size([20]) True
<class 'torch.nn.parameter.Parameter'> torch.Size([20, 20, 5, 5]) True
<class 'torch.nn.parameter.Parameter'> torch.Size([20]) True


#### 計算模型可訓練參數總量

In [9]:
# Count trainable scalars: numel() gives the element count of each tensor,
# and only parameters with requires_grad=True are included. This yields a
# plain Python int without the NumPy round trip or a temporary list.
params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('總共參數量：' ,params)

總共參數量： 10540


### Backpropagation

In [10]:
# Run a new forward pass; its output is the starting point for the backward
# pass in the cells below.
input_ = torch.randn(1,1,124,124)
output = model(input_)

#### 確認 requires_grad為 True (default 就是 True)

In [11]:
# Confirm that every parameter is still flagged for gradient computation
# before running backward.
for param_name, tensor in model.named_parameters():
    print(param_name, tensor.requires_grad)

conv1.weight True
conv1.bias True
conv2.weight True
conv2.bias True


#### 此時還沒做backpropagation，parameters沒有gradient value

In [12]:
# No backward pass has run yet, so .grad has not been populated (prints None).
print(model.conv1.weight.grad)

None


#### 執行backward，完成後就能看到每個parameters底下的gradient value

In [13]:
# Reduce the output to a scalar with sum() and backpropagate from it; the
# resulting gradients are stored on each parameter's .grad attribute.
output.sum().backward()

In [14]:
# After backward(), .grad holds a gradient tensor for conv1's weight.
print(model.conv1.weight.grad)

tensor([[[[  605.5430,  -376.2526,  -264.9802,  -405.8641,   330.9492],
          [ -498.8722,    89.9731,   254.4219,  -252.9127,   -92.0358],
          [  -63.3486,  -108.4003,    63.5249,  -241.9144,  -187.6409],
          [ -306.5060,  -275.6410,   458.6197,    81.5387,   455.1625],
          [ -405.7625,  -175.4588,  -278.4715,   376.6024,   386.9280]]],


        [[[   10.3513,   -12.5419,    77.3492,   103.1113,   -32.1752],
          [  209.9700,  -101.2162,   -43.0707,   385.9092,   -51.0152],
          [ -268.2236,   238.1764,   337.8172,   247.8170,  -107.5047],
          [ -219.5956,    -3.0727,  -244.3324,  -122.1673,  -173.0466],
          [   43.3710,    58.4297,   166.3651,     6.5298,    90.7496]]],


        [[[  483.3836,   305.7553,  -458.8383,  -221.6854,   602.1165],
          [  583.9101,   778.7874,  -209.8309,  -612.6638,   516.8854],
          [  416.4789,  -699.1741,  -945.6849,  -216.3947,  -673.4405],
          [  819.1714,  -201.8294,   406.1374,  -319.300

#### 當我們把 parameters 的 requires_grad關閉時，就無法成功的完成backward
* 什麼時候會關閉 requires_grad？ prediction (inference) 的階段
* 設定 requires_grad = True 是為了之後要做 backpropagation。為了計算每個 parameter 的 gradient，我們在 forward propagation 時需要保留額外的訊息（根據 chain rule），這會導致記憶體使用量上升與計算速度下降。然而只有在 training 階段我們才需要做 backpropagation；在 prediction (inference) 的階段，我們可以設定 requires_grad = False 來提升速度與降低記憶體使用量

In [15]:
# Switch off gradient tracking on every parameter of the model.
for frozen in model.parameters():
    frozen.requires_grad = False

In [16]:
# Forward pass with all parameters frozen (requires_grad=False was set in the
# previous cell).
input_ = torch.randn(1,1,124,124)
output = model(input_)

In [17]:
# With every parameter frozen, the output has no grad_fn and backward() raises.
# The error is the point of this cell, so catch it and print the message
# instead of letting it abort a top-to-bottom run of the notebook.
try:
    output.sum().backward()
except RuntimeError as err:
    print('RuntimeError:', err)

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

#### with torch.no_grad()
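* 在 `with torch.no_grad():` 區塊內，autograd 不會記錄運算，因此區塊內計算出來的 tensor 其 requires_grad 都是 False（parameters 本身的 requires_grad 設定不會被改動）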

In [18]:
# Re-enable gradient tracking on the parameters first, so the failure below
# comes from torch.no_grad() and not from the earlier freeze.
for param in model.parameters():
    param.requires_grad = True
with torch.no_grad():
    input_ = torch.randn(1,1,124,124)
    output = model(input_)
    # The error is the demonstration: catch it and print the message so the
    # notebook can still be run from top to bottom.
    try:
        output.sum().backward()
    except RuntimeError as err:
        print('RuntimeError:', err)

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

### 讓我們自行搭建一個 nn.Module 並試算gradient

In [19]:
class Model(nn.Module):
    """Toy module with two scalar parameters, for checking gradients by hand.

    Writing the input as ``w``, the output is
    ``w * self.x**2 + w * self.y + w``, so
    d(output)/d(self.x) = 2 * w * self.x and d(output)/d(self.y) = w.
    """

    def __init__(self):
        super().__init__()
        # Scalar float32 parameters with fixed starting values 2.4 and 4.3.
        self.x = nn.Parameter(torch.tensor(2.4, dtype=torch.float32))
        self.y = nn.Parameter(torch.tensor(4.3, dtype=torch.float32))

    def forward(self, x):
        """Return ``x * self.x**2 + x * self.y + x`` for the input ``x``."""
        quadratic = x * self.x ** 2
        linear = x * self.y
        return quadratic + linear + x

In [20]:
# Evaluate the toy model at w = 1.3 and backpropagate from its scalar output.
model = Model()
input_ = torch.tensor(1.3, dtype = torch.float32)
output = model(input_)
output.backward()
# Partial derivative of output w.r.t. self.x is 2 * w * x = 2 * 1.3 * 2.4 = 6.24
print('self.x 的 gradient : {}'.format(model.x.grad))
# Partial derivative of output w.r.t. self.y is w = 1.3
print('self.y 的 gradient : {}'.format(model.y.grad))

self.x 的 gradient : 6.240000247955322
self.y 的 gradient : 1.2999999523162842


## Sequential
* nn.Module 的容器

In [21]:
# Conv -> BatchNorm -> LeakyReLU, applied in the order given to nn.Sequential.
# The convolution has no bias term (bias=False).
layer = nn.Sequential(
    nn.Conv2d(in_channels=3, out_channels=20, kernel_size=3, stride=1, padding=1, bias=False),
    nn.BatchNorm2d(num_features=20),
    nn.LeakyReLU(inplace=True),
)

In [22]:
# In an unnamed Sequential the parameter names are prefixed with the
# positional index of the submodule that owns them.
for param_name, tensor in layer.named_parameters():
    print(param_name, tensor.requires_grad)

0.weight True
1.weight True
1.bias True


In [23]:
# Three input channels, matching the first convolution of the Sequential above.
input_ = torch.randn(1, 3, 124, 124)
output = layer(input_)

#### OrderedDict+Sequential, 讓我們替每一個module命名

In [24]:
from collections import OrderedDict

In [25]:
# Passing an OrderedDict gives each submodule a name; insertion order is the
# order in which the modules are applied.
named_layers = OrderedDict()
named_layers['conv1'] = nn.Conv2d(1, 20, 5)
named_layers['relu1'] = nn.ReLU()
named_layers['conv2'] = nn.Conv2d(20, 64, 5)
named_layers['relu2'] = nn.ReLU()
layer = nn.Sequential(named_layers)


In [26]:
# Recursive walk: the Sequential container is printed first, then each of its
# named children.
for submodule in layer.modules():
    print(submodule)

Sequential(
  (conv1): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
  (relu1): ReLU()
  (conv2): Conv2d(20, 64, kernel_size=(5, 5), stride=(1, 1))
  (relu2): ReLU()
)
Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
ReLU()
Conv2d(20, 64, kernel_size=(5, 5), stride=(1, 1))
ReLU()


In [27]:
# With named submodules, parameter names carry the given name as prefix
# instead of a positional index.
for param_name, tensor in layer.named_parameters():
    print(param_name, tensor.requires_grad)

conv1.weight True
conv1.bias True
conv2.weight True
conv2.bias True


In [28]:
# Single-channel input, matching conv1's one input channel.
input_ = torch.randn(1, 1, 124, 124)
output = layer(input_)
# 64 output channels; the two unpadded 5x5 convolutions shrink 124 to 116.
print(output.shape)

torch.Size([1, 64, 116, 116])


#### append 新的 module到 sequential上

In [29]:
import torch.nn as nn

# Collect the modules in an ordinary list first, then unpack the list into
# nn.Sequential as positional arguments.
modules = [
    nn.Conv2d(1, 20, 5),
    nn.ReLU(),
    nn.Conv2d(20, 64, 5),
    nn.ReLU(),
]

layer = nn.Sequential(*modules)

In [30]:
# Bare expression as the last line of the cell: the notebook displays the module's repr.
layer

Sequential(
  (0): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(20, 64, kernel_size=(5, 5), stride=(1, 1))
  (3): ReLU()
)

In [31]:
# Same check as for the named Sequential: a single-channel input goes through
# the container built from the list.
input_ = torch.randn(1, 1, 124, 124)
output = layer(input_)
print(output.shape)

torch.Size([1, 64, 116, 116])


* 另一種方式

In [32]:
# Start from an empty Sequential and register the named submodules one by
# one; registration order is the order in which they are applied.
layer = torch.nn.Sequential()
for child_name, child in (
    ("conv1", nn.Conv2d(1, 20, 5)),
    ("relu1", nn.ReLU()),
    ("conv2", nn.Conv2d(20, 64, 5)),
    ("relu2", nn.ReLU()),
):
    layer.add_module(child_name, child)

In [33]:
# Display the container; the submodules appear under the names given to add_module.
layer

Sequential(
  (conv1): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
  (relu1): ReLU()
  (conv2): Conv2d(20, 64, kernel_size=(5, 5), stride=(1, 1))
  (relu2): ReLU()
)

In [221]:
# Forward a single-channel input through the Sequential assembled with add_module.
input_ = torch.randn(1, 1, 124, 124)
output = layer(input_)
print(output.shape)

torch.Size([1, 64, 116, 116])


## ModuleList
* 操作就像是python list, 但其內的module, parameters是可以被追蹤的

In [56]:
# Build the ModuleList directly from a list of modules.
layer = nn.ModuleList([
    nn.Conv2d(1, 20, 5),
    nn.ReLU(),
    nn.Conv2d(20, 64, 5),
    nn.ReLU(),
])
# Trailing expression so the cell still displays the ModuleList repr.
layer

ModuleList(
  (0): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(20, 64, kernel_size=(5, 5), stride=(1, 1))
  (3): ReLU()
)

In [57]:
input_ = torch.randn(1, 1, 124, 124)
# Apply the stored modules one after another, each consuming the previous
# result. Seeding `output` with the input removes the first-iteration special
# case and avoids assigning to `_`, which IPython uses for the last output.
output = input_
for module in layer:
    output = module(output)
print(output.shape)

torch.Size([1, 64, 116, 116])


* 可以追蹤是什麼意思？ nn.Module有辦法去獲取ModuleList裡面的資訊

In [250]:
class Model(nn.Module):
    """Conv/ReLU stack whose layers are held in an ``nn.ModuleList``.

    Because the container is a ModuleList, the layers appear in the model's
    repr under the ``layer`` attribute.
    """

    def __init__(self):
        super().__init__()
        self.layer = nn.ModuleList([
            nn.Conv2d(1, 20, 5),
            nn.ReLU(),
            nn.Conv2d(20, 64, 5),
            nn.ReLU(),
        ])

    def forward(self, x):
        """Feed ``x`` through every module in ``self.layer`` in order and return the result."""
        out = x
        for stage in self.layer:
            out = stage(out)
        return out

In [251]:
# Instantiate the ModuleList-based Model defined in the previous cell.
model = Model()

In [252]:
# Display the model; the ModuleList and its four entries show up in the repr.
model

Model(
  (layer): ModuleList(
    (0): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
    (1): ReLU()
    (2): Conv2d(20, 64, kernel_size=(5, 5), stride=(1, 1))
    (3): ReLU()
  )
)

In [253]:
# Forward a single-channel input through the ModuleList-based model.
input_ = torch.randn(1, 1, 124, 124)
output = model(input_)
print(output.shape)

torch.Size([1, 64, 116, 116])


* 如果是一般的 python list

In [254]:
class Model(nn.Module):
    """Counter-example: the same conv/ReLU stack kept in a plain Python list.

    The plain list is deliberate. Layers stored this way are not registered
    with the module, so they are missing from the model's repr and from
    ``parameters()``. Use ``nn.ModuleList`` when the layers must be tracked.
    """

    def __init__(self):
        super(Model, self).__init__()
        # Intentionally a plain list, not nn.ModuleList, to show the difference.
        self.layer = []
        self.layer.append(nn.Conv2d(1,20,5))
        self.layer.append(nn.ReLU())
        self.layer.append(nn.Conv2d(20,64,5))
        self.layer.append(nn.ReLU())

    def forward(self, x):
        """Feed ``x`` through every module in ``self.layer`` in order and return the result."""
        for module in self.layer:
            x = module(x)
        # The original ended with a bare `return`, so forward() always gave None.
        return x

In [255]:
# Instantiate the plain-list variant of Model defined in the previous cell.
model = Model()

In [256]:
# Display the model; the repr lists no submodules because the layers sit in a plain list.
model

Model()