# 신경망 (Neural Networks)

## 신경망 정의하기

In [1]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

#### 간단한 CNN 예제 (conv 2번, feed-forward 3번)

In [2]:
class Net(nn.Module):
    """Small LeNet-style CNN: two conv + pool stages, then three linear layers.

    The first linear layer expects 16 * 5 * 5 features, which is the size the
    convolutional stack produces for a 1x32x32 input (see the shapes printed
    by ``forward``).
    """

    def __init__(self):
        super().__init__()
        # Convolutions: 1 -> 6 -> 16 channels, each with a 5x5 square kernel.
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)

        # Affine layers (y = Wx + b): 400 -> 120 -> 84 -> 10.
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Run the network on ``x``, printing intermediate shapes on the way."""
        # Stage 1: conv -> ReLU -> max pooling over a (2, 2) window.
        activated = F.relu(self.conv1(x))
        x = F.max_pool2d(activated, (2, 2))

        # Stage 2: same pattern; for a square window a single number suffices.
        activated = F.relu(self.conv2(x))
        x = F.max_pool2d(activated, 2)
        print('conv2 > max_pooling 후의 x shpae\t', x.size())

        # Flatten everything except the batch dimension for the linear layers.
        flat_width = self.num_flat_features(x)
        x = x.view(-1, flat_width)
        print('after reshape of x\t', x.size())

        x = F.relu(self.fc1(x))
        print('after linear function\t', x.size())
        x = F.relu(self.fc2(x))
        return self.fc3(x)

    def num_flat_features(self, x):
        """Return the product of all dimensions of ``x`` except the first (batch)."""
        total = 1
        for extent in x.size()[1:]:
            total *= extent
        return total
    

In [3]:
net = Net()
print(net)

Net(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


#### parameter 체크 (weight 5개, bias 5개)

In [4]:
params = list(net.parameters())
print(len(params))

10


In [5]:
params[0].size() # conv1's weight

torch.Size([6, 1, 5, 5])

In [6]:
params[1].size() # conv1's bias

torch.Size([6])

#### input 생성 (requires_grad=True로 연산 추적; PyTorch 0.4부터 Variable은 Tensor에 통합되어 감싸지 않아도 됨)

In [7]:
# Dummy input batch laid out as (nSample, nChannel, Height, Width).
# Since PyTorch 0.4 Variable is merged into Tensor, so the deprecated wrapper
# is unnecessary: requires_grad=True on the tensor itself enables autograd
# tracking and lets backward() populate input.grad.
# NOTE: the name shadows the builtin input(); it is kept because later cells
# refer to it.
input = torch.randn(1, 1, 32, 32, requires_grad=True)

#### 차원 변화 추적

In [8]:
out = net(input)
print(out)

conv2 > max_pooling 후의 x shpae	 torch.Size([1, 16, 5, 5])
after reshape of x	 torch.Size([1, 400])
after linear function	 torch.Size([1, 120])
tensor([[ 0.0200, -0.0337, -0.0310, -0.0758, -0.1669,  0.0494,  0.1285,  0.0257,
         -0.1026, -0.0031]], grad_fn=<AddmmBackward>)


#### 무작위 값으로 back propagation
#### 모든 매개변수의 변화도 버퍼(gradient buffer)를 0으로 설정

In [35]:
# Zero the gradient buffers of all parameters before backpropagating.
net.zero_grad()
# out has shape (1, 10), so backward() is given an explicit gradient of the
# same shape; random values are used here purely for demonstration.
out.backward(torch.randn(1, 10))

In [37]:
input.grad.size()

torch.Size([1, 1, 32, 32])

## 손실 함수 (Loss Function)

#### 대표적 예인 MSE

In [51]:
output = net(input)
output.size()

torch.Size([1, 10])

In [52]:
torch.arange(1, 11)

tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

#### numpy처럼 arange 메소드가 존재
#### output 이 floatTensor 이므로 형을 맞춰줘야함

In [70]:
# A dummy target, for example. It is built as float32 directly via dtype=
# instead of going through the deprecated Variable wrapper and an empty
# FloatTensor used as an out= buffer; the float dtype is needed so that it
# matches the network output when the loss is computed.
target = torch.arange(1, 11, dtype=torch.float32)
target

tensor([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.])

In [71]:
target.size()

torch.Size([10])

#### 차원 수를 output과 맞추기 위해 2차원으로 바꿈

In [72]:
target = target.view(1, -1) # make it the same shape as output
target

tensor([[ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.]])

In [73]:
target.size()

torch.Size([1, 10])

#### 어떤 방식으로 loss를 체크할 지 선언

In [74]:
criterion = nn.MSELoss()
print(criterion)

MSELoss()


In [75]:
type(target)

torch.Tensor

In [76]:
loss = criterion(output, target)
loss

tensor(38.2805, grad_fn=<MseLossBackward>)

#### loss 연산 추적

In [79]:
loss.grad_fn

<MseLossBackward at 0x21c48fe6358>

#### next_functions attribute로 이전 연산 추적

In [86]:
loss.grad_fn.next_functions

((<AddmmBackward at 0x21c490c1198>, 0),)

#### 반환은 (함수, 인덱스) 튜플 형태인 것을 확인 (0은 gradient 값이 아니라, 해당 함수의 몇 번째 입력으로 gradient가 전달되는지를 나타내는 인덱스)

In [87]:
loss.grad_fn.next_functions[0][1]

0

In [88]:
loss.grad_fn.next_functions[0][0]

<AddmmBackward at 0x21c490c1198>

#### 몇 단계 더 추적

In [96]:
loss.grad_fn.next_functions[0][0].next_functions

((<AccumulateGrad at 0x21c48fea3c8>, 0),
 (<ReluBackward0 at 0x21c48fea470>, 0),
 (<TBackward at 0x21c48fea550>, 0))

In [100]:
loss.grad_fn.next_functions[0][0].next_functions[0][0].next_functions

()

#### AccumulateGrad, TBackward가 아니라 ReluBackward0 쪽 next_functions를 따라 들어가야 이전 연산 과정이 보임

In [98]:
loss.grad_fn.next_functions[0][0].next_functions[1][0].next_functions

((<AddmmBackward at 0x21c48eae048>, 0),)

In [107]:
loss.grad_fn.next_functions[0][0].next_functions[1][0].next_functions[0][0].next_functions

((<AccumulateGrad at 0x21c48e9b5f8>, 0),
 (<ReluBackward0 at 0x21c48e9b630>, 0),
 (<TBackward at 0x21c48e9b668>, 0))

In [110]:
loss.grad_fn.next_functions[0][0].next_functions[1][0].next_functions[0][0].next_functions[1][0].next_functions[0][0].next_functions

((<AccumulateGrad at 0x21c48e8b898>, 0),
 (<ViewBackward at 0x21c48e8bbe0>, 0),
 (<TBackward at 0x21c48e8ba20>, 0))

In [102]:
loss.grad_fn.next_functions[0][0].next_functions[2][0].next_functions

((<AccumulateGrad at 0x21c48ea1710>, 0),)

In [105]:
loss.grad_fn.next_functions[0][0].next_functions[2][0].next_functions[0][0].next_functions

()

## 역전파 (Backprop)

#### 기존 변화도를 지우고 역전파를 계산해야함

In [111]:
net.zero_grad() # zeroes the gradient buffers of all parameters

In [112]:
# Show the effect of loss.backward() on one parameter's gradient buffer.
print('conv1.bias.grad before backward')
print(net.conv1.bias.grad)  # printed as zeros in the recorded output, after net.zero_grad()
loss.backward()
print('conv1.bias.grad after backward')
print(net.conv1.bias.grad)  # now holds the gradient of the loss w.r.t. conv1's bias

conv1.bias.grad before backward
tensor([0., 0., 0., 0., 0., 0.])
conv1.bias.grad after backward
tensor([ 0.0133,  0.0645, -0.0386, -0.0578, -0.0224,  0.1019])


## 가중치 갱신

#### 대표적으로 이용하는 경사하강법

In [113]:
# Plain gradient descent: weight <- weight - learning_rate * gradient,
# applied in place to every parameter of the network.
learning_rate = 0.01
for param in net.parameters():
    param.data.sub_(param.grad.data * learning_rate)

#### 이외에 다양한 가중치 갱신 방법들이 패키지에 내장

In [114]:
import torch.optim as optim

#### optimizer 생성 및 확률적 경사하강법(Stochastic Gradient Descent)

In [116]:
# SGD over all of net's parameters, with the same 0.01 learning rate that the
# manual update loop used.
optimizer = optim.SGD(net.parameters(), lr=0.01)

In [117]:
# One training iteration: clear old gradients, run the forward pass, compute
# the loss, backpropagate, then let the optimizer update the parameters.
optimizer.zero_grad()   # zero the gradient buffers
output = net(input)
loss = criterion(output, target)
loss.backward()  # fills the .grad buffer of each parameter
optimizer.step()    # Does the update