In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
%matplotlib inline
%config InlineBackend.figure_format='retina'
# Seed NumPy and PyTorch up front so that weight initialisation, the random
# demo inputs and DataLoader shuffling are repeatable after Restart & Run All.
# (GPU kernels may still introduce small run-to-run numeric differences.)
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
print ("PyTorch version:[%s]."%(torch.__version__))
# Use the first CUDA device when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print ("device:[%s]."%(device))

PyTorch version:[1.8.1].
device:[cuda:0].


gpu를 잘 쓰고 있군요

In [2]:
from torchvision import datasets, transforms
# Load both MNIST splits from ./data/ (download=True fetches them when absent).
# Every sample is passed through transforms.ToTensor().
mnist_train = datasets.MNIST(
    root='./data/',
    train=True,
    transform=transforms.ToTensor(),
    download=True,
)
mnist_test = datasets.MNIST(
    root='./data/',
    train=False,
    transform=transforms.ToTensor(),
    download=True,
)
# Print the summary repr of each split, followed by a blank line.
for split_title, split_data in (("mnist_train:\n", mnist_train), ("mnist_test:\n", mnist_test)):
    print(split_title, split_data, "\n")
print("Done")

mnist_train:
 Dataset MNIST
    Number of datapoints: 60000
    Root location: ./data/
    Split: Train
    StandardTransform
Transform: ToTensor() 

mnist_test:
 Dataset MNIST
    Number of datapoints: 10000
    Root location: ./data/
    Split: Test
    StandardTransform
Transform: ToTensor() 

Done


### Data Iterator
#### DataLoader(dataset, batch_size, shuffle=False, sampler=None, batch_sampler=None, num_workers=0, collate_fn=None, pin_memory=False, drop_last=False, timeout=0, worker_init_fn=None) 

In [3]:
BATCH_SIZE = 256
# Training batches are drawn in a fresh random order every epoch.
train_iter = torch.utils.data.DataLoader(mnist_train,batch_size=BATCH_SIZE,shuffle=True,num_workers=1)
# Evaluation only counts correct predictions, so sample order cannot change
# the result; keep the test loader in dataset order instead of shuffling it.
test_iter = torch.utils.data.DataLoader(mnist_test,batch_size=BATCH_SIZE,shuffle=False,num_workers=1)
print ("Done.")

Done.


### MLP model
#### torch.nn.Linear(in_feat, out_feat, bias=True, device=None, dtype = None)
1. in_features : input sample의 size 
2. out_features : output sample의 size 
3. bias = False이면, layer는 bias를 학습하지 않는다. default는 True이다.

In [4]:
class MultiLayerPerceptronClass(nn.Module):
    """Two-layer perceptron: Linear(xdim, hdim) -> ReLU -> Linear(hdim, ydim).

    The forward pass returns the raw output of the second linear layer; no
    softmax is applied inside the model.

    Args:
        name: label stored on the instance as ``self.name``.
        xdim: number of input features.
        hdim: number of hidden units.
        ydim: number of output scores.
    """

    def __init__(self, name='mlp', xdim=784, hdim=256, ydim=10):
        super(MultiLayerPerceptronClass, self).__init__()
        self.name = name
        self.xdim = xdim
        self.hdim = hdim
        self.ydim = ydim
        # The third positional parameter of nn.Linear is `bias`, not `device`,
        # so it is spelled out by keyword here. The layers are created on the
        # default device; callers move the whole module with `.to(device)`.
        self.lin_1 = nn.Linear(self.xdim, self.hdim, bias=True)
        self.lin_2 = nn.Linear(self.hdim, self.ydim, bias=True)
        self.init_param()

    def init_param(self):
        """Re-initialise both layers in place: Kaiming-normal weights, zero biases."""
        nn.init.kaiming_normal_(self.lin_1.weight)
        nn.init.zeros_(self.lin_1.bias)
        nn.init.kaiming_normal_(self.lin_2.weight)
        nn.init.zeros_(self.lin_2.bias)

    def forward(self, x):
        """Map inputs whose last dimension is ``xdim`` to ``ydim`` raw scores."""
        net = x
        net = self.lin_1(net)
        net = F.relu(net)
        net = self.lin_2(net)
        return net
    
# Build the network and move its parameters to the selected device.
model = MultiLayerPerceptronClass(
    name='mlp',
    xdim=784,
    hdim=256,
    ydim=10,
).to(device)
# Cross-entropy criterion applied to the model's raw output scores.
loss = nn.CrossEntropyLoss()
# Adam over every model parameter with a learning rate of 1e-3.
optm = optim.Adam(params=model.parameters(), lr=1e-3)

In [5]:
# Show the module tree: one line per registered sub-layer.
print(model)

MultiLayerPerceptronClass(
  (lin_1): Linear(in_features=784, out_features=256, bias=True)
  (lin_2): Linear(in_features=256, out_features=10, bias=True)
)


### Forward Path of the MLP Model

In [6]:
import random
# Two random rows of 784 features stand in for two flattened images.
x_numpy = np.random.rand(2, 784)
# np.random.rand yields float64; cast to float32 and move to the model's device.
x_torch = torch.from_numpy(x_numpy).float().to(device)
# Call the module itself rather than .forward() so registered hooks also run.
y_torch = model(x_torch) # per-class output scores produced by the forward pass
y_numpy = y_torch.detach().cpu().numpy() # tensor to numpy
print("x_numpy:\n", x_numpy)
print("x_torch:\n", x_torch)
print("y_torch:\n", y_torch)
print("y_numpy:\n", y_numpy)


x_numpy:
 [[0.94285789 0.20773299 0.04822789 ... 0.04476771 0.81844804 0.86634775]
 [0.19317892 0.34745886 0.39895159 ... 0.58484288 0.94875307 0.13006104]]
x_torch:
 tensor([[0.9429, 0.2077, 0.0482,  ..., 0.0448, 0.8184, 0.8663],
        [0.1932, 0.3475, 0.3990,  ..., 0.5848, 0.9488, 0.1301]],
       device='cuda:0')
y_torch:
 tensor([[-1.3884,  1.1287, -0.2183, -0.3486,  1.2217, -0.0940, -1.2542, -0.2093,
         -0.7002,  1.3421],
        [-0.6304,  0.7817, -0.9152, -0.5888,  1.5725, -0.0132, -1.8734, -0.5062,
         -0.6180,  1.1436]], device='cuda:0', grad_fn=<AddmmBackward>)
y_numpy:
 [[-1.3883781   1.128686   -0.2182925  -0.34859407  1.2216712  -0.09404826
  -1.2541772  -0.2093036  -0.7002028   1.3420562 ]
 [-0.63035476  0.78171146 -0.91522765 -0.5887705   1.5724785  -0.01320729
  -1.8733902  -0.50616753 -0.6179874   1.1436275 ]]


### Check Parameters
#### weight, bias of each layer

np.set_printoptions(precision=None, threshold=None, ...): precision : output 소수점 아래 자리를 고정하기 위해 사용함. 
#### Parameter를 확인하는 방법 2가지
1. torch.nn.Module.parameters() : layer 이름을 제외한 parameter 값에 대한 iterator를 리턴함  
2. torch.nn.Module.named_parameters() : (parameter name, parameter)의 튜플 iterator 리턴함. 

In [7]:
# Print NumPy floats with three decimals from here on.
np.set_printoptions(precision=3)
n_param = 0
# Walk the (name, tensor) pairs, reporting shape and the first five values of
# each parameter while accumulating the total element count.
for p_idx, named_pair in enumerate(model.named_parameters()):
    param_name, param = named_pair
    param_numpy = param.detach().cpu().numpy()
    flat_values = param_numpy.reshape(-1)
    n_param = n_param + len(flat_values)
    print("[%d] name:[%s] shape: [%s]."%(p_idx, param_name, param_numpy.shape))
    print("   val:%s"%(flat_values[:5]))
print("Total number of parameters:[%s]."%(format(n_param, ',d')))


[0] name:[lin_1.weight] shape: [(256, 784)].
   val:[-0.055  0.104 -0.032  0.041  0.089]
[1] name:[lin_1.bias] shape: [(256,)].
   val:[0. 0. 0. 0. 0.]
[2] name:[lin_2.weight] shape: [(10, 256)].
   val:[-0.08   0.032 -0.062 -0.045  0.055]
[3] name:[lin_2.bias] shape: [(10,)].
   val:[0. 0. 0. 0. 0.]
Total number of parameters:[203,530].


### Evaluation Function
torch.Tensor는 .requires_grad 속성을 True로 설정하면, 그 tensor에서 이뤄진 모든 연산을 추적한다. 따라서 계산이 완료된 후 .backward()를 호출하면 모든 gradient를 자동으로 계산할 수 있다. 이 변화도는 .grad 에 누적된다. 
1. with torch.no_grad() : 메모리 사용량을 줄이기 위해 코드 블럭을 감싼 것이다. 평가하는 단계에서는 gradient를 계산할 필요가 없기 때문이다. 

train과 evaluate에서 서로 다르게 동작해야 하는 것 : 1) Dropout layer, 2) BatchNorm layer

2. model.eval() : evaluation 과정에서 사용하지 않아야 할 layer를 알아서 off 시키는 역할을 한다. evaluation이 끝나면 다시 model.train()을 통해 train mode로 변경 해줘야 한다. 
3. data_iter : test_set image (=batch_in), label (=batch_out)
4. torch.max(input tensor, dim to reduce) : input tensor는 batch의 각 sample에 대한 class별 출력 값(logit)이다. 각 sample에서 max 값과 그 index를 각각 1-dim tensor로 리턴한다. `_`는 가장 큰 출력 값(softmax를 거치지 않았으므로 확률이 아닌 logit) 텐서이고, y_pred는 argmax idx이다. 

In [8]:
def func_eval(model, data_iter, device):
    """Return the classification accuracy of `model` over `data_iter`.

    The model is switched to eval mode for the duration of the pass and is put
    back into whichever mode (train or eval) it was in when the call started.

    Args:
        model: nn.Module that maps a (batch, 28*28) tensor to per-class scores.
        data_iter: iterable of (batch_in, batch_out) pairs; batch_in is
            reshaped to (-1, 28*28) and batch_out holds the class labels.
        device: device the inputs and labels are moved to before the forward pass.

    Returns:
        float: fraction of samples whose highest-scoring class equals the label.

    Raises:
        ZeroDivisionError: if `data_iter` yields no samples.
    """
    was_training = model.training
    model.eval()
    n_total, n_correct = 0, 0
    try:
        # No gradients are needed for evaluation, so skip graph construction.
        with torch.no_grad():
            for batch_in, batch_out in data_iter:
                label = batch_out.to(device)
                # Call the module itself (not .forward) so registered hooks run.
                model_pred = model(batch_in.view(-1, 28*28).to(device))
                # Predicted class = index of the largest score in each row.
                y_pred = model_pred.argmax(dim=1)
                n_correct += (y_pred == label).sum().item()
                n_total += batch_in.size(0)
    finally:
        # Restore the caller's mode instead of forcing train mode.
        model.train(was_training)
    val_accr = (n_correct/n_total)
    return val_accr
print("Done")

Done


In [9]:
# Random 4x5 tensor used to illustrate what torch.max returns along a dimension.
a = torch.randn(4,5)
a

tensor([[ 0.1914, -0.2675,  1.7128, -0.0170, -0.1897],
        [ 1.2396,  2.1350,  0.6858,  0.9363, -0.4073],
        [ 1.8576, -0.5630, -0.6915,  0.0913,  0.9568],
        [ 0.2810, -0.4587, -0.2019,  0.6088,  0.3123]])

In [10]:
# Reduce over dim 1: yields the per-row maxima and the column index of each.
torch.max(a, dim=1)

torch.return_types.max(
values=tensor([1.7128, 2.1350, 1.8576, 0.6088]),
indices=tensor([2, 1, 0, 3]))

### Initial Evaluation

In [11]:
# Re-draw the weights so the accuracies below describe an untrained network.
model.init_param()
# Evaluate on the training split first, then on the test split.
train_accr, test_accr = (
    func_eval(model, split_iter, device) for split_iter in (train_iter, test_iter)
)
print("train_accr:[%.3f] test_accr:[%.3f]."%(train_accr, test_accr))

train_accr:[0.085] test_accr:[0.082].


### Train

In [12]:
# Start from freshly initialised weights and make sure train mode is active.
# NOTE: init_param() only resets the weights; the state kept inside `optm` is
# not reset here, so re-running this cell continues with the old optimizer state.
model.init_param()
model.train()
EPOCHS, print_every = 10, 1
for epoch in range(EPOCHS):
    loss_sum = 0.0
    for batch_in, batch_out in train_iter:
        # Flatten each image to 28*28 features; call the module itself
        # (not .forward) so registered hooks run.
        y_pred = model(batch_in.view(-1, 28*28).to(device))
        loss_out = loss(y_pred, batch_out.to(device))
        
        # Clear stale gradients, back-propagate, then apply the update.
        optm.zero_grad()
        loss_out.backward()
        optm.step()
        
        # .item() gives a plain Python float, so the running sum does not keep
        # a reference to every batch's autograd history.
        loss_sum += loss_out.item()
    # Mean of the per-batch losses over the epoch.
    loss_avg = loss_sum/len(train_iter)
    
    # Report every `print_every` epochs and always on the final epoch.
    if ((epoch%print_every)==0) or (epoch==(EPOCHS-1)):
        train_accr = func_eval(model,train_iter,device)
        test_accr = func_eval(model,test_iter,device)
        print ("epoch:[%d] loss:[%.3f] train_accr:[%.3f] test_accr:[%.3f]."%
               (epoch,loss_avg,train_accr,test_accr))
print ("Done")    
        

epoch:[0] loss:[0.380] train_accr:[0.947] test_accr:[0.946].
epoch:[1] loss:[0.163] train_accr:[0.966] test_accr:[0.961].
epoch:[2] loss:[0.115] train_accr:[0.975] test_accr:[0.968].
epoch:[3] loss:[0.087] train_accr:[0.981] test_accr:[0.972].
epoch:[4] loss:[0.069] train_accr:[0.983] test_accr:[0.974].
epoch:[5] loss:[0.056] train_accr:[0.988] test_accr:[0.974].
epoch:[6] loss:[0.046] train_accr:[0.990] test_accr:[0.979].
epoch:[7] loss:[0.039] train_accr:[0.992] test_accr:[0.980].
epoch:[8] loss:[0.033] train_accr:[0.995] test_accr:[0.980].
epoch:[9] loss:[0.027] train_accr:[0.996] test_accr:[0.980].
Done
