In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchvision import datasets, transforms

from torch.utils.data import DataLoader

import os, time
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
class MyNet(nn.Module):
    def __init__(self, dim_in, dim_h1, dim_h2, dim_out):
        super(MyNet, self).__init__()
        self.fc1 = nn.Linear(dim_in,dim_h1)
        self.fc2 = nn.Linear(dim_h1,dim_h2)
        self.fc3 = nn.Linear(dim_h2,dim_out)
        
    def forward(self, x):
        h1 = self.fc1(x)
        h1 = F.relu(h1)
        h2 = self.fc2(h1)
        h2 = F.relu(h2)
        out = self.fc3(h2)
        # F.cross_entropy = F.log_softmax + F.nll_loss
        # 뒤에서 cross_entropy를 사용하려면, 여기서 softmax 빼야됩니다.
        # 이번엔 모델에서 softmax를 빼고, 뒤의 loss에서 cross_entropy loss를 활용해봅시다.
        return out

train() 함수

In [3]:
def train(model, data_loader, optimizer, criterion, device):
    model.train() # 모델을 학습모드로!
    trn_loss = 0
    
    for i, (x, y) in enumerate(data_loader):
        # Step 1. mini-batch에서 x,y 데이터를 얻고, 원하는 device에 위치시키기
        x = x.view(-1, 784).to(device) # x.shape: [batch_size,28,28] -> [batch_size, 784]
        y = y.to(device)
        
        # Step 2. gradient 초기화
        optimizer.zero_grad()
        
        # Step 3. Forward Propagation
        y_pred_prob = model(x)
        
        # Step 4. Loss Calculation
        loss = criterion(y_pred_prob, y)
        
        # Step 5. Gradient Calculation (Backpropagation)
        loss.backward()
        
        # Step 6. Update Parameter (by Gradient Descent)
        optimizer.step()
        
        # Step 7. trn_loss 변수에 mini-batch loss를 누적해서 합산
        trn_loss += loss.item()
        
    # Step 8. 데이터 한 개당 평균 train loss
    avg_trn_loss = trn_loss / len(data_loader.dataset)
    return avg_trn_loss

evaluate() 함수

In [4]:
def evaluate(model, data_loader, optimizer, criterion, device):
    model.eval() # 모델을 평가모드로!
    eval_loss = 0
    
    results_pred = []
    results_real = []
    with torch.no_grad(): # evaluate()함수에는 단순 forward propagation만 할 뿐, gradient 계산 필요 X.
        for i, (x, y) in enumerate(data_loader):
            # Step 1. mini-batch에서 x,y 데이터를 얻고, 원하는 device에 위치시키기
            x = x.view(-1,784).to(device) # x.shape: [batch_size,28,28] -> [batch_size, 784]
            y = y.to(device)

            # Step 2. Forward Propagation
            y_pred_prob = model(x)

            # Step 3. Loss Calculation
            loss = criterion(y_pred_prob, y)
            
            # Step 4. Predict label
            y_pred_label = torch.argmax(y_pred_prob, dim=1)
            
            # Step 5. Save real and predicte label
            results_pred.extend(y_pred_label.detach().cpu().numpy())
            results_real.extend(y.detach().cpu().numpy())
            
            # Step 6. eval_loss변수에 mini-batch loss를 누적해서 합산
            eval_loss += loss.item()

    # Step 7. 데이터 한 개당 평균 eval_loss와 accuracy구하기
    avg_eval_loss = eval_loss / len(data_loader.dataset)
    results_pred = np.array(results_pred)
    results_real = np.array(results_real)
    accuracy = np.sum(results_pred == results_real) / len(results_real)
    
    return avg_eval_loss, accuracy

In [5]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [6]:
# torchvision에서도 MNIST데이터를 제공합니다. 
# 이 데이터를 다운 받을 디렉토리(data_path) 존재 여부를 확인하고 존재하지 않으면 생성 
data_path = 'data'
if not os.path.exists(data_path):
    os.makedirs(data_path)
    
# data 변환 방법 선언 (data transform method)
# 아래 예시: numpy형태의 데이터를 받으면 걔를 tensor로 변환해줘!
transform = transforms.Compose([transforms.ToTensor()])

# dataset을 생성 (torchvision에서 제공하는 데이터를 다운 받고, 위의 방법대로 변환)
trn_dset = datasets.MNIST(root=data_path, train=True, transform=transform, download=True)
tst_dset = datasets.MNIST(root=data_path, train=False, transform=transform, download=True)

  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
# stochastic gradient descent 방식
model_sgd = MyNet(dim_in=784, dim_h1=50, dim_h2=100, dim_out=10)
model_sgd = model_sgd.to(device)

# adagrad 방식
model_ada = MyNet(dim_in=784, dim_h1=50, dim_h2=100, dim_out=10)
model_ada = model_ada.to(device)

# RMSProp 방식
model_rms = MyNet(dim_in=784, dim_h1=50, dim_h2=100, dim_out=10)
model_rms = model_rms.to(device)

# adam 방식
model_adam = MyNet(dim_in=784, dim_h1=50, dim_h2=100, dim_out=10)
model_adam = model_adam.to(device)

In [9]:
loss_func = nn.CrossEntropyLoss(reduction='sum')

In [10]:
save_dir = 'models'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

In [11]:
N_EPOCHS = 10
LR = 2e-4
BATCH_SIZE = 2**9

In [12]:
trn_loader = DataLoader(trn_dset, batch_size = BATCH_SIZE, shuffle=True, drop_last=False)
tst_loader = DataLoader(tst_dset, batch_size = BATCH_SIZE, shuffle=False, drop_last=False)

In [13]:
opt_sgd = optim.SGD(model_sgd.parameters(), lr = LR)
opt_ada = optim.Adagrad(model_ada.parameters(), lr = LR)
opt_rms = optim.RMSprop(model_rms.parameters(), lr = LR)
opt_adam = optim.Adam(model_adam.parameters(), lr = LR)

In [14]:
param_dict = {
    'SGD': {'model': model_sgd, 'optimizer': opt_sgd},
    'Adagrad': {'model': model_ada, 'optimizer': opt_ada},
    'RMSProp': {'model': model_rms, 'optimizer': opt_rms},
    'Adam': {'model': model_adam, 'optimizer': opt_adam},
}

In [15]:
best_val_loss = float('inf')
keys = ['SGD', 'Adagrad', 'RMSProp', 'Adam']
result_dict = {
    'SGD': {'trn_loss': [], 'val_loss': [], 'val_acc': []},
    'Adagrad': {'trn_loss': [], 'val_loss': [], 'val_acc': []},
    'RMSProp': {'trn_loss': [], 'val_loss': [], 'val_acc': []},
    'Adam': {'trn_loss': [], 'val_loss': [], 'val_acc': []},
}

for key in keys:
    print(f'optimizer: {key}')
    start_time = time.time()
    for epoch in range(N_EPOCHS):
        trn_loss = train(model=param_dict[key]['model'], 
                         data_loader=trn_loader, 
                         optimizer=param_dict[key]['optimizer'], 
                         criterion=loss_func,
                         device=device)

        val_loss, accuracy = evaluate(model=param_dict[key]['model'], 
                                      data_loader=tst_loader, 
                                      optimizer=param_dict[key]['optimizer'], 
                                      criterion=loss_func,
                                      device=device)

        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    #     if val_loss < best_val_loss:
    #         best_val_loss = val_loss
    #         torch.save(model.state_dict(), f'{save_dir}/my_model.pt')

        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {trn_loss:.3f} | Test Loss: {val_loss:.3f} | Test Acc: {100*accuracy:.3f}% ')

        result_dict[key]['trn_loss'] = trn_loss
        result_dict[key]['val_loss'] = val_loss
        result_dict[key]['val_acc'] = accuracy

optimizer: SGD
Epoch: 01 | Time: 0m 5s
	Train Loss: 1.497 | Test Loss: 0.609 | Test Acc: 83.730% 
Epoch: 02 | Time: 0m 9s
	Train Loss: 0.477 | Test Loss: 0.374 | Test Acc: 89.240% 
Epoch: 03 | Time: 0m 13s
	Train Loss: 0.366 | Test Loss: 0.332 | Test Acc: 90.620% 
Epoch: 04 | Time: 0m 17s
	Train Loss: 0.324 | Test Loss: 0.296 | Test Acc: 91.660% 
Epoch: 05 | Time: 0m 20s
	Train Loss: 0.297 | Test Loss: 0.273 | Test Acc: 92.490% 
Epoch: 06 | Time: 0m 24s
	Train Loss: 0.276 | Test Loss: 0.261 | Test Acc: 92.610% 
Epoch: 07 | Time: 0m 28s
	Train Loss: 0.260 | Test Loss: 0.244 | Test Acc: 93.010% 
Epoch: 08 | Time: 0m 32s
	Train Loss: 0.243 | Test Loss: 0.230 | Test Acc: 93.390% 
Epoch: 09 | Time: 0m 36s
	Train Loss: 0.229 | Test Loss: 0.218 | Test Acc: 93.510% 
Epoch: 10 | Time: 0m 40s
	Train Loss: 0.216 | Test Loss: 0.205 | Test Acc: 93.880% 
optimizer: Adagrad
Epoch: 01 | Time: 0m 4s
	Train Loss: 2.250 | Test Loss: 2.206 | Test Acc: 51.900% 
Epoch: 02 | Time: 0m 8s
	Train Loss: 2.174 | 

---

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

import os, time
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
fc1 = nn.Linear(3,2)

In [3]:
[x for x in fc1.parameters()]

[Parameter containing:
 tensor([[-0.4645,  0.1620,  0.3790],
         [-0.0798, -0.2333, -0.0215]], requires_grad=True),
 Parameter containing:
 tensor([-0.3341, -0.0347], requires_grad=True)]

In [4]:
fc1.weight

Parameter containing:
tensor([[-0.4645,  0.1620,  0.3790],
        [-0.0798, -0.2333, -0.0215]], requires_grad=True)

In [5]:
fc1.bias

Parameter containing:
tensor([-0.3341, -0.0347], requires_grad=True)

In [6]:
# 특정 분포에서 random하게 샘플링해서 초기화
nn.init.normal_(fc1.weight, mean=0.0, std=1.0)

# 특정 값으로 초기화
nn.init.zeros_(fc1.bias)
nn.init.constant_(fc1.bias, 0)

Parameter containing:
tensor([0., 0.], requires_grad=True)

In [7]:
fc1.weight

Parameter containing:
tensor([[ 0.3941, -0.6126, -1.4673],
        [ 1.0696, -0.2772, -0.0807]], requires_grad=True)

In [8]:
fc1.bias

Parameter containing:
tensor([0., 0.], requires_grad=True)

In [9]:
tmp_tensor = torch.tensor([[1.,2.,3.],[4.,5.,6.]])

In [10]:
fc1.weight.data = tmp_tensor
fc1.weight

Parameter containing:
tensor([[1., 2., 3.],
        [4., 5., 6.]], requires_grad=True)

In [11]:
nn.init.xavier_normal_(fc1.weight)

Parameter containing:
tensor([[-0.0227,  0.3041, -0.3069],
        [ 0.8272,  0.5798, -0.5675]], requires_grad=True)

In [12]:
nn.init.kaiming_normal_(fc1.weight)

Parameter containing:
tensor([[-0.9145,  0.2248,  0.1328],
        [-1.2407, -1.6195, -0.0074]], requires_grad=True)

In [13]:
class MyNet(nn.Module):
    def __init__(self, dim_in=784, dim_h1=50, dim_h2=100, dim_out=10):
        super(MyNet, self).__init__()
        self.fc1 = nn.Linear(dim_in,dim_h1)
        self.fc2 = nn.Linear(dim_h1,dim_h2)
        self.fc3 = nn.Linear(dim_h2,dim_out)
        self.apply(self._init_weights) # 모델을 만들때, self._init_weights()를 그 즉시 호출하여 parameter 초기화
        
    def _init_weights(self, submodule):
        if isinstance(submodule, nn.Linear): # submodule이 nn.Linear에서 생성된 객체(혹은 인스턴스이면)
            nn.init.kaiming_normal_(submodule.weight) #해당 submodule의 weight는 He Initialization으로 초기화
            if submodule.bias is not None:
                submodule.bias.data.fill_(0.01) # 해당 submodule의 bias는 0.01로 초기화
            
    def forward(self, x):
        h1 = self.fc1(x)
        h1 = F.relu(h1)
        h2 = self.fc2(h1)
        h2 = F.relu(h2)
        out = self.fc3(h2)
        
        return out

In [14]:
# model의 module들은 뭐가 있는지 봅시다.
model = MyNet()
[x for x in model.modules()]

[MyNet(
   (fc1): Linear(in_features=784, out_features=50, bias=True)
   (fc2): Linear(in_features=50, out_features=100, bias=True)
   (fc3): Linear(in_features=100, out_features=10, bias=True)
 ),
 Linear(in_features=784, out_features=50, bias=True),
 Linear(in_features=50, out_features=100, bias=True),
 Linear(in_features=100, out_features=10, bias=True)]

In [15]:
def train(model, data_loader, optimizer, criterion, device):
    model.train() # 모델을 학습모드로!
    trn_loss = 0
    for i, (x, y) in enumerate(data_loader):
        # Step 1. mini-batch에서 x,y 데이터를 얻고, 원하는 device에 위치시키기
        x = x.view(-1, 784).to(device) # x.shape: [batch_size,28,28] -> [batch_size, 784]
        y = y.to(device)
        
        # Step 2. gradient 초기화
        optimizer.zero_grad()
        
        # Step 3. Forward Propagation
        y_pred_prob = model(x)
        
        # Step 4. Loss Calculation
        loss = criterion(y_pred_prob, y)
        
        # Step 5. Gradient Calculation (Backpropagation)
        loss.backward()
        
        # Step 6. Update Parameter (by Gradient Descent)
        optimizer.step()
        
        # Step 7. trn_loss 변수에 mini-batch loss를 누적해서 합산
        trn_loss += loss.item()
        
    # Step 8. 데이터 한 개당 평균 train loss
    avg_trn_loss = trn_loss / len(data_loader.dataset)
    return avg_trn_loss

In [16]:
def evaluate(model, data_loader, optimizer, criterion, device):
    model.eval() # 모델을 평가모드로!
    eval_loss = 0
    
    results_pred = []
    results_real = []
    with torch.no_grad(): # evaluate()함수에는 단순 forward propagation만 할 뿐, gradient 계산 필요 X.
        for i, (x, y) in enumerate(data_loader):
            # Step 1. mini-batch에서 x,y 데이터를 얻고, 원하는 device에 위치시키기
            x = x.view(-1,784).to(device) # x.shape: [batch_size,28,28] -> [batch_size, 784]
            y = y.to(device)

            # Step 2. Forward Propagation
            y_pred_prob = model(x)

            # Step 3. Loss Calculation
            loss = criterion(y_pred_prob, y)
            
            # Step 4. Predict label
            y_pred_label = torch.argmax(y_pred_prob, dim=1)
            
            # Step 5. Save real and predicte label
            results_pred.extend(y_pred_label.detach().cpu().numpy())
            results_real.extend(y.detach().cpu().numpy())
            
            # Step 6. eval_loss변수에 mini-batch loss를 누적해서 합산
            eval_loss += loss.item()

    # Step 7. 데이터 한 개당 평균 eval_loss와 accuracy구하기
    avg_eval_loss = eval_loss / len(data_loader.dataset)
    results_pred = np.array(results_pred)
    results_real = np.array(results_real)
    accuracy = np.sum(results_pred == results_real) / len(results_real)
    
    return avg_eval_loss, accuracy

In [17]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [18]:
# torchvision에서도 MNIST데이터를 제공합니다. 
# 이 데이터를 다운 받을 디렉토리(data_path) 존재 여부를 확인하고 존재하지 않으면 생성 
data_path = 'data'
if not os.path.exists(data_path):
    os.makedirs(data_path)
    
# data 변환 방법 선언 (data transform method)
# 아래 예시: numpy형태의 데이터를 받으면 걔를 tensor로 변환해줘!
transform = transforms.Compose([transforms.ToTensor()])

# dataset을 생성 (torchvision에서 제공하는 데이터를 다운 받고, 위의 방법대로 변환)
trn_dset = datasets.MNIST(root=data_path, train=True, transform=transform, download=True)
tst_dset = datasets.MNIST(root=data_path, train=False, transform=transform, download=True)

  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


In [19]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [20]:
model = MyNet(dim_in=784, dim_h1=50, dim_h2=100, dim_out=10)
model = model.to(device)

In [21]:
save_dir = 'models'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

In [22]:
N_EPOCHS = 10
LR = 2e-4
BATCH_SIZE = 2**9

In [23]:
loss_func = nn.CrossEntropyLoss(reduction='sum')

In [24]:
trn_loader = DataLoader(trn_dset, batch_size = BATCH_SIZE, shuffle=True, drop_last=False)
tst_loader = DataLoader(tst_dset, batch_size = BATCH_SIZE, shuffle=False, drop_last=False)

In [25]:
my_opt = optim.Adam(model.parameters(), lr = LR)

In [26]:
best_val_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    trn_loss = train(model=model, 
                     data_loader=trn_loader, 
                     optimizer=my_opt, 
                     criterion=loss_func,
                     device=device)
    val_loss, accuracy = evaluate(model=model, 
                                  data_loader=tst_loader, 
                                  optimizer=my_opt, 
                                  criterion=loss_func,
                                  device=device)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), f'{save_dir}/my_model2.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {trn_loss:.3f} | Test Loss: {val_loss:.3f} | Test Acc: {100*accuracy:.3f}% ')

Epoch: 01 | Time: 0m 4s
	Train Loss: 1.434 | Test Loss: 0.702 | Test Acc: 83.020% 
Epoch: 02 | Time: 0m 3s
	Train Loss: 0.523 | Test Loss: 0.391 | Test Acc: 89.990% 
Epoch: 03 | Time: 0m 3s
	Train Loss: 0.359 | Test Loss: 0.312 | Test Acc: 91.560% 
Epoch: 04 | Time: 0m 3s
	Train Loss: 0.302 | Test Loss: 0.274 | Test Acc: 92.320% 
Epoch: 05 | Time: 0m 3s
	Train Loss: 0.270 | Test Loss: 0.251 | Test Acc: 92.880% 
Epoch: 06 | Time: 0m 3s
	Train Loss: 0.246 | Test Loss: 0.230 | Test Acc: 93.510% 
Epoch: 07 | Time: 0m 3s
	Train Loss: 0.229 | Test Loss: 0.216 | Test Acc: 93.820% 
Epoch: 08 | Time: 0m 3s
	Train Loss: 0.214 | Test Loss: 0.206 | Test Acc: 94.060% 
Epoch: 09 | Time: 0m 3s
	Train Loss: 0.201 | Test Loss: 0.196 | Test Acc: 94.250% 
Epoch: 10 | Time: 0m 3s
	Train Loss: 0.191 | Test Loss: 0.187 | Test Acc: 94.400% 


---

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

import os, time
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
if torch.cuda.is_available():
    device=torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

cuda:0


In [3]:
class MyNet(nn.Module):
    def __init__(self, dim_in=784, dim_h1=50, dim_h2=100, dim_out=10):
        super(MyNet, self).__init__()
        self.fc1 = nn.Linear(dim_in,dim_h1)
        self.fc2 = nn.Linear(dim_h1,dim_h2)
        self.fc3 = nn.Linear(dim_h2,dim_out)
        self.apply(self._init_weights) # 모델을 만들때, self._init_weights()를 호출하여 parameter 초기화
        
    def _init_weights(self, submodule):
        if isinstance(submodule, nn.Linear): # submodule이 nn.Linear에서 생성된 객체(혹은 인스턴스이면)
            nn.init.kaiming_normal_(submodule.weight) #해당 submodule의 weight는 He Initialization으로 초기화
            if submodule.bias is not None:
                submodule.bias.data.fill_(0.01) # 해당 submodule의 bias는 0.01로 초기화
            
        
    def forward(self, x):
        h1 = self.fc1(x)
        h1 = F.relu(h1)
        h2 = self.fc2(h1)
        h2 = F.relu(h2)
        out = self.fc3(h2)
        # F.cross_entropy = F.log_softmax + F.nll_loss
        # 뒤에서 cross_entropy를 사용하려면, 여기서 softmax 빼야됩니다.
        
        return out

In [4]:
class MyNet_BN(nn.Module):
    def __init__(self, dim_in=784, dim_h1=50, dim_h2=100, dim_out=10):
        super(MyNet_BN, self).__init__()
        self.fc1 = nn.Linear(dim_in,dim_h1)
        self.fc2 = nn.Linear(dim_h1,dim_h2)
        self.fc3 = nn.Linear(dim_h2,dim_out)
        self.bn1 = nn.BatchNorm1d(dim_h1) # input -> hidden1로 가는 과정에서 필요한 batchnorm layer
        self.bn2 = nn.BatchNorm1d(dim_h2) # hidden1 -> hidden2로 가는 과정에서 필요한 batchnorm layer
#         self.bn3 = nn.BatchNorm1d(10) # output layer에서는 batchnorm이 통상적으로 잘 사용되지 않는 것 같습니다.
        self.apply(self._init_weights) # 모델을 만들때, self._init_weights()를 호출하여 parameter 초기화
        
    def _init_weights(self, submodule):
        if isinstance(submodule, nn.Linear): # submodule이 nn.Linear에서 생성된 객체(혹은 인스턴스이면)
            nn.init.kaiming_normal_(submodule.weight) #해당 submodule의 weight는 He Initialization으로 초기화
            if submodule.bias is not None:
                submodule.bias.data.fill_(0.01) # 해당 submodule의 bias는 0.01로 초기화
            
        
    def forward(self, x):
        h1 = self.fc1(x)
        h1 = self.bn1(h1) # batchnorm은 affine연산(matrix multiplication)이후 사용.(activation전에!)
        h1 = F.relu(h1)
        h2 = self.fc2(h1)
        h2 = self.bn2(h2) # batchnorm은 affine연산(matrix multiplication)이후 사용.(activation전에!)
        h2 = F.relu(h2)
        out = self.fc3(h2)
        # F.cross_entropy = F.log_softmax + F.nll_loss
        # 뒤에서 cross_entropy를 사용하려면, 여기서 softmax 빼야됩니다.
        return out

In [5]:
def train(model, data_loader, optimizer, criterion, device):
    model.train() # 모델을 학습모드로! BN이 있는 모델을 학습할 때, 반드시 필요함
    trn_loss = 0
    for i, (x, y) in enumerate(data_loader):
        # Step 1. mini-batch에서 x,y 데이터를 얻고, 원하는 device에 위치시키기
        x = x.view(-1, 784).to(device) # x.shape: [batch_size,28,28] -> [batch_size, 784]
        y = y.to(device)
        
        # Step 2. gradient 초기화
        optimizer.zero_grad()
        
        # Step 3. Forward Propagation
        y_pred_prob = model(x)
        
        # Step 4. Loss Calculation
        loss = criterion(y_pred_prob, y)
        
        # Step 5. Gradient Calculation (Backpropagation)
        loss.backward()
        
        # Step 6. Update Parameter (by Gradient Descent)
        optimizer.step()
        
        # Step 7. trn_loss 변수에 mini-batch loss를 누적해서 합산
        trn_loss += loss.item()
        
    # Step 8. 데이터 한 개당 평균 train loss
    avg_trn_loss = trn_loss / len(data_loader.dataset)
    return avg_trn_loss

In [6]:
def evaluate(model, data_loader, optimizer, criterion, device):
    model.eval() # 모델을 평가모드로! BN이 있는 모델을 학습할 때, 반드시 필요함
    eval_loss = 0
    
    results_pred = []
    results_real = []
    with torch.no_grad(): # evaluate()함수에는 단순 forward propagation만 할 뿐, gradient 계산 필요 X.
        for i, (x, y) in enumerate(data_loader):
            # Step 1. mini-batch에서 x,y 데이터를 얻고, 원하는 device에 위치시키기
            x = x.view(-1,784).to(device) # x.shape: [batch_size,28,28] -> [batch_size, 784]
            y = y.to(device)

            # Step 2. Forward Propagation
            y_pred_prob = model(x)

            # Step 3. Loss Calculation
            loss = criterion(y_pred_prob, y)
            
            # Step 4. Predict label
            y_pred_label = torch.argmax(y_pred_prob, dim=1)
            
            # Step 5. Save real and predicte label
            results_pred.extend(y_pred_label.detach().cpu().numpy())
            results_real.extend(y.detach().cpu().numpy())
            
            # Step 6. eval_loss변수에 mini-batch loss를 누적해서 합산
            eval_loss += loss.item()

    # Step 7. 데이터 한 개당 평균 eval_loss와 accuracy구하기
    avg_eval_loss = eval_loss / len(data_loader.dataset)
    results_pred = np.array(results_pred)
    results_real = np.array(results_real)
    accuracy = np.sum(results_pred == results_real) / len(results_real)
    
    return avg_eval_loss, accuracy

In [7]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [8]:
# torchvision에서도 MNIST데이터를 제공합니다. 
# 이 데이터를 다운 받을 디렉토리(data_path) 존재 여부를 확인하고 존재하지 않으면 생성 
data_path = 'data'
if not os.path.exists(data_path):
    os.makedirs(data_path)
    
# data 변환 방법 선언 (data transform method)
# 아래 예시: numpy형태의 데이터를 받으면 걔를 tensor로 변환해줘!
transform = transforms.Compose([transforms.ToTensor()])

# dataset을 생성 (torchvision에서 제공하는 데이터를 다운 받고, 위의 방법대로 변환)
trn_dset = datasets.MNIST(root=data_path, train=True, transform=transform, download=True)
tst_dset = datasets.MNIST(root=data_path, train=False, transform=transform, download=True)

  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [10]:
# without batchnorm
model = MyNet(dim_in=784, dim_h1=50, dim_h2=100, dim_out=10)
model = model.to(device)

# with batchnorm
model_bn = MyNet_BN(dim_in=784, dim_h1=50, dim_h2=100, dim_out=10)
model_bn = model_bn.to(device)

In [11]:
loss_func = nn.CrossEntropyLoss(reduction='sum')

In [12]:
save_dir = 'models'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

In [13]:
N_EPOCHS = 10
LR = 2e-4
BATCH_SIZE = 2**9

In [14]:
trn_loader = DataLoader(trn_dset, batch_size = BATCH_SIZE, shuffle=True, drop_last=False)
tst_loader = DataLoader(tst_dset, batch_size = BATCH_SIZE, shuffle=False, drop_last=False)

In [15]:
my_opt = optim.Adam(model.parameters(), lr = LR)
my_opt_bn = optim.Adam(model_bn.parameters(), lr = LR)

In [16]:
best_val_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    trn_loss = train(model=model, 
                     data_loader=trn_loader, 
                     optimizer=my_opt, 
                     criterion=loss_func,
                     device=device)
    val_loss, accuracy = evaluate(model=model, 
                                  data_loader=tst_loader, 
                                  optimizer=my_opt, 
                                  criterion=loss_func,
                                  device=device)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), f'{save_dir}/my_model3_1.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {trn_loss:.3f} | Test Loss: {val_loss:.3f} | Test Acc: {100*accuracy:.3f}% ')

Epoch: 01 | Time: 0m 4s
	Train Loss: 1.511 | Test Loss: 0.750 | Test Acc: 83.210% 
Epoch: 02 | Time: 0m 4s
	Train Loss: 0.550 | Test Loss: 0.401 | Test Acc: 89.950% 
Epoch: 03 | Time: 0m 4s
	Train Loss: 0.364 | Test Loss: 0.307 | Test Acc: 91.860% 
Epoch: 04 | Time: 0m 3s
	Train Loss: 0.297 | Test Loss: 0.265 | Test Acc: 92.690% 
Epoch: 05 | Time: 0m 4s
	Train Loss: 0.260 | Test Loss: 0.238 | Test Acc: 93.120% 
Epoch: 06 | Time: 0m 3s
	Train Loss: 0.235 | Test Loss: 0.218 | Test Acc: 93.700% 
Epoch: 07 | Time: 0m 4s
	Train Loss: 0.215 | Test Loss: 0.204 | Test Acc: 93.990% 
Epoch: 08 | Time: 0m 3s
	Train Loss: 0.200 | Test Loss: 0.190 | Test Acc: 94.450% 
Epoch: 09 | Time: 0m 3s
	Train Loss: 0.187 | Test Loss: 0.182 | Test Acc: 94.660% 
Epoch: 10 | Time: 0m 4s
	Train Loss: 0.176 | Test Loss: 0.172 | Test Acc: 94.850% 


In [17]:
best_val_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    trn_loss = train(model=model_bn, 
                     data_loader=trn_loader, 
                     optimizer=my_opt_bn, 
                     criterion=loss_func,
                     device=device)
    val_loss, accuracy = evaluate(model=model_bn, 
                                  data_loader=tst_loader, 
                                  optimizer=my_opt_bn, 
                                  criterion=loss_func,
                                  device=device)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model_bn.state_dict(), f'{save_dir}/my_model3_2.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {trn_loss:.3f} | Test Loss: {val_loss:.3f} | Test Acc: {100*accuracy:.3f}% ')

Epoch: 01 | Time: 0m 3s
	Train Loss: 1.369 | Test Loss: 0.725 | Test Acc: 84.160% 
Epoch: 02 | Time: 0m 3s
	Train Loss: 0.572 | Test Loss: 0.428 | Test Acc: 90.120% 
Epoch: 03 | Time: 0m 4s
	Train Loss: 0.386 | Test Loss: 0.319 | Test Acc: 92.200% 
Epoch: 04 | Time: 0m 4s
	Train Loss: 0.305 | Test Loss: 0.265 | Test Acc: 93.180% 
Epoch: 05 | Time: 0m 4s
	Train Loss: 0.257 | Test Loss: 0.233 | Test Acc: 93.740% 
Epoch: 06 | Time: 0m 4s
	Train Loss: 0.226 | Test Loss: 0.208 | Test Acc: 94.320% 
Epoch: 07 | Time: 0m 4s
	Train Loss: 0.202 | Test Loss: 0.190 | Test Acc: 94.690% 
Epoch: 08 | Time: 0m 4s
	Train Loss: 0.183 | Test Loss: 0.175 | Test Acc: 95.090% 
Epoch: 09 | Time: 0m 4s
	Train Loss: 0.168 | Test Loss: 0.163 | Test Acc: 95.320% 
Epoch: 10 | Time: 0m 4s
	Train Loss: 0.154 | Test Loss: 0.155 | Test Acc: 95.510% 


---

In [None]:
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.2)

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

import os, time
import numpy as np
from matplotlib import pyplot as plt

In [3]:
class MyNet(nn.Module):
    def __init__(self, dim_in, dim_h1, dim_h2, dim_out):
        super(MyNet, self).__init__()
        self.fc1 = nn.Linear(dim_in,dim_h1)
        self.fc2 = nn.Linear(dim_h1,dim_h2)
        self.fc3 = nn.Linear(dim_h2,dim_out)
        
    def forward(self, x):
        h1 = self.fc1(x)
        h1 = F.relu(h1)
        h2 = self.fc2(h1)
        h2 = F.relu(h2)
        out = self.fc3(h2)
        # F.cross_entropy = F.log_softmax + F.nll_loss
        # 뒤에서 cross_entropy를 사용하려면, 여기서 softmax 빼야됩니다.
        # 이번엔 모델에서 softmax를 빼고, 뒤의 loss에서 cross_entropy loss를 활용해봅시다.
        return out

In [4]:
def train(model, data_loader, optimizer, criterion, device):
    model.train() # 모델을 학습모드로!
    trn_loss = 0
    for i, (x, y) in enumerate(data_loader):
        # Step 1. mini-batch에서 x,y 데이터를 얻고, 원하는 device에 위치시키기
        x = x.view(-1, 784).to(device) # x.shape: [batch_size,28,28] -> [batch_size, 784]
        y = y.to(device)
        
        # Step 2. gradient 초기화
        optimizer.zero_grad()
        
        # Step 3. Forward Propagation
        y_pred_prob = model(x)
        
        # Step 4. Loss Calculation
        loss = criterion(y_pred_prob, y)
        
        # Step 5. Gradient Calculation (Backpropagation)
        loss.backward()
        
        # Step 6. Update Parameter (by Gradient Descent)
        optimizer.step()
        
        # Step 7. trn_loss 변수에 mini-batch loss를 누적해서 합산
        trn_loss += loss.item()
        
    # Step 8. 데이터 한 개당 평균 train loss
    avg_trn_loss = trn_loss / len(data_loader.dataset)
    return avg_trn_loss

In [5]:
def evaluate(model, data_loader, optimizer, criterion, device):
    model.eval() # 모델을 평가모드로!
    eval_loss = 0
    
    results_pred = []
    results_real = []
    with torch.no_grad(): # evaluate()함수에는 단순 forward propagation만 할 뿐, gradient 계산 필요 X.
        for i, (x, y) in enumerate(data_loader):
            # Step 1. mini-batch에서 x,y 데이터를 얻고, 원하는 device에 위치시키기
            x = x.view(-1,784).to(device) # x.shape: [batch_size,28,28] -> [batch_size, 784]
            y = y.to(device)

            # Step 2. Forward Propagation
            y_pred_prob = model(x)

            # Step 3. Loss Calculation
            loss = criterion(y_pred_prob, y)
            
            # Step 4. Predict label
            y_pred_label = torch.argmax(y_pred_prob, dim=1)
            
            # Step 5. Save real and predicte label
            results_pred.extend(y_pred_label.detach().cpu().numpy())
            results_real.extend(y.detach().cpu().numpy())
            
            # Step 6. eval_loss변수에 mini-batch loss를 누적해서 합산
            eval_loss += loss.item()

    # Step 7. 데이터 한 개당 평균 eval_loss와 accuracy구하기
    avg_eval_loss = eval_loss / len(data_loader.dataset)
    results_pred = np.array(results_pred)
    results_real = np.array(results_real)
    accuracy = np.sum(results_pred == results_real) / len(results_real)
    
    return avg_eval_loss, accuracy

In [6]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [7]:
# torchvision에서도 MNIST데이터를 제공합니다. 
# 이 데이터를 다운 받을 디렉토리(data_path) 존재 여부를 확인하고 존재하지 않으면 생성 
data_path = 'data'
if not os.path.exists(data_path):
    os.makedirs(data_path)
    
# data 변환 방법 선언 (data transform method)
# 아래 예시: numpy형태의 데이터를 받으면 걔를 tensor로 변환해줘!
transform = transforms.Compose([transforms.ToTensor()])

# dataset을 생성 (torchvision에서 제공하는 데이터를 다운 받고, 위의 방법대로 변환)
trn_dset = datasets.MNIST(root=data_path, train=True, transform=transform, download=True)
tst_dset = datasets.MNIST(root=data_path, train=False, transform=transform, download=True)

  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [11]:
model = MyNet(dim_in=784, dim_h1=50, dim_h2=100, dim_out=10)
model = model.to(device)

In [12]:
batch_size = 2**9
trn_loader = DataLoader(trn_dset, batch_size = batch_size, shuffle=True, drop_last=False)
tst_loader = DataLoader(tst_dset, batch_size = batch_size, shuffle=False, drop_last=False)

In [13]:
my_opt = torch.optim.SGD(model.parameters(), lr=0.01)
scheduler = optim.lr_scheduler.StepLR(my_opt, step_size=10, gamma=0.2)

In [17]:
n_epochs = 10
for epoch in range(n_epochs):
    model.train()
    for batch_idx, (x_batch, y_batch) in enumerate(trn_loader):
        x_batch = x_batch.reshape(-1,784).to(device)
        y_batch = y_batch.to(device)
        my_opt.zero_grad()
        y_batch_prob = model(x_batch)
        loss = F.nll_loss(y_batch_prob, y_batch)
        loss.backward()
        my_opt.step()
        if (batch_idx+1)%100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(epoch, 
                                                                           batch_idx * len(x_batch), 
                                                                           len(trn_loader.dataset),
                                                                           100 * batch_idx / len(trn_loader),
                                                                           loss.item()))
    # 매 epoch이 끝날때 결과 찍기
    print('Train Epoch: {} [{}/{} (100%)]\tLoss: {:.6f}'.format(epoch, 
                                                                   len(trn_loader.dataset), 
                                                                   len(trn_loader.dataset),
                                                                loss.item()))
    model.eval()
    y_pred_list = []
    y_real_list = []
    tst_loss = 0
    with torch.no_grad():
        for batch_idx, (x_batch, y_batch) in enumerate(tst_loader):
            x_batch = x_batch.reshape(-1,784).to(device)
            y_batch = y_batch.to(device)
            y_batch_prob = model(x_batch)
            y_batch_pred = y_batch_prob.argmax(dim=1)  # 수정된 부분
            loss = F.nll_loss(y_batch_prob, y_batch, reduction='sum')
            tst_loss += loss
            
            y_pred_list.append(y_batch_pred.cpu().numpy())  # 수정된 부분
            y_real_list.append(y_batch.cpu().numpy())  # 수정된 부분
            
        y_real = np.concatenate([x for x in y_real_list], axis=0)
        y_pred = np.concatenate([x for x in y_pred_list], axis=0)
        tst_loss /= y_real.shape[0]
        correct  = np.sum(y_real == y_pred)
        accuracy = 100*correct / len(tst_loader.dataset)
        
        print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(tst_loss, 
                                                                                     correct, 
                                                                                     len(tst_loader.dataset),
                                                                                     accuracy))



Test set: Average loss: nan, Accuracy: 980/10000 (10%)


Test set: Average loss: nan, Accuracy: 980/10000 (10%)


Test set: Average loss: nan, Accuracy: 980/10000 (10%)


Test set: Average loss: nan, Accuracy: 980/10000 (10%)


Test set: Average loss: nan, Accuracy: 980/10000 (10%)


Test set: Average loss: nan, Accuracy: 980/10000 (10%)


Test set: Average loss: nan, Accuracy: 980/10000 (10%)


Test set: Average loss: nan, Accuracy: 980/10000 (10%)


Test set: Average loss: nan, Accuracy: 980/10000 (10%)


Test set: Average loss: nan, Accuracy: 980/10000 (10%)

