# **[HW2] Training Neural Network**
1. Prerequisite
2. Activation
3. Optimizer
4. Regularization
5. FC vs Conv
6. Do it by yourself

이번 실습에서는 지난 시간에 배웠던 MLP의 component들을 하나씩 바꿔가며 activation, optimizer, regularization, convolution layer 등의 중요성을 익혀가는 시간을 갖도록 하겠습니다.  

# 1. Prerequisite

본격적인 실습을 진행하기 이전, 지난 [HW1.2 Logistic Regression vs MLP]에서 진행했던 것과 동일하게 \\
MNIST dataset에 대해서 DataLoader와 Trainer class를 생성해두겠습니다.



## Import packages

런타임의 유형을 변경해줍니다.

상단 메뉴에서 [런타임]->[런타임유형변경]->[하드웨어가속기]->[GPU]

변경 이후 아래의 cell을 실행시켰을 때, torch.cuda.is_available()이 True가 나와야 합니다.



In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torch.optim as optim
from torch.utils import data
print(torch.__version__)
print(torch.cuda.is_available())

1.10.0+cu111
True


In [2]:
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

# Print NumPy arrays with 3 decimal places and without scientific notation.
np.set_printoptions(precision=3, suppress=True)

## Load Dataset

In [3]:
# Fetch the 'mnist_784' dataset from OpenML; cache=False skips the local cache.
mnist = fetch_openml('mnist_784', cache=False)

# Features as a float32 array, divided in place by 255
# (presumably maps 8-bit pixel intensities to [0, 1] -- confirm).
X = mnist.data.astype('float32').values
X /= 255.0

# Labels as an int64 array.
y = mnist.target.astype('int64').values

for loaded in (X, y):
    print(loaded.shape)

(70000, 784)
(70000,)


## Split Dataset

학습과 평가를 위한 dataset으로 나눕니다.

In [4]:
# Hold out 20% of the samples for evaluation; the remaining 80% is for training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Show the shape of every split (train features/labels, then test features/labels).
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(56000, 784)
(56000,)
(14000, 784)
(14000,)


## Pytorch Dataset 

In [5]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs each feature row with its label.

    ``X`` and ``y`` are stored as given; item ``i`` is built from ``X[i]`` and
    ``y[i]``, so both must be indexable NumPy data aligned by position.
    """

    def __init__(self, X, y):
        super().__init__()
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of samples, taken from the length of ``X``."""
        return len(self.X)

    def __getitem__(self, index):
        """Return ``(features, label)`` as a float32 tensor and an int64 tensor."""
        features = torch.from_numpy(self.X[index]).float()
        # Wrap the label in np.array so a scalar label becomes a 0-d array
        # that torch.from_numpy can consume.
        label = torch.from_numpy(np.array(self.y[index])).long()
        return features, label

In [6]:
# Wrap both splits so they can be fed to a DataLoader.
train_dataset = CustomDataset(X_train, y_train)
test_dataset = CustomDataset(X_test, y_test)

# Report the sample count and the feature-matrix shape of each split.
for wrapped in (train_dataset, test_dataset):
    print(len(wrapped))
    print(wrapped.X.shape)

56000
(56000, 784)
14000
(14000, 784)


## DataLoader


In [7]:
batch_size = 64

# Training batches are reshuffled; evaluation batches keep the dataset order.
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

# Number of batches per loader: ceil(dataset size / batch_size), because the
# final partial batch is kept (e.g. 14000 / 64 -> 219 batches).
print(len(train_dataloader))
print(len(test_dataloader))

875
219


## Trainer


In [8]:
class Trainer():
    """Minimal supervised training / evaluation loop for a classifier.

    Bundles the data loaders, model, optimizer, and loss function so a model
    can be fitted with ``train()`` and scored with ``test()``. The model
    itself is not moved to ``device`` here; only the batches are.
    """

    def __init__(self, trainloader, testloader, model, optimizer, criterion, device):
        """
        trainloader: train data's loader
        testloader: test data's loader
        model: model to train
        optimizer: optimizer to update your model
        criterion: loss function
        device: device every input/label batch is moved to before the forward pass
        """
        self.trainloader = trainloader
        self.testloader = testloader
        self.model = model
        self.optimizer = optimizer
        self.criterion = criterion
        self.device = device

    def train(self, epoch = 1):
        """Train the model for ``epoch`` passes over ``trainloader``.

        After every pass, prints the epoch number and the mean of the
        per-batch losses for that pass.
        """
        self.model.train()
        for e in range(epoch):
            running_loss = 0.0
            for inputs, labels in self.trainloader:
                # Move the batch to the target device before feeding the model.
                inputs = inputs.to(self.device)
                labels = labels.to(self.device)
                # Gradients accumulate by default, so clear them every step.
                self.optimizer.zero_grad()
                # forward + backward + optimize
                outputs = self.model(inputs)
                loss = self.criterion(outputs, labels)
                loss.backward()
                self.optimizer.step()
                running_loss += loss.item()

            print('epoch: %d  loss: %.3f' % (e + 1, running_loss / len(self.trainloader)))

    def test(self):
        """Evaluate on ``testloader`` and print the classification accuracy.

        Accuracy is the number of correct top-1 predictions divided by
        ``len(testloader.dataset)``.
        """
        self.model.eval()
        correct = 0
        # Evaluation needs no gradients; disabling autograd avoids building the
        # graph for every batch (less memory, faster inference).
        with torch.no_grad():
            for inputs, labels in self.testloader:
                inputs = inputs.to(self.device)
                labels = labels.to(self.device)
                output = self.model(inputs)
                pred = output.max(1, keepdim=True)[1]  # index of the max logit
                correct += pred.eq(labels.view_as(pred)).sum().item()
        test_acc = correct / len(self.testloader.dataset)
        print('test_acc: %.3f' %(test_acc))

# 2. Activation Function

이번 section에서는 가장 대표적으로 사용되는 sigmoid function과 relu function을 사용해보고 비교해보도록 하겠습니다.

![](https://drive.google.com/uc?export=view&id=1xfJBd9v9L_RgXGf8urNrYpb40zXU6gea)


- input: 784
- hidden: 32 or (32, 32)
- output: 10
- **activation: sigmoid or relu**
- optimizer: sgd
- loss: cross-entropy

## 2-layer Network + Sigmoid

In [9]:
class MLP(nn.Module):
    """Two-layer perceptron: Linear -> sigmoid -> Linear.

    input_dim: size of each flattened input vector
    hidden_dim: width of the single hidden layer
    output_dim: number of output logits (one per class)
    """

    def __init__(self,
                 input_dim=784,
                 hidden_dim=32,
                 output_dim=10):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return raw logits for a batch ``x``; no softmax is applied here."""
        x = self.fc1(x)
        # torch.sigmoid replaces F.sigmoid, which is deprecated and emits a
        # warning on the PyTorch version this notebook runs on.
        x = torch.sigmoid(x)
        x = self.fc2(x)
        return x

# Use the GPU when one is available; otherwise fall back to the CPU so the
# cell still runs on a CPU-only runtime instead of raising.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = MLP()
criterion = nn.CrossEntropyLoss()
# Plain SGD (no momentum) with a fixed learning rate.
optimizer = optim.SGD(model.parameters(), lr=0.01)
# Kept as the last expression so the notebook echoes the module summary.
model.to(device)

MLP(
  (fc1): Linear(in_features=784, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=10, bias=True)
)

In [10]:
# Bundle the loaders, model, loss, and optimizer, then fit for 10 epochs.
trainer = Trainer(
    trainloader=train_dataloader,
    testloader=test_dataloader,
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
)
trainer.train(epoch=10)



epoch: 1  loss: 2.192
epoch: 2  loss: 1.837
epoch: 3  loss: 1.407
epoch: 4  loss: 1.090
epoch: 5  loss: 0.886
epoch: 6  loss: 0.754
epoch: 7  loss: 0.664
epoch: 8  loss: 0.599
epoch: 9  loss: 0.551
epoch: 10  loss: 0.514


In [11]:
# Print the test-set accuracy of the 2-layer sigmoid network.
trainer.test()

test_acc: 0.881




## 2-layer Network + ReLU

In [12]:
class MLP(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    input_dim: size of each flattened input vector
    hidden_dim: width of the single hidden layer
    output_dim: number of output logits (one per class)
    """

    def __init__(self, input_dim=784, hidden_dim=32, output_dim=10):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return raw logits for a batch ``x``; no softmax is applied here."""
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)

# Use the GPU when one is available; otherwise fall back to the CPU so the
# cell still runs on a CPU-only runtime instead of raising.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = MLP()
criterion = nn.CrossEntropyLoss()
# Plain SGD (no momentum) with a fixed learning rate.
optimizer = optim.SGD(model.parameters(), lr=0.01)
# Kept as the last expression so the notebook echoes the module summary.
model.to(device)

MLP(
  (fc1): Linear(in_features=784, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=10, bias=True)
)

In [13]:
# Bundle the loaders, model, loss, and optimizer, then fit for 10 epochs.
trainer = Trainer(
    trainloader=train_dataloader,
    testloader=test_dataloader,
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
)
trainer.train(epoch=10)

epoch: 1  loss: 1.406
epoch: 2  loss: 0.543
epoch: 3  loss: 0.413
epoch: 4  loss: 0.365
epoch: 5  loss: 0.338
epoch: 6  loss: 0.320
epoch: 7  loss: 0.306
epoch: 8  loss: 0.295
epoch: 9  loss: 0.285
epoch: 10  loss: 0.276


In [14]:
# Print the test-set accuracy of the 2-layer ReLU network.
trainer.test()

test_acc: 0.924


#### Q1. Activation Function에 따라 성능의 차이가 있나요? 있다면, 왜 차이가 발생했을까요?




## 3-layer Network + Sigmoid

In [15]:
class MLP(nn.Module):
    """Three-layer perceptron: Linear -> sigmoid -> Linear -> sigmoid -> Linear.

    input_dim: size of each flattened input vector
    hidden_dim: widths of the two hidden layers, indexed as [0] and [1]
    output_dim: number of output logits (one per class)
    """

    def __init__(self,
                 input_dim=784,
                 hidden_dim=(32,32),
                 output_dim=10):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim[0])
        self.fc2 = nn.Linear(hidden_dim[0], hidden_dim[1])
        self.fc3 = nn.Linear(hidden_dim[1], output_dim)

    def forward(self, x):
        """Return raw logits for a batch ``x``; no softmax is applied here."""
        # torch.sigmoid replaces F.sigmoid, which is deprecated and emits a
        # warning on the PyTorch version this notebook runs on.
        x = self.fc1(x)
        x = torch.sigmoid(x)
        x = self.fc2(x)
        x = torch.sigmoid(x)
        x = self.fc3(x)
        return x

# Use the GPU when one is available; otherwise fall back to the CPU so the
# cell still runs on a CPU-only runtime instead of raising.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = MLP()
criterion = nn.CrossEntropyLoss()
# Plain SGD (no momentum) with a fixed learning rate.
optimizer = optim.SGD(model.parameters(), lr=0.01)
# Kept as the last expression so the notebook echoes the module summary.
model.to(device)

MLP(
  (fc1): Linear(in_features=784, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=10, bias=True)
)

In [16]:
# Bundle the loaders, model, loss, and optimizer, then fit for 10 epochs.
trainer = Trainer(
    trainloader=train_dataloader,
    testloader=test_dataloader,
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
)
trainer.train(epoch=10)



epoch: 1  loss: 2.303
epoch: 2  loss: 2.295
epoch: 3  loss: 2.288
epoch: 4  loss: 2.278
epoch: 5  loss: 2.260
epoch: 6  loss: 2.227
epoch: 7  loss: 2.162
epoch: 8  loss: 2.050
epoch: 9  loss: 1.896
epoch: 10  loss: 1.713


In [17]:
# Print the test-set accuracy of the 3-layer sigmoid network.
trainer.test()

test_acc: 0.601




## 3-layer Network + ReLU

In [18]:
class MLP(nn.Module):
    """Three-layer perceptron: Linear -> ReLU -> Linear -> ReLU -> Linear.

    input_dim: size of each flattened input vector
    hidden_dim: widths of the two hidden layers, indexed as [0] and [1]
    output_dim: number of output logits (one per class)
    """

    def __init__(self, input_dim=784, hidden_dim=(32, 32), output_dim=10):
        super().__init__()
        first_width, second_width = hidden_dim[0], hidden_dim[1]
        self.fc1 = nn.Linear(input_dim, first_width)
        self.fc2 = nn.Linear(first_width, second_width)
        self.fc3 = nn.Linear(second_width, output_dim)

    def forward(self, x):
        """Return raw logits for a batch ``x``; no softmax is applied here."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

# Use the GPU when one is available; otherwise fall back to the CPU so the
# cell still runs on a CPU-only runtime instead of raising.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = MLP()
criterion = nn.CrossEntropyLoss()
# Plain SGD (no momentum) with a fixed learning rate.
optimizer = optim.SGD(model.parameters(), lr=0.01)
# Kept as the last expression so the notebook echoes the module summary.
model.to(device)

MLP(
  (fc1): Linear(in_features=784, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=10, bias=True)
)

In [19]:
# Bundle the loaders, model, loss, and optimizer, then fit for 10 epochs.
trainer = Trainer(
    trainloader=train_dataloader,
    testloader=test_dataloader,
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
)
trainer.train(epoch=10)

epoch: 1  loss: 1.685
epoch: 2  loss: 0.581
epoch: 3  loss: 0.422
epoch: 4  loss: 0.373
epoch: 5  loss: 0.345
epoch: 6  loss: 0.324
epoch: 7  loss: 0.305
epoch: 8  loss: 0.289
epoch: 9  loss: 0.274
epoch: 10  loss: 0.260


In [20]:
# Print the test-set accuracy of the 3-layer ReLU network.
trainer.test()

test_acc: 0.928


#### Q2. Activation function 별로 Layer 수를 늘리면 성능이 어떻게 변하나요? 양상이 다르게 나타난다면 왜 그럴까요?


#### Q3. Activation function이 존재하지 않는다면 어떤 일이 일어날까요?

# 3. Optimization

이번 section에서는 sgd, momentum, Adam 등의 optimizer를 사용해보고 성능을 비교해보도록 하겠습니다.

![](https://drive.google.com/uc?export=view&id=1xfCTx8xj4zoaombrK2bSN9nv0Z3r95jp)


- input: 784
- hidden: (32, 32)
- output: 10
- activation: relu
- **optimizer: sgd or momentum or adam**
- loss: cross-entropy

In [21]:
class MLP(nn.Module):
    """Three-layer ReLU perceptron shared by the optimizer comparison cells.

    input_dim: size of each flattened input vector
    hidden_dim: widths of the two hidden layers, indexed as [0] and [1]
    output_dim: number of output logits (one per class)
    """

    def __init__(self, input_dim=784, hidden_dim=(32, 32), output_dim=10):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim[0])
        self.fc2 = nn.Linear(hidden_dim[0], hidden_dim[1])
        self.fc3 = nn.Linear(hidden_dim[1], output_dim)

    def forward(self, x):
        """Return raw logits for a batch ``x``; no softmax is applied here."""
        # Both hidden layers share the same Linear -> ReLU pattern.
        for hidden_layer in (self.fc1, self.fc2):
            x = F.relu(hidden_layer(x))
        return self.fc3(x)

## 3-layer Network + ReLU + SGD

In [22]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# cell still runs on a CPU-only runtime instead of raising.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = MLP()
# Baseline optimizer: plain SGD (no momentum) with a fixed learning rate.
optimizer = optim.SGD(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()
# Kept as the last expression so the notebook echoes the module summary.
model.to(device)

MLP(
  (fc1): Linear(in_features=784, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=10, bias=True)
)

In [23]:
# Bundle the loaders, model, loss, and optimizer, then fit for 10 epochs.
trainer = Trainer(
    trainloader=train_dataloader,
    testloader=test_dataloader,
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
)
trainer.train(epoch=10)

epoch: 1  loss: 1.927
epoch: 2  loss: 0.727
epoch: 3  loss: 0.467
epoch: 4  loss: 0.381
epoch: 5  loss: 0.342
epoch: 6  loss: 0.315
epoch: 7  loss: 0.297
epoch: 8  loss: 0.282
epoch: 9  loss: 0.268
epoch: 10  loss: 0.258


In [24]:
# Print the test-set accuracy of the network trained with plain SGD.
trainer.test()

test_acc: 0.928


## 3-layer Network + ReLU + Momentum





In [25]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# cell still runs on a CPU-only runtime instead of raising.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = MLP()
# Same learning rate as the plain-SGD run, with a momentum term added.
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.99)
criterion = nn.CrossEntropyLoss()
# Kept as the last expression so the notebook echoes the module summary.
model.to(device)

MLP(
  (fc1): Linear(in_features=784, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=10, bias=True)
)

In [26]:
# Bundle the loaders, model, loss, and optimizer, then fit for 10 epochs.
trainer = Trainer(
    trainloader=train_dataloader,
    testloader=test_dataloader,
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
)
trainer.train(epoch=10)

epoch: 1  loss: 0.561
epoch: 2  loss: 0.239
epoch: 3  loss: 0.188
epoch: 4  loss: 0.182
epoch: 5  loss: 0.160
epoch: 6  loss: 0.145
epoch: 7  loss: 0.142
epoch: 8  loss: 0.150
epoch: 9  loss: 0.127
epoch: 10  loss: 0.124


In [27]:
# Print the test-set accuracy of the network trained with SGD + momentum.
trainer.test()

test_acc: 0.956


## 3-layer Network + ReLU + Adam



In [28]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# cell still runs on a CPU-only runtime instead of raising.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = MLP()
# Adam with the same learning rate as the SGD runs.
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()
# Kept as the last expression so the notebook echoes the module summary.
model.to(device)

MLP(
  (fc1): Linear(in_features=784, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=10, bias=True)
)

In [29]:
# Bundle the loaders, model, loss, and optimizer, then fit for 10 epochs.
trainer = Trainer(
    trainloader=train_dataloader,
    testloader=test_dataloader,
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
)
trainer.train(epoch=10)

epoch: 1  loss: 0.300
epoch: 2  loss: 0.178
epoch: 3  loss: 0.150
epoch: 4  loss: 0.143
epoch: 5  loss: 0.131
epoch: 6  loss: 0.127
epoch: 7  loss: 0.126
epoch: 8  loss: 0.116
epoch: 9  loss: 0.119
epoch: 10  loss: 0.111


In [30]:
# Print the test-set accuracy of the network trained with Adam.
trainer.test()

test_acc: 0.959


#### Q4. Optimizer 별로 수렴 속도가 어떻게 다른가요? 
##### Q4.1 수렴 속도가 다르다면 sgd와 momentum의 차이는 왜 발생할까요? 
##### Q4.2 수렴 속도가 다르다면 momentum과 Adam의 차이는 왜 발생할까요?

# 4. Regularization

이번 section에서는 image data에서 주로 사용되는 batch-normalization을 어떻게 사용하는지를 확인해보겠습니다.

![](https://drive.google.com/uc?export=view&id=1xZSWZiSxuGZAsonghidhTSfUEYiuxRtN)

- input: 784
- hidden: 32 or (32, 32)
- output: 10
- activation: relu
- optimizer: adam
- **regularizer: batch_norm**
- loss: cross-entropy

## 3-layer Network + ReLU + Adam + batch_norm

In [31]:
class MLP(nn.Module):
    """Three-layer perceptron with batch normalization before each ReLU.

    Layout: Linear -> BatchNorm1d -> ReLU -> Linear -> BatchNorm1d -> ReLU -> Linear.

    input_dim: size of each flattened input vector
    hidden_dim: widths of the two hidden layers, indexed as [0] and [1]
    output_dim: number of output logits (one per class)
    """

    def __init__(self, input_dim=784, hidden_dim=(32, 32), output_dim=10):
        super().__init__()
        first_width, second_width = hidden_dim[0], hidden_dim[1]
        # Registration order (fc1, bn1, fc2, bn2, fc3) is kept so the module
        # summary and parameter order stay the same.
        self.fc1 = nn.Linear(input_dim, first_width)
        self.bn1 = nn.BatchNorm1d(first_width)
        self.fc2 = nn.Linear(first_width, second_width)
        self.bn2 = nn.BatchNorm1d(second_width)
        self.fc3 = nn.Linear(second_width, output_dim)

    def forward(self, x):
        """Return raw logits for a batch ``x``; no softmax is applied here."""
        hidden = F.relu(self.bn1(self.fc1(x)))
        hidden = F.relu(self.bn2(self.fc2(hidden)))
        return self.fc3(hidden)

In [32]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# cell still runs on a CPU-only runtime instead of raising.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = MLP()
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()
# Kept as the last expression so the notebook echoes the module summary.
model.to(device)

MLP(
  (fc1): Linear(in_features=784, out_features=32, bias=True)
  (bn1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=32, out_features=32, bias=True)
  (bn2): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=32, out_features=10, bias=True)
)

In [33]:
# Bundle the loaders, model, loss, and optimizer, then fit for 10 epochs.
trainer = Trainer(
    trainloader=train_dataloader,
    testloader=test_dataloader,
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
)
trainer.train(epoch=10)

epoch: 1  loss: 0.263
epoch: 2  loss: 0.150
epoch: 3  loss: 0.122
epoch: 4  loss: 0.106
epoch: 5  loss: 0.099
epoch: 6  loss: 0.088
epoch: 7  loss: 0.086
epoch: 8  loss: 0.077
epoch: 9  loss: 0.073
epoch: 10  loss: 0.073


In [34]:
# Print the test-set accuracy of the batch-norm network.
trainer.test()

test_acc: 0.972


In [35]:
def count_parameters(model):
    """Print and return the number of trainable parameters in ``model``.

    Only parameters with ``requires_grad=True`` are counted; each contributes
    its element count (``numel()``).
    """
    # Compute the total once and reuse it for both the print and the return.
    total = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(total)
    return total

# Count the trainable parameters of the current model (printed and echoed).
count_parameters(model)

26634


26634

#### Q5. Batch-normalization을 사용하기 전 후로 성능이 어떻게 변화했나요? 왜 이러한 변화가 일어났을까요?


# 5. Fully-Connected Layer vs Convolution Layer

지금까지 model의 다양한 node를 바꿔가며 mnist의 성능 변화를 확인해보는 실습을 진행해 보았습니다. \\
비록, fully-connected network가 mnist 데이터에서 높은 성능을 내는데는 문제가 없었지만, 모든 layer를 fully-connected layer로 만드는 것은 엄청난 파라미터와 연산량을 필요로 하기 때문에 더욱 큰 고화질의 이미지 데이터를 처리하는데는 적합하지 않습니다. \\ 

따라서, 이번 section에서는 이미지 데이터 처리에 주로 사용되는 convolution layer를 사용해보고 파라미터 수와 성능이 어떻게 변화하는지 확인해보도록 하겠습니다. 

## Convolution Operation

![](https://drive.google.com/uc?export=view&id=1xdjTf4ab0P8qfu_TaLJ4TZzt5sk3twS6)


### Q6. Input이 (H, W, C) 일 때, stride S의 2개의 (F * F) convolutional filter를 적용하면 output이 어떻게 되나요?

In [36]:
class Conv(nn.Module):
    """Small CNN: two strided 7x7 convolutions followed by a linear classifier.

    input_dim: accepted for signature parity with the MLP models; not used,
        because the input is always reshaped to 1x28x28 images
    output_dim: number of output logits (one per class)
    """

    def __init__(self, input_dim=784, output_dim=10):
        super().__init__()
        # Spatial size per side: 28 -> 11 -> 3 (kernel 7, stride 2, no padding).
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=8, kernel_size=7, stride=2)
        self.conv2 = nn.Conv2d(in_channels=8, out_channels=8, kernel_size=7, stride=2)
        self.fc = nn.Linear(3 * 3 * 8, output_dim)

    def forward(self, x):
        """Return raw logits for a batch of flattened 28x28 images."""
        # Restore the image layout (batch, channel, height, width).
        images = x.reshape(-1, 1, 28, 28)
        features = F.relu(self.conv1(images))
        features = F.relu(self.conv2(features))
        # Flatten the 8 feature maps of 3x3 for the linear classifier.
        flat = features.reshape(-1, 3 * 3 * 8)
        return self.fc(flat)

In [37]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# cell still runs on a CPU-only runtime instead of raising.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Conv()
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()
# Kept as the last expression so the notebook echoes the module summary.
model.to(device)

Conv(
  (conv1): Conv2d(1, 8, kernel_size=(7, 7), stride=(2, 2))
  (conv2): Conv2d(8, 8, kernel_size=(7, 7), stride=(2, 2))
  (fc): Linear(in_features=72, out_features=10, bias=True)
)

In [38]:
# Bundle the loaders, model, loss, and optimizer, then fit for 10 epochs.
trainer = Trainer(
    trainloader=train_dataloader,
    testloader=test_dataloader,
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
)
trainer.train(epoch=10)

epoch: 1  loss: 0.226
epoch: 2  loss: 0.103
epoch: 3  loss: 0.090
epoch: 4  loss: 0.085
epoch: 5  loss: 0.081
epoch: 6  loss: 0.081
epoch: 7  loss: 0.082
epoch: 8  loss: 0.077
epoch: 9  loss: 0.077
epoch: 10  loss: 0.071


In [39]:
# Print the test-set accuracy of the convolutional network.
trainer.test()

test_acc: 0.973


In [40]:
# Count the trainable parameters of the current model (printed and echoed).
count_parameters(model)

4274


4274

##### Q7. convolution operation은 image 데이터를 다루는데 있어서 fully-connected layer에 비해 어떤 점에서 효과적일까요?


# 6. Do It By Yourself

위에서 했던 실습들과 수업에서 배웠던 다양한 network component들을 참조해서 20,000개 이하의 파라미터로 98%의 accuracy를 달성해보세요!

In [52]:
class CustomModel(nn.Module):
    """Compact CNN: two 7x7 conv + batch-norm + ReLU stages, then a linear head.

    input_dim: accepted for signature parity with the other models; not used,
        because the input is always reshaped to 1x28x28 images
    output_dim: number of output logits (one per class)
    """

    def __init__(self, input_dim=784, output_dim=10):
        super().__init__()

        # [B, 1, 28, 28] => [B, 3, 22, 22]
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=3, kernel_size=7, stride=1)
        self.bn1 = nn.BatchNorm2d(3)

        # [B, 3, 22, 22] => [B, 7, 16, 16]
        self.conv2 = nn.Conv2d(in_channels=3, out_channels=7, kernel_size=7, stride=1)
        self.bn2 = nn.BatchNorm2d(7)

        # Classifier over the flattened 7 feature maps of 16x16.
        self.fc = nn.Linear(16 * 16 * 7, output_dim)

    def forward(self, x):
        """Return raw logits for a batch of flattened 28x28 images."""
        # Restore the image layout (batch, channel, height, width).
        images = x.reshape(-1, 1, 28, 28)

        features = F.relu(self.bn1(self.conv1(images)))
        features = F.relu(self.bn2(self.conv2(features)))

        flat = features.reshape(-1, 16 * 16 * 7)
        return self.fc(flat)

In [53]:
# Instantiate the custom model and check its trainable-parameter count
# against the 20,000-parameter budget of this exercise.
model = CustomModel()
count_parameters(model)

19136


19136

In [54]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# cell still runs on a CPU-only runtime instead of raising.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()
# Kept as the last expression so the notebook echoes the module summary.
model.to(device)

CustomModel(
  (conv1): Conv2d(1, 3, kernel_size=(7, 7), stride=(1, 1))
  (bn1): BatchNorm2d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(3, 7, kernel_size=(7, 7), stride=(1, 1))
  (bn2): BatchNorm2d(7, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc): Linear(in_features=1792, out_features=10, bias=True)
)

In [55]:
# Bundle the loaders, model, loss, and optimizer, then fit for 10 epochs.
trainer = Trainer(
    trainloader=train_dataloader,
    testloader=test_dataloader,
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
)
trainer.train(epoch=10)

epoch: 1  loss: 0.162
epoch: 2  loss: 0.070
epoch: 3  loss: 0.058
epoch: 4  loss: 0.053
epoch: 5  loss: 0.048
epoch: 6  loss: 0.041
epoch: 7  loss: 0.041
epoch: 8  loss: 0.036
epoch: 9  loss: 0.030
epoch: 10  loss: 0.027


In [56]:
# Print the test-set accuracy of the custom model.
trainer.test()

test_acc: 0.981
