In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
import time, datetime
from tqdm import tqdm
import torch
from torch import nn
from torch.nn import functional as F
from torchsummary import summary

device(gpu)를 정해서 사용해야함. multi-gpu를 사용할 수도 있지만, 각 연구원별로 1개로 할당되어있기 때문에, single gpu가 default

In [2]:
!nvidia-smi

Fri Jan 22 17:02:15 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 430.26       Driver Version: 430.26       CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 208...  Off  | 00000000:1D:00.0 Off |                  N/A |
| 27%   29C    P0    52W / 250W |      0MiB / 11019MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  Off  | 00000000:1E:00.0 Off |                  N/A |
| 27%   30C    P0    58W / 250W |      0MiB / 11019MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  GeForce RTX 208...  Off  | 00000000:1F:00.0 Off |                  N/A |
| 26%   

In [3]:
gpu_idx = 3

if torch.cuda.is_available() and type(gpu_idx) == int:
    device = torch.device("cuda:{}".format(gpu_idx))
    current_device = torch.cuda.current_device()
    print("Device: {} ({})\n".format(torch.cuda.get_device_name(current_device), device))
else:
    device = torch.device('cpu')
    print("Device: CPU\n")

Device: GeForce RTX 2080 Ti (cuda:3)



In [4]:
!nvidia-smi

Fri Jan 22 17:02:22 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 430.26       Driver Version: 430.26       CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 208...  Off  | 00000000:1D:00.0 Off |                  N/A |
| 27%   29C    P0    51W / 250W |     10MiB / 11019MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  Off  | 00000000:1E:00.0 Off |                  N/A |
| 27%   30C    P0    58W / 250W |     10MiB / 11019MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  GeForce RTX 208...  Off  | 00000000:1F:00.0 Off |                  N/A |
| 27%   

# 1. Load data

pkl로 저장된 data_config를 가져와서 훈련 목적에 따라 데이터를 load

파일이름을 batch로 받아서 batch단위에서 이미지를 load해서 device를 입혀야 최대한 neural network 훈련에 gpu를 사용할 수 있음

In [5]:
import pickle
import cv2
from itertools import product

def Image_norm(img, mean = [0.485, 0.456, 0.406], std = [0.229, 0.224, 0.225]):
    """
    img: np.array (h, w, ch) or (n, h, w, ch) ~ [0, 255] integer
    mean and std vaules were used for normalizing input data to ImageNet models in torch 
    """
    return (img/255-mean)/std

def reshape4torch(img, norm = False):
    """
    (sample #, height, width, channel) -> (sample #, channel, height, width)
    """
    if norm == True:
        img = Image_norm(img)
        
    if len(img.shape) == 4:
        img = np.transpose(img, (0, 3, 1, 2))
        return img
    elif len(img.shape) == 3:
        img = np.transpose(img, (2, 0, 1))
        return np.expand_dims(img, axis=0)

def load_image_from_path(path_list, normalization = False, extract_name = False):
    data = []
    for i in path_list:
        temp = cv2.imread(i)

        data.append(temp)
    if extract_name != False:
        name = []
        for i in path_list:
            name.append(os.path.basename(i))
        return reshape4torch(np.asarray(data), norm = normalization), np.asarray(name)
    else:
        return reshape4torch(np.asarray(data), norm = normalization)
    
def extract_aug_suffix(frb_switch = [1, 1, 1], sv_switch = True, mode = 'load'):
    """
    frb_switch = [1, 1, 1], [0, 0 ,1], [1, 1, 0].... 
    that means [flip, rotate, blur_sharp]
    """
    phase0 = ['_c']
    phase1 = {1: ['-', 'f'], 0: ['-']}
    phase2 = {1: ['-', 'r1', 'r2', 'r3'], 0: ['-']}
    phase3 = {1: ['-', 'ab', 'mb', 'eh'], 0: ['-']}
    phase4 = ['s_-30_v_30', 's_-30_v_-30', 's_30_v_-30', 's_30_v_30']

    if mode == 'load':
        phase_a_items = [phase1[frb_switch[0]], phase2[frb_switch[1]], phase3[frb_switch[2]]]
    elif mode == 'preprocessing':
        phase_a_items = [phase0, phase1[frb_switch[0]], phase2[frb_switch[1]], phase3[frb_switch[2]]]

    phase_a = []
    for i in list(product(*phase_a_items)):
        phase_a.append('_'.join(i))

    if not sv_switch == False:
        phase_b = []
        for i in list(product(*[phase_a, phase4])):
            phase_b.append('_'.join(i))
        return list(np.hstack([phase_a, phase_b]))
    else:
        return phase_a 

def train_data_load(data_config, aug_frb = [0, 0, 0], aug_sv = False):
    """
    data_config: ~~~.pkl
    """
    root = '/mnt/disk2/data/private_data/SMhospital/capsule/1 preprocessed'

    data_dir = root + '/database'

    with open(root + '/{}'.format(data_config), "rb") as f:
        data_config = pickle.load(f)

    train_aug_files, valid_files = data_config['train_aug_files'], data_config['valid_files']
    
#     train_aug_files = remove_annotation_mark(train_aug_files)
#     valid_files = remove_annotation_mark(valid_files)
    
    valid_Xs = []
    for i, valid_file in enumerate(valid_files):
        valid_path = [os.path.join(data_dir, f) for f in valid_file]
        valid_Xs.append(load_image_from_path(valid_path))

    target_aug = extract_aug_suffix(aug_frb, aug_sv, mode = 'load')

    train_aug_paths = []
    for train_aug_file in train_aug_files:
        train_aug_paths.append([os.path.join(data_dir, f) for f in train_aug_file 
                                if (f.split('c_')[-1])[:-4] in target_aug])
        
    print('{:<7}| {:<30}| {:<25}| {:<15}'.format('class', 'total augmented training set', 'target training set (x{})'.format(len(target_aug)), 'validation set'))
    for i in range(len(train_aug_files)):
        print('{:<7}| {:<30}| {:<25}| {:<15}'.format(i, len(train_aug_files[i]), len(train_aug_paths[i]), len(valid_Xs[i])))
        
    print()

#     print('total augmented training set:', len(train_aug_files[0]), ',', len(train_aug_files[1]))
#     print('target augmented training set:', len(train_aug_paths[0]), ',', len(train_aug_paths[1]))
#     print('validation set:', len(valid_files[0]), ',', len(valid_files[1]))
    
    return train_aug_paths, valid_Xs

증강법을 선택하여 training set에 쓰일 파일 경로와 validation set에 쓰일 이미지를 불러올 수 있음

- validation set에 쓰이는 데이터는 증강법이 적용이 안되어 데이터 수가 적어 이미지로 불러온 상태로 작업해도 무방하나, training set의 경우, 증강법에 따라 그 수가 커지면 memory에 영향이 가기때문에 파일경로로 가지고 있다가 batch를 불러올 때, 이미지를 불러옴

In [6]:
 train_aug_paths, valid_Xs = train_data_load('data_config_np-hd_frb_sv.pkl', aug_frb = [0, 0, 0], aug_sv = False)

class  | total augmented training set  | target training set (x1) | validation set 
0      | 1094720                       | 6842                     | 1710           
1      | 400480                        | 2503                     | 626            



In [7]:
train_paths, n_cls = train_aug_paths, len(train_aug_paths)
class_size = []
valid_Ys = []
classifier_file = None

def gen_label(data, cls):
#     label = cls*np.ones([data.shape[0]])
    label = cls*np.ones([len(data)])
    return label

for i, train_path, valid_x in zip(range(n_cls), train_paths, valid_Xs):
                     
    valid_y = gen_label(valid_x, i)                                                                
    valid_Ys.append(valid_y)
    class_size.append(len(train_path))   
                                
valid_X = np.concatenate(valid_Xs)
valid_Y = torch.tensor(np.concatenate(valid_Ys), device = device).long()

valid_X.shape, valid_Y.shape

((2336, 3, 512, 512), torch.Size([2336]))

In [8]:
class_size

[6842, 2503]

valid_Y를 device에 할당시키면서 늘어나는 memory

In [9]:
!nvidia-smi

Fri Jan 22 17:02:37 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 430.26       Driver Version: 430.26       CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 208...  Off  | 00000000:1D:00.0 Off |                  N/A |
| 27%   28C    P8    13W / 250W |     10MiB / 11019MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  Off  | 00000000:1E:00.0 Off |                  N/A |
| 27%   29C    P8     8W / 250W |     10MiB / 11019MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  GeForce RTX 208...  Off  | 00000000:1F:00.0 Off |                  N/A |
| 27%   

# 2. Model

CNN_v1: VGGNet과 유사하지만, SBCE 이미지에 쓰이는 색상이 적고 구분하는 클래스가 적어 훨씬 더 shallow한 network구조로 baseline으로 잡고 있음

Transfer learning: 다른 ImageNet에서 훈련된 network들이 heavy해서 3, 512, 512의 이미지 (ImageNet 이미지의 4배 scale)를 훈련시키려면 single gpu 하나로 가능한 훈련배치가 급격히 줄어듬  

발전된 network layer와 module 등을 network 구조를 customizing하여 성능을 높일 수는 있으나, 연구적인 novelty는 크게 없음

In [10]:
class CNN_v1(nn.Module):
    def __init__(self, n_ch, n_cls):
        super().__init__()

        self.conv1_1 = nn.Conv2d(n_ch, 16, 3, 1, padding=1)
        self.conv1_2 = nn.Conv2d(16, 16, 3, 1, padding=1)
        self.maxp1 = nn.MaxPool2d(2, 2)
        
        self.conv2_1 = nn.Conv2d(16, 16, 3, 1, padding=1)
        self.conv2_2 = nn.Conv2d(16, 16, 3, 1, padding=1)
        self.maxp2 = nn.MaxPool2d(2, 2)
        
        self.conv3_1 = nn.Conv2d(16, 16, 3, 1, padding=1)
        self.conv3_2 = nn.Conv2d(16, 16, 3, 1, padding=1)
        self.maxp3 = nn.MaxPool2d(2, 2)
        
        self.conv4_1 = nn.Conv2d(16, 32, 3, 1, padding=1)
        self.conv4_2 = nn.Conv2d(32, 32, 3, 1, padding=1)
        self.maxp4 = nn.MaxPool2d(2, 2)

        self.conv5_1 = nn.Conv2d(32, 32, 3, 1, padding=1)
        self.conv5_2 = nn.Conv2d(32, 32, 3, 1, padding=1)
        self.maxp5 = nn.MaxPool2d(2, 2)
        
        self.conv6_1 = nn.Conv2d(32, 32, 3, 1, padding=1)
        self.conv6_2 = nn.Conv2d(32, 32, 3, 1, padding=1)
        self.maxp6 = nn.MaxPool2d(2, 2)
        
        self.conv7_1 = nn.Conv2d(32, 64, 3, 1, padding=1)
        self.conv7_2 = nn.Conv2d(64, 64, 3, 1, padding=1)
        self.maxp7 = nn.MaxPool2d(2, 2)
        
        self.dense1 = nn.Linear(4*4*64, 100)
        self.dropout1 = nn.Dropout(0.3)
        self.dense2 = nn.Linear(100, 50)
        self.dropout2 = nn.Dropout(0.3)
        self.dense3 = nn.Linear(50, n_cls)
        
    def forward(self, x):
        x = F.relu(self.conv1_1(x))
        x = F.relu(self.conv1_2(x))
        x = self.maxp1(x)
        x = F.relu(self.conv2_1(x))
        x = F.relu(self.conv2_2(x))
        x = self.maxp2(x)
        x = F.relu(self.conv3_1(x))
        x = F.relu(self.conv3_2(x))
        x = self.maxp3(x)
        x = F.relu(self.conv4_1(x))
        x = F.relu(self.conv4_2(x))
        x = self.maxp4(x)
        x = F.relu(self.conv5_1(x))
        x = F.relu(self.conv5_2(x))
        x = self.maxp5(x)
        x = F.relu(self.conv6_1(x))
        x = F.relu(self.conv6_2(x))
        x = self.maxp6(x)
        x = F.relu(self.conv7_1(x))
        x = F.relu(self.conv7_2(x))
        x = self.maxp7(x)
        # flatten
        x = x.view(-1, 4*4*64)
        feature = F.relu(self.dense1(x))
        x = self.dropout1(feature)
        x = F.relu(self.dense2(x))
        x = self.dropout2(x)
        x = self.dense3(x)
        x = F.softmax(x, dim = -1)
        return x

summary를 통해 input size에 따라 하나의 input에 대한 network 훈련에 드는 총 memory를 볼 수 있음

101.39 MB

11019 MiB = 11554.258944 MB 

이론상, 약 100개의 batch training이 최대임을 알 수 있음 (batch training 외에 gpu memory에 데이터가 많이 할당되어있으면 훈련에 쓰는 gpu가 적어짐)

In [11]:
lr  = 0.0001

input_shape = (3, 512, 512)
n_ch, input_h, input_w = input_shape
        
# if network == 'CNN_v1':
#     from ce_model.cnns import CNN_v1
network = CNN_v1(n_ch, n_cls)

model = network
# model = model.cuda()
model = model.to(device)
# if device == 'cuda':
#     net = torch.nn.DataParallel(net)
#     cudnn.benchmark = True
#         model.train()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = lr)
# if summary_show == True:
summary(model, (n_ch, input_h, input_w), device = device)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 16, 512, 512]             448
            Conv2d-2         [-1, 16, 512, 512]           2,320
         MaxPool2d-3         [-1, 16, 256, 256]               0
            Conv2d-4         [-1, 16, 256, 256]           2,320
            Conv2d-5         [-1, 16, 256, 256]           2,320
         MaxPool2d-6         [-1, 16, 128, 128]               0
            Conv2d-7         [-1, 16, 128, 128]           2,320
            Conv2d-8         [-1, 16, 128, 128]           2,320
         MaxPool2d-9           [-1, 16, 64, 64]               0
           Conv2d-10           [-1, 32, 64, 64]           4,640
           Conv2d-11           [-1, 32, 64, 64]           9,248
        MaxPool2d-12           [-1, 32, 32, 32]               0
           Conv2d-13           [-1, 32, 32, 32]           9,248
           Conv2d-14           [-1, 32,

(tensor(226004), tensor(226004))

network를 device에 할당시키면서 늘어나는 memory

In [12]:
!nvidia-smi

Fri Jan 22 17:02:38 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 430.26       Driver Version: 430.26       CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 208...  Off  | 00000000:1D:00.0 Off |                  N/A |
| 27%   28C    P8    13W / 250W |     10MiB / 11019MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  Off  | 00000000:1E:00.0 Off |                  N/A |
| 27%   29C    P8     9W / 250W |     10MiB / 11019MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  GeForce RTX 208...  Off  | 00000000:1F:00.0 Off |                  N/A |
| 27%   

동일한 network (CNN_v1)이면서 이미 훈련된 모델을 쓰고자 한다면 아래 코드로 사용할 수 있음

In [13]:
classifider_file = None

if classifier_file is not None:
# load the weights into generator
    print("loading classifier_weights from:", classifier_file, '\n')
#             self.model.load_state_dict(torch.load(classifier_file))
    model.load_state_dict(torch.load(classifier_file, map_location=lambda storage, loc: storage.cuda(gpu_idx)))

# 3. Training

Batch Training
- 'equal'은 batch training 단계에서 각 class마다 동일한 갯수의 데이터를 sub-sample하여 batch를 만드는 것
- 비교를 위해서 binary classification에 한하여 oversample, undersample이 가능하도록 했지만, 기본적으로 'eqaul'을 사용함
- 이는 황윤섭 학위논문에 class-equal batch sampling으로 소개되어 oversample, undersample과 비교하였음

batch training을 위한 함수들

In [14]:
norm = None 

def load_rand_batch(path, label = None, cls = None, batch_size = 50, mode = 'eqaul', norm = False):
    idx = np.random.choice(len(path), batch_size)
    if type(path) == list:
        path = np.asarray(path)
    batch_dir = path[idx]
    batch_x = []
    for i in batch_dir:
        img = cv2.imread(i) # BGR Channel
        batch_x.append(img)
    if mode == 'equal':
        batch_label = gen_label(batch_x, cls)
        return reshape4torch(np.asarray(batch_x), norm = norm), batch_label
    elif mode == 'mixed':
        return reshape4torch(np.asarray(batch_x), norm = norm), label[idx]

def rand_shuffle(x1, x2):
    """
    random shuffle of two paired data -> x, y = shuffle(x, y)
    but, available of one data -> x = shuffle(x, None)
    """
    idx = np.arange(len(x1))
    np.random.shuffle(idx)
    if type(x1) == type(x2):
        return x1[idx], x2[idx] 
    else:
        return x1[idx]

def torch_batch_load(train_paths, batch_size = 100, shuffle = False, mode = 'equal'):
    x, y = [], []

    if mode == 'equal':
        for i, X_i in enumerate(train_paths):
            x_i, y_i = load_rand_batch(path = X_i, cls = i, 
                                       batch_size = batch_size, mode = 'equal', norm = norm)
            x.append(x_i), y.append(y_i)
        x, y = np.concatenate(x), np.concatenate(y)


    elif mode == 'mixed':
        Y = []
        for i, X_i in enumerate(train_paths):
            Y_i = gen_label(X_i, i)
            Y.append(Y_i)
        X = np.concatenate(train_paths)
        Y = np.concatenate(Y)
        x, y = load_rand_batch(path = X, label = Y, 
                               batch_size = batch_size, mode = 'mixed', norm = norm)

    if shuffle != False:
        x, y = rand_shuffle(x, y)
    x, y = torch.tensor(x, device = device).float(), torch.tensor(y, device = device).long()
#         x, y = torch.tensor(x, device = 'cpu').float(), torch.tensor(y, device = 'cpu').long()
    return x, y

oversample, undersample을 위한 data sampling 함수 (binary classification 한정)

In [15]:
def data_sampling(train_paths, mode):
    if mode == 'undersample':
        train_paths_ = train_paths.copy()
        n_min = np.min([len(train_paths_[0]), len(train_paths_[1])])
        target_cls = np.argmax([len(train_paths_[0]), len(train_paths_[1])])
        target_path = np.asarray(train_paths_[target_cls])
        undersampled_paths = list(target_path[sorted(np.random.choice(len(target_path), n_min, replace=False))])
        train_paths_[target_cls] = undersampled_paths

    elif mode == 'oversample':
        train_paths_ = train_paths.copy()
        n_max = np.max([len(train_paths_[0]), len(train_paths_[1])])
        target_cls = np.argmin([len(train_paths_[0]), len(train_paths_[1])])
        target_path = np.asarray(train_paths_[target_cls])
        n_diff = int(n_max-len(target_path))
        if len(target_path) >= n_diff:
            oversampled_paths = list(target_path[sorted(np.random.choice(len(target_path), n_diff, replace=False))])
        elif len(target_path) < n_diff:
            oversampled_paths = list(target_path[sorted(np.random.choice(len(target_path), n_diff, replace=True))])
        train_paths_[target_cls] += oversampled_paths

    return train_paths_

훈련 중 overfitting을 방지하기 위한 validation 함수들

In [16]:
def batch_idxs(dataset, batch_size = 32, shuffle = False):

    idxs = np.arange(len(dataset))
    total_size = len(idxs)
    if shuffle:
        np.random.shuffle(idxs)
    start = 0
    b_idxs = []
    while True:
        if total_size > start + batch_size: 
            b_idxs.append(list(idxs[start:start+batch_size]))  
            start += batch_size
        elif total_size <= start + batch_size: 
            b_idxs.append(list(idxs[start:]))
            break 
    return b_idxs

def validation(X, Y, batch_size = 32):
    b_idxs = batch_idxs(X, batch_size)
    output = []
    for b_idx in b_idxs:
        x = torch.tensor(X[b_idx, :, :, :], device = device).float() 
#             x = X[batch, :, :, :] 
        o = model(x)
        output.append(o)
    output = torch.cat(output)
    loss = criterion(output, Y)
    _, pred = torch.max(output, 1)
    return loss, pred  

훈련의 경과를 보고, 훈련시간을 기록하기 위한 함수들

In [17]:
def progress_bar(iteration, total, prefix = '', suffix = '', decimals = 1, barLength = 100):
    formatStr = "{0:." + str(decimals) + "f}"
    percent = formatStr.format(100 * (iteration / float(total)))
    filledLength = int(round(barLength * iteration / float(total)))
    bar = '#' * filledLength + '-' * (barLength - filledLength)
    sys.stdout.write('\r{} |{} | {}{} {}'.format(prefix, bar, percent, '%', suffix)),
    if iteration == total:
        sys.stdout.write('\n')
    sys.stdout.flush()

def plot_history(model_name, save_dir = 'training_history'):
        
    fig = plt.figure(figsize = (20, 8))

    # x_axis = range(1, 10*len(accr_hist)+1, 10)
#         x_axis = np.arange(10, 10*len(accr_hist)+1, 10)
    x_axis = np.arange(1, epoch_i + 1)

#         print(x_axis, accr_hist, loss_hist, val_accr_hist, val_loss_hist)

    plt.subplot(1, 2, 1)
    plt.plot(x_axis, accr_hist, 'b-', label = 'Training Accuracy')
    plt.plot(x_axis, val_accr_hist, 'r-', label = 'Validation Accuracy')
    plt.xlabel('Epoch', fontsize = 15)
    plt.ylabel('Accuracy', fontsize = 15)
    plt.legend(fontsize = 10)
    plt.grid(True)
#         plt.grid('on')
    plt.subplot(1, 2, 2)
    plt.plot(x_axis, loss_hist, 'b-', label = 'Training Loss')
    plt.plot(x_axis, val_loss_hist, 'r-', label = 'Validation Loss')
    plt.xlabel('Epoch', fontsize = 15)
    plt.ylabel('Loss', fontsize = 15)
    # plt.yticks(np.arange(0, 0.25, step=0.025))
    plt.legend(fontsize = 12)
#         plt.grid('on')
    plt.grid(True)
#         plt.show()

    save_path = './training_history'
    os.makedirs(save_path, exist_ok = True)
#         model_name = '_'.join(model_full_name.split('_')[0:3])

    hyper_params = '{}_{}'.format(lr, n_batch)
    model_name = '{}_{}'.format(model_name, hyper_params)

#         print(model_name)

    fig.savefig(save_path + '/{}_training_plot.png'.format(model_name), bbox_inches='tight')
    plt.close(fig)

    np.save(save_path + '/{}_training_log'.format(model_name), [loss_hist, accr_hist, val_loss_hist, val_accr_hist])
    
def sec_to_m_s_ms(sec):
    """
    sec: time.time() - start_time
    output: ex) 00:47.421
    """
    min_sec = time.strftime("%M:%S", time.gmtime(sec))
    ms = '{:03d}'.format(int((sec - int(sec))*1000))   
    return '.'.join([min_sec, ms])


sampling mode와 batch mode에 따른 1 epoch당 iteration 수 계산

equal에 경우, n_batch는 각 class마다의 batch 사이즈를 의미함 <br>
즉, 32개의 mini batch를 class마다 가져와서 64개의 batch를 input으로 훈련하게 됨

In [18]:
sampling_mode = None
batch_mode = 'equal'
n_batch = 32

if sampling_mode == 'oversample':
    if batch_mode == 'equal':
        max_iter = np.max(class_size) // n_batch + 1
    elif batch_mode == 'mixed':
        max_iter = np.max(class_size) // int(n_batch/n_cls) + 1
elif sampling_mode == 'undersample':
    if batch_mode == 'equal':
        max_iter = np.min(class_size) // n_batch + 1
    elif batch_mode == 'mixed':
        max_iter = np.min(class_size) // int(n_batch/n_cls) + 1
elif sampling_mode == None:
    if batch_mode == 'equal':
        max_iter = np.max(class_size) // n_batch + 1
    elif batch_mode == 'mixed':
        max_iter = np.sum(class_size) // n_batch + 1
        
max_iter

214

Training history
- Epoch가 끝날때마다 accuracy와 loss의 history를 plot하여 저장해서 해당 폴더에 들어가서 훈련 경과를 볼 수 있음 

Training progress verbose
- 1: Epoch마다 iteration 경과가 표시되고, training loss & accuracy 및 validation loss & accuracy가 표시됨
- 2: Epoch마다 %로  진행경과만 나타남
- 3: 진행경과와 epoch마다 걸리는 시간이 나타남 (tqdm)

Model Saving
- model_name은 저장되는 모델 파일명의 prefix가 되며 data_config의 이름 형식과 유사하게 작성하는 것을 권장함
- 본인이 저장한 모델을 개발 목적에 따라 한 눈에 알아보고 추후에 혼동되는 것을 방지하기 위함 
- validation loss가 가장 작은 값을 가질 때마다 모델을 저장하며 파일이름에 성능이 들어가 한 눈에 성능을 추정할 수 있도록 함

learning scheduler나 early stopping을 사용할 수 있으나, 권장하지는 않음

In [19]:
n_epoch = 50
verbose = 1
model_name = 'np-hd_---_--'
n_patient = None

if verbose == 3:
    pbar = tqdm(total=n_epoch, unit='epoch', bar_format='{l_bar}{bar:40}{r_bar}')

loss_hist, accr_hist = [], []
val_loss_hist, val_accr_hist = [], []

iter_i = 0
epoch_i = 0
patient_i = 0

save_path = './model'
os.makedirs(save_path, exist_ok = True)  

print('Iteration {} for 1 epoch\n'.format(max_iter))

start_time = time.time()

while True:

    if iter_i == 0:
        train_loss = 0
        train_correct = 0

        if sampling_mode:
            sampled_train_paths = data_sampling(train_paths, mode = sampling_mode)

    if sampling_mode:
        train_x, train_y = torch_batch_load(sampled_train_paths, n_batch, mode = batch_mode, shuffle = True)
    else:
        train_x, train_y = torch_batch_load(train_paths, n_batch, mode = batch_mode, shuffle = True)

    output = model(train_x)
    loss = criterion(output, train_y)

    _, pred = torch.max(output, 1)

    train_loss += loss.item()
    train_correct += torch.mean((pred == train_y.detach()).float()).item()

#             train_loss.append(loss.item())
#             train_correct.append(torch.mean((pred == train_y.detach()).float()))

    optimizer.zero_grad()
    loss.backward() 
    optimizer.step()

    iter_i += 1

    if verbose == 1:
        progress_bar(iter_i, max_iter, prefix = 'Epoch {:03d}'.format(epoch_i+1), 
                      suffix = '', barLength = 70)

    if iter_i % max_iter == 0:   

        epoch_i += 1
        patient_i += 1

        loss_hist.append(train_loss / iter_i)
        accr_hist.append(train_correct / iter_i)

#                 model.eval()
        with torch.no_grad():

            valid_loss, valid_pred = validation(valid_X, valid_Y, batch_size = 8)

            val_loss_hist.append(valid_loss.item())
            val_accr_hist.append((torch.mean((valid_pred == valid_Y.detach()).float()).item()))


        if (val_accr_hist[-1] == np.max(val_accr_hist)): 

            patient_i = 0

            now = datetime.datetime.now()
            nowDatetime = now.strftime('%y%m%d%H%M')
            hyper_params = '{}_{}'.format(lr, n_batch)
            tr_spec = 't_accr_{:.4f}_t_loss_{:.6f}'.format(accr_hist[-1], loss_hist[-1])
            vl_spec = 'v_accr_{:.4f}_v_loss_{:.6f}'.format(val_accr_hist[-1], val_loss_hist[-1])
            model_full_name = '{}_{}_{}_{:03d}_{}_{}.pt'.format(model_name, 
                                                                hyper_params, nowDatetime, epoch_i, tr_spec, vl_spec)
            torch.save(model.state_dict(), save_path + '/' + model_full_name)

        if verbose == 1:
            train_prt = 'train_loss: {:.5f}, train_accr: {:.3f}'.format(loss_hist[-1], accr_hist[-1])
            val_prt = 'val_loss: {:.5f}, val_accr: {:.3f}'.format(val_loss_hist[-1], val_accr_hist[-1])

            elapsed_time = time.time() - start_time
            print("{} | {} | {} elapsed".format(train_prt, val_prt, sec_to_m_s_ms(elapsed_time)))

        if verbose == 2:
            progress_bar(epoch_i, n_epoch, 
                          prefix = 'Training Epoch', suffix = '({}/{})'.format(epoch_i, n_epoch), 
                          barLength = 70)
        if verbose == 3:
            pbar.update(1)

        if patient_i == n_patient:
            break

        plot_history(model_name) 
        iter_i = 0
        start_time = time.time()

    if epoch_i == n_epoch:
#             if epoch_i == 2:
#                 print(loss_hist, accr_hist)
#                 print(val_loss_hist, val_accr_hist)
        break

if verbose == 3:        
    pbar.close()

Iteration 214 for 1 epoch

Epoch 001 |###################################################################### | 100.0% 
train_loss: 0.67213, train_accr: 0.580 | val_loss: 0.68873, val_accr: 0.708 | 01:18.098 elapsed
Epoch 002 |###################################################################### | 100.0% 
train_loss: 0.58013, train_accr: 0.731 | val_loss: 0.60095, val_accr: 0.711 | 01:17.763 elapsed
Epoch 003 |###################################################################### | 100.0% 
train_loss: 0.54944, train_accr: 0.757 | val_loss: 0.50977, val_accr: 0.797 | 01:17.927 elapsed
Epoch 004 |###################################################################### | 100.0% 
train_loss: 0.51629, train_accr: 0.790 | val_loss: 0.49729, val_accr: 0.807 | 01:17.719 elapsed
Epoch 005 |###################################################################### | 100.0% 
train_loss: 0.48338, train_accr: 0.824 | val_loss: 0.45197, val_accr: 0.852 | 01:18.320 elapsed
Epoch 006 |######################

Model Selection
- 저장된 모델들 중에 training loss와 validation loss의 합이 가장 작은 model 선택

In [20]:
model_path = './model'

model_list = np.array([i for i in os.listdir(model_path) if model_name + '_' in i])
model_list

array(['np-hd_---_--_0.0001_32_2101221706_003_t_accr_0.7570_t_loss_0.549435_v_accr_0.7967_v_loss_0.509768.pt',
       'np-hd_---_--_0.0001_32_2101221707_004_t_accr_0.7899_t_loss_0.516286_v_accr_0.8065_v_loss_0.497294.pt',
       'np-hd_---_--_0.0001_32_2101221709_005_t_accr_0.8243_t_loss_0.483379_v_accr_0.8519_v_loss_0.451969.pt',
       'np-hd_---_--_0.0001_32_2101221711_007_t_accr_0.8396_t_loss_0.469670_v_accr_0.8519_v_loss_0.446602.pt',
       'np-hd_---_--_0.0001_32_2101221713_008_t_accr_0.8559_t_loss_0.452650_v_accr_0.8759_v_loss_0.432012.pt',
       'np-hd_---_--_0.0001_32_2101221715_010_t_accr_0.8611_t_loss_0.446560_v_accr_0.8818_v_loss_0.424417.pt',
       'np-hd_---_--_0.0001_32_2101221718_012_t_accr_0.8792_t_loss_0.429733_v_accr_0.8840_v_loss_0.420574.pt',
       'np-hd_---_--_0.0001_32_2101221720_014_t_accr_0.8792_t_loss_0.428873_v_accr_0.8908_v_loss_0.418895.pt',
       'np-hd_---_--_0.0001_32_2101221723_016_t_accr_0.8834_t_loss_0.425618_v_accr_0.8934_v_loss_0.416949.pt',
 

In [21]:
n_model = len(model_list)
n_model

20

In [22]:
t_loss, v_loss = np.zeros([n_model]), np.zeros([n_model])

for i, file in zip(range(n_model), model_list):
    t_loss[i] = file.split('t_loss')[-1].split('_')[1]
    v_loss[i] = file.split('v_loss')[-1].split('_')[1][:-3]
    
t_loss + v_loss

array([1.059203, 1.01358 , 0.935348, 0.916272, 0.884662, 0.870977,
       0.850307, 0.847768, 0.842567, 0.829732, 0.8183  , 0.798994,
       0.779825, 0.775278, 0.775286, 0.762089, 0.750552, 0.752359,
       1.360863, 1.181085])

In [23]:
best_idx = np.where((t_loss+v_loss) == np.min(t_loss+v_loss))[0]
(t_loss + v_loss)[best_idx]

array([0.750552])

In [24]:
delete_idx = np.setdiff1d(np.arange(n_model), best_idx)
for i in delete_idx:
    os.remove(model_path + '/' + model_list[i])
print('Best Model:', model_list[int(best_idx)])

Best Model: np-hd_---_--_0.0001_32_2101221757_042_t_accr_0.9350_t_loss_0.376242_v_accr_0.9366_v_loss_0.374310.pt
