# 훈련 데이터셋 생성기

모델을 훈련시키기 위한 데이터셋을 생성한다. 데이터셋을 생성하기 위해, 원본 데이터 전처리를 레벨별로 수행한다.

Raw : 원본 데이터  
Level 0 : 패치 이미지를 추출하기 위해 원본 데이터를 가공함  
Level 1 : 패치 이미지를 추출하여 폴더별로 정리함  
Level 2 : Upsampling 또는 downsampling 수행  
Dataset : 모델 입력 데이터 (IMAGE와 LABEL로 구성)

In [1]:
import cv2
import numpy as np
import sys
import os
import random

Level 경로와 데이터셋(IMAGE, LABEL) 출력 경로를 설정합니다.

In [2]:
# Level 1 directories: source patch images, read by the main cell and passed
# to RotatePatchImage() as input.
LEVEL1_TRUE_DATA_DIR = './warehouse/level1/train/true/'
LEVEL1_FALSE_DATA_DIR = './warehouse/level1/train/false/'

# Level 2 directories: RotatePatchImage() writes the rotated patches here.
LEVEL2_TRUE_DATA_DIR = './warehouse/level2/train/true/'
LEVEL2_FALSE_DATA_DIR = './warehouse/level2/train/false/'

# Output files written by GenerateDataset() (grayscale pixel bytes / labels).
# NOTE(review): 447648 equals the Level 2 sample count printed by the last
# cell (93120 true + 354528 false); the 64x64 patch size in the name is not
# enforced anywhere in this notebook - confirm against the Level 1 data.
DATASET_IMAGE_PATH = './dataset/train_image_64x64_gray_447648.bin'
DATASET_LABEL_PATH = './dataset/train_label_64x64_gray_447648.bin'

환경설정된 경로가 없다면 폴더를 생성한다. (참고: 아래 셀에는 폴더 생성 코드가 없으며, 아직 완성되지 않은 PatchGenerator 클래스 초안만 정의되어 있다.)

In [None]:
class PatchGenerator():
    """Store patch/batch/cache dimensions and derive element counts from them.

    All sizes returned by this class are element counts obtained by
    multiplying the constructor arguments; no byte width is applied.
    """

    def __init__(self, num_patch_channel, num_patch_row, num_patch_col, num_sample_per_batch, num_batch_per_cache):
        """Remember the dimensions used by the size helpers.

        Args:
            num_patch_channel: number of channels in one patch.
            num_patch_row: number of rows in one patch.
            num_patch_col: number of columns in one patch.
            num_sample_per_batch: number of samples in one batch.
            num_batch_per_cache: number of batches in one cache.
        """
        self.num_patch_channel = num_patch_channel  # channels per patch
        self.num_patch_row = num_patch_row  # rows per patch
        self.num_patch_col = num_patch_col  # columns per patch
        self.num_sample_per_batch = num_sample_per_batch  # samples per batch
        self.num_batch_per_cache = num_batch_per_cache  # batches per cache

    def upsampling_rotate(self, angle):
        """Placeholder for rotation-based upsampling.

        The original method had no body (a syntax error); it is kept as an
        explicit stub so the class can be defined and the gap is visible.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError('upsampling_rotate is not implemented yet')

    def sample_size(self):
        """Return the element count of one patch (channels * rows * cols)."""
        return self.num_patch_channel * self.num_patch_row * self.num_patch_col

    def batch_size(self):
        """Return the element count of one batch (samples * sample_size)."""
        return self.num_sample_per_batch * self.sample_size()

    def cache_size(self):
        """Return the element count of one cache (batches * batch_size)."""
        return self.num_batch_per_cache * self.batch_size()

In [3]:
def PrintProgress(prev_percent, curr_percent):
    """Write a compact progress trail to stdout.

    Nothing is written when the percentage has not changed since the last
    call. Otherwise a multiple of 10 is written as '<n>%', any other multiple
    of 5 as '.', and every other value is skipped.

    Args:
        prev_percent: percentage reported by the previous call (callers in
            this file pass -1 before the first call).
        curr_percent: current integer percentage.

    Returns:
        None.
    """
    # Compare the values themselves. The previous test compared them modulo 5,
    # which treated any jump by a multiple of 5 (0 -> 5, 5 -> 10, ...) as
    # "unchanged" and silenced the trail for short file lists.
    if prev_percent == curr_percent:
        return None

    if curr_percent % 10 == 0:
        sys.stdout.write(str(curr_percent) + '%')
        sys.stdout.flush()
    elif curr_percent % 5 == 0:
        sys.stdout.write('.')
        sys.stdout.flush()

GetLevelDataList() : Level 경로에서 파일 목록을 가지고 옵니다.

In [4]:
def GetLevelDataList(dir_path):
    """Collect the image files found under a directory tree.

    The tree is traversed with os.walk, so files in subdirectories are
    included. Only the extensions .bmp, .png, .jpg and .jpeg are accepted
    (case-sensitive match).

    Args:
        dir_path: root directory to scan.

    Returns:
        A list of file paths, each built from the directory the file was
        actually found in.
    """
    image_exts = ('.bmp', '.png', '.jpg', '.jpeg')
    level_data_list = []

    for path, _subdirs, files in os.walk(dir_path):
        for filename in files:
            if os.path.splitext(filename)[-1] in image_exts:
                # Join with the walked directory, not the root: the old
                # 'dir_path + filename' was wrong for nested files and for a
                # root given without a trailing separator.
                level_data_list.append(os.path.join(path, filename))

    return level_data_list

RotatePatchImage() : 업샘플링을 위해 패치이미지를 회전시킵니다.

In [5]:
def RotatePatchImage(src_file_list, dst_dir):
    """Write rotated copies of every source image into dst_dir.

    Each image is rotated about its centre in 15-degree steps from 0 to 345
    and saved as '<stem>_<angle><t|f>.png', where the marker is 't' when the
    source file name contains 't.png' and 'f' otherwise. Progress is reported
    through PrintProgress.

    Args:
        src_file_list: paths of the images to rotate.
        dst_dir: directory that receives the rotated images; it is created
            when it does not exist.

    Raises:
        IOError: if a source image cannot be read or a rotated image cannot
            be written.
    """
    # cv2.imwrite does not create directories, so make sure the target exists.
    if dst_dir and not os.path.isdir(dst_dir):
        os.makedirs(dst_dir)

    total_count = len(src_file_list)
    prev_percent = -1

    for curr_count, file_path in enumerate(src_file_list, 1):
        cv_img = cv2.imread(file_path)
        if cv_img is None:
            # imread signals failure by returning None instead of raising.
            raise IOError('cannot read image: ' + file_path)
        (h, w) = cv_img.shape[:2]

        filename = os.path.basename(file_path)
        # NOTE(review): only '.png' sources can be marked 't' here, although
        # GetLevelDataList also returns .bmp/.jpg/.jpeg files - confirm.
        check = 't' if 't.png' in filename else 'f'
        stem = os.path.splitext(filename)[0]

        for angle in range(0, 360, 15):
            M = cv2.getRotationMatrix2D((w/2, h/2), angle, scale=1.0)
            rotated = cv2.warpAffine(cv_img, M, (w, h))
            dst_file_path = os.path.join(dst_dir, stem + '_' + str(angle) + check + '.png')
            if not cv2.imwrite(dst_file_path, rotated):
                raise IOError('cannot write image: ' + dst_file_path)

        # Count the file as done before computing the percentage so the trail
        # can reach 100%, as GenerateDataset does.
        curr_percent = curr_count * 100 // total_count
        PrintProgress(prev_percent, curr_percent)
        prev_percent = curr_percent

    print('')

ShufflePatchImage() : 패치 이미지를 합치고 섞습니다.

In [6]:
def ShufflePatchImage(true_data_list, false_data_list):
    """Merge the two file lists into one randomly ordered list.

    Both input lists are shuffled in place first (the caller's lists are
    modified), then concatenated true-first and shuffled once more.

    Args:
        true_data_list: list of paths for the positive class.
        false_data_list: list of paths for the negative class.

    Returns:
        A new list holding every element of both inputs in random order.
    """
    for bucket in (true_data_list, false_data_list):
        random.shuffle(bucket)

    merged = true_data_list + false_data_list
    random.shuffle(merged)
    return merged

GenerateDataset() : 데이터셋을 생성합니다.

In [7]:
def GenerateDataset(data_list, out_image_file_path, out_label_file_path):
    """Serialize images and their labels into two binary files.

    Every image is read with OpenCV, converted to grayscale and appended to
    the image file as raw bytes. One label byte per image is appended to the
    label file: b'1' when the file name stem ends with 't' (the marker that
    RotatePatchImage puts before '.png'), b'0' otherwise. Progress is
    reported through PrintProgress.

    Args:
        data_list: paths of the images to serialize, in output order.
        out_image_file_path: destination of the concatenated pixel bytes.
        out_label_file_path: destination of the label bytes.

    Raises:
        IOError: if an image cannot be read.
    """
    # open() does not create missing parent directories.
    for out_path in (out_label_file_path, out_image_file_path):
        out_dir = os.path.dirname(out_path)
        if out_dir and not os.path.isdir(out_dir):
            os.makedirs(out_dir)

    total_count = len(data_list)
    prev_percent = -1

    # 'with' closes both files even when an image fails part-way through.
    with open(out_label_file_path, 'wb') as fp_label, open(out_image_file_path, 'wb') as fp_image:
        for curr_count, data_item in enumerate(data_list, 1):
            cv_img = cv2.imread(data_item)
            if cv_img is None:
                # imread signals failure by returning None instead of raising.
                raise IOError('cannot read image: ' + data_item)
            cv_gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)

            # Look only at the file name stem: searching the whole path for
            # "t." also matched directory names and dots inside the stem.
            stem = os.path.splitext(os.path.basename(data_item))[0]
            # Bytes literal: the file is opened in binary mode, and writing a
            # text string to it fails on Python 3.
            label = b'1' if stem.endswith('t') else b'0'

            fp_label.write(label)
            fp_image.write(cv_gray.tobytes())

            curr_percent = curr_count * 100 // total_count
            PrintProgress(prev_percent, curr_percent)
            prev_percent = curr_percent

    print('')

In [8]:
if __name__ == '__main__':

    # Inert sketch carried over from the original cell (it was a bare string
    # and was never executed):
    #
    #     PatchGenerator pg
    #
    #     pg.init(positive_dir_path, negative_dir_path)
    #     pg.upsampling_rotate(val)
    #     pg.upsampling_noise(...)
    #     pg.upsampling_shift(...)
    #     pg.upsampling_zoom(...)
    #     pg.upsampling_horizontal_flip(...)
    #     pg.upsampling_vertical_flip(...)
    #     pg.generate_dataset(patch_file_path, label_file_path)

    # Step 1: list the Level 1 files.
    level1_true_data_list = GetLevelDataList(LEVEL1_TRUE_DATA_DIR)
    level1_false_data_list = GetLevelDataList(LEVEL1_FALSE_DATA_DIR)

    for caption, listing in (('level1 true data count :\t', level1_true_data_list),
                             ('level1 false data count :\t', level1_false_data_list)):
        print(caption + str(len(listing)))

    # Step 2: produce the Level 2 files from the Level 1 files
    # (source file list in, Level 2 directory out).
    for banner, src_list, out_dir in (
            ('generate level2 true data...', level1_true_data_list, LEVEL2_TRUE_DATA_DIR),
            ('generate level2 false data...', level1_false_data_list, LEVEL2_FALSE_DATA_DIR)):
        print(banner)
        RotatePatchImage(src_list, out_dir)

    # Step 3: list the Level 2 files.
    level2_true_data_list = GetLevelDataList(LEVEL2_TRUE_DATA_DIR)
    level2_false_data_list = GetLevelDataList(LEVEL2_FALSE_DATA_DIR)

    for caption, listing in (('level2 true data count :\t', level2_true_data_list),
                             ('level2 false data count :\t', level2_false_data_list)):
        print(caption + str(len(listing)))

    # Step 4: merge, shuffle and serialize the Level 2 files.
    print('merge and shuffle level 2 patch image...')
    merge_shuffle_level2_data_list = ShufflePatchImage(level2_true_data_list, level2_false_data_list)

    print('generate dataset...')
    GenerateDataset(merge_shuffle_level2_data_list, DATASET_IMAGE_PATH, DATASET_LABEL_PATH)

    for caption, target in (('dataset pacth image : \t', DATASET_IMAGE_PATH),
                            ('dataset label : \t', DATASET_LABEL_PATH)):
        print(caption + target)

level2 true data count :	93120
level2 false data count :	354528
merge and shuffle level 2 patch image...
generate dataset...
0%.10%.20%.30%.40%.50%.60%.70%.80%.90%.100%
dataset pacth image : 	./dataset/train_image_64x64_gray_447648.bin_2
dataset label : 	./dataset/train_label_64x64_gray_447648.bin_2
