## ETL Preprocessing

This script divides all files into the following sets:
- train
- valid
- test

In [1]:
import pickle
import glob
import os
import shutil
from tools.mask_generator import generate_mask_image
from sklearn.model_selection import train_test_split
from natsort import natsorted

### Load All files

In [47]:
# Root of the raw dataset; searched recursively for annotation JSON and JPEG files.
data_path = "../data/GoPro/"
# Flat output folder that receives a copy of every source image.
original_dir = "../data/original_imgs"
# Output folder handed to generate_mask_image for the generated masks.
masked_img_dir = "../data/masked_imgs"

In [63]:
# Collect every annotation JSON below data_path except the "meta.json" files.
# Python's glob does not implement shell extglob negation ("!(...)"): outside of
# "[...]" the characters "!", "(" and ")" are matched literally, so the exclusion
# has to be done by filtering on the base name after globbing.
json_file_names = [
    json_path
    for json_path in glob.glob(data_path + "**/*.json", recursive=True)
    if os.path.basename(json_path) != "meta.json"
]
# Collect every JPEG image below data_path (recursive search).
img_files_names = glob.glob(data_path + "**/*.jpg", recursive=True)

In [64]:
# Make sure both output folders exist before anything is written into them;
# exist_ok=True keeps the cell safe to re-run.
for output_dir in (original_dir, masked_img_dir):
    os.makedirs(output_dir, exist_ok=True)

In [65]:
# Copy every source image into the flat original_dir folder.
# NOTE(review): shutil.copy keeps only the base name when the destination is a
# directory, so images sharing a file name across sub-folders would overwrite
# each other -- confirm the file names are unique.
for img_file in img_files_names:
    shutil.copy(img_file, original_dir)

# Generate a mask image for every annotation JSON (presumably written into
# masked_img_dir by generate_mask_image -- verify against tools.mask_generator).
for mask_json in json_file_names:
    try:
        generate_mask_image(mask_json, masked_img_dir)
    except Exception as e:
        # Best effort: keep processing the remaining files, but report the
        # reason alongside the file name so failures can be diagnosed.
        print(mask_json, repr(e))

## Create data directories for training

In [2]:
# Glob patterns (constants) used to list the copied images and the generated masks.
# The leading "*" in the folder component matches any directory under ../data whose
# name ends with "original_imgs" / "masked_imgs"; the trailing "*" matches every entry inside.
IMG_PATH = '../data/*original_imgs/*'
MASKED_IMG_PATH = '../data/*masked_imgs/*'

In [3]:
# List the image files and the mask files matched by the two glob patterns.
img_list, anno_img_list = glob.glob(IMG_PATH), glob.glob(MASKED_IMG_PATH)
# Natural-sort both lists so they can be split in lockstep afterwards.
# NOTE(review): this presumably pairs each image with its mask by position --
# confirm that both folders hold matching file names and the same number of files.
sorted_img_list, sorted_anno_list = natsorted(img_list), natsorted(anno_img_list)

In [5]:
##### Prepare data
# Hold out 20% of the image/annotation lists as the test set (fixed seed).
train_img_list, test_img_list, train_anno_list, test_anno_list = train_test_split(
    sorted_img_list,
    sorted_anno_list,
    test_size=0.2,
    random_state=1,
)
# Split the remaining training data again to obtain the validation set:
# 0.15 x 0.8 = 0.12 of all samples.
(
    train_img_list,
    val_img_list,
    train_anno_list,
    val_anno_list,
) = train_test_split(
    train_img_list,
    train_anno_list,
    test_size=0.15,
    random_state=1,
)

In [7]:
def create_dir(img_list, anno_list, target_dir, data_dir="../data"):
    """Copy one data split into ``<data_dir><target_dir>/img`` and ``.../anno``.

    Both sub-folders are created up front (also when the corresponding list
    is empty), then every listed file is copied into its folder.

    Args:
        img_list: Paths of the image files copied into the ``img`` folder.
        anno_list: Paths of the annotation files copied into the ``anno`` folder.
        target_dir: Split folder, appended verbatim to ``data_dir``; it must
            carry its own leading slash (e.g. ``"/train"``).
        data_dir: Root folder under which the split folder is created.
    """
    # Plain concatenation on purpose: os.path.join would discard data_dir
    # because target_dir starts with "/".
    split_root = data_dir + target_dir
    for sub_dir, file_paths in (("img", img_list), ("anno", anno_list)):
        destination = split_root + "/" + sub_dir
        # Create the folder once per split instead of once per copied file.
        os.makedirs(destination, exist_ok=True)
        for file_path in file_paths:
            shutil.copy(file_path, destination + "/")

In [8]:
# Materialise the three splits on disk, in the order train, test, valid.
for split_dir, split_imgs, split_annos in (
    ("/train", train_img_list, train_anno_list),
    ("/test", test_img_list, test_anno_list),
    ("/valid", val_img_list, val_anno_list),
):
    create_dir(split_imgs, split_annos, split_dir)