In [None]:
# This section is to move to the directory on Google Drive
import os
# os.chdir('drive/MyDrive/ComVis_20211')

In [None]:
import cv2
import numpy as np
from tqdm import tqdm

In [None]:
# change this to your data folder path
path = "Parasyte"
# fraction of each chapter's page list used for the train / valid / test split
train, valid, test = 0.7, 0.15, 0.15

In [None]:
# create train/valid/test folders (only those that do not exist yet)
for split_dir in ('Train', 'Valid', 'Test'):
  if not os.path.exists(split_dir):
    os.makedirs(split_dir)

In [None]:
# sort rectangles by area
def sort_rect(rect):
    """Sort key for rectangle tuples: return the item stored at index 4.

    The panel-extraction code builds rectangles as ``(x, y, w, h, w*h)``,
    so index 4 holds the bounding-box area.
    """
    area = rect[4]
    return area

In [None]:
# detect iou of overlapping rectangles
def detect_overlap(rect1, rect2, threshold=0.2):
    """Return True when ``rect2`` should be treated as overlapping ``rect1``.

    Both rectangles are sequences whose first four items are ``(x, y, w, h)``;
    any further items (e.g. a cached area) are ignored.

    Args:
        rect1: reference rectangle.
        rect2: candidate rectangle.
        threshold: IoU value that must be strictly exceeded to count as an
            overlap when ``rect2`` is not fully contained in ``rect1``.

    Returns:
        True if ``rect2`` lies entirely inside ``rect1`` or if their
        intersection-over-union is greater than ``threshold``; False otherwise
        (including the degenerate case where the union has zero area).
    """
    x1, y1, w1, h1 = rect1[:4]
    x2, y2, w2, h2 = rect2[:4]

    # rect2 fully inside rect1 counts as an overlap regardless of the IoU value
    if x1 <= x2 and y1 <= y2 and (x2 + w2) <= (x1 + w1) and (y2 + h2) <= (y1 + h1):
        return True

    # Intersection extents use the same convention as the w*h areas below
    # (no "+1"); mixing the two inflates the IoU and makes rectangles that
    # merely touch look like overlaps.
    inter_w = min(x1 + w1, x2 + w2) - max(x1, x2)
    inter_h = min(y1 + h1, y2 + h2) - max(y1, y2)
    inter_area = max(0, inter_w) * max(0, inter_h)

    union_area = w1 * h1 + w2 * h2 - inter_area
    if union_area <= 0:
        # two zero-area rectangles: nothing to overlap, and avoids dividing by zero
        return False

    return inter_area / union_area > threshold

In [None]:
# extract panels from images
def roi_split(img_list, dir, parent_dir, src_dir=None):
    """Cut large, non-overlapping panels out of page images and save them.

    For every image: grayscale -> Gaussian blur -> Canny edges -> contours.
    Bounding boxes larger than 1/16 of the image area are kept, sorted by
    area (largest first), and any box flagged by ``detect_overlap`` against an
    earlier (larger or equal) box is dropped. Each surviving box is cropped
    and written as ``<parent_dir>_ROI_<n>.jpg`` inside ``dir``, where ``n``
    counts from 0 for each call.

    Args:
        img_list: file names of the page images to process.
        dir: output folder for the cropped panels (name shadows the builtin,
            kept for compatibility with existing callers).
        parent_dir: prefix used in the output file names.
        src_dir: folder containing the files in ``img_list``. When omitted,
            the notebook-level variable ``chapter_path`` is used, which is
            what earlier versions of this function always did.

    Returns:
        None. Files that ``cv2.imread`` cannot decode are skipped.
    """
    ROI_number = 0
    for img_name in img_list:
        # Resolved lazily so an empty img_list never touches the global.
        base_dir = chapter_path if src_dir is None else src_dir
        img = os.path.join(base_dir, img_name)

        # Load image, grayscale, Gaussian blur, Canny edge detection
        image = cv2.imread(img)
        if image is None:
            # cv2.imread signals an unreadable / non-image file by returning
            # None rather than raising; skip it instead of crashing below.
            continue
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        blurred = cv2.GaussianBlur(gray, (3,3), 0)
        canny = cv2.Canny(blurred, 100, 200, 1)

        # Find contours; the result is either a 2-tuple or a 3-tuple, and the
        # contour list sits at index 0 or 1 respectively.
        cnts = cv2.findContours(canny, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
        cnts = cnts[0] if len(cnts) == 2 else cnts[1]
        cnts = sorted(cnts, key=cv2.contourArea, reverse=True)

        # image.shape is (rows, cols, ...): height first, then width
        img_h, img_w = image.shape[:2]
        threshold_lower = img_h*img_w/16

        # Keep only bounding boxes bigger than 1/16 of the page
        vertices = []
        for c in cnts:
            x, y, w, h = cv2.boundingRect(c)
            if (w*h) > threshold_lower:
                vertices.append((x, y, w, h, w*h))

        vertices = sorted(vertices, key=sort_rect, reverse=True)

        # Flag every box that overlaps a box appearing earlier in the list
        overlapped = [False]*len(vertices)
        for i, rect1 in enumerate(vertices):
            for j in range(i+1, len(vertices)):
                if detect_overlap(rect1, vertices[j]):
                    overlapped[j] = True

        true_bound = [rect for rect, flagged in zip(vertices, overlapped) if not flagged]

        # Crop and save each surviving panel
        for rect in true_bound:
            x, y, w, h = rect[0], rect[1], rect[2], rect[3]
            ROI = image[y:y+h, x:x+w]
            img_location = os.path.join(dir, '{}_ROI_{}.jpg'.format(parent_dir, ROI_number))
            cv2.imwrite(img_location, ROI)
            ROI_number += 1

In [None]:
# split data into train/valid/test
# os.listdir returns entries in arbitrary order, so both listings are sorted
# (plain string order) to make the split the same on every run and machine.
for chapter in tqdm(sorted(os.listdir(path))):
  chapter_path = os.path.join(path, chapter)
  if not os.path.isdir(chapter_path):
    # stray files next to the chapter folders would make os.listdir fail below
    continue
  chapter_img = sorted(os.listdir(chapter_path))
  train_len = int(len(chapter_img)*train)
  valid_len = int(len(chapter_img)*valid)
  # whatever is left after train and valid goes to test
  test_len = len(chapter_img) - train_len - valid_len
  train_img = chapter_img[:train_len]
  valid_img = chapter_img[train_len:(train_len + valid_len)]
  test_img = chapter_img[(train_len + valid_len):]
  # roi_split reads the chapter folder from the `chapter_path` variable set above
  roi_split(train_img, 'Train', chapter)
  roi_split(valid_img, 'Valid', chapter)
  roi_split(test_img, 'Test', chapter)

100%|██████████| 64/64 [14:13<00:00, 13.34s/it]


In [None]:
# number of entries in the Train output folder
len(os.listdir('Train'))

6893

In [None]:
# number of entries in the Valid output folder
len(os.listdir('Valid'))

1387

In [None]:
# number of entries in the Test output folder
len(os.listdir('Test'))

1672