# Data Processors

In [17]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import os
import sys
import json
import multiprocessing
from multiprocessing import Pool
from functools import partial

import cv2
import numpy as np
from PIL import Image
from matplotlib import pyplot as plt
from tqdm import tqdm

In [2]:
# Dataset layout: <base_dir>/<ds>/{images,labels,masks}
base_dir = '/home/xd/data/fire'
ds = '2'

img_dir = os.path.join(base_dir, ds, 'images')
label_dir = os.path.join(base_dir, ds, 'labels')
mask_dir = os.path.join(base_dir, ds, 'masks')

# exist_ok avoids the check-then-create race of a separate isdir() test.
os.makedirs(mask_dir, exist_ok=True)

img_names = os.listdir(img_dir)

img_paths = [os.path.join(img_dir, img_name) for img_name in img_names]
# Each image is paired with the label file named after the part of the image
# file name that precedes its first '.'.
label_paths = [os.path.join(label_dir, img_name.split('.')[0] + '.json') for img_name in img_names]

## Mask Creation

In [13]:
def _parse_contour(positions, key, skip):
    """Extract one polygon from a raw ``positions`` string.

    The text between the first occurrence of ``key`` and the next ``]`` is
    treated as a run of point records separated by ``},{``; in every record
    the first comma-separated field supplies x and the second supplies y
    (the value after the last ``:`` of each field).

    Args:
        positions: Raw annotation string read from the label file.
        key: Marker introducing the polygon (``'meaningful'`` or ``'empty'``).
        skip: Number of characters, counted from the start of ``key``, to drop
            before the first record begins.

    Returns:
        An ``int32`` array of shape ``(n_points, 1, 2)`` holding ``(x, y)``
        pairs, or ``None`` when ``key`` or its closing ``]`` is missing or
        fewer than two points are listed.

    Raises:
        ValueError, IndexError: if a record does not contain two numeric fields.
    """
    start = positions.find(key)
    if start == -1:
        # Without this guard the -1 from find() would be used as a slice
        # offset and unrelated text would be parsed as this polygon.
        return None
    end = positions.find(']', start)
    if end == -1:
        return None

    # ``end - 1`` also drops the closing brace of the last record.
    records = positions[start + skip:end - 1].split('},{')
    if len(records) < 2:
        return None

    points = []
    for record in records:
        fields = record.split(',')
        x = fields[0].split(':')[-1]
        y = fields[1].split(':')[-1]
        # Coordinates are truncated toward zero.
        points.append((int(float(x)), int(float(y))))

    return np.array(points, dtype=np.int32).reshape(-1, 1, 2)


def export_mask(label_path):
    """Render the polygons of one label file into a single-channel mask.

    Args:
        label_path: Path to a JSON label file providing ``materialInfo``
            (``width`` and ``high``) and ``annotateInfo``.

    Returns:
        A ``uint8`` array of shape ``(height, width)``. All 'meaningful'
        polygons are filled with 255 first, then all 'empty' polygons are
        filled with 0 on top of them.

    Raises:
        OSError: if the label file cannot be opened.
        KeyError: if an expected field is missing from the label.
    """
    # The context manager closes the file even if parsing fails.
    with open(label_path) as label_file:
        label_cont = json.load(label_file)

    width = label_cont['materialInfo']['width']
    # The label schema spells the height field 'high'.
    height = label_cont['materialInfo']['high']
    annos = label_cont['annotateInfo']

    mask = np.zeros((height, width), dtype=np.uint8)

    contours_all = []
    empty_all = []

    for anno in annos:
        # Only the first entry of each annotation's 'positions' list is read.
        positions = anno['positions'][0]['positions']

        meaningful = _parse_contour(positions, 'meaningful', 14)
        if meaningful is not None:
            contours_all.append(meaningful)

        empty = _parse_contour(positions, 'empty', 10)
        if empty is not None:
            empty_all.append(empty)

    cv2.drawContours(mask, contours_all, -1, (255), thickness=cv2.FILLED)
    cv2.drawContours(mask, empty_all, -1, (0), thickness=cv2.FILLED)

    return mask

In [16]:
# Render and save one PNG mask per image; images whose label file is missing
# are reported and skipped.
with tqdm(total=len(label_paths), file=sys.stdout) as pbar:
    for img_name, label_path in zip(img_names, label_paths):
        pbar.update(1)

        if not os.path.exists(label_path):
            # tqdm.write emits the message on its own line; a plain print()
            # gets appended to the progress bar text.
            tqdm.write(label_path, file=sys.stdout)
            continue

        mask = export_mask(label_path)
        mask_img = Image.fromarray(mask)

        # The mask is named after the part of the image name before the first '.'.
        mask_path = os.path.join(mask_dir, img_name.split('.')[0] + '.png')
        mask_img.save(mask_path)

 12%|█▏        | 381/3153 [00:05<00:51, 54.13it/s]/home/xd/data/fire/2/labels/1194932391064698883.json
100%|██████████| 3153/3153 [00:44<00:00, 70.82it/s] 


## JPEG to PNG Converter

In [8]:
# Switch to the 'large' dataset: <base_dir>/large/{images,masks}
base_dir = '/home/xd/data/fire'
ds = 'large'

ds_root = os.path.join(base_dir, ds)
img_dir = os.path.join(ds_root, 'images')
mask_dir = os.path.join(ds_root, 'masks')

# File names (not full paths) of everything inside the image directory.
img_names = os.listdir(img_dir)

In [4]:
def convert_worker(img_name, img_dir=''):
    """Re-encode one image as a PNG file stored next to the original.

    Args:
        img_name: File name of the source image inside ``img_dir``.
        img_dir: Directory that holds the source image; the PNG is written
            to the same directory.

    The output name is the part of ``img_name`` before its first '.' with
    '.png' appended.

    Raises:
        Whatever ``Image.open`` or ``Image.save`` raise for unreadable or
        unwritable files.
    """
    source_path = os.path.join(img_dir, img_name)
    # Join the directory exactly once; joining it a second time duplicated
    # the directory prefix whenever img_dir was a relative path.
    target_path = os.path.join(img_dir, img_name.split('.')[0] + '.png')

    # The context manager closes the image even when save() fails.
    with Image.open(source_path) as img:
        img.save(target_path)
    
# Bind the current img_dir so the worker can be called with just a file name.
# The directory is captured when this line runs, not when the wrapper is called.
convert_worker_wrapper = partial(convert_worker, img_dir=img_dir)

In [5]:
# There's something wrong with PIL + multiprocessing, so the conversion runs
# sequentially. The abandoned parallel attempt is kept here for reference:
#
# with Pool(multiprocessing.cpu_count()*2) as p:
#     rs = list(tqdm(
#         p.imap(convert_worker_wrapper, img_names),
#         total=len(img_names)
#     ))

# Names of the images that could not be converted.
failed_names = []

with tqdm(total=len(img_names), file=sys.stdout) as pbar:
    for img_name in img_names:
        pbar.update(1)

        try:
            convert_worker_wrapper(img_name)
        except Exception as err:
            # Best effort: report the file and carry on with the next one.
            # Catching Exception rather than using a bare except keeps
            # KeyboardInterrupt able to stop this long-running loop.
            failed_names.append(img_name)
            # tqdm.write keeps the message off the progress bar line.
            tqdm.write('{}: {!r}'.format(img_name, err), file=sys.stdout)

  5%|▍         | 238/4918 [00:44<10:15,  7.61it/s]1194940096135729155.jpg
 14%|█▎        | 672/4918 [02:24<18:56,  3.74it/s]  1194932466734501892.jpg
 15%|█▍        | 718/4918 [02:28<05:07, 13.68it/s]1194940066431303685.jpg
 38%|███▊      | 1864/4918 [06:34<32:56,  1.55it/s]  1194939997858992132.jpg
 59%|█████▉    | 2913/4918 [10:29<10:40,  3.13it/s]  1194940088191717379.jpg
 60%|██████    | 2972/4918 [10:39<10:17,  3.15it/s]1194940030599364611.jpg
 99%|█████████▉| 4858/4918 [18:11<00:11,  5.39it/s]1194940093291966467.jpg
100%|██████████| 4918/4918 [18:21<00:00,  4.47it/s]


In [12]:
# Remove non-PNG files from img_dir manually before running this cell.
# Every mask that has no identically named file in img_dir is then deleted.

img_names = os.listdir(img_dir)
mask_names = os.listdir(mask_dir)

# A set makes each membership test constant-time instead of a list scan.
known_img_names = set(img_names)

for mask_name in mask_names:
    if mask_name not in known_img_names:
        mask_path = os.path.join(mask_dir, mask_name)
        print(mask_path)

        os.remove(mask_path)

/home/xd/data/fire/large/masks/1194932466734501892.png
/home/xd/data/fire/large/masks/1194940030599364611.png
/home/xd/data/fire/large/masks/1194940088191717379.png
/home/xd/data/fire/large/masks/1194940093291966467.png
/home/xd/data/fire/large/masks/1194939997858992132.png
/home/xd/data/fire/large/masks/1194940066431303685.png
/home/xd/data/fire/large/masks/1194940096135729155.png


## Simple Data Mining

In [14]:
# NOTE(review): unlike the earlier cells, this path omits the `ds` component —
# confirm that <base_dir>/masks is the intended directory.
mask_dir = os.path.join(base_dir, 'masks')

In [41]:
# File names of every entry currently present in mask_dir.
mask_names = os.listdir(mask_dir)

def get_counts(mask_name, mask_dir):
    """Count the nonzero pixels of one mask image.

    Args:
        mask_name: File name of the mask inside ``mask_dir``.
        mask_dir: Directory containing the mask.

    Returns:
        A ``(count, shape)`` tuple: the number of nonzero pixels in the
        grayscale version of the image, and the shape of that grayscale array.

    Raises:
        OSError: if OpenCV cannot read the file as an image.
    """
    mask_path = os.path.join(mask_dir, mask_name)
    mask = cv2.imread(mask_path)
    if mask is None:
        # cv2.imread reports failure by returning None instead of raising,
        # which would otherwise surface as an opaque error inside cvtColor.
        raise OSError('cannot read mask image: {}'.format(mask_path))

    g_mask = cv2.cvtColor(mask, cv2.COLOR_BGR2GRAY)

    return len(np.nonzero(g_mask)[0]), g_mask.shape

In [42]:
# One get_counts result per mask, in the same order as mask_names.
rs = list(map(partial(get_counts, mask_dir=mask_dir), mask_names))

In [48]:
# Split the per-mask results into two parallel lists.
counts = [count for count, _ in rs]
shapes = [shape for _, shape in rs]

# Number of masks that contain at least one nonzero pixel.
print(len(np.nonzero(counts)[0]))
# Mean of the second shape entry, then mean of the first.
print(np.mean([second for _, second in shapes]))
print(np.mean([first for first, _ in shapes]))

4911
801.0146609651802
589.227652209326
