In [1]:
import glob
import os
import numpy as np
import json
import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
import cv2
from scipy.ndimage import label

train = glob.glob("/home/yiw/kg/input/train/*")

# polygons.jsonl holds one JSON record per line; index the per-image
# annotation lists by the record's image id.
annotations = {}
with open('/home/yiw/kg/input/polygons.jsonl', 'r') as fh:
    for raw_line in fh:
        record = json.loads(raw_line)
        annotations[record['id']] = record['annotations']

# Image id = file name up to its first dot; map it to the full path.
image_map = {}
for impath in train:
    file_name = impath.split('/')[-1]
    image_map[file_name.split('.')[0]] = impath

# Number of distinct image ids found on disk vs. number of annotated images
# (the recorded run printed "7034 1633").
print(len(image_map), len(annotations))

7034 1633


In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from tqdm.notebook import tqdm
from datetime import datetime
import json,itertools
from typing import Optional
from sklearn.model_selection import KFold

def rle_decode(mask_rle, shape):
    """Decode a run-length string into a binary mask.

    mask_rle: whitespace-separated "start length start length ..." string,
              where each start is a 1-based index into the flattened mask.
    shape:    (height, width) of the array to return.

    Returns a numpy uint8 array of the given shape (1 = mask, 0 = background).
    The flat buffer is reshaped in numpy's default row-major order.
    """
    tokens = mask_rle.split()
    # Even positions hold run starts, odd positions hold run lengths.
    run_starts = np.asarray(tokens[0::2], dtype=int) - 1  # 1-based -> 0-based
    run_lengths = np.asarray(tokens[1::2], dtype=int)
    run_ends = run_starts + run_lengths

    flat_mask = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for begin, end in zip(run_starts, run_ends):
        flat_mask[begin:end] = 1
    return flat_mask.reshape(shape)

# From https://newbedev.com/encode-numpy-array-using-uncompressed-rle-for-coco-dataset
def binary_mask_to_rle(binary_mask):
    """Encode a 2-D mask as an uncompressed run-length dict.

    binary_mask: array-like mask; every non-zero value counts as foreground.

    Returns {'counts': [...], 'size': [d0, d1]} where 'size' is the mask's
    shape and 'counts' are the alternating run lengths of the mask flattened
    in column-major (Fortran) order. The first count always describes a
    background run, so a mask that starts with foreground gets a leading 0.
    All counts are plain Python ints so the result can be json-dumped.
    """
    mask = np.asarray(binary_mask)
    # Binarise first: masks stored as 0/255 or with several non-zero labels
    # would otherwise lose the leading 0 or have one foreground run split up.
    flat = (mask != 0).ravel(order='F')

    rle = {'counts': [], 'size': list(mask.shape)}
    if flat.size == 0:
        return rle

    # Indices at which the value differs from its predecessor = run starts.
    run_starts = np.flatnonzero(flat[1:] != flat[:-1]) + 1
    boundaries = np.concatenate(([0], run_starts, [flat.size]))
    counts = np.diff(boundaries).tolist()

    if flat[0]:
        # The encoding starts with a background run; mark it as empty.
        counts.insert(0, 0)
    rle['counts'] = counts
    return rle

# Running annotation id, shared by every coco_structure() call so that ids
# keep increasing across all generated train/valid structures.
idx = 0

def coco_structure(image_ids, height=512, width=512):
    """Build a COCO-style dict for the "blood_vessel" polygons of some images.

    image_ids: iterable of image ids; each must be a key of the module-level
               `annotations` and `image_map` dicts.
    height, width: size of every image / rasterised mask (default 512x512).

    Returns {'categories': [...], 'images': [...], 'annotations': [...]}.
    Each annotation carries the mask as an uncompressed RLE (from
    binary_mask_to_rle), an [x, y, w, h] pixel bounding box, the pixel area
    and a unique id taken from the global `idx` counter, which this function
    advances.

    Polygons of any other type, and polygons that rasterise to zero pixels,
    are skipped.
    """
    global idx
    cats = [{'name': "blood_vessel", 'id': 1}]
    images = [
        {'id': image_id, 'width': width, 'height': height, 'file_name': image_map[image_id]}
        for image_id in image_ids
    ]
    coco_annotations = []

    for image_id in tqdm(image_ids):
        for polygon in annotations[image_id]:
            if polygon["type"] != "blood_vessel":
                continue

            # cv2.fillPoly expects 32-bit integer points shaped (N, 1, 2).
            lines = np.array(polygon['coordinates'], dtype=np.int32).reshape(-1, 1, 2)
            mk = np.zeros((height, width), dtype=np.uint8)
            cv2.fillPoly(mk, [lines], 1)

            ys, xs = np.nonzero(mk)
            if xs.size == 0:
                # Nothing was drawn (e.g. polygon entirely outside the image);
                # a bounding box cannot be computed, so drop the polygon.
                continue
            x1, x2 = xs.min(), xs.max()
            y1, y2 = ys.min(), ys.max()

            coco_annotations.append({
                'segmentation': binary_mask_to_rle(mk),
                # +1 because x2/y2 are inclusive pixel indices.
                'bbox': [int(x1), int(y1), int(x2 - x1 + 1), int(y2 - y1 + 1)],
                # NOTE(review): presumably detectron2 BoxMode.XYWH_ABS — confirm.
                'bbox_mode': 1,
                'area': int(mk.sum()),
                'image_id': image_id,
                'category_id': 1,
                'iscrowd': 0,
                'id': idx
            })
            idx += 1
    return {'categories': cats, 'images': images, 'annotations': coco_annotations}

# One row per annotated image id; the ids are what gets split into folds.
train_df = pd.DataFrame()
train_df["id"] = list(annotations.keys())
ids = train_df.id.values

# Create the output directory up front so a fresh checkout does not fail
# with FileNotFoundError on the first open() below.
os.makedirs('../folds', exist_ok=True)

# Fixed seed keeps the 5-fold assignment reproducible between runs.
folds = KFold(n_splits=5, random_state=2023, shuffle=True).split(ids)
for fold, (train_idx, val_idx) in enumerate(folds):
    train_ids, val_ids = ids[train_idx], ids[val_idx]

    train_json = coco_structure(train_ids)
    valid_json = coco_structure(val_ids)

    # One train file and one validation file per fold.
    with open(f'../folds/coco_cell_train_fold{fold}.json', 'w', encoding='utf-8') as f:
        json.dump(train_json, f, ensure_ascii=True, indent=4)
    with open(f'../folds/coco_cell_valid_fold{fold}.json', 'w', encoding='utf-8') as f:
        json.dump(valid_json, f, ensure_ascii=True, indent=4)


  0%|          | 0/1306 [00:00<?, ?it/s]

  0%|          | 0/327 [00:00<?, ?it/s]

  0%|          | 0/1306 [00:00<?, ?it/s]

  0%|          | 0/327 [00:00<?, ?it/s]

  0%|          | 0/1306 [00:00<?, ?it/s]

  0%|          | 0/327 [00:00<?, ?it/s]

  0%|          | 0/1307 [00:00<?, ?it/s]

  0%|          | 0/326 [00:00<?, ?it/s]

  0%|          | 0/1307 [00:00<?, ?it/s]

  0%|          | 0/326 [00:00<?, ?it/s]