# Detectron: preprocess data into COCO format

In [None]:
# Run-mode switches for the whole notebook.
# DEBUG: when True, only a 100-row random sample of train.csv is processed.
DEBUG = False
# KAGGLE: when True, data is read from the Kaggle input directory.
KAGGLE = False
# COLAB: when True, Google Drive is mounted and used as the working directory.
COLAB = True

In [None]:
# On Colab, mount Google Drive at /content/drive; the working directory used
# below lives under that mount point.
if COLAB:
    from google.colab import drive
    drive.mount('/content/drive')

In [None]:
import os
import json
import itertools
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from pycocotools.coco import COCO
import skimage.io as io
import matplotlib.pyplot as plt
from pathlib import Path
from PIL import Image
from sklearn.model_selection import (
    StratifiedKFold, 
    GroupKFold)

## Utils

In [None]:
def rle_to_mask(mask_rle, shape):
    '''
    Decode a run-length encoded string into a binary mask.

    mask_rle: whitespace-separated "start length" pairs; starts are 1-based
        positions in the flattened (row-major) mask
    shape: (height, width) of the array to return
    Returns a numpy uint8 array of the given shape, 1 - mask, 0 - background
    '''
    tokens = mask_rle.split()
    # Even positions hold the starts, odd positions the run lengths.
    run_starts = np.asarray(tokens[0::2], dtype=int) - 1  # to 0-based
    run_lengths = np.asarray(tokens[1::2], dtype=int)
    run_ends = run_starts + run_lengths

    flat = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for begin, end in zip(run_starts, run_ends):
        flat[begin:end] = 1
    return flat.reshape(shape)

def mask_to_rle(mask):
    '''
    Encode a mask as an uncompressed run-length dictionary.

    mask: 2-D numpy array, 1 - mask, 0 - background
    Returns {'counts': [...], 'size': [height, width]} where counts are the
    lengths of consecutive equal-valued runs, scanning column by column
    '''
    run_lengths = []
    column_major = mask.ravel(order='F')
    for position, (value, run) in enumerate(itertools.groupby(column_major)):
        # When the very first pixel is foreground, emit an empty leading run
        # so that the counts always begin with a background run.
        if position == 0 and value == 1:
            run_lengths.append(0)
        run_lengths.append(sum(1 for _ in run))
    return {'counts': run_lengths, 'size': list(mask.shape)}

In [None]:
def coco_structure(df, desc=''):
    '''
    Build a COCO-format annotation dictionary from an annotation table.

    df: DataFrame with one row per annotated instance; the columns id,
        width, height, cell_type and annotation (RLE string understood by
        rle_to_mask) are read
    desc: label shown on the progress bar
    Returns a dict with the keys 'categories', 'images' and 'annotations'

    Category ids start at 1 and follow the sorted cell types present in df,
    so two calls only agree on the ids when both frames contain the same set
    of cell types. Annotation ids are the index labels of df.
    Raises ValueError when an annotation decodes to an empty mask.
    '''
    cat_idxs = {name: idx for idx, name
                in enumerate(sorted(df.cell_type.unique()), start=1)}
    cats = [{'name': name, 'id': idx} for name, idx in cat_idxs.items()]
    imgs = [
        {
            'id': idx,
            # Plain ints, like the bbox values below, so that json.dump never
            # receives a numpy scalar.
            'width': int(row.width),
            'height': int(row.height),
            'file_name': f'train/{idx}.png'
        }
        for idx, row in df.groupby('id').agg('first').iterrows()
    ]
    anns = []
    for idx, row in tqdm(df.iterrows(), total=len(df), desc=desc):
        msk = rle_to_mask(row.annotation, (row.height, row.width))
        ys, xs = np.where(msk)
        # Array reductions instead of the builtin min/max, which would walk
        # the coordinate arrays element by element in Python.
        x1, x2 = xs.min(), xs.max()
        y1, y2 = ys.min(), ys.max()
        anns.append({
            'segmentation': mask_to_rle(msk),
            # [x, y, width, height]; both corner pixels are inclusive.
            'bbox': [int(x1),
                     int(y1),
                     int(x2 - x1 + 1),
                     int(y2 - y1 + 1)],
            'area': int(msk.sum()),
            'image_id': row.id,
            'category_id': cat_idxs[row.cell_type],
            'iscrowd': 0,
            'id': idx
        })
    return {'categories': cats, 'images': imgs, 'annotations': anns}

## Train and validation split

In [None]:
# Number of cross-validation folds the data is split into.
FOLDS = 5
# Fold index referenced by the plotting cell below; presumably the fold held
# out for validation downstream -- TODO confirm.
VAL_FOLD = 0
# Root directory of the project: the mounted Drive folder on Colab, otherwise
# the current directory.
WORK_DIR = '/content/drive/MyDrive/sartorius' if COLAB else '.'
# Directory that holds train.csv and receives the generated annotation files.
DATA_PATH = '../input/sartorius-cell-instance-segmentation' if KAGGLE else f'{WORK_DIR}/data'

In [None]:
# Load the annotation table and tag every row with a validation fold.
df = pd.read_csv(f'{DATA_PATH}/train.csv')
if DEBUG:
    # Work on a small random subset. drop=True discards the old labels rather
    # than keeping them as an extra "index" column in the frame.
    df = df.sample(100).reset_index(drop=True)
# Grouping by image id keeps all annotations of one image in the same fold.
gkf = GroupKFold(n_splits=FOLDS)
df['fold'] = -1
for i, (train_idxs, val_idxs) in enumerate(gkf.split(df, groups=df['id'])):
    # split() yields positional indices; they coincide with the index labels
    # because the frame carries a fresh 0..n-1 index at this point.
    df.loc[val_idxs, 'fold'] = i
display(df.head())

In [None]:
# For every fold, plot the cell-type histogram of the training part (all other
# folds) next to that of the validation part (the fold itself).
for val_fold in range(FOLDS):
    train_part = df.loc[df.fold != val_fold]
    val_part = df.loc[df.fold == val_fold]
    plt.figure(figsize=(16, 4))
    plt.subplot(1, 3, 1)
    # The counts in the titles follow the loop variable, so each figure
    # reports the fold that is actually drawn.
    plt.title(f'train data, {len(train_part.id.unique())} unique imgs')
    train_part.cell_type.hist()
    plt.subplot(1, 3, 2)
    plt.title(f'val data, fold {val_fold}, {len(val_part.id.unique())} unique imgs')
    val_part.cell_type.hist()
    plt.show()

## Preprocess

In [None]:
# Write one train and one val COCO annotation file per fold.
for fold in tqdm(range(FOLDS), desc='folds'):
    train_df = df.loc[df.fold != fold]
    val_df = df.loc[df.fold == fold]
    for split, split_df in (('train', train_df), ('val', val_df)):
        file_name = f'{DATA_PATH}/coco_annotations_{split}_f{fold}.json'
        # Each file is checked on its own: an existing train file must not
        # prevent a missing val file from being generated.
        if os.path.exists(file_name):
            continue
        coco_anns = coco_structure(split_df, desc=f'{split} fold {fold}')
        with open(file_name, 'w', encoding='utf-8') as file:
            json.dump(coco_anns, file, ensure_ascii=True, indent=4)

## COCO demo plots

In [None]:
# Load the fold-0 training annotations written by the preprocessing cell and
# take the image records of the first four image ids for the demo plots.
coco = COCO(f'{DATA_PATH}/coco_annotations_train_f0.json')
img_idxs = coco.getImgIds()
imgs = coco.loadImgs(img_idxs[:4])

In [None]:
# One row per image: the raw image on the left, the same image with its
# annotations drawn on top on the right.
# squeeze=False keeps axs two-dimensional even for a single image, so each
# row can always be indexed as ax[0] / ax[1].
_, axs = plt.subplots(len(imgs), 2, figsize=(14, 4 * len(imgs)), squeeze=False)
for img, ax in zip(imgs, axs):
    I = io.imread(f'{DATA_PATH}/{img["file_name"]}')
    ann_idxs = coco.getAnnIds(imgIds=[img['id']])
    anns = coco.loadAnns(ann_idxs)
    ax[0].imshow(I)
    ax[1].imshow(I)
    # showAnns is given no axes, so make the right-hand one current first.
    plt.sca(ax[1])
    coco.showAnns(anns, draw_bbox=True)