In [None]:
!pip install pycocotools
!pip install 'git+https://github.com/facebookresearch/detectron2.git'

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# python libraries
from itertools import groupby
import os
import json

#detectron2
from detectron2.data.datasets import register_coco_instances
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer
from detectron2.data import DatasetCatalog, MetadataCatalog

#sklearn
from sklearn.model_selection import GroupShuffleSplit

#computer vision lib
import cv2

#pycocotools
from pycocotools import mask


## Steps
1. Decoding from competition RLE format
2. Encoding to coco RLE format
3. Build coco json file
4. Use detectron2's register_coco_instances directly

COCO json format
reference: https://www.immersivelimit.com/tutorials/create-coco-annotations-from-scratch

- categories :[]
     - id
     - name

- images: []
    - id # maps to annotation image_id
    - width
    - height
    - file_name #filepath

- annotations: []
    - segmentation
        - counts [] # run lengths in column-major order (top to bottom, then left to right) [not a thing_pixel, thing_pixel....]
        - size # height, width
    - bbox [] # [top left x position, top left y position, width, height].
    - area # num of pixel
    - image_id 
    - category_id
    - iscrowd
    - id
 

## Utils for converting from competition RLE to coco RLE

In [None]:
def RLEdecode(rle, image_size:tuple):
    """
    Decode a competition-format RLE string into a 2-D binary mask.

    The string holds whitespace-separated ``start length`` pairs, where each
    start is a 1-based index into the flattened (row-major) image.

    Returns a ``uint8`` array of shape ``image_size`` (height, width) in which
    the encoded pixels are set to 1 and everything else is 0.
    """
    n_rows, n_cols = image_size
    flat_mask = np.zeros(n_rows * n_cols, dtype=np.uint8)

    tokens = np.array(list(map(int, rle.split())))
    # Starts are 1-based in the encoding; shift them to 0-based slice bounds.
    run_begins = tokens[::2] - 1
    run_ends = run_begins + tokens[1::2]

    for begin, end in zip(run_begins, run_ends):
        flat_mask[begin:end] = 1

    return flat_mask.reshape(image_size)

def bit_mask_to_coco_rle(bitmask):
    """
    Convert a 2-D mask into an uncompressed COCO RLE dictionary.

    The mask is flattened in column-major (Fortran) order and encoded as
    alternating run lengths that always start with a run of background
    pixels; if the first pixel is foreground, that first run has length 0.
    Any non-zero pixel is treated as foreground, so 0/1 and 0/255 masks
    encode identically.

    Parameters
    ----------
    bitmask : array-like
        2-D mask of shape (height, width).

    Returns
    -------
    dict
        ``{"size": <mask shape>, "counts": [int, ...]}`` where ``counts``
        contains plain Python ints so the result can be passed to ``json.dump``.
    """
    mask_arr = np.asarray(bitmask)
    foreground = mask_arr.ravel(order='F') != 0

    rle = {"size": mask_arr.shape}
    if foreground.size == 0:
        rle['counts'] = []
        return rle

    # Positions where the value flips mark the start of a new run; computing
    # them with array ops avoids a Python-level loop over every pixel.
    run_starts = np.flatnonzero(foreground[1:] != foreground[:-1]) + 1
    run_edges = np.concatenate(([0], run_starts, [foreground.size]))
    counts = np.diff(run_edges).tolist()

    # COCO counts must begin with a background run.
    if foreground[0]:
        counts.insert(0, 0)

    rle['counts'] = counts
    return rle

## Load in data and generate category mapping

In [None]:
# pandas is already imported as `pd` in the imports cell at the top of the
# notebook, so it is not re-imported here.
train_df = pd.read_csv("../input/sartorius-cell-instance-segmentation/train.csv")
train_df.head()

In [None]:
# Give every distinct cell type a 1-based category id, in order of first appearance.
cell_types = train_df['cell_type'].unique().tolist()
cell_types_mapping = dict(zip(cell_types, range(1, len(cell_types) + 1)))
print("Cell types mapping", cell_types_mapping, sep="\n")

In [None]:
# Decode one annotation as a sanity check. The mask size is read from the same
# row (as generate_train_coco_json does) instead of being hard-coded.
sample_row = train_df.loc[1]
test_annotation = sample_row['annotation']
image_array = RLEdecode(test_annotation, (int(sample_row['height']), int(sample_row['width'])))
plt.imshow(image_array)

## Comparing pycocotools conversion with custom conversion
1. We can see that pycocotools compresses the RLE into a binary string, hence we will use our own method

In [None]:
# Encode the same mask with both approaches so the two outputs can be compared:
# pycocotools on a Fortran-ordered copy, and the custom helper on the mask itself.
coco_rle_pycoco = mask.encode(np.asfortranarray(image_array))
coco_rle_custom = bit_mask_to_coco_rle(image_array)
print("Custom method", coco_rle_custom)
print("pycocotools method", coco_rle_pycoco)

## Convert to COCO Json format

In [None]:
def generate_train_coco_json(df, image_training_root, cell_types_mapping):
    """
    Build a COCO-style dictionary (categories, images, annotations) from an
    annotation dataframe (train or validation split).

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotated instance. The columns read are ``id`` (image id),
        ``annotation`` (competition RLE string), ``width``, ``height`` and
        ``cell_type``.
    image_training_root : str
        Directory prefix joined with ``<image id>.png`` to form ``file_name``.
    cell_types_mapping : dict
        Maps cell type name -> integer category id.

    Returns
    -------
    dict
        ``{"categories": [...], "images": [...], "annotations": [...]}``
        containing only JSON-serialisable values.

    Notes
    -----
    Rows whose annotation decodes to an empty mask are skipped: they have no
    bounding box, and taking the min/max of their (empty) pixel coordinates
    would raise.
    """
    # final json file
    coco_dict = {}

    # build categories
    categories = [{"id": idx, "name": name} for name, idx in cell_types_mapping.items()]

    # build images: one entry per unique image id, sized from its first row
    unique_image_df = df.groupby("id").agg('first')
    images = [
        {
            'id': str(image_id),
            'width': int(width),
            'height': int(height),
            # str() keeps this working when the ids are not strings
            'file_name': os.path.join(image_training_root, str(image_id) + '.png'),
        }
        for image_id, width, height in zip(
            unique_image_df.index, unique_image_df['width'], unique_image_df['height']
        )
    ]

    # build annotations
    annotations = []
    for i in tqdm(range(len(df))):
        row = df.iloc[i]
        bit_map = RLEdecode(row['annotation'], (row['height'], row['width']))
        ys, xs = np.where(bit_map)
        if xs.size == 0:
            # Empty mask: no pixels, hence no bounding box to record.
            continue

        x1, x2 = np.min(xs), np.max(xs)
        y1, y2 = np.min(ys), np.max(ys)

        annotations.append({
            "segmentation": bit_mask_to_coco_rle(bit_map),
            # +1 because x2/y2 are inclusive pixel coordinates
            "bbox": [int(x1), int(y1), int(x2 - x1 + 1), int(y2 - y1 + 1)],  # xywh
            "area": int(np.sum(bit_map)),
            "image_id": str(row['id']),
            "category_id": cell_types_mapping[row['cell_type']],
            "iscrowd": 0,
            # the dataframe index label doubles as the annotation id
            "id": int(df.index[i])
        })

    coco_dict['categories'] = categories
    coco_dict['images'] = images
    coco_dict['annotations'] = annotations

    return coco_dict
    

## Train test split

In [None]:
# Split by image id so that all annotations of one image land in the same split.
group_shuffle_split = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=42)

# n_splits=1, so the splitter yields exactly one (train, test) pair of positions.
train_idx, test_idx = next(
    group_shuffle_split.split(train_df, None, groups=train_df['id'])
)
train_dataset_df = train_df.iloc[train_idx]
test_dataset_df = train_df.iloc[test_idx]

In [None]:
# Build the COCO dictionaries for both splits; file names are relative to "train".
train_coco_json, test_coco_json = (
    generate_train_coco_json(split_df, "train", cell_types_mapping)
    for split_df in (train_dataset_df, test_dataset_df)
)

In [None]:
# Write each split's COCO dictionary to its own JSON file in the working directory.
for out_path, coco_payload in (
    ("./train_coco.json", train_coco_json),
    ("./val_coco.json", test_coco_json),
):
    with open(out_path, "w") as jsonFile:
        json.dump(coco_payload, jsonFile)

## Visualizing with detectron2

In [None]:
## register in the dataset
# Change config to bitmask: the generated annotations store RLE masks, not polygons
cfg = get_cfg()
cfg.INPUT.MASK_FORMAT='bitmask'

# Registering a name that already exists fails, so guard the call to keep this
# cell re-runnable within a live kernel.
if "satorius_train" not in DatasetCatalog.list():
    register_coco_instances("satorius_train",{},
                            "../input/satorius-segmentation-coco-json/train_coco.json",
                            image_root='../input/sartorius-cell-instance-segmentation')

In [None]:
# Look up the metadata and the dataset records registered under "satorius_train".
train_metadata, train_dict = (
    MetadataCatalog.get("satorius_train"),
    DatasetCatalog.get("satorius_train"),
)

In [None]:
# Draw the ground-truth annotations of one training record:
# read image with cv2, create visualizer with the dataset metadata, show image
sample_train_record = train_dict[1]
img = cv2.imread(sample_train_record["file_name"])
if img is None:
    # cv2.imread returns None instead of raising when the file cannot be read
    raise FileNotFoundError(
        "Could not read image: {}".format(sample_train_record["file_name"])
    )

# cv2 loads images as BGR; reverse the channel axis to hand the Visualizer RGB
visualizer = Visualizer(img[:,:,::-1], metadata=train_metadata, scale=1)
out = visualizer.draw_dataset_dict(sample_train_record)
out.save("./sample.png")

plt.figure(figsize=(25,15))
# get_image() is already RGB, which is what plt.imshow expects; reversing the
# channels again (only needed for cv2-style display) would swap overlay colours.
plt.imshow(out.get_image());