In [None]:
from pycocotools.coco import COCO
import skimage.io as io
import numpy as np
import pandas as pd
import os

import wandb

In [None]:
!git clone https://github.com/softwaremill/lemon-dataset.git -qq
!unzip -q lemon-dataset/data/lemon-dataset.zip

In [None]:
# --- Notebook configuration ---------------------------------------------------
# Folder that image `file_name` entries are joined onto when loading pixels.
RAW_DATA_FOLDER = 'lemon-dataset/'
# COCO-format annotation file handed to pycocotools.
ANNOTATIONS_FILE = 'lemon-dataset/annotations/instances_default.json'
# Weights & Biases project that the EDA run is logged to.
PROJECT_NAME = 'lemon-dataset'

In [None]:
# Load the annotation file through the pycocotools COCO wrapper; every later cell
# queries categories, images and annotations via this object.
coco = COCO(annotation_file=ANNOTATIONS_FILE)

In [None]:
# Category ids are fetched once and reused to load the full category records.
catIds = coco.getCatIds()
cats = coco.loadCats(catIds)

# Same pattern for images: ids first, then the corresponding image records.
imgIds = coco.getImgIds()
imgs = coco.loadImgs(imgIds)

In [None]:
def get_anns(img):
    """Return the annotation records attached to one image.

    Args:
        img: image record; only its ``'id'`` entry is read.

    Returns:
        Whatever ``coco.loadAnns`` yields for the annotation ids that match this
        image id and the module-level ``catIds`` (``iscrowd`` is left unfiltered).
    """
    matching_ids = coco.getAnnIds(
        imgIds=img['id'],
        catIds=catIds,
        iscrowd=None,
    )
    annotations = coco.loadAnns(matching_ids)
    return annotations

def get_label(ann):
    """Return the category name for an annotation.

    Scans the module-level ``cats`` records for entries whose ``'id'`` equals
    ``ann['category_id']`` and returns the ``'name'`` of the first match.

    Raises:
        IndexError: if no category record has a matching id.
    """
    matching_names = []
    for category in cats:
        if category['id'] == ann['category_id']:
            matching_names.append(category['name'])
    return matching_names[0]

In [None]:
def make_wandb_image(img):
    """Build a ``wandb.Image`` for one image record with ground-truth boxes and a mask.

    Args:
        img: image record; ``img['file_name']`` is joined onto ``RAW_DATA_FOLDER`` to
            locate the pixels, and the record is passed to ``get_anns`` to fetch its
            annotations.

    Returns:
        A ``wandb.Image`` carrying the pixel array, the module-level ``cats`` as class
        definitions, one ``'ground_truth'`` box per annotation and a single
        ``'ground_truth'`` mask.

    Notes:
        Each ``ann['bbox']`` is read as ``[x, y, width, height]`` and converted to
        min/max corners in pixel coordinates.
        The mask holds, per pixel, the largest category id among the annotations
        covering it (0 where nothing covers it). An image without annotations gets an
        all-zero mask instead of failing in ``np.stack`` on an empty list.
    """
    pth = os.path.join(RAW_DATA_FOLDER, img['file_name'])
    img_array = io.imread(pth)
    anns = get_anns(img)

    truth_box_data = []
    for ann in anns:
        bbox = ann['bbox']
        truth_box_data.append({
            'position': {'minX': bbox[0],
                         'minY': bbox[1],
                         'maxX': bbox[0] + bbox[2],
                         'maxY': bbox[1] + bbox[3]},
            'class_id': ann['category_id'],
            'box_caption': get_label(ann),
            'domain': 'pixel',
        })

    if anns:
        # Scale each per-annotation mask by its category id, then take the pixelwise
        # maximum: an arbitrary but deterministic way to resolve overlapping labels.
        masks = [coco.annToMask(ann) * ann['category_id'] for ann in anns]
        mask = np.stack(masks).max(axis=0)
    else:
        # No annotations: np.stack([]) would raise ValueError, so log an empty mask
        # sized like the loaded image instead.
        mask = np.zeros(img_array.shape[:2], dtype=np.uint8)

    return wandb.Image(
        img_array,
        classes=cats,
        boxes={'ground_truth': {'box_data': truth_box_data}},
        masks={'ground_truth': {'mask_data': mask}},
    )

In [None]:
# Display the loaded category records, e.g. to check the category id that is_mold
# hard-codes further down.
cats

In [None]:
def is_mold(img, mold_category_id=4):
    """Tell whether an image has at least one annotation of the mold category.

    Args:
        img: image record, passed to ``get_anns`` to fetch its annotations.
        mold_category_id: category id treated as "mold". Defaults to 4, the id the
            notebook identifies as the mold category in this dataset.

    Returns:
        True if any annotation's ``'category_id'`` equals ``mold_category_id``,
        otherwise False (including when the image has no annotations).
    """
    return any(ann['category_id'] == mold_category_id for ann in get_anns(img))

# One boolean per image record, in the same order as `imgs`.
is_mold_col = list(map(is_mold, imgs))

In [None]:
# Peek at one file name to see the naming pattern that the following cells parse.
imgs[0]['file_name']

In [None]:
# File stem of every image: directory part and final extension removed.
# basename/splitext do not depend on the path having exactly one directory level and
# only strip the last extension, so stems that contain dots are kept whole.
img_uris = [os.path.splitext(os.path.basename(t['file_name']))[0] for t in imgs]

In [None]:
with wandb.init(project=PROJECT_NAME, job_type="EDA") as run:
    # One row per image; the first column holds the rich W&B image with its overlays.
    df = pd.DataFrame({'imgs': [make_wandb_image(img) for img in imgs]})

    # Each file stem is underscore-separated; its first five fields become columns,
    # in field order.
    stem_fields = [stem.split('_') for stem in img_uris]
    for position, column in enumerate(['ids', 'n1', 'n2', 'n3', 'n4']):
        df[column] = [fields[position] for fields in stem_fields]

    df['is_mold'] = is_mold_col

    # Log the whole frame as a single W&B table.
    run.log({'table_coco': wandb.Table(dataframe=df)})