## Prepare train/val data in COCO format

modified from https://www.kaggle.com/impiyush/simply-convert-data-to-coco-format/notebook

https://github.com/zylo117/Yet-Another-EfficientDet-Pytorch

- Your dataset structure should look like this:
datasets/
    -your_project_name/
        -train_set_name/
            -*.jpg
        -val_set_name/
            -*.jpg
        -annotations
            -instances_{train_set_name}.json
            -instances_{val_set_name}.json

- For example, coco2017:
datasets/
    -coco2017/
        -train2017/
            -000000000001.jpg
            -000000000002.jpg
            -000000000003.jpg
        -val2017/
            -000000000004.jpg
            -000000000005.jpg
            -000000000006.jpg
        -annotations
            -instances_train2017.json
            -instances_val2017.json

In [1]:
import copy
import json
import os
import random
from shutil import copyfile

import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
# Dataset root, the source image directory, and the train/val image folders
# (both image folders live directly under the dataset root).
project_path = './datasets/EfficientDet_pneumonia/'
img_path = '../data/size1024/stage_2_train_images'
train_path = project_path + 'train/'
val_path = project_path + 'val/'

In [3]:
# Read the label table and keep only the rows with Target == 1 (the images
# that have a bounding box), renumbering the index from 0 afterwards.
df = (
    pd.read_csv('../data/final_df.csv')
    .loc[lambda frame: frame['Target'] == 1]
    .reset_index(drop=True)
)

In [4]:
# Split on unique patient IDs rather than on rows, so every row belonging to
# one image lands in the same subset; 10% of the IDs go to validation.
train_idx, valid_idx = train_test_split(
    df['patientId'].unique(),
    test_size=0.1,
    random_state=32,
    shuffle=True
)

# Reset the index on both subsets so they are numbered consistently from 0.
df_train = df[df['patientId'].isin(train_idx)].reset_index(drop=True)
df_valid = df[df['patientId'].isin(valid_idx)].reset_index(drop=True)

# NOTE: len() counts DataFrame rows here, not unique image files, and the
# second figure is the validation subset.
print(f'train files:{len(df_train)}, test files:{len(df_valid)}')

train files:8587, test files:968


In [5]:
# Create the image folder structure required by the layout described above

def create_image_folder(df, src_path, dst_path, ext='.png'):
    """Copy every image referenced by ``df`` from ``src_path`` into ``dst_path``.

    Args:
        df: DataFrame with a ``patientId`` column; each unique value is used
            as an image file stem.
        src_path: Directory that contains the source images.
        dst_path: Destination directory; created (with parents) if missing.
        ext: File extension appended to each ID (defaults to ``'.png'``).

    Raises:
        OSError: Propagated from ``copyfile``, e.g. ``FileNotFoundError`` when
            a source image does not exist.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test
    # and makes the cell safe to re-run.
    os.makedirs(dst_path, exist_ok=True)
    for img_id in tqdm(df['patientId'].unique()):
        file_name = img_id + ext
        copyfile(os.path.join(src_path, file_name), os.path.join(dst_path, file_name))

In [6]:
# Fill the train and val image folders from the shared source directory.
create_image_folder(df=df_train, src_path=img_path, dst_path=train_path)
create_image_folder(df=df_valid, src_path=img_path, dst_path=val_path)

100%|██████████| 5410/5410 [00:03<00:00, 1484.24it/s]
100%|██████████| 602/602 [00:00<00:00, 1506.13it/s]


## Prepare labels in COCO JSON format

modified from : https://www.kaggle.com/impiyush/simply-convert-data-to-coco-format

In [7]:
# Empty COCO skeleton: one top-level entry per section of the format.
coco_base = dict(
    info={},
    licenses=[],
    images=[],
    annotations=[],
    categories=[],
)

In [8]:
# Dataset-level metadata stored under the "info" key.
coco_base["info"] = dict(
    description="RSNA Pneumonia detection, Kaggle",
    url="https://www.kaggle.com/c/rsna-pneumonia-detection-challenge/data",
    version="1.0",
    year=2018,
    contributor=" Radiological Society of North America (RSNA) ",
    date_created="2020/12/10",
)

In [9]:
# Register the single license entry (id 1).
coco_base["licenses"].append(
    dict(url="https://opensource.org/licenses/MIT", id=1, name="MIT License")
)

In [10]:
# Register the single object category (id 1).
coco_base["categories"].append(
    {
        "id": 1,
        "name": "Opacity",
        "supercategory": None,
    }
)

In [11]:
# Deep-copy the template: dict.copy() is shallow, so the train and validation
# dictionaries would otherwise share the nested "info", "licenses" and
# "categories" objects with each other and with coco_base.
coco_base_train = copy.deepcopy(coco_base)
coco_base_valid = copy.deepcopy(coco_base)

In [12]:
def set_coco_annotations(df):
    """Build the COCO ``annotations`` list, one entry per bounding-box row.

    Images are numbered by their position in the sorted unique ``patientId``
    values (the same numbering ``set_coco_images`` uses for ``id``).
    Annotation ids start at 1 and increase in output order.

    Args:
        df: DataFrame with ``patientId``, ``x``, ``y``, ``width`` and
            ``height`` columns, one row per box.

    Returns:
        list[dict]: annotation dicts with the keys ``segmentation``,
        ``iscrowd``, ``image_id``, ``category_id``, ``bbox``, ``area``, ``id``.
    """
    # id, image_id, category_id should be int (according to coco format)
    annos = []
    # Group once up front instead of re-filtering the whole frame per image.
    records_by_image = df.groupby('patientId', sort=False)

    for image_idx, img_id in enumerate(tqdm(sorted(df['patientId'].unique()))):
        records = records_by_image.get_group(img_id)
        bboxes = records[['x', 'y', 'width', 'height']].values.tolist()

        for box in bboxes:
            # A fresh dict (with its own 'segmentation' list) per box, so no
            # two annotations share mutable state.
            annos.append({
                'segmentation': [],
                'iscrowd': 0,
                'image_id': image_idx,
                'category_id': 1,
                'bbox': box,  # x, y, w, h
                'area': box[2] * box[3],  # w * h
                'id': len(annos) + 1,
            })

    return annos

In [13]:
def set_coco_images(df, size=1024):
    """Build the COCO ``images`` list, one entry per unique ``patientId``.

    Each image's ``id`` is its position in the sorted unique ``patientId``
    values, its ``file_name`` is ``<patientId>.png``, and both ``width`` and
    ``height`` are set to ``size``.
    """
    # id, image_id, category_id should be int (according to coco format)
    patient_ids = sorted(df['patientId'].unique())
    return [
        {
            "id": position,
            "file_name": f"{patient_id}.png",
            "width": size,
            "height": size,
            "license": 1,
        }
        for position, patient_id in enumerate(tqdm(patient_ids))
    ]

In [14]:
# Fill the "images" section of each split from its own DataFrame.
coco_base_train["images"] = set_coco_images(df=df_train)
coco_base_valid["images"] = set_coco_images(df=df_valid)

100%|██████████| 5410/5410 [00:00<00:00, 1804467.96it/s]
100%|██████████| 602/602 [00:00<00:00, 1634285.44it/s]


In [15]:
# Fill the "annotations" section of each split from its own DataFrame.
coco_base_train['annotations'] = set_coco_annotations(df=df_train)
coco_base_valid['annotations'] = set_coco_annotations(df=df_valid)

100%|██████████| 5410/5410 [00:07<00:00, 738.31it/s]
100%|██████████| 602/602 [00:00<00:00, 1012.48it/s]


In [16]:
#coco_base_train["images"] 

[{'id': 0,
  'file_name': '000fe35a-2649-43d4-b027-e67796d412e0.png',
  'width': 1024,
  'height': 1024,
  'license': 1},
 {'id': 1,
  'file_name': '001031d9-f904-4a23-b3e5-2c088acd19c6.png',
  'width': 1024,
  'height': 1024,
  'license': 1},
 {'id': 2,
  'file_name': '001916b8-3d30-4935-a5d1-8eaddb1646cd.png',
  'width': 1024,
  'height': 1024,
  'license': 1},
 {'id': 3,
  'file_name': '0022073f-cec8-42ec-ab5f-bc2314649235.png',
  'width': 1024,
  'height': 1024,
  'license': 1},
 {'id': 4,
  'file_name': '002cb550-2e31-42f1-a29d-fbc279977e71.png',
  'width': 1024,
  'height': 1024,
  'license': 1},
 {'id': 5,
  'file_name': '00436515-870c-4b36-a041-de91049b9ab4.png',
  'width': 1024,
  'height': 1024,
  'license': 1},
 {'id': 6,
  'file_name': '00704310-78a8-4b38-8475-49f4573b2dbb.png',
  'width': 1024,
  'height': 1024,
  'license': 1},
 {'id': 7,
  'file_name': '0087bd3a-55a7-4045-b111-b018fa52d361.png',
  'width': 1024,
  'height': 1024,
  'license': 1},
 {'id': 8,
  'file_name'

In [17]:
#coco_base_train['annotations']

[{'segmentation': [],
  'iscrowd': 0,
  'image_id': 0,
  'category_id': 1,
  'bbox': [570.0, 282.0, 269.0, 409.0],
  'area': 110021.0,
  'id': 1},
 {'segmentation': [],
  'iscrowd': 0,
  'image_id': 0,
  'category_id': 1,
  'bbox': [83.0, 227.0, 296.0, 438.0],
  'area': 129648.0,
  'id': 2},
 {'segmentation': [],
  'iscrowd': 0,
  'image_id': 1,
  'category_id': 1,
  'bbox': [66.0, 160.0, 373.0, 608.0],
  'area': 226784.0,
  'id': 3},
 {'segmentation': [],
  'iscrowd': 0,
  'image_id': 1,
  'category_id': 1,
  'bbox': [552.0, 164.0, 376.0, 676.0],
  'area': 254176.0,
  'id': 4},
 {'segmentation': [],
  'iscrowd': 0,
  'image_id': 2,
  'category_id': 1,
  'bbox': [198.0, 375.0, 114.0, 206.0],
  'area': 23484.0,
  'id': 5},
 {'segmentation': [],
  'iscrowd': 0,
  'image_id': 3,
  'category_id': 1,
  'bbox': [575.0, 232.0, 246.0, 528.0],
  'area': 129888.0,
  'id': 6},
 {'segmentation': [],
  'iscrowd': 0,
  'image_id': 3,
  'category_id': 1,
  'bbox': [161.0, 230.0, 223.0, 486.0],
  'are

In [18]:
# Create the annotations folder (no error if it already exists) and write the
# training annotations into it as instances_train.json.
dst_path = './datasets/EfficientDet_pneumonia/annotations/'
os.makedirs(dst_path, exist_ok=True)
with open(os.path.join(dst_path, 'instances_train.json'), 'w') as train_coco:
    json.dump(coco_base_train, train_coco)

In [19]:
# Serialize the validation annotations to instances_val.json.
with open(
    './datasets/EfficientDet_pneumonia/annotations/instances_val.json',
    mode='w',
) as valid_coco:
    json.dump(coco_base_valid, fp=valid_coco)

## Test format

In [None]:
# Re-declares the train image folder with the same value used in the
# configuration cell near the top of the notebook.
train_path = './datasets/EfficientDet_pneumonia/train/'

In [28]:
# Count the entries in the train image folder.
len(os.listdir(train_path))

5705

In [29]:
# Count the entries in the validation image folder.
len(os.listdir(val_path))

915

In [83]:
# Read the training annotation file back in to inspect its structure.
with open('./datasets/EfficientDet_pneumonia/annotations/instances_train.json') as f:
    my_data = json.loads(f.read())

In [144]:
# Print the top-level keys of the loaded annotation dictionary.
for section_name in my_data:
    print(section_name)

info
licenses
images
annotations
categories


In [84]:
# Display the "images" section of the reloaded training annotations.
# NOTE(review): the recorded output shows string ids and 512px sizes, which
# does not match what set_coco_images produces (int ids, size 1024) — it looks
# like output from an earlier run; re-run to refresh.
my_data['images']

[{'license': 1,
  'height': 512,
  'width': 512,
  'id': 'b165b941-4ec6-4f1b-9f03-2e908de24266',
  'file_name': 'b165b941-4ec6-4f1b-9f03-2e908de24266.png'},
 {'license': 1,
  'height': 512,
  'width': 512,
  'id': '40346c30-cf4a-4fe3-a6a1-935ac182a551',
  'file_name': '40346c30-cf4a-4fe3-a6a1-935ac182a551.png'},
 {'license': 1,
  'height': 512,
  'width': 512,
  'id': '8cb6fe1e-ed23-44af-9a1d-85fe2bdb106e',
  'file_name': '8cb6fe1e-ed23-44af-9a1d-85fe2bdb106e.png'},
 {'license': 1,
  'height': 512,
  'width': 512,
  'id': 'cfcfaa4a-9c58-4eec-bb63-708e2b25df16',
  'file_name': 'cfcfaa4a-9c58-4eec-bb63-708e2b25df16.png'},
 {'license': 1,
  'height': 512,
  'width': 512,
  'id': '18ee3d5a-1d35-4f91-bef4-da13e40c4935',
  'file_name': '18ee3d5a-1d35-4f91-bef4-da13e40c4935.png'},
 {'license': 1,
  'height': 512,
  'width': 512,
  'id': '306ff5d4-1ea2-4ce1-9fc7-25a98fac5148',
  'file_name': '306ff5d4-1ea2-4ce1-9fc7-25a98fac5148.png'},
 {'license': 1,
  'height': 512,
  'width': 512,
  'id': '

In [85]:
# Display the "annotations" section of the reloaded training annotations.
# NOTE(review): the recorded output shows string image_id values, whereas
# set_coco_annotations assigns integer indices — likely stale output from an
# earlier run; re-run to refresh.
my_data['annotations']

[{'segmentation': [],
  'iscrowd': 0,
  'image_id': 'b165b941-4ec6-4f1b-9f03-2e908de24266',
  'category_id': 1,
  'bbox': [354.0, 375.0, 164.0, 293.0],
  'area': 48052.0,
  'id': 1},
 {'segmentation': [],
  'iscrowd': 0,
  'image_id': 'b165b941-4ec6-4f1b-9f03-2e908de24266',
  'category_id': 1,
  'bbox': [698.0, 454.0, 281.0, 418.0],
  'area': 117458.0,
  'id': 2},
 {'segmentation': [],
  'iscrowd': 0,
  'image_id': '40346c30-cf4a-4fe3-a6a1-935ac182a551',
  'category_id': 1,
  'bbox': [269.0, 301.0, 130.0, 193.0],
  'area': 25090.0,
  'id': 3},
 {'segmentation': [],
  'iscrowd': 0,
  'image_id': '8cb6fe1e-ed23-44af-9a1d-85fe2bdb106e',
  'category_id': 1,
  'bbox': [166.0, 91.0, 284.0, 659.0],
  'area': 187156.0,
  'id': 4},
 {'segmentation': [],
  'iscrowd': 0,
  'image_id': '8cb6fe1e-ed23-44af-9a1d-85fe2bdb106e',
  'category_id': 1,
  'bbox': [597.0, 331.0, 242.0, 561.0],
  'area': 135762.0,
  'id': 5},
 {'segmentation': [],
  'iscrowd': 0,
  'image_id': 'cfcfaa4a-9c58-4eec-bb63-708e2b

In [133]:
# Load the annotation file of the 'shape' dataset for comparison.
with open('datasets/shape/annotations/instances_train.json') as f:
    data = json.loads(f.read())

In [137]:
# Print the top-level keys of the 'shape' annotation dictionary.
for section_name in data:
    print(section_name)

info
licenses
categories
images
annotations


In [142]:
# Display the "images" section of the 'shape' annotation file.
data['images']

[{'id': 0,
  'file_name': '0.jpg',
  'width': 512,
  'height': 512,
  'date_captured': '2020-04-14 01:45:07.508146',
  'license': 1,
  'coco_url': '',
  'flickr_url': ''},
 {'id': 1,
  'file_name': '1.jpg',
  'width': 512,
  'height': 512,
  'date_captured': '2020-04-14 01:45:07.508146',
  'license': 1,
  'coco_url': '',
  'flickr_url': ''},
 {'id': 2,
  'file_name': '2.jpg',
  'width': 512,
  'height': 512,
  'date_captured': '2020-04-14 01:45:07.508146',
  'license': 1,
  'coco_url': '',
  'flickr_url': ''},
 {'id': 3,
  'file_name': '3.jpg',
  'width': 512,
  'height': 512,
  'date_captured': '2020-04-14 01:45:07.508146',
  'license': 1,
  'coco_url': '',
  'flickr_url': ''},
 {'id': 4,
  'file_name': '4.jpg',
  'width': 512,
  'height': 512,
  'date_captured': '2020-04-14 01:45:07.508146',
  'license': 1,
  'coco_url': '',
  'flickr_url': ''},
 {'id': 5,
  'file_name': '5.jpg',
  'width': 512,
  'height': 512,
  'date_captured': '2020-04-14 01:45:07.508146',
  'license': 1,
  'coco

In [147]:
# Display the "annotations" section of the 'shape' annotation file.
data['annotations']

[{'id': 0,
  'image_id': 0,
  'category_id': 2,
  'iscrowd': 0,
  'area': 4095.9999999999986,
  'bbox': [200.0, 416.0, 64.0, 64.0],
  'segmentation': [[200.0, 416.0, 264.0, 416.0, 264.0, 480.0, 200.0, 480.0]]},
 {'id': 1,
  'image_id': 0,
  'category_id': 2,
  'iscrowd': 0,
  'area': 4356.000000000001,
  'bbox': [424.0, 208.0, 66.0, 66.0],
  'segmentation': [[424.0, 208.0, 490.0, 208.0, 490.0, 274.0, 424.0, 274.0]]},
 {'id': 2,
  'image_id': 1,
  'category_id': 2,
  'iscrowd': 0,
  'area': 7396.000000000002,
  'bbox': [318.0, 376.0, 86.0, 86.0],
  'segmentation': [[318.0, 376.0, 404.0, 376.0, 404.0, 462.0, 318.0, 462.0]]},
 {'id': 3,
  'image_id': 2,
  'category_id': 1,
  'iscrowd': 0,
  'area': 19557.0,
  'bbox': [264.0, 128.0, 123.0, 159.0],
  'segmentation': [[264.0, 128.0, 387.0, 128.0, 387.0, 287.0, 264.0, 287.0]]},
 {'id': 4,
  'image_id': 2,
  'category_id': 1,
  'iscrowd': 0,
  'area': 2575.9999999999986,
  'bbox': [267.0, 43.0, 92.0, 28.0],
  'segmentation': [[267.0, 43.0, 359