In [13]:
%reload_ext autoreload
%autoreload 2

import os
import json

import matplotlib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Root directory of the dataset. '~' is expanded here once so that consumers
# which do not resolve it themselves (e.g. the built-in open) also work.
data_root = os.path.expanduser('~/data/rsna')

In [4]:
# Load the per-patient class table and the bounding-box label table.
cls_csv = os.path.join(data_root, 'anno/stage_2_detailed_class_info.csv')
bbox_csv = os.path.join(data_root, 'anno/stage_2_train_labels.csv')
cls_df, bbox_df = pd.read_csv(cls_csv), pd.read_csv(bbox_csv)

# Show the first rows of each table as a sanity check.
for preview in (cls_df.head(), bbox_df.head()):
    print(preview)

                              patientId                         class
0  0004cfab-14fd-4e49-80ba-63a80b6bddd6  No Lung Opacity / Not Normal
1  00313ee0-9eaa-42f4-b0ab-c148ed3241cd  No Lung Opacity / Not Normal
2  00322d4d-1c29-4943-afc9-b6754be640eb  No Lung Opacity / Not Normal
3  003d8fa0-6bf1-40ed-b54c-ac657f8495c5                        Normal
4  00436515-870c-4b36-a041-de91049b9ab4                  Lung Opacity
                              patientId      x      y  width  height  Target
0  0004cfab-14fd-4e49-80ba-63a80b6bddd6    NaN    NaN    NaN     NaN       0
1  00313ee0-9eaa-42f4-b0ab-c148ed3241cd    NaN    NaN    NaN     NaN       0
2  00322d4d-1c29-4943-afc9-b6754be640eb    NaN    NaN    NaN     NaN       0
3  003d8fa0-6bf1-40ed-b54c-ac657f8495c5    NaN    NaN    NaN     NaN       0
4  00436515-870c-4b36-a041-de91049b9ab4  264.0  152.0  213.0   379.0       1


In [32]:
# Split train and val for samples with Target == 1.

SPLIT_SEED = 42  # fixed so the split is the same after a kernel restart

all_bbox_df = bbox_df[bbox_df['Target']==1]

# Sort the unique ids: iteration order of a set of strings can differ between
# interpreter runs, which would change the split even with a fixed seed.
all_ids = sorted(set(all_bbox_df['patientId']))

train_ids, val_ids = train_test_split(all_ids, test_size=0.2, random_state=SPLIT_SEED)

def convert_to_coco(df, ids):
    """Serialize the boxes of the given patients as a COCO-style JSON string.

    Each entry of ``ids`` yields one image record whose ``id`` is its position
    in ``ids`` and whose ``file_name`` is ``'<patientId>.jpg'``. Each row of
    ``df`` with that ``patientId`` and a complete box yields one annotation
    with ``bbox`` as ``[x, y, width, height]`` under the single category ``0``.
    Annotation ids count up from 0 in ``ids`` order, then row order.

    Args:
        df: DataFrame with columns ``patientId``, ``x``, ``y``, ``width`` and
            ``height``. Rows with a missing (NaN) coordinate are skipped.
        ids: Iterable of patient ids to export. An id with no usable rows in
            ``df`` still gets an image record, just without annotations.

    Returns:
        str: JSON text (4-space indent) with the keys ``images``,
        ``annotations`` and ``categories``. Box coordinates and areas are
        rounded to one decimal place.
    """
    # NOTE(review): categories carry only an 'id' and images carry no
    # width/height; confirm the consumer of these files does not need them.
    cat_list = [{ 'id': 0 }]

    # A NaN coordinate is not a box, and json.dumps would write it as the
    # non-standard token NaN, so incomplete rows are dropped up front.
    complete = df.dropna(subset=['x', 'y', 'width', 'height'])

    # Group the boxes by patient in a single pass instead of re-filtering the
    # whole frame once per id.
    boxes_by_patient = {}
    for patient_id, x, y, w, h in zip(complete['patientId'], complete['x'],
                                      complete['y'], complete['width'],
                                      complete['height']):
        boxes_by_patient.setdefault(patient_id, []).append(
            (float(x), float(y), float(w), float(h)))

    image_list = []
    anno_list = []

    for i, unique_id in enumerate(ids):
        image_list.append({
            'id': i,
            'file_name': '{}.jpg'.format(unique_id)
        })

        for x, y, w, h in boxes_by_patient.get(unique_id, ()):
            anno_list.append({
                'id': len(anno_list),
                'image_id': i,
                'category_id': 0,
                'bbox': [round(x, 1), round(y, 1), round(w, 1), round(h, 1)],
                # Area is computed from the unrounded sides, then rounded.
                'area': round(w * h, 1),
                'iscrowd': 0
            })

    json_dict = {
        'images': image_list,
        'annotations': anno_list,
        'categories': cat_list
    }

    return json.dumps(json_dict, indent=4, separators=(',', ': '))

# Build the COCO JSON text for each split.
train_json = convert_to_coco(all_bbox_df, train_ids)
val_json = convert_to_coco(all_bbox_df, val_ids)

# Write each split to its own file in the current working directory.
for out_path, payload in (('train.json', train_json), ('val.json', val_json)):
    with open(out_path, 'w') as out_file:
        out_file.write(payload)