TODO: create `id_to_integer_id` -- a dict mapping each patientId (string) to a unique integer image id.

In [None]:
import numpy as np
import os
from mrcnn.utils import Dataset
import pandas as pd
from pathlib import Path

In [15]:
# Root folder of the RSNA pneumonia detection challenge data,
# given relative to the current working directory.
data_path = Path('rsna-pneumonia-detection-challenge')

## Labels data

In [16]:
# Location of the training-label CSV inside the data folder.
labels_path = data_path.joinpath('stage_2_train_labels.csv')

In [17]:
# Read the label CSV into a DataFrame; the preview further down shows the
# columns patientId, x, y, width, height and Target.
labels = pd.read_csv(labels_path)

In [18]:
# Display the (rows, columns) shape of the labels table.
labels.shape

(30227, 6)

In [19]:
# Preview the first two rows of the labels table.
labels.head(2)

Unnamed: 0,patientId,x,y,width,height,Target
0,0004cfab-14fd-4e49-80ba-63a80b6bddd6,,,,,0
1,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,,,,,0


In [20]:
# Pixel size of the mask canvas; assumes every image is 1024x1024 -- TODO confirm.
IMAGE_WIDTH = 1024
IMAGE_HEIGHT = 1024

In [1]:
class PneumoniaDataset(Dataset):
    """RSNA pneumonia detection data exposed through the Mask R-CNN ``Dataset`` API.

    Every image in the training folder is registered together with its
    bounding boxes, and each box is rendered as a filled rectangular mask
    when the masks of an image are requested.
    """

    def load_dataset(self, dataset_dir):
        """Register the 'pneumonia' class and every image in the data folder.

        Bounding boxes are taken from the module-level ``labels`` DataFrame
        (columns patientId, x, y, width, height, Target). Only rows with
        ``Target == 1`` and a complete set of coordinates are kept, so an
        image without such rows is registered with an empty box list.

        Integer image ids are assigned by position in the sorted file
        listing; the patient id is stored alongside as ``patient_id``.

        Args:
            dataset_dir: Root folder of the challenge data (``str`` or
                ``Path``); images are read from its
                ``stage_2_train_images`` subfolder.
        """
        # define one class
        self.add_class('dataset', 1, 'pneumonia')
        # define data location (Path() also accepts a plain string)
        images_dir = Path(dataset_dir) / 'stage_2_train_images'

        box_columns = ['x', 'y', 'width', 'height']
        # Rows with NaN coordinates describe no box and must not become masks.
        positives = labels[labels['Target'] == 1].dropna(subset=box_columns)
        # Group once up front instead of filtering the whole table per image.
        boxes_by_patient = {
            patient_id: rows[box_columns].to_dict('records')
            for patient_id, rows in positives.groupby('patientId')
        }

        # Sorted so the integer ids are reproducible; os.listdir order is arbitrary.
        filenames = sorted(os.listdir(str(images_dir)))
        for image_id, filename in enumerate(filenames):
            patient_id = filename.split('.')[0]
            # add to dataset
            self.add_image(
                'dataset',
                image_id=image_id,
                path=images_dir / filename,
                patient_id=patient_id,
                bboxes=boxes_by_patient.get(patient_id, []),
            )

    # load the masks for an image
    def load_mask(self, image_id):
        """Build one rectangular mask per bounding box of an image.

        This is the hook name Mask R-CNN calls on a ``Dataset``; it relies on
        ``self.class_names``, so ``prepare()`` must have been called first.

        Args:
            image_id: Index into ``self.image_info``.

        Returns:
            A tuple ``(masks, class_ids)``: ``masks`` is a uint8 array of
            shape ``[IMAGE_HEIGHT, IMAGE_WIDTH, number_of_boxes]`` with 1
            inside each box, and ``class_ids`` is an int32 array holding the
            'pneumonia' class index once per box.
        """
        # get details of image
        info = self.image_info[image_id]
        # get box-position
        bboxes = info['bboxes']
        # create one array for all masks, each on a different channel
        masks = np.zeros([IMAGE_HEIGHT, IMAGE_WIDTH, len(bboxes)], dtype='uint8')
        for i, bbox in enumerate(bboxes):
            # x/width run along columns and y/height along rows; the CSV
            # stores them as floats, so cast before slicing.
            col_s = int(bbox['x'])
            col_e = col_s + int(bbox['width'])
            row_s = int(bbox['y'])
            row_e = row_s + int(bbox['height'])
            masks[row_s:row_e, col_s:col_e, i] = 1
        pneumonia_id = self.class_names.index('pneumonia')
        class_ids = [pneumonia_id] * len(bboxes)
        return (masks, np.asarray(class_ids, dtype='int32'))

    # Backward-compatible alias for the method's previous name.
    load_masks = load_mask

    # load an image reference
    def image_reference(self, image_id):
        """Return the stored file path of the image at ``image_id``."""
        info = self.image_info[image_id]
        return info['path']
        

In [None]:
# train set
train_set = PneumoniaDataset()
# Load from the RSNA challenge folder held in data_path.
train_set.load_dataset(data_path)
train_set.prepare()
print(f'Train: {len(train_set.image_ids)}')