### This notebook explores the structure of COCO data set and how to create labels for your custom dataset with similar structure

In [1]:
import numpy
import multiprocessing
import time
import os
import json

import openslide
from SlideRunner.dataAccess.database import Database
from SlideRunner.dataAccess.annotations import *

from pathlib import Path
path = Path('./')

import sys
sys.path.append('../')

# you need to create a "utils" folder with the following two files
from utils.image_utils import *
from utils.coco_format_labels_creation import *

%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
root_dir = os.getcwd()
database_dir = os.path.join(root_dir, 'databases')

#Specify the database SQL file
db_file = 'MITOS_WSI_CMC_COADEL_TR.sqlite'
db_path = os.path.join(database_dir, db_file)

#Specify directory of whole slide images
slide_dir = os.path.join(root_dir, 'WSI')

### Read in a new slide and load the file into memory

In [3]:
#Specify a slide number
slide_num = 15 # must be in this list [1,2, ..., 21]
filename = os.listdir(slide_dir)[slide_num]

#Get all the details about a paricular slide
slide_path = os.path.join(slide_dir, filename)  #basepath + os.sep + filename
slide = openslide.open_slide(str(slide_path))

In [4]:
#database.open(os.path.join(database_dir, database))
database = Database()
DB = database.open(db_path)

getslides = """SELECT uid, filename FROM Slides"""
currslide, filename = DB.execute(getslides).fetchall()[slide_num]

In [5]:
# Print the label for each annotation
for k in database.annotations.keys():
    print(k, database.annotations[k].agreedLabel())

In [6]:
#load the WSI into memory
database.loadIntoMemory(currslide)

In [7]:
# Print the label for each annotation
len(database.annotations.keys())
#for k in database.annotations.keys():
#    print(k, database.annotations[k].agreedLabel())

722

In [8]:
getslides = """SELECT uid, filename FROM Slides"""
#currslide, filename = DB.execute(getslides).fetchall()[slide_num]

In [9]:
DB.execute(getslides).fetchall()

[(1, 'a8773be388e12df89edd.svs'),
 (2, '460906c0b1fe17ea5354.svs'),
 (3, '2d56d1902ca533a5b509.svs'),
 (5, 'd0423ef9a648bb66a763.svs'),
 (6, '50cf88e9a33df0c0c8f9.svs'),
 (7, 'da18e7b9846e9d38034c.svs'),
 (8, 'd7a8af121d7d4f3fbf01.svs'),
 (9, '2191a7aa287ce1d5dbc0.svs'),
 (10, '69a02453620ade0edefd.svs'),
 (11, 'c4b95da36e32993289cb.svs'),
 (12, 'fa4959e484beec77543b.svs'),
 (13, '72c93e042d0171a61012.svs'),
 (14, '4eee7b944ad5e46c60ce.svs'),
 (15, 'b1bdee8e5e3372174619.svs'),
 (16, '3d3d04eca056556b0b26.svs'),
 (17, '084383c18b9060880e82.svs'),
 (18, 'e09512d530d933e436d5.svs'),
 (19, 'd37ab62158945f22deed.svs'),
 (20, 'deb768e5efb9d1dcbc13.svs'),
 (21, '022857018aa597374b6c.svs'),
 (22, '13528f1921d4f1f15511.svs')]

In [10]:
slide_path = os.path.join(slide_dir, filename)  #basepath + os.sep + filename
slide = openslide.open_slide(str(slide_path))

#load the WSI into memory
database.loadIntoMemory(currslide)

slide.associated_images['macro'], slide.properties
for k in slide.properties.items():
    print(k)

('aperio.AppMag', '40')
('aperio.DSR ID', '130.133.91.37')
('aperio.Date', '09/24/18')
('aperio.DisplayColor', '0')
('aperio.Exposure Scale', '0.000001')
('aperio.Exposure Time', '109')
('aperio.Filename', '084383c18b9060880e82')
('aperio.Focus Offset', '0.000000')
('aperio.ICC Profile', 'ScanScope v1')
('aperio.ImageID', '49817')
('aperio.Left', '21.099384')
('aperio.LineAreaXOffset', '0.003040')
('aperio.LineAreaYOffset', '-0.011652')
('aperio.LineCameraSkew', '-0.000450')
('aperio.MPP', '0.2533')
('aperio.OriginalHeight', '82253')
('aperio.OriginalWidth', '62560')
('aperio.ScanScope ID', 'VETWE12MIC')
('aperio.SessonMode', 'NR')
('aperio.StripeWidth', '1840')
('aperio.Time', '10:46:00')
('aperio.Time Zone', 'GMT+02:00')
('aperio.Title', '084383c18b9060880e82')
('aperio.Top', '22.655859')
('aperio.User', 'e075590f-9409-461d-b086-af2ffaf0ad27')
('openslide.comment', 'Aperio Image Library v12.0.15 ..62560x82253 [0,100 61199x82153] (240x240) J2K/KDU Q=70|AppMag = 40|StripeWidth = 1840|S

### Exploring COCO data structure

In [11]:
import json

In [12]:
coco_val_2017json = "/home/bony/Downloads/coco/annotations/instances_val2017.json"

In [13]:
with open(coco_val_2017json, "r") as coco_json:
    data = coco_json.read()
coco_json_dict = json.loads(data)

In [14]:
# COCO data is a dict and has 5 keys
for k in coco_json_dict:
    print(k, len(coco_json_dict[k]))

info 6
licenses 8
images 5000
annotations 36781
categories 80


In [15]:
# Explore "info" key
coco_json_dict['info']

{'description': 'COCO 2017 Dataset',
 'url': 'http://cocodataset.org',
 'version': '1.0',
 'year': 2017,
 'contributor': 'COCO Consortium',
 'date_created': '2017/09/01'}

In [16]:
# Explore "license" key
coco_json_dict['licenses']

[{'url': 'http://creativecommons.org/licenses/by-nc-sa/2.0/',
  'id': 1,
  'name': 'Attribution-NonCommercial-ShareAlike License'},
 {'url': 'http://creativecommons.org/licenses/by-nc/2.0/',
  'id': 2,
  'name': 'Attribution-NonCommercial License'},
 {'url': 'http://creativecommons.org/licenses/by-nc-nd/2.0/',
  'id': 3,
  'name': 'Attribution-NonCommercial-NoDerivs License'},
 {'url': 'http://creativecommons.org/licenses/by/2.0/',
  'id': 4,
  'name': 'Attribution License'},
 {'url': 'http://creativecommons.org/licenses/by-sa/2.0/',
  'id': 5,
  'name': 'Attribution-ShareAlike License'},
 {'url': 'http://creativecommons.org/licenses/by-nd/2.0/',
  'id': 6,
  'name': 'Attribution-NoDerivs License'},
 {'url': 'http://flickr.com/commons/usage/',
  'id': 7,
  'name': 'No known copyright restrictions'},
 {'url': 'http://www.usa.gov/copyright.shtml',
  'id': 8,
  'name': 'United States Government Work'}]

In [17]:
# Explore "categories" key
coco_json_dict['categories']

[{'supercategory': 'person', 'id': 1, 'name': 'person'},
 {'supercategory': 'vehicle', 'id': 2, 'name': 'bicycle'},
 {'supercategory': 'vehicle', 'id': 3, 'name': 'car'},
 {'supercategory': 'vehicle', 'id': 4, 'name': 'motorcycle'},
 {'supercategory': 'vehicle', 'id': 5, 'name': 'airplane'},
 {'supercategory': 'vehicle', 'id': 6, 'name': 'bus'},
 {'supercategory': 'vehicle', 'id': 7, 'name': 'train'},
 {'supercategory': 'vehicle', 'id': 8, 'name': 'truck'},
 {'supercategory': 'vehicle', 'id': 9, 'name': 'boat'},
 {'supercategory': 'outdoor', 'id': 10, 'name': 'traffic light'},
 {'supercategory': 'outdoor', 'id': 11, 'name': 'fire hydrant'},
 {'supercategory': 'outdoor', 'id': 13, 'name': 'stop sign'},
 {'supercategory': 'outdoor', 'id': 14, 'name': 'parking meter'},
 {'supercategory': 'outdoor', 'id': 15, 'name': 'bench'},
 {'supercategory': 'animal', 'id': 16, 'name': 'bird'},
 {'supercategory': 'animal', 'id': 17, 'name': 'cat'},
 {'supercategory': 'animal', 'id': 18, 'name': 'dog'},

In [18]:
# Explore "images" key
coco_json_dict['images'][0]

#type(coco_json_dict['images'])

{'license': 4,
 'file_name': '000000397133.jpg',
 'coco_url': 'http://images.cocodataset.org/val2017/000000397133.jpg',
 'height': 427,
 'width': 640,
 'date_captured': '2013-11-14 17:02:52',
 'flickr_url': 'http://farm7.staticflickr.com/6116/6255196340_da26cf2c9e_z.jpg',
 'id': 397133}

In [19]:
# Explore "annotations" key
# the value is a list of all the annotations

# Each annotation is a dict
coco_json_dict['annotations'][0]
print(f"keys in annotation number 0:  \n{coco_json_dict['annotations'][0].keys()}")


for item in coco_json_dict['annotations'][500].items():
    print("\n", item[0])
    print(item[1])

keys in annotation number 0:  
dict_keys(['segmentation', 'area', 'iscrowd', 'image_id', 'bbox', 'category_id', 'id'])

 segmentation
[[140.88, 150.48, 176.1, 150.48, 217.72, 152.89, 245.74, 140.08, 242.54, 116.87, 262.55, 110.46, 294.57, 151.28, 308.17, 144.88, 328.18, 152.09, 344.19, 160.09, 336.19, 165.69, 304.17, 158.49, 294.57, 162.49, 312.98, 187.3, 342.59, 185.7, 362.6, 215.32, 352.2, 244.14, 319.38, 255.34, 297.77, 237.73, 292.16, 208.12, 289.76, 200.91, 297.77, 187.3, 286.56, 164.89, 276.96, 190.51, 273.75, 200.11, 280.96, 214.52, 261.75, 208.92, 227.33, 222.52, 225.73, 219.32, 202.51, 224.93, 185.7, 260.95, 153.69, 264.15, 136.08, 236.13, 137.68, 212.92, 172.1, 195.31, 192.91, 208.12, 196.91, 190.51, 175.3, 184.1, 154.49, 172.1, 155.29, 160.89, 144.08, 154.49]]

 area
16329.365099999994

 iscrowd
0

 image_id
102331

 bbox
[136.08, 110.46, 226.52, 153.69]

 category_id
4

 id
151869


### Now we will transform our data to COCO format

In [20]:
# do not run this
#create_image_patches(slide_dir, DB, 624)

#### Convert data to COCO object detection format

In [21]:
data_to_coco_format = {}

license = licenses()
data_to_coco_format['license'] = license

info_dict = info()
data_to_coco_format['info'] = info()

data_to_coco_format['categories'] = categories()

In [22]:
data_to_coco_format

{'license': [{'url': 'http://creativecommons.org/licenses/by-nc-sa/2.0/',
   'id': 1,
   'name': 'Attribution-NonCommercial-ShareAlike License'}],
 'info': {'info': {'description': 'Canine Breast Cancer',
   'url': 'https://doi.org/10.1038/s41597-020-00756-z',
   'version': '1.0',
   'year': 2020,
   'contributor': 'R Klopfleisch',
   'date_created': '2020/09/01'}},
 'categories': [{'supercategory': 'cell',
   'id': 1,
   'name': 'mitotic_look_alike'},
  {'supercategory': 'cell', 'id': 2, 'name': 'mitotic'},
  {'supercategory': 'cell', 'id': 3, 'name': 'background'}]}

#### Create dict for key = "annotations"

In [23]:
# get a particular slide
slide_list = database.listOfSlides()
annot_count = 0

patch_size = 624
bbox_length = 50

coco_type_annotations = []

for item in slide_list:
    #print(item)
    slide_num = item[0]
    
    currslide, filename = item
    print("\n",currslide, filename)

    #load the WSI into memory
    database.loadIntoMemory(currslide)
    annot_count += len(database.annotations.keys())
    #print(annot_count, database.())
    
    slide_path = os.path.join(slide_dir, filename)  #basepath + os.sep + filename
    slide = openslide.open_slide(str(slide_path))
    
    (wsi_x, wsi_y) = slide.dimensions
    patch_count_in_wsi = calculate_patch_count_in_WSI(wsi_x, wsi_y, patch_size)
    print(f"Slide dimension: {wsi_x, wsi_y}, number of patches : {patch_count_in_wsi} of size: {patch_size}")
    
    single_img_annotations = database.annotations
    
    print(len(single_img_annotations.items()))
    for uid, annotation in single_img_annotations.items():
        annot_details = {}
        
        annotation = single_img_annotations[uid]
        if annotation.deleted or annotation.annotationType != AnnotationType.SPOT:
            continue
        else:
            #get X and Y coordinates of the annotation
            x = annotation.x1 
            y = annotation.y1
            agreed_classs = annotation.agreedClass
            annot_uid = annotation.uid #unique ID of the annotation
            #print(annotation.annotationType, annotation.agreedClass, x, y)
            bbox = calculate_bbox(x,y)
            
            (nx,ny) = find_patch_index_within_WSI(x, y, patch_size)
            img_id = create_patch_center_identifier(filename, nx, ny)
            
            'segmentation', 'area', 'iscrowd', 'image_id', 'bbox', 'category_id', 'id'
            annot_details['id'] = uid
            annot_details['category_id'] = agreed_classs
            annot_details['image_id'] = img_id
            annot_details['iscrowd'] = 0
            annot_details['area'] = bbox_length**2
            annot_details['segmentation'] = []
            annot_details['bbox'] = bbox
            
            coco_type_annotations.append(annot_details)
            

data_to_coco_format['annotations'] = coco_type_annotations          


 1 a8773be388e12df89edd.svs
Slide dimension: (61199, 57462), number of patches : 9016 of size: 624
667

 2 460906c0b1fe17ea5354.svs
Slide dimension: (136800, 63831), number of patches : 22338 of size: 624
3223

 3 2d56d1902ca533a5b509.svs
Slide dimension: (70199, 65453), number of patches : 11648 of size: 624
522

 5 d0423ef9a648bb66a763.svs
Slide dimension: (116999, 85208), number of patches : 25432 of size: 624
2009

 6 50cf88e9a33df0c0c8f9.svs
Slide dimension: (125999, 87273), number of patches : 27939 of size: 624
3890

 7 da18e7b9846e9d38034c.svs
Slide dimension: (95399, 84305), number of patches : 20520 of size: 624
1567

 8 d7a8af121d7d4f3fbf01.svs
Slide dimension: (111599, 88592), number of patches : 25098 of size: 624
3122

 9 2191a7aa287ce1d5dbc0.svs
Slide dimension: (98999, 82534), number of patches : 20856 of size: 624
4097

 10 69a02453620ade0edefd.svs
Slide dimension: (97199, 90076), number of patches : 22320 of size: 624
2151

 11 c4b95da36e32993289cb.svs
Slide dimensio

In [24]:
len(data_to_coco_format['annotations'])
#calculate_bbox(100, 100), patch_size*patch_size

50282

In [25]:
data_to_coco_format['annotations'][100]

{'id': 105,
 'category_id': 2,
 'image_id': 'a8773be388e12df89edd.svs_22_55.jpg',
 'iscrowd': 0,
 'area': 2500,
 'segmentation': [],
 'bbox': [14268, 34654, 14318, 34704]}

#### Create dict for key = "images"

In [26]:
data_to_coco_format['images'] = create_image_patches(slide_dir, DB, patch_size)



Slide number: 1,  Name: a8773be388e12df89edd.svs, Slide dimension: (61199, 57462)

1000   ***   {'license': 1, 'file_name': ('a8773be388e12df89edd.svs_10_79.jpg',), 'coco_url': ('http://images.cocodataset.org/val2017/000000397133.jpg',), 'height': 624, 'width': 624, 'date_captured': ('2013-11-14 17:02:52',), 'flickr_url': 'http://farm7.staticflickr.com/6116/6255196340_da26cf2c9e_z.jpg', 'id': 1000}
2000   ***   {'license': 1, 'file_name': ('a8773be388e12df89edd.svs_21_67.jpg',), 'coco_url': ('http://images.cocodataset.org/val2017/000000397133.jpg',), 'height': 624, 'width': 624, 'date_captured': ('2013-11-14 17:02:52',), 'flickr_url': 'http://farm7.staticflickr.com/6116/6255196340_da26cf2c9e_z.jpg', 'id': 2000}
3000   ***   {'license': 1, 'file_name': ('a8773be388e12df89edd.svs_32_55.jpg',), 'coco_url': ('http://images.cocodataset.org/val2017/000000397133.jpg',), 'height': 624, 'width': 624, 'date_captured': ('2013-11-14 17:02:52',), 'flickr_url': 'http://farm7.staticflickr.com/6116/

62000   ***   {'license': 1, 'file_name': ('d0423ef9a648bb66a763.svs_139_93.jpg',), 'coco_url': ('http://images.cocodataset.org/val2017/000000397133.jpg',), 'height': 624, 'width': 624, 'date_captured': ('2013-11-14 17:02:52',), 'flickr_url': 'http://farm7.staticflickr.com/6116/6255196340_da26cf2c9e_z.jpg', 'id': 62000}
63000   ***   {'license': 1, 'file_name': ('d0423ef9a648bb66a763.svs_147_5.jpg',), 'coco_url': ('http://images.cocodataset.org/val2017/000000397133.jpg',), 'height': 624, 'width': 624, 'date_captured': ('2013-11-14 17:02:52',), 'flickr_url': 'http://farm7.staticflickr.com/6116/6255196340_da26cf2c9e_z.jpg', 'id': 63000}
64000   ***   {'license': 1, 'file_name': ('d0423ef9a648bb66a763.svs_154_53.jpg',), 'coco_url': ('http://images.cocodataset.org/val2017/000000397133.jpg',), 'height': 624, 'width': 624, 'date_captured': ('2013-11-14 17:02:52',), 'flickr_url': 'http://farm7.staticflickr.com/6116/6255196340_da26cf2c9e_z.jpg', 'id': 64000}
65000   ***   {'license': 1, 'file_

174000   ***   {'license': 1, 'file_name': ('69a02453620ade0edefd.svs_77_64.jpg',), 'coco_url': ('http://images.cocodataset.org/val2017/000000397133.jpg',), 'height': 624, 'width': 624, 'date_captured': ('2013-11-14 17:02:52',), 'flickr_url': 'http://farm7.staticflickr.com/6116/6255196340_da26cf2c9e_z.jpg', 'id': 174000}
175000   ***   {'license': 1, 'file_name': ('69a02453620ade0edefd.svs_84_56.jpg',), 'coco_url': ('http://images.cocodataset.org/val2017/000000397133.jpg',), 'height': 624, 'width': 624, 'date_captured': ('2013-11-14 17:02:52',), 'flickr_url': 'http://farm7.staticflickr.com/6116/6255196340_da26cf2c9e_z.jpg', 'id': 175000}
176000   ***   {'license': 1, 'file_name': ('69a02453620ade0edefd.svs_91_48.jpg',), 'coco_url': ('http://images.cocodataset.org/val2017/000000397133.jpg',), 'height': 624, 'width': 624, 'date_captured': ('2013-11-14 17:02:52',), 'flickr_url': 'http://farm7.staticflickr.com/6116/6255196340_da26cf2c9e_z.jpg', 'id': 176000}
177000   ***   {'license': 1, '

289000   ***   {'license': 1, 'file_name': ('b1bdee8e5e3372174619.svs_159_39.jpg',), 'coco_url': ('http://images.cocodataset.org/val2017/000000397133.jpg',), 'height': 624, 'width': 624, 'date_captured': ('2013-11-14 17:02:52',), 'flickr_url': 'http://farm7.staticflickr.com/6116/6255196340_da26cf2c9e_z.jpg', 'id': 289000}
290000   ***   {'license': 1, 'file_name': ('b1bdee8e5e3372174619.svs_166_101.jpg',), 'coco_url': ('http://images.cocodataset.org/val2017/000000397133.jpg',), 'height': 624, 'width': 624, 'date_captured': ('2013-11-14 17:02:52',), 'flickr_url': 'http://farm7.staticflickr.com/6116/6255196340_da26cf2c9e_z.jpg', 'id': 290000}
291000   ***   {'license': 1, 'file_name': ('b1bdee8e5e3372174619.svs_174_29.jpg',), 'coco_url': ('http://images.cocodataset.org/val2017/000000397133.jpg',), 'height': 624, 'width': 624, 'date_captured': ('2013-11-14 17:02:52',), 'flickr_url': 'http://farm7.staticflickr.com/6116/6255196340_da26cf2c9e_z.jpg', 'id': 291000}
292000   ***   {'license': 

340000   ***   {'license': 1, 'file_name': ('e09512d530d933e436d5.svs_63_58.jpg',), 'coco_url': ('http://images.cocodataset.org/val2017/000000397133.jpg',), 'height': 624, 'width': 624, 'date_captured': ('2013-11-14 17:02:52',), 'flickr_url': 'http://farm7.staticflickr.com/6116/6255196340_da26cf2c9e_z.jpg', 'id': 340000}
341000   ***   {'license': 1, 'file_name': ('e09512d530d933e436d5.svs_71_18.jpg',), 'coco_url': ('http://images.cocodataset.org/val2017/000000397133.jpg',), 'height': 624, 'width': 624, 'date_captured': ('2013-11-14 17:02:52',), 'flickr_url': 'http://farm7.staticflickr.com/6116/6255196340_da26cf2c9e_z.jpg', 'id': 341000}
342000   ***   {'license': 1, 'file_name': ('e09512d530d933e436d5.svs_78_108.jpg',), 'coco_url': ('http://images.cocodataset.org/val2017/000000397133.jpg',), 'height': 624, 'width': 624, 'date_captured': ('2013-11-14 17:02:52',), 'flickr_url': 'http://farm7.staticflickr.com/6116/6255196340_da26cf2c9e_z.jpg', 'id': 342000}
343000   ***   {'license': 1, 

407000   ***   {'license': 1, 'file_name': ('022857018aa597374b6c.svs_61_41.jpg',), 'coco_url': ('http://images.cocodataset.org/val2017/000000397133.jpg',), 'height': 624, 'width': 624, 'date_captured': ('2013-11-14 17:02:52',), 'flickr_url': 'http://farm7.staticflickr.com/6116/6255196340_da26cf2c9e_z.jpg', 'id': 407000}
408000   ***   {'license': 1, 'file_name': ('022857018aa597374b6c.svs_68_96.jpg',), 'coco_url': ('http://images.cocodataset.org/val2017/000000397133.jpg',), 'height': 624, 'width': 624, 'date_captured': ('2013-11-14 17:02:52',), 'flickr_url': 'http://farm7.staticflickr.com/6116/6255196340_da26cf2c9e_z.jpg', 'id': 408000}
409000   ***   {'license': 1, 'file_name': ('022857018aa597374b6c.svs_76_16.jpg',), 'coco_url': ('http://images.cocodataset.org/val2017/000000397133.jpg',), 'height': 624, 'width': 624, 'date_captured': ('2013-11-14 17:02:52',), 'flickr_url': 'http://farm7.staticflickr.com/6116/6255196340_da26cf2c9e_z.jpg', 'id': 409000}
410000   ***   {'license': 1, '

In [27]:
data_to_coco_format['images'][100]

{'license': 1,
 'file_name': ('a8773be388e12df89edd.svs_1_8.jpg',),
 'coco_url': ('http://images.cocodataset.org/val2017/000000397133.jpg',),
 'height': 624,
 'width': 624,
 'date_captured': ('2013-11-14 17:02:52',),
 'flickr_url': 'http://farm7.staticflickr.com/6116/6255196340_da26cf2c9e_z.jpg',
 'id': 101}