In [1]:
import os
import random
import xml.etree.ElementTree as ET
from collections import namedtuple, defaultdict
from multiprocessing import Process, Queue
from multiprocessing.dummy import Pool
from threading import local, get_ident
from time import time

import numpy as np
import torch
from shapely.geometry import MultiPolygon, Point, Polygon
from tqdm import tqdm


In [2]:
# Make the ASAP Python bindings importable before importing them.
# NOTE(review): this is a hard-coded absolute Windows path; the notebook will
# not import `multiresolutionimageinterface` on a machine without this exact
# install location unless the module is already on sys.path.
import sys
if 'D:\\ACDC_LUNG_HISTOPATHOLOGY\\ASAP 1.9\\bin' not in sys.path:
    sys.path.append('D:\\ACDC_LUNG_HISTOPATHOLOGY\\ASAP 1.9\\bin')
import multiresolutionimageinterface as mir
# NOTE(review): the functions visible in this notebook create their own reader
# (get_data) or use `_READER` (TrainDataset); this module-level `reader` looks
# unused — confirm before removing.
reader = mir.MultiResolutionImageReader()

In [3]:
# Record types shared by the sampling pipeline. Each typename matches the
# variable it is bound to, so reprs are unambiguous and pickle can resolve the
# class by name.

# File paths belonging to one slide: the slide image, its mask image, and the
# tissue / cancer annotation XML files.
_SlideInfo = namedtuple('_SlideInfo', ['slide_path', 'mask_path', 'xml_tissue', 'xml_cancer'])
# One annotation as read from XML: `coord` is a list of (y, x) integer pairs.
_PolygonRaw = namedtuple('_PolygonRaw', ['slide_path', 'coord'])
# One annotation converted to a geometry object together with its area.
_Polygon = namedtuple('_Polygon', ['slide_path', 'area', 'polygon'])
# One sampled patch location and its class label (1 = cancer, 0 = tissue).
_Data = namedtuple('_Data', ['slide_path', 'coord', 'label'])

In [4]:
def get_slideinfo(folder_with_data):
    """Yield one `_SlideInfo` per entry found in `<folder_with_data>/slides/`.

    Args:
        folder_with_data: Directory containing the `slides/`, `masks/` and
            `annotations/` sub-folders. A trailing separator is optional.

    Yields:
        `_SlideInfo(slide_path, mask_path, xml_tissue, xml_cancer)` where, for
        a slide called `<stem>.<ext>`, the mask is `masks/<stem>_M.tif`, the
        tissue annotation is `annotations/<stem>_G.xml` and the cancer
        annotation is `annotations/<stem>.xml`. Entries are yielded in sorted
        name order so the result does not depend on directory listing order.

    Raises:
        OSError: if the `slides/` folder cannot be listed.
    """
    base = folder_with_data
    # Plain string concatenation is kept (with '/') so paths built from the
    # usual '../data/' argument are unchanged; only a missing trailing
    # separator is repaired.
    if base and not base.endswith(('/', os.sep)):
        base += '/'
    slides_path = base + 'slides/'
    masks_path = base + 'masks/'
    xmls_path = base + 'annotations/'
    for name in sorted(os.listdir(slides_path)):
        # Strip only the final extension: this agrees with get_data, which
        # derives the mask name by replacing '.tif' with '_M.tif', and keeps
        # dotted slide names distinct.
        stem = os.path.splitext(name)[0]
        yield _SlideInfo(slides_path + name,
                         masks_path + stem + '_M.tif',
                         xmls_path + stem + '_G.xml',
                         xmls_path + stem + '.xml')

In [5]:
def get_coordinate(_list, cancer=True):
    """Yield the raw vertex list of every annotation of every slide.

    Args:
        _list: Iterable of records exposing `slide_path`, `xml_cancer` and
            `xml_tissue` attributes.
        cancer: When true, read each record's `xml_cancer` file, otherwise its
            `xml_tissue` file.

    Yields:
        `_PolygonRaw(slide_path, coord)` where `coord` is a list of `(y, x)`
        pairs; each attribute value is parsed as a float and truncated to int.

    A file without an `Annotations` element, or an annotation without a
    `Coordinates` element, contributes nothing instead of raising.
    """
    for el in _list:
        xml_file = el.xml_cancer if cancer else el.xml_tissue
        annotations = ET.parse(xml_file).getroot().find('Annotations')
        if annotations is None:
            # No annotation container in this file: nothing to yield.
            continue
        for ann in annotations:
            coordinates = ann.find('Coordinates')
            if coordinates is None:
                continue
            # Vertices are stored (Y, X) — the rest of the notebook indexes
            # them in that order.
            yx = [(int(float(coord.attrib['Y'])), int(float(coord.attrib['X'])))
                  for coord in coordinates.findall('Coordinate')]
            yield _PolygonRaw(el.slide_path, yx)

In [6]:
def clear_small_polygon(_list, min_area=1024*1024):
    """Convert raw annotations to polygons, keeping only the large ones.

    Args:
        _list: Iterable of records exposing `slide_path` and `coord` (a list
            of vertex pairs).
        min_area: Polygons whose area is not strictly greater than this are
            dropped. Defaults to 1024*1024, the value previously hard-coded.

    Yields:
        `_Polygon(slide_path, area, polygon)` for every annotation that passes
        the area filter.
    """
    for el in _list:
        # Annotations with fewer than three vertices cannot form a ring and
        # have no area; skip them rather than let the Polygon constructor
        # raise on them.
        if len(el.coord) < 3:
            continue
        p = Polygon(el.coord)
        area = p.area
        if area > min_area:
            yield _Polygon(el.slide_path, area, p)

In [7]:
def get_dict(source_files, cancer=True):
    """Group the filtered annotation polygons of `source_files` by slide.

    Args:
        source_files: Slide records passed straight to `get_coordinate`.
        cancer: Forwarded to `get_coordinate` to pick the annotation file.

    Returns:
        A `defaultdict(list)` mapping each slide path to a list of
        `(area, polygon)` tuples, in the order `clear_small_polygon` yields
        them. Slides with no surviving polygon have no key.
    """
    raw_polygons = get_coordinate(source_files, cancer=cancer)
    grouped = defaultdict(list)
    for slide_path, polygon_area, polygon in clear_small_polygon(raw_polygons):
        grouped[slide_path].append((polygon_area, polygon))
    return grouped

In [8]:
def to_multi(_list):
    """Merge the polygons of `_list` into one geometry.

    Args:
        _list: Iterable of `(area, polygon)` pairs; only the polygon's
            exterior ring is used and every shell is given an empty hole list.

    Returns:
        The result of `.buffer(0)` on the MultiPolygon built from those
        shells (presumably used to clean up overlapping or invalid rings).
    """
    shells = []
    for _area, polygon in _list:
        shells.append([polygon.exterior.coords, []])
    combined = MultiPolygon(shells)
    return combined.buffer(0)

In [9]:
def subtraction(cancer, tissue):
    """Remove the cancer regions from the tissue polygons of shared slides.

    For every slide present in both mappings, each tissue polygon is grown by
    `buffer(5)` and the merged cancer geometry (`to_multi`) is subtracted from
    it. Remainders with zero area are discarded, because `get_random_point`
    cannot sample from them; a slide whose tissue is entirely consumed is
    removed from `tissue`, so every remaining key keeps a non-empty list.

    Args:
        cancer: Mapping slide path -> list of `(area, polygon)`.
        tissue: Mapping slide path -> list of `(area, polygon)`. Updated in
            place.

    Returns:
        The tuple `(cancer, tissue)` — the same objects that were passed in.
    """
    for k in (set(cancer) & set(tissue)):
        mp = to_multi(cancer[k])
        remainders = ((poly.buffer(5) - mp) for _, poly in tissue[k])
        kept = [(p.area, p) for p in remainders if p.area]
        if kept:
            tissue[k] = kept
        else:
            # Iteration runs over a separate set, so deleting here is safe.
            del tissue[k]
    return (cancer, tissue)

In [10]:
# Per-thread storage for the random generator handed out by tls_prng().
_TLS = local()
def tls_prng():
    """Return this thread's `np.random.RandomState`, creating it on first use.

    The generator is seeded from the calling thread's identifier plus the
    first word of NumPy's global random state, reduced modulo 2**32, and is
    then cached on thread-local storage so later calls from the same thread
    return the same object.
    """
    # pylint: disable=no-member
    try:
        return _TLS.prng
    except AttributeError:
        # Convert the uint32 state word to a Python int before adding the
        # thread id: thread ids can exceed 2**32, and mixing such a Python int
        # with a NumPy uint32 scalar overflows under NumPy 2 promotion rules.
        global_state_word = int(np.random.get_state()[1][0])
        _TLS.prng = np.random.RandomState(
            (get_ident() + global_state_word) % 2**32
        )
        return _TLS.prng

In [11]:
def get_random_point(polygon, number):
    """Yield `number` integer points that fall inside `polygon`.

    Candidates are drawn uniformly from the polygon's bounding box with the
    thread-local generator and redrawn until `polygon.contains` accepts one.
    The accepted point's two coordinates are truncated to int before being
    yielded. Nothing is yielded when `polygon.area` is zero.

    Args:
        polygon: Geometry exposing `area`, `bounds` and `contains`.
        number: How many points to produce.
    """
    rng = tls_prng()

    if polygon.area:
        left, bottom, right, top = polygon.bounds
        for _ in range(number):
            candidate = Point(rng.uniform(left, right), rng.uniform(bottom, top))
            # Rejection sampling: keep redrawing until the candidate is inside.
            while not polygon.contains(candidate):
                candidate = Point(rng.uniform(left, right), rng.uniform(bottom, top))
            first_coord = candidate.coords[0]
            yield (int(first_coord[0]), int(first_coord[1]))

In [12]:
def get_square(point, size):
    """Return the axis-aligned square Polygon of side `size` centred on `point`.

    Args:
        point: Sequence whose first two items are the centre coordinates.
        size: Side length; each corner lies `size / 2` from the centre on
            both axes.
    """
    half = size / 2
    low_0, high_0 = point[0] - half, point[0] + half
    low_1, high_1 = point[1] - half, point[1] + half
    corners = [
        (low_0, low_1),
        (low_0, high_1),
        (high_0, high_1),
        (high_0, low_1),
    ]
    return Polygon(corners)

In [None]:
# Collect every slide under ../data/ and split it into train / validation.
slideinfo=list(get_slideinfo('../data/'))
# NOTE(review): no seed is set in the visible cells before this shuffle, so
# the split differs between runs.
np.random.shuffle(slideinfo)
# `-len(slideinfo) // 10` parses as `(-len(slideinfo)) // 10`, i.e. minus the
# ceiling of len/10: the last ceil(len/10) slides become the validation set and
# the rest the training set.
train_slideinfo, valid_slideinfo = (slideinfo[:-len(slideinfo) // 10], slideinfo[-len(slideinfo) // 10:])
# NOTE(review): the lines below are disabled, yet later unexecuted cells read
# `train_tissue` / `train_cancer`; those cells fail on a fresh kernel.
# train_cancer=get_dict(train_slideinfo, cancer=True)
# train_tissue=get_dict(train_slideinfo, cancer=False)
# valid_cancer=get_dict(valid_slideinfo, cancer=True)
# valid_tissue=get_dict(valid_slideinfo, cancer=False)
# train_cancer, train_tissue=subtraction(train_cancer, train_tissue)
# valid_cancer, valid_tissue=subtraction(valid_cancer, valid_tissue)

In [None]:
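# NOTE: stale scratch cell, kept for reference only. The get_random_point call
# below passes four arguments, but the function defined above takes
# (polygon, number).
# nbpoints=1500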
# i=0
# while i < nbpoints:
#     rnd_slide=random.choice(list(train_cancer))
#     rnd_polygon_info=random.choice(train_cancer[rnd_slide])
#     rnd_point=list(get_random_point(rnd_polygon_info[1], (256,256), 0, 1))[0]
#     square=get_square(rnd_point, 316)
#     if rnd_polygon_info[1].contains(square):
#         i+=1
#         print(rnd_slide, rnd_point)

In [None]:
# Same record type as the `_Data` defined near the top of the notebook.
_Data=namedtuple('_Data', ['slide_path', 'coord', 'label'])

def get_data(_defdict, nbpoints, patch_size, zoom, cancer=True):
    """Yield `nbpoints` sampled patch locations for one class.

    NOTE(review): this definition is replaced by the later
    `get_data(slideinfo, nbpoints, patch_size, zoom)` cell, which has a
    different signature; only cells run before that redefinition can call it.

    Args:
        _defdict: Mapping slide path -> list of `(area, polygon)` tuples.
        nbpoints: Number of records to yield.
        patch_size: Not referenced in the body; patches are always 316x316.
        zoom: Passed as `level` when reading patches.
        cancer: When true, accept only patches whose mask is entirely
            non-zero and label them 1; otherwise accept only patches whose
            mask is entirely zero and label them 0.

    Yields:
        `_Data(slide_path, point, label)`.
    """
    # Re-seeds NumPy's global generator from the clock on every call; the
    # `random` module used below is not affected by this seed.
    np.random.seed(int(time() * 1000) % 2**32)
    reader = mir.MultiResolutionImageReader()
    for _ in tqdm(range(nbpoints)):
        # Retry until one candidate passes every filter.
        while True:
            rnd_slide=random.choice(list(_defdict))
            rnd_polygon_info=random.choice(_defdict[rnd_slide])
            # NOTE(review): get_random_point yields nothing for a zero-area
            # polygon, in which case the `[0]` below raises IndexError.
            rnd_point=list(get_random_point(rnd_polygon_info[1], 1))[0]
            # NOTE(review): the square is centred on the point, while the
            # patches below are read with the point as startY/startX —
            # confirm both are meant to describe the same region.
            square=get_square(rnd_point, 316)
            if rnd_polygon_info[1].contains(square):
                # Mask path is derived from the slide path by string replacement.
                mask = reader.open(rnd_slide.replace('slides', 'masks').replace('.tif', '_M.tif'))
                slide=reader.open(rnd_slide)
                slide_patch= slide.getUCharPatch(startY=rnd_point[0], startX=rnd_point[1], height=316, width=316, level=zoom)
                mask_patch = mask.getUCharPatch(startY=rnd_point[0], startX=rnd_point[1], height=316, width=316, level=zoom)
                mask_patch = mask_patch.astype(bool)
                # sum(values * counts) / sum(counts) * 100 is the percentage of
                # True mask pixels in the patch.
                values, counts = np.unique(mask_patch, return_counts=True)
                if cancer:
                    # More than 225 distinct pixel values are required in the
                    # slide patch (presumably to reject blank patches).
                    if sum(values * counts) / sum(counts) * 100==100.0 and len(np.unique(slide_patch))>225:
                        yield _Data(rnd_slide, rnd_point, 1)
                        break
                else:
                    if sum(values * counts) / sum(counts) * 100==0.0 and len(np.unique(slide_patch))>225:
                        yield _Data(rnd_slide, rnd_point, 0)
                        break

In [None]:
# NOTE(review): uses the five-argument get_data from the cell above (not the
# later four-argument redefinition) and reads `train_tissue`, which is only
# assigned in commented-out code; the name is then rebound to the sample list.
train_tissue=list(get_data(train_tissue, 3000, (256,256), 0, False))

In [None]:
# NOTE(review): same hidden-state dependency as the tissue cell — needs the
# five-argument get_data and a `train_cancer` mapping that is only built in
# commented-out code.
train_cancer=list(get_data(train_cancer, 3000, (256,256), 0, True))

In [None]:
# Displays the whole sampled list; a slice such as train_tissue[:5] would keep
# the output readable.
train_tissue

In [None]:
# Concatenate both classes: cancer records first, tissue records after.
train=train_cancer+train_tissue

In [None]:
# Sanity check on the combined sample count.
len(train)

In [None]:
# In-place shuffle so the two classes are interleaved; re-running this cell
# reshuffles again (the `random` module is not seeded in the visible cells).
random.shuffle(train)

In [13]:
def get_data(slideinfo, nbpoints, patch_size, zoom):
    """Yield `nbpoints` cancer samples followed by `nbpoints` tissue samples.

    The annotation polygons of `slideinfo` are loaded for both classes and the
    cancer regions are subtracted from the tissue polygons. Each class is then
    sampled in turn: a random slide, a random polygon of that slide and a
    random point in that polygon are drawn until a candidate passes all
    filters.

    A candidate is accepted when
      * the 316-wide square returned by `get_square` lies inside the polygon,
      * the mask patch is entirely non-zero (cancer pass) or entirely zero
        (tissue pass), and
      * the slide patch holds more than 225 distinct pixel values.

    The label is decided by the pass being run, so the first `nbpoints`
    records are always labelled 1 and the next `nbpoints` always 0.

    Args:
        slideinfo: Slide records accepted by `get_dict`.
        nbpoints: Number of records to yield per class.
        patch_size: Not referenced; patches are always read as 316x316.
        zoom: Passed as `level` when reading patches.

    Yields:
        `_Data(slide_path, point, label)`.

    Raises:
        IndexError: if a class has no polygon to sample from while
            `nbpoints` is positive.

    NOTE(review): the loop retries forever if no candidate can ever satisfy
    the filters for some class.
    """
    # Re-seeds NumPy's global generator from the clock on every call; the
    # `random` module used for slide/polygon choice is not affected.
    np.random.seed(int(time() * 1000) % 2**32)
    reader = mir.MultiResolutionImageReader()
    cancer = get_dict(slideinfo, cancer=True)
    tissue = get_dict(slideinfo, cancer=False)
    cancer, tissue = subtraction(cancer, tissue)
    for label, polygons_by_slide in ((1, cancer), (0, tissue)):
        # Only polygons with a non-zero area can be sampled; drop the others
        # up front, together with slides that have none left.
        pool = {}
        for slide_path, entries in polygons_by_slide.items():
            usable = [entry for entry in entries if entry[1].area]
            if usable:
                pool[slide_path] = usable
        slide_paths = list(pool)
        for _ in tqdm(range(nbpoints)):
            while True:
                rnd_slide = random.choice(slide_paths)
                polygon = random.choice(pool[rnd_slide])[1]
                rnd_point = next(get_random_point(polygon, 1), None)
                if rnd_point is None:
                    continue
                if not polygon.contains(get_square(rnd_point, 316)):
                    continue
                # Mask path is derived from the slide path by string replacement.
                mask = reader.open(rnd_slide.replace('slides', 'masks').replace('.tif', '_M.tif'))
                slide = reader.open(rnd_slide)
                slide_patch = slide.getUCharPatch(startY=rnd_point[0], startX=rnd_point[1], height=316, width=316, level=zoom)
                mask_patch = mask.getUCharPatch(startY=rnd_point[0], startX=rnd_point[1], height=316, width=316, level=zoom)
                mask_patch = mask_patch.astype(bool)
                # Cancer patches must be fully covered by the mask, tissue
                # patches must not touch it at all.
                mask_matches = mask_patch.all() if label else not mask_patch.any()
                if mask_matches and len(np.unique(slide_patch)) > 225:
                    yield _Data(rnd_slide, rnd_point, label)
                    break

In [14]:
# Rebuild the slide list and the train / validation split (this repeats the
# earlier unexecuted split cell).
slideinfo=list(get_slideinfo('../data/'))
# NOTE(review): unseeded shuffle — the split is not reproducible across runs.
np.random.shuffle(slideinfo)
# `-len(slideinfo) // 10` is minus ceil(len/10): the last ceil(len/10) slides
# form the validation set, the rest the training set.
train_slideinfo, valid_slideinfo = (slideinfo[:-len(slideinfo) // 10], slideinfo[-len(slideinfo) // 10:])
# get_data samples nbpoints records per class, so this requests 6000 cancer
# plus 6000 tissue locations (one progress bar per class).
train=list(get_data(train_slideinfo, 6000, (256,256), 0))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 6000/6000 [03:30<00:00, 28.55it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 6000/6000 [04:51<00:00, 20.58it/s]


In [203]:
import pickle

In [204]:
# Persist the sampled training records.
# NOTE(review): the records are `_Data` namedtuples defined in this notebook;
# pickle stores classes by reference, so whatever loads this file needs a
# matching `_Data` definition — confirm the consumer provides one.
with open('../data/train.pickle', 'wb') as f:
    pickle.dump(train, f)

In [207]:
# Sample the validation set from the held-out slides: 600 records per class.
valid=list(get_data(valid_slideinfo, 600, (256,256), 0))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 600/600 [00:24<00:00, 21.76it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 600/600 [00:27<00:00, 21.70it/s]


In [208]:
# Persist the sampled validation records (same `_Data` pickling caveat as the
# training dump).
with open('../data/valid.pickle', 'wb') as f:
    pickle.dump(valid, f)

In [215]:
from torch.utils.data import Dataset
# from ._mir_hook import mir
from PIL import Image
from torchvision import transforms as T

# Reader instance used by TrainDataset.__getitem__ to open slides; created
# once at module level rather than per dataset.
_READER = mir.MultiResolutionImageReader()


class TrainDataset(Dataset):
    """Dataset that reads one 316x316 slide patch per stored sample record.

    Args:
        _list: Sequence of records exposing `slide_path`, `coord` (indexed as
            `coord[0]` -> startY, `coord[1]` -> startX) and `label`.
        patch_size: Stored on the instance; not used when reading patches,
            which are always 316x316.
        zoom: Passed as `level` when reading a patch.
        transform: Optional callable applied to the RGB PIL image. When it is
            None the image is converted with `T.ToTensor()`.
    """

    def __init__(self, _list, patch_size, zoom, transform=None):
        self._list = _list
        self.patch_size = patch_size
        self.zoom = zoom
        self.transform = transform

    def __len__(self):
        """Return the number of sample records."""
        return len(self._list)

    def __getitem__(self, index):
        """Return `(X, Y)` for record `index`.

        `X` is the transformed patch; `Y` is the record's label as a float32
        tensor.
        """
        sample = self._list[index]
        slide = _READER.open(sample.slide_path)
        X = slide.getUCharPatch(
            startY=sample.coord[0],
            startX=sample.coord[1],
            height=316,
            width=316,
            level=self.zoom
        )
        X = Image.fromarray(X).convert('RGB')
        if self.transform is not None:
            X = self.transform(X)
        else:
            X = T.ToTensor()(X)
        # Labels produced by get_data are plain ints, which have no .float();
        # as_tensor handles ints and existing tensors alike.
        Y = torch.as_tensor(sample.label, dtype=torch.float32)

        return X, Y


In [221]:
# Smoke-test dataset over the first 100 sampled records. The (256,256)
# patch_size is stored but patches are read as 316x316.
t_dataset=TrainDataset(train[:100], (256,256), 0)

In [222]:
# Reads and displays every item, which produces a very large output; indexing
# a single item (e.g. t_dataset[0]) is enough to check that loading works.
list(t_dataset)

[(tensor([[[0.5804, 0.6118, 0.6863,  ..., 0.9686, 0.8902, 0.7647],
           [0.5451, 0.5569, 0.6392,  ..., 0.9686, 0.8706, 0.7608],
           [0.5098, 0.5216, 0.6196,  ..., 0.9804, 0.8824, 0.7843],
           ...,
           [0.6118, 0.7098, 0.7725,  ..., 0.7176, 0.7059, 0.7059],
           [0.6745, 0.7686, 0.7922,  ..., 0.6510, 0.6549, 0.6824],
           [0.7412, 0.8118, 0.8157,  ..., 0.6078, 0.6314, 0.6706]],
  
          [[0.3608, 0.3804, 0.4471,  ..., 0.8667, 0.7647, 0.6314],
           [0.3294, 0.3373, 0.4078,  ..., 0.8863, 0.7647, 0.6471],
           [0.2980, 0.3059, 0.3922,  ..., 0.9137, 0.7882, 0.6824],
           ...,
           [0.3608, 0.4510, 0.5176,  ..., 0.4627, 0.4549, 0.4510],
           [0.4353, 0.5137, 0.5333,  ..., 0.4078, 0.4157, 0.4431],
           [0.4863, 0.5490, 0.5490,  ..., 0.3725, 0.3961, 0.4392]],
  
          [[0.7294, 0.7569, 0.8118,  ..., 0.9882, 0.9451, 0.8824],
           [0.7098, 0.7216, 0.7804,  ..., 0.9961, 0.9451, 0.9020],
           [0.7059, 0.