In [22]:
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon
import xml.etree.ElementTree as ET

from glob import glob
import os
from os.path import join
import numpy as np
import shutil
from tqdm import tqdm

In [2]:
def check_tumor(point, polygons, physical_size=224):
    """Return True if a square patch has a probe point inside any polygon.

    The patch is probed at five locations: its four corners and its centre.
    Containment is decided by shapely's ``Polygon.contains``, so a probe that
    lies exactly on a polygon boundary does not count as inside.

    Args:
        point: indexable ``(x, y)`` giving the patch corner with the smallest
            x and y, in the same units as the polygon vertices.
        polygons: iterable of vertex sequences ``[(x, y), ...]``, one per
            polygon.
        physical_size: side length of the patch, in the same units.

    Returns:
        bool: True as soon as one probe point is contained in one polygon,
        False otherwise (including when ``polygons`` is empty).
    """
    x, y = point[0], point[1]
    half = physical_size / 2
    probes = (
        (x + physical_size, y),
        (x, y + physical_size),
        (x + physical_size, y + physical_size),
        (x + half, y + half),
        (x, y),
    )
    for vertices in polygons:
        try:
            region = Polygon(vertices)
            # Stop at the first hit: further polygons cannot change the answer.
            if any(region.contains(Point(probe)) for probe in probes):
                return True
        except Exception:
            # Best effort: skip a polygon that cannot be built or tested
            # (e.g. too few vertices, invalid geometry). `Exception` rather
            # than a bare `except` so Ctrl-C / kernel interrupt still works.
            continue
    return False

def get_polygons(xml_path, annotation_tool='Aperio'):
    """Read annotation outlines from an XML file as lists of integer vertices.

    Args:
        xml_path: path (or file object) handed to ``ElementTree.parse``.
        annotation_tool: ``'Aperio'`` reads ``Annotation/Regions/Region``
            elements with ``Vertices/Vertex`` children; any other value reads
            ``Annotations/Annotation`` elements with ``Coordinates/Coordinate``
            children.

    Returns:
        list: one entry per region/annotation, each a list of ``(x, y)`` int
        tuples obtained by rounding the ``X`` and ``Y`` attributes with
        ``np.round``. A region without vertices yields an empty list.
    """
    document_root = ET.parse(xml_path).getroot()

    # Pick the element paths for the chosen XML layout once, up front.
    if annotation_tool == 'Aperio':
        outline_path, vertex_path = 'Annotation/Regions/Region', 'Vertices/Vertex'
    else:
        outline_path, vertex_path = 'Annotations/Annotation', 'Coordinates/Coordinate'

    def _as_pixel(node):
        # Parse both attributes before rounding either one.
        raw_x = float(node.attrib['X'])
        raw_y = float(node.attrib['Y'])
        return (int(np.round(raw_x)), int(np.round(raw_y)))

    return [
        [_as_pixel(node) for node in outline.findall(vertex_path)]
        for outline in document_root.findall(outline_path)
    ]

The folder names `positive` and `negative` each appear in two places in the code below (the `os.makedirs` call and the `save_name` path); change both occurrences if you rename the folders.
The class suffix appended to the positive `save_name` (e.g. `'_PDAC'`; the code below currently uses `'_CP'`) is used to identify labels for later validation, and should be changed per class.

In [18]:
def group_patches(out_path, tumor_bags, tumor_annotations, pixel_size=1, patch_size=224, ext='jpeg',
                  annotation_tool='Aperio', positive_dir='positive', negative_dir='negative',
                  positive_suffix='_CP'):
    """Copy every patch of every bag into a per-slide positive or negative folder.

    Bags and annotation files are paired by position. For each pair the
    annotation polygons are loaded, and each patch is copied to
    ``out_path/<slide>/<positive_dir>`` when ``check_tumor`` reports that it
    touches a polygon, otherwise to ``out_path/<slide>/<negative_dir>``.

    Args:
        out_path (string): output folder.
        tumor_bags (list): folders of patches, one per slide.
        tumor_annotations (list): annotation xml paths, in the same order as
            ``tumor_bags`` (names must match).
        pixel_size (float): factor multiplied with ``patch_size`` to convert a
            patch grid index into annotation coordinates; use ``1`` when the
            annotations are in patch pixel units.
        patch_size (int): image patch size.
        ext (string): image patch extension, without the dot.
        annotation_tool (string): XML layout forwarded to ``get_polygons``.
        positive_dir (string): sub-folder name for patches inside an annotation.
        negative_dir (string): sub-folder name for all other patches.
        positive_suffix (string): text appended to the file name (before the
            extension) of every positive patch.

    Returns:
        None. Files are copied; the source bags are left untouched.
    """
    import warnings

    if len(tumor_bags) != len(tumor_annotations):
        # zip() below stops at the shorter list; make the dropped items visible.
        warnings.warn(
            'group_patches: got %d bags but %d annotations; unpaired items are skipped'
            % (len(tumor_bags), len(tumor_annotations))
        )
    n_pairs = min(len(tumor_bags), len(tumor_annotations))
    physical_size = patch_size * pixel_size

    for tumor_bag, tumor_annotation in tqdm(zip(tumor_bags, tumor_annotations), total=n_pairs):
        polygons = get_polygons(tumor_annotation, annotation_tool=annotation_tool)

        # normpath drops a trailing separator and unifies mixed separators, so
        # the last path component is always the bag folder name.
        slide_name = os.path.basename(os.path.normpath(tumor_bag))
        annotation_stem = os.path.splitext(os.path.basename(tumor_annotation))[0]
        if annotation_stem != slide_name:
            # Positional pairing is only correct when both lists share an order.
            warnings.warn(
                'group_patches: bag %r is paired with annotation %r; names do not match'
                % (slide_name, annotation_stem)
            )

        positive_root = os.path.join(out_path, slide_name, positive_dir)
        negative_root = os.path.join(out_path, slide_name, negative_dir)
        os.makedirs(positive_root, exist_ok=True)
        os.makedirs(negative_root, exist_ok=True)

        # NOTE: patch file names are assumed to start with '<x index>_<y index>';
        # adjust the parsing below if your tiler names patches differently.
        for img in sorted(glob(os.path.join(tumor_bag, '*.' + ext))):
            img_name = os.path.splitext(os.path.basename(img))[0]
            grid = img_name.split('_')
            y = int(grid[1]) * patch_size * pixel_size
            x = int(grid[0]) * patch_size * pixel_size
            if check_tumor((x, y), polygons, physical_size=physical_size):
                save_name = os.path.join(positive_root, img_name + positive_suffix + '.' + ext)
            else:
                save_name = os.path.join(negative_root, img_name + '.' + ext)
            shutil.copyfile(img, save_name)

Change both paths below for each class.
**pixel_size** is not the actual pixel size of the image; it is the downsample factor. Leave it at 1 if the image patches are not downsampled.

In [20]:
# glob() returns matches in arbitrary order, and group_patches pairs bags with
# annotations by position, so both listings are sorted by name. Annotations are
# sorted by file stem (extension removed) so that their order lines up with the
# bag folder names.
tumor_bags = sorted(glob('d:/bin_DL_project/dsmil-wsi/WSI/CP_PDAC/single/CP/*')) # folder containing folders of patches
tumor_annotations = sorted(
    glob('d:/bin_DL_project/dsmil-wsi/annotations/CP/*'),
    key=lambda path: os.path.splitext(os.path.basename(path))[0],
) # folder containing the annotations (xml)

In [21]:
# Sort the patches of every bag into per-slide positive/negative folders under the output directory.
group_patches(
    out_path='d:/bin_DL_project/dsmil-wsi/WSI/CP_PDAC/annotated_bags/CP',
    tumor_bags=tumor_bags,
    tumor_annotations=tumor_annotations,
    pixel_size=1,
)

100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [03:02<00:00, 30.48s/it]
