In [3]:
import os
import cv2
import openslide
import numpy as np
from skimage import filters
from lxml import etree
from tqdm import trange

In [4]:
def get_ratio_mask(patch):
    h_, w_ = patch.shape[0], patch.shape[1]
    n_total = h_*w_
    n_cell = np.count_nonzero(patch)
    if (n_cell != 0):
        return n_cell*1.0/n_total*1.0
    else:
        return 0
    
def save_image(save_path, patch_name, image_patch):
    os.makedirs(save_path, exist_ok = True)
    cv2.imwrite(os.path.join(save_path, patch_name), image_patch)

def execute_patch(image_patch, target_patch_size, save_path, start_levelm_x, start_levelm_y, patch_count, patch_label):
    resize_image = cv2.resize(image_patch, (target_patch_size,target_patch_size), cv2.INTER_AREA)
    save_image(save_path, f'{start_levelm_x}_{start_levelm_y}_{patch_count}_{patch_label}.png', resize_image)

def get_tissue_mask(rgb_image):
    hsv = cv2.cvtColor(rgb_image,cv2.COLOR_RGB2HSV)
    tissue_S = hsv[:, :, 1] > filters.threshold_otsu(hsv[:, :, 1])
    background_R = rgb_image[:, :, 0] > filters.threshold_otsu(rgb_image[:, :, 0])
    background_G = rgb_image[:, :, 1] > filters.threshold_otsu(rgb_image[:, :, 1])
    background_B = rgb_image[:, :, 2] > filters.threshold_otsu(rgb_image[:, :, 2])
    tissue_RGB = np.logical_not(background_R & background_G & background_B)
    mask = tissue_S & (tissue_RGB)
    ret = np.array(mask).astype(np.uint8)
    return ret

def get_anno_list(anno_path, min_size, min_downsample):
    pts_list = []
    trees = etree.parse(anno_path).getroot()[0]

    for tree in trees:
        if (tree.get('PartOfGroup') == 'Tumor'):
            regions = tree.findall('Coordinates')
            for region in regions:
                coordinates = region.findall('Coordinate')
                pts = list()
                for coord in coordinates:
                      x = float(coord.get('X'))
                      y = float(coord.get('Y'))
                      x = np.clip(round(x/min_downsample), 0, round(min_size[0]))
                      y = np.clip(round(y/min_downsample), 0, round(min_size[1]))
                      pts.append((x,y))
                pts_list.append(pts)
    return pts_list

def get_anno_mask(pts_list, min_size):
    # 주의 numpy로 mask 생성 시 w,h 순서가 아닌 h,w !!
    mask = np.zeros((min_size[1],min_size[0])).astype(np.uint8)
    for pts in pts_list:
        point = [np.array(pts, dtype=np.int32)]
        mask = cv2.fillPoly(mask, point, 1)
    return mask

In [10]:
import os

base_slide_path = '/data3/yumi/TCGA_STAD'

# tif_files = [file for file in os.listdir(base_slide_path) if file.endswith('.tif')]
# tif_files_sorted = sorted(tif_files)

def find_svs_files(base_path):
    svs_files = []
    for root, dirs, files in os.walk(base_path):
        for file in files:
            if file.endswith('.svs'):
                full_path = os.path.join(root, file)
                svs_files.append([file, full_path])
    return svs_files

svs_files = find_svs_files(base_slide_path)
svs_files_sorted = sorted(svs_files)

In [None]:
def get_ratio_mask(patch):
    h_, w_ = patch.shape[0], patch.shape[1]
    n_total = h_*w_
    n_cell = np.count_nonzero(patch)
    if (n_cell != 0):
        return n_cell*1.0/n_total*1.0
    else:
        return 0
    
def save_image(save_path, patch_name, image_patch):
    os.makedirs(save_path, exist_ok = True)
    cv2.imwrite(os.path.join(save_path, patch_name), image_patch)

def execute_patch(image_patch, target_patch_size, save_path, start_levelm_x, start_levelm_y, patch_count, patch_label):
    resize_image = cv2.resize(image_patch, (target_patch_size,target_patch_size), cv2.INTER_AREA)
    save_image(save_path, f'{start_levelm_x}_{start_levelm_y}_{patch_count}_{patch_label}.png', resize_image)

def get_tissue_mask(rgb_image):
    hsv = cv2.cvtColor(rgb_image,cv2.COLOR_RGB2HSV)
    tissue_S = hsv[:, :, 1] > filters.threshold_otsu(hsv[:, :, 1])
    background_R = rgb_image[:, :, 0] > filters.threshold_otsu(rgb_image[:, :, 0])
    background_G = rgb_image[:, :, 1] > filters.threshold_otsu(rgb_image[:, :, 1])
    background_B = rgb_image[:, :, 2] > filters.threshold_otsu(rgb_image[:, :, 2])
    tissue_RGB = np.logical_not(background_R & background_G & background_B)
    mask = tissue_S & (tissue_RGB)
    ret = np.array(mask).astype(np.uint8)
    return ret

def get_anno_list(anno_path, min_size, min_downsample):
    pts_list = []
    trees = etree.parse(anno_path).getroot()[0]

    for tree in trees:
        if (tree.get('PartOfGroup') == 'Tumor'):
            regions = tree.findall('Coordinates')
            for region in regions:
                coordinates = region.findall('Coordinate')
                pts = list()
                for coord in coordinates:
                      x = float(coord.get('X'))
                      y = float(coord.get('Y'))
                      x = np.clip(round(x/min_downsample), 0, round(min_size[0]))
                      y = np.clip(round(y/min_downsample), 0, round(min_size[1]))
                      pts.append((x,y))
                pts_list.append(pts)
    return pts_list

def get_anno_mask(pts_list, min_size):
    # 주의 numpy로 mask 생성 시 w,h 순서가 아닌 h,w !!
    mask = np.zeros((min_size[1],min_size[0])).astype(np.uint8)
    for pts in pts_list:
        point = [np.array(pts, dtype=np.int32)]
        mask = cv2.fillPoly(mask, point, 1)
    return mask

In [15]:
target_patch_size = 224
target_mpp = 1000

tissue_ratio = 0.3
base_save_path = '/data3/yein/TCGA_STAD_patches'

for svs_filename, svs_path in svs_files_sorted:
    try:
        slide = openslide.open_slide(svs_path)

        slide_name = svs_filename.split('.')[0]
        save_path = os.path.join(base_save_path, slide_name)
        osmakedirs(save_path, exist_ok=True)

        min_level = slide.level_count - 1
        min_downsample = slide.level_downsamples[min_level]
        min_size = slide.level_dimensions[min_level]

        whole_region = slide.read_region(location=(0, 0), level=min_level, size=min_size)
        whole_image = np.array(whole_region)[..., :3]
        tissue_mask = get_tissue_mask(whole_image)

        level0_mpp = round(float(slide.properties.get('openslide.mpp-x')), 2)
        level0_patch_size = int((target_patch_size * target_mpp) / level0_mpp)
        level0_size = slide.level_dimensions[0]

        levelm_mpp = level0_mpp * min_downsample
        levelm_patch_size = int((target_patch_size * target_mpp) / levelm_mpp)

        patch_count = 0

        for start_level0_y in trange(0, level0_size[1], level0_patch_size):
            for start_level0_x in range(0, level0_size[0], level0_patch_size):
                start_levelm_x = int(start_level0_x / min_downsample)
                start_levelm_y = int(start_level0_y / min_downsample)
                end_levelm_x = int((start_level0_x + level0_patch_size) / min_downsample)
                end_levelm_y = int((start_level0_y + level0_patch_size) / min_downsample)
        
                tissue_mask_patch = tissue_mask[start_levelm_y:end_levelm_y, start_levelm_x:end_levelm_x]
        
                if get_ratio_mask(tissue_mask_patch) >= tissue_ratio:
                    image_patch = np.array(slide.read_region(
                        location=(start_level0_x, start_level0_y),
                        level=0,
                        size=(level0_patch_size, level0_patch_size)
                    )).astype(np.uint8)[..., :3]
                    patch_count += 1
                    execute_patch(image_patch, target_patch_size, save_path, start_level0_x, start_level0_y, patch_count, 0)  # 0 for non-tumor        
    
    except Exception as e:
        print(f"Error processing file {svs_file}: {e}")

NameError: name 'svs_file' is not defined

In [None]:
# for slide_filename in tif_files_sorted:
#     slide_path = os.path.join(base_slide_path, slide_filename)
#     save_path = os.path.join(base_save_path, slide_filename.split('.')[0])
#     os.makedirs(save_path, exist_ok=True)
    
#     slide = openslide.open_slide(slide_path)
    
#     min_level = slide.level_count - 1
#     min_downsample = slide.level_downsamples[slide.level_count - 1]
#     min_size = slide.level_dimensions[slide.level_count - 1]
    
#     whole_region = slide.read_region(location=(0, 0), level=min_level, size=min_size)
#     whole_image = np.array(whole_region)[..., :3]
#     tissue_mask = get_tissue_mask(whole_image)
    
#     level0_mpp = round(float(slide.properties.get('openslide.mpp-x')), 2)
#     level0_patch_size = int((target_patch_size * target_mpp) / level0_mpp)
#     level0_size = slide.level_dimensions[0]

#     levelm_mpp = level0_mpp * min_downsample
#     levelm_patch_size = int((target_patch_size * target_mpp) / levelm_mpp)
    
#     patch_count = 0
    
#     for start_level0_y in trange(0, level0_size[1], level0_patch_size):
#         for start_level0_x in range(0, level0_size[0], level0_patch_size):
#             start_levelm_x = int(start_level0_x / min_downsample)
#             start_levelm_y = int(start_level0_y / min_downsample)
#             end_levelm_x = int((start_level0_x + level0_patch_size) / min_downsample)
#             end_levelm_y = int((start_level0_y + level0_patch_size) / min_downsample)
    
#             tissue_mask_patch = tissue_mask[start_levelm_y:end_levelm_y, start_levelm_x:end_levelm_x]
    
#             if get_ratio_mask(tissue_mask_patch) >= tissue_ratio:
#                 image_patch = np.array(slide.read_region(
#                     location=(start_level0_x, start_level0_y),
#                     level=0,
#                     size=(level0_patch_size, level0_patch_size)
#                 )).astype(np.uint8)[..., :3]
#                 patch_count += 1
#                 execute_patch(image_patch, target_patch_size, save_path, start_level0_x, start_level0_y, patch_count, 0)