In [1]:
import openslide
import tifffile
import matplotlib.pyplot as plt
import numpy as np
import cv2
import os
import csv
import math
import json
import random
import shutil
import scipy.misc
import scipy.ndimage
from skimage import io
from skimage import morphology
from skimage import measure
import mahotas.polygon as ploygon_to_mask
import re
import mahotas.polygon as mp
from skimage.morphology import remove_small_holes, remove_small_objects
from scipy.ndimage import binary_dilation
from scipy.ndimage.morphology import binary_closing, binary_opening

from lxml import etree
from tqdm import tqdm
from glob import glob
from PIL import Image
import imutils

from util import read_json_as_dict, get_tissue_mask, get_parent_dir_name
from data_util import get_mask_image, get_rec_info_list, remove_orange_peel, resize_with_preserve_rgb_value
from data_util import get_size4mpp, get_wsi_info_read_region

  from scipy.ndimage.morphology import binary_closing, binary_opening
  from scipy.ndimage.morphology import binary_closing, binary_opening
2024-12-05 14:10:15.648351: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-05 14:10:15.648475: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-05 14:10:15.648715: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-05 14:10:15.694868: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instruction

# Define Function

In [2]:
def get_mask_array(path_json, roi_array, verbose=1):
    
    mask_value = 255
    
    geo_json_string = open(path_json)
    geo_json = json.load(geo_json_string)
    try:
        mask_array_transposed = np.zeros(roi_array.shape, dtype=np.uint8)
        for item in geo_json:
            for coordinates in item["geometry"]["coordinates"]:
                pts = [(round(loc[1]), round(loc[0])) for loc in coordinates]
                mp.fill_polygon(pts, mask_array_transposed, mask_value)

        mask_bool = (mask_array_transposed > 0).astype('uint8') 
        roi_bool = (roi_array > 0).astype('uint8') 
        mask_area = np.sum(mask_bool) 

        mask_in_roi = (mask_bool + roi_bool) == 2
        mask_in_roi_ratio_transposed = (np.sum(mask_in_roi) / mask_area)
    except IndexError:
        mask_in_roi_ratio_transposed = 0
        
    try:
        mask_array = np.zeros(roi_array.shape, dtype=np.uint8)
        for item in geo_json:
            for coordinates in item["geometry"]["coordinates"]:
                pts = [(round(loc[0]), round(loc[1])) for loc in coordinates]
                mp.fill_polygon(pts, mask_array, mask_value)

        mask_bool = (mask_array > 0).astype('uint8') 
        roi_bool = (roi_array > 0).astype('uint8') 
        mask_area = np.sum(mask_bool) 

        mask_in_roi = (mask_bool + roi_bool) == 2
        mask_in_roi_ratio = (np.sum(mask_in_roi) / mask_area)
    except IndexError:
        mask_in_roi_ratio = 0
        
    if verbose:
        print(f"mask_in_roi_ratio_transposed: {mask_in_roi_ratio_transposed}")
        print(f"mask_in_roi_ratio: {mask_in_roi_ratio}")
        
    if mask_in_roi_ratio_transposed > mask_in_roi_ratio:
        return mask_array_transposed
    else:
        return mask_array

def get_roi_idx_tuple(tissue_mask_array):
    tissue_mask_bool_array = tissue_mask_array != 0
    row_mask_exist_idx_array = np.any(tissue_mask_bool_array, axis=1)
    col_mask_exist_idx_array = np.any(tissue_mask_bool_array, axis=0)
    min_row_index, max_row_index = np.where(row_mask_exist_idx_array)[0][[0, -1]]
    min_col_index, max_col_index = np.where(col_mask_exist_idx_array)[0][[0, -1]]
    return min_row_index, max_row_index, min_col_index, max_col_index

def get_row_col_range(min_row_index, max_row_index, min_col_index, max_col_index, stride):
    row_fill = (max_row_index - min_row_index) % stride
    col_fill = (max_col_index - min_col_index) % stride
    
    start_row_idx = min_row_index - stride
    end_row_idx = max_row_index + row_fill + stride + 1
    start_col_idx = min_col_index - stride
    end_col_idx = max_col_index + col_fill + stride + 1
    
    row_range = range(start_row_idx, end_row_idx, stride)
    col_range = range(start_col_idx, end_col_idx, stride)
    
    return row_range, col_range

# Setting Config

In [3]:
json_path = "./config.json"
config_dict = read_json_as_dict(json_path)

mpp_standard = config_dict["mpp_standard"]
train_wsi_path = config_dict['train_wsi_path']
test_wsi_path = config_dict['test_wsi_path']
data_common_path = config_dict['data_common_path']

tissue_mask_level = config_dict["tissue_mask_level"]
downsizing_per_level = config_dict["downsizing_per_level"]
resize_scale = downsizing_per_level ** tissue_mask_level
level_0_patch_size = config_dict["level_0_patch_size"]

print(f"mpp_standard: {mpp_standard}")
print(f"train_wsi_path: {train_wsi_path}")
print(f"test_wsi_path: {test_wsi_path}")
print(f"data_common_path: {data_common_path}")
print(f"tissue_mask_level: {tissue_mask_level}")
print(f"downsizing_per_level: {downsizing_per_level}")
print(f"level_0_patch_size: {level_0_patch_size}")

mpp_standard: 0.65
train_wsi_path: ../data/0_source_data/hubmap-kidney-segmentation/train
test_wsi_path: ../data/0_source_data/hubmap-kidney-segmentation/test
data_common_path: ../data
tissue_mask_level: 2
downsizing_per_level: 2
level_0_patch_size: 1024


# Get Tissue Mask

In [4]:
tissue_mask_folder_basename = "1_tissue_mask"
patch_folder_basename = "2_extract_patch"

tissue_mask_folder = f"{data_common_path}/{tissue_mask_folder_basename}"
patch_folder = f"{data_common_path}/{patch_folder_basename}"

os.makedirs(tissue_mask_folder, exist_ok=True)
os.makedirs(patch_folder, exist_ok=True)

wsi_path_list = glob(f"{train_wsi_path}/*.tiff")
remove_region_ratio = 0.005
patch_stride_ratio = 0.5
patch_num_in_one_folder = 5000
use_memmap = False

mask_policy_dict = {"background": 0, "glomerulus": 255}

In [5]:
for wsi_idx, wsi_path in tqdm(enumerate(wsi_path_list)):
    wsi_basename = os.path.basename(wsi_path)
    
    annotation_json_path = wsi_path.replace(".tiff", ".json")
    wsi_basename = get_parent_dir_name(wsi_path, level=0)
    tissue_mask_basename = get_parent_dir_name(wsi_path, level=0).replace('.tiff', '_tissue_mask.png')
    tissue_mask_path = f"{tissue_mask_folder}/{tissue_mask_basename}"
    wsi_array, mpp_value = get_wsi_info_read_region(wsi_path, downsize_scale=1, use_memmap=use_memmap)
    print("load_wsi")
    if mpp_value == "undefined":
        mpp_value = 0.5

    tissue_mask_array = cv2.imread(tissue_mask_path, cv2.IMREAD_GRAYSCALE)
    tissue_mask_array = cv2.resize(tissue_mask_array, wsi_array.shape[:2][::-1], cv2.INTER_NEAREST)
    print("load_tissue_mask")
    mask_array = get_mask_array(annotation_json_path, tissue_mask_array, verbose=1)
    print("load_mask")
    negative_patch_folder = f"{patch_folder}/{wsi_basename}/negative"
    positive_patch_folder = f"{patch_folder}/{wsi_basename}/positive"
    
    level_0_patch_size_mpp = get_size4mpp(level_0_patch_size, mpp_standard, mpp_value)
    level_0_patch_stride = int(level_0_patch_size_mpp * patch_stride_ratio)
    
    min_row_index, max_row_index, min_col_index, max_col_index = get_roi_idx_tuple(tissue_mask_array)
    
    row_range, col_range = get_row_col_range(min_row_index, max_row_index, min_col_index, max_col_index, level_0_patch_stride)
    negative_num = 0
    positive_num = 0
    patch_idx = 0
    for row_idx in tqdm(row_range):
        for col_idx in col_range:
            row_slice = slice(row_idx, row_idx + level_0_patch_size_mpp)
            col_slice = slice(col_idx, col_idx + level_0_patch_size_mpp)
            position_tuple = (row_slice, col_slice)

            tissue_patch_array = tissue_mask_array[position_tuple]
            tissue_area = (tissue_patch_array != 0).mean()
            if tissue_area > 0.25:
                image_patch_array = wsi_array[position_tuple]
                mask_patch_array = mask_array[position_tuple]
                is_positive = (mask_patch_array != 0).mean() != 0
                if is_positive:
                    mask_patch_array = remove_orange_peel(mask_patch_array, mask_policy_dict,
                                                               remove_region_ratio=remove_region_ratio)
                    folder_idx = positive_num // patch_num_in_one_folder
                    positive_num += 1
                    patch_save_folder = positive_patch_folder
                else:
                    folder_idx = negative_num // patch_num_in_one_folder
                    negative_num += 1
                    patch_save_folder = negative_patch_folder
                    
                patch_save_folder = f"{patch_save_folder}/{folder_idx:04d}/{patch_idx:06d}"
                os.makedirs(patch_save_folder, exist_ok=True)
                
                image_patch_path = f"{patch_save_folder}/image.png"
                mask_patch_path = f"{patch_save_folder}/mask.png"
                tissue_patch_path = f"{patch_save_folder}/tissue.png"

                cv2.imwrite(image_patch_path, image_patch_array[..., ::-1])
                cv2.imwrite(mask_patch_path, mask_patch_array)
                cv2.imwrite(tissue_patch_path, tissue_patch_array)
                patch_idx += 1


0it [00:00, ?it/s]

load_wsi
load_tissue_mask
mask_in_roi_ratio_transposed: 0.8698520381731398
mask_in_roi_ratio: 1.0
load_mask



  tissue_area = (tissue_patch_array != 0).mean()
  ret = ret.dtype.type(ret / rcount)

  5%|███▉                                                                                | 2/43 [00:03<01:16,  1.88s/it][A
  7%|█████▊                                                                              | 3/43 [00:07<01:48,  2.70s/it][A
  9%|███████▊                                                                            | 4/43 [00:12<02:18,  3.54s/it][A
 12%|█████████▊                                                                          | 5/43 [00:17<02:38,  4.17s/it][A
 14%|███████████▋                                                                        | 6/43 [00:24<02:59,  4.85s/it][A
 16%|█████████████▋                                                                      | 7/43 [00:30<03:09,  5.26s/it][A
 19%|███████████████▋                                                                    | 8/43 [00:36<03:19,  5.69s/it][A
 21%|█████████████████▌                     

# Unused Code