In [1]:
import os
from PIL import Image
import numpy as np
import pandas as pd
import scipy.io
from tqdm import tqdm
import splitfolders

In [2]:
# Input folders handed to save_mask_nucls: the images (read as .png files)
# and the per-image annotation CSV files.
IMG_DIR = './data/benchmarks/NuCLS/rgb/'
CSV_DIR = './data/benchmarks/NuCLS/csv/'
# Maps an annotation's `raw_classification` string to the integer label that
# save_mask_nucls writes into the type map. Label 0 is not assigned to any
# class here; the maps are zero-initialised, so 0 marks pixels without an
# annotation. Note that 'fov' is listed first but carries the highest id (14).
NUCLS_CLASS_MAP = {
    'fov': 14,
    'tumor': 1,
    'fibroblast': 2,
    'lymphocyte': 3,
    'plasma_cell': 4,
    'macrophage': 5,
    'mitotic_figure': 6,
    'vascular_endothelium': 7,
    'myoepithelium': 8,
    'apoptotic_body': 9,
    'neutrophil': 10,
    'ductal_epithelium': 11,
    'eosinophil': 12,
    'unlabeled': 13
}

In [3]:
def get_file_names(path, extension):
    """Collect the entries of ``path`` whose names end with ``extension``.

    Every matching entry name is joined onto ``path``; the joined paths are
    returned as a list in ascending sorted order.
    """
    matches = [
        os.path.join(path, entry)
        for entry in os.listdir(path)
        if entry.endswith(extension)
    ]
    matches.sort()
    return matches

def csv_to_df(csv_path):
    """Load the CSV file at ``csv_path`` into a pandas DataFrame.

    The first row is used as the column header and the first column
    becomes the DataFrame index.
    """
    return pd.read_csv(csv_path, index_col=0, header=0)

def save_mask_nucls(img_dir, csv_dir, mask_dir):
    """Build one ``.mat`` mask file per image from its annotation CSV.

    For every ``.png`` in ``img_dir`` the CSV with the same file stem is read
    from ``csv_dir`` and a dict with the following entries is written to
    ``<mask_dir>/<stem>.mat``:

    * ``inst_map``: (height, width) int32; the 1-based row number of the
      annotation at each annotated pixel, 0 elsewhere.
    * ``type_map``: (height, width) int32; the ``NUCLS_CLASS_MAP`` label of
      the annotation at each annotated pixel, 0 elsewhere.
    * ``id``: (n, 1) int32 instance ids ``1..n``.
    * ``class``: (n, 1) uint8 class label of each instance.

    When annotations overlap, the later CSV row overwrites the earlier one.

    Args:
        img_dir: Directory containing the ``.png`` images.
        csv_dir: Directory containing the annotation CSVs. Each needs the
            columns ``coords_x``, ``coords_y`` (comma-separated integers) and
            ``raw_classification``.
        mask_dir: Output directory; created if it does not exist.

    Raises:
        FileNotFoundError: If an image has no CSV with the same stem.
        KeyError: If a ``raw_classification`` is not in ``NUCLS_CLASS_MAP``.
    """
    os.makedirs(mask_dir, exist_ok=True)
    img_paths = get_file_names(img_dir, '.png')

    # Pair images and annotations by file stem instead of by position in two
    # independently sorted lists: a single missing or extra file would
    # otherwise silently shift every following pair.
    csv_by_stem = {
        os.path.splitext(os.path.basename(csv_path))[0]: csv_path
        for csv_path in get_file_names(csv_dir, 'csv')
    }

    for img_path in tqdm(img_paths):
        file_name = os.path.splitext(os.path.basename(img_path))[0]
        if file_name not in csv_by_stem:
            raise FileNotFoundError(
                f"No annotation CSV named '{file_name}.csv' in '{csv_dir}' "
                f"for image '{img_path}'"
            )

        # Only the image size is needed; close the file handle right away.
        with Image.open(img_path) as img:
            width, height = img.size
        ann = csv_to_df(csv_by_stem[file_name])

        # Create mask
        num_inst = len(ann)
        type_map = np.zeros((height, width), dtype=np.int32)
        inst_map = np.zeros((height, width), dtype=np.int32)
        inst_ids = np.zeros((num_inst, 1), dtype=np.int32)
        class_ids = np.zeros((num_inst, 1), dtype=np.uint8)
        for idx in range(num_inst):
            row = ann.iloc[idx]
            inst_id = idx + 1
            # Coordinates are shifted by -1 (presumably 1-based -> 0-based;
            # confirm against the dataset docs) and clamped to the image on
            # both sides, so that 0 cannot turn into -1 and wrap to the last
            # row/column.
            # NOTE(review): only the listed coordinate pixels are labelled.
            # If coords_x/coords_y are polygon vertices rather than every
            # pixel of the nucleus, the interior stays 0 -- confirm.
            coords_x = [min(max(int(x) - 1, 0), width - 1)
                        for x in row['coords_x'].split(',')]
            coords_y = [min(max(int(y) - 1, 0), height - 1)
                        for y in row['coords_y'].split(',')]
            label = NUCLS_CLASS_MAP[row['raw_classification']]

            inst_ids[idx] = inst_id
            class_ids[idx] = label
            inst_map[coords_y, coords_x] = inst_id
            type_map[coords_y, coords_x] = label

        mask = {
            'inst_map': inst_map,
            'id': inst_ids,
            'class': class_ids,
            'type_map': type_map,
        }

        # Save mask as mat file
        scipy.io.savemat(os.path.join(mask_dir, file_name + '.mat'), mask)

# Generate one .mat mask per image; the files are written next to the rgb/
# and csv/ folders, into mask_mat/.
save_mask_nucls(IMG_DIR, CSV_DIR, './data/benchmarks/NuCLS/mask_mat/')

  1%|          | 11/1744 [00:00<00:16, 102.85it/s]

100%|██████████| 1744/1744 [00:15<00:00, 114.92it/s]


In [4]:
# Copy the dataset into NuCLS-split/ using an 80/10/10 ratio with a fixed seed.
# NOTE(review): the subfolders of NuCLS/ (rgb, csv, mask_mat, ...) are
# presumably split independently of each other -- verify that the image, CSV
# and mask belonging to the same sample always end up in the same split.
splitfolders.ratio('./data/benchmarks/NuCLS/', output='./data/benchmarks/NuCLS-split/',
                    seed=2023,
                    ratio=(0.8, 0.1, 0.1))

Copying files: 8720 files [00:02, 3391.69 files/s]
