In [60]:
import os
import pandas as pd
import cv2
import numpy as np
import scipy.io
from tqdm import tqdm
import splitfolders

In [56]:
def get_file_names(path, extension):
    """List the files in a directory that end with a given extension.

    Args:
        path: Directory to scan (non-recursive).
        extension: Suffix to match, e.g. ``'.png'``. The comparison is
            case-sensitive.

    Returns:
        A sorted list of paths, each one being ``path`` joined with the
        name of a matching directory entry.
    """
    matching = [
        os.path.join(path, entry)
        for entry in os.listdir(path)
        if entry.endswith(extension)
    ]
    matching.sort()
    return matching

In [57]:
def csv_to_df(csv_path):
    """Load a CSV file into a pandas DataFrame.

    The first row of the file is used as the column header and the first
    column is used as the row index.

    Args:
        csv_path: Path (or file-like object) of the CSV to read.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(csv_path, index_col=0, header=0)

In [58]:
def nucls(mask_path, csv_path, mask_mat_dir):
    """Convert PNG instance masks into per-image ``.mat`` files.

    Every ``.png`` in ``mask_path`` is paired, by position in the sorted
    file lists, with a ``.csv`` in ``csv_path``. For each mask the instance
    id of a pixel is the product of the first two channels returned by
    ``cv2.imread`` (BGR order) and its class label is the third channel.
    Instances are renumbered 1..N in ascending order of their raw id.

    The saved ``.mat`` file contains:
        * ``inst_map``: (H, W) int32 map of renumbered instance ids.
        * ``type_map``: (H, W) int32 map of class labels (99 remapped to 13).
        * ``id``: (K, 1) int32 column of instance ids, zero-padded.
        * ``class``: (K, 1) int32 column of class labels, zero-padded.
    ``K`` is the number of rows in the CSV, or the number of instances
    found in the mask if that is larger.

    Args:
        mask_path: Directory containing the ``.png`` masks.
        csv_path: Directory containing the ``.csv`` annotation files.
        mask_mat_dir: Output directory; created if it does not exist.

    Raises:
        OSError: If a mask file cannot be decoded by OpenCV.
        IndexError: If there are fewer CSV files than mask files.
    """
    # Create the output directory (no-op when it already exists).
    os.makedirs(mask_mat_dir, exist_ok=True)

    mask_files = get_file_names(mask_path, '.png')
    ann_files = get_file_names(csv_path, '.csv')

    for file_idx in tqdm(range(len(mask_files))):
        # Masks and annotations are matched purely by sorted position.
        ann_file = ann_files[file_idx]
        mask_file = mask_files[file_idx]
        # Base name up to the first dot; basename() keeps this portable.
        file_name = os.path.basename(mask_file).split('.')[0]

        ann = csv_to_df(ann_file)

        mask = cv2.imread(mask_file)
        if mask is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise OSError('Could not read mask image: ' + mask_file)
        height, width = mask.shape[:2]
        first_ch, second_ch, labels = cv2.split(mask)

        # Multiply in int32: cv2.multiply on uint8 inputs saturates at 255,
        # which would merge every instance whose product exceeds 255.
        instances = first_ch.astype(np.int32) * second_ch.astype(np.int32)

        # The two smallest ids are treated as invalid (one is background and
        # one is fov) and skipped.
        valid_inst_ids = np.unique(instances)[2:]

        inst_map = np.zeros((height, width), dtype=np.int32)
        type_map = np.zeros((height, width), dtype=np.int32)
        # One slot per annotation row; grown if the mask holds more instances
        # so the per-instance writes below cannot run out of bounds.
        n_slots = max(len(ann), len(valid_inst_ids))
        inst_ids = np.zeros((n_slots, 1), dtype=np.int32)
        class_ids = np.zeros((n_slots, 1), dtype=np.int32)

        for inst_idx, raw_inst_id in enumerate(valid_inst_ids):
            inst_mask = instances == raw_inst_id
            inst_labels = np.unique(labels[inst_mask])
            if len(inst_labels) != 1:
                # Stops processing the remaining instances of this image;
                # whatever was filled in so far is still saved.
                print('Error: more than one label in instance mask')
                break
            label = int(inst_labels[0])
            if label == 0 or label == 253:
                # Reported only; the label is still written out.
                print('Error: invalid label')
            if label == 99:
                label = 13  # change unlabeled 99 to 13

            inst_map[inst_mask] = inst_idx + 1
            type_map[inst_mask] = label
            inst_ids[inst_idx] = inst_idx + 1
            class_ids[inst_idx] = label

        mask_dict = {
            'inst_map': inst_map,
            'type_map': type_map,
            'id': inst_ids,
            'class': class_ids,
        }

        # os.path.join works whether or not mask_mat_dir ends with a separator.
        scipy.io.savemat(os.path.join(mask_mat_dir, file_name + '.mat'), mask_dict)

In [59]:
# Convert every NuCLS PNG mask (paired with its CSV annotation) into a .mat
# file under the mask_mat directory.
mask_path = './data/benchmarks/NuCLS/mask/'
csv_path = './data/benchmarks/NuCLS/csv/'
nucls(
    mask_path=mask_path,
    csv_path=csv_path,
    mask_mat_dir='./data/benchmarks/NuCLS/mask_mat/',
)

100%|██████████| 1744/1744 [00:14<00:00, 121.60it/s]


In [61]:
# Split the NuCLS directory into NuCLS-split using ratios 0.8 / 0.1 / 0.1;
# the fixed seed keeps the partition reproducible across runs.
splitfolders.ratio(
    './data/benchmarks/NuCLS/',
    output='./data/benchmarks/NuCLS-split/',
    seed=2023,
    ratio=(0.8, 0.1, 0.1),
)

Copying files: 8720 files [00:02, 3439.13 files/s]
