Create the NIH Pancreas-CT dataset in the nnU-Net raw data format.

We don't use the handy shorthand methods used in other notebooks, because they no longer exist.

In [1]:
# standard stuff
import os
import json
from collections import OrderedDict

# medical imaging stuff
import dicom2nifti
import SimpleITK as sitk

In [2]:
# Task configuration
# Task number; it is used to build the nnU-Net task folder name
TASK = "501"

# Scan ids that are written to the test folders; all other scans go to training
TEST_IDS = [
    "0001", "0009", "0017", "0026", "0033", "0041",
    "0049", "0057", "0065", "0073", "0081", "0082",
]

# Switches for the individual steps of the pipeline
create_images = False
create_labels = False
overwrite_datasetjson = True

In [3]:
# nih paths
# The base folder defaults to the original local location, but can be overridden
# with the THESIS_BASE_DIR environment variable so the notebook is not tied to
# one user's machine.
BASE_DIR = os.environ.get("THESIS_BASE_DIR", "C:/Users/ikke_/OneDrive/Documenten/Thesis")
NIH_DIR = f"{BASE_DIR}/Data/NIH_pancreas_CT"

# nnUnet paths
TASK_NAME = f"Task{TASK}"
DATA_DIR = f"{BASE_DIR}/Data/nnUNet_raw_data_base"
TASK_DIR = f"{DATA_DIR}/nnUNet_raw_data/{TASK_NAME}"
# Images and labels, split into training (Tr) and test (Ts) folders
TRAIN_DATA_DIR = f"{TASK_DIR}/imagesTr"
TRAIN_LABEL_DIR = f"{TASK_DIR}/labelsTr"
TEST_DATA_DIR = f"{TASK_DIR}/imagesTs"
TEST_LABEL_DIR = f"{TASK_DIR}/labelsTs"

In [4]:
def create_folders():
    """Create the train/test image and label folders of the task if they are missing."""
    for folder in (TRAIN_DATA_DIR, TRAIN_LABEL_DIR, TEST_DATA_DIR, TEST_LABEL_DIR):
        os.makedirs(folder, exist_ok=True)

def convert_images():
    """Convert every NIH DICOM scan to a reoriented NIfTI file in the nnU-Net folders.

    For each scan folder below ``{NIH_DIR}/Pancreas-CT`` the two nested folder
    levels are walked, the innermost DICOM folder is converted with dicom2nifti,
    the result is reoriented to LPS and saved as ``panc_<scan_id>_0000.nii.gz``
    in the test image folder (scan id in ``TEST_IDS``) or the training image
    folder (all other scans).

    Raises:
        FileNotFoundError: if the conversion did not produce the expected
            intermediate NIfTI file.
    """
    img_path = f'{NIH_DIR}/Pancreas-CT'

    # Intermediate file that the conversion is expected to write into the
    # training image folder (name taken from the original implementation).
    nifti_path = TRAIN_DATA_DIR+"/none_pancreas.nii.gz"

    print("\nCreating nn-Unet image data\n")
    for scan_dir in sorted(os.listdir(img_path)):
        scan_path = img_path + "/" + scan_dir
        if not os.path.isdir(scan_path):
            # stray files next to the scan folders cannot be listed; skip them
            continue
        print(scan_dir)

        # the scan id is the last four characters of the folder name
        scan_id = scan_dir[-4:]

        # NIH data is very nested so we go deep. Every level gets its own path
        # variable, so a second entry on a level does not inherit the path of
        # the previous entry.
        for dir1 in sorted(os.listdir(scan_path)):
            dir1_path = scan_path + "/" + dir1
            if not os.path.isdir(dir1_path):
                continue
            for dir2 in sorted(os.listdir(dir1_path)):
                dicom_dir = dir1_path + "/" + dir2
                if not os.path.isdir(dicom_dir):
                    continue

                # Drop a leftover intermediate file so that a stale volume of a
                # previous scan can never be read for the current scan.
                if os.path.exists(nifti_path):
                    os.remove(nifti_path)

                # Make NIfTI file from the current DICOM directory
                print(f'Making nifti file from {dicom_dir}, scan_id = {scan_id}')
                print(TRAIN_DATA_DIR)
                dicom2nifti.convert_directory(dicom_dir, TRAIN_DATA_DIR, compression=True)
                if not os.path.exists(nifti_path):
                    raise FileNotFoundError(
                        f'Conversion of {dicom_dir} did not produce {nifti_path}'
                    )

                # Reorient to LPS
                img = sitk.ReadImage(nifti_path)
                img = sitk.DICOMOrient(img, 'LPS')

                # Move to either train or test folder
                # 0000 = hardcoded CT modality. We only have CT.
                target_dir = TEST_DATA_DIR if scan_id in TEST_IDS else TRAIN_DATA_DIR
                file_path = target_dir+f'/panc_{scan_id}_0000.nii.gz'

                # Save
                print(f'Saving to {file_path}')
                sitk.WriteImage(img, file_path)

                # The intermediate file must not stay behind in the training
                # image folder, where it would look like an extra training case.
                os.remove(nifti_path)

def convert_labels():
    """Reorient every NIH label volume to LPS and store it in the nnU-Net label folders.

    Labels whose scan id is listed in ``TEST_IDS`` are written to the test label
    folder, all others to the training label folder, as ``panc_<scan_id>.nii.gz``.
    """
    source_dir = f'{NIH_DIR}/labels'

    print("\nCreating nn-Unet label data\n")
    for filename in os.listdir(source_dir):

        # The scan id is taken from characters 5 to 8 of the file name
        scan_id = filename[5:9]
        print(scan_id)

        # Load the label volume and bring it into LPS orientation
        reoriented = sitk.DICOMOrient(
            sitk.ReadImage(os.path.join(source_dir, filename)), 'LPS'
        )

        # Pick the test or training label folder based on the scan id
        target_dir = TEST_LABEL_DIR if scan_id in TEST_IDS else TRAIN_LABEL_DIR
        target_path = os.path.join(target_dir, f'panc_{scan_id}.nii.gz')

        # Write the reoriented label
        print(f'Saving label to {target_path}')
        sitk.WriteImage(reoriented, target_path)

def sanity_data_check():
    """Print how many entries the training image and training label folders hold."""
    image_count = len(os.listdir(TRAIN_DATA_DIR))
    label_count = len(os.listdir(TRAIN_LABEL_DIR))
    print("train image files:", image_count)
    print("train label files:", label_count)

def generate_dataset_json(overwrite_json_file):
    """Write the ``dataset.json`` description file into the task folder.

    The training and test entries are built from the file names found in the
    training and test label folders.

    Args:
        overwrite_json_file: when true, an existing ``dataset.json`` is
            replaced; when false, an existing file is left untouched.
    """
    json_path = os.path.join(TASK_DIR, 'dataset.json')

    # check whether already exists
    json_file_exist = os.path.exists(json_path)
    if json_file_exist:
        print(f'dataset.json already exist! {json_path}')
        if not overwrite_json_file:
            return

    # create new dataset.json file
    json_dict = OrderedDict()
    json_dict['name'] = TASK_NAME
    # NOTE(review): the description says "500" while TASK is "501" -- confirm
    json_dict['description'] = "500"
    json_dict['tensorImageSize'] = "3D"
    json_dict['reference'] = "TCIA, NIH"
    json_dict['licence'] = "TCIA"
    json_dict['release'] = "0.0"
    json_dict['modality'] = {"0": "CT"}
    json_dict['labels'] = {
        "0": "background",
        "1": "pancreas"
    }

    # get train and test filenames; sorted so the file content is reproducible
    train_ids = sorted(os.listdir(TRAIN_LABEL_DIR))
    test_ids = sorted(os.listdir(TEST_LABEL_DIR))
    json_dict['numTraining'] = len(train_ids)
    json_dict['numTest'] = len(test_ids)

    # create training and test entries
    # NOTE: no modality in train image and labels in dataset.json!!
    json_dict['training'] = [
        {'image': "./imagesTr/%s" % i, "label": "./labelsTr/%s" % i} for i in train_ids
    ]
    # test cases live in the Ts folders, not in the Tr folders
    json_dict['test'] = [
        {'image': "./imagesTs/%s" % i, "label": "./labelsTs/%s" % i} for i in test_ids
    ]

    # Save
    with open(json_path, 'w') as f:
        json.dump(json_dict, f, indent=4, sort_keys=True)

    if json_file_exist:
        print('dataset.json overwritten!')
    else:
        print('dataset.json created!')

In [5]:
# Make sure the nnU-Net folder layout exists before anything is written
create_folders()

# The conversion steps only run when they are enabled in the configuration
if create_images:
    convert_images()
if create_labels:
    convert_labels()

# Report how many training images and labels are present
sanity_data_check()

# Write the description of the dataset
generate_dataset_json(overwrite_datasetjson)

train image files: 68
train label files: 70
dataset.json created!
