In [1]:
import sys
sys.path.append("/host/d/Github/")
import os
import numpy as np
import pandas as pd
import nibabel as nb
import re
import json
import shutil
import Aorta_segmentation_ZC.functions_collection as ff 
import Aorta_segmentation_ZC.Data_processing as Data_processing

  from .autonotebook import tqdm as notebook_tqdm


### Process the resampled data

### patient split

#### for public dataset

In [None]:
# Load the table describing the resampled public-dataset cases.
patient_list = pd.read_excel('/host/d/Data/CTA/Patient_lists/resampled_data_info.xlsx')
print(patient_list.shape)

### get train and test split
# Shuffle the row indices with a fixed seed. A row is labelled 'train' when its
# index lands in the first 76 shuffled positions and 'test' otherwise
# (the original note expects this to leave 20 test patients).
n_patients = patient_list.shape[0]
n = np.arange(0, n_patients)
np.random.seed(42)
np.random.shuffle(n)

# position_in_shuffle[i] is where row i ended up in the shuffled order,
# i.e. the inverse of the permutation stored in n.
position_in_shuffle = np.empty(n_patients, dtype=int)
position_in_shuffle[n] = np.arange(n_patients)

batch_column = ['train' if position < 76 else 'test' for position in position_in_shuffle]
patient_list['batch'] = batch_column

# Save the table with the new 'batch' column next to the input table.
patient_list.to_excel('/host/d/Data/CTA/Patient_lists/resampled_data_info_with_train_test_split.xlsx', index=False)

#### for our TAA dataset

In [8]:
# Collect every entry matching TAA_peking/* under the processed-data root, then
# pass the result through ff.sort_timeframe (project helper; presumably orders
# the paths by a number embedded in them -- confirm against its definition).
taa_patient_folders = ff.find_all_target_files(['TAA_peking/*'], '/host/d/Data/CTA/processed_data')
patient_list = ff.sort_timeframe(taa_patient_folders, 0)
print(patient_list.shape)

(47,)


In [13]:
# In total we have 47 patients. After a seeded shuffle, the first 40 patients of
# the shuffled order are used for training and the remaining 7 for testing.
# Build a {patient ID: 'train' | 'test'} dictionary from that order.
n_train = 40

np.random.seed(42)
shuffled_indices = np.random.permutation(patient_list.shape[0])
# Keep the shuffled order in its own variable. Overwriting patient_list would
# make a second run of this cell shuffle an already-shuffled array and silently
# produce a different split.
shuffled_patients = patient_list[shuffled_indices]

batch_dict = {}
for position, patient_path in enumerate(shuffled_patients):
    # the patient ID is the name of the patient's folder
    patient_id = str(os.path.basename(patient_path))
    batch_dict[patient_id] = 'train' if position < n_train else 'test'

# load our resampled data info; its 'subclass' column is matched against the
# patient folder names collected above
data_info = pd.read_excel('/host/d/Data/CTA/Patient_lists/resampled_data_info_TAA.xlsx')
subclasses = [str(subclass) for subclass in data_info['subclass']]

# Fail with a readable message instead of a bare KeyError on the first miss.
missing = sorted(set(subclasses) - set(batch_dict))
if missing:
    raise KeyError('no train/test assignment found for subclass(es): %s' % missing)

# every row inherits the batch of the patient it belongs to
batch = [batch_dict[subclass] for subclass in subclasses]
data_info['batch'] = batch
data_info.to_excel('/host/d/Data/CTA/Patient_lists/resampled_data_info_with_train_test_split_TAA.xlsx', index=False)


### prepare data into nnUNet raw

In [None]:
# Export every case listed in the TAA split table into the nnU-Net raw layout:
#   imagesTr / imagesTs : a copy of img_resampled.nii.gz, named AortaTAA_XXXX_0000.nii.gz
#   labelsTr            : a binarised copy of seg_resampled.nii.gz (training cases only)
patient_list = pd.read_excel('/host/d/Data/CTA/Patient_lists/resampled_data_info_with_train_test_split_TAA.xlsx')
print(patient_list.shape)

save_folder = '/host/d/Data/CTA/nnUNet_raw/Dataset504_AortaTAA'
ff.make_folder([save_folder, os.path.join(save_folder, 'imagesTr'), os.path.join(save_folder, 'imagesTs'), os.path.join(save_folder, 'labelsTr')])

processed_root = '/host/d/Data/CTA/processed_data'

for i in range(0, len(patient_list)):
    row = patient_list.iloc[i]
    patient_class = row['class']
    patient_subclass = str(row['subclass'])
    patient_id = row['id']
    patient_index = row['patient_index']
    patient_split = row['batch']

    # training cases go to the *Tr folders, everything else to *Ts
    phase = 'Tr' if patient_split == 'train' else 'Ts'

    # shared pieces of the source and destination paths
    patient_folder = os.path.join(processed_root, patient_class, patient_subclass, patient_id)
    case_name = 'AortaTAA_' + str(patient_index).zfill(4)

    # copy image data unchanged; '_0000' is the channel suffix of the image file
    img_file = os.path.join(patient_folder, 'img_resampled.nii.gz')
    img_save_path = os.path.join(save_folder, 'images' + phase, case_name + '_0000.nii.gz')
    shutil.copyfile(img_file, img_save_path)

    # only training cases get a label file
    if phase != 'Tr':
        continue

    mask_file = os.path.join(patient_folder, 'seg_resampled.nii.gz')
    mask_nii = nb.load(mask_file)

    # Collapse every non-zero label to 1 (aorta) and keep 0 as background.
    # Store the result as uint8: the label map is an integer class map, and
    # get_fdata() would otherwise make the saved file float64.
    mask_data_converted = (mask_nii.get_fdata() > 0).astype(np.uint8)
    # No header is passed, so the on-disk dtype follows the uint8 array.
    mask_nii_converted = nb.Nifti1Image(mask_data_converted, mask_nii.affine)

    # save the binarised mask under the same case name as the image
    mask_save_path = os.path.join(save_folder, 'labels' + phase, case_name + '.nii.gz')
    nb.save(mask_nii_converted, mask_save_path)

(90, 13)


### write the json file

In [18]:
# Read the template dataset description (dataset_raw.json) stored in the
# dataset folder; it is kept in `data` as parsed JSON.
save_folder = '/host/d/Data/CTA/nnUNet_raw/Dataset504_AortaTAA'
json_example = os.path.join(save_folder, 'dataset_raw.json')
with open(json_example, 'r') as template_file:
    data = json.loads(template_file.read())

# Show the parsed template so its fields can be inspected.
print(data)

{'name': 'AortaTAA', 'licence': 'CC-BY-SA 4.0', 'relase': '1.0 04/05/2018', 'tensorImageSize': '3D', 'channel_names': {'0': 'CT'}, 'labels': {'background': 0, 'aorta': 1}, 'numTraining': 60, 'numTest': 40, 'file_ending': '.nii.gz', 'training': [{'image': './imagesTr/hippocampus_367.nii.gz', 'label': './labelsTr/hippocampus_367.nii.gz'}, {'image': './imagesTr/hippocampus_304.nii.gz', 'label': './labelsTr/hippocampus_304.nii.gz'}, {'image': './imagesTr/hippocampus_204.nii.gz', 'label': './labelsTr/hippocampus_204.nii.gz'}, {'image': './imagesTr/hippocampus_279.nii.gz', 'label': './labelsTr/hippocampus_279.nii.gz'}, {'image': './imagesTr/hippocampus_308.nii.gz', 'label': './labelsTr/hippocampus_308.nii.gz'}, {'image': './imagesTr/hippocampus_375.nii.gz', 'label': './labelsTr/hippocampus_375.nii.gz'}, {'image': './imagesTr/hippocampus_216.nii.gz', 'label': './labelsTr/hippocampus_216.nii.gz'}, {'image': './imagesTr/hippocampus_316.nii.gz', 'label': './labelsTr/hippocampus_316.nii.gz'}, {'i

In [19]:
# Fill the template loaded into `data` with the real training/test case lists
# and write the result as dataset.json in `save_folder`.
patient_list = pd.read_excel('/host/d/Data/CTA/Patient_lists/resampled_data_info_with_train_test_split_TAA.xlsx')
print(patient_list.shape)

train_list = []
test_list = []
for patient_index, patient_split in zip(patient_list['patient_index'], patient_list['batch']):
    patient_name = 'AortaTAA_' + str(patient_index).zfill(4)

    # anything that is not a training case is listed as a test image only
    if patient_split != 'train':
        phase = 'Ts'
        test_list.append("./images%s/%s_0000.nii.gz" % (phase, patient_name))
        continue

    # training cases are listed as an image/label pair
    phase = 'Tr'
    train_entry = {
        'image': "./images%s/%s_0000.nii.gz" % (phase, patient_name),
        'label': "./labels%s/%s.nii.gz" % (phase, patient_name),
    }
    train_list.append(train_entry)

# replace the template's case lists and counts with the real ones
data.update({
    "training": train_list,
    "test": test_list,
    "numTraining": len(train_list),
    "numTest": len(test_list),
})

save_json_file = os.path.join(save_folder, 'dataset.json')
with open(save_json_file, 'w') as json_out:
    json.dump(data, json_out, indent=4)

(90, 13)


## check dimensions and labels

In [23]:
# Sanity-check the exported training set: for every training image, print the
# label values found in its segmentation and report any image/label shape mismatch.
main_path = '/host/d/Data/CTA/nnUNet_raw/Dataset504_AortaTAA'
image_suffix = '_0000.nii.gz'  # channel suffix appended to every exported image

patients = ff.find_all_target_files(['imagesTr/*.nii.gz'], main_path)
for p in patients:
    print('path : ', p)

    # Recover the case name by stripping the channel suffix. A fixed-width
    # slice would pick the wrong label file once an index outgrows 4 digits,
    # because zfill(4) pads but never truncates.
    file_name = os.path.basename(p)
    if file_name.endswith(image_suffix):
        patient_id = file_name[:-len(image_suffix)]
    else:
        # unexpected file name: fall back to the original fixed-width slice
        patient_id = file_name[0:13]

    seg = nb.load(os.path.join(main_path, 'labelsTr', patient_id + '.nii.gz')).get_fdata()
    print('unique labels:', np.unique(seg))

    # The image shape is read from the loaded image object without calling
    # get_fdata(), so the voxel data of the image is not converted here.
    image_shape = nb.load(p).shape
    seg_shape = seg.shape
    if image_shape != seg_shape:
        print(patient_id)
        print('image shape:', image_shape, 'seg shape:', seg_shape)


path :  /host/d/Data/CTA/nnUNet_raw/Dataset504_AortaTAA/imagesTr/AortaTAA_0000_0000.nii.gz
unique labels: [0. 1.]
path :  /host/d/Data/CTA/nnUNet_raw/Dataset504_AortaTAA/imagesTr/AortaTAA_0001_0000.nii.gz
unique labels: [0. 1.]
path :  /host/d/Data/CTA/nnUNet_raw/Dataset504_AortaTAA/imagesTr/AortaTAA_0002_0000.nii.gz
unique labels: [0. 1.]
path :  /host/d/Data/CTA/nnUNet_raw/Dataset504_AortaTAA/imagesTr/AortaTAA_0003_0000.nii.gz
unique labels: [0. 1.]
path :  /host/d/Data/CTA/nnUNet_raw/Dataset504_AortaTAA/imagesTr/AortaTAA_0004_0000.nii.gz
unique labels: [0. 1.]
path :  /host/d/Data/CTA/nnUNet_raw/Dataset504_AortaTAA/imagesTr/AortaTAA_0005_0000.nii.gz
unique labels: [0. 1.]
path :  /host/d/Data/CTA/nnUNet_raw/Dataset504_AortaTAA/imagesTr/AortaTAA_0006_0000.nii.gz
unique labels: [0. 1.]
path :  /host/d/Data/CTA/nnUNet_raw/Dataset504_AortaTAA/imagesTr/AortaTAA_0007_0000.nii.gz
unique labels: [0. 1.]
path :  /host/d/Data/CTA/nnUNet_raw/Dataset504_AortaTAA/imagesTr/AortaTAA_0008_0000.nii.