In [7]:
import sys
sys.path.append('/workspace/Documents')
import os
import numpy as np 
import nibabel as nb
import pandas as pd
import sam_cmr.functions_collection as ff

# Run the following cells for ACDC

In [8]:
# ACDC locations: the temporal image data and the folder holding the patient spreadsheets.
data_path = '/mnt/camca_NAS/SAM_for_CMR/data/ACDC_database/temporal'
patient_list_path = '/mnt/camca_NAS/SAM_for_CMR/data/Patient_list/'

# Per-patient metadata; rows are looked up by 'patient_id' when the patient list is built.
basic_info_xlsx = os.path.join(patient_list_path, 'ACDC_basic_info.xlsx')
basic_info_file = pd.read_excel(basic_info_xlsx)

In [9]:
# Patient folders per split; all_dataset passes both split patterns in a single call.
training_dataset = ff.find_all_target_files(['training/*'], data_path)
testing_dataset = ff.find_all_target_files(['testing/*'], data_path)
all_dataset = ff.find_all_target_files(['training/*', 'testing/*'], data_path)

# Expected: 100 training cases and 50 testing cases, 150 cases in total.
print(len(training_dataset), len(testing_dataset), len(all_dataset))

# Divide the training dataset into 5 batches (batch 0 to batch 4): batch_size is
# len(training_dataset) / 5 rounded up to the next integer.
# The testing dataset is an individual batch, batch 5.
full_batches, leftover = divmod(len(training_dataset), 5)
batch_size = (full_batches + 1) if leftover else full_batches

100 50 150


In [10]:
# Build the ACDC patient list: one row per patient folder, combining the batch
# assignment, the paths of the processed NIfTI files and the per-patient metadata
# from the basic-info spreadsheet, then save the table as an Excel file.

# Output column -> file name inside each patient folder.
#   *_full:          all slices (including non-zero slices and zero slices)
#   *_nonzero:       only nonzero slices - strict: both ED and ES need to have GT segmentation
#   *_nonzero_loose: only nonzero slices - loose: only ED or ES need to have GT segmentation
nifti_files = {
    'image_full_slice_file': '4d_temporal_img_full.nii.gz',
    'seg_full_slice_file': '4d_temporal_seg_full.nii.gz',
    'image_nonzero_slice_file': '4d_temporal_img_nonzero.nii.gz',
    'seg_nonzero_slice_file': '4d_temporal_seg_nonzero.nii.gz',
    'image_nonzero_slice_file_loose': '4d_temporal_img_nonzero_loose.nii.gz',
    'seg_nonzero_slice_file_loose': '4d_temporal_seg_nonzero_loose.nii.gz',
}

# (output column, basic-info spreadsheet column) pairs, copied as-is for each patient.
info_columns = [
    ('start_slice_name', 'start_slice_name'),                                # image starts from 'base' or 'apex'
    ('total_slice_num', 'total_slice_num'),                                  # image total slice number
    ('nonzero_slice_num', 'nonzero_slice_num'),                              # image nonzero slice number
    ('nonzero_slice_start_index', 'nonzero_slice_start_index'),              # image nonzero slice start index
    ('nonzero_slice_end_index', 'nonzero_slice_end_index'),                  # image nonzero slice end index
    ('nonzero_slice_num_loose', 'nonzero_slice_num_loose'),                  # same three values, loose criterion
    ('nonzero_slice_start_index_loose', 'nonzero_slice_start_index_loose'),
    ('nonzero_slice_end_index_loose', 'nonzero_slice_end_index_loose'),
    ('original_time_frame_num', 'tf_num'),                                   # original time frame number
    ('ED', 'ED'),                                                            # ED time frame
    ('ES', 'ES'),                                                            # ES time frame
    ('processed_time_frame_num', 'processed_time_frame_num'),                # always 15, ALWAYS including ED and ES
    ('ED_index_in_processed_time_frame', 'ED_index_in_processed_time_frame'),
    ('ES_index_in_processed_time_frame', 'ES_index_in_processed_time_frame'),
    ('processed_time_frame_index_list', 'processed_time_frame_index_list'),  # indices of these 15 time frames
]

output_columns = (['patient_id', 'patient_group', 'batch_index']
                  + list(nifti_files)
                  + [out_name for out_name, _ in info_columns])

result = []
# Count training cases separately so the batch index does not depend on where the
# training folders happen to sit inside all_dataset (it also contains testing folders).
training_seen = 0
for folder in all_dataset:
    patient_id = os.path.basename(folder)
    # The split name ('training' / 'testing') is the parent folder of the patient folder.
    patient_group = os.path.basename(os.path.dirname(folder))

    # Find the corresponding row in the basic info spreadsheet; fail loudly if it is missing
    # instead of surfacing an opaque IndexError from .iloc[0].
    matches = basic_info_file[basic_info_file['patient_id'] == patient_id]
    if len(matches) == 0:
        raise ValueError(f"patient_id {patient_id!r} (folder {folder}) not found in the basic info spreadsheet")
    info = matches.iloc[0]

    if patient_group.startswith('tra'):  # training: batches 0..4, batch_size cases each
        batch_index = training_seen // batch_size
        training_seen += 1
    else:  # testing: its own batch
        batch_index = 5

    result.append([patient_id, patient_group, batch_index]
                  + [os.path.join(folder, file_name) for file_name in nifti_files.values()]
                  + [info[source_name] for _, source_name in info_columns])

df = pd.DataFrame(result, columns=output_columns)

df.to_excel(os.path.join(patient_list_path, 'ACDC_Patient_List_training_testing.xlsx'), index=False)

# Run the following cells for STACOM

In [3]:
# STACOM locations: the temporal image data and the folder holding the patient spreadsheets.
data_path = '/mnt/camca_NAS/SAM_for_CMR/data/STACOM_database/temporal'
patient_list_path = '/mnt/camca_NAS/SAM_for_CMR/data/Patient_list/'

# Per-patient metadata; rows are looked up by 'patient_id' when the patient list is built.
basic_info_xlsx = os.path.join(patient_list_path, 'STACOM_basic_info.xlsx')
basic_info_file = pd.read_excel(basic_info_xlsx)

In [4]:
# Every folder directly under data_path is one case.
all_dataset = ff.find_all_target_files(['*'], data_path)

# Expected: 100 training cases in total.
print(len(all_dataset))

# Divide the dataset into 5 batches (batch 0 to batch 4): batch_size is
# len(all_dataset) / 5 rounded up to the next integer.
full_batches, leftover = divmod(len(all_dataset), 5)
batch_size = (full_batches + 1) if leftover else full_batches

print(batch_size)

100
20


In [6]:
# Build the STACOM patient list: one row per patient folder, combining the batch
# assignment, the paths of the processed NIfTI files and the per-patient metadata
# from the basic-info spreadsheet, then save the table as an Excel file.

# Output column -> file name inside each patient folder.
#   *_full:          all slices (including non-zero slices and zero slices)
#   *_nonzero:       only nonzero slices - strict: both ED and ES need to have gt segmentation
#   *_nonzero_loose: only nonzero slices - loose: either ED or ES has gt segmentation
nifti_files = {
    'image_full_slice_file': '4d_temporal_img_full.nii.gz',
    'seg_full_slice_file': '4d_temporal_seg_full.nii.gz',
    'image_nonzero_slice_file': '4d_temporal_img_nonzero.nii.gz',
    'seg_nonzero_slice_file': '4d_temporal_seg_nonzero.nii.gz',
    'image_nonzero_slice_file_loose': '4d_temporal_img_nonzero_loose.nii.gz',
    'seg_nonzero_slice_file_loose': '4d_temporal_seg_nonzero_loose.nii.gz',
}

# (output column, basic-info spreadsheet column) pairs, copied as-is for each patient.
info_columns = [
    ('start_slice_name', 'start_slice_name'),                                # image starts from 'base' or 'apex'
    ('total_slice_num', 'total_slice_num'),                                  # image total slice number
    ('nonzero_slice_num', 'nonzero_slice_num'),                              # image nonzero slice number
    ('nonzero_slice_start_index', 'nonzero_slice_start_index'),              # image nonzero slice start index
    ('nonzero_slice_end_index', 'nonzero_slice_end_index'),                  # image nonzero slice end index
    ('nonzero_slice_num_loose', 'nonzero_slice_num_loose'),                  # same three values, loose criterion
    ('nonzero_slice_start_index_loose', 'nonzero_slice_start_index_loose'),
    ('nonzero_slice_end_index_loose', 'nonzero_slice_end_index_loose'),
    ('original_time_frame_num', 'tf_num'),                                   # original time frame number
    ('ED', 'ED'),                                                            # ED time frame
    ('ES', 'ES'),                                                            # ES time frame
    ('processed_time_frame_num', 'processed_time_frame_num'),                # always 15, ALWAYS including ED and ES
    ('ED_index_in_processed_time_frame', 'ED_index_in_processed_time_frame'),
    ('ES_index_in_processed_time_frame', 'ES_index_in_processed_time_frame'),
    ('processed_time_frame_index_list', 'processed_time_frame_index_list'),  # indices of these 15 time frames
]

output_columns = (['patient_id', 'patient_group', 'batch_index']
                  + list(nifti_files)
                  + [out_name for out_name, _ in info_columns])

# Every STACOM case is labelled as training, so there is no separate testing batch here.
patient_group = 'training'

result = []
for i, folder in enumerate(all_dataset):
    patient_id = os.path.basename(folder)

    # Find the corresponding row in the basic info spreadsheet; fail loudly if it is missing
    # instead of surfacing an opaque IndexError from .iloc[0].
    matches = basic_info_file[basic_info_file['patient_id'] == patient_id]
    if len(matches) == 0:
        raise ValueError(f"patient_id {patient_id!r} (folder {folder}) not found in the basic info spreadsheet")
    info = matches.iloc[0]

    # Consecutive runs of batch_size cases share a batch index (batch 0 to batch 4).
    batch_index = i // batch_size

    result.append([patient_id, patient_group, batch_index]
                  + [os.path.join(folder, file_name) for file_name in nifti_files.values()]
                  + [info[source_name] for _, source_name in info_columns])

df = pd.DataFrame(result, columns=output_columns)

df.to_excel(os.path.join(patient_list_path, 'STACOM_Patient_List_training_testing.xlsx'), index=False)