In [62]:
import os
import pandas as pd

In [63]:
data_directory = '../dataset/MYCN_Student_ROI_dataset'
ods_file_path = '../dataset/neuroblast_GT.ods'

ground_truth = pd.read_excel(ods_file_path)

display(ground_truth.head())

# Rename table column headers
ground_truth = ground_truth.rename({'Unnamed: 0': 'sample_id'}, axis=1)
ground_truth['MYCN_amplified'] = ground_truth['MYCN_amplified'].map({'No': 0, 'Yes': 1})
display(ground_truth.head())

Unnamed: 0.1,Unnamed: 0,MYCN_amplified
0,32 itksnap_2 CEFC132D-Q01,No
1,36 itksnap_2 CEFC132D-Q01,Yes
2,10 itksnap_10 CEFC132D-Q01,No
3,52 itksnap_2 CEFC13ORG,No
4,48 itksnap_2 ANON48,Yes


Unnamed: 0,sample_id,MYCN_amplified
0,32 itksnap_2 CEFC132D-Q01,0
1,36 itksnap_2 CEFC132D-Q01,1
2,10 itksnap_10 CEFC132D-Q01,0
3,52 itksnap_2 CEFC13ORG,0
4,48 itksnap_2 ANON48,1


In [64]:
# Checking for missing data
for column in ground_truth.columns:
    assert ground_truth[column].isna().any() == 0
    print(f'Column {column} has no missing labels')

# Ensure that all sample ids are unique
num_rows = len(ground_truth['sample_id'])
num_unique_rows = ground_truth["sample_id"].nunique()
assert num_rows == num_unique_rows
print(f'Total number of sample ids with ground truth labelled: {num_rows}')

Column sample_id has no missing labels
Column MYCN_amplified has no missing labels
Total number of sample ids with ground truth labelled: 47


In [65]:
# Creates a set containing the paths for all samples

all_samples = set()
all_samples_ids = set()

for sample_folder in os.listdir(data_directory):
    sample_path = os.path.join(data_directory, sample_folder)
    all_samples_ids.add(str(sample_folder))
    for files in os.listdir(sample_path):
        file_path = os.path.join(sample_path, files)
        all_samples.add(file_path)

assert '../dataset/MYCN_Student_ROI_dataset/32 itksnap_2 CEFC132D-Q01/32 itksnap_2 CEFC132D-Q01_roi_image.nii.gz' in all_samples
assert '../dataset/MYCN_Student_ROI_dataset/32 itksnap_2 CEFC132D-Q01/32 itksnap_2 CEFC132D-Q01_roi_image_gt.nii.gz'in all_samples
assert '32 itksnap_2 CEFC132D-Q01' in all_samples_ids
assert len(all_samples_ids) == len(all_samples)/2

print(f'Total number of unique ids: {len(all_samples_ids)}')

print(f'Total number of samples: {len(all_samples)}')

Total number of unique ids: 55
Total number of samples: 110


In [66]:
# Check if all sample ids have labelled ground truth

ground_truth_ids = set(ground_truth["sample_id"])

if ground_truth_ids == all_samples_ids:
    print('ids in file images are the same as ids in ground_truth')
else: 
    in_files_but_not_in_ods = all_samples_ids.difference(ground_truth_ids)
    in_ods_but_not_in_files = ground_truth_ids.difference(all_samples_ids)

    if len(in_files_but_not_in_ods) == 0:
        print('All ids in files are found in ods')
    else:
        print('Following ids are found in files but not in ods:')
        for id in in_files_but_not_in_ods:
            print(id)        
    if len(in_ods_but_not_in_files) == 0:
        print('All ids in ods are found in files')
    else:
        print('Following ids are found in ods but not in files:')
        for id in in_ods_but_not_in_files:
            print(id)

Following ids are found in files but not in ods:
55 itksnap_2 CEFC132D-Q01
60 itksnap_7 Pediatric 3.0 Axial.38 CE
62 itksnap_2 Fl_Thorax_C  3.0  Br40  2
56 itksnap_3 Pediatric 5.0  CE
58 itksnap_4 POSTCON Pediatric 2.000
57 itksnap_2 CEFC132D-Q01
61 itksnap_2 CEFC13ORG
59 itksnap_2 Body 5.0 IVC CE
All ids in ods are found in files


In [67]:
# Add the missing ids into ods file. 
# ods file will have all ids. 
# For ids without MYCN_amplified labels, it will be valued as -1.

new_rows = pd.DataFrame({
        'sample_id': list(in_files_but_not_in_ods),
        'MYCN_amplified': -1
    })

ground_truth = pd.concat([ground_truth, new_rows], ignore_index=True)
display(ground_truth.head())
display(ground_truth.loc[ground_truth['MYCN_amplified'] == -1])
assert len(ground_truth) == len(all_samples_ids)
print(f'Total number of sample ids: {len(ground_truth)}')

Unnamed: 0,sample_id,MYCN_amplified
0,32 itksnap_2 CEFC132D-Q01,0
1,36 itksnap_2 CEFC132D-Q01,1
2,10 itksnap_10 CEFC132D-Q01,0
3,52 itksnap_2 CEFC13ORG,0
4,48 itksnap_2 ANON48,1


Unnamed: 0,sample_id,MYCN_amplified
47,55 itksnap_2 CEFC132D-Q01,-1
48,60 itksnap_7 Pediatric 3.0 Axial.38 CE,-1
49,62 itksnap_2 Fl_Thorax_C 3.0 Br40 2,-1
50,56 itksnap_3 Pediatric 5.0 CE,-1
51,58 itksnap_4 POSTCON Pediatric 2.000,-1
52,57 itksnap_2 CEFC132D-Q01,-1
53,61 itksnap_2 CEFC13ORG,-1
54,59 itksnap_2 Body 5.0 IVC CE,-1


Total number of sample ids: 55


In [32]:
ground_truth.to_csv('../dataset/neuroblast_GT_preprocessed.csv', index=False)    