In [1]:
import os
import pandas as pd

In [2]:
data_directory = '../dataset/MYCN_Student_ROI_dataset'
ods_file_path = '../dataset/neuroblast_GT.ods'

ground_truth = pd.read_excel(ods_file_path)
ground_truth.head()

Unnamed: 0.1,Unnamed: 0,MYCN_amplified
0,32 itksnap_2 CEFC132D-Q01,No
1,36 itksnap_2 CEFC132D-Q01,Yes
2,10 itksnap_10 CEFC132D-Q01,No
3,52 itksnap_2 CEFC13ORG,No
4,48 itksnap_2 ANON48,Yes


In [3]:
for column in ground_truth.columns:
    if not ground_truth[column].isna().any():
        print(f'Column {column} has no missing labels')
ground_truth = ground_truth.rename({'Unnamed: 0': 'sample_id'}, axis=1)
ground_truth['MYCN_amplified'] = ground_truth['MYCN_amplified'].map({'No': 0, 'Yes': 1})
ground_truth.head()

Column Unnamed: 0 has no missing labels
Column MYCN_amplified has no missing labels


Unnamed: 0,sample_id,MYCN_amplified
0,32 itksnap_2 CEFC132D-Q01,0
1,36 itksnap_2 CEFC132D-Q01,1
2,10 itksnap_10 CEFC132D-Q01,0
3,52 itksnap_2 CEFC13ORG,0
4,48 itksnap_2 ANON48,1


In [4]:
dataset_samples = set()

for sample_folder in os.listdir(data_directory):
    sample_path = os.path.join(data_directory, sample_folder)
    for files in os.listdir(sample_path):
        file_path = os.path.join(sample_path, files)
        
        dataset_samples.add(file_path)

In [10]:
all_samples = set()
all_samples_ids = set()

for sample_folder in os.listdir(data_directory):
    sample_path = os.path.join(data_directory, sample_folder)

    image_path = os.path.join(sample_path, sample_folder + '_roi_image.nii.gz')
    mask_path = os.path.join(sample_path, sample_folder + '_roi_image_gt.nii.gz')

    all_samples_ids.add(str(sample_folder))
    all_samples.add(image_path)
    all_samples.add(mask_path)

In [6]:
if dataset_samples == all_samples:
    print('Path to all existing image files in dataset has been captured.')
print(len(dataset_samples), len(all_samples))

Path to all existing image files in dataset has been captured.
110 110


In [7]:
print(f'Number of sample_id rows in ods: {len(ground_truth["sample_id"])}')
print(f'Number of unique sample_id rows in ods: {ground_truth["sample_id"].nunique()}')
ground_truth_ids = set(ground_truth["sample_id"])

Number of sample_id rows in ods: 47
Number of unique sample_id rows in ods: 47


In [17]:
if ground_truth_ids == all_samples_ids:
    print('ids in file images are the same as ids in ground_truth')
else: 
    print('Presence of missing ids data')
    in_files_but_not_in_ods = all_samples_ids.difference(ground_truth_ids)
    in_ods_but_not_in_files = ground_truth_ids.difference(all_samples_ids)

    print('In files but not in ods')
    for id in in_files_but_not_in_ods: 
        print(id)

    print('\nIn ods but not in files')
    for id in in_ods_but_not_in_files:
        print(id)

Presence of missing ids data
In files but not in ods
56 itksnap_3 Pediatric 5.0  CE
55 itksnap_2 CEFC132D-Q01
62 itksnap_2 Fl_Thorax_C  3.0  Br40  2
61 itksnap_2 CEFC13ORG
57 itksnap_2 CEFC132D-Q01
58 itksnap_4 POSTCON Pediatric 2.000
59 itksnap_2 Body 5.0 IVC CE
60 itksnap_7 Pediatric 3.0 Axial.38 CE

In ods but not in files


In [26]:
# Add the missing ids into ods file. 
# ods file will have all ids. 
# For ids without MYCN_amplified labels, it will be valued as -1.

new_rows = pd.DataFrame({
        'sample_id': list(in_files_but_not_in_ods),
        'MYCN_amplified': -1
    })

ground_truth = pd.concat([ground_truth, new_rows], ignore_index=True)

In [29]:
ground_truth.loc[ground_truth['MYCN_amplified']==-1]

Unnamed: 0,sample_id,MYCN_amplified
47,56 itksnap_3 Pediatric 5.0 CE,-1
48,55 itksnap_2 CEFC132D-Q01,-1
49,62 itksnap_2 Fl_Thorax_C 3.0 Br40 2,-1
50,61 itksnap_2 CEFC13ORG,-1
51,57 itksnap_2 CEFC132D-Q01,-1
52,58 itksnap_4 POSTCON Pediatric 2.000,-1
53,59 itksnap_2 Body 5.0 IVC CE,-1
54,60 itksnap_7 Pediatric 3.0 Axial.38 CE,-1


In [32]:
ground_truth.to_csv('../dataset/neuroblast_GT_preprocessed.csv', index=False)    