In [41]:
import pydicom 
import numpy as np 
import pandas as pd 
import os 
import sys 
import glob 
import re 
from datetime import datetime 

In [42]:
DATA_ROOT = '/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations'
LDCT_ROOT = os.path.join(DATA_ROOT, 'Low-Dose_CT')
CECT_ROOT = os.path.join(DATA_ROOT, 'Contrast-Enhanced_CT')

# Checking the number of timepoints and scans

In [43]:
def check_annotations(data_root_dir):
    for patient_id in os.listdir(data_root_dir): 
        print(f"Patient: {patient_id}")
        patient_path = os.path.join(data_root_dir, patient_id)
        if not os.path.isdir(patient_path):
            print(f"{patient_path} is not a directory. Pass.")
            pass 
        for timepoint in os.listdir(patient_path):
            timepoint_path = os.path.join(patient_path, timepoint)
            print(f"\tTimepoint: {timepoint}")
            scan_count = 0
            annotations_available = False

            for scan in os.listdir(timepoint_path):
                scan_path = os.path.join(timepoint_path, scan)

                if os.path.isdir(scan_path):
                    scan_count += 1

                    contours_path = os.path.join(scan_path, 'Contours')
                    if os.path.exists(contours_path):
                        annotations_available = True
                        print(f"\t\tScan: {scan} - Annotation Available")
                    else:
                        print(f"\t\tScan: {scan} - No Annotation")


In [44]:
check_annotations(CECT_ROOT)

Patient: 10064_1_UTI1Q537
	Timepoint: 2023-09-28
		Scan: 1.3.12.2.1107.5.1.4.75487.30000023032314421944100023532 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.75487.30000023032314421944100023500 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.75487.30000023032314421944100022755 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.75487.30000023032314421944100023567 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.75487.30000023032314262280400002944 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.75487.30000023032314421944100022861 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.75487.30000023032314421944100023320 - No Annotation
		Scan: CT_Full_(thin)_Chest_at_TLC_Supine--1.3.12.2.1107.5.1.4.75487.30000023032314421944100023006 - Annotation Available
Patient: 10064_1_0XVHE7T8
	Timepoint: 2023-11-27
		Scan: 1.3.12.2.1107.5.1.4.76430.30000023021314165347900033329 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.76430.30000023021314053084100002218 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.76430.3000002302131416534790003

		Scan: 1.3.12.2.1107.5.1.4.73443.30000022070115430016100010098 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.73443.30000022070115430016100010220 - No Annotation
Patient: 10057_1_6073QA8B
	Timepoint: 2017-07-20
		Scan: 1.3.12.2.1107.5.1.4.60120.30000016120121073584700016477 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.60120.30000016120121073584700016504 - No Annotation
		Scan: CT_Full_(thin)_Chest_at_TLC_Supine_Low_Dose--1.3.12.2.1107.5.1.4.60120.30000016120121073584700015798 - Annotation Available
		Scan: 1.3.12.2.1107.5.1.4.60120.30000016120121073584700016182 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.60120.30000016120121073584700015669 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.60120.30000016120121073584700016308 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.60120.30000016120119284092100006930 - No Annotation
Patient: 10064_1_25C08LV7
	Timepoint: 2019-05-21
		Scan: 1.3.12.2.1107.5.1.4.95559.30000019010813312986200003156 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.95559.3000001901081331

		Scan: 1.3.12.2.1107.5.1.4.66859.30050021030514184649300037330 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.66859.30050021030514052213500007127 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.66859.30050021030514184649300037240 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.66859.30050021030514184649300037448 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.66859.30050021030514052213500007138 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.66859.30050021030514184649300037604 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.66859.30050021030514052213500007133 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.66859.30050021030514184649300036974 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.66859.30050021030514184649300037629 - No Annotation
		Scan: CT_Full_(thin)_Chest_at_TLC_Supine--1.3.12.2.1107.5.1.4.66859.30050021030514184649300036708 - Annotation Available
Patient: 10057_1_6X4K1WYT
	Timepoint: 2019-06-20
		Scan: CT_Full_(thin)_Chest_at_TLC_Supine_Low_Dose--1.3.12.2.1107.5.1.4.64297.30000018051015492206900005349

		Scan: 1.3.12.2.1107.5.1.4.60120.30000019060514415957000024280 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.60120.30000019060514415957000023952 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.60120.30000019060514415957000023442 - No Annotation
		Scan: CT_Full_(thin)_Chest_at_TLC_Supine_Low_Dose--1.3.12.2.1107.5.1.4.60120.30000019060514415957000023570 - Annotation Available
		Scan: 1.3.12.2.1107.5.1.4.60120.30000019060514415957000024251 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.60120.30000019060514102735100002268 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.60120.30000019060514415957000024085 - No Annotation
Patient: 10064_1_32C7Q601
	Timepoint: 2024-04-09
		Scan: 1.3.12.2.1107.5.1.4.75487.30000023121214373759400002791 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.75487.30000023121214480242700028076 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.75487.30000023121214480242700027258 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.75487.30000023121214480242700027862 - No Annotation
		Scan: 1.2.276.0.4

		Scan: 1.3.12.2.1107.5.1.4.105604.30000022110116471113400000939 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.105604.30000022110116471113400000708 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.105604.30000022110116364748400000031 - No Annotation
Patient: 10064_1_3W45E990
	Timepoint: 2020-07-27
		Scan: 1.3.12.2.1107.5.1.4.60120.30000019080514143998400002304 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.60120.30000019080514273541500019603 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.60120.30000019080514273541500019995 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.60120.30000019080514273541500020306 - No Annotation
		Scan: 1.3.12.2.1107.5.1.4.60120.30000019080514273541500020142 - No Annotation
		Scan: CT_Full_(thin)_Chest_at_TLC_Supine--1.3.12.2.1107.5.1.4.60120.30000019080514273541500019702 - Annotation Available
		Scan: 1.3.12.2.1107.5.1.4.60120.30000019080514273541500020337 - No Annotation
Patient: 10064_1_BM3T4H4Z
	Timepoint: 2022-09-21
		Scan: 1.3.12.2.1107.5.1.4.73443.3000002103031638412050

# Filter UCLA LDCT cases based on kernel, slice thickness, and orientation

In [45]:
# Get paths to each DICOM folder and first .dcm file 
def get_first_dcm_per_scan(root_directory): 
    dcm_paths = [] 
    dcm_dir_paths = []
    for pid in os.listdir(root_directory): 
        patient_dir_path = os.path.join(root_directory, pid)
        for timepoint in os.listdir(patient_dir_path): 
            timepoint_dir_path = os.path.join(patient_dir_path, timepoint)
            
            for scan in os.listdir(timepoint_dir_path): 
                scan_dir_path = os.path.join(timepoint_dir_path, scan)
                dicom_dir_path = os.path.join(scan_dir_path, 'DICOM')
                if os.path.exists(dicom_dir_path): # this should always be true 
                    dcm_filenames = sorted(os.listdir(dicom_dir_path))
                    dcm_paths.append(os.path.join(dicom_dir_path, dcm_filenames[0]))
                    dcm_dir_paths.append(dicom_dir_path)
                else: 
                    print(f" Warning: path does not exist --{dicom_dir_path}")
    return dcm_dir_paths, dcm_paths 

In [46]:
dicom_dir_paths, first_files = get_first_dcm_per_scan(CECT_ROOT) 
print(len(dicom_dir_paths))
print(dicom_dir_paths[0:5])
print(len(first_files))
print(first_files[0:5])

1048
['/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314421944100023532/DICOM', '/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314421944100023500/DICOM', '/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314421944100022755/DICOM', '/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314421944100023567/DICOM', '/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314262280400002944/DICOM']
1048
['/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanc

In [47]:
#make list of dicom files based on list of directories

first_dicom_per_file = []

for scan in first_files:
        first_dicom_per_file.append(pydicom.dcmread(scan,force=True))
    
print(first_dicom_per_file[0])

Dataset.file_meta -------------------------------
(0002, 0000) File Meta Information Group Length  UL: 200
(0002, 0001) File Meta Information Version       OB: b'\x00\x01'
(0002, 0002) Media Storage SOP Class UID         UI: CT Image Storage
(0002, 0003) Media Storage SOP Instance UID      UI: 1.3.12.2.1107.5.1.4.75487.30000023032314421944100023543
(0002, 0010) Transfer Syntax UID                 UI: Explicit VR Little Endian
(0002, 0012) Implementation Class UID            UI: 1.2.276.0.7230010.3.0.3.6.4
(0002, 0013) Implementation Version Name         SH: 'OFFIS_DCMTK_364'
-------------------------------------------------
(0008, 0005) Specific Character Set              CS: 'ISO_IR 100'
(0008, 0008) Image Type                          CS: ['DERIVED', 'PRIMARY', 'AXIAL', 'CT_SOM5 MIP']
(0008, 0016) SOP Class UID                       UI: CT Image Storage
(0008, 0018) SOP Instance UID                    UI: 1.3.12.2.1107.5.1.4.75487.30000023032314421944100023543
(0008, 0020) Study Date

In [48]:
def make_dicom_metadata_df(root_directory): 
    dicom_metadata_list = []
    for pid in os.listdir(root_directory): 
        patient_dir_path = os.path.join(root_directory, pid)
        for timepoint in os.listdir(patient_dir_path): 
            timepoint_dir_path = os.path.join(patient_dir_path, timepoint)
            
            for scan in os.listdir(timepoint_dir_path): 
                scan_dir_path = os.path.join(timepoint_dir_path, scan)
                dicom_dir_path = os.path.join(scan_dir_path, 'DICOM')
                
                try:
                    dcm_filenames = sorted(os.listdir(dicom_dir_path))
                    
                    # read the first .dcm file 
                    dcm_path = os.path.join(dicom_dir_path, dcm_filenames[0])
                    dicom = pydicom.dcmread(dcm_path, stop_before_pixels=True)
                
                    accession_number = getattr(dicom, 'AccessionNumber', 'missing')
                    slice_thickness = getattr(dicom, 'SliceThickness', 'missing')
                    convolution_kernel = getattr(dicom, 'ConvolutionKernel', 'missing')
                    image_orientation = getattr(dicom, 'ImageOrientationPatient', 'missing')
                    patient_id = getattr(dicom, 'PatientID', 'missing')
                    
                    dicom_metadata_list.append([
                        pid, 
                        timepoint, 
                        scan, 
                        accession_number,
                        slice_thickness,
                        convolution_kernel,
                        image_orientation,
                        patient_id,
                        dicom_dir_path
                    ])
                    
                except Exception as e:
                    # Ignore the scan if there's an error
                    print(f"Error reading DICOM file in {dicom_dir_path}: {e}")
                    continue
                                        
    dicom_metadata_df = pd.DataFrame(dicom_metadata_list, columns=[
        'pid', 
        'timepoint',
        'scan',
        'AccessionNumber', 
        'SliceThickness', 
        'ConvolutionKernel',
        'ImageOrientationPatient', 
        'PatientID', 
        "Directory",
    ])
    
    return dicom_metadata_df


In [49]:
dicom_metadata_df = make_dicom_metadata_df(CECT_ROOT)
dicom_metadata_df

Error reading DICOM file in /workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10057_1_U1EYI762/2024-02-21/CT_Full_(thin)_Chest_at_TLC_Supine--1.3.12.2.1107.5.1.4.76430.30000022040614130273200041243/DICOM: File is missing DICOM File Meta Information header or the 'DICM' prefix is missing from the header. Use force=True to force reading.
Error reading DICOM file in /workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10057_1_U1EYI762/2024-02-21/1.3.12.2.1107.5.1.4.76430.30000022040614130273200041970/DICOM: File is missing DICOM File Meta Information header or the 'DICM' prefix is missing from the header. Use force=True to force reading.
Error reading DICOM file in /workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10057_1_U1EYI762/2024-02-21/1.3.12.2.1107.5.1.4.76430.30000022040613561720700002794/DICOM: File is missing DICOM File Meta Information header or the 'DICM' prefix is mi

Unnamed: 0,pid,timepoint,scan,AccessionNumber,SliceThickness,ConvolutionKernel,ImageOrientationPatient,PatientID,Directory
0,10064_1_UTI1Q537,2023-09-28,1.3.12.2.1107.5.1.4.75487.30000023032314421944100023532,A_GP1F6404,10.0,"[Br49d, 3]","[0.99939217971798, -0.01220219337, -0.032655437457, 0.01220870465, 0.9999254709881, -3e-012]",10064_1_UTI1Q537,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314421944100023532/DICOM
1,10064_1_UTI1Q537,2023-09-28,1.3.12.2.1107.5.1.4.75487.30000023032314421944100023500,A_GP1F6404,10.0,"[Br49d, 3]","[0.99939217971798, -0.01220219337, -0.032655437457, -0.032653003679, 0.000398680594, -0.9994666689813]",10064_1_UTI1Q537,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314421944100023500/DICOM
2,10064_1_UTI1Q537,2023-09-28,1.3.12.2.1107.5.1.4.75487.30000023032314421944100022755,A_GP1F6404,3.0,"[Br49d, 3]","[1, 0, 0, 0, 1, 0]",10064_1_UTI1Q537,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314421944100022755/DICOM
3,10064_1_UTI1Q537,2023-09-28,1.3.12.2.1107.5.1.4.75487.30000023032314421944100023567,A_GP1F6404,3.0,"[Br49d, 3]","[1, 0, 0, 0, 1, 0]",10064_1_UTI1Q537,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314421944100023567/DICOM
4,10064_1_UTI1Q537,2023-09-28,1.3.12.2.1107.5.1.4.75487.30000023032314262280400002944,A_GP1F6404,,Tr20f,"[1, 6.123233996e-017, 0, 0, 0, -1]",10064_1_UTI1Q537,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314262280400002944/DICOM
...,...,...,...,...,...,...,...,...,...
1036,10057_1_85T8773T,2023-11-28,1.3.12.2.1107.5.1.4.73443.30000022112215432129000028786,A_778P105L,10.0,Br46f,"[1, 0, 0, 0, 1, 0]",10057_1_85T8773T,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10057_1_85T8773T/2023-11-28/1.3.12.2.1107.5.1.4.73443.30000022112215432129000028786/DICOM
1037,10057_1_85T8773T,2023-11-28,1.3.12.2.1107.5.1.4.73443.30000022112215432129000028596,A_778P105L,2.0,Br46f,"[0, 1, 0, 0, 0, -1]",10057_1_85T8773T,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10057_1_85T8773T/2023-11-28/1.3.12.2.1107.5.1.4.73443.30000022112215432129000028596/DICOM
1038,10057_1_85T8773T,2023-11-28,1.3.12.2.1107.5.1.4.73443.30000022112215432129000028755,A_778P105L,10.0,Br46f,"[1, 0, 0, 0, 0, -1]",10057_1_85T8773T,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10057_1_85T8773T/2023-11-28/1.3.12.2.1107.5.1.4.73443.30000022112215432129000028755/DICOM
1039,10057_1_85T8773T,2023-11-28,1.3.12.2.1107.5.1.4.73443.30000022112215432129000027997,A_778P105L,3.0,Br46f,"[1, 0, 0, 0, 1, 0]",10057_1_85T8773T,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10057_1_85T8773T/2023-11-28/1.3.12.2.1107.5.1.4.73443.30000022112215432129000027997/DICOM


In [50]:

pd.set_option('display.max_colwidth', None)
print(dicom_metadata_df['Directory'])


0       /workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314421944100023532/DICOM
1       /workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314421944100023500/DICOM
2       /workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314421944100022755/DICOM
3       /workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314421944100023567/DICOM
4       /workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314262280400002944/DICOM
                                                                      

In [51]:
#function to determine ct orientation based on 'ImageOrientationPatient' column

def get_ct_orientation(df, index):

    image_orientation_patient_string = str(df['ImageOrientationPatient'].iloc[index])
    image_vectors = image_orientation_patient_string.replace('[','').replace(']','').split(',')
    image_vectors = [round(float(x)) for x in image_vectors]

    #[1,0,0,0,1,0]
    if image_vectors[0] == 1 and image_vectors[4] == 1: 
        return 'axial'
    
    #[0,1,0,0,0,-1]
    elif image_vectors[1] == 1 and image_vectors[5] == -1: 
        return 'sagittal'
    
    #[1,0,0,0,0,-1]
    elif image_vectors[0] == 1 and image_vectors[5] == -1: 
        return 'coronal'
    
    else:
        return 'unknown'

In [52]:
#add column for scan orientation

def add_orientation_column(dicom_df):
    orientation = []
    
    for index in range(len(dicom_df)):
        if str(dicom_df['ImageOrientationPatient'].iloc[index]) == 'missing':
            orientation.append('missing')
        else:
            orientation.append(get_ct_orientation(dicom_df, index))

    dicom_df['Orientation'] = orientation
    return dicom_df

In [53]:
dicom_metadata_df = add_orientation_column(dicom_metadata_df)
dicom_metadata_df

Unnamed: 0,pid,timepoint,scan,AccessionNumber,SliceThickness,ConvolutionKernel,ImageOrientationPatient,PatientID,Directory,Orientation
0,10064_1_UTI1Q537,2023-09-28,1.3.12.2.1107.5.1.4.75487.30000023032314421944100023532,A_GP1F6404,10.0,"[Br49d, 3]","[0.99939217971798, -0.01220219337, -0.032655437457, 0.01220870465, 0.9999254709881, -3e-012]",10064_1_UTI1Q537,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314421944100023532/DICOM,axial
1,10064_1_UTI1Q537,2023-09-28,1.3.12.2.1107.5.1.4.75487.30000023032314421944100023500,A_GP1F6404,10.0,"[Br49d, 3]","[0.99939217971798, -0.01220219337, -0.032655437457, -0.032653003679, 0.000398680594, -0.9994666689813]",10064_1_UTI1Q537,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314421944100023500/DICOM,coronal
2,10064_1_UTI1Q537,2023-09-28,1.3.12.2.1107.5.1.4.75487.30000023032314421944100022755,A_GP1F6404,3.0,"[Br49d, 3]","[1, 0, 0, 0, 1, 0]",10064_1_UTI1Q537,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314421944100022755/DICOM,axial
3,10064_1_UTI1Q537,2023-09-28,1.3.12.2.1107.5.1.4.75487.30000023032314421944100023567,A_GP1F6404,3.0,"[Br49d, 3]","[1, 0, 0, 0, 1, 0]",10064_1_UTI1Q537,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314421944100023567/DICOM,axial
4,10064_1_UTI1Q537,2023-09-28,1.3.12.2.1107.5.1.4.75487.30000023032314262280400002944,A_GP1F6404,,Tr20f,"[1, 6.123233996e-017, 0, 0, 0, -1]",10064_1_UTI1Q537,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314262280400002944/DICOM,coronal
...,...,...,...,...,...,...,...,...,...,...
1036,10057_1_85T8773T,2023-11-28,1.3.12.2.1107.5.1.4.73443.30000022112215432129000028786,A_778P105L,10.0,Br46f,"[1, 0, 0, 0, 1, 0]",10057_1_85T8773T,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10057_1_85T8773T/2023-11-28/1.3.12.2.1107.5.1.4.73443.30000022112215432129000028786/DICOM,axial
1037,10057_1_85T8773T,2023-11-28,1.3.12.2.1107.5.1.4.73443.30000022112215432129000028596,A_778P105L,2.0,Br46f,"[0, 1, 0, 0, 0, -1]",10057_1_85T8773T,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10057_1_85T8773T/2023-11-28/1.3.12.2.1107.5.1.4.73443.30000022112215432129000028596/DICOM,sagittal
1038,10057_1_85T8773T,2023-11-28,1.3.12.2.1107.5.1.4.73443.30000022112215432129000028755,A_778P105L,10.0,Br46f,"[1, 0, 0, 0, 0, -1]",10057_1_85T8773T,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10057_1_85T8773T/2023-11-28/1.3.12.2.1107.5.1.4.73443.30000022112215432129000028755/DICOM,coronal
1039,10057_1_85T8773T,2023-11-28,1.3.12.2.1107.5.1.4.73443.30000022112215432129000027997,A_778P105L,3.0,Br46f,"[1, 0, 0, 0, 1, 0]",10057_1_85T8773T,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10057_1_85T8773T/2023-11-28/1.3.12.2.1107.5.1.4.73443.30000022112215432129000027997/DICOM,axial


In [54]:
# for feature counts, we need to convert everything to strings 
dicom_metadata_df['SliceThickness'] = dicom_metadata_df['SliceThickness'].astype(str)
dicom_metadata_df['ConvolutionKernel'] = dicom_metadata_df['ConvolutionKernel'].astype(str)
dicom_metadata_df['Orientation'] = dicom_metadata_df['Orientation'].astype(str)

# calculate the number of cases for each combination 
feature_count_df = (
    dicom_metadata_df
    .groupby(['SliceThickness', 'ConvolutionKernel', 'Orientation'])
    .size()
    .reset_index()
    .rename(columns={0: 'count'})
)

feature_count_df = feature_count_df.sort_values('count', ascending=False)
feature_count_df

Unnamed: 0,SliceThickness,ConvolutionKernel,Orientation,count
3,1,B45f,axial,79
68,3,B45f,axial,62
83,,Tr20f,coronal,57
22,10,B45f,coronal,54
21,10,B45f,axial,54
...,...,...,...,...
62,2,FC51,sagittal,1
32,10,Br44d,coronal,1
31,10,Br44d,axial,1
66,2.0,FL01,coronal,1


## Exclude cases whose slice thickness is "missing" or None

In [55]:
# Filter out rows where SliceThickness is "missing"
filtered_df = dicom_metadata_df[dicom_metadata_df['SliceThickness'] != 'missing']

# Convert columns to strings to ensure they are hashable
filtered_df['SliceThickness'] = filtered_df['SliceThickness'].astype(str)
filtered_df['ConvolutionKernel'] = filtered_df['ConvolutionKernel'].astype(str)
filtered_df['Orientation'] = filtered_df['Orientation'].astype(str)

# Now group by the specified columns and calculate the counts
feature_count_df = (
    filtered_df
    .groupby(['SliceThickness', 'ConvolutionKernel', 'Orientation'])
    .size()
    .reset_index()
    .rename(columns={0: 'count'})
)

# Sort the DataFrame by the count column in descending order
feature_count_df = feature_count_df.sort_values('count', ascending=False)

# Display the result
feature_count_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['SliceThickness'] = filtered_df['SliceThickness'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['ConvolutionKernel'] = filtered_df['ConvolutionKernel'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Orientation'] = filtered_df['Orientati

Unnamed: 0,SliceThickness,ConvolutionKernel,Orientation,count
3,1,B45f,axial,79
68,3,B45f,axial,62
83,,Tr20f,coronal,57
21,10,B45f,axial,54
22,10,B45f,coronal,54
...,...,...,...,...
26,10,B46f,unknown,1
61,2,FC51,coronal,1
62,2,FC51,sagittal,1
39,10,FC51,axial,1


In [56]:
# Filter out rows where SliceThickness is "missing"
filtered_df = filtered_df[filtered_df['SliceThickness'] != 'None']

# Now group by the specified columns and calculate the counts
feature_count_df = (
    filtered_df
    .groupby(['SliceThickness', 'ConvolutionKernel', 'Orientation'])
    .size()
    .reset_index()
    .rename(columns={0: 'count'})
)

# Sort the DataFrame by the count column in descending order
feature_count_df = feature_count_df.sort_values('count', ascending=False)

# Display the result
feature_count_df


Unnamed: 0,SliceThickness,ConvolutionKernel,Orientation,count
3,1,B45f,axial,79
68,3,B45f,axial,62
22,10,B45f,coronal,54
21,10,B45f,axial,54
49,2,B45f,coronal,44
...,...,...,...,...
28,10,B50f,coronal,1
1,0.6,T20f,sagittal,1
39,10,FC51,axial,1
31,10,Br44d,axial,1


In [57]:
filtered_df

Unnamed: 0,pid,timepoint,scan,AccessionNumber,SliceThickness,ConvolutionKernel,ImageOrientationPatient,PatientID,Directory,Orientation
0,10064_1_UTI1Q537,2023-09-28,1.3.12.2.1107.5.1.4.75487.30000023032314421944100023532,A_GP1F6404,10,"['Br49d', '3']","[0.99939217971798, -0.01220219337, -0.032655437457, 0.01220870465, 0.9999254709881, -3e-012]",10064_1_UTI1Q537,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314421944100023532/DICOM,axial
1,10064_1_UTI1Q537,2023-09-28,1.3.12.2.1107.5.1.4.75487.30000023032314421944100023500,A_GP1F6404,10,"['Br49d', '3']","[0.99939217971798, -0.01220219337, -0.032655437457, -0.032653003679, 0.000398680594, -0.9994666689813]",10064_1_UTI1Q537,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314421944100023500/DICOM,coronal
2,10064_1_UTI1Q537,2023-09-28,1.3.12.2.1107.5.1.4.75487.30000023032314421944100022755,A_GP1F6404,3,"['Br49d', '3']","[1, 0, 0, 0, 1, 0]",10064_1_UTI1Q537,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314421944100022755/DICOM,axial
3,10064_1_UTI1Q537,2023-09-28,1.3.12.2.1107.5.1.4.75487.30000023032314421944100023567,A_GP1F6404,3,"['Br49d', '3']","[1, 0, 0, 0, 1, 0]",10064_1_UTI1Q537,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314421944100023567/DICOM,axial
5,10064_1_UTI1Q537,2023-09-28,1.3.12.2.1107.5.1.4.75487.30000023032314421944100022861,A_GP1F6404,2,Br49d,"[0.9991633259601, -0.012198637262, -0.039036409995, 0.01221725761601, 0.99992533808546, 0.000238476471]",10064_1_UTI1Q537,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314421944100022861/DICOM,axial
...,...,...,...,...,...,...,...,...,...,...
1036,10057_1_85T8773T,2023-11-28,1.3.12.2.1107.5.1.4.73443.30000022112215432129000028786,A_778P105L,10,Br46f,"[1, 0, 0, 0, 1, 0]",10057_1_85T8773T,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10057_1_85T8773T/2023-11-28/1.3.12.2.1107.5.1.4.73443.30000022112215432129000028786/DICOM,axial
1037,10057_1_85T8773T,2023-11-28,1.3.12.2.1107.5.1.4.73443.30000022112215432129000028596,A_778P105L,2,Br46f,"[0, 1, 0, 0, 0, -1]",10057_1_85T8773T,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10057_1_85T8773T/2023-11-28/1.3.12.2.1107.5.1.4.73443.30000022112215432129000028596/DICOM,sagittal
1038,10057_1_85T8773T,2023-11-28,1.3.12.2.1107.5.1.4.73443.30000022112215432129000028755,A_778P105L,10,Br46f,"[1, 0, 0, 0, 0, -1]",10057_1_85T8773T,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10057_1_85T8773T/2023-11-28/1.3.12.2.1107.5.1.4.73443.30000022112215432129000028755/DICOM,coronal
1039,10057_1_85T8773T,2023-11-28,1.3.12.2.1107.5.1.4.73443.30000022112215432129000027997,A_778P105L,3,Br46f,"[1, 0, 0, 0, 1, 0]",10057_1_85T8773T,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10057_1_85T8773T/2023-11-28/1.3.12.2.1107.5.1.4.73443.30000022112215432129000027997/DICOM,axial


## Exclude slice thickness > 3

In [58]:
filtered_df['SliceThickness'] = pd.to_numeric(filtered_df['SliceThickness'], errors='coerce')

# only keep slice thickness <= 3 
filtered_df = filtered_df[filtered_df['SliceThickness'] <= 3]

# convert back to str for counting 
filtered_df['SliceThickness'] = filtered_df['SliceThickness'].astype(str)
filtered_df['ConvolutionKernel'] = filtered_df['ConvolutionKernel'].astype(str)
filtered_df['Orientation'] = filtered_df['Orientation'].astype(str)

feature_count_df = (
    filtered_df
    .groupby(['SliceThickness', 'ConvolutionKernel', 'Orientation'])
    .size()
    .reset_index()
    .rename(columns={0: 'count'})
)
feature_count_df = feature_count_df.sort_values('count', ascending=False)
feature_count_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['SliceThickness'] = filtered_df['SliceThickness'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['ConvolutionKernel'] = filtered_df['ConvolutionKernel'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Orientation'] = filtered_df['Orientati

Unnamed: 0,SliceThickness,ConvolutionKernel,Orientation,count
3,1.0,B45f,axial,79
42,3.0,B45f,axial,62
23,2.0,B45f,coronal,44
24,2.0,B45f,sagittal,43
48,3.0,Br46f,axial,38
9,1.0,Br46f,axial,35
33,2.0,Br46f,sagittal,34
32,2.0,Br46f,coronal,34
51,3.0,"['Br49d', '3']",axial,24
13,1.0,"['Br49d', '3']",axial,22


In [59]:
filtered_df

Unnamed: 0,pid,timepoint,scan,AccessionNumber,SliceThickness,ConvolutionKernel,ImageOrientationPatient,PatientID,Directory,Orientation
2,10064_1_UTI1Q537,2023-09-28,1.3.12.2.1107.5.1.4.75487.30000023032314421944100022755,A_GP1F6404,3.0,"['Br49d', '3']","[1, 0, 0, 0, 1, 0]",10064_1_UTI1Q537,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314421944100022755/DICOM,axial
3,10064_1_UTI1Q537,2023-09-28,1.3.12.2.1107.5.1.4.75487.30000023032314421944100023567,A_GP1F6404,3.0,"['Br49d', '3']","[1, 0, 0, 0, 1, 0]",10064_1_UTI1Q537,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314421944100023567/DICOM,axial
5,10064_1_UTI1Q537,2023-09-28,1.3.12.2.1107.5.1.4.75487.30000023032314421944100022861,A_GP1F6404,2.0,Br49d,"[0.9991633259601, -0.012198637262, -0.039036409995, 0.01221725761601, 0.99992533808546, 0.000238476471]",10064_1_UTI1Q537,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314421944100022861/DICOM,axial
6,10064_1_UTI1Q537,2023-09-28,1.3.12.2.1107.5.1.4.75487.30000023032314421944100023320,A_GP1F6404,2.0,"['Br49d', '3']","[0.012208704553, 0.99992547098928, 0, -0.032653004301, 0.000398680596, -0.999466668961]",10064_1_UTI1Q537,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314421944100023320/DICOM,sagittal
7,10064_1_UTI1Q537,2023-09-28,CT_Full_(thin)_Chest_at_TLC_Supine--1.3.12.2.1107.5.1.4.75487.30000023032314421944100023006,A_GP1F6404,1.0,"['Br49d', '3']","[1, 0, 0, 0, 1, 0]",10064_1_UTI1Q537,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/CT_Full_(thin)_Chest_at_TLC_Supine--1.3.12.2.1107.5.1.4.75487.30000023032314421944100023006/DICOM,axial
...,...,...,...,...,...,...,...,...,...,...
1033,10057_1_UU92H1J4,2022-03-12,CT_Full_(thin)_Chest_at_TLC_Supine--1.3.12.2.1107.5.1.4.73938.30050022021915380709700001466,A_C6666P8N,1.0,Br46f,"[1, 0, 0, 0, 1, 0]",10057_1_UU92H1J4,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10057_1_UU92H1J4/2022-03-12/CT_Full_(thin)_Chest_at_TLC_Supine--1.3.12.2.1107.5.1.4.73938.30050022021915380709700001466/DICOM,axial
1035,10057_1_85T8773T,2023-11-28,CT_Partial_(thin)_Chest_at_TLC_Supine_Low_Dose--1.3.12.2.1107.5.1.4.73443.30000022112215432129000028111,A_778P105L,1.0,Br46f,"[1, 0, 0, 0, 1, 0]",10057_1_85T8773T,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10057_1_85T8773T/2023-11-28/CT_Partial_(thin)_Chest_at_TLC_Supine_Low_Dose--1.3.12.2.1107.5.1.4.73443.30000022112215432129000028111/DICOM,axial
1037,10057_1_85T8773T,2023-11-28,1.3.12.2.1107.5.1.4.73443.30000022112215432129000028596,A_778P105L,2.0,Br46f,"[0, 1, 0, 0, 0, -1]",10057_1_85T8773T,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10057_1_85T8773T/2023-11-28/1.3.12.2.1107.5.1.4.73443.30000022112215432129000028596/DICOM,sagittal
1039,10057_1_85T8773T,2023-11-28,1.3.12.2.1107.5.1.4.73443.30000022112215432129000027997,A_778P105L,3.0,Br46f,"[1, 0, 0, 0, 1, 0]",10057_1_85T8773T,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10057_1_85T8773T/2023-11-28/1.3.12.2.1107.5.1.4.73443.30000022112215432129000027997/DICOM,axial


## Exclude scans with missing convolution kernel 

In [60]:
# Filter out rows where SliceThickness is "missing"
filtered_df = filtered_df[filtered_df['ConvolutionKernel'] != 'None' ]
filtered_df = filtered_df[filtered_df['ConvolutionKernel'] != 'missing' ]

# Now group by the specified columns and calculate the counts
feature_count_df = (
    filtered_df
    .groupby(['SliceThickness', 'ConvolutionKernel', 'Orientation'])
    .size()
    .reset_index()
    .rename(columns={0: 'count'})
)

# Sort the DataFrame by the count column in descending order
feature_count_df = feature_count_df.sort_values('count', ascending=False)

# Display the result
feature_count_df


Unnamed: 0,SliceThickness,ConvolutionKernel,Orientation,count
3,1.0,B45f,axial,79
42,3.0,B45f,axial,62
23,2.0,B45f,coronal,44
24,2.0,B45f,sagittal,43
48,3.0,Br46f,axial,38
9,1.0,Br46f,axial,35
33,2.0,Br46f,sagittal,34
32,2.0,Br46f,coronal,34
51,3.0,"['Br49d', '3']",axial,24
13,1.0,"['Br49d', '3']",axial,22


In [61]:
filtered_df

Unnamed: 0,pid,timepoint,scan,AccessionNumber,SliceThickness,ConvolutionKernel,ImageOrientationPatient,PatientID,Directory,Orientation
2,10064_1_UTI1Q537,2023-09-28,1.3.12.2.1107.5.1.4.75487.30000023032314421944100022755,A_GP1F6404,3.0,"['Br49d', '3']","[1, 0, 0, 0, 1, 0]",10064_1_UTI1Q537,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314421944100022755/DICOM,axial
3,10064_1_UTI1Q537,2023-09-28,1.3.12.2.1107.5.1.4.75487.30000023032314421944100023567,A_GP1F6404,3.0,"['Br49d', '3']","[1, 0, 0, 0, 1, 0]",10064_1_UTI1Q537,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314421944100023567/DICOM,axial
5,10064_1_UTI1Q537,2023-09-28,1.3.12.2.1107.5.1.4.75487.30000023032314421944100022861,A_GP1F6404,2.0,Br49d,"[0.9991633259601, -0.012198637262, -0.039036409995, 0.01221725761601, 0.99992533808546, 0.000238476471]",10064_1_UTI1Q537,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314421944100022861/DICOM,axial
6,10064_1_UTI1Q537,2023-09-28,1.3.12.2.1107.5.1.4.75487.30000023032314421944100023320,A_GP1F6404,2.0,"['Br49d', '3']","[0.012208704553, 0.99992547098928, 0, -0.032653004301, 0.000398680596, -0.999466668961]",10064_1_UTI1Q537,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314421944100023320/DICOM,sagittal
7,10064_1_UTI1Q537,2023-09-28,CT_Full_(thin)_Chest_at_TLC_Supine--1.3.12.2.1107.5.1.4.75487.30000023032314421944100023006,A_GP1F6404,1.0,"['Br49d', '3']","[1, 0, 0, 0, 1, 0]",10064_1_UTI1Q537,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/CT_Full_(thin)_Chest_at_TLC_Supine--1.3.12.2.1107.5.1.4.75487.30000023032314421944100023006/DICOM,axial
...,...,...,...,...,...,...,...,...,...,...
1033,10057_1_UU92H1J4,2022-03-12,CT_Full_(thin)_Chest_at_TLC_Supine--1.3.12.2.1107.5.1.4.73938.30050022021915380709700001466,A_C6666P8N,1.0,Br46f,"[1, 0, 0, 0, 1, 0]",10057_1_UU92H1J4,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10057_1_UU92H1J4/2022-03-12/CT_Full_(thin)_Chest_at_TLC_Supine--1.3.12.2.1107.5.1.4.73938.30050022021915380709700001466/DICOM,axial
1035,10057_1_85T8773T,2023-11-28,CT_Partial_(thin)_Chest_at_TLC_Supine_Low_Dose--1.3.12.2.1107.5.1.4.73443.30000022112215432129000028111,A_778P105L,1.0,Br46f,"[1, 0, 0, 0, 1, 0]",10057_1_85T8773T,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10057_1_85T8773T/2023-11-28/CT_Partial_(thin)_Chest_at_TLC_Supine_Low_Dose--1.3.12.2.1107.5.1.4.73443.30000022112215432129000028111/DICOM,axial
1037,10057_1_85T8773T,2023-11-28,1.3.12.2.1107.5.1.4.73443.30000022112215432129000028596,A_778P105L,2.0,Br46f,"[0, 1, 0, 0, 0, -1]",10057_1_85T8773T,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10057_1_85T8773T/2023-11-28/1.3.12.2.1107.5.1.4.73443.30000022112215432129000028596/DICOM,sagittal
1039,10057_1_85T8773T,2023-11-28,1.3.12.2.1107.5.1.4.73443.30000022112215432129000027997,A_778P105L,3.0,Br46f,"[1, 0, 0, 0, 1, 0]",10057_1_85T8773T,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10057_1_85T8773T/2023-11-28/1.3.12.2.1107.5.1.4.73443.30000022112215432129000027997/DICOM,axial


## Exclude orientation != axial 

In [62]:
# Filter out rows where SliceThickness is "missing"
filtered_df = filtered_df[filtered_df['Orientation'] == 'axial' ]

# Now group by the specified columns and calculate the counts
feature_count_df = (
    filtered_df
    .groupby(['SliceThickness', 'ConvolutionKernel', 'Orientation'])
    .size()
    .reset_index()
    .rename(columns={0: 'count'})
)

# Sort the DataFrame by the count column in descending order
feature_count_df = feature_count_df.sort_values('count', ascending=False)

# Display the result
feature_count_df


Unnamed: 0,SliceThickness,ConvolutionKernel,Orientation,count
0,1.0,B45f,axial,79
18,3.0,B45f,axial,62
23,3.0,Br46f,axial,38
4,1.0,Br46f,axial,35
26,3.0,"['Br49d', '3']",axial,24
13,2.0,B45f,axial,22
7,1.0,"['Br49d', '3']",axial,22
15,2.0,Br49d,axial,12
1,1.0,B46f,axial,11
19,3.0,B46f,axial,11


In [63]:
filtered_df

Unnamed: 0,pid,timepoint,scan,AccessionNumber,SliceThickness,ConvolutionKernel,ImageOrientationPatient,PatientID,Directory,Orientation
2,10064_1_UTI1Q537,2023-09-28,1.3.12.2.1107.5.1.4.75487.30000023032314421944100022755,A_GP1F6404,3.0,"['Br49d', '3']","[1, 0, 0, 0, 1, 0]",10064_1_UTI1Q537,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314421944100022755/DICOM,axial
3,10064_1_UTI1Q537,2023-09-28,1.3.12.2.1107.5.1.4.75487.30000023032314421944100023567,A_GP1F6404,3.0,"['Br49d', '3']","[1, 0, 0, 0, 1, 0]",10064_1_UTI1Q537,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314421944100023567/DICOM,axial
5,10064_1_UTI1Q537,2023-09-28,1.3.12.2.1107.5.1.4.75487.30000023032314421944100022861,A_GP1F6404,2.0,Br49d,"[0.9991633259601, -0.012198637262, -0.039036409995, 0.01221725761601, 0.99992533808546, 0.000238476471]",10064_1_UTI1Q537,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/1.3.12.2.1107.5.1.4.75487.30000023032314421944100022861/DICOM,axial
7,10064_1_UTI1Q537,2023-09-28,CT_Full_(thin)_Chest_at_TLC_Supine--1.3.12.2.1107.5.1.4.75487.30000023032314421944100023006,A_GP1F6404,1.0,"['Br49d', '3']","[1, 0, 0, 0, 1, 0]",10064_1_UTI1Q537,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_UTI1Q537/2023-09-28/CT_Full_(thin)_Chest_at_TLC_Supine--1.3.12.2.1107.5.1.4.75487.30000023032314421944100023006/DICOM,axial
8,10064_1_0XVHE7T8,2023-11-27,1.3.12.2.1107.5.1.4.76430.30000023021314165347900033329,A_5UY8S58R,3.0,"['Br49d', '3']","[1, 0, 0, 0, 1, 0]",10064_1_0XVHE7T8,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_0XVHE7T8/2023-11-27/1.3.12.2.1107.5.1.4.76430.30000023021314165347900033329/DICOM,axial
...,...,...,...,...,...,...,...,...,...,...
1025,10057_1_ZN1M7487,2018-08-15,1.3.12.2.1107.5.1.4.60120.30000018041114364546500011411,A_TZLH2G7R,3.0,B45f,"[1, 0, 0, 0, 1, 0]",10057_1_ZN1M7487,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10057_1_ZN1M7487/2018-08-15/1.3.12.2.1107.5.1.4.60120.30000018041114364546500011411/DICOM,axial
1027,10057_1_UU92H1J4,2022-03-12,1.3.12.2.1107.5.1.4.73938.30050022021915380709700001369,A_C6666P8N,3.0,Br46f,"[1, 0, 0, 0, 1, 0]",10057_1_UU92H1J4,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10057_1_UU92H1J4/2022-03-12/1.3.12.2.1107.5.1.4.73938.30050022021915380709700001369/DICOM,axial
1033,10057_1_UU92H1J4,2022-03-12,CT_Full_(thin)_Chest_at_TLC_Supine--1.3.12.2.1107.5.1.4.73938.30050022021915380709700001466,A_C6666P8N,1.0,Br46f,"[1, 0, 0, 0, 1, 0]",10057_1_UU92H1J4,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10057_1_UU92H1J4/2022-03-12/CT_Full_(thin)_Chest_at_TLC_Supine--1.3.12.2.1107.5.1.4.73938.30050022021915380709700001466/DICOM,axial
1035,10057_1_85T8773T,2023-11-28,CT_Partial_(thin)_Chest_at_TLC_Supine_Low_Dose--1.3.12.2.1107.5.1.4.73443.30000022112215432129000028111,A_778P105L,1.0,Br46f,"[1, 0, 0, 0, 1, 0]",10057_1_85T8773T,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10057_1_85T8773T/2023-11-28/CT_Partial_(thin)_Chest_at_TLC_Supine_Low_Dose--1.3.12.2.1107.5.1.4.73443.30000022112215432129000028111/DICOM,axial


In [64]:
unique_pid_count = filtered_df['pid'].nunique()
unique_pid_count

131

In [65]:
filtered_df = filtered_df.sort_values(by=['pid', 'timepoint', 'SliceThickness'], ascending=[True, True, True])

# Drop duplicates by keeping the first occurrence of each combination of 'pid' and 'timepoint' (which has the lowest SliceThickness)
filtered_df = filtered_df.drop_duplicates(subset=['pid', 'timepoint'], keep='first')


In [66]:
filtered_df

Unnamed: 0,pid,timepoint,scan,AccessionNumber,SliceThickness,ConvolutionKernel,ImageOrientationPatient,PatientID,Directory,Orientation
827,10056_1_2D6033B8,2021-11-07,1.3.12.2.1107.5.1.4.73938.30050020062815422282900025937,A_64VM3K16,1.0,Br46f,"[1, 0, 0, 0, 1, 0]",10056_1_2D6033B8,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10056_1_2D6033B8/2021-11-07/1.3.12.2.1107.5.1.4.73938.30050020062815422282900025937/DICOM,axial
943,10056_1_327YHR52,2020-09-04,1.3.12.2.1107.5.1.4.73938.30050020071015435508200016834,A_0OTOMW6X,1.0,Br46f,"[1, 0, 0, 0, 1, 0]",10056_1_327YHR52,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10056_1_327YHR52/2020-09-04/1.3.12.2.1107.5.1.4.73938.30050020071015435508200016834/DICOM,axial
365,10056_1_35287EBF,2017-11-09,1.3.12.2.1107.5.1.4.55034.30000017070611472115600021389,A_T7616624,1.0,B45f,"[1, 0, 0, 0, 1, 0]",10056_1_35287EBF,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10056_1_35287EBF/2017-11-09/1.3.12.2.1107.5.1.4.55034.30000017070611472115600021389/DICOM,axial
561,10056_1_3OQZM43U,2017-07-21,CT_Chest_at_TLC_Supine_Post_Contrast--1.3.12.2.1107.5.1.4.60120.30000016120215164413600015053,A_K9B244N8,1.0,B45f,"[1, 0, 0, 0, 1, 0]",10056_1_3OQZM43U,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10056_1_3OQZM43U/2017-07-21/CT_Chest_at_TLC_Supine_Post_Contrast--1.3.12.2.1107.5.1.4.60120.30000016120215164413600015053/DICOM,axial
324,10056_1_4234Y2Y8,2021-05-10,1.3.12.2.1107.5.1.4.75487.30000021032914311465900014774,A_691JCELP,1.0,"['Br49d', '3']","[1, 0, 0, 0, 1, 0]",10056_1_4234Y2Y8,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10056_1_4234Y2Y8/2021-05-10/1.3.12.2.1107.5.1.4.75487.30000021032914311465900014774/DICOM,axial
...,...,...,...,...,...,...,...,...,...,...
299,10064_1_XU847S6B,2015-08-14,1.3.12.2.1107.5.1.4.65225.30000015052916523008800000526,A_78413Q71,1.0,B45f,"[1, 0, 0, 0, 1, 0]",10064_1_XU847S6B,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_XU847S6B/2015-08-14/1.3.12.2.1107.5.1.4.65225.30000015052916523008800000526/DICOM,axial
179,10064_1_Y58EADQ2,2023-08-21,CT_Full_(thin)_Chest_at_TLC_Supine--1.3.12.2.1107.5.1.4.66859.30050022020714325080400019674,A_E1L9A1K8,1.0,Br46f,"[1, 0, 0, 0, 1, 0]",10064_1_Y58EADQ2,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_Y58EADQ2/2023-08-21/CT_Full_(thin)_Chest_at_TLC_Supine--1.3.12.2.1107.5.1.4.66859.30050022020714325080400019674/DICOM,axial
636,10064_1_Y6357FFF,2022-07-10,CT_Full_(thin)_Chest_at_TLC_Supine--1.3.12.2.1107.5.1.4.64297.30000021061315172546300013646,A_WU680PX8,1.0,B45f,"[1, 0, 0, 0, 1, 0]",10064_1_Y6357FFF,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_Y6357FFF/2022-07-10/CT_Full_(thin)_Chest_at_TLC_Supine--1.3.12.2.1107.5.1.4.64297.30000021061315172546300013646/DICOM,axial
104,10064_1_YRU26VP4,2021-10-27,CT_Full_(thin)_Chest_at_TLC_Supine--1.3.12.2.1107.5.1.4.51964.30000021040715374392100001703,A_2344T37V,1.0,B46f,"[1, 0, 0, 0, 1, 0]",10064_1_YRU26VP4,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_YRU26VP4/2021-10-27/CT_Full_(thin)_Chest_at_TLC_Supine--1.3.12.2.1107.5.1.4.51964.30000021040715374392100001703/DICOM,axial


In [67]:
duplicates = filtered_df.duplicated(subset=['pid', 'timepoint'])
if duplicates.any():
    print(filtered_df[duplicates])
else:
    print("All rows have unique combinations of pid and timepoint.")

All rows have unique combinations of pid and timepoint.


## Save all cases and only the first timepoint 

In [68]:
CECT_OUT = '/workspace/home/tengyuezhang/sybil_cect/data/ucla_cect'

In [69]:
filtered_df.to_csv(os.path.join(CECT_OUT, 'ucla_cect_cases.csv'), index=False)

In [70]:
baseline_df = filtered_df.sort_values(by=['pid', 'timepoint'])
baseline_df = baseline_df.drop_duplicates(subset=['pid'], keep='first')
baseline_df

Unnamed: 0,pid,timepoint,scan,AccessionNumber,SliceThickness,ConvolutionKernel,ImageOrientationPatient,PatientID,Directory,Orientation
827,10056_1_2D6033B8,2021-11-07,1.3.12.2.1107.5.1.4.73938.30050020062815422282900025937,A_64VM3K16,1.0,Br46f,"[1, 0, 0, 0, 1, 0]",10056_1_2D6033B8,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10056_1_2D6033B8/2021-11-07/1.3.12.2.1107.5.1.4.73938.30050020062815422282900025937/DICOM,axial
943,10056_1_327YHR52,2020-09-04,1.3.12.2.1107.5.1.4.73938.30050020071015435508200016834,A_0OTOMW6X,1.0,Br46f,"[1, 0, 0, 0, 1, 0]",10056_1_327YHR52,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10056_1_327YHR52/2020-09-04/1.3.12.2.1107.5.1.4.73938.30050020071015435508200016834/DICOM,axial
365,10056_1_35287EBF,2017-11-09,1.3.12.2.1107.5.1.4.55034.30000017070611472115600021389,A_T7616624,1.0,B45f,"[1, 0, 0, 0, 1, 0]",10056_1_35287EBF,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10056_1_35287EBF/2017-11-09/1.3.12.2.1107.5.1.4.55034.30000017070611472115600021389/DICOM,axial
561,10056_1_3OQZM43U,2017-07-21,CT_Chest_at_TLC_Supine_Post_Contrast--1.3.12.2.1107.5.1.4.60120.30000016120215164413600015053,A_K9B244N8,1.0,B45f,"[1, 0, 0, 0, 1, 0]",10056_1_3OQZM43U,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10056_1_3OQZM43U/2017-07-21/CT_Chest_at_TLC_Supine_Post_Contrast--1.3.12.2.1107.5.1.4.60120.30000016120215164413600015053/DICOM,axial
324,10056_1_4234Y2Y8,2021-05-10,1.3.12.2.1107.5.1.4.75487.30000021032914311465900014774,A_691JCELP,1.0,"['Br49d', '3']","[1, 0, 0, 0, 1, 0]",10056_1_4234Y2Y8,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10056_1_4234Y2Y8/2021-05-10/1.3.12.2.1107.5.1.4.75487.30000021032914311465900014774/DICOM,axial
...,...,...,...,...,...,...,...,...,...,...
299,10064_1_XU847S6B,2015-08-14,1.3.12.2.1107.5.1.4.65225.30000015052916523008800000526,A_78413Q71,1.0,B45f,"[1, 0, 0, 0, 1, 0]",10064_1_XU847S6B,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_XU847S6B/2015-08-14/1.3.12.2.1107.5.1.4.65225.30000015052916523008800000526/DICOM,axial
179,10064_1_Y58EADQ2,2023-08-21,CT_Full_(thin)_Chest_at_TLC_Supine--1.3.12.2.1107.5.1.4.66859.30050022020714325080400019674,A_E1L9A1K8,1.0,Br46f,"[1, 0, 0, 0, 1, 0]",10064_1_Y58EADQ2,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_Y58EADQ2/2023-08-21/CT_Full_(thin)_Chest_at_TLC_Supine--1.3.12.2.1107.5.1.4.66859.30050022020714325080400019674/DICOM,axial
636,10064_1_Y6357FFF,2022-07-10,CT_Full_(thin)_Chest_at_TLC_Supine--1.3.12.2.1107.5.1.4.64297.30000021061315172546300013646,A_WU680PX8,1.0,B45f,"[1, 0, 0, 0, 1, 0]",10064_1_Y6357FFF,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_Y6357FFF/2022-07-10/CT_Full_(thin)_Chest_at_TLC_Supine--1.3.12.2.1107.5.1.4.64297.30000021061315172546300013646/DICOM,axial
104,10064_1_YRU26VP4,2021-10-27,CT_Full_(thin)_Chest_at_TLC_Supine--1.3.12.2.1107.5.1.4.51964.30000021040715374392100001703,A_2344T37V,1.0,B46f,"[1, 0, 0, 0, 1, 0]",10064_1_YRU26VP4,/workspace/radraid/whsu/idx_data_request_79/Images_and_Annotations/Contrast-Enhanced_CT/10064_1_YRU26VP4/2021-10-27/CT_Full_(thin)_Chest_at_TLC_Supine--1.3.12.2.1107.5.1.4.51964.30000021040715374392100001703/DICOM,axial


In [71]:
print(baseline_df)

                  pid   timepoint  \
827  10056_1_2D6033B8  2021-11-07   
943  10056_1_327YHR52  2020-09-04   
365  10056_1_35287EBF  2017-11-09   
561  10056_1_3OQZM43U  2017-07-21   
324  10056_1_4234Y2Y8  2021-05-10   
..                ...         ...   
299  10064_1_XU847S6B  2015-08-14   
179  10064_1_Y58EADQ2  2023-08-21   
636  10064_1_Y6357FFF  2022-07-10   
104  10064_1_YRU26VP4  2021-10-27   
973  10064_1_ZP6X0LZT  2021-08-26   

                                                                                              scan  \
827                                        1.3.12.2.1107.5.1.4.73938.30050020062815422282900025937   
943                                        1.3.12.2.1107.5.1.4.73938.30050020071015435508200016834   
365                                        1.3.12.2.1107.5.1.4.55034.30000017070611472115600021389   
561  CT_Chest_at_TLC_Supine_Post_Contrast--1.3.12.2.1107.5.1.4.60120.30000016120215164413600015053   
324                                        1.

In [72]:
baseline_df.to_csv(os.path.join(CECT_OUT, 'ucla_cect_baseline_cases.csv'), index=False)