In [16]:
import importlib
from src import extract
from pathlib import Path
import os 
import zipfile

# Re-execute src.extract in place so that edits made to the module file after
# its first import take effect without restarting the kernel.
importlib.reload(extract)  # reloads the module

<module 'src.extract' from '/workspace/spine-score/src/extract.py'>

In [2]:
# Called with no arguments, so extract_tsv_files() falls back to its own
# default input location.
# NOTE(review): the previous comment named /workspaces/data, while every other
# path in this notebook lives under /workspace/data — confirm the real default.
df = extract.extract_tsv_files()

Combined DataFrame shape: (25389, 79)


In [None]:
# Show every column name of the combined DataFrame `df`.
print(df.columns.tolist())

['type', 'submitter_id', 'md5sum', 'file_size', 'file_name', 'data_type', 'data_format', 'data_category', 'annotation_name', 'annotation_method', 'cases.submitter_id', 'case_ids', 'mr_series_files.submitter_id', 'imaging_studies.submitter_id', 'urls', 'datasets', 'age_at_index', 'age_at_index_gt89', 'country_of_residence', 'covid19_positive', 'ethnicity', 'gen3_linked_subjects_available', 'icu_indicator', 'index_event', 'linked_external_data', 'long_covid_diagnosis', 'race', 'sex', 'site_id', 'token_record_id', 'ventilator_indicator', 'treatment_info', 'zip', 'acl', 'storage_urls', 'study_uid', 'series_uid', 'instance_uid', 'modality', 'datasets.submitter_id', 'body_part_examined', 'days_to_study', 'image_data_modified', 'study_description', 'study_modality', 'study_year', 'study_year_shifted', 'age_at_imaging', 'age_at_imaging_gt89', 'angio_flag', 'contrast_bolus_agent', 'diffusion_b_value', 'diffusion_gradient_orientation', 'echo_number', 'echo_train_length', 'echo_time', 'image_type

In [15]:
# NOTE: despite its name, zip_path is a filename *prefix*, not one archive; the
# recorded output of this cell lists DukeCSpineSeg_segmentation.zip,
# DukeCSpineSeg_imaging_files.zip and DukeCSpineSeg_annotation.zip.
zip_path = "/workspace/data/DukeCSpineSeg_"
# Directory handed to extract_data as the second argument; later cells read
# their inputs from below this folder.
interim_dir = Path("/workspace/data/interim")

extract.extract_data(zip_path, interim_dir)

Files in /workspace/data/DukeCSpineSeg_segmentation.zip extracted.
Files in /workspace/data/DukeCSpineSeg_imaging_files.zip extracted.
Files in /workspace/data/DukeCSpineSeg_annotation.zip extracted.


In [17]:
# Unpack every ZIP archive found anywhere below root_dir, writing the contents
# into the same folder that holds the archive.
root_dir = "/workspace/data/interim/imaging_files/case_image"

for dirpath, dirnames, filenames in os.walk(root_dir):
    # Restrict this directory's entries to names ending in ".zip".
    for file in (name for name in filenames if name.endswith(".zip")):
        zip_path = os.path.join(dirpath, file)
        extract_to = dirpath  # in-place extraction, next to the archive
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(path=extract_to)
        print(f"Extracted {zip_path} to {extract_to}")

Extracted /workspace/data/interim/imaging_files/case_image/593973-000001/1.2.826.0.1.3680043.10.474.593973.19/1.2.826.0.1.3680043.10.474.593973.20.zip to /workspace/data/interim/imaging_files/case_image/593973-000001/1.2.826.0.1.3680043.10.474.593973.19
Extracted /workspace/data/interim/imaging_files/case_image/593973-000001/1.2.826.0.1.3680043.10.474.593973.2/1.2.826.0.1.3680043.10.474.593973.3.zip to /workspace/data/interim/imaging_files/case_image/593973-000001/1.2.826.0.1.3680043.10.474.593973.2
Extracted /workspace/data/interim/imaging_files/case_image/593973-000003/1.2.826.0.1.3680043.10.474.593973.36/1.2.826.0.1.3680043.10.474.593973.37.zip to /workspace/data/interim/imaging_files/case_image/593973-000003/1.2.826.0.1.3680043.10.474.593973.36
Extracted /workspace/data/interim/imaging_files/case_image/593973-000004/1.2.826.0.1.3680043.10.474.593973.53/1.2.826.0.1.3680043.10.474.593973.54.zip to /workspace/data/interim/imaging_files/case_image/593973-000004/1.2.826.0.1.3680043.10.4

In [24]:
import glob

# DICOM root after extraction
dicom_root = "/workspace/data/interim/imaging_files/case_image"

# Segmentation root
mask_root = "/workspace/data/interim/segmentation"


def map_cases_to_dicom_dirs(root):
    """Map each first-level folder name under ``root`` to one DICOM folder.

    For every directory directly below ``root``, all nested folders that
    contain at least one ``*.dcm`` entry are collected with a single recursive
    glob, and the lexicographically first one is kept. Sorting makes the choice
    reproducible; plain ``glob`` order depends on the filesystem.

    Parameters
    ----------
    root : str
        Directory whose immediate sub-directories are named by case id.

    Returns
    -------
    dict
        ``{case_id: dicom_folder}``; cases without any ``*.dcm`` are omitted.
    """
    case_to_dir = {}
    for case_dir in sorted(glob.glob(os.path.join(root, "*"))):
        if not os.path.isdir(case_dir):
            # Stray files at the first level are not cases.
            continue
        dcm_hits = glob.glob(os.path.join(case_dir, "**", "*.dcm"), recursive=True)
        dcm_dirs = sorted({os.path.dirname(hit) for hit in dcm_hits})
        if dcm_dirs:
            case_to_dir[os.path.basename(case_dir)] = dcm_dirs[0]
    return case_to_dir


def map_cases_to_masks(root):
    """Map a case id to a ``*_SEG.nii.gz`` file found directly in ``root``.

    The case id is the part of the file name before the first underscore. When
    several masks share a case id, the lexicographically last path wins; the
    sorted iteration makes that outcome reproducible.

    Parameters
    ----------
    root : str
        Directory holding the segmentation files.

    Returns
    -------
    dict
        ``{case_id: mask_path}``.
    """
    case_to_mask = {}
    for mask_path in sorted(glob.glob(os.path.join(root, "*_SEG.nii.gz"))):
        case_id = os.path.basename(mask_path).split("_")[0]
        case_to_mask[case_id] = mask_path
    return case_to_mask


# Map study_id -> DICOM folder
dicom_map = map_cases_to_dicom_dirs(dicom_root)

# Map study_id -> mask file
mask_map = map_cases_to_masks(mask_root)

# Only keep the studies that have both DICOMs and masks
# NOTE(review): a case can hold more than one series (the extraction log lists
# two for 593973-000001) and mask names carry Study/Series tags; pairing on the
# case id alone does not guarantee the chosen series is the one the mask
# belongs to — confirm against the dataset metadata.
dataset_entries = [(dicom_map[s], mask_map[s]) for s in dicom_map if s in mask_map]

print(f"Found {len(dataset_entries)} paired DICOMs and masks")


Found 1231 paired DICOMs and masks


In [27]:
# Peek at the first (DICOM folder, mask file) pair to sanity-check the pairing.
print(dataset_entries[0])

('/workspace/data/interim/imaging_files/case_image/593973-000001/1.2.826.0.1.3680043.10.474.593973.19/1.2.826.0.1.3680043.10.474.593973.20', '/workspace/data/interim/segmentation/593973-000001_Study-MR-1_Series-22_SEG.nii.gz')


In [28]:
import os
import numpy as np
import torch
import pydicom
import nibabel as nib

def load_dicom_volume(dicom_folder):
    """
    Load a DICOM series as a 3D numpy array.

    Parameters
    ----------
    dicom_folder : str or path-like
        Folder whose ``*.dcm`` files together form one series.

    Returns
    -------
    numpy.ndarray
        ``float32`` array with one slice per file stacked along the last axis,
        ordered by each file's ``InstanceNumber``. For single-frame 2D slices
        this is (H, W, D).

    Raises
    ------
    ValueError
        If ``dicom_folder`` contains no ``*.dcm`` files. Without this check the
        failure surfaces as numpy's opaque "need at least one array to stack".
    """
    dicom_paths = sorted(glob.glob(os.path.join(dicom_folder, "*.dcm")))
    if not dicom_paths:
        raise ValueError(f"No DICOM (*.dcm) files found in '{dicom_folder}'")

    slices = [pydicom.dcmread(path) for path in dicom_paths]
    # File names need not follow acquisition order, so order by InstanceNumber.
    # The sort is stable, so ties keep the file-name order established above.
    slices.sort(key=lambda ds: int(ds.InstanceNumber))
    volume = np.stack([ds.pixel_array for ds in slices], axis=-1)
    return volume.astype(np.float32)

def load_mask(mask_file):
    """
    Load a NIfTI mask file as a 3D numpy array.

    Parameters
    ----------
    mask_file : str or path-like
        Path to the NIfTI segmentation file.

    Returns
    -------
    numpy.ndarray
        ``uint8`` label array with the same shape as the stored image data.
    """
    label_values = nib.load(mask_file).get_fdata()
    # get_fdata() yields floating point values; a bare astype(uint8) truncates,
    # turning e.g. 1.9999999 into label 1. Round to the nearest integer first.
    return np.rint(label_values).astype(np.uint8)


In [29]:
# Smoke test on the first (DICOM folder, mask file) pair.
dicom_path, mask_path = dataset_entries[0]

volume = load_dicom_volume(dicom_path)
mask = load_mask(mask_path)

print("Volume shape:", volume.shape)
print("Mask shape:", mask.shape)

# Optional: torch tensors with a leading channel axis, i.e. (1, H, W, D).
volume_tensor = torch.tensor(volume)[None]
mask_tensor = torch.tensor(mask)[None]

Volume shape: (512, 512, 14)
Mask shape: (512, 512, 14)


In [30]:
# Distinct label values present in the mask tensor.
print("Mask unique labels:", mask_tensor.unique())


Mask unique labels: tensor([0, 1, 2], dtype=torch.uint8)


In [25]:
# MRI Python Loading
from src import dicom_dataset

# Build the dataset object from the extracted image tree and the mask folder.
# NOTE(review): image_root is ".../imaging_files", one level above the
# ".../imaging_files/case_image" root used by the pairing cell, and the next
# cell (dataset[0]) is recorded as failing with
# "ValueError: need at least one array to stack". Check which level
# DICOMSegDataset expects.
dataset = dicom_dataset.DICOMSegDataset(
    image_root = "/workspace/data/interim/imaging_files",
    mask_root = "/workspace/data/interim/segmentation"
)

In [26]:
# Pull the first sample from the dataset and report the image / mask shapes.
# NOTE(review): the recorded output of this cell is
# "ValueError: need at least one array to stack".
first_sample = dataset[0]
img, msk = first_sample
print(img.shape, msk.shape)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


ValueError: need at least one array to stack