Adapted from : https://www.kaggle.com/code/theoviel/dicom-resized-png-jpg

**Dataset Links :**
- Part 1 : https://www.kaggle.com/datasets/theoviel/rsna-abdominal-trauma-detection-png-pt1
- Part 2 : https://www.kaggle.com/datasets/theoviel/rsna-abdominal-trauma-detection-png-pt2
- Part 3 : https://www.kaggle.com/datasets/theoviel/rsna-2023-abdominal-trauma-detection-pngs-3-8
- Part 4 : https://www.kaggle.com/datasets/theoviel/rsna-abdominal-trauma-detection-png-pt4
- Part 5 : https://www.kaggle.com/datasets/theoviel/rsna-abdominal-trauma-detection-png-pt5
- Part 6 : https://www.kaggle.com/datasets/theoviel/rsna-abdominal-trauma-detection-png-pt6
- Part 7 : https://www.kaggle.com/datasets/theoviel/rsna-abdominal-trauma-detection-pngs-pt7
- Part 8 : https://www.kaggle.com/datasets/theoviel/rsna-2023-abdominal-trauma-detection-pngs-18

**Changes :**
- Apply `standardize_pixel_array` function
- Update links
- Remove `apply_voi_lut`
- Add rescaling, thanks @sukharev !

**TODO :**
- Dicom processing on GPU
- Figure out why example dicom is too dark

In [1]:
!pip install -qU python-gdcm pydicom pylibjpeg

[31mERROR: Could not install packages due to an OSError: [Errno 28] No space left on device: '/tmp/pip-ephem-wheel-cache-ense2buy'
[0m[31m
[0m

In [1]:
import os
import cv2
import glob
import gdcm
import pydicom
import zipfile
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from joblib import Parallel, delayed
from pydicom.pixel_data_handlers.util import apply_voi_lut

In [3]:
def _first_window_value(value) -> float:
    """Return *value* as a float, using its first entry when it is multi-valued."""
    from collections.abc import Sequence

    # A DICOM file may list several windows for WindowCenter / WindowWidth;
    # pydicom then exposes the element as a sequence rather than a single
    # number. A str is also a Sequence but encodes one number, so it is
    # converted directly.
    if isinstance(value, Sequence) and not isinstance(value, (str, bytes)):
        value = value[0]
    return float(value)


def standardize_pixel_array(dcm: "pydicom.dataset.FileDataset") -> np.ndarray:
    """
    Rescale a DICOM pixel array and clip it to the file's display window.

    Source : https://www.kaggle.com/competitions/rsna-2023-abdominal-trauma-detection/discussion/427217

    Args:
        dcm: Dataset providing `pixel_array`, `PixelRepresentation`,
            `BitsAllocated`, `BitsStored`, `RescaleIntercept`, `RescaleSlope`,
            `WindowCenter` and `WindowWidth`.

    Returns:
        Float array equal to `pixel_array * slope + intercept`, clipped to
        `[center - width / 2, center + width / 2]`. When the window tags hold
        several values, the first window is used.
    """
    # Correct DICOM pixel_array if PixelRepresentation == 1.
    pixel_array = dcm.pixel_array
    if dcm.PixelRepresentation == 1:
        # Shift the stored bits up to the top of the allocated word and back
        # down again; the cast in between keeps the original dtype so that the
        # right shift operates on that dtype.
        bit_shift = dcm.BitsAllocated - dcm.BitsStored
        dtype = pixel_array.dtype
        pixel_array = (pixel_array << bit_shift).astype(dtype) >> bit_shift

    intercept = float(dcm.RescaleIntercept)
    slope = float(dcm.RescaleSlope)
    # float (not int) so that fractional window values are not truncated.
    center = _first_window_value(dcm.WindowCenter)
    width = _first_window_value(dcm.WindowWidth)
    low = center - width / 2
    high = center + width / 2

    pixel_array = (pixel_array * slope) + intercept
    pixel_array = np.clip(pixel_array, low, high)

    return pixel_array

In [6]:
# Root folder of the competition data, relative to this notebook.
BASEDIR = '../../rsna-2023-abdominal-trauma-detection'

# Both split paths keep their trailing slash: later cells build file paths
# by plain string concatenation on top of them.
TRAIN_PATH, TEST_PATH = (
    os.path.join(BASEDIR, split) for split in ('train_images/', 'test_images/')
)

# Each entry of a split directory is counted as one patient.
print('Number of training patients :', len(os.listdir(TRAIN_PATH)))
print('Number of test patients :', len(os.listdir(TEST_PATH)))

Number of training patients : 3147
Number of test patients : 3


In [9]:
from IPython.display import clear_output, display
import time


# Preview: for every study, display every 10th DICOM file (in sorted filename
# order), ordered by z position. The loop walks the whole training folder, so
# interrupt the kernel to stop it.
for patient in sorted(os.listdir(TRAIN_PATH)):
    for study in os.listdir(TRAIN_PATH + patient):
        sampled_files = sorted(glob.glob(TRAIN_PATH + f"{patient}/{study}/*.dcm"))[::10]

        # z position -> normalised image
        imgs = {}
        for f in sampled_files:
            dicom = pydicom.dcmread(f)

            # Last component of tag (0020,0032), used to retrieve the order of frames.
            pos_z = dicom[(0x20, 0x32)].value[-1]

            img = standardize_pixel_array(dicom)
            lowest, highest = img.min(), img.max()
            # Min-max scaling; the epsilon guards against a constant image.
            img = (img - lowest) / (highest - lowest + 1e-6)

            # MONOCHROME1 images are inverted.
            imgs[pos_z] = 1 - img if dicom.PhotometricInterpretation == "MONOCHROME1" else img

        for i, k in enumerate(sorted(imgs)):
            plt.figure(figsize=(5, 5))
            plt.imshow(imgs[k], cmap="gray")
            plt.title(f"Patient {patient} - Study {study} - Frame {i}/{len(imgs)}")
            plt.axis(False)
            plt.show()
            # wait=True delays clearing until the next output arrives.
            clear_output(wait=True)
            time.sleep(0.01)


KeyboardInterrupt: 

### Save the processed data

In [25]:
def process(patient, size=512, save_folder="", data_path=""):
    """
    Convert every DICOM slice of one patient to an 8-bit PNG.

    For each study folder of the patient, all `*.dcm` files are read, passed
    through `standardize_pixel_array`, min-max scaled to [0, 1], inverted when
    the photometric interpretation is MONOCHROME1, and ordered by the last
    component of tag (0020,0032). Slices sharing the same z value overwrite
    each other, so only the last one read is kept.

    Args:
        patient: Name of the patient folder inside `data_path`.
        size: Side length of the square output image, or None to keep the
            original resolution.
        save_folder: Either a directory path (str), or an object with a
            `writestr(name, data)` method such as an open `zipfile.ZipFile`.
        data_path: Folder that contains the patient folders.

    Raises:
        OSError: If an image could not be written to `save_folder` on disk.

    Output files are named `{patient}_{study}_{index:04d}.png` for both kinds
    of destination, so that lexicographic file order matches slice order.
    """
    to_disk = isinstance(save_folder, str)
    if to_disk and save_folder:
        # cv2.imwrite does not create missing directories.
        os.makedirs(save_folder, exist_ok=True)

    patient_dir = os.path.join(data_path, patient)
    for study in sorted(os.listdir(patient_dir)):
        imgs = {}
        # os.path.join keeps this pattern consistent with the listing above,
        # whether or not data_path ends with a separator.
        for f in sorted(glob.glob(os.path.join(patient_dir, study, "*.dcm"))):
            dicom = pydicom.dcmread(f)

            pos_z = dicom[(0x20, 0x32)].value[-1]  # to retrieve the order of frames

            img = standardize_pixel_array(dicom)
            # Min-max scaling; the epsilon guards against a constant image.
            img = (img - img.min()) / (img.max() - img.min() + 1e-6)

            if dicom.PhotometricInterpretation == "MONOCHROME1":
                img = 1 - img

            imgs[pos_z] = img

        for i, k in enumerate(sorted(imgs)):
            img = imgs[k]

            if size is not None:
                img = cv2.resize(img, (size, size))

            img = (img * 255).astype(np.uint8)
            name = f"{patient}_{study}_{i:04d}.png"

            if to_disk:
                path = os.path.join(save_folder, name)
                # imwrite signals failure through its return value only.
                if not cv2.imwrite(path, img):
                    raise OSError(f"Could not write image to {path}")
            else:
                save_folder.writestr(name, cv2.imencode('.png', img)[1])

In [26]:
# os.listdir returns entries in arbitrary order; sorting makes the content of
# every chunk reproducible from one run to the next.
patients = sorted(os.listdir(TRAIN_PATH))

# Chunking: eight consecutive slices starting at 0, 400, ..., 2800. The first
# seven hold up to 400 patients each and the last one takes everything left.
chunk_starts = range(0, 2800 + 1, 400)
chunk_stops = [*chunk_starts[1:], None]
patients = tuple(
    patients[start:stop] for start, stop in zip(chunk_starts, chunk_stops)
)

# patients = patients[0]  # [:10]  # subsample

In [27]:
# Alternative destination: open zipfile.ZipFile("output.zip", 'w') in a `with`
# block and pass that object as save_folder to write into a single archive.
save_folder = "./rsna-2023-png/train_images/"

# cv2.imwrite does not create missing directories and only reports failure
# through its return value, so make sure the destination exists up front.
os.makedirs(save_folder, exist_ok=True)

# `patients` is a tuple of chunks; every patient of every chunk is converted
# at its original resolution (size=None).
for ppp in patients:
    for patient in tqdm(ppp):
        process(patient, size=None, save_folder=save_folder, data_path=TRAIN_PATH)

 18%|█▊        | 72/400 [05:49<26:33,  4.86s/it]


KeyboardInterrupt: 

Done ! 