In [None]:
!pip install pydicom opencv-python

In [None]:
import os
import glob
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import pydicom
import cv2
#from joblib import Parallel, delayed

In [None]:
# Directory whose entries are globbed and read as DICOM files by the processing loop
ROOT = "/data3/wv2019/data/PLIC_CHIESA_DICOM"
# Base output directory; the processing loop creates one sub-folder per patient ID here
ROOT_OUT = "/data3/wv2019/data/processed/PLIC_CHIESA_DICOM"
# expected image dimensions: only files whose Rows/Columns match these are processed
rows = 576
cols = 640

In [None]:
#Input > DCM file
#Output > Image (np matrix)

def metacrop2(file):
    """Crop a DICOM image to the bounds of its first ultrasound region.

    The crop rectangle is read from the first item of the dataset's
    ``SequenceOfUltrasoundRegions`` (``RegionLocationMinX0``/``MaxX1`` for
    columns, ``RegionLocationMinY0``/``MaxY1`` for rows). Both max bounds are
    treated as inclusive.

    Parameters
    ----------
    file : object
        Dataset-like object exposing ``SequenceOfUltrasoundRegions`` and
        ``pixel_array`` attributes (e.g. the object returned by pydicom's reader).

    Returns
    -------
    numpy.ndarray
        ``file.pixel_array[y0:y1+1, x0:x1+1]``.

    Raises
    ------
    ValueError
        If the dataset has no ultrasound region, or the first region lacks
        one of the four location attributes.
    """
    regions = getattr(file, "SequenceOfUltrasoundRegions", None)
    if regions is None or len(regions) == 0:
        raise ValueError(
            "dataset has no SequenceOfUltrasoundRegions; cannot determine crop bounds"
        )
    # only the first region is used, even if several are present
    region = regions[0]

    bounds = []
    for name in (
        "RegionLocationMinX0",
        "RegionLocationMaxX1",
        "RegionLocationMinY0",
        "RegionLocationMaxY1",
    ):
        coord = getattr(region, name, None)
        if coord is None:
            raise ValueError(f"first ultrasound region is missing {name}")
        bounds.append(coord)
    x0, x1, y0, y1 = bounds

    # +1 because the max coordinates are included in the crop
    return file.pixel_array[y0:y1 + 1, x0:x1 + 1]

In [None]:
# define callbacks
def person_names_callback(dataset, data_element):
    """Walk callback: overwrite every person-name element with "anonymous".

    Elements whose value representation (``VR``) is not ``"PN"`` are left
    untouched. ``dataset`` is accepted to match the callback signature but is
    not used.
    """
    is_person_name = data_element.VR == "PN"
    if not is_person_name:
        return
    data_element.value = "anonymous"
        
def curves_callback(dataset, data_element):
    """Walk callback: delete elements whose tag group is in 0x5000-0x50FF.

    The group number is masked with 0xFF00 so that only its high byte is
    compared; any element in a matching group is removed from ``dataset``.
    (Presumably these are the DICOM curve-data groups, per the function name.)
    """
    tag = data_element.tag
    group_high_byte = tag.group & 0xFF00
    if group_high_byte != 0x5000:
        return
    del dataset[tag]

# Keyword of the "type 2" element that the processing loop blanks (sets to '')
# when it is present in a dataset.
t2tag = 'PatientBirthDate'

In [None]:
### TODO
# - extend to whole set of images
# - remove filter based on image size
# - keep Doppler mode
# - parallelize code

In [None]:
# Anonymize, crop and organize DICOMs by patient ID.
# Starts with a subset of images: the "*" pattern only matches entries directly
# under ROOT. NOTE: `recursive=True` has no effect unless the pattern contains
# "**" (e.g. os.path.join(ROOT, "**", "*")) -- see TODO "extend to whole set".
for fn in glob.glob(os.path.join(ROOT, "*"), recursive=True):
    # glob also returns sub-directories, which cannot be read as DICOM
    if not os.path.isfile(fn):
        continue
    # dcmread replaces the read_file alias removed in pydicom 3.0
    ds = pydicom.dcmread(fn)
    # check image size
    if ds.Rows == rows and ds.Columns == cols:
        # clear private data
        ds.walk(person_names_callback)
        ds.walk(curves_callback)
        # address/sanitize patient IDs (remove spaces and non alphanum characters)
        # raw string: "\s" in a plain literal is an invalid escape sequence
        pid = re.sub(r"\s+", "_", ds.data_element('PatientID').value.strip().replace("PLICC", "PLIC"))
        ds.data_element('PatientID').value = re.sub(r'\W+', '', pid)
        # type 2 tags
        if t2tag in ds:
            ds.data_element(t2tag).value = ''

        patientID = ds.PatientID
        dicomName = os.path.basename(fn).strip().replace(" ", "_")

        # throw away image header with private data
        pdata = metacrop2(ds)
        # accept grayscale (rows, cols) as well as colour (rows, cols, channels);
        # anything else (e.g. multi-frame colour) is not handled by this crop
        if pdata.ndim not in (2, 3):
            raise ValueError(f"unsupported pixel array shape {pdata.shape} in {fn}")
        ds.Rows, ds.Columns = pdata.shape[:2]
        # NOTE(review): if the source uses a compressed transfer syntax, the
        # TransferSyntaxUID probably needs updating as well -- confirm.
        ds.PixelData = pdata.tobytes()

        out_dir = os.path.join(ROOT_OUT, str(patientID))
        os.makedirs(out_dir, exist_ok=True)
        out_dicom = os.path.join(out_dir, f"{dicomName}.dcm")

        # write DICOM Standard compliant file (existing outputs are never overwritten)
        if not os.path.isfile(out_dicom):
            # dcmwrite replaces filewriter.write_file, removed in pydicom 3.0
            pydicom.dcmwrite(out_dicom, ds, write_like_original=False)
            
            
#         write PNG for non-doppler images
#         NOTE: not needed at first, let's keep also Doppler mode
#         if len(ds.SequenceOfUltrasoundRegions) == 1 :
#             if not os.path.isfile(out_png):
#                 cv2.imwrite(out_png, ds.pixel_array)

In [116]:
# Debug run of the anonymize/crop pipeline on the first readable file only:
# prints the cropped pixel array and does NOT write the DICOM to disk.
# The "*" pattern only matches entries directly under ROOT. NOTE: `recursive=True`
# has no effect unless the pattern contains "**".
for fn in glob.glob(os.path.join(ROOT, "*"), recursive=True):
    # glob also returns sub-directories, which cannot be read as DICOM
    if not os.path.isfile(fn):
        continue
    # dcmread replaces the read_file alias removed in pydicom 3.0
    ds = pydicom.dcmread(fn)
    # check image size
    if ds.Rows == rows and ds.Columns == cols:
        # clear private data
        ds.walk(person_names_callback)
        ds.walk(curves_callback)
        # address/sanitize patient IDs (remove spaces and non alphanum characters)
        # raw string: "\s" in a plain literal is an invalid escape sequence
        pid = re.sub(r"\s+", "_", ds.data_element('PatientID').value.strip().replace("PLICC", "PLIC"))
        ds.data_element('PatientID').value = re.sub(r'\W+', '', pid)
        # type 2 tags
        if t2tag in ds:
            ds.data_element(t2tag).value = ''

        patientID = ds.PatientID
        dicomName = os.path.basename(fn).strip().replace(" ", "_")

        # throw away image header with private data
        pdata = metacrop2(ds)
        # accept grayscale (rows, cols) as well as colour (rows, cols, channels);
        # anything else (e.g. multi-frame colour) is not handled by this crop
        if pdata.ndim not in (2, 3):
            raise ValueError(f"unsupported pixel array shape {pdata.shape} in {fn}")
        ds.Rows, ds.Columns = pdata.shape[:2]
        ds.PixelData = pdata.tobytes()

        # inspect the cropped pixels (numpy abbreviates large arrays)
        print(pdata)

        # the output folder is still created even though writing is disabled below
        out_dir = os.path.join(ROOT_OUT, str(patientID))
        os.makedirs(out_dir, exist_ok=True)
        out_dicom = os.path.join(out_dir, f"{dicomName}.dcm")

        # write DICOM Standard compliant file
        #if not os.path.isfile(out_dicom):
        #    pydicom.filewriter.write_file(out_dicom, ds, write_like_original=False)
            
            
#         write PNG for non-doppler images
#         NOTE: not needed at first, let's keep also Doppler mode
#         if len(ds.SequenceOfUltrasoundRegions) == 1 :
#             if not os.path.isfile(out_png):
#                 cv2.imwrite(out_png, ds.pixel_array)
    # stop after the first file, whether or not it matched the size filter
    break

[[[192 192 192]
  [192 192 192]
  [192 192 192]
  ...
  [  0   0   0]
  [  0   0   0]
  [  0   0   0]]

 [[192 192 192]
  [192 192 192]
  [192 192 192]
  ...
  [  0   0   0]
  [  0   0   0]
  [  0   0   0]]

 [[  0   0   0]
  [ 22  22  22]
  [ 22  22  22]
  ...
  [  0   0   0]
  [  0   0   0]
  [  0   0   0]]

 ...

 [[  0   0   0]
  [  0   0   0]
  [  0   0   0]
  ...
  [  0   0   0]
  [  0   0   0]
  [  0   0   0]]

 [[  0   0   0]
  [  0   0   0]
  [  0   0   0]
  ...
  [  0   0   0]
  [  0   0   0]
  [  0   0   0]]

 [[  0   0   0]
  [  0   0   0]
  [  0   0   0]
  ...
  [  0   0   0]
  [  0   0   0]
  [  0   0   0]]]
