# Install required packages

In [4]:
!pip install pydicom

Collecting pydicom
  Downloading https://files.pythonhosted.org/packages/53/e6/4cae2b4b2fdbea5e2ddd188361139606d8f10f710ba1abecd6600da099c3/pydicom-1.4.2-py2.py3-none-any.whl (35.3MB)
     |████████████████████████████████| 35.3MB 149kB/s
Installing collected packages: pydicom
Successfully installed pydicom-1.4.2


# Imports

In [10]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

import pydicom as dicom
import os

# Code to Serialize to TFRecord

In [9]:
def _bytes_feature(value):
  """
  Wraps a single string / bytes value in a tf.train.Feature
  that carries a BytesList.
  """
  eager_tensor_type = type(tf.constant(0))

  # BytesList cannot unpack a string held in an EagerTensor, so pull
  # the underlying value out of the tensor first.
  payload = value.numpy() if isinstance(value, eager_tensor_type) else value

  bytes_list = tf.train.BytesList(value=[payload])
  return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
  """
  Wraps a single float / double in a tf.train.Feature
  that carries a FloatList.
  """
  float_list = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
  """
  Wraps a single bool / enum / int / uint in a tf.train.Feature
  that carries an Int64List.
  """
  int64_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int64_list)


def serialize_example(image, label):
  """
  Creates a serialized tf.Example holding an image and its dense label.

  Args:
    image: NumPy array; its first two dimensions are stored as
      "height" and "width".
    label: NumPy array with the same shape as `image`.

  Returns:
    The tf.train.Example serialized to a byte string. It contains the
    features "height" and "width" (int64) and "image" and "label"
    (raw array bytes).

  Raises:
    AssertionError: if `image` and `label` do not have the same shape.

  Note:
    The dtypes of the arrays are not stored, so a reader has to know
    them in order to decode the raw bytes.
  """

  assert image.shape == label.shape

  img_shape = image.shape

  height_feature = _int64_feature(img_shape[0])
  width_feature  = _int64_feature(img_shape[1])

  # tobytes() replaces the deprecated ndarray.tostring() alias (removed
  # in NumPy 2.0); both produce the same raw bytes.
  img_feature = _bytes_feature(image.tobytes())
  lab_feature = _bytes_feature(label.tobytes())

  feature_dict = {
      "height": height_feature,
      "width" : width_feature,
      "image" : img_feature,
      "label" : lab_feature
  }

  wrapped_features = tf.train.Features(feature=feature_dict)
  example = tf.train.Example(features=wrapped_features)

  return example.SerializeToString()

def serialize_example_sparse(image, label):
  """
  Creates a serialized tf.Example with the label stored sparsely.

  Since most of the masks are filled with zeros, it is more efficient
  to store a sparse representation of the masks, i.e. store only the
  non-zero values and their corresponding positions in the matrix.

  Args:
    image: NumPy array; its first two dimensions are stored as
      "height" and "width".
    label: NumPy array with the same shape as `image`.

  Returns:
    The tf.train.Example serialized to a byte string. It contains the
    features "height", "width" and "lab_n_vals" (int64) and "image",
    "lab_indices" and "lab_values" (raw array bytes).

  Raises:
    AssertionError: if `image` and `label` do not have the same shape.

  Note:
    The dtypes of the arrays are not stored, so a reader has to know
    them in order to decode the raw bytes. The index dtype is whatever
    np.argwhere returns (a platform-dependent integer type -- confirm
    when writing the decoder).
  """

  assert image.shape == label.shape

  img_shape = image.shape

  # height and width are integer values
  height_feature = _int64_feature(img_shape[0])
  width_feature  = _int64_feature(img_shape[1])

  # Matrices are stored as a sequence of bytes. tobytes() replaces the
  # deprecated ndarray.tostring() alias (removed in NumPy 2.0); both
  # produce the same raw bytes.
  img_feature    = _bytes_feature(image.tobytes())

  # Store a sparse representation of the label/mask.
  # First, the locations: one row of indices per non-zero element.
  nonzero        = label != 0
  indices        = np.argwhere(nonzero)
  lab_inds       = _bytes_feature(indices.tobytes())

  # Then, the actual values themselves.
  #
  # If we assume all values are just 1s, then we might
  # be able to avoid storing the values and save even
  # more, but for now keep it simple.
  values         = label[nonzero]
  lab_vals       = _bytes_feature(values.tobytes())

  # Store the number of non-zero values so that the label can be
  # re-created in dense format.
  lab_n_vals     = _int64_feature(values.shape[0])

  feature_dict = {
      "height"      : height_feature,
      "width"       : width_feature,
      "image"       : img_feature,
      "lab_indices" : lab_inds,
      "lab_values"  : lab_vals,
      "lab_n_vals"  : lab_n_vals
  }

  wrapped_features = tf.train.Features(feature=feature_dict)
  example = tf.train.Example(features=wrapped_features)

  return example.SerializeToString()

# Reading raw DICOM data

In [7]:
def read_dicom(img_F, lab_F):
  """
  Reads a DICOM image and builds a mask from its ROI text file.

  Args:
    img_F: path of the DICOM file to read.
    lab_F: path of a ';'-delimited text file of integer ROI
      coordinates; its first two lines are skipped.

  Returns:
    A tuple (img, mask): the DICOM pixel array, and an array of the
    same shape and dtype that is 1 at every ROI position and 0
    elsewhere.
  """
  ds = dicom.dcmread(img_F)
  img = ds.pixel_array

  # ndmin=2 keeps the result 2-D even when the file holds a single
  # coordinate row; without it loadtxt returns a 1-D array and the
  # column slicing below raises an IndexError.
  roi_arr = np.loadtxt(lab_F,
                       delimiter=';',
                       skiprows=2,
                       dtype=np.int16,
                       ndmin=2)

  mask = np.zeros(img.shape, dtype=img.dtype)

  # Column 1 of the ROI file indexes the first axis of the mask and
  # column 0 the second. The coordinates are halved before use --
  # presumably the ROIs are given at twice the image resolution;
  # TODO confirm against the tool that exported them.
  row_idx = roi_arr[:, 1] // 2
  col_idx = roi_arr[:, 0] // 2

  mask[row_idx, col_idx] = 1

  return img, mask

# Plotting images with labels

In [8]:
def plot_img_with_label(image, label):
  """
  Draws two panels side by side: the image on its own, and the image
  with the non-zero part of the label drawn on top of it.
  """
  # Mask out the zero entries of the label so that only its non-zero
  # component is drawn over the image.
  overlay = np.ma.masked_where(label == 0, label)

  fig, axes = plt.subplots(ncols=2, dpi=100)
  plain_ax, overlay_ax = axes

  # Left panel: the image alone.
  plain_ax.imshow(image, cmap='gray')
  plain_ax.set_title("Base image")
  plain_ax.axis("off")

  # Right panel: the image first, then the masked label on top.
  overlay_ax.imshow(image, cmap='gray')
  overlay_ax.imshow(overlay, cmap='jet')
  overlay_ax.set_title("Base image with label")
  overlay_ax.axis("off")

# Create example IDs

In [13]:
# Print the kernel's current working directory (shell command).
!pwd

/home/jovyan/work/Segment/Segment MRI


In [18]:
def create_example_ids(data_dir):
    """
    Returns one id per entry of `data_dir`: the entry's name with its
    final extension stripped. The order is that of os.listdir.
    """
    return [os.path.splitext(entry)[0] for entry in os.listdir(data_dir)]

# Dataset locations, all rooted at the project directory.
DATA_BASE_DIR = "/home/jovyan/work/Segment/Segment MRI/"
DICOM_DIR = os.path.join(DATA_BASE_DIR, "DICOM")      # DICOM images
ROI_DIR   = os.path.join(DATA_BASE_DIR, "Data/ROIs")  # ROI text files
TFR_DIR   = os.path.join(DATA_BASE_DIR, "TFR_Data")   # TFRecord output

# One id per entry found in the DICOM directory.
example_ids = create_example_ids(DICOM_DIR)

In [19]:
# Sanity check: number of ids collected from the DICOM directory.
print(len(example_ids))

5113


# Function to create the TFRecords

In [17]:
def create_TFRecords_for_2D(data_dir, label_dir, dest_dir, example_ids):
    """
    Writes one TFRecord file per example that has an ROI file.

    For every id in `example_ids` the image is read from
    `<data_dir>/<id>.dcm` and the ROI from `<label_dir>/<id>.txt`; the
    sparsely serialized example is written to `<dest_dir>/<id>.tfr`.
    Ids without an ROI file are skipped.

    Args:
        data_dir: directory holding the .dcm files.
        label_dir: directory holding the ROI .txt files.
        dest_dir: directory the .tfr files are written to; it is
            created if it does not exist yet.
        example_ids: iterable of example ids (file names without
            extension).
    """
    # The writer does not create missing parent directories.
    os.makedirs(dest_dir, exist_ok=True)

    for example_id in example_ids:
        dcm_F = data_dir  + "/" + example_id + ".dcm"
        roi_F = label_dir + "/" + example_id + ".txt"

        tfrecord_F = dest_dir + "/" + example_id + ".tfr"

        # Create TFRecords only for examples with valid ROIs
        if not os.path.exists(roi_F):
            continue

        # Read and serialize before opening the writer, so that a failure
        # while loading the example cannot leave an empty .tfr file behind.
        image, label = read_dicom(dcm_F, roi_F)
        serialized_example = serialize_example_sparse(image, label)

        with tf.io.TFRecordWriter(tfrecord_F) as writer:
            writer.write(serialized_example)
    

In [20]:
# Dataset locations, all rooted at the project directory.
DATA_BASE_DIR = "/home/jovyan/work/Segment/Segment MRI/"
DICOM_DIR = os.path.join(DATA_BASE_DIR, "DICOM")      # DICOM images
ROI_DIR   = os.path.join(DATA_BASE_DIR, "Data/ROIs")  # ROI text files
TFR_DIR   = os.path.join(DATA_BASE_DIR, "TFR_Data")   # TFRecord output

# Convert every example that has an ROI file into a TFRecord file.
create_TFRecords_for_2D(DICOM_DIR, ROI_DIR, TFR_DIR, example_ids)

In [22]:
# Benchmark the full conversion. %timeit repeats the call (the recorded
# output reports 7 runs of 1 loop each), and every run rewrites the
# .tfr files.
%timeit create_TFRecords_for_2D(DICOM_DIR, ROI_DIR, TFR_DIR, example_ids)

1min 16s ± 1.05 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [23]:
# Benchmark the conversion of a single example id.
test_ids = ["1470_20133_00009"]
%timeit create_TFRecords_for_2D(DICOM_DIR, ROI_DIR, TFR_DIR, test_ids)

34.4 ms ± 3.22 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [24]:
import time

# Time one full conversion run, in seconds. perf_counter() is a monotonic,
# high-resolution clock intended for measuring elapsed intervals, whereas
# time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
create_TFRecords_for_2D(DICOM_DIR, ROI_DIR, TFR_DIR, example_ids)
end = time.perf_counter()
print(end - start)

67.99456453323364
