In [54]:
import cv2
import numpy as np
import pydicom
import json
import os
import shutil
import sys
import random
from matplotlib import image
from scipy.ndimage import label
from zipfile import ZipFile
import re

In [None]:
# Set this variable to the directory where train.zip, validate.zip and test.zip are stored.
data_path = '.'

In [55]:
# Open train.zip (located in data_path) and unpack it.
with ZipFile(os.path.join(data_path, "train.zip"), 'r') as zipObj:
    # Extract into data_path itself: the following cells look for
    # <data_path>/train and <data_path>/train.csv, so extracting into the
    # current working directory (the extractall() default) only works when
    # data_path happens to be '.'.
    zipObj.extractall(data_path)

FileNotFoundError: [Errno 2] No such file or directory: 'train.zip'

In [52]:
# Names of the immediate sub-directories of <data_path>/train.
study_train = next(os.walk(os.path.join(data_path, "train")))[1]

# Read the labels from train.csv, skipping its header row.
#   column 0   : id
#   columns 1-2: the volume values
labels = np.loadtxt(
    os.path.join(data_path, "train.csv"),
    delimiter=",",
    skiprows=1,
)
labels[:10]

array([[  1. , 108.3, 246.7],
       [  2. ,  54.6, 137.2],
       [  3. ,  32.7,  99.3],
       [  4. ,  57.7, 154.5],
       [  5. ,  83.3, 235.5],
       [  6. , 225.3, 317.9],
       [  7. ,  64.9, 138. ],
       [  8. , 158.3, 305.5],
       [  9. ,  61.4, 152.2],
       [ 10. , 105.2, 219.3]])

In [30]:
class Dataset(object):
    """A directory of ``sax_<n>`` DICOM series folders, loaded on demand.

    Constructing a Dataset only scans the directory tree; no DICOM file is
    read until ``load()`` is called.

    Attributes set by ``__init__``:
        directory:  directory that directly contains the ``sax_<n>`` folders.
        name:       the ``subdir`` argument, stored unchanged.
        slices:     sorted slice numbers ``n`` taken from the ``sax_<n>``
                    folder names.
        time:       sorted frame numbers (second numeric field of the
                    ``IM-<offset>-<frame>.dcm`` file names) found in the
                    first slice folder that was scanned.
        slices_map: slice number -> first numeric field of that slice's
                    file names (``None`` if the folder has no matching file).

    Attributes set by ``load()``:
        images:          array built from the ``pixel_array`` of every file,
                         nested as ``[slice][time]``.
        dist:            distance between slices, see
                         ``_read_all_dicom_images``.
        area_multiplier: product of the two ``PixelSpacing`` values.

    ``Dataset.dataset_count`` counts how many instances have been created.
    """

    dataset_count = 0

    # Raw strings avoid the invalid "\d" escape-sequence warning. The patterns
    # are applied with fullmatch() so that names such as "sax_5_old" or
    # "IM-0001-0001.dcm.bak" are not mistaken for slices / frames.
    _SLICE_DIR_RE = re.compile(r"sax_(\d+)")
    _DICOM_FILE_RE = re.compile(r"IM-(\d{4,})-(\d{4})\.dcm")

    def __init__(self, directory, subdir):
        """Scan ``directory`` for slice folders and DICOM file names.

        Args:
            directory: root folder of the dataset; wrapper folders that
                contain exactly one sub-directory are descended through.
            subdir: label stored as ``self.name``.

        Raises:
            StopIteration: if ``directory`` (or a slice folder) cannot be
                walked, because ``next(os.walk(...))`` has nothing to yield.
        """
        # Deal with any intervening directories. A lone "sax_<n>" folder is
        # a slice, not a wrapper, so a single-slice dataset must not be
        # descended into.
        while True:
            subdirs = next(os.walk(directory))[1]
            is_wrapper = (len(subdirs) == 1
                          and self._SLICE_DIR_RE.fullmatch(subdirs[0]) is None)
            if not is_wrapper:
                break
            directory = os.path.join(directory, subdirs[0])

        slices = []
        for s in subdirs:
            m = self._SLICE_DIR_RE.fullmatch(s)
            if m is not None:
                slices.append(int(m.group(1)))

        slices_map = {}
        first = True
        times = []
        for s in slices:
            files = next(os.walk(os.path.join(directory, "sax_%d" % s)))[2]
            offset = None

            for f in files:
                m = self._DICOM_FILE_RE.fullmatch(f)
                if m is not None:
                    # Frame numbers are collected from the first scanned
                    # slice only; load() reads those same frames for every slice.
                    if first:
                        times.append(int(m.group(2)))
                    # The first matching file determines the slice's offset.
                    if offset is None:
                        offset = int(m.group(1))

            first = False
            slices_map[s] = offset

        self.directory = directory
        self.time = sorted(times)
        self.slices = sorted(slices)
        self.slices_map = slices_map
        Dataset.dataset_count += 1
        self.name = subdir

    def _filename(self, s, t):
        """Return the path of the DICOM file for slice ``s`` at frame ``t``."""
        return os.path.join(self.directory, "sax_%d" % s,
                            "IM-%04d-%04d.dcm" % (self.slices_map[s], t))

    def _read_dicom_image(self, filename):
        """Read ``filename`` and return its pixel data as a numpy array."""
        # dcmread is the current name of the deprecated read_file alias.
        d = pydicom.dcmread(filename)
        img = d.pixel_array
        return np.array(img)

    def _read_all_dicom_images(self):
        """Read every slice/frame and set ``images``, ``dist``, ``area_multiplier``.

        ``dist`` is, in order of preference: the absolute difference of
        ``SliceLocation`` between the first two slices, the first file's
        ``SliceThickness``, or the constant 8.

        Raises:
            IndexError: if the dataset has no slices or no frames.
        """
        f1 = self._filename(self.slices[0], self.time[0])
        d1 = pydicom.dcmread(f1)
        (x, y) = d1.PixelSpacing
        (x, y) = (float(x), float(y))

        # Try a couple of things to measure the distance between slices.
        dist = None
        if len(self.slices) > 1:
            # Only possible with at least two slices; a single-slice dataset
            # goes straight to the SliceThickness fallback.
            f2 = self._filename(self.slices[1], self.time[0])
            d2 = pydicom.dcmread(f2)
            try:
                dist = np.abs(d2.SliceLocation - d1.SliceLocation)
            except AttributeError:
                dist = None
        if dist is None:
            try:
                dist = d1.SliceThickness
            except AttributeError:
                dist = 8  # better than nothing...

        self.images = np.array([[self._read_dicom_image(self._filename(d, i))
                                 for i in self.time]
                                for d in self.slices])
        self.dist = dist
        self.area_multiplier = x * y

    def load(self):
        """Read all DICOM files of this dataset into memory."""
        self._read_all_dicom_images()

In [46]:
# Build a Dataset for each of the first ten training directories and load it.
# A failure inside load() is reported and the loop moves on to the next one.
dset = []
for study in study_train[:10]:
    current = Dataset(os.path.join(data_path, "train", study), study)
    dset.append(current)
    print("Processing dataset %s..." % current.name)
    p_edv = 0
    p_esv = 0
    try:
        current.load()
        print("Dataset %s processing done." % current.name)
    except Exception as e:
        print("ERROR: Exception %s thrown by dataset %s" % (str(e), current.name))

Processing dataset 135...
Dataset 135 processing done.
Processing dataset 307...
Dataset 307 processing done.
Processing dataset 61...
Dataset 61 processing done.
Processing dataset 95...
Dataset 95 processing done.
Processing dataset 338...
Dataset 338 processing done.
Processing dataset 300...
Dataset 300 processing done.
Processing dataset 132...
Dataset 132 processing done.
Processing dataset 59...
Dataset 59 processing done.
Processing dataset 92...
Dataset 92 processing done.
Processing dataset 66...
Dataset 66 processing done.


In [62]:
# Example: inspecting a loaded dataset.
# Each dset[i] exposes several attributes; `images` is set by load() and is nested as [slice][time].

dset[0].images

array([[[[ 0,  0,  0, ...,  0,  0,  0],
         [ 0,  0,  0, ...,  0,  0,  0],
         [ 0, 75, 54, ...,  0,  0,  0],
         ...,
         [ 0,  5,  4, ...,  0,  0,  0],
         [ 0,  4,  5, ...,  0,  0,  0],
         [ 0,  3,  4, ...,  0,  0,  0]],

        [[ 0,  0,  0, ...,  0,  0,  0],
         [ 0,  0,  0, ...,  0,  0,  0],
         [ 0, 74, 56, ...,  0,  0,  0],
         ...,
         [ 0,  5,  5, ...,  0,  0,  0],
         [ 0,  6,  6, ...,  0,  0,  0],
         [ 0,  4,  6, ...,  0,  0,  0]],

        [[ 0,  0,  0, ...,  0,  0,  0],
         [ 0,  0,  0, ...,  0,  0,  0],
         [ 0, 73, 53, ...,  0,  0,  0],
         ...,
         [ 0,  5,  4, ...,  0,  0,  0],
         [ 0,  5,  7, ...,  0,  0,  0],
         [ 0,  6,  6, ...,  0,  0,  0]],

        ...,

        [[ 0,  0,  0, ...,  0,  0,  0],
         [ 0,  0,  0, ...,  0,  0,  0],
         [ 0, 71, 52, ...,  0,  0,  0],
         ...,
         [ 0,  6,  5, ...,  0,  0,  0],
         [ 0,  5,  6, ...,  0,  0,  0],
    

In [67]:
# For example, you can use the following value as a feature of your model.
# area_multiplier is the product of the two PixelSpacing values of the first slice's first frame.
dset[0].area_multiplier

1.9775390625