In [45]:
import pandas as pd
import numpy as np
from PIL import Image
import pydicom
import pylab
import os
from sklearn.model_selection import train_test_split
import json

In [46]:
def parse_data(df, image_dir='../input/stage_1_train_images'):
    """
    Parse the training-label table into a per-patient nested dictionary.

    The returned dictionary has one entry per distinct ``patientId``, in
    order of first appearance:

      parsed = {

        'patientId-00': {
            'dicom': path/to/dicom/file,
            'label': either 0 or 1 for normal or pneumonia,
            'boxes': list of box(es)
        },
        'patientId-01': {
            'dicom': path/to/dicom/file,
            'label': either 0 or 1 for normal or pneumonia,
            'boxes': list of box(es)
        }, ...

      }

    Parameters
    ----------
    df : pandas.DataFrame
        Label table with the columns ``patientId``, ``x``, ``y``,
        ``width``, ``height`` and ``Target``.
    image_dir : str, optional
        Directory that holds the ``<patientId>.dcm`` files. It is joined
        to the file name with a forward slash. Defaults to the directory
        that was previously hard-coded.

    Returns
    -------
    dict
        Mapping ``patientId -> {'dicom', 'label', 'boxes'}``. ``label`` is
        the ``Target`` value of the first row seen for that patient, and
        each box is a list ``[y, x, height, width]``.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``df``.
    """
    # Select only the needed columns, already in the order they are unpacked.
    columns = ['patientId', 'Target', 'y', 'x', 'height', 'width']

    parsed = {}
    # itertuples avoids building a pandas Series for every row (iterrows does).
    for pid, target, y, x, height, width in df[columns].itertuples(index=False, name=None):
        # --- Initialize patient entry into parsed
        entry = parsed.get(pid)
        if entry is None:
            entry = parsed[pid] = {
                'dicom': '%s/%s.dcm' % (image_dir, pid),
                'label': target,
                'boxes': []}

        # --- Add box if opacity is present (decided by the stored label,
        #     i.e. the Target of the patient's first row)
        if entry['label'] == 1:
            entry['boxes'].append([y, x, height, width])

    return parsed

In [47]:
class rsna:
    """
    Loaders for the RSNA pneumonia training data.

    Both functions are static: call them on the class itself, e.g.
    ``rsna.parse_data()``. No instance is required.
    """

    @staticmethod
    def parse_data(cache_path="../input/parsed_data.npy",
                   labels_csv="../input/stage_1_train_labels.csv"):
        """
        Return the per-patient dictionary, including pixel data.

        The cached copy at ``cache_path`` is tried first. If it is missing
        or unreadable, the dictionary is rebuilt from ``labels_csv`` and the
        DICOM files it points to, then written back to ``cache_path``.

        Parameters
        ----------
        cache_path : str, optional
            ``.npy`` file used as cache. Keep the ``.npy`` suffix: ``np.save``
            appends it when absent, and the next load would then miss the file.
        labels_csv : str, optional
            CSV file with the training labels, used only on a cache miss.

        Returns
        -------
        dict
            The mapping produced by the module-level ``parse_data(df)``,
            with an extra ``'img'`` key per patient holding the DICOM
            ``pixel_array``.
        """
        try:
            # np.save stores a dict as a pickled 0-d object array, so pickles
            # must be allowed and .item() unwraps the array back into the dict.
            # NOTE: unpickling can execute arbitrary code -- only point
            # cache_path at files written by this notebook.
            return np.load(cache_path, allow_pickle=True).item()
        except (OSError, ValueError, EOFError):
            # Cache missing or unreadable: fall through and rebuild it.
            pass

        # Inside a method the bare name resolves to the module-level
        # parse_data(df), not to this static method.
        dict_ = parse_data(pd.read_csv(labels_csv))

        # --- Open each DICOM file once and keep its pixel data
        for entry in dict_.values():
            entry["img"] = pydicom.dcmread(entry["dicom"]).pixel_array

        np.save(cache_path, dict_)
        return dict_

    @staticmethod
    def load_data_classifier(test_size=0.20, random_state=100):
        """
        Build a train/test split of images and labels for classification.

        Parameters
        ----------
        test_size : float, optional
            Forwarded to ``train_test_split``.
        random_state : int, optional
            Forwarded to ``train_test_split``.

        Returns
        -------
        tuple
            ``((x_train, y_train), (x_test, y_test))`` where the ``x`` lists
            hold each patient's ``'img'`` value and the ``y`` lists the
            matching ``'label'`` value.
        """
        # Must be qualified: a bare parse_data() would call the module-level
        # parse_data(df) without its required argument.
        records = rsna.parse_data()

        x = [record["img"] for record in records.values()]
        y = [record["label"] for record in records.values()]

        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=test_size, random_state=random_state)

        return (x_train, y_train), (x_test, y_test)

In [49]:
# Get the parsed patient dictionary (the saved .npy copy is tried first,
# otherwise it is rebuilt from the label CSV and DICOM files). As the last
# expression of the cell, the returned value is shown as the cell output.
rsna.parse_data()

{'0004cfab-14fd-4e49-80ba-63a80b6bddd6': {'dicom': '../input/stage_1_train_images/0004cfab-14fd-4e49-80ba-63a80b6bddd6.dcm',
  'label': 0,
  'boxes': [],
  'img': array([[144, 128, 114, ..., 176, 184, 117],
         [125, 109,  95, ..., 171, 178, 111],
         [104,  89,  75, ..., 166, 175, 108],
         ...,
         [ 16,  14,  11, ...,  19,  15,   3],
         [ 18,  16,  12, ...,  21,  17,   5],
         [ 19,  17,  13, ...,  23,  19,   6]], dtype=uint8)},
 '00313ee0-9eaa-42f4-b0ab-c148ed3241cd': {'dicom': '../input/stage_1_train_images/00313ee0-9eaa-42f4-b0ab-c148ed3241cd.dcm',
  'label': 0,
  'boxes': [],
  'img': array([[ 31,  24,  19, ..., 191, 183, 176],
         [ 30,  23,  17, ..., 127, 128, 128],
         [ 29,  22,  16, ...,  90,  95,  98],
         ...,
         [105, 105, 106, ..., 157, 156, 154],
         [104, 104, 104, ..., 155, 154, 152],
         [100, 101, 103, ..., 153, 153, 151]], dtype=uint8)},
 '00322d4d-1c29-4943-afc9-b6754be640eb': {'dicom': '../input/stage