# RSNA_Pneumonia - JSON boxes

* Prepare a JSON training set containing the bounding boxes and image information for each picture

## Imports

https://pydicom.github.io/pydicom/dev/getting_started.html

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import uuid
import pydicom
import glob, pylab
import pydicom
import matplotlib.pyplot as plt
%matplotlib inline
import math
import seaborn as sns
import scipy
# add alpha (transparency) to a colormap
import matplotlib.colors 
import matplotlib.cm 
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.image as mpimg 
import numpy.random as random 
import seaborn as sns; sns.set()
from matplotlib.pyplot import show 

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
from os.path import join
from pathlib import Path
print(os.listdir("data"))
print ()
# Any results you write to the current directory are saved as output.

['train_images', 'Features', 'test_images', 'stage_1_detailed_class_info.csv', 'dcm_data.csv', 'metadata.csv', 'stage_1_train_images', 'stage_1_sample_submission.csv', 'stage_1_train_labels.csv', 'stage_1_test_images']



## Constants & Functions

Constants

In [3]:
# Root folder of the competition data, relative to the notebook's working directory
# (the same folder the imports cell lists with os.listdir("data")).
PATH = 'data'
# Folder with the training DICOM files. Derived from PATH instead of an absolute
# location under one user's home directory, so the notebook runs from any checkout.
path_pics = join(PATH, 'stage_1_train_images')

# Sanity check: show the first few DICOM file names.
print(os.listdir(path_pics)[:10])

['93e0aaa9-1c46-4041-a101-6b800a0df0ac.dcm', '87effd15-e357-4a0d-9874-634f9e94bb20.dcm', 'e43f52c1-f08f-43dd-a315-7e52646bd424.dcm', 'dac8c857-7604-4b20-8e61-da616e78e75e.dcm', '811411e8-31da-4df5-9244-dd888c6af020.dcm', 'fcc70304-c7bf-47ce-85cc-0754706b1a91.dcm', '68966e26-b5c0-42f7-92de-a2ccfc112740.dcm', '40a9e10b-f872-4c72-a482-7187e2dca6ed.dcm', '88c25715-03f5-474e-9cee-5ad1f7beb4ce.dcm', '9980df74-1e49-4293-8a42-3182d9258de6.dcm']


Functions

In [4]:
class Box():
    """
    Bounding box stored as a corner (y, x) plus a size (h, w).

    The four values are kept exactly as passed in (no conversion or validation).
    ``flip`` treats ``x`` as the left edge of the box.
    """

    def __init__(self, y, x, h, w):
        self.y, self.x, self.h, self.w = y, x, h, w

    def __repr__(self):
        return 'Box(y={0!r}, x={1!r}, h={2!r}, w={3!r})'.format(self.y, self.x, self.h, self.w)

    def flip(self, image_width=1024):
        """
        Return a new Box mirrored horizontally inside an image ``image_width`` pixels wide.

        The default of 1024 keeps the behaviour of the previous hard-coded value.
        ``y``, ``h`` and ``w`` are unchanged; the original box is not modified.
        """
        return Box(self.y, image_width - self.x - self.w, self.h, self.w)

def row2string(row):
    """
    Serialise one label row as the string 'y_x_height_width'.

    Returns an empty string when ``row`` is None or when its 'y' entry is NaN.
    """
    # Guard clause: nothing to serialise for a missing row or a NaN 'y'.
    if row is None or math.isnan(row['y']):
        return ''

    fields = ('y', 'x', 'height', 'width')
    return '{0}_{1}_{2}_{3}'.format(*(row[name] for name in fields))
    
def string2boxes(s):
    """
    Parse a '#'-separated string of 'y_x_height_width' items into Box objects.

    Parameters
    ----------
    s : str
        Box string, e.g. '#152.0_264.0_379.0_213.0#10_20_30_40'.

    Returns
    -------
    None when ``s`` is '' or '#'; otherwise a list of Box (possibly empty).
    Items that do not provide four numeric fields (including the empty items
    produced by leading or trailing '#') are skipped.

    Raises
    ------
    ValueError
        If ``s`` is None.
    """
    if s is None:
        raise ValueError('string2boxes() needs a string, got None')
    if s == '' or s == '#':
        return None

    boxes = []
    for item in s.split('#'):
        try:
            # Only the first four fields are used; extra fields are ignored.
            y, x, h, w = [float(value) for value in item.split('_')[:4]]
        except ValueError:
            # Non-numeric field or fewer than four fields: not a box, skip it.
            continue
        boxes.append(Box(y, x, h, w))

    return boxes

def draw(data, auto_flip=True, write_info=False):
    """
    Draw a single patient's image with its bounding box(es), if present.

    Parameters
    ----------
    data : mapping (for example a DataFrame row)
        Must provide 'patientId' and 'boxes'; 'View Position' is read when
        ``auto_flip`` is True.
    auto_flip : bool
        When True, images whose 'View Position' is 'AP' are mirrored
        horizontally, together with their boxes, before drawing.
    write_info : bool
        When True, print the DICOM dataset.
    """
    fullpath_pic = join(path_pics, data['patientId'] + '.dcm')

    # --- Open DICOM file (dcmread replaces the deprecated read_file)
    d = pydicom.dcmread(fullpath_pic)
    im = d.pixel_array

    # --- Convert from single-channel grayscale to 3-channel RGB
    im = np.stack([im] * 3, axis=2)

    # string2boxes returns None for patients without boxes; use an empty list
    # so the flip and overlay steps below need no special case.
    boxes = string2boxes(data['boxes']) or []

    if write_info:
        print(d)

    if auto_flip and data['View Position'] == 'AP':
        im = np.flip(im, 1)
        boxes = [box.flip() for box in boxes]

    # --- Outline every box in red
    for box in boxes:
        im = overlay_box(im=im, box=box, rgb=[255, 0, 0], stroke=6)

    plt.figure(figsize=(10,10))
    plt.imshow(im, cmap=pylab.cm.gist_gray)

def overlay_box(im, box, rgb, stroke=1):
    """
    Paint the outline of ``box`` onto ``im`` in place and return ``im``.

    The box position and size are truncated with int(). The top and left strokes
    start on the box corner; the bottom and right strokes start at y + h and
    x + w, so the stroke-sized square at the bottom-right corner is not painted.
    """
    top, left, box_height, box_width = int(box.y), int(box.x), int(box.h), int(box.w)
    bottom = top + box_height
    right = left + box_width

    # One (rows, columns) slice pair per side of the rectangle.
    sides = (
        (slice(top, top + stroke), slice(left, right)),        # top
        (slice(bottom, bottom + stroke), slice(left, right)),  # bottom
        (slice(top, bottom), slice(left, left + stroke)),      # left
        (slice(top, bottom), slice(right, right + stroke)),    # right
    )
    for rows, cols in sides:
        im[rows, cols] = rgb

    return im
import scipy.misc

def save_png(data):
    """
    Save a patient's DICOM image as '<patientId>.png' next to the DICOM files.

    Parameters
    ----------
    data : mapping (for example a DataFrame row)
        Must provide 'patientId'.

    Notes
    -----
    scipy.misc.toimage is no longer available in current SciPy releases, and the
    previous target path had no extension from which an image format could be
    inferred. The image is now written with matplotlib, using an explicit
    '.png' file name.
    """
    patient_id = data['patientId']
    print(patient_id)
    fullpath_pic = join(path_pics, patient_id + '.dcm')

    # --- Open DICOM file (dcmread replaces the deprecated read_file)
    d = pydicom.dcmread(fullpath_pic)
    im = d.pixel_array

    # --- Convert from single-channel grayscale to 3-channel RGB
    im = np.stack([im] * 3, axis=2)

    # Same value range as the former toimage(cmin=0.0, cmax=255) call:
    # clip to 0..255 and store as 8-bit so imsave writes the pixels unchanged.
    im = np.clip(im, 0, 255).astype(np.uint8)

    plt.imsave(join(path_pics, patient_id + '.png'), im)


def PDFonImage(file_name, matrix):
    """
    Show an annotated, half-transparent heatmap of ``matrix`` on top of a DICOM image.

    Parameters
    ----------
    file_name : str
        Name of the DICOM file inside ``path_pics``.
    matrix : 2-D array with a ``shape`` attribute
        Values to draw; every cell is annotated with two decimals.

    Notes
    -----
    The image is mirrored horizontally when its ViewPosition is 'AP'.
    Tick labels are computed as int((i + 1) / dw).
    NOTE(review): ``dw`` is read from the notebook's global namespace and is not
    defined in the visible part of the notebook — confirm it is set before calling.
    """
    fullpath_pic = join(path_pics, file_name)

    # Read the picture as an array so it can be drawn under the heatmap.
    dcm_data = pydicom.dcmread(fullpath_pic, force=True)
    im = dcm_data.pixel_array

    # --- Convert from single-channel grayscale to 3-channel RGB
    im = np.stack([im] * 3, axis=2)

    if dcm_data.ViewPosition == 'AP':
        im = np.flip(im, 1)

    n_rows, n_cols = matrix.shape[0], matrix.shape[1]

    # Draw the heatmap.
    plt.figure(figsize=(13,13))
    hmax = sns.heatmap(matrix,
                alpha = 0.5, # whole heatmap is translucent
                annot = True,
                zorder = 2,
                square=True,
                fmt='.2f',
                annot_kws={"size":8},
                # one label per column / per row of the matrix
                xticklabels=[int((i+1) / dw) for i in range(n_cols)],
                yticklabels=[int((i+1) / dw) for i in range(n_rows)]
                )

    # heatmap uses pcolormesh instead of imshow, so extent cannot be passed
    # through as a kwarg to match the heatmap to the picture. Instead, match
    # the picture to the heatmap:
    hmax.imshow(im,
              aspect = hmax.get_aspect(),
              extent = hmax.get_xlim() + hmax.get_ylim(),
              zorder = 1) # put the picture under the heatmap

    show()

In [5]:
# Box labels; columns are patientId, x, y, width, height and Target (see .info() below).
train_label = pd.read_csv(join(PATH,'stage_1_train_labels.csv'))
# Class information; merged with the labels on 'patientId' in a later cell.
detailed_class = pd.read_csv(join(PATH,'stage_1_detailed_class_info.csv'))

In [6]:
train_label.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28989 entries, 0 to 28988
Data columns (total 6 columns):
patientId    28989 non-null object
x            8964 non-null float64
y            8964 non-null float64
width        8964 non-null float64
height       8964 non-null float64
Target       28989 non-null int64
dtypes: float64(4), int64(1), object(1)
memory usage: 1.3+ MB


Build complete DataFrame
* Combine all the boxes belonging to a patient into a single string
* Add each picture's properties (age, sex, view position) to df

In [7]:
# Attach the class information to the label rows (pd.merge defaults to an inner join).
# NOTE(review): if a patientId occurs several times in both frames, the merge
# produces every pairing of those rows — confirm this row multiplication is intended.
df = pd.merge(train_label, detailed_class, on='patientId')
# One-hot encode 'class' and append the indicator columns; the original column is kept.
df = pd.concat([df, pd.get_dummies(df['class'])], axis=1)

Execute the following code if generating the db

In [8]:
# Image information per row (Age, Sex, View Position, boxes, patientId);
# the first CSV column is used as the index.
name_pic = pd.read_csv(join(PATH,'dcm_data.csv'), index_col=0)
# Store Age as an integer column.
name_pic['Age'] = name_pic['Age'].astype(int)
name_pic.head()

Unnamed: 0,Age,Sex,View Position,boxes,patientId
0,51,F,PA,#,0004cfab-14fd-4e49-80ba-63a80b6bddd6
1,48,F,PA,#,00313ee0-9eaa-42f4-b0ab-c148ed3241cd
2,19,M,AP,#,00322d4d-1c29-4943-afc9-b6754be640eb
3,28,M,PA,#,003d8fa0-6bf1-40ed-b54c-ac657f8495c5
4,32,F,AP,,00436515-870c-4b36-a041-de91049b9ab4


In [9]:
# Column-wise concat: rows are paired by index label, not by patientId.
# NOTE(review): this assumes dcm_data.csv was written with the same row index
# as df — confirm before regenerating either of them.
# NOTE(review): the cell reassigns df, so it cannot be run twice on the same
# kernel (the second run no longer finds 'x', 'y', 'width', 'height' to drop).
df = pd.concat([df, name_pic.drop('patientId', axis=1)], axis=1)
# The numeric box columns are removed; the 'boxes' string column is kept.
df = df.drop(['x', 'y', 'width', 'height'], axis=1)
# Keep only the rows that have a 'boxes' value.
df = df.dropna(axis=0, subset=['boxes'])
df.head()

Unnamed: 0,patientId,Target,class,Lung Opacity,No Lung Opacity / Not Normal,Normal,Age,Sex,View Position,boxes
0,0004cfab-14fd-4e49-80ba-63a80b6bddd6,0,No Lung Opacity / Not Normal,0,1,0,51,F,PA,#
1,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,0,No Lung Opacity / Not Normal,0,1,0,48,F,PA,#
2,00322d4d-1c29-4943-afc9-b6754be640eb,0,No Lung Opacity / Not Normal,0,1,0,19,M,AP,#
3,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,0,Normal,0,0,1,28,M,PA,#
7,00436515-870c-4b36-a041-de91049b9ab4,1,Lung Opacity,1,0,0,32,F,AP,#152.0_264.0_379.0_213.0#152.0_264.0_379.0_213...


In [10]:
!ls

CheckEnvironment.ipynb	    data_preprocessing	Pneumonia_Training.ipynb
config.py		    kernel.ipynb	__pycache__
create_patient_JSONs.ipynb  kernel_LG.ipynb	README.md
data			    models		req.txt


In [11]:
import hashlib
import os
import json

def md5sum(pathname, blocksize=65536):
    """
    Return the hexadecimal MD5 digest of the file at ``pathname``.

    The file is opened in binary mode and read ``blocksize`` bytes at a time.
    """
    digest = hashlib.md5()

    with open(pathname, "rb") as handle:
        while True:
            chunk = handle.read(blocksize)
            if chunk == b"":
                # End of file reached.
                break
            digest.update(chunk)

    return digest.hexdigest()




def createPatientJSON(directory=r"data/train_images/positive",
                      output_dir=r"/home/claudio/Documents",
                      name='train_positive'):
    """
    Write a JSON file describing every image in ``directory`` and its boxes.

    For each file the entry holds the MD5 checksum, the path, a fixed
    1024 x 1024 x 3 shape and one object (category "sick") per bounding box
    found in the 'boxes' string of the matching row of the global ``df``.

    Parameters
    ----------
    directory : str
        Folder with the images; the part of each file name before the first
        '.' is looked up in df['patientId'].
    output_dir : str
        Folder that receives the JSON file. The default is the original
        author's absolute path — pass your own folder on another machine.
    name : str
        Base name of the JSON file ('<name>.json').

    Notes
    -----
    "r" holds row coordinates (y, y + height) and "c" holds column coordinates
    (x, x + width). All four values are shifted by -1.
    NOTE(review): the -1 presumably converts 1-based to 0-based coordinates — confirm.
    Files with no matching row in df are reported and skipped; images without
    boxes get an empty "objects" list.
    """
    r, c = 1024, 1024

    dictionaries = []

    # Sorted so the order of the entries does not depend on the file system.
    for i_name in sorted(os.listdir(directory)):
        pathname = os.path.join(directory, i_name)

        # Find the patient this image belongs to.
        patient_id = i_name.split('.')[0]
        matches = df[df['patientId'].str.find(patient_id) >= 0] if patient_id else df.iloc[0:0]
        if matches.empty:
            print('No row in df for image {}; skipped'.format(i_name))
            continue

        # string2boxes returns None when the patient has no boxes.
        boxes = string2boxes(matches.iloc[0]['boxes']) or []

        objects = []
        for box in boxes:
            # Rows run along y / height, columns along x / width.
            minimum_r, maximum_r = box.y, box.y + box.h
            minimum_c, maximum_c = box.x, box.x + box.w

            objects.append({
                "bounding_box": {
                    "minimum": {
                        "r": minimum_r - 1,
                        "c": minimum_c - 1
                    },
                    "maximum": {
                        "r": maximum_r - 1,
                        "c": maximum_c - 1
                    }
                },
                "category": "sick"
            })

        dictionaries.append({
            "image": {
                "checksum": md5sum(pathname),
                "pathname": pathname,
                "shape": {
                    "r": r,
                    "c": c,
                    "channels": 3
                }
            },
            "objects": objects
        })

    filename = "{}.json".format(name)

    with open(os.path.join(output_dir, filename), "w") as stream:
        json.dump(dictionaries, stream)


In [12]:
# Generate train_positive.json from the images in data/train_images/positive.
createPatientJSON()

In [33]:
!ls

CheckEnvironment.ipynb	    data_preprocessing	Pneumonia_Training.ipynb
config.py		    kernel.ipynb	__pycache__
create_patient_JSONs.ipynb  kernel_LG.ipynb	README.md
data			    models		req.txt
