In [1]:
import numpy as np
import pandas as pd
import os
import glob
import pydicom
import random
import tensorflow as tf
import keras
from keras.models import Sequential, Model 
from keras.layers import Dropout, Flatten, Dense, GlobalAveragePooling2D, Conv2D, BatchNormalization, LeakyReLU
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, TensorBoard, EarlyStopping
from keras.applications import VGG16
from keras.applications.vgg16 import preprocess_input
from keras.preprocessing.image import ImageDataGenerator
import cv2

from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

Using TensorFlow backend.


[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 3694714426197216653
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 6670351073
locality {
  bus_id: 1
  links {
  }
}
incarnation: 12986545243941183951
physical_device_desc: "device: 0, name: GeForce GTX 1080, pci bus id: 0000:01:00.0, compute capability: 6.1"
]


In [2]:
# create list of all DCM photos in both
train_dcm = glob.glob('E:\Projects\Pneumonia Project\Train_Images\*.dcm')

random.shuffle(train_dcm)

train_size = round(len(train_dcm)*0.8)

train_dcm = train_dcm[0:train_size]
validation_dcm = train_dcm[train_size:]
test_dcm = random.shuffle(glob.glob('E:\Projects\Pneumonia Project\Test_Images\*.dcm'))

# load and shuffle filenames
folder = 'E:\Projects\Pneumonia Project\Train_Images_JPG'
filenames = os.listdir(folder)
random.shuffle(filenames)
# split into train and validation filenames
n_valid_samples = 2560
train_filenames = filenames[n_valid_samples:]
valid_filenames = filenames[:n_valid_samples]
print('n train samples', len(train_filenames))
print('n valid samples', len(valid_filenames))
n_train_samples = len(filenames) - n_valid_samples

n train samples 0
n valid samples 2


In [3]:
def classify_file(filename,train):
    import shutil
    file = str(filename)
    os.chdir(r'E:\Projects\Pneumonia Project\Train_Images_JPG')
    if train:
        shutil.move(str(file),r'E:\Projects\Pneumonia Project\Train_Images_JPG\Train')
    else:
        shutil.move(str(file),r'E:\Projects\Pneumonia Project\Train_Images_JPG\Validation')
        
def dcm_converter(folder_in,folder_out, PNG=False):
    import pydicom as dicom
    import os
    import cv2
    import PIL # optional
    # make it True if you want in PNG format
    # Specify the .dcm folder path
    folder_path = str(folder_in)
    # Specify the output jpg/png folder path
    jpg_folder_path = str(folder_out)
    images_path = os.listdir(folder_path)
    for n, image in enumerate(images_path):
        ds = dicom.dcmread(os.path.join(folder_path, image))
        pixel_array_numpy = ds.pixel_array
        if PNG == False:
            image = image.replace('.dcm', '.jpg')
        else:
            image = image.replace('.dcm', '.png')
        cv2.imwrite(os.path.join(jpg_folder_path, image), pixel_array_numpy)
        if n % 50 == 0:
            print('{} image converted'.format(n))

In [4]:
def move_images(list_of_images,train=True):
    for filename in list_of_images:
            classify_file(filename,train)

#move_images(train_filenames,True)
#move_images(valid_filenames,False)
# reassign filenames once the directory has been changed
train_filenames = os.listdir(r'E:\Projects\Pneumonia Project\Train_Images_JPG\Train')

valid_filenames = os.listdir(r'E:\Projects\Pneumonia Project\Train_Images_JPG\Validation')

print('n train samples', len(train_filenames))
print('n valid samples', len(valid_filenames))

n train samples 23123
n valid samples 2560


In [5]:
import csv
import shutil
# empty dictionary
pneumonia_locations = {}
        
with open(r'E:\Projects\Pneumonia Project\stage_1_train_labels.csv', mode='r') as file:
    #using code primarilly written from Jonne on Kaggle see link below
    # link to Kaggle: https://www.kaggle.com/jonnedtc/cnn-segmentation-connected-components
    # open reader
    reader = csv.reader(file)
    # skip header
    next(reader, None)
    # loop through rows
    for rows in reader:
        # retrieve information
        filename = rows[0]
        location = rows[1:5]
        pneumonia = rows[-1]
        # if row contains pneumonia add label to dictionary of lists of pneumonia locations per filename
        if pneumonia == '1':
            # convert string to float to int
            location = [int(float(i)) for i in location]
            # save pneumonia location in dictionary
            if filename in pneumonia_locations:
                pneumonia_locations[filename].append(location)
            else:
                pneumonia_locations[filename] = [location]

In [6]:
class generator(keras.utils.Sequence):
    
    def __init__(self, folder, filenames, pneumonia_locations=None, batch_size=32, image_size=256, shuffle=True, augment=False, predict=False):
        self.folder = folder
        self.filenames = filenames
        self.pneumonia_locations = pneumonia_locations
        self.batch_size = batch_size
        self.image_size = image_size
        self.shuffle = shuffle
        self.augment = augment
        self.predict = predict
        self.on_epoch_end()
        
    def __load__(self, filename):
        # load dicom file as numpy array
        # create empty mask
        msk = np.zeros((0,0,0,0))
        # get filename without extension
        filename = filename.split('.')[0]
        # if image contains pneumonia
        if filename in pneumonia_locations:
        # loop through pneumonia
            for location in pneumonia_locations[filename]:
        # add 1's at the location of the pneumonia
                x, y, w, h = location
                msk[y:y+h, x:x+w] = 1
        # resize both image and mask
        #img = resize(img, (self.image_size, self.image_size), mode='reflect')
        msk = resize(msk, (self.image_size, self.image_size), mode='reflect') > 0.45
        # add trailing channel dimension
        msk = np.expand_dims(msk, -1)
        return msk
    def __loadpredict__(self, filename):
        # load dicom file as numpy array
        img = pydicom.dcmread(os.path.join(self.folder, filename)).pixel_array
        # resize image
        img = resize(img, (self.image_size, self.image_size), mode='reflect')
        # add trailing channel dimension
        img = np.expand_dims(img, -1)
        return img

        
    def __getitem__(self, index):
        # select batch
        filenames = self.filenames[index*self.batch_size:(index+1)*self.batch_size]
        # train mode: return images and masks
            # load files
        items = [self.__load__(filename) for filename in filenames]
        # unzip images and masks
        imgs, msks = zip(*items)
        # create numpy batch
        imgs = np.array(imgs)
        msks = np.array(msks)
        return imgs, msks
        
    def on_epoch_end(self):
        if self.shuffle:
            random.shuffle(self.filenames)
        
    def __len__(self):
        if self.predict:
            # return everything
            return int(np.ceil(len(self.filenames) / self.batch_size))
        else:
            # return full batches only
            return int(len(self.filenames) / self.batch_size)

In [8]:
from keras.engine.input_layer import InputLayer
from keras.layers import Conv2D, MaxPooling2D, MaxPooling3D, GlobalAveragePooling2D, Reshape, Flatten

vgg_conv = VGG16(weights='imagenet', include_top=False,input_shape = (256,256, 3))

layers_to_pop = 16

vgg_conv.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

model = Sequential()

for layer in vgg_conv.layers[:4]:
    model.add(layer)
    

'''model.add(Dense(256,input_shape=(256,256,3)))
#model.add(vgg_conv.get_layer("input"))
model.add(vgg_conv.get_layer("block1_conv1"))
model.add(vgg_conv.get_layer("block1_conv2"))'''
model.add(MaxPooling2D(pool_size=2))
model.add(Conv2D(filters=100,kernel_size=2,padding='same',activation='relu'))
model.add(Dropout(0.15))

model.add(Reshape((640,640,1)))

#x = GlobalAveragePooling2D(None, )


batch_size = 32

train_folder = r'E:\Projects\Pneumonia Project\Train_Images_JPG\Train'
valid_folder = r'E:\Projects\Pneumonia Project\Train_Images_JPG\Validation'

# create generator to load image data
image_train_generator = ImageDataGenerator(preprocessing_function=preprocess_input).flow_from_directory(train_folder,
                                                                                              shuffle=False,
                                                                                              batch_size=batch_size)

image_validation_generator = ImageDataGenerator(preprocessing_function=preprocess_input).flow_from_directory(valid_folder,
                                                                                              shuffle=False,
                                                                                              batch_size=batch_size)

train_label_generator = generator(train_folder, train_filenames, pneumonia_locations, batch_size=32, image_size=640, shuffle=False, augment=False, predict=False)
validation_label_generator = generator(valid_folder, valid_filenames, pneumonia_locations, batch_size=32, image_size=640, shuffle=False, augment=False, predict=False)


model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    
print(model.summary())

checkpointer = ModelCheckpoint(filepath='saved_models/weights.best.VGG16.hdf5', 
                               verbose=1, save_best_only=True)

#zip together two generators to create final train generator
train_generator = zip(image_train_generator,train_label_generator)


#zip together two generators to create final validation generator
valid_generator = zip(image_validation_generator,validation_label_generator)


def cosine_annealing(x):
    lr = 0.001
    epochs = 25
    return lr*(np.cos(np.pi*x/epochs)+1.)/2

# create train and validation generators
#folder = r'../input/stage_1_train_images'

# set the learning rate
#learning_rate = tf.keras.callbacks.LearningRateScheduler(cosine_annealing)

steps_per_epoch = int(len(train_filenames)/batch_size)

valid_steps = int(len(valid_filenames)/batch_size)

history = model.fit_generator(generator = generator,callbacks=[checkpointer],validation_data=valid_generator,validation_steps = valid_steps, steps_per_epoch=steps_per_epoch, epochs=5)

Found 0 images belonging to 0 classes.
Found 0 images belonging to 0 classes.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
block1_conv1 (Conv2D)        (None, 256, 256, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 256, 256, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 128, 128, 64)      0         
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 64, 64, 64)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 64, 64, 100)       25700     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64, 64, 100)       0         
________________________________________________________________

TypeError: 'type' object is not an iterator

In [None]:
# TODO: initialize model variable so that other layers can be added
# TODO: figure out how training labels are going to be attached to the image data (will need to reference the kaggle project)
# TODO: load in the initial model and run the images through the initial model 
# TODO: train a new model using the preprocessed data for ease of training