In [1]:
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import regularizers
from imgaug import augmenters as iaa

print(tf.__version__)

2.3.0


In [2]:
!pip install -q git+https://github.com/tensorflow/docs

import tensorflow_docs as tfdocs
import tensorflow_docs.modeling
import tensorflow_docs.plots

[?25l[K     |▎                               | 10kB 17.8MB/s eta 0:00:01[K     |▋                               | 20kB 11.0MB/s eta 0:00:01[K     |█                               | 30kB 8.9MB/s eta 0:00:01[K     |█▎                              | 40kB 7.7MB/s eta 0:00:01[K     |█▋                              | 51kB 4.5MB/s eta 0:00:01[K     |██                              | 61kB 4.5MB/s eta 0:00:01[K     |██▎                             | 71kB 4.9MB/s eta 0:00:01[K     |██▋                             | 81kB 5.0MB/s eta 0:00:01[K     |██▉                             | 92kB 5.3MB/s eta 0:00:01[K     |███▏                            | 102kB 5.7MB/s eta 0:00:01[K     |███▌                            | 112kB 5.7MB/s eta 0:00:01[K     |███▉                            | 122kB 5.7MB/s eta 0:00:01[K     |████▏                           | 133kB 5.7MB/s eta 0:00:01[K     |████▌                           | 143kB 5.7MB/s eta 0:00:01[K     |████▉                    

In [2]:
def get_optimizer():
  """Return the Adam optimizer used by every model in this notebook.

  Returns:
    A ``tf.keras.optimizers.Adam`` instance with learning rate 0.001,
    beta_1 0.9, beta_2 0.999 and epsilon 1e-08.
  """
  # `learning_rate` is the supported keyword; `lr` is only a deprecated alias.
  return tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

In [3]:
def get_callbacks(logdir,name,monitor='val_loss',checkpoint_path="/content/drive/MyDrive/CapstoneProject/data/weight"):
  """Build the list of Keras callbacks used while training.

  Args:
    logdir: Root directory for TensorBoard logs (``str`` or ``pathlib.Path``).
    name: Sub-directory of ``logdir`` for this run.
    monitor: Name of the logged quantity that both early stopping and
      checkpointing watch. Defaults to ``'val_loss'``.
    checkpoint_path: Where the best weights are written.

  Returns:
    A list with an epoch-progress printer, early stopping (patience 200),
    a best-only weights checkpoint and a TensorBoard logger.
  """
  import pathlib  # local import so this cell does not depend on a later import cell

  # Path(...) accepts both str and Path, so plain-string log directories work too.
  tensorboard_dir = str(pathlib.Path(logdir) / name)
  return [
    tfdocs.modeling.EpochDots(),
    tf.keras.callbacks.EarlyStopping(monitor=monitor, patience=200),
    # Checkpoint on the same quantity that drives early stopping so that the
    # saved "best" weights and the stopping decision agree.
    tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path, monitor=monitor, save_best_only=True, save_weights_only=True, verbose=1),
    tf.keras.callbacks.TensorBoard(tensorboard_dir),
  ]

In [5]:
def compile_model(model, optimizer=None):
  """Compile ``model`` with a binary cross-entropy loss on logits and print its summary.

  Args:
    model: Object exposing ``compile`` and ``summary``; it is compiled in place.
    optimizer: Optimizer to use; when ``None`` the result of
      ``get_optimizer()`` is used instead.

  Returns:
    None.
  """
  chosen_optimizer = get_optimizer() if optimizer is None else optimizer
  training_loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
  # NOTE(review): 'accuracy' is evaluated on the raw model outputs while the
  # loss is configured with from_logits=True; confirm the reported accuracy
  # is thresholded the way you expect.
  tracked_metrics = [
    tf.keras.losses.BinaryCrossentropy(from_logits=True, name='binary_crossentropy'),
    'accuracy',
  ]
  model.compile(optimizer=chosen_optimizer, loss=training_loss, metrics=tracked_metrics)

  model.summary()



In [8]:
def fit_model(model,train_ds,steps_per_epoch,validate_ds,logdir, logfile_name, max_epochs=1000, monitor='val_loss'):
    """Train ``model`` and return the Keras ``History`` object.

    Args:
        model: A compiled Keras model.
        train_ds: Training data passed to ``model.fit`` as ``x``.
        steps_per_epoch: Number of batches that make up one epoch.
        validate_ds: Validation data.
        logdir: Root directory for TensorBoard logs.
        logfile_name: Sub-directory of ``logdir`` for this run.
        max_epochs: Upper bound on the number of epochs.
        monitor: Quantity watched by the callbacks built in ``get_callbacks``.

    Returns:
        Whatever ``model.fit`` returns.
    """
    # steps_per_epoch must be passed by keyword: the second positional
    # parameter of Model.fit is `y`, not `steps_per_epoch`.
    history = model.fit(
        train_ds,
        steps_per_epoch=steps_per_epoch,
        epochs=max_epochs,
        validation_data=validate_ds,
        # get_callbacks needs the monitored quantity as its third argument.
        callbacks=get_callbacks(logdir, logfile_name, monitor),
        verbose=0)

    return history


In [5]:
def load_saved_model(path):
  """Load a saved Keras model from ``path``, print its summary and return it."""
  restored = tf.keras.models.load_model(path)
  restored.summary()
  return restored

In [None]:
def vgg_model():
  """Placeholder for the VGG model builder; not implemented yet.

  Raises:
    NotImplementedError: always, until the model is added.
  """
  # add here vgg model
  raise NotImplementedError("vgg_model is not implemented yet")

In [None]:
def mask_cnn_model():
  """Placeholder for the Mask-CNN model builder; not implemented yet.

  Raises:
    NotImplementedError: always, until the model is added.
  """
  # add here mask_cnn_model
  raise NotImplementedError("mask_cnn_model is not implemented yet")

In [None]:
def resnet_model():
  """Placeholder for the ResNet model builder; not implemented yet.

  Raises:
    NotImplementedError: always, until the model is added.
  """
  # add here resnet model
  raise NotImplementedError("resnet_model is not implemented yet")

In [None]:
def unet_model():
  """Placeholder for the U-Net model builder; not implemented yet.

  Raises:
    NotImplementedError: always, until the model is added.
  """
  # add here unet model
  raise NotImplementedError("unet_model is not implemented yet")

In [None]:
def mobilenet_model():
  """Placeholder for the MobileNet model builder; not implemented yet.

  Raises:
    NotImplementedError: always, until the model is added.
  """
  # add here mobilenet model
  raise NotImplementedError("mobilenet_model is not implemented yet")

In [None]:
def yolo_model():
  """Placeholder for the YOLO model builder; not implemented yet.

  Raises:
    NotImplementedError: always, until the model is added.
  """
  # add here yolo model
  raise NotImplementedError("yolo_model is not implemented yet")

The dataset is too large to fit into memory, so we need to create a generator that loads data on the fly.

The generator takes a folder, a list of filenames, a batch size, and options for image size, shuffling, augmentation and prediction mode.

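In training mode the generator outputs a batch of numpy images and the matching numpy masks (the filenames are reshuffled at the end of every epoch when shuffling is enabled); in prediction mode it outputs the images together with their filenames.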

In [7]:
!pip install pydicom

Collecting pydicom
[?25l  Downloading https://files.pythonhosted.org/packages/72/7b/6ed88f82dd33a32cdb43432dab7f84fcd40c49d63251442b3cfe0be983d4/pydicom-2.1.1-py3-none-any.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 4.9MB/s 
[?25hInstalling collected packages: pydicom
Successfully installed pydicom-2.1.1


In [8]:
import csv
import os
import random

import keras
import numpy as np
import pandas as pd
import pydicom

class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` that reads DICOM images from disk one batch at a time.

    In training mode each batch is ``(images, masks)``; in predict mode it is
    ``(images, filenames)``.

    Args:
        folder: Directory that contains the DICOM files.
        filenames: File names (relative to ``folder``) to iterate over. When
            ``shuffle`` is true this sequence is shuffled in place.
        pneumonia_locations: Mapping from file name without extension to a
            list of ``(x, y, w, h)`` boxes. ``None`` is treated as "no boxes".
        batch_size: Number of files per batch.
        image_size: Side length images and masks are resized to.
        shuffle: Shuffle ``filenames`` on construction and after every epoch.
        augment: Randomly flip image and mask left-right half of the time.
        predict: Return ``(images, filenames)`` instead of ``(images, masks)``.
    """

    def __init__(self, folder, filenames, pneumonia_locations=None, batch_size=32, image_size=256, shuffle=True, augment=False, predict=False):
        self.folder = folder
        self.filenames = filenames
        # Normalise None to an empty mapping so lookups in __load__ never fail.
        self.pneumonia_locations = {} if pneumonia_locations is None else pneumonia_locations
        self.batch_size = batch_size
        self.image_size = image_size
        self.shuffle = shuffle
        self.augment = augment
        self.predict = predict
        self.on_epoch_end()

    def __load__(self, filename):
        """Load one file and return ``(image, mask)``, each with a trailing channel axis."""
        # load dicom file as numpy array
        img = pydicom.dcmread(os.path.join(self.folder, filename)).pixel_array
        # create empty mask
        msk = np.zeros(img.shape)
        # get filename without extension
        filename = filename.split('.')[0]
        # Use the mapping stored on the instance (not a notebook global) and
        # add 1's at every pneumonia location recorded for this file.
        for x, y, w, h in self.pneumonia_locations.get(filename, ()):
            msk[y:y+h, x:x+w] = 1
        # if augment then horizontal flip half the time
        if self.augment and random.random() > 0.5:
            img = np.fliplr(img)
            msk = np.fliplr(msk)
        # resize both image and mask
        # NOTE(review): `resize` is not imported in the visible cells;
        # presumably skimage.transform.resize - confirm and import it.
        img = resize(img, (self.image_size, self.image_size), mode='reflect')
        msk = resize(msk, (self.image_size, self.image_size), mode='reflect') > 0.5
        # add trailing channel dimension
        img = np.expand_dims(img, -1)
        msk = np.expand_dims(msk, -1)
        return img, msk

    def __loadpredict__(self, filename):
        """Load one file and return the resized image with a trailing channel axis."""
        # load dicom file as numpy array
        img = pydicom.dcmread(os.path.join(self.folder, filename)).pixel_array
        # resize image
        img = resize(img, (self.image_size, self.image_size), mode='reflect')
        # add trailing channel dimension
        img = np.expand_dims(img, -1)
        return img

    def __getitem__(self, index):
        """Return batch number ``index``."""
        # select batch
        start = index * self.batch_size
        filenames = self.filenames[start:start + self.batch_size]
        # predict mode: return images and filenames
        if self.predict:
            imgs = np.array([self.__loadpredict__(filename) for filename in filenames])
            return imgs, filenames
        # train mode: return images and masks
        items = [self.__load__(filename) for filename in filenames]
        # unzip images and masks
        imgs, msks = zip(*items)
        # create numpy batches
        return np.array(imgs), np.array(msks)

    def on_epoch_end(self):
        """Shuffle the file list in place when shuffling is enabled."""
        if self.shuffle:
            random.shuffle(self.filenames)

    def __len__(self):
        """Return the number of batches per epoch."""
        if self.predict:
            # return everything, including a final partial batch
            return int(np.ceil(len(self.filenames) / self.batch_size))
        # return full batches only
        return len(self.filenames) // self.batch_size



Load pneumonia locations
Table contains [filename : pneumonia location] pairs per row.

If an image contains multiple pneumonia regions, the table contains multiple rows with the same filename but different pneumonia locations.
If an image contains no pneumonia, the table contains a single row with an empty pneumonia location.
The code below loads the table and transforms it into a dictionary.

The dictionary uses the filename as key and a list of pneumonia locations in that filename as value.
If a filename is not present in the dictionary it means that it contains no pneumonia.

In [9]:


def load(train_label_file_path):
  """Parse the training-label CSV into a ``{filename: [location, ...]}`` dict.

  The first row is skipped as a header. In every other row, column 0 is the
  key, columns 1-4 are the location and column 5 is the pneumonia flag. Only
  rows whose flag is ``'1'`` are kept; their location values are converted
  string -> float -> int.

  Args:
    train_label_file_path: Path of the CSV file to read.

  Returns:
    A dict mapping each key to the list of its locations, in file order.
    Keys without any flagged row are absent.
  """
  pneumonia_locations = {}
  # newline='' is what the csv module asks for when it is handed a file object.
  with open(train_label_file_path, mode='r', newline='') as infile:
    reader = csv.reader(infile)
    # skip header
    next(reader, None)
    for row in reader:
      # Blank or truncated lines carry no label; skip them instead of
      # failing with an IndexError.
      if len(row) < 6:
        continue
      if row[5] != '1':
        continue
      # convert string to float to int
      location = [int(float(value)) for value in row[1:5]]
      pneumonia_locations.setdefault(row[0], []).append(location)
  return pneumonia_locations
    

In [None]:
def get_patientids(df_labels, count):
  """Draw a random, roughly balanced sample of DICOM file names.

  Up to half of ``count`` is drawn from the unique ``patientId`` values whose
  ``Target`` is 1, and the remainder from those whose ``Target`` is 0. Each
  group is capped at the number of patients it actually has, so fewer than
  ``count`` names are returned when the table is too small.

  Args:
    df_labels: DataFrame with ``patientId`` and ``Target`` columns.
    count: Number of file names requested.

  Returns:
    A shuffled list of ``'<patientId>.dcm'`` strings.
  """
  pneumonio_patients = np.asarray(df_labels[df_labels['Target'] == 1]['patientId'].unique())
  normal_patients = np.asarray(df_labels[df_labels['Target'] == 0]['patientId'].unique())

  positive_count = min(int(count*0.5), len(pneumonio_patients))
  # A truncated permutation is a sample without replacement and, unlike
  # np.random.choice, also works when the group is empty.
  positive_selection = pneumonio_patients[np.random.permutation(len(pneumonio_patients))[:positive_count]]

  # Cap the negative sample as well so an undersized table cannot raise.
  negative_count = min(count - len(positive_selection), len(normal_patients))
  negative_selection = normal_patients[np.random.permutation(len(normal_patients))[:negative_count]]

  print(f'Pneumonia: {len(positive_selection)}')
  print(f'Normal: {len(negative_selection)}')

  # Shuffle with numpy's generator too, so a single np.random.seed call makes
  # the whole selection reproducible.
  patients = np.random.permutation(np.concatenate([positive_selection, negative_selection]))

  filenames = [f'{patient_id}.dcm' for patient_id in patients]

  return filenames


In [None]:
# Read the label table and request 5000 file names from get_patientids.
# NOTE(review): `label_path` is not defined in the visible cells - confirm it
# is assigned before this cell runs.
df_labels = pd.read_csv(label_path)
selected_files = get_patientids(df_labels, 5000)
# Print how many names were selected and the first one as a quick sanity check.
print(f'{len(selected_files)}, names:{selected_files[0]}')

In [None]:
#To check whether the above selected filenames are subset of filenames from file system
#filenames = os.listdir(train_path)
#print(set(selected_files).issubset(filenames))