In [0]:
import zipfile
import cv2
import csv
import os 

from __future__ import division

import six
import numpy as np
import pandas as pd
import cv2
import glob
import random

np.random.seed(2016)
random.seed(2016)

from keras import optimizers
from keras.models import Model
from keras.applications.resnet50 import ResNet50
from keras.models import load_model
from keras.layers import Input, Activation, merge, Dense, Flatten, concatenate, GlobalAveragePooling2D, Dropout
from keras.layers.convolutional import Conv2D, MaxPooling2D, AveragePooling2D
from keras.layers.normalization import BatchNormalization
from keras.regularizers import l2
from keras import backend as K
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

from numpy import expand_dims
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array

import matplotlib.pyplot as plt

from google.colab.patches import cv2_imshow
from google.colab import files

**Mount drive**

In [0]:
# Mount Google Drive inside the Colab VM. The dataset folders and helper notebooks
# referenced by the cells below are read from paths under '/content/drive/My Drive/'.
from google.colab import drive
drive.mount('/content/drive')

**Import external files containing utility functions**

In [0]:
# external notebooks are located in /content/drive/My Drive/Colab Notebooks/
# They are executed with %run; presumably they define the helpers used further down
# that are not defined in this notebook (extract_zip_dataset, load_dataset_in_ram,
# batch_generator_train, ResnetBuilder, show_report, show_probability_predictions,
# show_graphs) — which notebook provides which name is not visible here.

%run '/content/drive/My Drive/Colab Notebooks/dataset_utilities.ipynb'
%run '/content/drive/My Drive/Colab Notebooks/resnet_builder.ipynb'
%run '/content/drive/My Drive/Colab Notebooks/general_utilities.ipynb'

**Removes autoscroll throughout process**

In [0]:
%%javascript
// Replace the output area's scroll decision with a function that always answers
// "no", so long cell outputs are never placed in a scrolling box.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

**Global declarations**


In [0]:
# Central configuration read by the cells below.
conf = dict()

# Fraction of the collected image files assigned to the training set when the data
# is split randomly; the remaining files form the validation set. Range: (0; 1)
conf['train_valid_fraction'] = 0.75

# Batch size for CNN [Depends on GPU and memory available]
conf['batch_size'] = 64

# Number of epochs for CNN training
#conf['nb_epoch'] = 200
conf['nb_epoch'] = 30

# Early stopping. Stop training after this many epochs without improvement on validation.
# NOTE(review): the patience (90) is larger than nb_epoch (30), so with these values
# early stopping can never trigger — confirm this is intended.
conf['earlystopping_patience'] = 90

# Shape of image for CNN (Larger the better, but you need to increase CNN as well)
#conf['image_shape'] = (4160,4128)
#conf['image_shape'] = (2080,2064)
#conf['image_shape'] = (1024,1024)
conf['image_shape'] = (224,224)
#conf['image_shape'] = (64,64)

# Optimizer as a (name, parameters) pair; 'lr' is the learning rate.
#conf['optimizer'] = ('adam',dict())
#conf['optimizer'][1]['lr'] = 0.001
conf['optimizer'] = ('sgd',dict())
conf['optimizer'][1]['lr'] = 0.01
#conf['optimizer'] = ('adadelta',dict())
#conf['optimizer'][1]['lr'] = 1.0

# True: load the whole dataset in memory; False: stream batches from disk.
conf['load_in_ram'] = True

# At most one of these two fields can be True (both False selects the original dataset).
conf['blurred_images'] = False
conf['cropped_images'] = True

# Fail early on a contradictory dataset selection instead of silently preferring one.
if conf['blurred_images'] and conf['cropped_images']:
  raise ValueError("conf['blurred_images'] and conf['cropped_images'] cannot both be True")

# Only consulted when cropped_images is True.
conf['use_additional_images'] = True

# True: fine-tune a pretrained network; False: train the ResNet from scratch.
conf['transfer_learning'] = True

**Hardcoded paths to training files. The "additional" directories are included only when the cropped images are used and `conf['use_additional_images']` is enabled**

In [0]:
# Directories holding the training images, one sub-directory per class,
# chosen according to the dataset variant selected in `conf`.

def _class_dirs(root):
  """Return the three per-class directories ('Type_1/' .. 'Type_3/') under `root`."""
  return [root + class_name + '/' for class_name in ('Type_1', 'Type_2', 'Type_3')]


if conf['blurred_images']:
  filepaths = _class_dirs('/content/drive/My Drive/blurred_dataset/')
elif conf['cropped_images']:
  filepaths = _class_dirs('/content/drive/My Drive/original_dataset_manualcropped/')

  # The additional images are only available in cropped form.
  if conf['use_additional_images']:
    filepaths += _class_dirs('/content/drive/My Drive/full_additional_dataset_automatedcropped/')
else:
  # Neither variant selected: unpack and use the original dataset.
  extract_zip_dataset()
  filepaths = _class_dirs('/content/train/train/')

**Get a list of all training files**

In [0]:
# Collect every .jpg found in the selected class directories.
# The per-directory result is intentionally not stored in a variable called `files`:
# that name is bound to the `google.colab.files` module by the import cell and must
# stay intact for the later `files.download(...)` call.
allFiles = []

for filepath in filepaths:
  # glob returns entries in arbitrary order; sorting keeps the list (and therefore
  # the seeded shuffle/split that follows) the same from one run to the next.
  allFiles.extend(sorted(glob.glob(filepath + '*.jpg')))

**Split data into training and validation sets**

In [0]:
# A dedicated hold-out directory is used for validation only when training on the
# cropped images without the additional set. In every other configuration the
# collected files are shuffled and split according to conf['train_valid_fraction'].
if conf['cropped_images'] and not conf['use_additional_images']:
  train_list = allFiles[:]

  filepaths_validation = [
      '/content/drive/My Drive/cropped_additional_dataset/Type_1/',
      '/content/drive/My Drive/cropped_additional_dataset/Type_2/',
      '/content/drive/My Drive/cropped_additional_dataset/Type_3/',
  ]

  # Do not name the glob result `files`: that would shadow the `google.colab.files`
  # module needed later to download the checkpoint.
  allFiles_validation = []
  for filepath in filepaths_validation:
    allFiles_validation.extend(glob.glob(filepath + '*.jpg'))

  valid_list = allFiles_validation[:]

else:
  split_point = int(round(conf['train_valid_fraction']*len(allFiles)))

  random.shuffle(allFiles)   # in place; order is driven by random.seed set at import time

  train_list = allFiles[:split_point]
  valid_list = allFiles[split_point:]

print('Train patients: {}'.format(len(train_list)))
print('Valid patients: {}'.format(len(valid_list)))

# Number of complete batches per epoch (a trailing partial batch is not counted).
print('Train batches: {}'.format(len(train_list) // conf['batch_size']))
print('Valid batches: {}'.format(len(valid_list) // conf['batch_size']))

**Prepare for data augmentation: when samples are loaded from disk, move the validation dataset into a separate folder**

In [0]:
# Move the validation samples into a separate directory tree so that the
# ImageDataGenerator can simply read the training samples from the train
# directory (see the disk-loader cell below).

if not conf['load_in_ram']:
  import shutil

  # Create all class folders up front (also when a class has no validation sample);
  # exist_ok makes the cell safe to re-run.
  for class_name in ('Type_1', 'Type_2', 'Type_3'):
    os.makedirs(os.path.join('valid', class_name), exist_ok=True)

  for full_filename in valid_list:
    # '<root>/Type_N/<image>.jpg' -> class folder 'Type_N' and file name '<image>.jpg'.
    # Taken from the path components rather than fixed character offsets, which
    # were only correct for files located under '/content/train/train/'.
    typeN_folder = os.path.basename(os.path.dirname(full_filename))
    filename = os.path.basename(full_filename)
    destination = os.path.join('valid', typeN_folder, filename)
    shutil.move(full_filename, destination)     # move file from full_filename to destination

**Define data augmentation operations**

In [0]:
from keras.preprocessing.image import ImageDataGenerator


# Random transformations applied on the fly to the training dataset only.
train_augmentation_options = dict(
    rotation_range=90,
    horizontal_flip=True,
    vertical_flip=True,
    zoom_range=0.2,
    shear_range=0.2,
)
augmentator_train = ImageDataGenerator(**train_augmentation_options)

# Validation samples are passed through without any augmentation.
augmentator_valid = ImageDataGenerator()

**If samples are loaded from disk, define the loaders**

In [0]:
from PIL import ImageFile
# Allow PIL to load image files whose data is truncated.
ImageFile.LOAD_TRUNCATED_IMAGES = True


if conf['load_in_ram'] == False:
  # Options shared by the training and the validation directory iterators.
  common_flow_options = dict(
      target_size=conf['image_shape'],
      color_mode="rgb",
      batch_size=conf['batch_size'],
      class_mode="categorical",
  )

  # Training batches: augmented and reshuffled.
  aug_generator_train = augmentator_train.flow_from_directory(
      directory=r"./train/train/",
      shuffle=True,
      **common_flow_options
  )

  # Validation batches: fixed order, no augmentation.
  aug_generator_valid = augmentator_valid.flow_from_directory(
      directory=r"./valid/",
      shuffle=False,
      seed=42,
      **common_flow_options
  )

  # iterators handed to model.fit_generator
  train_it, val_it = aug_generator_train, aug_generator_valid

**Otherwise, load the samples into RAM**

In [0]:
if conf['load_in_ram'] == True:
  # Load images and labels of both splits into memory.
  train_imgs_in_ram, train_labels_in_ram = load_dataset_in_ram(train_list)
  val_imgs_in_ram, val_labels_in_ram = load_dataset_in_ram(valid_list)

  # Report how many items each loaded collection holds.
  for collection_name, collection in (
      ('train_imgs_in_ram', train_imgs_in_ram),
      ('train_labels_in_ram', train_labels_in_ram),
      ('val_imgs_in_ram', val_imgs_in_ram),
      ('val_labels_in_ram', val_labels_in_ram),
  ):
    print('{}: {}'.format(collection_name, len(collection)))

In [0]:
if conf['load_in_ram'] == True:
  # In-memory batch iterators; these are the objects given to model.fit_generator.
  batch_size = conf['batch_size']
  train_it = augmentator_train.flow(train_imgs_in_ram, y=train_labels_in_ram, batch_size=batch_size)
  val_it = augmentator_valid.flow(val_imgs_in_ram, y=val_labels_in_ram, batch_size=batch_size)

**Sanity-check: show an image**

In [0]:
# The in-RAM arrays exist only when conf['load_in_ram'] is enabled; guard the check so
# the notebook still runs top to bottom when samples are streamed from disk instead.
if conf['load_in_ram']:
  # Pixel values are scaled by 1/255 for display.
  plt.imshow(train_imgs_in_ram[1]/255.)
  plt.show()
  # Print a few labels as a quick look at the label encoding.
  print(train_labels_in_ram[248:252])

**Function to init the model**


In [0]:
# define the model architecture(34, 50, etc..) and the loss and the
# optimizer for training.
def init_model():
  """Build a 3-class ResNet-34 and compile it with the optimizer selected in `conf`.

  Reads `conf['image_shape']` for the input size and `conf['optimizer']`, a
  `(name, params)` pair where `params['lr']` is the learning rate.

  Returns:
    The compiled Keras model (categorical cross-entropy loss, accuracy metric).

  Raises:
    ValueError: if the optimizer name is not 'sgd', 'adam' or 'adadelta'.
  """
  nb_classes = 3
  # rows come from image_shape[1], columns from image_shape[0]
  img_rows, img_cols = conf['image_shape'][1], conf['image_shape'][0]
  img_channels = 3

  # This will return a Keras Model
  model = ResnetBuilder.build_resnet_34((img_channels, img_rows, img_cols), nb_classes)

  name_optimizer, params = conf['optimizer']
  learning_rate = params['lr']
  if name_optimizer == 'sgd':
    optimizer = optimizers.SGD(lr=learning_rate, decay=1e-6, momentum=0.9, nesterov=True)
  elif name_optimizer == 'adam':
    optimizer = optimizers.Adam(learning_rate=learning_rate, beta_1=0.9, beta_2=0.999, amsgrad=False)
  elif name_optimizer == 'adadelta':
    optimizer = optimizers.Adadelta(learning_rate=learning_rate)
  else:
    # A misspelled name used to fall through to Adadelta without any notice.
    raise ValueError("Unknown optimizer '{}': expected 'sgd', 'adam' or 'adadelta'".format(name_optimizer))

  # Configures the model for training
  model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

  return model

**Check the model: trying to overfit a minibatch**

In [0]:
# Sanity check of the training setup: fit the from-scratch model on a single
# minibatch for a few epochs and inspect how well it memorises it.
# Remember that when trying to overfit a minibatch,
#  regularization should be turned off
print('Overfitting a minibatch...')

model = init_model()

# get a minibatch from train dataset
batch_func = batch_generator_train(train_list, conf['batch_size'])   # it returns a generator
X, y = next(batch_func)  # return a batch

# Per-class loss weights, keyed by class index.
class_weight = {0: 1.1,
                1: 0.5,
                2: 0.7}

# Note: fit() uses batch_size=32 here, independently of conf['batch_size'] used to draw X, y.
history = model.fit(x=X, y=y, epochs=7, batch_size=32,
                    verbose=1, class_weight=class_weight)

# print stats
show_report(model, X, y)
show_probability_predictions(model, X, y)
show_graphs(history)

**Training our model from scratch**

In [0]:
if not conf['transfer_learning']:
  print('Create and compile model...')

  model = init_model()

  # evaluate also the use of ReduceLROnPlateau before making EarlyStopping, for
  # example give to ReduceLROnPlateau a patience of 3 and to EarlyStopping a
  # patience of 6.
  callbacks = [
      EarlyStopping(monitor='val_loss', patience=conf['earlystopping_patience'], verbose=1),
      # keep only the weights with the lowest validation loss seen so far
      ModelCheckpoint('cervical_best.hdf5', monitor='val_loss', save_best_only=True, verbose=1),
  ]

  # Per-class loss weights, keyed by class index.
  class_weight = {0: 1.1,
                  1: 0.5,
                  2: 0.7}

  # Step counts must be integers: use the number of complete batches (as np.floor did,
  # but without producing a float), and never less than one step.
  steps_per_epoch = max(1, len(train_list) // conf['batch_size'])
  validation_steps = max(1, len(valid_list) // conf['batch_size'])

  print('Fit model...')
  # Trains the model on data generated batch by batch
  history = model.fit_generator(generator=train_it,
                                epochs=conf['nb_epoch'],
                                steps_per_epoch=steps_per_epoch,      # the number of batches to consider in an epoch
                                validation_data=val_it,
                                validation_steps=validation_steps,
                                verbose=1,
                                callbacks=callbacks,
                                class_weight=class_weight)

**Training model with transfer learning**

In [0]:
if conf['transfer_learning']:
  print('Create and compile model...')
  # ResNet50 with ImageNet weights and without its original classification head
  base_model = ResNet50(weights='imagenet', include_top=False)

  # add a global spatial average pooling layer
  x = base_model.output
  x = GlobalAveragePooling2D()(x)

  # add a fully-connected layer
  x = Dense(512, activation='relu')(x)

  # add a dropout layer for regularization
  dropout = Dropout(0.3)(x)

  # and a logistic layer
  predictions = Dense(3, activation='softmax')(dropout)

  # `inputs`/`outputs` are the Keras 2 keyword names; the singular `input`/`output`
  # spellings are a deprecated Keras 1 form that newer releases no longer accept.
  model = Model(inputs=base_model.input, outputs=predictions)

  # freeze all convolutional Resnet layers (must happen before compile to take effect)
  for layer in base_model.layers:
      layer.trainable = False

  model.compile(optimizer=optimizers.SGD(lr=0.001, momentum=0.9), loss='categorical_crossentropy', metrics=['accuracy'])

In [0]:
 if conf['transfer_learning'] == True: 
  callbacks = [
      EarlyStopping(monitor='val_loss', patience=conf['earlystopping_patience'], verbose=1),
      ModelCheckpoint('cervical_best.hdf5', monitor='val_loss', save_best_only=True, verbose=1),
  ]

  class_weight = {0: 1.1,
                  1: 0.5,
                  2: 0.7}

  print('Fit model...')
  # Trains the model on data generated batch by batch 
  history = model.fit_generator(generator=train_it,
                                epochs=conf['nb_epoch'],
                                steps_per_epoch=np.floor(len(train_list)/conf['batch_size']),      # the number of batches to consider in an epoch
                                validation_data=val_it,
                                validation_steps=np.floor(len(valid_list)/conf['batch_size']),
                                verbose=1,
                                callbacks=callbacks,
                                class_weight=class_weight)

**Print stats of training**

In [0]:
# Plot the training history (validation curves included), then report how the
# current `model` performs on the validation files.
print('Stats of training')
show_graphs(history, plot_validation=True)

print('\n\nPrediction on validation dataset and show stats\n')
# Request one batch as large as the whole validation list.
batch_func = batch_generator_train(valid_list, len(valid_list))   # it returns a generator
X, y = next(batch_func)  # return a batch

# NOTE(review): `model` here is the model as left by training, not necessarily the
# best checkpoint saved in 'cervical_best.hdf5' — confirm which one should be reported.
show_report(model, X, y)

**Download the best model checkpoint to the local machine**

In [0]:
# Re-import the Colab helper right before use: `files` is also a convenient variable
# name for lists of paths, and if it has been rebound the download call would fail
# with "'list' object has no attribute 'download'".
from google.colab import files
files.download('cervical_best.hdf5')

**Load the best model found during training**

In [0]:
# NOTE(review): this loads a checkpoint stored on Google Drive, whereas the training
# cells write 'cervical_best.hdf5' to the current working directory and nothing in this
# notebook copies it to Drive — confirm the Drive file is the intended, up-to-date model.
model = load_model('/content/drive/My Drive/cervical_best.hdf5')

**Prepare for the test phase**

In [0]:
from google.colab import files  
from google.colab import drive
import zipfile  

!pip install kaggle
!pip install --upgrade --force-reinstall --no-deps kaggle

!rm kaggle.json
# Upload kaggle API key file
uploaded = files.upload()     # upload kaggle.json

!rm -rf ../root/.kaggle
!mkdir ../root/.kaggle
!cp kaggle.json ../root/.kaggle
!chmod 600 /root/.kaggle/kaggle.json

# Download zip file containing the dataset 
!kaggle competitions download -c intel-mobileodt-cervical-cancer-screening -p /content/drive/My\ Drive/kaggle_dataset

In [0]:
# Extract from the competition zip only the entries whose name starts with
# 'test_stg2' and place them in the /content/ folder.
# The context manager closes the archive handle once extraction is done.
with zipfile.ZipFile('/content/drive/My Drive/kaggle_dataset/intel-mobileodt-cervical-cancer-screening.zip') as archive:
  for member_name in archive.namelist():
    if member_name.startswith('test_stg2'):
      archive.extract(member_name, '/content/')

In [0]:
# check number of test images
# NOTE(review): this counts entries under 'My Drive/test/test' on Google Drive, not under
# /content/ where the previous cell extracts — confirm this is the directory meant.
!ls /content/drive/My\ Drive/test/test | wc -l

In [0]:
import subprocess
# Run 7-Zip on /content/test_stg2.7z ('x' = extract, '-p...' supplies the archive password).
# subprocess.call returns the command's exit status, which becomes the cell output;
# it is not checked here, so a failed extraction does not stop the notebook.
subprocess.call(["7z", "x", "-pbyecervicalcancer", "/content/test_stg2.7z"])

In [0]:
# check number of test images, here they should be 3506
# NOTE(review): counts entries under 'My Drive/test_stg2'; whether the 7z extraction above
# writes there depends on the working directory at the time — confirm.
!ls /content/drive/My\ Drive/test_stg2 | wc -l

In [0]:
# Extract from the competition zip every entry whose name starts with 'test'
# (this prefix also matches the 'test_stg2...' entries) into /content/test_stg2/.
# The context manager closes the archive handle once extraction is done.
with zipfile.ZipFile('/content/drive/My Drive/kaggle_dataset/intel-mobileodt-cervical-cancer-screening.zip') as archive:
  for member_name in archive.namelist():
    if member_name.startswith('test'):
      archive.extract(member_name, '/content/test_stg2/')

In [0]:
# check number of test images, here they should be 4018
# (counts the entries directly inside /content/test_stg2)
!ls /content/test_stg2 | wc -l

**Create the submission file with the predictions**


In [0]:
# Download only sample_submission_stg2.csv from the competition; the prediction cell
# reads it from /content/ — presumably the current working directory, confirm.
!kaggle competitions download -c intel-mobileodt-cervical-cancer-screening -f sample_submission_stg2.csv

**Load the best training model and make prediction on test dataset**

In [0]:
# Predict class probabilities for every image listed in the sample submission and
# write them to subm.csv.
model = load_model('cervical_best.hdf5')

sample_subm = pd.read_csv("/content/sample_submission_stg2.csv")
ids = sample_subm['image_name'].values

# Directories searched for each test image, in priority order.
test_image_dirs = ["/content/test_stg2/", "/content/drive/My Drive/test/test/"]

# One row of 3 probabilities per entry of `ids`, in the same order.
probabilities = []

for image_name in ids:
    print('Predict for image {}'.format(image_name))

    # Use the first directory that actually contains the image.
    image_path = next(
        (folder + image_name for folder in test_image_dirs if os.path.isfile(folder + image_name)),
        None,
    )
    if image_path is None:
        raise FileNotFoundError(
            'Test image {} not found in any of {}'.format(image_name, test_image_dirs))

    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals an unreadable/corrupt file by returning None
        raise ValueError('Could not decode test image {}'.format(image_path))
    # NOTE: cv2.resize takes the target size as (width, height)
    image = cv2.resize(image, conf['image_shape'])

    prediction = model.predict(np.array([image]), verbose=1, batch_size=1)
    probabilities.append(prediction[0])

# Rows of `probabilities` follow the row order of `sample_subm`, so the three columns
# can be filled in a single assignment instead of one masked lookup per image.
if probabilities:
    sample_subm[['Type_1', 'Type_2', 'Type_3']] = np.array(probabilities)

sample_subm.to_csv("subm.csv", index=False)

In [0]:
#!cat subm.csv