In [1]:
import numpy as np
import pandas as pd
import os


# Root directory of the input data set used by the cells below.
base_dir = '../input/histopathologic-cancer-detection/'

# Print the directory entries as a quick look at what is available.
input_entries = os.listdir(base_dir)
print(input_entries)

In [2]:
# Read the training label table and preview its first rows.
labels_csv = base_dir + "train_labels.csv"
labels = pd.read_csv(labels_csv)
labels.head()

In [38]:
# Image directories for both splits, followed by the entries found in each.
train_path, test_path = base_dir + "train/", base_dir + "test/"
train_files, test_files = os.listdir(train_path), os.listdir(test_path)

### Size of the Train & Test Sets

In [39]:
# Number of directory entries found for each split.
n_train, n_test = len(train_files), len(test_files)
print("Train size: ", n_train)
print("Test size: ", n_test)

### Class Counts of the Response Variable

In [5]:
# How many rows carry each value of the `label` column.
labels["label"].value_counts()

In [6]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import altair as alt
sns.set()
from PIL import Image


# Draw 50 distinct image ids per label value (1 first, then 0) for the
# preview grids in the following cells.
n_preview = 50
positive_ids = labels.loc[labels.label == 1, "id"]
negative_ids = labels.loc[labels.label == 0, "id"]
positive_images = np.random.choice(positive_ids, size=n_preview, replace=False)
negative_images = np.random.choice(negative_ids, size=n_preview, replace=False)

### Sample Images: Positive Class (label = 1)

In [7]:
# Show the sampled label-1 images on a 5x10 grid, one image per axes,
# with grid lines and tick labels switched off.
fig, ax = plt.subplots(5, 10, figsize=(20,10))

# ax.flat walks the grid row by row, matching the order of the sampled ids.
for axis, img_id in zip(ax.flat, positive_images):
    image = Image.open(train_path + img_id + ".tif")
    axis.imshow(image)
    axis.grid(False)
    axis.tick_params(labelbottom=False, labelleft=False)

### Sample Images: Negative Class (label = 0)

In [8]:
# Show the sampled label-0 images on a 5x10 grid, one image per axes,
# with grid lines and tick labels switched off.
fig, ax = plt.subplots(5, 10, figsize=(20,10))

# ax.flat walks the grid row by row, matching the order of the sampled ids.
for axis, img_id in zip(ax.flat, negative_images):
    image = Image.open(train_path + img_id + ".tif")
    axis.imshow(image)
    axis.grid(False)
    axis.tick_params(labelbottom=False, labelleft=False)

In [9]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout, Activation
from tensorflow.keras.layers import BatchNormalization, GlobalAveragePooling2D
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.applications import VGG16

from shutil import copyfile, move
from tqdm import tqdm
import h5py
import random

In [10]:
# Build the table consumed by the image generators: one row per labelled image
# with its file name ("<id>.tif") and a string class name.
dataset_df = pd.read_csv(base_dir + "train_labels.csv")

# Column-wise operations replace the former row-by-row iterrows() loops, which
# created a Series object per row (twice over the whole table) for what is a
# plain string concatenation and a two-way mapping.
dataset_df["filename"] = dataset_df["id"] + ".tif"
# label == 1 -> "cancerous"; every other value -> "healthy" (same rule as before).
dataset_df["groundtruth"] = dataset_df["label"].eq(1).map({True: "cancerous", False: "healthy"})
dataset_df.head()


In [11]:
# Split the labelled images into a training part (80%) and a validation part
# (the remaining rows).
training_sample_percentage = 0.8
training_sample_size = int(len(dataset_df)*training_sample_percentage)
validation_sample_size = len(dataset_df)-training_sample_size

# A fixed seed makes the split identical on every run, so validation metrics
# from different runs are measured on the same images.
split_seed = 42
training_df = dataset_df.sample(n=training_sample_size, random_state=split_seed)

# Validation rows are exactly the rows that were not drawn for training.
validation_df = dataset_df.drop(training_df.index)

# Sanity check: the two parts must account for every row.
assert len(validation_df) == validation_sample_size, "train/validation split sizes do not add up"

In [12]:
# Batch sizes for the two generators and the image size requested from them.
training_batch_size = 64
validation_batch_size = 64
target_size = (96,96)

# Training pipeline: pixel values are multiplied by 1/255 and the images are
# randomly flipped, zoomed and shifted.
augmentation = dict(
    rescale=1. / 255,
    horizontal_flip=True,
    vertical_flip=True,
    zoom_range=0.2,
    width_shift_range=0.1,
    height_shift_range=0.1,
)
train_datagen = ImageDataGenerator(**augmentation)

# Shuffled batches of (image, binary label) read from the training split;
# file names come from `filename`, class names from `groundtruth`.
train_generator = train_datagen.flow_from_dataframe(
    dataframe=training_df,
    directory=train_path,
    x_col='filename',
    y_col='groundtruth',
    class_mode='binary',
    target_size=target_size,
    batch_size=training_batch_size,
    shuffle=True,
)


# Validation pipeline: only the 1/255 rescaling, no augmentation.
validation_datagen = ImageDataGenerator(rescale=1. / 255)

# Batches are produced in a fixed order (shuffle=False) from the validation split.
validation_generator = validation_datagen.flow_from_dataframe(
    dataframe=validation_df,
    directory=train_path,
    x_col='filename',
    y_col='groundtruth',
    class_mode='binary',
    target_size=target_size,
    batch_size=validation_batch_size,
    shuffle=False,
)

### Use the Pretrained VGG16 Model

In [13]:
# VGG16 with ImageNet weights and without its top (classifier) layers,
# instantiated for 96x96 three-channel inputs.
input_shape = (96, 96, 3)
pretrained_layers = VGG16(
    include_top=False,
    weights='imagenet',
    input_shape=input_shape,
)
pretrained_layers.summary()

In [14]:
# Freeze every VGG16 layer except the last eight, printing each layer together
# with its resulting trainable flag.
first_trainable = len(pretrained_layers.layers) - 8
for position, layer in enumerate(pretrained_layers.layers):
    if position < first_trainable:
        layer.trainable = False
    print(layer, layer.trainable)

In [15]:
# Dropout rate applied after the dense block of the classification head.
dropout_dense_layer = 0.6

# VGG16 base followed by a small head: global average pooling, a 256-unit
# dense layer (bias omitted, batch normalisation follows it), ReLU, dropout,
# and a single sigmoid output unit.
model = Sequential([
    pretrained_layers,
    GlobalAveragePooling2D(),
    Dense(256, use_bias=False),
    BatchNormalization(),
    Activation('relu'),
    Dropout(dropout_dense_layer),
    Dense(1),
    Activation('sigmoid'),
])

In [16]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

In [17]:
# Binary cross-entropy loss, Adam with a learning rate of 0.001, accuracy as metric.
adam = keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    optimizer=adam,
    loss=keras.losses.binary_crossentropy,
    metrics=['accuracy'],
)

In [18]:
# All three callbacks watch the validation loss:
# halve the learning rate after 1 epoch without improvement,
lr_schedule = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1, verbose=1)
# stop training after 5 epochs without improvement,
early_stop = EarlyStopping(monitor='val_loss', patience=5)
# and keep only the best model seen so far in best_model.h5.
checkpoint = ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True)
callbacks = [lr_schedule, early_stop, checkpoint]

# Number of full batches per pass over each generator (a trailing partial
# batch is not counted).
train_step_size = train_generator.n // train_generator.batch_size
validation_step_size = validation_generator.n // validation_generator.batch_size

In [19]:
# Train for a fixed number of epochs; the returned History object feeds the
# accuracy/loss plots in the following cells.
epochs = 5

# Model.fit accepts generators directly. Model.fit_generator is deprecated in
# TensorFlow 2.x and no longer exists in recent Keras releases.
# The former `shuffle=True` argument is dropped: it has no effect for generator
# input, where ordering is controlled by the generator itself (train_generator
# is created with shuffle=True).
history = model.fit(
    train_generator,
    steps_per_epoch=train_step_size,
    validation_data=validation_generator,
    validation_steps=validation_step_size,
    epochs=epochs,
    verbose=1,
    callbacks=callbacks,
)

In [23]:
# Training vs. validation accuracy, one point per epoch.
acc_fig, acc_ax = plt.subplots()
for metric_key in ('accuracy', 'val_accuracy'):
    acc_ax.plot(history.history[metric_key])
acc_ax.set_title('Accuracy over epochs')
acc_ax.set_ylabel('Acc')
acc_ax.set_xlabel('Epoch')
acc_ax.legend(['Train', 'Validation'], loc='best')
plt.show()

In [24]:
# Training vs. validation loss, one point per epoch.
loss_fig, loss_ax = plt.subplots()
for metric_key in ('loss', 'val_loss'):
    loss_ax.plot(history.history[metric_key])
loss_ax.set_title('Loss over epochs')
loss_ax.set_ylabel('Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.legend(['Train', 'Validation'], loc='best')
plt.show()

In [25]:
# Load the weights stored in best_model.h5, the file the ModelCheckpoint
# callback writes (save_best_only=True, monitoring val_loss).
model.load_weights("best_model.h5")

In [45]:
# Test pipeline: only the 1/255 rescaling, no augmentation.
test_datagen = ImageDataGenerator(rescale=1. / 255)

# One image per batch, in a fixed order, so predictions line up with
# test_generator.filenames.
# NOTE(review): this directory is outside base_dir and no cell shown here
# creates it; presumably it is a separately attached copy of the test images
# placed inside a sub-folder -- confirm before a fresh run.
test_generator = test_datagen.flow_from_directory(
    directory="../input/dataset/test/",
    batch_size=1,
    shuffle=False,
    target_size=target_size,
)


In [46]:
# Predict over the whole test generator. Model.predict accepts generators
# directly; Model.predict_generator is deprecated in TensorFlow 2.x and no
# longer exists in recent Keras releases.
pred = model.predict(test_generator, verbose=1)

In [None]:
# Write the predictions as an "id,label" CSV.
# The file goes to the working directory: the previous target was
# sample_submission.csv inside the input data directory, which would overwrite
# the provided sample file and fails outright where the input directory is
# mounted read-only.
submission_path = "submission.csv"

# `with` closes the file even if a write fails part-way through.
with open(submission_path, "w") as csv_file:
    csv_file.write("id,label\n")
    for filename, prediction in zip(test_generator.filenames, pred):
        # Image id = file name without its directory part and extension; this
        # works whether or not the generator reports a sub-folder prefix.
        name = os.path.splitext(os.path.basename(filename))[0]
        # Each prediction row holds a single value: the model's sigmoid output.
        csv_file.write(str(name)+","+str(prediction[0])+"\n")