# **Histopathologic Cancer Detection**
***Identification of Metastatic Tissue in Histopathologic Scans of Lymph Node Sections***    


**The images are labeled as 0 or 1, where 0 = No Tumor Tissue and 1 = Has Tumor Tissue(s)**

In [None]:


import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import cv2
import seaborn as sns
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import itertools
import shutil


import os
# Show what is available in the input directory.
print(os.listdir("../input"))


# **Exploratory Data Analysis**

In [None]:
# Number of entries in the train folder, then in the test folder.
for image_dir in ('../input/train', '../input/test'):
    print(len(os.listdir(image_dir)))

### Creating a DataFrame of Training images

In [None]:
# Load the training labels; later cells read the 'id' and 'label' columns.
df = pd.read_csv('../input/train_labels.csv')
print('Shape of DataFrame',df.shape)
# Preview the first rows.
df.head()

### Deleting 2 images as they caused error in prediction

In [None]:
# Drop two known-bad images. Boolean indexing returns a new DataFrame, so the
# filtered result must be assigned back to `df` for the rows to be removed.
EXCLUDED_IDS = [
    # caused a training error previously
    'dd6dfed324f9fcb6f93f46f32fc800f2ec196be2',
    # the image is black
    '9369c7278ec8bcc6c880d99194de09fc2bd4efbe',
]
df = df[~df['id'].isin(EXCLUDED_IDS)]
df.head()

In [None]:
# Bar chart of the class distribution with the exact count written above each bar.
label_counts = df['label'].value_counts()  # computed once, reused for both annotations

fig, ax = plt.subplots(figsize = (6,6))
# Pass the column by keyword: recent seaborn releases treat the first
# positional argument as `data`, not as the variable to count.
sns.countplot(x = df['label'], ax = ax)
ax.set_title('Label Counts', fontsize = 18)
for label in (0, 1):
    # Bars are drawn at x = 0 and x = 1; place the text slightly above the bar top.
    ax.annotate(str(label_counts[label]),
                xy = (label, label_counts[label] + 2000),
                va = 'bottom',
                ha = 'center',
                fontsize = 12)
ax.set_ylim(0, 150000)
ax.set_ylabel('Count', fontsize = 16)
ax.set_xlabel('Labels', fontsize = 16)
plt.show()

There is a slight imbalance in the labels, which needs to be rectified.

# **Feature Engineering**

### **Take 80K images from both categories**

In [None]:
# Number of images drawn from each class so that both classes are equally represented.
SAMPLE_SIZE = 80000

df_negative = df[df['label'] == 0].sample(SAMPLE_SIZE, random_state = 0)
df_positive = df[df['label'] == 1].sample(SAMPLE_SIZE, random_state = 0)

# Stack the two samples and renumber the rows.
df_train = pd.concat([df_negative, df_positive], axis = 0).reset_index(drop = True)

# Seed the shuffle like the sampling above; without it the row order, and
# therefore the train/validation split that follows, changes on every run.
df_train = shuffle(df_train, random_state = 0)


### **Splitting the shuffled images into training and validation sets**

In [None]:

# Labels of the sampled frame, used to stratify the split.
y = df_train['label']

# Hold out 10% of the rows as df_val, stratified on `y`.
# NOTE: this rebinds df_train to the training part, so re-running the cell
# without re-running the sampling cell splits the already-split frame again.
df_train, df_val = train_test_split(df_train, test_size = 0.1, random_state = 0, stratify = y)

**Creating Directory Structure**

In [None]:

# Folder layout consumed by the rest of the notebook:
#
#   base_dir/
#       train_dir/0, train_dir/1
#       val_dir/0,   val_dir/1
#
# The copy cell writes into `train_dir` / `val_dir`, the generators read
# 'base_dir/train_dir' and 'base_dir/val_dir', and the cleanup cell removes
# 'base_dir', so the names and paths defined here must match those exactly.
base_dir = 'base_dir'
train_dir = os.path.join(base_dir, 'train_dir')
val_dir = os.path.join(base_dir, 'val_dir')

# Longer aliases for the same three paths.
basedirectory = base_dir
training_directory = train_dir
validation_directory = val_dir

for split_dir in (train_dir, val_dir):
    for class_name in ('0', '1'):
        # makedirs also creates the parent folders; exist_ok keeps the cell re-runnable.
        os.makedirs(os.path.join(split_dir, class_name), exist_ok = True)

# Class folders of the validation split.
no_tumor = os.path.join(val_dir, '0')
has_tumor = os.path.join(val_dir, '1')



**Transfer the respective images into their respective folders**

In [None]:
def copy_images(image_ids, dest_root):
    """Copy training images into the class sub-folders of ``dest_root``.

    Each id in ``image_ids`` is looked up in the id-indexed ``df`` to get its
    label, and '../input/train/<id>.tif' is copied to
    '<dest_root>/<label>/<id>.tif'.

    Raises:
        ValueError: if an image has a label other than 0 or 1.
    """
    for image in image_ids:
        imagename = image + '.tif'
        target = df.loc[image, 'label']
        if target not in (0, 1):
            # Fail loudly instead of silently reusing the previous image's label.
            raise ValueError('Unexpected label {!r} for image {}'.format(target, image))
        label = str(int(target))  # the label doubles as the class folder name

        src = os.path.join('../input/train', imagename)
        dest = os.path.join(dest_root, label, imagename)
        shutil.copyfile(src, dest)


# Index the labels by image id for the .loc lookups above. Guarded so that
# re-running the cell does not fail once 'id' is already the index.
if df.index.name != 'id':
    df.set_index('id', inplace=True)

traininglist = list(df_train['id'])
validationlist = list(df_val['id'])

copy_images(traininglist, train_dir)
copy_images(validationlist, val_dir)

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

IMAGE_SIZE = 96
training_path = 'base_dir/train_dir'
validation_path = 'base_dir/val_dir'
testing_path = '../input/test'
training_sample_count = len(df_train)
validation_sample_count = len(df_val)
training_batchsize = 32
validation_batchsize = 32

# Batches per epoch, rounded up so the last partial batch is included.
# Cast to int: np.ceil returns a float, and step counts must be integers.
train_steps = int(np.ceil(training_sample_count / training_batchsize))
val_steps = int(np.ceil(validation_sample_count / validation_batchsize))

# Only rescaling is applied: pixel values are multiplied by 1/255.
datagen = ImageDataGenerator(rescale=1.0/255)

train_gen = datagen.flow_from_directory(training_path,
                                        target_size=(IMAGE_SIZE,IMAGE_SIZE),
                                        batch_size=training_batchsize,
                                        class_mode='categorical')

val_gen = datagen.flow_from_directory(validation_path,
                                      target_size=(IMAGE_SIZE,IMAGE_SIZE),
                                      batch_size=validation_batchsize,
                                      class_mode='categorical')

# Despite its name, this generator reads the validation folder. It differs from
# val_gen only in batch_size=1 and shuffle=False, which keeps the prediction
# order fixed so it can be compared with test_gen.classes.
test_gen = datagen.flow_from_directory(validation_path,
                                       target_size=(IMAGE_SIZE,IMAGE_SIZE),
                                       batch_size=1,
                                       class_mode='categorical',
                                       shuffle=False)

# **Model Creation** 

In [None]:
#Import Keras
import keras
import tensorflow.keras
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import SeparableConv2D
from keras.layers.core import Activation
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Dropout, MaxPooling2D, Flatten, Dense


In [None]:
class CNNNet:
    """Plain convolutional classifier: three convolution stages and a dense head."""

    @staticmethod
    def build(width, height, depth, classes):
        """Assemble the network, print its summary and return it (uncompiled).

        Args:
            width, height, depth: input image dimensions; the input shape
                passed to the first layer is (height, width, depth).
            classes: number of units in the final softmax layer.
        """
        input_shape = (height, width, depth)
        net = Sequential()

        # (filters, dropout rate) of each stage. A stage is three 'same'-padded
        # ReLU convolutions followed by 2x2 max pooling and dropout.
        stages = [(32, 0.2), (64, 0.2), (128, 0.25)]
        for stage_index, (filters, drop_rate) in enumerate(stages):
            if stage_index == 0:
                # Only the very first layer uses a 5x5 kernel and declares the input shape.
                net.add(Conv2D(filters = filters, kernel_size = (5,5), padding="same",
                               activation='relu', input_shape = input_shape))
            else:
                net.add(Conv2D(filters = filters, kernel_size = (3,3), padding="same",
                               activation='relu'))
            for _ in range(2):
                net.add(Conv2D(filters = filters, kernel_size = (3,3), padding="same",
                               activation='relu'))
            net.add(MaxPooling2D(pool_size=(2, 2)))
            net.add(Dropout(drop_rate))

        # Dense head.
        net.add(Flatten())
        net.add(Dense(units = 500, activation = 'relu'))
        net.add(Dropout(0.2))
        net.add(Dense(classes, activation='softmax'))

        net.summary()
        return net

In [None]:
class CancerCNN:
    """Depthwise-separable convolutional classifier with batch normalisation."""

    @staticmethod
    def build(width, height, depth, classes):
        """Assemble the network, print its summary and return it (uncompiled).

        Args:
            width, height, depth: input image dimensions; the input shape
                passed to the first layer is (height, width, depth).
            classes: number of units in the final softmax layer.
        """
        # Take these layers from tensorflow.keras, the same package that
        # provides Sequential. The notebook-level imports pull them from the
        # standalone `keras` package, and mixing layers of the two packages in
        # one model is not supported.
        from tensorflow.keras.layers import (Activation, BatchNormalization,
                                             SeparableConv2D)

        model = Sequential()
        inputShape = (height, width, depth)
        chanDim = -1  # BatchNormalization acts on the last (channel) axis

        def add_separable_conv(filters, **conv_kwargs):
            """Append SeparableConv2D(3x3, same) -> ReLU -> BatchNormalization."""
            model.add(SeparableConv2D(filters, (3, 3), padding="same", **conv_kwargs))
            model.add(Activation("relu"))
            model.add(BatchNormalization(axis=chanDim))

        # Stage 1: one 32-filter convolution; declares the input shape.
        add_separable_conv(32, input_shape = inputShape)
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(0.25))

        # Stage 2: two 64-filter convolutions.
        for _ in range(2):
            add_separable_conv(64)
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(0.25))

        # Stage 3: three 128-filter convolutions.
        for _ in range(3):
            add_separable_conv(128)
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(0.25))

        # Dense head.
        model.add(Flatten())
        model.add(Dense(256))
        model.add(Activation("relu"))
        model.add(BatchNormalization())
        model.add(Dropout(0.2))

        model.add(Dense(classes))
        model.add(Activation("softmax"))
        model.summary()

        return model

In [None]:
from tensorflow.keras.optimizers import SGD, Adam, Adagrad

# Build for the same image size the generators produce.
model = CNNNet.build(width = IMAGE_SIZE, height = IMAGE_SIZE, depth = 3, classes = 2)

# The metric is registered under the short name 'acc' so that the history keys
# are 'acc' / 'val_acc' regardless of the Keras version. The checkpoint, the
# learning-rate schedule and the plots below all refer to those names, whereas
# newer Keras versions report 'accuracy' / 'val_accuracy' when the metric is
# requested as 'accuracy'.
model.compile(optimizer = Adam(lr=0.0001), loss = 'binary_crossentropy', metrics=['acc'])

# **Model Training**

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

filepath = "saved_model.h5"

# Write to `filepath` only when the monitored validation accuracy improves.
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_acc',
    mode = 'max',
    save_best_only = True,
    verbose = 1,
)

# Multiply the learning rate by 0.5 when validation accuracy plateaus
# (patience 2), never going below 0.00001.
reduce_lr = ReduceLROnPlateau(
    monitor='val_acc',
    mode = 'max',
    factor = 0.5,
    patience = 2,
    min_lr = 0.00001,
    verbose = 1,
)

callbacks_list = [checkpoint, reduce_lr]

# Train for 8 epochs, validating on val_gen after each one.
history = model.fit_generator(
    train_gen,
    steps_per_epoch = train_steps,
    epochs = 8,
    validation_data = val_gen,
    validation_steps = val_steps,
    callbacks = callbacks_list,
    verbose = 1,
)

# **Model Evaluation**

### **Compare Training and Validation Metrics**

We can determine our epochs based on the convergence of below graphs.

In [None]:

def plot_history_metric(train_key, val_key, title, ylabel):
    """Plot one training metric and its validation counterpart against the epoch.

    Args:
        train_key: key of the training curve in ``history.history``.
        val_key: key of the validation curve in ``history.history``.
        title: figure title.
        ylabel: y-axis label.
    """
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('Epoch')
    # The second curve comes from the validation generator, not from a test set.
    plt.legend(['Train', 'Validation'], loc='best')
    plt.show()


plot_history_metric('acc', 'val_acc', 'Model accuracy', 'Accuracy')
plot_history_metric('loss', 'val_loss', 'Model loss', 'Loss')

### **Load the saved weights**

In [None]:
# Here the best epoch will be used.
model.load_weights('saved_model.h5')
val_loss, val_acc = \
model.evaluate_generator(test_gen, steps=len(df_val))
print('val_loss:', val_loss)
print('val_acc:', val_acc)

### **Validate the model**

In [None]:
# Predict on test_gen (built with batch_size=1 and shuffle=False) for len(df_val) steps.
predictions = model.predict_generator(test_gen, steps=len(df_val), verbose=1)

In [None]:
# Wrap the two prediction columns in a DataFrame with readable names.
df_preds = pd.DataFrame(predictions, columns=['no_tumor', 'has_tumor'])
df_preds.head()

In [None]:
# Class labels reported by the generator.
y_true = test_gen.classes
# The 'has_tumor' column is used as the score for the ROC analysis below.
y_pred = df_preds['has_tumor']

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve, auc
# ROC AUC of the 'has_tumor' scores against the generator's class labels.
print('ROC AUC Score = ',roc_auc_score(y_true, y_pred))

In [None]:
# Points of the ROC curve and the area under them, plotted in the next cell.
fpr_keras, tpr_keras, thresholds_keras = roc_curve(y_true, y_pred)
auc_keras = auc(fpr_keras, tpr_keras)

**Let's plot our ROC Curve**

In [None]:
plt.figure(1)
# Dashed black diagonal from (0, 0) to (1, 1) as a reference line.
plt.plot([0, 1], [0, 1], 'k--')
# ROC curve, with the area under it shown in the legend.
plt.plot(fpr_keras, tpr_keras, label='area = {:.2f}'.format(auc_keras))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()

## **Confusion Matrix**

In [None]:
from sklearn.metrics import confusion_matrix
# Predicted class = index of the larger of the two prediction columns.
y_pred_binary = predictions.argmax(axis=1)
cm = confusion_matrix(y_true, y_pred_binary)

from mlxtend.plotting import plot_confusion_matrix
# Draw the matrix with both absolute counts and normalised values.
fig, ax = plot_confusion_matrix(conf_mat=cm,
                                show_absolute=True,
                                show_normed=True,
                                colorbar=True,
                               cmap = 'Dark2')
plt.show()

## **Classification Report**

In [None]:
from sklearn.metrics import classification_report
# Generate a classification report

# target_names follow the column order used for the predictions: no_tumor, then has_tumor.
report = classification_report(y_true, y_pred_binary, target_names = ['no_tumor', 'has_tumor'])
print(report)

In [None]:
# Delete the 'base_dir' folder tree now that validation is done.
shutil.rmtree('base_dir')

# **Test Predictions**

In [None]:
# flow_from_directory reads images from sub-folders, so the test images are
# placed in a single sub-folder of test_dir.
test_dir = 'test_dir'
test_images = os.path.join(test_dir, 'test_images')

# makedirs creates test_dir as well; exist_ok keeps the cell re-runnable.
os.makedirs(test_images, exist_ok = True)

os.listdir('test_dir')

In [None]:
# Copy every file of the test set into test_dir/test_images.
test_list = os.listdir('../input/test')

for fname in test_list:
    shutil.copyfile(os.path.join('../input/test', fname),
                    os.path.join(test_images, fname))

print('Total Test Images = ',len(os.listdir('test_dir/test_images')))

In [None]:
test_path ='test_dir'
# Rebinds test_gen, which previously read the validation folder, to the real
# test images. batch_size=1 and shuffle=False as before.
test_gen = datagen.flow_from_directory(test_path,
                                        target_size=(IMAGE_SIZE,IMAGE_SIZE),
                                        batch_size=1,
                                        class_mode='categorical',
                                        shuffle=False)

In [None]:
# Take the number of test images from the generator instead of hard-coding it.
# With batch_size=1, one step per file gives one prediction per image, which
# also keeps `predictions` the same length as test_gen.filenames.
num_test_images = len(test_gen.filenames)
predictions = model.predict_generator(test_gen, steps=num_test_images, verbose=1)

In [None]:
# Sanity check: there should be one prediction row per test image.
if predictions.shape[0] == num_test_images:
    print('All Predictions Done!')
else:
    print('Error!')

In [None]:
# Rebinds df_preds, this time with the predictions for the test images.
df_preds = pd.DataFrame(predictions, columns=['no_tumor', 'has_tumor'])
df_preds.head()

In [None]:
# File names as reported by the generator, attached to the predictions as a new column.
test_filenames = test_gen.filenames

df_preds['file_names'] = test_filenames

def extract_id(x):
    """Return the image id contained in a file path.

    The id is the final path component up to its first '.', e.g.
    'test_images/abc.tif' -> 'abc'. Both '/' and '\\' are accepted as
    separators, and a bare file name with no folder part works as well.

    Args:
        x: file path as a string.

    Returns:
        The id string.
    """
    # Normalise Windows separators, then keep only the last path component.
    file_name = x.replace('\\', '/').rsplit('/', 1)[-1]
    extracted_id = file_name.split('.')[0]
    return extracted_id
# Derive the image id of every prediction row from its file name.
df_preds['id'] = df_preds['file_names'].apply(extract_id)
df_preds.head()

In [None]:
# Keep the 'has_tumor' column and the matching image ids.
y_pred = df_preds['has_tumor']
image_id = df_preds['id']