In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Introduction
The objective of this study is to use a convolutional neural network (CNN) to build a cancer classifier. Model fitting and hyperparameter tuning are performed to maximize the accuracy of the cancer classification.

## Overview
The study will follow the structure below:
1. Review the contents and structure of the data. Clean and prepare the data for analysis (drop missing values, check the number of features, the length and types of the data, etc.), and perform EDA.
2. Generate data, choose a model, and fit it
3. Hyperparameter tuning to get the best results
4. Conclusion (discussion of results, areas of improvement)

### 1. Data Inspection and EDA

In [112]:
import pandas as pd
import numpy as np
import seaborn as sns
import tensorflow as tf
import itertools
import shutil
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from keras.layers import Conv2D, Dense, Dropout, Flatten, MaxPool2D
from keras.models import Sequential
from keras.preprocessing.image import ImageDataGenerator
from keras.models import load_model
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import RMSprop, Adam, SGD, Adadelta
from tensorflow.keras.callbacks import ReduceLROnPlateau


from numpy.random import seed
from glob import glob
from matplotlib import pyplot as plt
from PIL import Image
from sklearn.model_selection import train_test_split
import random
import cv2
import os

# Seed every random number generator the notebook relies on. Seeding NumPy
# alone leaves Python's `random` module and TensorFlow (weight initialisation,
# dropout, shuffling) unseeded, so runs would not be repeatable.
SEED = 1234
seed(SEED)                # NumPy global RNG (numpy.random.seed)
random.seed(SEED)         # Python built-in RNG
tf.random.set_seed(SEED)  # TensorFlow / Keras global RNG


In [5]:
#Load data
# Training labels: one row per training image.
X_train = pd.read_csv('/kaggle/input/histopathologic-cancer-detection/train_labels.csv')
# The original cell read train_labels.csv a second time here, which made
# X_test a copy of the training labels. The competition's test ids are
# listed in sample_submission.csv.
# NOTE(review): confirm the file name against the /kaggle/input listing printed above.
X_test = pd.read_csv('/kaggle/input/histopathologic-cancer-detection/sample_submission.csv')

In [6]:
# Preview the first rows of the training label table.
X_train.head()

In [7]:
# Summary statistics of the training label table.
X_train.describe()

In [8]:
# Column dtypes and non-null counts, used to check for missing values.
X_train.info()

The data has 220,025 entries with no missing values. There are two columns in the dataset: 'id', which identifies the tissue image, and 'label', which indicates whether the tissue is cancerous or not. Here, 0 is non-cancerous and 1 is cancerous.

In [9]:
# Number of samples per class label.
X_train.label.value_counts()

In [19]:
# Bar chart of the number of samples for each label value.
sns.countplot(x = 'label', data = X_train)

The label distribution is skewed slightly towards 0 (non-cancerous cells). This is helpful to keep in mind when viewing our final results. Because `flow_from_dataframe` with `class_mode = 'binary'` expects the class labels to be strings, we convert the labels to string values.

In [28]:
# Cast the label column to strings, then print the Python type of the entry
# stored under index label 0 to confirm the conversion.
X_train['label'] = X_train['label'].astype(str)
first_label = X_train['label'][0]
print(type(first_label))

In [33]:
#View a sample image
# os.listdir returns entries in arbitrary order, so "[0]" could be a different
# file on every run. Taking the lexicographically smallest name makes the
# displayed sample deterministic.
sample_dir = '../input/histopathologic-cancer-detection/train'
sample_name = min(os.listdir(sample_dir))
Image.open(os.path.join(sample_dir, sample_name))

This gives us a general idea of what the image looks like as our classification is performed.

In [41]:
#Split data to training and validation sets
# Stratify on the label so both subsets keep the class ratio of the full
# table (the labels are imbalanced). `lab` was previously computed but unused.
lab = X_train['label']
x_train, x_val = train_test_split(X_train, test_size = 0.2, random_state = 124, stratify = lab)
# Work on independent copies so later column assignments on the subsets do
# not trigger pandas' SettingWithCopyWarning.
x_train, x_val = x_train.copy(), x_val.copy()

In [42]:
# Report the row counts and shapes of both subsets, followed by the class
# distribution of the training subset.
n_train, n_val = len(x_train), len(x_val)
print(n_train, n_val)
print(x_train.shape, x_val.shape)
train_label_counts = x_train['label'].value_counts()
print(train_label_counts)

## 2. Generate data, model, and fit

In [55]:
#Apply .tif extension
def _with_tif_extension(ids):
    """Return `ids` with '.tif' appended to every entry that does not already end with it.

    Skipping entries that already carry the suffix makes the cell safe to
    re-run: the original version produced '<id>.tif.tif' on a second execution.
    """
    return ids.where(ids.str.endswith('.tif'), ids + '.tif')

# `assign` builds new frames instead of writing into the slices returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning.
x_train = x_train.assign(id = _with_tif_extension(x_train['id']))
x_val = x_val.assign(id = _with_tif_extension(x_val['id']))

In [57]:
# Check that the training ids now carry the .tif extension.
x_train.head()

In [58]:
# Check that the validation ids now carry the .tif extension.
x_val.head()

In [66]:
#Generate Data
# Image generator that rescales pixel values by 1/255 and applies random
# flips, rotations of up to 10 degrees, and 10% zoom and shifts.
augmentation_options = dict(
    horizontal_flip = True,
    vertical_flip = True,
    rotation_range = 10,
    zoom_range = 0.1,
    width_shift_range = 0.1,
    height_shift_range = 0.1,
)
gen_dat = ImageDataGenerator(rescale = 1 / 255, **augmentation_options)

In [68]:
# Train
# Settings shared by the training and validation iterators: 96x96 images read
# from the training folder in batches of 32, with binary labels.
flow_options = dict(
    directory = "../input/histopathologic-cancer-detection/train",
    x_col = 'id',
    y_col = 'label',
    batch_size = 32,
    seed = 124,
    class_mode = 'binary',
    target_size = (96, 96),
)

# Training batches must be shuffled. With shuffle = False every epoch presents
# the samples in the same order, which hurts SGD-style optimisation.
gen_train = gen_dat.flow_from_dataframe(dataframe = x_train, shuffle = True, **flow_options)

# Validation images are only rescaled, never randomly augmented, so the
# validation metrics are deterministic and comparable between epochs.
# The order is kept fixed (shuffle = False).
gen_val = ImageDataGenerator(rescale = 1 / 255).flow_from_dataframe(dataframe = x_val, shuffle = False, **flow_options)

All image filenames were validated, i.e. every file listed in the dataframes was found on disk. Next, we define an early-stopping callback and a learning-rate reduction schedule.

In [74]:

# Define early stopping and allow for extraction of best model
class myCallback(tf.keras.callbacks.Callback):
    """Stop training as soon as the validation accuracy exceeds 0.98."""

    def on_epoch_end(self, epoch, logs = None):
        """Keras hook invoked after every epoch.

        The original method was named `endepoch`, which Keras never calls,
        so the early-stopping rule was silently inactive.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metric dictionary for the epoch; may be None or lack
                'val_accuracy' (e.g. when no validation data is given).
        """
        val_accuracy = (logs or {}).get('val_accuracy')
        # Guard against a missing metric: comparing None with a float raises TypeError.
        if val_accuracy is not None and val_accuracy > 0.98:
            self.model.stop_training = True

    # Keep the old method name available for any code that called it directly.
    endepoch = on_epoch_end
c = myCallback()

# Halve the learning rate whenever the validation accuracy has not improved
# for one epoch.
# The original lower bound `1**(-10)` evaluates to 1.0 (one raised to any power
# is one). That is far above the starting learning rate, so the rate could
# never be reduced. The intended bound is 1e-10.
lr_reduction = ReduceLROnPlateau(monitor='val_accuracy',
                                 patience=1,
                                 verbose=1,
                                 factor=0.5,
                                 min_lr=1e-10)

In [75]:
# First trial network: flatten each 96x96x3 image and feed it to a single
# 10-unit dense layer.
flatten_layer = tf.keras.layers.Flatten(input_shape = (96, 96, 3))
dense_layer = tf.keras.layers.Dense(10)
model = tf.keras.models.Sequential([flatten_layer, dense_layer])

In [76]:
# Print the layer-by-layer summary, including parameter counts.
model.summary()

The number of trainable parameters doesn't seem right. We'll add convolutional layers, as CNN theory suggests, and see what we get.

In [81]:
# CNN classifier: four Conv -> MaxPool -> BatchNorm stages with 32, 64, 128
# and 256 filters, followed by a dropout-regularised dense head ending in a
# single sigmoid unit.
model = tf.keras.models.Sequential()

for stage, n_filters in enumerate((32, 64, 128, 256)):
    conv_options = dict(strides=1, padding='Same', activation='relu')
    if stage == 0:
        # Only the first layer declares the input image shape.
        conv_options['input_shape'] = (96, 96, 3)
    model.add(Conv2D(n_filters, (3, 3), **conv_options))
    model.add(MaxPool2D(2, 2))
    model.add(BatchNormalization())

# Classification head.
model.add(Flatten())
for n_units, activation in ((512, "relu"), (1, "sigmoid")):
    model.add(Dropout(0.2))
    model.add(Dense(n_units, activation = activation))


In [82]:
# Adam optimiser with an explicit learning rate and both moment decay rates
# set to 0.8. The model is compiled for binary classification and tracks accuracy.
adam_settings = {'learning_rate': 0.0025, 'beta_1': 0.8, 'beta_2': 0.8}
opt = Adam(**adam_settings)
model.compile(
    loss = "binary_crossentropy",
    optimizer = opt,
    metrics = ["accuracy"],
)

In [None]:
# Print the layer-by-layer summary of the CNN, including parameter counts.
model.summary()

Now the number of trainable parameters is much more intuitive. We are ready for fitting our model.

In [86]:
# Save the weights of the epoch with the highest validation accuracy.
checkpoint = ModelCheckpoint('Generated Models.h5', monitor='val_accuracy', verbose=1,
                             save_best_only=True, mode='max')

# `Model.fit` accepts data generators directly. `fit_generator` is deprecated
# and has been removed from recent Keras/TensorFlow releases.
hist = model.fit(gen_train, validation_data = gen_val, epochs = 25, verbose = 1,
                 callbacks = [c, lr_reduction, checkpoint])

## Evaluation / Assessment
The run took a lot longer than expected. Now that it's finally complete, it is ready for assessment. First, we will observe the first few lines and get a general sense of the values. Then we will plot metrics of both the accuracy and loss to evaluate results. As the final step, we will compute confusion matrix and display the performance metric.

In [87]:
# Turn the per-epoch metrics recorded during training into a DataFrame.
# The cell rebinds `hist`, so guard the conversion: on a re-run `hist` is
# already a DataFrame and has no `.history` attribute.
if not isinstance(hist, pd.DataFrame):
    hist = pd.DataFrame(hist.history)
hist.head(5)


In [89]:
# Create plots of training and validation accuracy/loss on top of each other to compare.
x_acc = hist['accuracy']
x_loss = hist['loss']
val_acc = hist['val_accuracy']
val_loss = hist['val_loss']

# Epochs are numbered from 1 on the x axis.
xax = range(1, len(x_acc) + 1, 1)

# One figure per metric, drawn with the explicit Axes interface and with
# labelled axes so each plot can be read on its own.
fig_acc, ax_acc = plt.subplots()
ax_acc.plot(xax, x_acc, 'r', label = 'Training')
ax_acc.plot(xax, val_acc, 'b', label = 'Validation')
ax_acc.set_title('Accuracy Comparison: Training vs Validation')
ax_acc.set_xlabel('Epoch')
ax_acc.set_ylabel('Accuracy')
ax_acc.legend()

fig_loss, ax_loss = plt.subplots()
ax_loss.plot(xax, x_loss, 'r', label = 'Training')
ax_loss.plot(xax, val_loss, 'b', label = 'Validation')
ax_loss.set_title('Loss Comparison: Training vs Validation')
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')
ax_loss.legend()

plt.show()

In [94]:
# Reload the checkpoint with the best validation accuracy.
m = load_model('./Generated Models.h5')

# Evaluate on validation images that are only rescaled. Reusing the augmenting
# generator would randomly flip/rotate/shift the images and make the reported
# metrics noisy and non-repeatable. The order is fixed (shuffle = False) so the
# predictions line up with `x_val2.classes`.
x_val2 = ImageDataGenerator(rescale = 1 / 255).flow_from_dataframe(dataframe = x_val, directory = "../input/histopathologic-cancer-detection/train", x_col = 'id', y_col = 'label', batch_size = 32, seed = 124, shuffle = False, class_mode = 'binary', target_size = (96, 96))

# `steps` counts batches, not samples. len(x_val2) is the number of batches
# needed to see every validation image exactly once; the original passed
# len(x_val), the number of samples.
n_steps = len(x_val2)
m.evaluate(x_val2, steps = n_steps, verbose = 1)

# Restart the iterator so prediction begins at the first validation sample.
x_val2.reset()
yhat_p = m.predict(x_val2, steps = n_steps, verbose = 1)
# Threshold the sigmoid outputs at 0.5 and flatten to a 1-D integer label vector.
yhat = np.round(yhat_p).astype(int).ravel()
perf_metrics = classification_report(x_val2.classes, yhat)
print("Displaying performance metrics... :\n", perf_metrics)

Accuracy is above 0.90 which I think is pretty good, but it can be improved.  
Areas for improvement will be discussed in the conclusion.

In [100]:
#Confusion Matrix
fig, ax = plt.subplots(figsize = (10, 10))
# Rows are the true classes, columns the predicted classes.
conf_mat = confusion_matrix(x_val2.classes, yhat)
# fmt = 'd' prints the cell counts as plain integers. The default format
# renders large counts in scientific notation (e.g. 2.5e+04).
sns.heatmap(conf_mat, annot = True, fmt = 'd', linewidths = 0.01, cmap = "Blues", linecolor = 'black', ax = ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Validation Set Confusion Matrix")
plt.show()

## Evaluation / Conclusion
The accuracy came out slightly higher than 90%, which is a good starting point. However, the class labels were not balanced before the data was fed to the model: there are more non-cancerous than cancerous samples. This imbalance can bias the predicted probabilities towards the majority class and contribute to over- or underfitting, leading to slightly lower accuracy.