# CNN Cancer Detection

In [None]:
import numpy as np
import pandas as pd
import os
import time
import matplotlib.pyplot as plt
import cv2
import tensorflow as tf
import seaborn as sns
from PIL import Image
from tensorflow import keras
import tensorflow_io as tfio
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from tensorflow.keras.models import load_model
from sklearn.utils import resample

## Description

Create an algorithm that uses a CNN model to identify metastatic cancer in small image patches taken from larger digital pathology scans.

## EDA

I defined train_data_path with a trailing "/" so that I don't have to add it every time I build a file path in the code.

In [None]:
# Directory holding the training images; the trailing "/" allows plain string
# concatenation with a file name.
train_data_path = '../input/histopathologic-cancer-detection/train/'

In [None]:
# Number of entries in the training image directory.
len_train = len(os.listdir(train_data_path))
len_train

I didn't add the trailing "/" to the test data path because it isn't needed there.

In [None]:
# Directory holding the test images (no trailing "/").
test_data_path = '../input/histopathologic-cancer-detection/test'

In [None]:
# Number of entries in the test image directory.
len_test = len(os.listdir(test_data_path))
len_test

Checking both the train and test data, we can see that all pictures are in .TIF format, which typically means they have a higher quality than .PNG or .JPEG. .TIF (or .TIFF) files are used in the professional photography world, and for a cancer investigation the images need to be of high-end quality, which is why they are stored as .TIF.

They all have the same size: 27.94 KB.

In [None]:
# Load the training labels CSV into a DataFrame (the 'id' and 'label' columns are used below).
label_data = pd.read_csv('../input/histopathologic-cancer-detection/train_labels.csv')

In [None]:
# Display the label table.
label_data

In [None]:
# Column dtypes and non-null counts of the label table.
label_data.info()

In [None]:
# Summary statistics of the label table.
label_data.describe()

In [None]:
# Number of rows per distinct value of 'label'.
label_count = label_data['label'].value_counts()

In [None]:
# Display the per-class counts.
label_count

In [None]:
# The 'label' column as a NumPy array.
labels = label_data['label'].values

In [None]:
# Display the raw label array.
labels

0 means the image has no tumor tissue  
1 means the image has tumor tissue

In [None]:
# Pie chart of the class balance.
# The wedge labels are taken from the value_counts index instead of a hard-coded
# [0, 1], so every wedge is labelled with the class it actually represents no
# matter which class happens to be more frequent.
plt.pie(label_count, labels=label_count.index, startangle=90, autopct='%1.1f%%')
plt.title('Tumor Tissue')
plt.xlabel('0 = No Tumor, 1 = Tumor')
plt.ylabel('Propotion')

Since 40.5% of the training images contain tumor tissue and 59.5% contain no tumor tissue, this data is not considered imbalanced.

In [None]:
# Occurrences of each id; a maximum count of 1 means there are no duplicate ids.
label_data['id'].value_counts()

There are no duplicate ids in the label data.

In [None]:
# Share of all image files (train + test) that are in the train directory.
len_train/(len_test+len_train)

In [None]:
# Share of all image files (train + test) that are in the test directory.
len_test/(len_test+len_train)

The training data is 79.3% of the total data and the test data is 20.7%. It is recommended to have validation data of approximately the same size as the test data, so I will split the training data into 60% and 19.3% of the total, leaving the training set with at least 60% of the total data.

In [None]:
# Target validation-set size: 19.3% of the combined train + test file count.
len_val = (len_test+len_train)*0.193
len_val

In [None]:
# Fraction of the training files that must be held out to reach len_val.
split_rate = len_val/len_train
split_rate

In [None]:
# Look at a single id (index label 10) to see what the values look like.
label_data['id'][10]

Showing a few of the images and labels from the training data

In [None]:
# Show the first 10 training patches, one figure each, titled with the label.
for i in range(10):
    image_id = label_data['id'][i]
    image_path = train_data_path + image_id + '.tif'
    im = cv2.imread(image_path)
    # cv2.imread signals an unreadable or missing file by returning None
    # instead of raising, so fail here with a clear message.
    if im is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    # OpenCV loads channels as BGR while matplotlib expects RGB; without this
    # conversion the red and blue channels are swapped in the preview.
    im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
    f, ax = plt.subplots()
    plt.title(label_data['label'][i])
    # No cmap here: a colormap is ignored for 3-channel image data.
    ax.imshow(im, resample=True)

As explained above, the data is not considered imbalanced, so I will use the whole train data and divide it into training and validation data.

## Model Architecture

I've used the CNN architecture presented by @fmarazzi in this kernel:
https://www.kaggle.com/fmarazzi/baseline-keras-cnn-roc-fast-5min-0-8253-lb

The architecture is simple: the model uses ReLU as the hidden activation function and a sigmoid for the output because it is a binary (single-output) classification model.

In [None]:
# Hyper-parameters shared by every convolutional stage.
kernel_size = (3, 3)
pool_size = (2, 2)
first_filters = 32
second_filters = 64
third_filters = 128

dropout_conv = 0.3
dropout_dense = 0.3

# Three stages of (3 x Conv2D -> MaxPooling2D -> Dropout), followed by a dense
# head that ends in a single sigmoid unit.
model = Sequential()

# Stage 1: the very first layer also declares the 96x96 RGB input shape.
model.add(Conv2D(first_filters, kernel_size, activation='relu', input_shape=(96, 96, 3)))
for _ in range(2):
    model.add(Conv2D(first_filters, kernel_size, activation='relu'))
model.add(MaxPooling2D(pool_size=pool_size))
model.add(Dropout(dropout_conv))

# Stages 2 and 3 have the same layout and differ only in the filter count.
for stage_filters in (second_filters, third_filters):
    for _ in range(3):
        model.add(Conv2D(stage_filters, kernel_size, activation='relu'))
    model.add(MaxPooling2D(pool_size=pool_size))
    model.add(Dropout(dropout_conv))

# Classification head.
model.add(Flatten())
model.add(Dense(512, activation="relu"))
model.add(Dropout(dropout_dense))
model.add(Dense(1, activation="sigmoid"))

model.summary()

In [None]:
# Adam is also imported at the top of the notebook; the import is repeated here.
from tensorflow.keras.optimizers import Adam

# Compile with Adam (learning rate 1e-4) and binary cross-entropy, tracking accuracy.
optimizer = Adam(learning_rate=0.0001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# The labels are later passed as y_col with class_mode='binary'; Keras expects
# string class values there, hence the cast.
label_data['label'] = label_data['label'].astype(str)  # Convert label column to string

My computer couldn't handle running the whole train data, or even a modest part of it, so I used only the first 1000 rows to train my model, which is not ideal with a batch size of 128. The number of epochs was set to 5 to speed up the run; by epoch 2 my model is already tending to overfit because of the small amount of training data I had to use.

In [None]:
# Add the .tif extension to the 'id' column for correct file referencing.
# The endswith guard makes the cell safe to re-run: without it every run would
# append another ".tif" and the generators would no longer find the files.
label_data['id'] = label_data['id'].apply(
    lambda x: x if str(x).endswith('.tif') else f"{x}.tif"
)

# Preparing data generators
train_datagen = ImageDataGenerator(rescale=1./255, validation_split=split_rate)  # Normalize images

batch_size = 128

# Only the first 1000 labelled rows are used; both generators read the same
# frame and are separated by the generator's validation_split.
flow_kwargs = dict(
    dataframe=label_data.head(1000),
    directory=train_data_path,
    x_col='id',
    y_col='label',
    target_size=(96, 96),
    class_mode='binary',
    batch_size=batch_size,
)

train_gen = train_datagen.flow_from_dataframe(subset='training', **flow_kwargs)
val_gen = train_datagen.flow_from_dataframe(subset='validation', **flow_kwargs)

# Steps per epoch are taken from the generators themselves. The previous
# `1000*rate // batch_size` produced floats, silently dropped the final partial
# batch, and evaluated to 0 whenever a subset was smaller than one batch.
train_steps = len(train_gen)
val_steps = len(val_gen)

# Training the model
history = model.fit(
    train_gen,
    steps_per_epoch=train_steps,
    validation_data=val_gen,
    validation_steps=val_steps,
    epochs=5
)

In [None]:
# List the test directory exactly once so that ids and paths are derived from
# the same listing and are guaranteed to line up row by row.
test_files = os.listdir(test_data_path)

# splitext removes whatever extension the file has, rather than assuming it is
# exactly four characters long.
test_ids = [os.path.splitext(filename)[0] for filename in test_files]
test_filenames = [os.path.join(test_data_path, filename) for filename in test_files]

test_df = pd.DataFrame({"id": test_ids, "filename": test_filenames})

In [None]:
# Inference generator over the test images.
# - x_col holds full file paths, so no `directory` argument is passed.
# - class_mode=None / y_col=None: batches contain images only, no labels.
# - shuffle=False keeps the batches in the row order of test_df.
# - train_datagen is reused, so the same 1/255 rescaling is applied.
test_generator = train_datagen.flow_from_dataframe(
    dataframe=test_df,
    x_col="filename",
    y_col=None,
    target_size=(96, 96),
    color_mode="rgb",
    batch_size=batch_size,
    shuffle=False,
    class_mode=None,
    validate_filenames=False
)

In [None]:
# Run the model over the test generator and round the sigmoid outputs to hard
# 0/1 labels, then pair each label with its image id.
# NOTE(review): if the submission is scored on ranking/probabilities (e.g. ROC
# AUC), submitting test_probs directly may score better - confirm.
test_probs = model.predict(test_generator)
test_labels = np.round(test_probs).astype(int).flatten()
out_df = pd.DataFrame({"id": test_ids, "label": test_labels})

In [None]:
# Display the submission table.
out_df

In [None]:
# Write the submission file without the DataFrame index column.
out_df.to_csv('/kaggle/working/submission.csv', index=False)

In [None]:
# Read the file back to check what was written.
pd.read_csv('/kaggle/working/submission.csv')

## Conclusion

My model predicts 50% of the results; this low value is due to the points below:
 - My computer couldn't handle a dataset of images like this; it was too big for my processing power.
 - The CNN architecture I chose is slower than other ones I saw around.
 - I used only 1000 pictures.
 - Those 1000 pictures were simply the head (first 1000 rows) of the label data.

My modeling proposal was to split the data so that training had 60%, validation 20% and test 20%. Train data + test data = 100% and the test data was already 20%, so I decided to split the train data into training and validation data to make that happen.
If I had a better computer with GPUs to support running it, my model would have been better trained and my results would have been much better.