# [Histopathologic Cancer Detection Kaggle Competition](https://www.kaggle.com/c/histopathologic-cancer-detection) Solution by:
Gabriel Mendonça - mendonca.gabriel8@gmail.com

Lucas Ortega Venzel - venzellucas@gmail.com

inspired by: https://www.kaggle.com/soumya044/histopathologic-cancer-detection

## Preparing environment

In [None]:
import os
import numpy as np
import pandas as pd
import zipfile
import tqdm
import matplotlib
import google
import sklearn
import tensorflow as tf

First, we need to set up the Kaggle API. Download your API token and upload the file, keeping its default name (`kaggle.json`), to the root folder of the project.

*Follow this [link](https://www.kaggle.com/general/51898) for more detailed instructions on how to get your token.*

In [None]:
# Import the submodule explicitly: a bare `import google` does not by itself
# load `google.colab`, so attribute access would only work if something else
# in the process had already imported it.
import google.colab.files

# Upload the Kaggle API token (kaggle.json) into the current working directory.
uploaded = google.colab.files.upload()

Saving kaggle.json to kaggle.json


In [None]:
# Move the token to ~/.kaggle/, where the Kaggle API will search for it, and
# restrict the file permissions to the owner (chmod 600).
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

In [None]:
# force upgrade kaggle API (google colab default version is outdated)
!pip install --upgrade --force-reinstall --no-deps kaggle -q
# download the competition dataset
!kaggle competitions download -c histopathologic-cancer-detection
# extract the zip file
errors = []
with zipfile.ZipFile('histopathologic-cancer-detection.zip') as zf:
  for member in tqdm.tqdm(zf.infolist(), desc='Extracting '):
    try:
      zf.extract(member)
    except zipfile.error as e:
      error.append(member)
print('Extraction finished with ' + str(len(errors)) + ' errors.')

[?25l[K     |█████▋                          | 10kB 22.0MB/s eta 0:00:01[K     |███████████▏                    | 20kB 19.1MB/s eta 0:00:01[K     |████████████████▊               | 30kB 16.0MB/s eta 0:00:01[K     |██████████████████████▎         | 40kB 14.8MB/s eta 0:00:01[K     |███████████████████████████▉    | 51kB 7.2MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 4.9MB/s 
[?25h  Building wheel for kaggle (setup.py) ... [?25l[?25hdone
Downloading histopathologic-cancer-detection.zip to /content
100% 6.30G/6.31G [02:18<00:00, 41.8MB/s]
100% 6.31G/6.31G [02:18<00:00, 48.7MB/s]


Extracting : 100%|██████████| 277485/277485 [03:31<00:00, 1312.34it/s]

Extraction finished with 0 errors.





## Creating Generators from Dataset

In [None]:
# Build the label table consumed by the image generators:
#   * one image id is excluded (presumably a problematic image - TODO confirm);
#   * '.tif' is appended so that `id` holds the file names inside ./train;
#   * labels are converted to strings.
df = (
    pd.read_csv('train_labels.csv')
    .loc[lambda tabela: tabela['id'] != '9369c7278ec8bcc6c880d99194de09fc2bd4efbe']
    .assign(
        id=lambda tabela: tabela['id'] + '.tif',
        label=lambda tabela: tabela['label'].map(str),
    )
)

In [None]:
# Per-image normalisation applied to every batch produced by this generator.
opcoes_normalizacao = dict(
    rescale=1/255,
    samplewise_std_normalization=True,
)

# Random augmentations: rotations of up to 90 degrees plus flips on both axes.
opcoes_aumento = dict(
    rotation_range=90,
    horizontal_flip=True,
    vertical_flip=True,
)

# 30% of the rows are reserved for the 'validation' subset.
ger = tf.keras.preprocessing.image.ImageDataGenerator(
    validation_split=0.3,
    **opcoes_normalizacao,
    **opcoes_aumento,
)



In [None]:
# Arguments shared by the training and validation iterators.
opcoes_fluxo = dict(
    directory='./train',
    x_col="id",
    y_col="label",
    target_size=(96, 96),
    class_mode='categorical',
    batch_size=16,
    shuffle=True,
    seed=0,
)

# Training batches come from `ger`, i.e. with random rotations and flips.
gerador_treino = ger.flow_from_dataframe(df, subset='training', **opcoes_fluxo)

# Validation must not be augmented, otherwise val_auc / val_loss are measured
# on randomly transformed images. This generator applies only the
# normalisation used by `ger` (keep both in sync) and uses the same
# validation_split, so it selects the same rows of `df` that `ger` holds out.
ger_valid = tf.keras.preprocessing.image.ImageDataGenerator(
    samplewise_std_normalization=True,
    rescale=1/255,
    validation_split=0.3,
)
gerador_valid = ger_valid.flow_from_dataframe(df, subset='validation', **opcoes_fluxo)

Found 154017 validated image filenames belonging to 2 classes.
Found 66007 validated image filenames belonging to 2 classes.


## Defining Network Topology

In [None]:
# Hyper-parameters of the convolutional network.
tamanho_kernel = (3, 3)
tamanho_pool = (2, 2)
filtros1 = 32
filtros2 = 64
filtros3 = 128

dropout_conv = 0.30
dropout_dense = 0.30

camadas = tf.keras.layers
model = tf.keras.models.Sequential()

# Three convolutional stages; each one stacks three ReLU convolutions with the
# same number of filters, followed by max-pooling and dropout.
for indice_estagio, n_filtros in enumerate((filtros1, filtros2, filtros3)):
    for indice_conv in range(3):
        # Only the very first layer of the network declares the input shape.
        primeira_camada = indice_estagio == 0 and indice_conv == 0
        extras = {'input_shape': (96, 96, 3)} if primeira_camada else {}
        model.add(camadas.Conv2D(n_filtros, tamanho_kernel, activation='relu', **extras))
    model.add(camadas.MaxPooling2D(pool_size=tamanho_pool))
    model.add(camadas.Dropout(dropout_conv))

# Classifier head: one hidden dense layer and a 2-way softmax output.
model.add(camadas.Flatten())
model.add(camadas.Dense(256, activation="relu"))
model.add(camadas.Dropout(dropout_dense))
model.add(camadas.Dense(2, activation="softmax"))

In [None]:
# The AUC metric is given an explicit name so that the logged key is always
# 'auc' / 'val_auc'. With the string shortcut 'AUC' the name is auto-generated
# and can become 'auc_1', 'auc_2', ... when this cell is re-run in the same
# session, which would break the callbacks that monitor 'val_auc'.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.AUC(name='auc')],
)

## Fitting Model

In [None]:
caminho_modelo = "model.h5"

# Save the model to disk only when the validation AUC improves.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=caminho_modelo,
    monitor='val_auc',
    mode='max',
    save_best_only=True,
    verbose=1,
)

# Divide the learning rate by 10 after 2 epochs without a better validation AUC.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_auc',
    mode='max',
    factor=0.1,
    patience=2,
    min_lr=1e-9,
    verbose=1,
)

# Stop training after 4 epochs without improvement of the validation loss.
earlystop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=4)

# Number of full batches per epoch (a trailing partial batch is not counted).
passos_treino = gerador_treino.n // gerador_treino.batch_size
passos_valid = gerador_valid.n // gerador_valid.batch_size

historico = model.fit(
    gerador_treino,
    steps_per_epoch=passos_treino,
    validation_data=gerador_valid,
    validation_steps=passos_valid,
    epochs=15,
    verbose=1,
    callbacks=[checkpoint, reduce_lr, earlystop],
)


Epoch 00001: val_auc improved from -inf to 0.92048, saving model to model.h5


In [None]:
# Inference must be deterministic, so the test images are NOT read through
# the augmenting generator `ger` (random rotations / flips). This dedicated
# generator applies only the normalisation used by `ger` (keep both in sync).
ger_test = tf.keras.preprocessing.image.ImageDataGenerator(
    samplewise_std_normalization=True,
    rescale=1/255,
)

# shuffle=False keeps the batches in the same order as `gerador_test.filenames`,
# which is what the submission table relies on.
gerador_test = ger_test.flow_from_directory(
    directory='.',
    classes=['test'],
    target_size=(96, 96),
    class_mode='categorical',
    shuffle=False
)

Found 57458 images belonging to 1 classes.


In [None]:
# `Sequential.predict_proba` is deprecated and no longer available in recent
# TensorFlow releases; `predict` returns the same softmax outputs.
# NOTE(review): this uses the in-memory model as left by `fit`, not the best
# checkpoint written to disk - confirm that this is intended.
predictions = model.predict(gerador_test, verbose=1)





In [None]:
# Start the submission table with one row per test file, in generator order.
submission = pd.DataFrame({'id': gerador_test.filenames})

In [None]:
# The submitted score is the second column of the model output.
submission['label'] = predictions[:, 1]

In [None]:
# Reduce each file name to its bare image id by dropping the directory prefix
# and the file extension. Unlike a fixed slice ([5:-4]), this is idempotent:
# re-running the cell does not truncate ids that were already cleaned.
submission['id'] = submission['id'].map(
    lambda caminho: os.path.splitext(os.path.basename(caminho))[0]
)

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv', index=False)

In [None]:
# Upload submission.csv to the competition with the message "just_testing".
!kaggle competitions submit -c histopathologic-cancer-detection -f submission.csv -m just_testing

100% 2.87M/2.87M [00:02<00:00, 1.13MB/s]
Successfully submitted to Histopathologic Cancer Detection