# RSNA Screening Breast Cancer Detection
## Authors
- Jan Duinkerken Rodríguez
- René Kayr
- Yichen Huang

## Imports
We use numpy and pandas to preprocess the data; tensorflow and keras to create, train and test the model; and matplotlib to plot our metrics and show the images.

Upgrade tensorflow and pin cuDNN to remove the `Cleanup Called` warning spam during training.

In [None]:
!pip install tensorflow --upgrade --quiet
!yes | apt install --allow-change-held-packages libcudnn8=8.1.0.77-1+cuda11.2

In [None]:
!pip install -q -U keras-tuner pydicom pylibjpeg dicomsdl

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import (Conv2D, 
                                     MaxPooling2D, 
                                     BatchNormalization, 
                                     Dense, 
                                     Dropout,
                                     GlobalMaxPooling2D)
from tensorflow.keras.layers.experimental.preprocessing import RandomRotation
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import os
import copy
import pydicom
import dicomsdl
import seaborn as sns
from skimage import transform
from enum import Enum, auto
from tqdm import tqdm
import shutil
from PIL import Image
import keras_tuner as kt
import pylibjpeg
import cv2

## Distribution Strategy

In [None]:
# Pick the distribution strategy used for every model built below.
# A ValueError while resolving/connecting to a TPU is treated as "no TPU
# available", in which case TensorFlow's default strategy is used instead.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver().connect()
    # tf.distribute.experimental.TPUStrategy is deprecated; the stable
    # tf.distribute.TPUStrategy is its replacement.
    strategy = tf.distribute.TPUStrategy(tpu)
except ValueError:
    strategy = tf.distribute.get_strategy() # for CPU and single GPU
print('Number of replicas:', strategy.num_replicas_in_sync)

## Loading the dataset

In [None]:
# Root folders of the Kaggle inputs: the competition data and the
# pre-converted PNG version of the images.
PATH = "/kaggle/input/rsna-breast-cancer-detection"
# Absolute path (leading slash), consistent with PATH and with the PNG
# location used when the images are copied further down.
PNG_PATH = "/kaggle/input/rsna-breast-cancer-512-pngs"

# Load the labels
train_df = pd.read_csv(os.path.join(PATH, 'train.csv'))
test_df = pd.read_csv(os.path.join(PATH, 'test.csv'))

# Define path to the image folders
# TRAIN_DIR = os.path.join(PATH,"train_images")
# TEST_DIR = os.path.join(PATH,"test_images")

## Inspect loaded CSVs

In [None]:
# Summarise the training metadata: overall shape first, then the number of
# distinct values in each identifying column.
print("-- Training data --")
train_rows, train_cols = train_df.shape
print("-> Train set shape           = {:,d} rows x {:d} columns".format(train_rows, train_cols))
for template, column in [
    ("-> Number of unique images   = {:,d}", "image_id"),
    ("-> Number of unique patients = {:,d}", "patient_id"),
    ("-> Number of unique views    = {:d}", "view"),
    ("-> Number of unique sites    = {:d}", "site_id"),
    ("-> Number of unique machines = {:,d}", "machine_id"),
]:
    print(template.format(train_df[column].nunique()))
# Last expression of the cell, so the notebook renders the DataFrame.
train_df

In [None]:
# Summarise the test metadata: overall shape first, then the number of
# distinct values in each identifying column.
print("-- Test data --")
test_rows, test_cols = test_df.shape
print("-> Test  set shape           = {:,d} rows x {:d} columns".format(test_rows, test_cols))
for template, column in [
    ("-> Number of unique images   = {:,d}", "image_id"),
    ("-> Number of unique patients = {:,d}", "patient_id"),
    ("-> Number of unique views    = {:d}", "view"),
    ("-> Number of unique sites    = {:d}", "site_id"),
    ("-> Number of unique machines = {:,d}", "machine_id"),
]:
    print(template.format(test_df[column].nunique()))
# Last expression of the cell, so the notebook renders the DataFrame.
test_df

In [None]:
# Show every image of the first patient that has a cancer-positive row,
# side by side, each annotated with its view and its cancer label.
positive_rows = train_df[train_df.cancer == 1]
patient_id = positive_rows.iloc[0].patient_id

one_patient_df = train_df[train_df.patient_id == patient_id]

# Placeholders: split ('train'/'test'), patient id, image id.
images_dir = '/kaggle/input/rsna-breast-cancer-detection/{}_images/{}/{}.dcm'

n_rows = len(one_patient_df)

# One 5x5 inch panel per image, all on a single row.
plt.figure(figsize=(5 * n_rows, 5))
for panel, (_, row) in enumerate(one_patient_df.iterrows(), start=1):
    plt.subplot(1, n_rows, panel)

    dicom_path = images_dir.format('train', row.patient_id, row.image_id)
    img_arr = dicomsdl.open(dicom_path).pixelData()
    plt.imshow(img_arr, cmap = plt.cm.bone)
    # Labels are drawn at fixed pixel coordinates of the image.
    plt.text(200, 300, row['view'], fontsize = 14, bbox={'facecolor': 'white', 'pad' : 5})
    plt.text(200, 700, row['cancer'], fontsize = 14, bbox={'facecolor': 'white', 'pad' : 5})


In [None]:
# Number of images per breast side, coloured by cancer label
# (dodge=False keeps both hue levels at the same x position).
plt.figure(figsize=(5, 8))
sns.countplot(
    data=train_df,
    x="laterality",
    hue="cancer",
    dodge=False,
)

In [None]:
# Number of images per view, coloured by cancer label
# (dodge=False keeps both hue levels at the same x position).
plt.figure(figsize=(10, 10))
sns.countplot(
    data=train_df,
    x="view",
    hue="cancer",
    dodge=False,
)

In [None]:
# Histogram of the age column over all training rows.
train_df["age"].hist()

In [None]:
# Age distribution in 12 bins, with the two cancer labels drawn as
# side-by-side bars; the trailing semicolon suppresses the cell's text output.
sns.histplot(
    data=train_df,
    x="age",
    hue="cancer",
    bins=12,
    multiple="dodge",
    shrink=.8,
);

In [None]:
# Box plot of age, one box per cancer label.
train_df.boxplot(by="cancer", column="age")

In [None]:
# How many images exist for each view.
train_df["view"].value_counts()

## Load parts of the dataset to disk

In [None]:
# In addition to the previous inspection, get the actual values
train_subset_no_cancer = train_df[train_df.cancer == 0]
train_subset_cancer = train_df[train_df.cancer == 1]
print(train_subset_no_cancer.shape, train_subset_cancer.shape)
print(train_subset_no_cancer.laterality.value_counts())
print(train_subset_cancer.laterality.value_counts())

In [None]:
# Because they are not numerous, take the cancerous pictures from Left and Right respectively
train_subset_no_cancer_L = train_subset_no_cancer[train_subset_no_cancer.laterality == "L"].iloc[:588,]
train_subset_no_cancer_R = train_subset_no_cancer[train_subset_no_cancer.laterality == "R"].iloc[:570,]
train_subset_main = pd.concat([train_subset_no_cancer_L, train_subset_no_cancer_R, train_subset_cancer])
train_subset_main

In [None]:
!rm -rf /kaggle/working/input_transformed

In [None]:
# Create one output folder per class label ("0" and "1").
# makedirs(..., exist_ok=True) also creates the parent folder and does not
# fail when the cell is re-run while the folders already exist.
for label_dir in ("0", "1"):
    os.makedirs(os.path.join("/kaggle/working/input_transformed", label_dir), exist_ok=True)

In [None]:
# Size (in pixels) every image is resized to before training.
image_width = 224
image_height = 224

# Source folder of the PNG images and the class-labelled destination folder.
png_src_dir = "/kaggle/input/rsna-breast-cancer-512-pngs"
png_dst_dir = "/kaggle/working/input_transformed"

p_id = train_subset_main.patient_id
i_id = train_subset_main.image_id
cancer = train_subset_main.cancer
# zip() has no length, so give tqdm the total to get a real progress bar.
for pid, iid, cncr in tqdm(zip(p_id, i_id, cancer), total=len(train_subset_main)):
    # Files are named "<patient_id>_<image_id>.png".
    file_name = f"{pid}_{iid}.png"
    src_path = os.path.join(png_src_dir, file_name)
    # The cancer label (0 or 1) selects the destination sub-folder.
    dst_path = os.path.join(png_dst_dir, str(cncr), file_name)
    # Read straight from the source (no intermediate copy) and close the file
    # handle once the resized image has been produced.
    with Image.open(src_path) as img:
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
        resized = img.resize((image_width, image_height), Image.LANCZOS)
    resized.save(dst_path)

## Split data into training and validation partitions

In [None]:
# Training partition: 80% of the images found in the class sub-folders.
# NOTE: the validation call must pass this same seed; with shuffling enabled,
# different seeds can put the same image into both subsets.
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
    "/kaggle/working/input_transformed/",
    subset="training",
    validation_split=0.2,
    seed=69,
    shuffle=True,
    image_size=(224, 224),
    color_mode="rgb",
)

In [None]:
# Validation partition: the remaining 20% of the images.
# The seed has to be identical to the one used for the training subset (69):
# with shuffle=True the file list is shuffled with this seed before it is
# split, so a different seed would let validation images leak into training.
valid_ds = tf.keras.preprocessing.image_dataset_from_directory(
    "/kaggle/working/input_transformed/",
    color_mode = "rgb",
    image_size = (224, 224),
    shuffle = True,
    validation_split = 0.2,
    subset = "validation",
    seed = 69)

## Build the model, visualize it and do hyperparameter tuning

In [None]:
with strategy.scope():
    # ResNet50V2 with ImageNet weights and without its classification head;
    # pooling="max" applies global max pooling to the last feature map.
    base_model = tf.keras.applications.resnet_v2.ResNet50V2(
        weights="imagenet",
        include_top=False,
        pooling="max",
        input_shape=(image_height, image_width, 3),
    )

    # Freeze the backbone so only the layers added on top get trained.
    for backbone_layer in base_model.layers:
        backbone_layer.trainable = False

In [None]:
def model_builder(hp):
    """Build and compile the classifier for one Keras Tuner trial.

    The model is the pretrained ``base_model`` backbone followed by a stack
    of dense layers, a dropout layer and a 2-unit softmax output.

    Args:
        hp: Keras Tuner hyperparameter container. Two hyperparameters are
            registered on it: ``units`` (width of the second dense layer,
            32 to 512 in steps of 32) and ``learning_rate`` (Adam learning
            rate, one of 1e-4, 1e-5, 1e-6 or 1e-7).

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    with strategy.scope():
        model = tf.keras.Sequential()
        model.add(base_model)
        model.add(tf.keras.layers.Dense(512, activation = 'relu'))

        # Choose an optimal value between 32-512
        hp_units = hp.Int('units', min_value=32, max_value=512, step=32)
        model.add(keras.layers.Dense(units=hp_units, activation='relu'))

        model.add(tf.keras.layers.Dense(256, activation = 'relu'))
        model.add(tf.keras.layers.Dense(128, activation = 'relu'))

        model.add(tf.keras.layers.Dropout(0.3))
        model.add(tf.keras.layers.Dense(2, activation = 'softmax'))

        # Keep the pretrained backbone frozen. Setting the flag on the object
        # itself does not depend on its auto-generated layer name.
        base_model.trainable = False

        # Tune the learning rate for the optimizer
        # Choose an optimal value from 1e-4, 1e-5, 1e-6 or 1e-7
        hp_learning_rate = hp.Choice('learning_rate', values=[1e-4, 1e-5, 1e-6, 1e-7])

        # The output layer already applies softmax, so the loss receives
        # probabilities, not logits: from_logits must be False.
        model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                    metrics=['accuracy'])

    return model

In [None]:
!rm -rf /kaggle/working/rsna_cancer

In [None]:
# Hyperband search over model_builder's hyperparameters, ranked by
# validation accuracy; trial state is written to /kaggle/working/rsna_cancer.
tuner = kt.Hyperband(
    model_builder,
    objective='val_accuracy',
    max_epochs=20,
    factor=3,
    project_name='rsna_cancer',
    directory='/kaggle/working',
)

In [None]:
# Stop a run once the validation loss has not improved for 5 epochs.
stop_early = EarlyStopping(monitor='val_loss', patience=5)

In [None]:
# Run the hyperparameter search with early stopping attached to every trial.
search_callbacks = [stop_early]
tuner.search(train_ds, epochs=50, validation_data=valid_ds, callbacks=search_callbacks)

# Keep only the best hyperparameter set found by the search.
best_candidates = tuner.get_best_hyperparameters(num_trials=1)
best_hps = best_candidates[0]

print(f"""
The hyperparameter search is complete. The optimal number of units in the optimized densely-connected
layer is {best_hps.get('units')} and the optimal learning rate for the optimizer
is {best_hps.get('learning_rate')}.
""")

In [None]:
# Build the model with the optimal hyperparameters and train it on the data for 50 epochs
model = tuner.hypermodel.build(best_hps)
model.summary()

In [None]:
# Draw the model graph, including tensor shapes and dtypes.
tf.keras.utils.plot_model(model, show_shapes = True, show_dtype = True)

In [None]:
# Long training run, used only to find the epoch with the best validation
# accuracy. `workers` is not passed: it only applies to generator /
# keras.utils.Sequence inputs and newer Keras versions reject it in fit().
history = model.fit(train_ds, validation_data = valid_ds, epochs = 140, verbose = 1)

val_acc_per_epoch = history.history['val_accuracy']
# Epochs are counted from 1; list indices start at 0.
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))

## Training the model

In [None]:
# Start again from a freshly built model with the best hyperparameters.
hypermodel = tuner.hypermodel.build(best_hps)

# Retrain the model for exactly the number of epochs that gave the best
# validation accuracy. `workers` is not passed: it only applies to
# generator / keras.utils.Sequence inputs and newer Keras versions reject it.
history = hypermodel.fit(train_ds, validation_data = valid_ds, epochs = best_epoch, verbose = 1)

## Inspect the results

In [None]:
# Learning curves of the last training run: accuracy on the left panel,
# loss on the right panel; training values as dots, validation as a line.
loss = history.history['loss']
acc = history.history['accuracy']
val_loss = history.history['val_loss']
val_acc = history.history['val_accuracy']

# Epoch numbers for the x axis, starting at 1.
epochs = range(1, len(loss) + 1)

plt.figure(figsize=(16, 5))

plt.subplot(1,2,1)
plt.plot(epochs, acc, 'bo', label = 'Training accuracy')
plt.plot(epochs, val_acc, 'r', label = 'Validation accuracy')
plt.legend()

plt.subplot(1,2,2)
plt.plot(epochs, loss, 'bo', label = 'Training loss')
plt.plot(epochs, val_loss, 'r', label = 'Validation loss')
plt.legend()

plt.show()

### Save the model

In [None]:
!rm model.h5

In [None]:
# Write the retrained model to model.h5 in the current working directory.
hypermodel.save('model.h5')