In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Basics
import re

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')

# Manage files
from PIL import Image
import os

# Others
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
filepath = '/kaggle/input/rsna-breast-cancer-detection/train.csv'
train_data = pd.read_csv(filepath)
train_data.head()

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case
0,2,10006,462822612,L,CC,61.0,0,0,0,,0,,29,False
1,2,10006,1459541791,L,MLO,61.0,0,0,0,,0,,29,False
2,2,10006,1864590858,R,MLO,61.0,0,0,0,,0,,29,False
3,2,10006,1874946579,R,CC,61.0,0,0,0,,0,,29,False
4,2,10011,220375232,L,CC,55.0,0,0,0,0.0,0,,21,True


In [4]:
train_data_malignant = train_data[train_data['cancer']==1]
train_data_malignant.shape

(1158, 14)

In [5]:
train_data_benign = train_data[(train_data['cancer']==0) & (train_data['difficult_negative_case']==False)]
train_data_benign.shape

(45843, 14)

In [6]:
# remove patients with implant
train_data_benign_with_no_implant = train_data_benign[train_data_benign['implant'] != 1]
train_data_benign_with_no_implant = train_data_benign_with_no_implant.sample(frac=0.026, random_state=41)
train_data_benign_with_no_implant.shape

(1161, 14)

In [7]:
dataset = pd.concat([train_data_malignant, train_data_benign_with_no_implant])
# shuffle dataset
dataset = dataset.sample(frac=1)
dataset.shape

(2319, 14)

In [8]:
dataset['file_path'] = dataset['patient_id'].astype(str) + '/' + dataset['image_id'].astype(str) + '.dcm'
image_paths = dataset['file_path'].tolist()
dataset.head(2)

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case,file_path
48655,2,62475,54481313,R,CC,65.0,0,0,0,,0,,29,False,62475/54481313.dcm
408,2,10432,1434858530,L,MLO,65.0,1,1,1,,0,,48,False,10432/1434858530.dcm


In [9]:
gen_frame = dataset.copy()
gen_frame = gen_frame.drop(columns=['file_path'])
gen_frame['path'] = gen_frame['image_id'].astype(str) + '.jpeg'
gen_frame['cancer'] = gen_frame['cancer'].astype(str)
gen_frame.head()

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case,path
48655,2,62475,54481313,R,CC,65.0,0,0,0,,0,,29,False,54481313.jpeg
408,2,10432,1434858530,L,MLO,65.0,1,1,1,,0,,48,False,1434858530.jpeg
32922,1,45263,1916272770,R,CC,74.0,0,0,0,2.0,0,B,216,False,1916272770.jpeg
8414,2,19003,4493744,L,MLO,74.0,1,1,1,,0,,21,False,4493744.jpeg
1036,1,11094,1417771843,L,CC,74.0,1,1,1,0.0,0,A,49,False,1417771843.jpeg


In [10]:
from sklearn.model_selection import train_test_split

training_frame, validation_frame = train_test_split(gen_frame, 
                                                    test_size=0.3, 
                                                    random_state=42)

In [11]:
training_frame.shape

(1623, 15)

In [12]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

root_dir = '/kaggle/input/rsna-data/'

train_gen = ImageDataGenerator(rotation_range=20,
                             width_shift_range=0.2,
                             height_shift_range=0.2,
                             shear_range=0.2,
                             zoom_range=0.2,
                             horizontal_flip=True,
                             vertical_flip=True)

training_generator = train_gen.flow_from_dataframe(training_frame,
                                                directory=root_dir,
                                                x_col='path',
                                                y_col='cancer',
                                                target_size=(250, 250),
                                                class_mode='binary',
                                                batch_size=20)

valid_gen = ImageDataGenerator(rotation_range=20,
                             width_shift_range=0.2,
                             height_shift_range=0.2,
                             shear_range=0.2,
                             zoom_range=0.2,
                             horizontal_flip=True,
                             vertical_flip=True)

validation_generator = valid_gen.flow_from_dataframe(validation_frame,
                                                directory=root_dir,
                                                x_col='path',
                                                y_col='cancer',
                                                target_size=(250, 250),
                                                class_mode='binary',
                                                batch_size=20)

Found 1623 validated image filenames belonging to 2 classes.
Found 696 validated image filenames belonging to 2 classes.


In [13]:
# Download the pre-trained weights. No top means it excludes the fully connected layer it uses for classification.
!wget --no-check-certificate \
    https://storage.googleapis.com/mledu-datasets/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5 \
    -O /tmp/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5

--2023-03-06 07:14:43--  https://storage.googleapis.com/mledu-datasets/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.215.128, 173.194.214.128, 173.194.217.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.215.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 87910968 (84M) [application/x-hdf]
Saving to: ‘/tmp/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5’


2023-03-06 07:14:44 (238 MB/s) - ‘/tmp/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5’ saved [87910968/87910968]



In [14]:
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras import layers

# Set the weights file you downloaded into a variable
local_weights_file = '/tmp/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5'

# Initialize the base model.
# Set the input shape and remove the dense layers.
pre_trained_model = InceptionV3(input_shape = (250, 250, 3), 
                                include_top = False, 
                                weights = None)

# Load the pre-trained weights you downloaded.
pre_trained_model.load_weights(local_weights_file)

# Freeze the weights of the layers.
for layer in pre_trained_model.layers:
    layer.trainable = False

In [15]:
pre_trained_model.summary()

Model: "inception_v3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 250, 250, 3) 0                                            
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 124, 124, 32) 864         input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization (BatchNorma (None, 124, 124, 32) 96          conv2d[0][0]                     
__________________________________________________________________________________________________
activation (Activation)         (None, 124, 124, 32) 0           batch_normalization[0][0]        
_______________________________________________________________________________________

In [16]:
# Choose `mixed_7` as the last layer of your base model
last_layer = pre_trained_model.get_layer('mixed7')
print('last layer output shape: ', last_layer.output_shape)
last_output = last_layer.output

last layer output shape:  (None, 13, 13, 768)


In [17]:
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras import Model

# Flatten the output layer to 1 dimension
x = layers.Flatten()(last_output)
# Add a fully connected layer with 1,024 hidden units and ReLU activation
x = layers.Dense(1024, activation='relu')(x)
# Add a dropout rate of 0.2
x = layers.Dropout(0.2)(x)                  
# Add a final sigmoid layer for classification
x = layers.Dense  (1, activation='sigmoid')(x)           

# Append the dense network to the base model
model = Model(pre_trained_model.input, x) 

# Print the model summary. See your dense network connected at the end.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 250, 250, 3) 0                                            
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 124, 124, 32) 864         input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization (BatchNorma (None, 124, 124, 32) 96          conv2d[0][0]                     
__________________________________________________________________________________________________
activation (Activation)         (None, 124, 124, 32) 0           batch_normalization[0][0]        
______________________________________________________________________________________________

In [18]:
# Set the training parameters
model.compile(optimizer = RMSprop(learning_rate=0.0001), 
              loss = 'binary_crossentropy', 
              metrics = ['accuracy'])

In [19]:
# Train the model.
history = model.fit(
            training_generator,
            validation_data = validation_generator,
            steps_per_epoch = 80,
            epochs = 15,
            validation_steps = 35,
            verbose = 2)

Epoch 1/15
80/80 - 283s - loss: 21.5992 - accuracy: 0.5016 - val_loss: 5.3611 - val_accuracy: 0.5158
Epoch 2/15
80/80 - 228s - loss: 3.0925 - accuracy: 0.5205 - val_loss: 0.8966 - val_accuracy: 0.5144
Epoch 3/15
80/80 - 223s - loss: 0.9607 - accuracy: 0.5054 - val_loss: 1.4648 - val_accuracy: 0.4986
Epoch 4/15
80/80 - 223s - loss: 0.9057 - accuracy: 0.5218 - val_loss: 0.7826 - val_accuracy: 0.5043
Epoch 5/15
80/80 - 225s - loss: 0.9126 - accuracy: 0.5167 - val_loss: 0.8226 - val_accuracy: 0.5172
Epoch 6/15
80/80 - 227s - loss: 0.8069 - accuracy: 0.5167 - val_loss: 0.9540 - val_accuracy: 0.5115
Epoch 7/15
80/80 - 222s - loss: 0.7923 - accuracy: 0.5142 - val_loss: 0.8117 - val_accuracy: 0.5273
Epoch 8/15
80/80 - 221s - loss: 0.7505 - accuracy: 0.5073 - val_loss: 0.7255 - val_accuracy: 0.5230
Epoch 9/15
80/80 - 223s - loss: 0.7499 - accuracy: 0.5294 - val_loss: 0.7066 - val_accuracy: 0.5158
Epoch 10/15
80/80 - 221s - loss: 0.7921 - accuracy: 0.5167 - val_loss: 0.7586 - val_accuracy: 0.514