<a href="https://colab.research.google.com/github/thisismcgovern/breast_cancer_cnn/blob/main/breast_cancer_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing Data and Libraries

1. Importing Data.

In [1]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"thisismcgovern","key":"525a4176e4d0859dc8de72d86b9ce884"}'}

In [2]:
import os
import zipfile

os.makedirs("/root/.kaggle", exist_ok=True)
!cp kaggle.json /root/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json

In [3]:
!kaggle datasets download -d paultimothymooney/breast-histopathology-images

Dataset URL: https://www.kaggle.com/datasets/paultimothymooney/breast-histopathology-images
License(s): CC0-1.0
Downloading breast-histopathology-images.zip to /content
 99% 3.08G/3.10G [00:24<00:00, 221MB/s]
100% 3.10G/3.10G [00:24<00:00, 137MB/s]


In [4]:
with zipfile.ZipFile("breast-histopathology-images.zip", 'r') as zip_ref:
    zip_ref.extractall("IDC_dataset")

2. Explore Data Sets

In [5]:
import os

base_path = "IDC_dataset"
for root, dirs, files in os.walk(base_path):
    print(f"Root: {root}, Dirs: {dirs}, Files: {len(files)}")
    break

Root: IDC_dataset, Dirs: ['12933', '15902', '13460', '13019', '15512', '12906', '14210', '15839', '12951', '8863', '13025', '16531', '14190', '14211', '9267', '12823', '12817', '12750', '16532', '14189', '13023', '12890', '16534', '9324', '12818', '12879', '12886', '16568', '14157', '12955', '9381', '12907', '12930', '9322', '10291', '9075', '9262', '14304', '9319', '8956', '12880', '12954', '12824', '12822', '10295', '16167', '9081', '12810', '12871', '10308', '10261', '9290', '9265', '8914', '10293', '16895', '9123', '13691', '9255', '8913', '9181', '8980', 'IDC_regular_ps50_idx5', '10282', '12242', '14155', '16014', '13458', '13666', '10304', '12898', '10288', '12869', '12947', '16553', '10254', '10255', '16085', '9226', '9078', '9041', '9261', '9083', '13462', '14191', '10285', '13106', '12901', '15516', '13402', '9178', '12896', '9257', '12867', '16550', '9291', '15633', '8917', '12932', '12820', '12909', '15632', '10278', '10307', '12905', '14209', '13617', '16166', '12934', '912

In [6]:
sample_patient = os.path.join(base_path, "10253")  # replace with any valid folder
sample_images = os.listdir(sample_patient)
print(sample_images[:5])


['0', '1']


In [7]:
import shutil
from pathlib import Path

dest_base = "cleaned_dataset"
os.makedirs(os.path.join(dest_base, "class0"), exist_ok=True)
os.makedirs(os.path.join(dest_base, "class1"), exist_ok=True)

img_count = {"class0": 0, "class1": 0}

for patient_folder in os.listdir(base_path):
    patient_path = os.path.join(base_path, patient_folder)
    if os.path.isdir(patient_path):
        for class_folder in ['0', '1']:
            class_path = os.path.join(patient_path, class_folder)
            if os.path.isdir(class_path):
                for fname in os.listdir(class_path):
                    src = os.path.join(class_path, fname)
                    dst_class = f"class{class_folder}"
                    dst = os.path.join(dest_base, dst_class, f"{patient_folder}_{class_folder}_{fname}")
                    shutil.copy(src, dst)
                    img_count[dst_class] += 1

print("Images copied:")
print(img_count)


Images copied:
{'class0': 198738, 'class1': 78786}


## Spliting Datasets

In [8]:
import tensorflow as tf

batch_size = 32
img_size = (50, 50)  # all images are 50x50 by design

train_ds = tf.keras.utils.image_dataset_from_directory(
    "cleaned_dataset",
    validation_split=0.2,
    subset="training",
    seed=42,
    image_size=img_size,
    batch_size=batch_size
)

val_ds = tf.keras.utils.image_dataset_from_directory(
    "cleaned_dataset",
    validation_split=0.2,
    subset="validation",
    seed=42,
    image_size=img_size,
    batch_size=batch_size
)


Found 277524 files belonging to 2 classes.
Using 222020 files for training.
Found 277524 files belonging to 2 classes.
Using 55504 files for validation.


## Training Prep

In [9]:
AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.cache().shuffle(1000).prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [10]:
from sklearn.utils import class_weight
import numpy as np

# Compute class weights
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.array([0, 1]),
    y=[0]*198738 + [1]*78786
)
class_weights_dict = {0: class_weights[0], 1: class_weights[1]}
print(class_weights_dict)


{0: np.float64(0.6982157413277783), 1: np.float64(1.761251999086132)}


## DEFNING A MODEL

In [11]:
from tensorflow import keras
from tensorflow.keras import layers

model = keras.Sequential([
    layers.Rescaling(1./255, input_shape=(50, 50, 3)),  # normalize pixel values
    layers.Conv2D(32, 3, activation='relu'),
    layers.MaxPooling2D(),

    layers.Conv2D(64, 3, activation='relu'),
    layers.MaxPooling2D(),

    layers.Conv2D(128, 3, activation='relu'),
    layers.MaxPooling2D(),

    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid')  # output: binary classification
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)






  super().__init__(**kwargs)


## Training and Summary\

In [12]:
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=10,
    class_weight=class_weights_dict
)

Epoch 1/10
[1m6939/6939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m294s[0m 39ms/step - accuracy: 0.7737 - loss: 0.4976 - val_accuracy: 0.8351 - val_loss: 0.3769
Epoch 2/10
[1m6939/6939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 5ms/step - accuracy: 0.8268 - loss: 0.4052 - val_accuracy: 0.8571 - val_loss: 0.3593
Epoch 3/10
[1m6939/6939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 5ms/step - accuracy: 0.8352 - loss: 0.3875 - val_accuracy: 0.8486 - val_loss: 0.3543
Epoch 4/10
[1m6939/6939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 5ms/step - accuracy: 0.8401 - loss: 0.3795 - val_accuracy: 0.8475 - val_loss: 0.3677
Epoch 5/10
[1m6939/6939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 5ms/step - accuracy: 0.8383 - loss: 0.3790 - val_accuracy: 0.8606 - val_loss: 0.3362
Epoch 6/10
[1m6939/6939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 5ms/step - accuracy: 0.8415 - loss: 0.3702 - val_accuracy: 0.8486 - val_loss: 0.3508
Epoch 7/

<keras.src.callbacks.history.History at 0x7f19388c8f50>

In [13]:
model.summary()