### CNN Cancer Detection Kaggle Mini-Project
# Author: Yuran Liu
# Date: April 2025

# ------------------------------------------------------
# 1. Introduction
# -----------------------------------------------------

Problem Description:
This project addresses a binary image classification challenge: detecting metastatic cancer in histopathologic scans.
Each image is a small patch (96x96 RGB) extracted from a larger digital pathology slide.

Goal:
Predict whether a given patch contains cancerous tissue (label 1) or not (label 0).



# ------------------------------------------------------
# 2. Setup and Load Packages
# ------------------------------------------------------

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory.
# Sanity check: print the top-level entries of the competition dataset folder
# (os.listdir is not recursive, so sub-folder contents are not shown).

import os
print(os.listdir('/kaggle/input/histopathologic-cancer-detection'))


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# ------------------------------------------------------
# 3. Load and Explore Data (EDA)
# ------------------------------------------------------

In [None]:
# 1. Load labels
import pandas as pd
train_labels = pd.read_csv('/kaggle/input/histopathologic-cancer-detection/train_labels.csv')
print('Train Labels Shape:', train_labels.shape)
# Print explicitly: a bare `train_labels.head()` in the middle of a cell is
# evaluated and then discarded, so the preview would never be displayed.
print(train_labels.head())

# 2. Visualize class distribution
import seaborn as sns
import matplotlib.pyplot as plt

sns.countplot(x='label', data=train_labels)
plt.title('Label Distribution (0 = Normal, 1 = Cancer)')
plt.show()

# 3. Randomly display sample images
import random
from matplotlib.image import imread

# Single source of truth for the image folder and the number of samples per class
train_image_dir = '/kaggle/input/histopathologic-cancer-detection/train'
n_samples = 5

# Randomly select n_samples normal images and n_samples cancer images.
# The CSV stores bare ids; the '.tif' extension is added when reading the file.
normal_images = train_labels[train_labels['label'] == 0].sample(n_samples)['id'].values
cancer_images = train_labels[train_labels['label'] == 1].sample(n_samples)['id'].values

# Plot images: first row shows normal patches, second row shows cancer patches
fig, axes = plt.subplots(2, n_samples, figsize=(15, 6))
fig.suptitle('Top: Normal | Bottom: Cancer', fontsize=16)

for row_axes, image_ids in zip(axes, (normal_images, cancer_images)):
    for ax, image_id in zip(row_axes, image_ids):
        ax.imshow(imread(f'{train_image_dir}/{image_id}.tif'))
        ax.axis('off')

plt.tight_layout()
plt.show()


# ------------------------------------------------------
# 4. Data Preprocessing
# ------------------------------------------------------


In [None]:
# 4. Data Preprocessing
# ---------------------------------------------------

from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Folder holding the training images
train_dir = '/kaggle/input/histopathologic-cancer-detection/train'

# Image side length (pixels) and number of images per batch
image_size = 96
batch_size = 32

# Reload the labels from disk, then turn each id into a filename by appending
# '.tif' and cast the label column to str for use with class_mode='binary'.
train_labels = pd.read_csv('/kaggle/input/histopathologic-cancer-detection/train_labels.csv')
train_labels['id'] = train_labels['id'] + '.tif'
train_labels['label'] = train_labels['label'].astype(str)

# Augmentation settings applied to the training generator only
augmentation = dict(
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    vertical_flip=True,
    fill_mode='nearest',
)

# Both generators rescale pixels to [0, 1] and reserve 20% of the rows for validation
train_datagen = ImageDataGenerator(rescale=1./255, validation_split=0.2, **augmentation)
valid_datagen = ImageDataGenerator(rescale=1./255, validation_split=0.2)

# Arguments shared by the training and validation flows; only `subset` differs.
# validate_filenames=False skips the per-file validation pass over the dataset.
flow_options = dict(
    dataframe=train_labels,
    directory=train_dir,
    x_col='id',
    y_col='label',
    target_size=(image_size, image_size),
    batch_size=batch_size,
    class_mode='binary',
    shuffle=True,
    seed=42,
    validate_filenames=False,
)

# Create training generator
train_generator = train_datagen.flow_from_dataframe(subset='training', **flow_options)

# Create validation generator
valid_generator = valid_datagen.flow_from_dataframe(subset='validation', **flow_options)


# ------------------------------------------------------
# 5. Model Architecture
# ------------------------------------------------------

In [None]:
# 5. Build a Baseline CNN Model
# ---------------------------------------------

import tensorflow as tf
from tensorflow.keras import layers, models

# Three Conv2D -> MaxPooling2D stages (32, 64, 128 filters) followed by a
# small dense head with a single sigmoid output for binary classification.
model = models.Sequential()

# First stage also declares the input shape: 96x96 RGB patches
model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(96, 96, 3)))
model.add(layers.MaxPooling2D((2, 2)))

# Remaining stages differ only in their filter count
for n_filters in (64, 128):
    model.add(layers.Conv2D(n_filters, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D((2, 2)))

# Classification head
model.add(layers.Flatten())
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

# Compile with Adam and binary cross-entropy, tracking accuracy
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print the layer-by-layer summary
model.summary()


# ------------------------------------------------------
# 6. Training
# ------------------------------------------------------

In [None]:
# 6. Train the Model
# ---------------------------------------------------

# Number of passes over the training data (kept small on purpose)
epochs = 2

# Fit on the training generator and evaluate on the validation generator
# after every epoch; the returned object is kept in `history`.
history = model.fit(train_generator, validation_data=valid_generator, epochs=epochs)


In [None]:
# Run a single additional epoch on the same `model` to save time.
# This rebinds `history`, replacing the object returned by the previous fit.
history = model.fit(train_generator, validation_data=valid_generator, epochs=1)


# ------------------------------------------------------
# 7. Results and Analysis
# ------------------------------------------------------

In [None]:
# 7. Generate Predictions for Kaggle Submission (FAST VERSION)
# -------------------------------------------------------------
import pandas as pd
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Load sample submission; its 'id' column holds the bare image ids
submission = pd.read_csv('/kaggle/input/histopathologic-cancer-detection/sample_submission.csv')

# Build the '.tif' filenames in a separate frame so the submission ids are
# never modified. This avoids stripping the suffix again with
# str.replace('.tif', ''), which is a regex replace on pandas < 2.0
# (where '.' matches any character).
test_files = pd.DataFrame({'id': submission['id'] + '.tif'})

# Create a test data generator (only rescaling, no augmentation)
test_datagen = ImageDataGenerator(rescale=1./255)

# Create a test generator.
# shuffle=False keeps predictions in the same row order as `submission`.
# validate_filenames=False matches the training generators: it skips the
# per-file validation pass and guarantees no row is silently dropped, which
# would misalign predictions and ids.
test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_files,
    directory='/kaggle/input/histopathologic-cancer-detection/test',
    x_col='id',
    y_col=None,
    target_size=(96, 96),
    class_mode=None,
    batch_size=32,
    shuffle=False,
    validate_filenames=False
)

# Predict all test images at once
preds = model.predict(test_generator, verbose=1)

# The sigmoid output has shape (n_images, 1); flatten it to one probability
# per row before storing it as the 'label' column.
submission['label'] = preds.ravel()

# Save submission (ids are still in their original, extension-free form)
submission.to_csv('submission.csv', index=False)

print('✅ Submission file saved as submission.csv!')


# ------------------------------------------------------
# 8. Conclusion and Future Work
# ------------------------------------------------------

"""
Summary of what worked and what didn't:
- The baseline CNN performed decently.
- Potential improvement: try transfer learning with a pretrained backbone such as ResNet50 or EfficientNet.
- Extend the data augmentation (beyond the rotation, shifts, zoom and flips already used) to increase variety.
- Tune hyperparameters such as the learning rate, batch size, and optimizer.
"""

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator


In [None]:
# Load the training labels, turn each id into a '.tif' filename and cast the
# label column to str, all in a single chained expression.
train_labels = pd.read_csv(
    '/kaggle/input/histopathologic-cancer-detection/train_labels.csv'
).assign(
    id=lambda frame: frame['id'] + '.tif',
    label=lambda frame: frame['label'].astype(str),
)


In [None]:
# Location of the training images, image side length and batch size
train_dir = '/kaggle/input/histopathologic-cancer-detection/train'
image_size = 96
batch_size = 32

# Settings common to both generators: rescale pixels to [0, 1] and hold out
# 20% of the rows as the validation subset.
base_options = dict(rescale=1./255, validation_split=0.2)

# The training generator additionally applies random augmentation
train_datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    vertical_flip=True,
    fill_mode='nearest',
    **base_options,
)
valid_datagen = ImageDataGenerator(**base_options)

# Everything except `subset` is identical for the two flows
flow_options = dict(
    dataframe=train_labels,
    directory=train_dir,
    x_col='id',
    y_col='label',
    target_size=(image_size, image_size),
    batch_size=batch_size,
    class_mode='binary',
    shuffle=True,
    seed=42,
    validate_filenames=False,
)

train_generator = train_datagen.flow_from_dataframe(subset='training', **flow_options)
valid_generator = valid_datagen.flow_from_dataframe(subset='validation', **flow_options)


In [None]:
# Convolutional feature extractor: three Conv2D -> MaxPooling2D stages.
# The first layer declares the 96x96 RGB input shape.
feature_layers = [
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(96, 96, 3)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
]

# Dense head ending in a single sigmoid unit for binary classification
classifier_layers = [
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
]

model = models.Sequential(feature_layers + classifier_layers)

# Adam optimizer, binary cross-entropy loss, accuracy as the tracked metric
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [None]:
# Train for just 1 epoch to be fast, validating on valid_generator afterwards
history = model.fit(train_generator, validation_data=valid_generator, epochs=1)
