In [52]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'histopathologic-cancer-detection:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F11848%2F862157%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240327%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240327T114230Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D4f2148a1199fa670a09d77f86bf80a5ba3a0ec34df7e296d719a5612b74d57f0033fe6f99cf0e87d377dc553921036a8e3f5c7acec2a3aeeb1e9fda949181d61a9b2acfad094fdbcebf180b277463dfe7230302cd58b16676eb6204b0fec2a49005d6d047a8bc2a7302ff1ccdd2d126287b349c81e355d07d79f847eeb99bcb67ee1187442b2b3b7df9412531d30918a1fcc1a1f07830715732666c8ec2e35f762ab105e11616e7e815d5981b3de99c49d27dab8ab794678a1c99b749e6b7b7f8a4d18d9fea1dd082498b321f0466f6507dbe61fc6c624530d1873c338af23db54ed0ab30dab4c9157aa825a2fc6509f80304430975c2488b91bf72da9b2288d'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading histopathologic-cancer-detection, 6773228425 bytes compressed
Downloaded and uncompressed: histopathologic-cancer-detection
Data source import complete.


In [66]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam

In [67]:
base_model = tf.keras.applications.MobileNetV2(weights='imagenet', include_top=False, input_shape=(128, 128, 3))

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_128_no_top.h5


In [95]:
# Freeze the layers of the base model
for layer in base_model.layers:
    layer.trainable = False

# Add new classification layers
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation='relu')(x)
predictions = Dense(1, activation='sigmoid')(x)

In [96]:
# This is the model we will train
model = Model(inputs=base_model.input, outputs=predictions)

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])


In [135]:
labels_df = pd.read_csv('/kaggle/input/histopathologic-cancer-detection/train_labels.csv')
labels_df['id'] = labels_df['id'].astype(str) + '.tif'  # Assuming the images are .tif files
labels_df['label'] = labels_df['label'].astype(str)

# Create the ImageDataGenerator for data augmentation
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Assuming the images are stored in a directory named 'train_dir'
train_generator = train_datagen.flow_from_dataframe(
    dataframe=labels_df,
    directory='/kaggle/input/histopathologic-cancer-detection/train/',
    x_col='id',  # Column in dataframe that contains the filenames
    y_col='label',  # Column in dataframe that contains the labels
    target_size=(128, 128),  # Resize images
    class_mode='binary',  # Since you have a binary classification problem
    batch_size=32  # Adjust based on your memory constraints
)

Found 220025 validated image filenames belonging to 2 classes.


In [98]:
history = model.fit(
    train_generator,
    steps_per_epoch=train_generator.samples // train_generator.batch_size,  # Ensure this matches your desired steps
    epochs=1,)



In [118]:
model.save('/content/my_model.keras')

In [130]:
sample_submission_df = pd.read_csv('/kaggle/input/histopathologic-cancer-detection/sample_submission.csv')
sample_submission_df['label'] = sample_submission_df['id'].astype(str)
sample_submission_df['id'] = sample_submission_df['id'].apply(lambda x: f"{x}.tif" if not x.endswith('.tif') else x)

In [120]:
sample_submission_df.head()

Unnamed: 0,id,label
0,0b2ea2a822ad23fdb1b5dd26653da899fbd2c0d5.tif,0
1,95596b92e5066c5c52466c90b69ff089b39f2737.tif,0
2,248e6738860e2ebcf6258cdc1f32f299e0c76914.tif,0
3,2c35657e312966e9294eac6841726ff3a748febf.tif,0
4,145782eb7caa1c516acbe2eda34d9a3f31c41fd6.tif,0


In [131]:
import os
for file in os.listdir("/kaggle/input/histopathologic-cancer-detection/test"):
    if '0b2ea2a822ad23fdb1b5dd26653da899fbd2c0d5' in file:
      print(file)

0b2ea2a822ad23fdb1b5dd26653da899fbd2c0d5.tif


In [132]:
test_datagen = ImageDataGenerator(rescale=1./255)

test_generator = test_datagen.flow_from_dataframe(
    dataframe=sample_submission_df,
    directory='/kaggle/input/histopathologic-cancer-detection/test',
    x_col='id',
    y_col='label',# Column in dataframe that contains the filenames  # Column in dataframe that contains the labels
    target_size=(128, 128),  # Match the input size of your model
    class_mode='binary',  # Because we don't have labels
    batch_size=32,
    shuffle=False  # Keep data in same order as DataFrame
)

ValueError: If class_mode="binary" there must be 2 classes. Found 57458 classes.

In [122]:
predictions = model.predict(test_generator, steps=len(test_generator), verbose=1)



In [123]:
# Extract image ids from the test generator filenames
image_ids = [os.path.splitext(os.path.basename(filename))[0] for filename in test_generator.filenames]
# Create a DataFrame with the required submission structure
submission_df = pd.DataFrame({'id': image_ids, 'label': predictions.flatten()})

In [124]:
submission_df.to_csv('/content/submission.csv', index=False)