<a href="https://colab.research.google.com/github/urvashi2004/ML_MiniProjects/blob/main/SkinCancer/SkinCancerMendeleyVGG16ResNet50.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import zipfile
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import VGG16, ResNet50
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from google.colab import drive, files

# Mount Google Drive
drive.mount('/content/drive')

# Define paths
drive_base_path = '/content/drive/My Drive/Colab Notebooks/SkinCancer/Mendeley'
zip_file_path = '/content/drive/My Drive/Colab Notebooks/SkinCancer/Mendeley/mendeleydataset.zip'
csv_file_path = '/content/drive/My Drive/Colab Notebooks/SkinCancer/Mendeley/metadata.csv'
dataset_dir = '/content/dataset'

Mounted at /content/drive


In [7]:
# Ensure dataset extraction
if not os.path.exists(dataset_dir):  # Check if dataset has already been extracted
    os.makedirs(dataset_dir, exist_ok=True)
    print("Extracting dataset...")
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(dataset_dir)
    print(f"Dataset extracted to {dataset_dir}.")
else:
    print(f"Dataset directory already exists at {dataset_dir}.")

# Check directory structure
print("Checking dataset directory structure...")
if os.path.exists(dataset_dir):
    print("Top-level contents of dataset directory:")
    print(os.listdir(dataset_dir))  # Top-level contents
else:
    raise FileNotFoundError(f"Dataset directory not found at {dataset_dir}. Ensure the zip file was correctly extracted.")

# Handle subdirectory if needed
subdir = os.path.join(dataset_dir, "mendeleydataset")  # Modify based on structure after extraction
if os.path.exists(subdir):
    print(f"Contents of '{subdir}':")
    print(os.listdir(subdir)[:10])  # List first 10 items
else:
    print(f"No subdirectory named 'mendeleydataset' found in {dataset_dir}. Proceeding with top-level folder.")


Extracting dataset...
Dataset extracted to /content/dataset.
Checking dataset directory structure...
Top-level contents of dataset directory:
['mendeleydataset']
Contents of '/content/dataset/mendeleydataset':
['PAT_616_1169_3.png', 'PAT_313_669_908.png', 'PAT_1765_3337_365.png', 'PAT_237_362_52.png', 'PAT_1893_3761_666.png', 'PAT_245_376_24.png', 'PAT_238_364_387.png', 'PAT_1420_1461_93.png', 'PAT_72_110_647.png', 'PAT_837_1582_714.png']


In [9]:
# Reload metadata
labels_df = pd.read_csv(csv_file_path)
print(labels_df['img_id'].head())  # Preview image IDs in the CSV

# Compare with dataset directory contents
if os.path.exists(subdir):
    extracted_files = set(os.listdir(subdir))
    csv_files = set(labels_df['img_id'])
    common_files = extracted_files.intersection(csv_files)

    print(f"Common files: {len(common_files)}")
    print(f"Missing files: {len(csv_files - extracted_files)}")

0    PAT_1516_1765_530.png
1       PAT_46_881_939.png
2    PAT_1545_1867_547.png
3    PAT_1989_4061_934.png
4     PAT_684_1302_588.png
Name: img_id, dtype: object
Common files: 2298
Missing files: 0


In [10]:
# Create directory for the dataset
os.makedirs(dataset_dir, exist_ok=True)

# Extract the single zip file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(dataset_dir)

print(f"Dataset extracted to {dataset_dir}")

labels_df = pd.read_csv(csv_file_path)

# Add 'cancer' or 'non_cancer' labels based on 'biopsed' column
labels_df['label'] = labels_df['biopsed'].apply(lambda x: 'cancer' if x else 'non_cancer')

# Map images to labels
image_paths = []
labels = []

# Use the correct subdirectory for the dataset
image_dir = os.path.join(dataset_dir, "mendeleydataset")

for _, row in labels_df.iterrows():
    img_path = os.path.join(image_dir, row['img_id'])  # Corrected image path
    if os.path.exists(img_path):  # Check if the file exists
        image_paths.append(img_path)
        labels.append(row['label'])

print(f"Found {len(image_paths)} images with labels.")

# Split dataset into training and validation sets
train_paths, val_paths, train_labels, val_labels = train_test_split(
    image_paths, labels, test_size=0.2, stratify=labels, random_state=42
)

Dataset extracted to /content/dataset
Found 2298 images with labels.


In [11]:
# Function to preprocess data
def preprocess_image(img_path, target_size=(224, 224)):
    img = load_img(img_path, target_size=target_size)
    img_array = img_to_array(img) / 255.0
    return img_array

# Custom data generator
def create_data_generator(image_paths, labels, batch_size=32, target_size=(224, 224)):
    while True:
        for i in range(0, len(image_paths), batch_size):
            batch_paths = image_paths[i:i+batch_size]
            batch_labels = labels[i:i+batch_size]
            images = np.array([preprocess_image(path, target_size) for path in batch_paths])
            label_array = np.array([1 if label == 'cancer' else 0 for label in batch_labels])
            yield images, label_array

In [12]:
# Function to define models
def create_model(base_model):
    model = Sequential([
        base_model,
        GlobalAveragePooling2D(),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')  # Binary classification
    ])
    return model

# VGG16 Model
vgg16_base = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
vgg16_base.trainable = False
vgg16_model = create_model(vgg16_base)

# ResNet50 Model
resnet50_base = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
resnet50_base.trainable = False
resnet50_model = create_model(resnet50_base)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m58889256/58889256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [13]:
# Compile and train models
def train_model(model, train_gen, val_gen, train_steps, val_steps, epochs=10):
    model.compile(
        optimizer=Adam(learning_rate=0.0001),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    history = model.fit(
        train_gen,
        validation_data=val_gen,
        steps_per_epoch=train_steps,
        validation_steps=val_steps,
        epochs=epochs
    )
    return history

# Create data generators
batch_size = 32
train_steps = len(train_paths) // batch_size
val_steps = len(val_paths) // batch_size
train_gen = create_data_generator(train_paths, train_labels, batch_size=batch_size)
val_gen = create_data_generator(val_paths, val_labels, batch_size=batch_size)

# Train VGG16
print("Training VGG16 model...")
vgg16_history = train_model(vgg16_model, train_gen, val_gen, train_steps, val_steps)

# Train ResNet50
print("Training ResNet50 model...")
resnet50_history = train_model(resnet50_model, train_gen, val_gen, train_steps, val_steps)

# Save models
vgg16_model.save('vgg16_skin_cancer_model.h5')
resnet50_model.save('resnet50_skin_cancer_model.h5')

Training VGG16 model...
Epoch 1/10
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 2s/step - accuracy: 0.5183 - loss: 0.7233 - val_accuracy: 0.5893 - val_loss: 0.6715
Epoch 2/10
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 1s/step - accuracy: 0.5492 - loss: 0.7033 - val_accuracy: 0.5935 - val_loss: 0.6662
Epoch 3/10
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 1s/step - accuracy: 0.5476 - loss: 0.6964 - val_accuracy: 0.5911 - val_loss: 0.6613
Epoch 4/10
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 1s/step - accuracy: 0.5830 - loss: 0.6710 - val_accuracy: 0.5981 - val_loss: 0.6596
Epoch 5/10
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 1s/step - accuracy: 0.5820 - loss: 0.6738 - val_accuracy: 0.6262 - val_loss: 0.6482
Epoch 6/10
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 1s/step - accuracy: 0.5690 - loss: 0.6768 - val_accuracy: 0.6308 - val_loss: 0.6484
Epoch 7/10
[1m



In [14]:
# Function to predict an image
def predict_image(image_path, model, target_size=(224, 224)):
    img = preprocess_image(image_path, target_size)
    img_array = np.expand_dims(img, axis=0)
    prediction = model.predict(img_array)
    return 'Cancerous' if prediction[0][0] > 0.5 else 'Non-Cancerous'

# Upload and test an image
print("Upload an image to test the model.")
uploaded = files.upload()
uploaded_image_path = list(uploaded.keys())[0]

print(f"Prediction using VGG16: {predict_image(uploaded_image_path, vgg16_model)}")
print(f"Prediction using ResNet50: {predict_image(uploaded_image_path, resnet50_model)}")

Upload an image to test the model.


Saving PAT_27_38_240.png to PAT_27_38_240.png
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
Prediction using VGG16: Cancerous
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step
Prediction using ResNet50: Cancerous


In [15]:
# Function to predict an image
def predict_image(image_path, model, target_size=(224, 224)):
    img = preprocess_image(image_path, target_size)
    img_array = np.expand_dims(img, axis=0)
    prediction = model.predict(img_array)
    return 'Cancerous' if prediction[0][0] > 0.5 else 'Non-Cancerous'

# Upload and test an image
print("Upload an image to test the model.")
uploaded = files.upload()
uploaded_image_path = list(uploaded.keys())[0]

print(f"Prediction using VGG16: {predict_image(uploaded_image_path, vgg16_model)}")
print(f"Prediction using ResNet50: {predict_image(uploaded_image_path, resnet50_model)}")

Upload an image to test the model.


Saving PAT_56_88_274.png to PAT_56_88_274.png
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Prediction using VGG16: Cancerous
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Prediction using ResNet50: Cancerous


In [16]:
# Function to predict an image
def predict_image(image_path, model, target_size=(224, 224)):
    img = preprocess_image(image_path, target_size)
    img_array = np.expand_dims(img, axis=0)
    prediction = model.predict(img_array)
    return 'Cancerous' if prediction[0][0] > 0.5 else 'Non-Cancerous'

# Upload and test an image
print("Upload an image to test the model.")
uploaded = files.upload()
uploaded_image_path = list(uploaded.keys())[0]

print(f"Prediction using VGG16: {predict_image(uploaded_image_path, vgg16_model)}")
print(f"Prediction using ResNet50: {predict_image(uploaded_image_path, resnet50_model)}")

Upload an image to test the model.


Saving PAT_230_350_872.png to PAT_230_350_872.png
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
Prediction using VGG16: Cancerous
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Prediction using ResNet50: Cancerous


In [17]:
# Function to predict an image
def predict_image(image_path, model, target_size=(224, 224)):
    img = preprocess_image(image_path, target_size)
    img_array = np.expand_dims(img, axis=0)
    prediction = model.predict(img_array)
    return 'Cancerous' if prediction[0][0] > 0.5 else 'Non-Cancerous'

# Upload and test an image
print("Upload an image to test the model.")
uploaded = files.upload()
uploaded_image_path = list(uploaded.keys())[0]

print(f"Prediction using VGG16: {predict_image(uploaded_image_path, vgg16_model)}")
print(f"Prediction using ResNet50: {predict_image(uploaded_image_path, resnet50_model)}")

Upload an image to test the model.


Saving PAT_230_1008_673.png to PAT_230_1008_673.png
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Prediction using VGG16: Cancerous
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
Prediction using ResNet50: Cancerous
