<a href="https://colab.research.google.com/github/urvashi2004/ML_MiniProjects/blob/main/SkinCancer/SkinCancerMendeleyVGGResNetOverfit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import zipfile
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import VGG16, ResNet50
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from google.colab import drive, files

# Mount Google Drive
drive.mount('/content/drive')

# Define paths
drive_base_path = '/content/drive/My Drive/Colab Notebooks/SkinCancer/Mendeley'
zip_file_path = '/content/drive/My Drive/Colab Notebooks/SkinCancer/Mendeley/mendeleydataset.zip'
csv_file_path = '/content/drive/My Drive/Colab Notebooks/SkinCancer/Mendeley/metadata.csv'
dataset_dir = '/content/dataset'

Mounted at /content/drive


In [19]:
# Ensure dataset extraction
if not os.path.exists(dataset_dir):  # Check if dataset has already been extracted
    os.makedirs(dataset_dir, exist_ok=True)
    print("Extracting dataset...")
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(dataset_dir)
    print(f"Dataset extracted to {dataset_dir}.")
else:
    print(f"Dataset directory already exists at {dataset_dir}.")

# Check directory structure
print("Checking dataset directory structure...")
if os.path.exists(dataset_dir):
    print("Top-level contents of dataset directory:")
    print(os.listdir(dataset_dir))  # Top-level contents
else:
    raise FileNotFoundError(f"Dataset directory not found at {dataset_dir}. Ensure the zip file was correctly extracted.")

# Handle subdirectory if needed
subdir = os.path.join(dataset_dir, "mendeleydataset")  # Modify based on structure after extraction
if os.path.exists(subdir):
    print(f"Contents of '{subdir}':")
    print(os.listdir(subdir)[:10])  # List first 10 items
else:
    print(f"No subdirectory named 'mendeleydataset' found in {dataset_dir}. Proceeding with top-level folder.")


Dataset directory already exists at /content/dataset.
Checking dataset directory structure...
Top-level contents of dataset directory:
['mendeleydataset']
Contents of '/content/dataset/mendeleydataset':
['PAT_616_1169_3.png', 'PAT_313_669_908.png', 'PAT_1765_3337_365.png', 'PAT_237_362_52.png', 'PAT_1893_3761_666.png', 'PAT_245_376_24.png', 'PAT_238_364_387.png', 'PAT_1420_1461_93.png', 'PAT_72_110_647.png', 'PAT_837_1582_714.png']


In [20]:
# Reload metadata
labels_df = pd.read_csv(csv_file_path)
print(labels_df['img_id'].head())  # Preview image IDs in the CSV

# Compare with dataset directory contents
if os.path.exists(subdir):
    extracted_files = set(os.listdir(subdir))
    csv_files = set(labels_df['img_id'])
    common_files = extracted_files.intersection(csv_files)

    print(f"Common files: {len(common_files)}")
    print(f"Missing files: {len(csv_files - extracted_files)}")

0    PAT_1516_1765_530.png
1       PAT_46_881_939.png
2    PAT_1545_1867_547.png
3    PAT_1989_4061_934.png
4     PAT_684_1302_588.png
Name: img_id, dtype: object
Common files: 2298
Missing files: 0


In [10]:
# Add 'cancer' or 'non_cancer' labels based on 'biopsed' column
labels_df['label'] = labels_df['biopsed'].apply(lambda x: 'cancer' if x else 'non_cancer')

# Map images to labels
image_paths = []
labels = []

# Use the correct subdirectory for the dataset
image_dir = os.path.join(dataset_dir, "mendeleydataset")

for _, row in labels_df.iterrows():
    img_path = os.path.join(image_dir, row['img_id'])  # Corrected image path
    if os.path.exists(img_path):  # Check if the file exists
        image_paths.append(img_path)
        labels.append(row['label'])

print(f"Found {len(image_paths)} images with labels.")

# Split dataset into training and validation sets
train_paths, val_paths, train_labels, val_labels = train_test_split(
    image_paths, labels, test_size=0.2, stratify=labels, random_state=42
)

Dataset extracted to /content/dataset
Found 2298 images with labels.


In [21]:
from sklearn.utils.class_weight import compute_class_weight

# Compute class weights
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels
)
class_weights = dict(enumerate(class_weights))

# Image preprocessing and augmentation
train_datagen = ImageDataGenerator(
    rescale=1.0 / 255.0,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)
val_datagen = ImageDataGenerator(rescale=1.0 / 255.0)

train_generator = train_datagen.flow_from_dataframe(
    dataframe=pd.DataFrame({'filename': train_paths, 'class': train_labels}),
    x_col='filename',
    y_col='class',
    target_size=(224, 224),
    batch_size=32,
    class_mode='binary'
)
val_generator = val_datagen.flow_from_dataframe(
    dataframe=pd.DataFrame({'filename': val_paths, 'class': val_labels}),
    x_col='filename',
    y_col='class',
    target_size=(224, 224),
    batch_size=32,
    class_mode='binary'
)

Found 1838 validated image filenames belonging to 2 classes.
Found 460 validated image filenames belonging to 2 classes.


In [22]:
# Define model function
def build_model(base_model):
    model = Sequential([
        base_model,
        GlobalAveragePooling2D(),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(learning_rate=1e-4), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Train VGG16
vgg_base = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
vgg_model = build_model(vgg_base)

print("Training VGG16...")
vgg_model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=10,
    class_weight=class_weights
)

# Train ResNet50
resnet_base = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
resnet_model = build_model(resnet_base)

print("Training ResNet50...")
resnet_model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=10,
    class_weight=class_weights
)

# Save models
vgg_model.save('/content/vgg_model.h5')
resnet_model.save('/content/resnet_model.h5')

Training VGG16...
Epoch 1/10


  self._warn_if_super_not_called()


[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m179s[0m 2s/step - accuracy: 0.5358 - loss: 0.7028 - val_accuracy: 0.4913 - val_loss: 0.6932
Epoch 2/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 2s/step - accuracy: 0.4785 - loss: 0.6933 - val_accuracy: 0.4217 - val_loss: 0.6934
Epoch 3/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 2s/step - accuracy: 0.4735 - loss: 0.6875 - val_accuracy: 0.4152 - val_loss: 0.6953
Epoch 4/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 2s/step - accuracy: 0.4386 - loss: 0.6940 - val_accuracy: 0.4217 - val_loss: 0.6933
Epoch 5/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 2s/step - accuracy: 0.4912 - loss: 0.6912 - val_accuracy: 0.4174 - val_loss: 0.6934
Epoch 6/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 2s/step - accuracy: 0.4734 - loss: 0.6914 - val_accuracy: 0.5848 - val_loss: 0.6931
Epoch 7/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━



In [24]:
# Function to predict an image
def predict_image(image_path, model, target_size=(224, 224)):
    img = preprocess_image(image_path, target_size)
    img_array = np.expand_dims(img, axis=0)
    prediction = model.predict(img_array)
    return 'Cancerous' if prediction[0][0] > 0.5 else 'Non-Cancerous'

# Upload and test an image
print("Upload an image to test the model.")
uploaded = files.upload()
uploaded_image_path = list(uploaded.keys())[0]

print(f"Prediction using VGG16: {predict_image(uploaded_image_path, vgg16_model)}")
print(f"Prediction using ResNet50: {predict_image(uploaded_image_path, resnet50_model)}")

Upload an image to test the model.


Saving PAT_36_49_935.png to PAT_36_49_935.png
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
Prediction using VGG16: Cancerous
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
Prediction using ResNet50: Cancerous


In [25]:
# Function to predict an image
def predict_image(image_path, model, target_size=(224, 224)):
    img = preprocess_image(image_path, target_size)
    img_array = np.expand_dims(img, axis=0)
    prediction = model.predict(img_array)
    return 'Cancerous' if prediction[0][0] > 0.5 else 'Non-Cancerous'

# Upload and test an image
print("Upload an image to test the model.")
uploaded = files.upload()
uploaded_image_path = list(uploaded.keys())[0]

print(f"Prediction using VGG16: {predict_image(uploaded_image_path, vgg16_model)}")
print(f"Prediction using ResNet50: {predict_image(uploaded_image_path, resnet50_model)}")

Upload an image to test the model.


Saving PAT_401_4594_970.png to PAT_401_4594_970.png
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
Prediction using VGG16: Cancerous
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
Prediction using ResNet50: Cancerous


In [26]:
# Function to predict an image
def predict_image(image_path, model, target_size=(224, 224)):
    img = preprocess_image(image_path, target_size)
    img_array = np.expand_dims(img, axis=0)
    prediction = model.predict(img_array)
    return 'Cancerous' if prediction[0][0] > 0.5 else 'Non-Cancerous'

# Upload and test an image
print("Upload an image to test the model.")
uploaded = files.upload()
uploaded_image_path = list(uploaded.keys())[0]

print(f"Prediction using VGG16: {predict_image(uploaded_image_path, vgg16_model)}")
print(f"Prediction using ResNet50: {predict_image(uploaded_image_path, resnet50_model)}")

Upload an image to test the model.


Saving PAT_217_963_806.png to PAT_217_963_806.png
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Prediction using VGG16: Non-Cancerous
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
Prediction using ResNet50: Cancerous
