<a href="https://colab.research.google.com/github/urvashi2004/ML_MiniProjects/blob/main/SkinCancer/Skin_Cancer_Detection_VGG%2BGaussianBlur.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import kagglehub

# Download the latest version of the HAM10000 dataset and keep the folder it was stored in
DATASET_HANDLE = "kmader/skin-cancer-mnist-ham10000"
path = kagglehub.dataset_download(DATASET_HANDLE)

# Show where the files ended up so later cells can be checked against it
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/kmader/skin-cancer-mnist-ham10000?dataset_version_number=2...


100%|██████████| 5.20G/5.20G [01:14<00:00, 75.2MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/kmader/skin-cancer-mnist-ham10000/versions/2


In [None]:
import os

# Inspect the folder layout of the downloaded dataset.
# The location comes from the kagglehub download cell (`path`) rather than a
# hard-coded cache path, so the cell keeps working when the dataset version
# or the cache directory changes.
dataset_path = path

# The image folders hold thousands of files; show a count and a short sample
# instead of flooding the output with every filename.
MAX_FILES_SHOWN = 5
for root, dirs, files in os.walk(dataset_path):
    print("Directory:", root)
    print("Subdirectories:", dirs)
    print(f"Files ({len(files)} total, first {MAX_FILES_SHOWN}):", sorted(files)[:MAX_FILES_SHOWN])
    print("-" * 50)


# Latest trial

In [None]:
import os
import warnings

import cv2
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications import VGG16
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical


In [None]:
# Step 1: Dataset Reduction
def reduce_dataset(metadata, target_classes, samples_per_class=300):
    """Keep only the requested classes and cap the number of rows per class.

    Args:
        metadata: DataFrame with a 'dx' column holding the class label of each row.
        target_classes: Collection of 'dx' values to keep.
        samples_per_class: Maximum number of rows sampled for each class. Classes
            with fewer rows are kept in full.

    Returns:
        A new DataFrame with all original columns (including 'dx'), rows grouped
        by class in sorted label order, and a fresh 0..n-1 index. Sampling uses a
        fixed random_state of 42, so repeated calls return the same rows.
    """
    filtered_metadata = metadata[metadata['dx'].isin(target_classes)]
    if filtered_metadata.empty:
        # Nothing matched: pd.concat would raise on an empty list of groups.
        return filtered_metadata.reset_index(drop=True)

    # Sample each class separately and concatenate. Unlike groupby().apply(),
    # this does not trigger the pandas deprecation warning about operating on
    # the grouping column and never drops 'dx' from the result.
    sampled_groups = [
        group.sample(n=min(len(group), samples_per_class), random_state=42)
        for _, group in filtered_metadata.groupby('dx')
    ]
    return pd.concat(sampled_groups).reset_index(drop=True)

In [None]:
# Step 2: Image Preprocessing
def preprocess_image(img_path, target_size=(224, 224)):
    """Load one image from disk and turn it into a normalized array.

    The image is read with OpenCV, smoothed with a 5x5 Gaussian blur, resized
    to ``target_size`` and scaled from [0, 255] to [0, 1].

    Args:
        img_path: Path of the image file to read.
        target_size: Output size handed to ``cv2.resize``, which interprets it
            as ``(width, height)``.

    Returns:
        Float array of shape ``(height, width, 3)`` in OpenCV's channel order.
        If the file cannot be read, an all-zero array of the same shape is
        returned and a warning is emitted so the bad sample does not go
        unnoticed.
    """
    width, height = target_size
    img = cv2.imread(img_path)
    if img is None:
        warnings.warn(
            f"Could not read image {img_path!r}; using a black placeholder instead.",
            stacklevel=2,
        )
        # Match the (height, width, 3) layout that cv2.resize produces below,
        # so placeholders stack with real images for non-square sizes too.
        return np.zeros((height, width, 3))
    img = cv2.GaussianBlur(img, (5, 5), 0)  # Noise removal
    img = cv2.resize(img, target_size)
    img = img / 255.0  # Normalize to [0, 1]
    return img

In [None]:
# Step 3: Data Preparation
def prepare_data(metadata, target_size=(224, 224), classes=None):
    """Build the image tensor and one-hot labels for a metadata table.

    Args:
        metadata: DataFrame with an 'image_path' column (file to load per row)
            and a 'dx' column (class label per row).
        target_size: Size forwarded to ``preprocess_image`` for every image.
        classes: Optional ordered list of class labels that fixes which one-hot
            column each label gets. Defaults to the sorted unique 'dx' values
            of ``metadata``.

    Returns:
        Tuple ``(X, y)``: ``X`` stacks the preprocessed images, ``y`` is the
        one-hot label matrix with one column per entry of ``classes``.

    Raises:
        ValueError: If ``metadata`` contains a 'dx' value not listed in ``classes``.
    """
    X = np.array([preprocess_image(img_path, target_size) for img_path in metadata['image_path']])

    labels = metadata['dx']
    if classes is None:
        # Sorted order does not depend on row order, so separately prepared
        # splits (train / validation) that contain the same classes get the
        # same label-to-column mapping. factorize() numbers labels by first
        # appearance, which can differ between shuffled splits.
        classes = sorted(labels.unique())
    class_to_index = {label: index for index, label in enumerate(classes)}

    codes = labels.map(class_to_index)
    if codes.isna().any():
        unknown_labels = labels[codes.isna()].unique().tolist()
        raise ValueError(f"Labels not present in classes: {unknown_labels}")

    y = to_categorical(codes.to_numpy(dtype=int), num_classes=len(classes))  # One-hot encoding
    return X, y

In [None]:
# Step 4: Model Definition
def build_model(input_shape, num_classes):
    """Assemble a classifier on top of a frozen, ImageNet-pretrained VGG16.

    Args:
        input_shape: Shape of one input image, passed to VGG16 as ``input_shape``.
        num_classes: Number of units in the final softmax layer.

    Returns:
        An uncompiled Keras ``Model`` mapping the VGG16 input to class
        probabilities.
    """
    backbone = VGG16(include_top=False, weights='imagenet', input_shape=input_shape)

    # Keep the pretrained convolutional weights fixed; only the new head trains.
    for backbone_layer in backbone.layers:
        backbone_layer.trainable = False

    # Classification head: flatten -> 256-unit ReLU -> 50% dropout -> softmax.
    flat_features = Flatten()(backbone.output)
    hidden = Dense(256, activation='relu')(flat_features)
    regularized = Dropout(0.5)(hidden)
    class_probabilities = Dense(num_classes, activation='softmax')(regularized)

    return Model(inputs=backbone.input, outputs=class_probabilities)

In [None]:
# Main Code
# Use the folder returned by the kagglehub download cell (`path`) instead of a
# hard-coded cache location, which breaks as soon as the dataset version or
# the cache directory differs.
dataset_path = path

# The metadata CSV has one row per image and is read into a DataFrame.
metadata_path = os.path.join(dataset_path, "HAM10000_metadata.csv")
metadata = pd.read_csv(metadata_path)

In [None]:
# Map image paths
# The two image folders are listed under both an upper-case and a lower-case
# spelling; only the spellings that actually exist on disk are scanned.
image_dirs = [
    os.path.join(dataset_path, "HAM10000_images_part_1"),
    os.path.join(dataset_path, "HAM10000_images_part_2"),
    os.path.join(dataset_path, "ham10000_images_part_1"),
    os.path.join(dataset_path, "ham10000_images_part_2"),
]

# image_id (file name without extension) -> full path of the .jpg file
image_path_mapping = {}
for image_dir in image_dirs:
    if not os.path.isdir(image_dir):
        # A missing folder spelling should not abort the whole mapping.
        continue
    for img_name in os.listdir(image_dir):
        image_id, extension = os.path.splitext(img_name)
        if extension.lower() == ".jpg":
            image_path_mapping[image_id] = os.path.join(image_dir, img_name)

if not image_path_mapping:
    # Fail loudly here rather than ending up with an empty table further down.
    raise FileNotFoundError(f"No .jpg images found under {dataset_path}")

metadata['image_path'] = metadata['image_id'].map(image_path_mapping)

# Rows without a matching file cannot be used; report how many are removed.
n_unmapped = int(metadata['image_path'].isna().sum())
if n_unmapped:
    print(f"Dropping {n_unmapped} metadata rows without a matching image file")
metadata = metadata.dropna(subset=['image_path']).reset_index(drop=True)

In [None]:
# Reduce dataset
# Keep only the classes of interest, capped at 300 sampled rows per class.
target_classes = ['nv', 'mel']  # Adjust based on your target classes
reduced_metadata = reduce_dataset(
    metadata,
    target_classes,
    samples_per_class=300,
)

# Split data
# Hold out 20% for validation; stratifying on 'dx' keeps the class
# proportions the same in both parts.
train_metadata, val_metadata = train_test_split(
    reduced_metadata,
    stratify=reduced_metadata['dx'],
    test_size=0.2,
    random_state=42,
)


  reduced_metadata = filtered_metadata.groupby('dx').apply(


In [None]:
# Prepare data
# Turn each metadata split into an image array and its label matrix,
# training split first, validation split second.
(X_train, y_train), (X_val, y_val) = (
    prepare_data(split_metadata) for split_metadata in (train_metadata, val_metadata)
)


In [None]:
# Build and compile model
input_shape = (224, 224, 3)
num_classes = y_train.shape[1]  # one label column per class

model = build_model(input_shape, num_classes)

# Adam with a learning rate of 1e-4, categorical cross-entropy loss, accuracy as the metric
model.compile(
    optimizer=Adam(learning_rate=1e-4),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m58889256/58889256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [None]:
# Train the model
# Stop when val_loss has not improved for 5 epochs and restore the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# At most 20 epochs in batches of 32, validating on the held-out split after each epoch
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=20,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping],
)

Epoch 1/20
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m390s[0m 26s/step - accuracy: 0.6514 - loss: 0.6406 - val_accuracy: 0.7000 - val_loss: 0.6257
Epoch 2/20
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m434s[0m 25s/step - accuracy: 0.7608 - loss: 0.5695 - val_accuracy: 0.7583 - val_loss: 0.4977
Epoch 3/20
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m393s[0m 26s/step - accuracy: 0.8343 - loss: 0.4010 - val_accuracy: 0.7667 - val_loss: 0.4884
Epoch 4/20
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m429s[0m 25s/step - accuracy: 0.8797 - loss: 0.3230 - val_accuracy: 0.7750 - val_loss: 0.4845
Epoch 5/20
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m395s[0m 26s/step - accuracy: 0.8265 - loss: 0.4091 - val_accuracy: 0.7667 - val_loss: 0.4748
Epoch 6/20
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m432s[0m 26s/step - accuracy: 0.8207 - loss: 0.3694 - val_accuracy: 0.7417 - val_loss: 0.4855
Epoch 7/20
[1m15/15[0m [3

In [None]:
# Save the model
# Write the trained network to a single .h5 file in the current working directory
MODEL_FILENAME = "skin_cancer_detector.h5"
model.save(MODEL_FILENAME)



In [None]:
# Evaluate the model
# evaluate() yields the loss followed by the compiled metric (accuracy) on the validation split
evaluation_results = model.evaluate(X_val, y_val)
val_loss, val_accuracy = evaluation_results

# Report accuracy as a percentage with two decimals
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 18s/step - accuracy: 0.7702 - loss: 0.4711
Validation Accuracy: 76.67%


In [None]:
# Verify that every metadata row has a matching image file.
# `load_data` is not defined in any cell above, so this cell failed with a
# NameError on a fresh kernel. The check is rebuilt here from the metadata CSV
# and the `image_path_mapping` dictionary created in the "Map image paths" cell.
dataset_path = path  # folder returned by the kagglehub download cell
metadata = pd.read_csv(os.path.join(dataset_path, "HAM10000_metadata.csv"))
metadata['image_path'] = metadata['image_id'].map(image_path_mapping)

# Check if all images are found
missing_images = metadata[metadata['image_path'].isnull()]
if not missing_images.empty:
    print("Missing images:", missing_images)
else:
    print("All images are correctly mapped!")


All images are correctly mapped!
