Problem statement: Build a CNN-based model that can accurately detect melanoma. Melanoma is a type of skin cancer that can be deadly if not detected early; it accounts for 75% of skin cancer deaths. A solution that can evaluate images and alert dermatologists to the presence of melanoma has the potential to greatly reduce the manual effort needed in diagnosis.

In [1]:
from google.colab import drive
import tensorflow as tf
from tensorflow.keras.preprocessing import image_dataset_from_directory
import pathlib
import matplotlib.pyplot as plt

# Mount Google Drive
# The dataset is read from Drive, so the mount has to happen before any of the
# paths below are touched. `drive` comes from google.colab, i.e. this cell is
# meant to run inside Colab.
drive.mount('/content/drive')

# Defining the path to the dataset
# Both directories are pathlib.Path objects; later cells join class-folder
# names onto train_data_dir with the `/` operator.
train_data_dir = pathlib.Path('/content/drive/MyDrive/Skin cancer ISIC The International Skin Imaging Collaboration/Train')
test_data_dir = pathlib.Path('/content/drive/MyDrive/Skin cancer ISIC The International Skin Imaging Collaboration/Test')

#!pip install Augmentor
import Augmentor
import os
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing import image_dataset_from_directory
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping

# Image/batch configuration shared by every dataset loader and by the model's
# input shape.
batch_size = 32
img_height = 180
img_width = 180

# File extensions Keras' image_dataset_from_directory accepts. Counting only
# these keeps stray files (and the Augmentor output folder itself) from being
# mistaken for samples.
image_suffixes = {'.bmp', '.gif', '.jpeg', '.jpg', '.png'}

# Count the images currently available per class.
# The count is recursive on purpose: Augmentor writes generated images to an
# `output/` sub-folder of each class directory and Keras picks those up when
# loading, so images produced by an earlier run of this cell must be included.
# Otherwise every re-run would add another full batch of augmented images.
# Only real, non-hidden directories are treated as classes.
train_class_counts = {
    class_dir.name: sum(
        1
        for candidate in class_dir.rglob('*')
        if candidate.is_file() and candidate.suffix.lower() in image_suffixes
    )
    for class_dir in sorted(train_data_dir.iterdir())
    if class_dir.is_dir() and not class_dir.name.startswith('.')
}
if not train_class_counts:
    raise ValueError(f"No class directories found in {train_data_dir}")

# Every class is topped up to the size of the largest one.
max_samples = max(train_class_counts.values())

# Apply Augmentor to balance classes.
# NOTE(review): augmented copies are written to disk before the
# train/validation split below, so the validation subset can contain
# augmented variants of training images. Validation metrics may be optimistic.
for class_name, count in train_class_counts.items():
    num_samples_needed = max_samples - count
    if num_samples_needed <= 0:
        # Already at the target size: do not build a pipeline for nothing.
        continue

    path_to_class = os.path.join(train_data_dir, class_name)

    # Create Augmentor pipeline over the class folder
    p = Augmentor.Pipeline(path_to_class)

    # Mild geometric transformations only
    p.rotate(probability=0.7, max_left_rotation=10, max_right_rotation=10)
    p.flip_left_right(probability=0.5)
    p.zoom(probability=0.5, min_factor=1.1, max_factor=1.5)

    # Generate exactly the number of images this class is short by
    p.sample(num_samples_needed)

# Reload the dataset after augmentation.
# Options every loader shares, and the extra options the two train/validation
# subsets must agree on (same split fraction and same seed for both calls).
loader_options = dict(
    image_size=(img_height, img_width),
    batch_size=batch_size,
)
split_options = dict(validation_split=0.2, seed=123, **loader_options)

# 80% of the images under train_data_dir
train_ds = image_dataset_from_directory(
    train_data_dir, subset="training", **split_options)

# The remaining 20% of the images under train_data_dir
val_ds = image_dataset_from_directory(
    train_data_dir, subset="validation", **split_options)

# Held-out test images, loaded in full (no split)
test_ds = image_dataset_from_directory(test_data_dir, **loader_options)

# Display class names (as discovered by the training dataset loader)
class_names = train_ds.class_names
print("Classes:", class_names)

# One slot per class; a slot stays None if no image of that class is seen.
images_per_class = {class_name: None for class_name in class_names}

# Walk the training batches and keep the first image seen for each class.
# `missing` tracks the classes still without an image so the scan can stop as
# soon as every class is covered.
missing = set(class_names)
for images, labels in train_ds:
    for img, label in zip(images, labels):
        class_name = class_names[int(label)]
        if class_name in missing:
            images_per_class[class_name] = img
            missing.discard(class_name)
    if not missing:
        break

if missing:
    # Nothing to draw for these; skip them instead of failing on None below.
    print("No training image found for:", sorted(missing))

# Visualize one image per class.
# The grid is sized from the number of images actually found (3 per row)
# rather than assuming exactly nine classes. With nine classes this gives the
# same 3x3, 15x15-inch layout as before.
found_images = {name: img for name, img in images_per_class.items() if img is not None}
n_cols = 3
n_rows = max(1, -(-len(found_images) // n_cols))  # ceiling division
plt.figure(figsize=(15, 5 * n_rows))
for i, (class_name, img) in enumerate(found_images.items()):
    ax = plt.subplot(n_rows, n_cols, i + 1)
    # Images are loaded as floats in the 0-255 range; cast for display.
    plt.imshow(img.numpy().astype("uint8"))
    plt.title(class_name)
    plt.axis("off")
plt.show()

# Data Augmentation and Normalization
# Random, on-the-fly geometric augmentation; applied to the training stream
# only.
data_augmentation = tf.keras.Sequential([
    layers.RandomFlip("horizontal_and_vertical"),
    layers.RandomRotation(0.2),
    layers.RandomZoom(0.2),
])

# Scales pixel values from [0, 255] to [0, 1].
normalization_layer = layers.Rescaling(1./255)

# Apply augmentation (training only) and normalization (all splits).
# training=True forces the random layers to be active inside Dataset.map.
train_ds = train_ds.map(lambda x, y: (data_augmentation(x, training=True), y))
train_ds = train_ds.map(lambda x, y: (normalization_layer(x), y))
val_ds = val_ds.map(lambda x, y: (normalization_layer(x), y))
# The test set must receive the same rescaling as the data the model is
# trained on. Otherwise model.evaluate(test_ds) feeds 0-255 inputs to a
# network fitted on 0-1 inputs and the reported test metrics are meaningless.
test_ds = test_ds.map(lambda x, y: (normalization_layer(x), y))

# Define CNN Model
# Baseline architecture: three Conv2D + MaxPooling2D stages (32, 64, 128
# filters) followed by a 128-unit dense head with 50% dropout and a softmax
# over the classes.
# NOTE: `model` is rebound to a second architecture further down the file
# before model.fit is called, so this baseline is built and compiled but not
# trained.
baseline_layers = [
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(img_height, img_width, 3)),
    layers.MaxPooling2D(2, 2),
]
for n_filters in (64, 128):
    baseline_layers.append(layers.Conv2D(n_filters, (3, 3), activation='relu'))
    baseline_layers.append(layers.MaxPooling2D(2, 2))
baseline_layers.extend([
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(len(class_names), activation='softmax'),  # Multi-class classification
])
model = models.Sequential(baseline_layers)

# Compile Model
# Labels from image_dataset_from_directory are integer class ids, hence the
# sparse variant of categorical cross-entropy.
baseline_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    optimizer=baseline_optimizer,
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Hyper parameter tuning in CNN Model
# Wider variant of the network: 64/128/256 convolution filters, a 256-unit
# dense layer, dropout lowered to 0.4. This rebinds `model`, so it is the
# network that actually gets trained below.
model = models.Sequential()

# Convolutional feature extractor
model.add(layers.Conv2D(64, (3, 3), activation='relu', input_shape=(img_height, img_width, 3)))
model.add(layers.MaxPooling2D(2, 2))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D(2, 2))
model.add(layers.Conv2D(256, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D(2, 2))

# Classification head
model.add(layers.Flatten())
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dropout(0.4))
model.add(layers.Dense(len(class_names), activation='softmax'))

# Compile with new learning rate (1e-4 instead of 1e-3)
tuned_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
model.compile(
    optimizer=tuned_optimizer,
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)


# Early Stopping Callback
# Watches validation loss, tolerates 5 epochs without improvement, and asks
# Keras to restore the best weights seen.
early_stopping = EarlyStopping(
    monitor="val_loss",
    patience=5,
    restore_best_weights=True,
)

# Train Model
# Up to 20 epochs; `history` keeps the per-epoch metrics for the plots and
# summary that follow.
fit_options = {
    "validation_data": val_ds,
    "epochs": 20,
    "callbacks": [early_stopping],
}
history = model.fit(train_ds, **fit_options)

# Plot training & validation accuracy/loss
def plot_learning_curves(history):
    """Draw accuracy and loss curves side by side and show the figure.

    Args:
        history: Object whose ``history`` attribute is a mapping containing
            the per-epoch series "accuracy", "val_accuracy", "loss" and
            "val_loss" (e.g. the return value of ``model.fit``).
    """
    # One entry per panel: (train key, train legend, val key, val legend,
    # y-axis label, panel title).
    panels = (
        ("accuracy", "Train Accuracy", "val_accuracy", "Val Accuracy",
         "Accuracy", "Training vs Validation Accuracy"),
        ("loss", "Train Loss", "val_loss", "Val Loss",
         "Loss", "Training vs Validation Loss"),
    )
    series = history.history

    plt.figure(figsize=(12, 4))
    for position, panel in enumerate(panels, start=1):
        train_key, train_label, val_key, val_label, y_label, title = panel
        plt.subplot(1, 2, position)
        plt.plot(series[train_key], label=train_label)
        plt.plot(series[val_key], label=val_label)
        plt.xlabel("Epochs")
        plt.ylabel(y_label)
        plt.legend()
        plt.title(title)

    plt.show()

# Call function to visualize learning curves
plot_learning_curves(history)

# Explain findings
# Metrics of the last epoch that ran (index -1 of each recorded series).
last_epoch = {name: values[-1] for name, values in history.history.items()}
train_acc = last_epoch["accuracy"]
val_acc = last_epoch["val_accuracy"]
train_loss = last_epoch["loss"]
val_loss = last_epoch["val_loss"]

print(f"Final Training Accuracy: {train_acc:.4f}")
print(f"Final Validation Accuracy: {val_acc:.4f}")
print(f"Final Training Loss: {train_loss:.4f}")
print(f"Final Validation Loss: {val_loss:.4f}")

# Simple heuristic verdict: training better than validation on both metrics
# is reported as overfitting, worse on both as underfitting, and anything
# else as well-fitted.
train_better_on_both = train_acc > val_acc and train_loss < val_loss
train_worse_on_both = train_acc < val_acc and train_loss > val_loss

if train_better_on_both:
    print("The model is overfitting. Consider using more data, data augmentation, or regularization techniques.")
elif train_worse_on_both:
    print("The model is underfitting. Consider increasing the model complexity or training for more epochs.")
else:
    print("The model is well-fitted to the data.")

# Count the number of samples per class after augmentation.
# Augmentor stores generated images in an `output/` sub-folder of each class
# directory, so the count has to be recursive and restricted to image files.
# A plain listing of the class folder misses every augmented image and counts
# the `output` folder itself as one sample.
image_suffixes = {'.bmp', '.gif', '.jpeg', '.jpg', '.png'}
final_class_counts = {
    class_dir.name: sum(
        1
        for candidate in class_dir.rglob('*')
        if candidate.is_file() and candidate.suffix.lower() in image_suffixes
    )
    for class_dir in sorted(train_data_dir.iterdir())
    if class_dir.is_dir() and not class_dir.name.startswith('.')
}
if not final_class_counts:
    raise ValueError(f"No class directories found in {train_data_dir}")

# Print the class distribution
print("\nClass distribution after augmentation:")
for class_name, count in final_class_counts.items():
    print(f"Class '{class_name}': {count} samples")

# Determine the class with the least number of samples
least_samples_class = min(final_class_counts, key=final_class_counts.get)
print(f"\nClass with the least number of samples: '{least_samples_class}' with {final_class_counts[least_samples_class]} samples")

# Determine dominant classes: those holding more than 10% of all samples.
# The `total_samples and ...` guard avoids a division by zero when the class
# folders exist but contain no images.
total_samples = sum(final_class_counts.values())
dominant_classes = {
    class_name: count
    for class_name, count in final_class_counts.items()
    if total_samples and count / total_samples > 0.1
}

print("\nClasses that dominate the data (more than 10% of total samples):")
for class_name, count in dominant_classes.items():
    print(f"Class '{class_name}': {count} samples ({count / total_samples:.2%} of total)")


In [None]:
# Evaluate on test dataset
# model.evaluate returns the loss followed by the compiled metrics
# (accuracy only, here).
# NOTE(review): confirm test_ds goes through the same rescaling as the
# training/validation data before trusting these numbers.
test_metrics = model.evaluate(test_ds)
test_loss, test_acc = test_metrics
print(
    f"Test Accuracy: {test_acc:.4f}",
    f"Test Loss: {test_loss:.4f}",
    sep="\n",
)