# Skin Cancer Detection System - Model Development and Analysis

This notebook contains the complete process of developing a skin cancer detection system using deep learning, including:
- Data loading and preprocessing
- Exploratory Data Analysis (EDA)
- Model development and training
- Performance evaluation

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import os
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import tensorflow as tf

# Seed NumPy and TensorFlow from one shared value so repeated runs match
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

## 1. Data Loading and Initial Exploration

In [None]:
# Read the HAM10000 metadata table into a DataFrame
metadata_csv = 'data/HAM10000_metadata.csv'
df = pd.read_csv(metadata_csv)

# Report (rows, columns), then preview the first records as the cell output
print("Dataset shape:", df.shape)
df.head()

In [None]:
# Number of records per value of the `dx` column
print("\nClass distribution:")
dx_counts = df['dx'].value_counts()
print(dx_counts)

# Bar chart of the same per-class counts, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='dx', ax=ax)
plt.xticks(rotation=45)  # tilt the class labels so they do not overlap
ax.set_title('Distribution of Skin Lesion Types')
plt.show()

## 2. Image Analysis

In [None]:
def analyze_image_properties(data_dir, sample_size=100):
    """Collect the dimensions of a random sample of images in a directory.

    Args:
        data_dir: Directory to sample from. Only regular files directly
            inside it are considered; subdirectories are ignored.
        sample_size: Maximum number of distinct files to inspect. When the
            directory holds fewer files, every file is inspected once.

    Returns:
        A ``(widths, heights)`` tuple of lists with one entry per sampled
        image, taken from ``Image.size``. Both lists are empty when there is
        nothing to sample.

    Raises:
        OSError: If ``data_dir`` cannot be listed or a sampled file cannot be
            opened as an image.
    """
    # os.listdir order is arbitrary; sorting makes the seeded sample
    # reproducible across runs and machines.
    candidates = sorted(
        entry for entry in os.listdir(data_dir)
        if os.path.isfile(os.path.join(data_dir, entry))
    )

    n_samples = min(sample_size, len(candidates))
    if n_samples <= 0:
        # Nothing to sample (empty directory or non-positive sample_size).
        return [], []

    # replace=False: each image contributes at most once to the statistics.
    image_files = np.random.choice(candidates, n_samples, replace=False)

    widths, heights = [], []
    for img_file in image_files:
        # The context manager closes the underlying file handle promptly, so
        # large samples do not accumulate open descriptors.
        with Image.open(os.path.join(data_dir, img_file)) as img:
            width, height = img.size
        widths.append(width)
        heights.append(height)

    return widths, heights

# Sample image dimensions from the image directory
widths, heights = analyze_image_properties('data/images')

# Two side-by-side histograms: widths on the left, heights on the right
fig, (ax_width, ax_height) = plt.subplots(1, 2, figsize=(12, 4))
ax_width.hist(widths)
ax_width.set_title('Image Widths Distribution')
ax_height.hist(heights)
ax_height.set_title('Image Heights Distribution')
plt.show()

## 3. Data Preparation

In [None]:
from src.data_loader import DataLoader

# Point the project's DataLoader at the image folder and the metadata CSV
data_loader = DataLoader(data_dir='data/images', metadata_path='data/HAM10000_metadata.csv')

# load_data() is unpacked as three (inputs, labels) pairs: train, validation, test
splits = data_loader.load_data()
(X_train, y_train), (X_val, y_val), (X_test, y_test) = splits

# Report the shape of each split's input array
for caption, inputs in (
    ("Training set shape:", X_train),
    ("Validation set shape:", X_val),
    ("Test set shape:", X_test),
):
    print(caption, inputs.shape)

## 4. Model Development and Training

In [None]:
from src.model import create_efficient_net, create_resnet, create_custom_cnn
from src.preprocessing import create_data_augmentation

# Hyperparameters shared by every candidate architecture
INPUT_SHAPE = (224, 224, 3)
NUM_CLASSES = 7
BATCH_SIZE = 32
EPOCHS = 50

# Display name -> builder function; each builder receives the same
# (INPUT_SHAPE, NUM_CLASSES) arguments.
model_builders = {
    'EfficientNet': create_efficient_net,
    'ResNet': create_resnet,
    'CustomCNN': create_custom_cnn,
}

# Instantiate every candidate, preserving the order declared above
models = {
    label: build(INPUT_SHAPE, NUM_CLASSES)
    for label, build in model_builders.items()
}

In [None]:
# Augmentation callable provided by the project's preprocessing module
data_augmentation = create_data_augmentation()

# Stop once val_loss has not improved for 5 epochs and restore the best weights
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Scale the learning rate by 0.2 after 3 epochs without val_loss improvement
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=3,
)

callbacks = [early_stopping, reduce_lr]

# Model name -> Keras history dict, filled in by the training loop
histories = dict()

In [None]:
# Train each candidate model with augmentation applied per batch.
#
# Calling data_augmentation(X_train) once before fit() would transform the
# whole array a single time, so every epoch would see the same images. Keras
# random preprocessing layers are also inactive unless called in training mode.
# Mapping the augmentation over a tf.data pipeline with training=True draws a
# fresh random transform for every batch in every epoch.
# NOTE(review): assumes create_data_augmentation() returns a Keras layer/model
# that accepts `training=True` - confirm against src.preprocessing.
train_ds = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))
    # Full-length buffer matches the complete shuffle fit() applies to arrays;
    # lower it if memory is tight.
    .shuffle(len(X_train), reshuffle_each_iteration=True)
    .batch(BATCH_SIZE)
    .map(
        lambda images, labels: (data_augmentation(images, training=True), labels),
        num_parallel_calls=tf.data.AUTOTUNE,
    )
    .prefetch(tf.data.AUTOTUNE)
)

for name, model in models.items():
    print(f"\nTraining {name}...")

    model.compile(
        optimizer=tf.keras.optimizers.Adam(1e-4),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    # batch_size must not be passed when x is a Dataset (it is already
    # batched); validation arrays keep the same batch size explicitly.
    history = model.fit(
        train_ds,
        epochs=EPOCHS,
        validation_data=(X_val, y_val),
        validation_batch_size=BATCH_SIZE,
        callbacks=callbacks
    )

    # Keep only the per-epoch metric lists for later plotting
    histories[name] = history.history

## 5. Model Comparison and Evaluation

In [None]:
# Training curves: accuracy on the left panel, loss on the right
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(15, 5))

# (axes, train key, validation key, panel title, y-axis label)
panels = (
    (ax_acc, 'accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy'),
    (ax_loss, 'loss', 'val_loss', 'Model Loss', 'Loss'),
)

for ax, train_key, val_key, panel_title, y_label in panels:
    # One train curve and one validation curve per model
    for name, history in histories.items():
        ax.plot(history[train_key], label=f'{name} (train)')
        ax.plot(history[val_key], label=f'{name} (val)')
    ax.set_title(panel_title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Score every trained model on the held-out test split
test_results = {}
for name, model in models.items():
    # evaluate() yields (loss, accuracy) for the compiled loss and metric
    test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)
    test_results[name] = dict(accuracy=test_acc, loss=test_loss)
    print(f"\n{name} Test Accuracy: {test_acc:.4f}")

## 6. Save Best Model

In [None]:
# Find and save the best model.
#
# Selection uses validation accuracy rather than test accuracy: choosing the
# winner on the test split would make the reported test score optimistically
# biased. The test accuracy of the chosen model is still what gets reported.
val_accuracy = {
    # evaluate() returns [loss, accuracy] for the compiled loss and metric
    name: model.evaluate(X_val, y_val, verbose=0)[1]
    for name, model in models.items()
}
best_model_name = max(val_accuracy, key=val_accuracy.get)
best_model = models[best_model_name]

# Create the output directory if needed, then persist the winner
os.makedirs('models', exist_ok=True)
best_model.save('models/best_model.h5')
print(f"Best model ({best_model_name}) saved with test accuracy: {test_results[best_model_name]['accuracy']:.4f}")