<a href="https://colab.research.google.com/github/suppakrit-w/Final-DeepLearning/blob/main/Skin_Cancer_Model_Comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Download and Prepare Data

## Download Dataset

In [None]:
# 1. Install Kaggle API
!pip install -q kaggle

# 2. create folder for kaggle.json
!mkdir -p ~/.kaggle

# 3. get kaggle.json
from google.colab import files
files.upload()

# 4. Move file and set permissions
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# 5. Download Dataset
!kaggle datasets download -d kmader/skin-cancer-mnist-ham10000

# 6. Unzip Dataset
!unzip -q skin-cancer-mnist-ham10000.zip

### Plot distribution of each class

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# Load the HAM10000 metadata table; the 'dx' column is the class label plotted below.
df = pd.read_csv('HAM10000_metadata.csv')

# Count the rows per class once and reuse the result for the printout and the chart.
dx_counts = df['dx'].value_counts()
print(dx_counts)

fig, ax = plt.subplots(figsize=(10, 6))
dx_counts.plot(kind='bar', ax=ax)
ax.set_title('Class Distribution of HAM10000')
ax.set_xlabel('Lesion Type')
ax.set_ylabel('Count')
plt.show()

### Move data into a single folder

data_sorted/  
├── train/  
│   ├── nv/  
│   ├── mel/  
│   ├── bcc/  
│   └── ... (all 7 classes)  
└── val/  
    ├── nv/  
    ├── mel/  
    ├── bcc/  
    └── ... (all 7 classes)  

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import shutil
from tqdm.notebook import tqdm

print("Starting image organization script...")

# --- 1. Load Metadata ---
try:
    df = pd.read_csv('HAM10000_metadata.csv')
except FileNotFoundError:
    print("Error: 'HAM10000_metadata.csv' not found.")
    print("Please make sure you have downloaded and unzipped the dataset correctly.")
    # Stop the cell here. Continuing would either fail with a NameError on `df`
    # or silently reuse a stale `df` left in the kernel by an earlier cell.
    raise

# --- 2. Create Train/Val Split ---
# We use 'stratify=df['dx']' to ensure both train and val sets
# have the same proportion of classes as the original dataset.
# NOTE(review): the split is made per metadata row (one row per image). If the
# metadata contains several images of the same lesion, related images can land
# on both sides of the split -- confirm whether a group-aware split is needed.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['dx']
)

print(f"Total images: {len(df)}")
print(f"Training images: {len(train_df)}")
print(f"Validation images: {len(val_df)}")

# --- 3. Define Source and Destination Dirs ---
# The images are looked up in these two source folders.
source_dir_1 = 'HAM10000_images_part_1'
source_dir_2 = 'HAM10000_images_part_2'
base_dest_dir = 'data_sorted' # Our new organized folder

# Every distinct value of 'dx' becomes one class sub-folder.
class_names = df['dx'].unique()

# --- 4. Create Directory Structure ---
# data_sorted/<split>/<class>/ for both splits; exist_ok makes re-runs harmless.
for split in ['train', 'val']:
    split_dir = os.path.join(base_dest_dir, split)
    os.makedirs(split_dir, exist_ok=True)
    for class_name in class_names:
        class_dir = os.path.join(split_dir, class_name)
        os.makedirs(class_dir, exist_ok=True)

print("Directory structure created.")

# --- 5. Function to copy files (MODIFIED) ---
# This function now checks both source directories
def copy_images(dataframe, split_name):
    """Copy the images listed in ``dataframe`` into ``base_dest_dir/<split_name>/<dx>/``.

    Each row supplies ``image_id`` (file name without the '.jpg' extension) and
    ``dx`` (name of the class sub-folder). A file is searched for in
    ``source_dir_1`` first and then in ``source_dir_2``; when it is in neither
    folder a warning is printed and the row is skipped.

    Args:
        dataframe: table with at least the columns 'image_id' and 'dx'.
        split_name: name of the split sub-folder (called with 'train' and 'val').
    """
    print(f"\nCopying {split_name} images...")

    search_dirs = (source_dir_1, source_dir_2)

    for _, record in tqdm(dataframe.iterrows(), total=dataframe.shape[0]):
        file_name = record['image_id'] + '.jpg'

        # The first folder that contains the file wins; None means "in neither".
        located = None
        for folder in search_dirs:
            candidate = os.path.join(folder, file_name)
            if os.path.exists(candidate):
                located = candidate
                break

        if located is None:
            print(f"Warning: Source file not found for {file_name}")
            continue

        target = os.path.join(base_dest_dir, split_name, record['dx'], file_name)
        shutil.copyfile(located, target)

# --- 6. Run the copy process ---
# Training split first, then validation split.
for split_name, split_df in (('train', train_df), ('val', val_df)):
    copy_images(split_df, split_name)

print("\n--- Image organization complete! ---")
print(f"Data is now sorted in '{base_dest_dir}'")

# 2. Data pre-processing

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, WeightedRandomSampler
from torchvision import datasets, transforms, models
import os
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# === Global Constants ===

# --- Reproducibility ---
# Seed the global RNGs once so a fresh "Restart & Run All" repeats the same
# random sampling and augmentation draws.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# --- Hardware ---
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Data ---
BASE_DIR = 'data_sorted' # folder that holds the sorted train/val images
IMAGE_SIZE = 224

# --- Training ---
BATCH_SIZE = 32
EPOCHS = 15
LR = 0.001 # Learning Rate

print(f"Using device: {DEVICE}")

In [None]:
# --- Data Pipeline ---

def get_data_loaders(base_dir, batch_size, image_size):
    """
    Build the train and val DataLoaders from ``base_dir/train`` and ``base_dir/val``.

    The training loader draws its samples through a WeightedRandomSampler whose
    per-sample weight is the inverse of that sample's class count, so rare
    classes are drawn more often. The validation loader iterates in order.

    Args:
        base_dir: directory containing the 'train' and 'val' ImageFolder trees.
        batch_size: batch size used by both loaders.
        image_size: images are resized to (image_size, image_size).

    Returns:
        Tuple ``(dataloaders, class_names, n_train, n_val)`` where
        ``dataloaders`` is a dict with the keys 'train' and 'val'.
    """
    # Normalisation with the ImageNet channel statistics, shared by both pipelines.
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    target_size = (image_size, image_size)

    # Training images additionally get light random augmentation.
    train_pipeline = transforms.Compose([
        transforms.Resize(target_size),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(15),
        transforms.ColorJitter(brightness=0.1, contrast=0.1),
        transforms.ToTensor(),
        normalize,
    ])
    val_pipeline = transforms.Compose([
        transforms.Resize(target_size),
        transforms.ToTensor(),
        normalize,
    ])

    train_dataset = datasets.ImageFolder(os.path.join(base_dir, 'train'), transform=train_pipeline)
    val_dataset = datasets.ImageFolder(os.path.join(base_dir, 'val'), transform=val_pipeline)

    class_names = train_dataset.classes
    print(f"Classes: {class_names}")

    # Weighted sampling to counter class imbalance: weight = 1 / (class count).
    print("Setting up WeightedRandomSampler...")
    targets = train_dataset.targets
    per_class_count = torch.tensor(np.bincount(targets), dtype=torch.float)
    per_class_weight = 1. / per_class_count
    per_sample_weight = per_class_weight[targets]

    balanced_sampler = WeightedRandomSampler(
        weights=per_sample_weight,
        num_samples=len(per_sample_weight),
        replacement=True
    )

    train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=balanced_sampler)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    dataloaders = {'train': train_loader, 'val': val_loader}

    return dataloaders, class_names, len(train_dataset), len(val_dataset)

# 3. Create Model

In [None]:
# --- Model Factory ---

# Our Custom CNN
class MyCustomCNN(nn.Module):
    """Small three-stage convolutional network.

    Three ``Conv2d -> ReLU -> MaxPool2d(2, 2)`` stages (32, 64, 128 channels)
    are followed by a two-layer fully connected classifier with dropout.

    Args:
        num_classes: size of the output layer.
        image_size: side length of the square input images. It fixes the input
            width of the first fully connected layer; the default of 224
            reproduces the original ``128 * 28 * 28`` layout.

    Raises:
        ValueError: if ``image_size`` is smaller than 8, which would leave no
            spatial positions after the three poolings.
    """

    def __init__(self, num_classes=7, image_size=224):
        super().__init__()
        # Each 2x2 pooling halves the side (rounding down), so three of them
        # leave image_size // 8 positions per side (224 -> 112 -> 56 -> 28).
        feature_side = image_size // 8
        if feature_side < 1:
            raise ValueError(f"image_size must be at least 8, got {image_size}")

        self.features = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2, 2)
        )
        self.flatten = nn.Flatten()
        self.classifier = nn.Sequential(
            nn.Linear(128 * feature_side * feature_side, 512), nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        """Return the class logits for a batch of images ``x``."""
        x = self.features(x)
        x = self.flatten(x)
        x = self.classifier(x)
        return x

# --- Model factory ---
def create_model(model_name: str, num_classes: int, lr: float):
    """
    Build the model, the loss criterion and the optimizer for the chosen architecture.

    Args:
        model_name: 'resnet' (pre-trained ResNet50), 'efficientnet'
            (pre-trained EfficientNet-B0) or 'custom' (MyCustomCNN).
        num_classes: size of the final output layer.
        lr: learning rate passed to Adam.

    Returns:
        Tuple ``(model, criterion, optimizer)``. The model is moved to DEVICE and
        the criterion is ``nn.CrossEntropyLoss``. For the two pre-trained models
        every backbone parameter is frozen and only the newly attached
        classification head is handed to the optimizer.

    Raises:
        ValueError: if ``model_name`` is not one of the three supported names.
    """
    def freeze(module):
        # Turn off gradients for every parameter currently in the module.
        for weight in module.parameters():
            weight.requires_grad = False

    def new_head(in_features):
        # Replacement classifier: in_features -> 512 -> num_classes with dropout.
        return nn.Sequential(
            nn.Linear(in_features, 512), nn.ReLU(), nn.Dropout(0.5),
            nn.Linear(512, num_classes)
        )

    if model_name == 'resnet':
        print("Loading pre-trained ResNet50")
        model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
        freeze(model)
        # The head is attached after freezing, so its parameters stay trainable.
        model.fc = new_head(model.fc.in_features)
        trainable_params = model.fc.parameters()

    elif model_name == 'efficientnet':
        print("Loading pre-trained EfficientNet-B0")
        model = models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.DEFAULT)
        freeze(model)
        model.classifier = new_head(model.classifier[1].in_features)
        trainable_params = model.classifier.parameters()

    elif model_name == 'custom':
        print("Initializing Custom CNN (Baseline)")
        model = MyCustomCNN(num_classes=num_classes)
        trainable_params = model.parameters()

    else:
        raise ValueError(f"Unknown model name: {model_name}. Choose 'resnet', 'efficientnet', or 'custom'.")

    optimizer = optim.Adam(trainable_params, lr=lr)
    model = model.to(DEVICE)
    criterion = nn.CrossEntropyLoss()

    return model, criterion, optimizer

# 4. Train Model

In [None]:
# --- Train & Evaluate ---

def train_model(model, criterion, optimizer, dataloaders, dataset_sizes, num_epochs):
    """
    Train ``model`` for ``num_epochs`` epochs and validate after every epoch.

    Args:
        model: network to train; every batch is moved to the global DEVICE.
        criterion: loss, called as ``criterion(outputs, labels)``.
        optimizer: optimizer stepped once per training batch.
        dataloaders: mapping with 'train' and 'val' iterables of (inputs, labels).
        dataset_sizes: mapping with 'train' and 'val' sample counts, used as the
            denominators of the epoch loss and accuracy.
        num_epochs: number of epochs to run.

    Returns:
        Tuple ``(model, history)``; ``history`` maps 'train_loss', 'train_acc',
        'val_loss' and 'val_acc' to lists holding one value per epoch.
    """
    print("\n--- Starting Training ---")

    # Per-epoch metrics, kept for plotting afterwards.
    history = {metric: [] for metric in ('train_loss', 'train_acc', 'val_loss', 'val_acc')}

    def count_hits(logits, targets):
        # Number of samples whose highest-scoring class equals the label.
        return (logits.data.max(dim=1).indices == targets).sum().item()

    for epoch_number in range(1, num_epochs + 1):
        print(f"Epoch {epoch_number}/{num_epochs}")
        print("-" * 10)

        # ---- training pass ----
        model.train()
        train_loss_sum, train_hits = 0.0, 0

        for batch, targets in tqdm(dataloaders['train'], desc="Training"):
            batch, targets = batch.to(DEVICE), targets.to(DEVICE)

            optimizer.zero_grad()
            logits = model(batch)
            batch_loss = criterion(logits, targets)
            batch_loss.backward()
            optimizer.step()

            # The criterion returns a batch mean; scale it back to a sum so the
            # epoch average is weighted by the number of samples.
            train_loss_sum += batch_loss.item() * batch.size(0)
            train_hits += count_hits(logits, targets)

        epoch_train_loss = train_loss_sum / dataset_sizes['train']
        epoch_train_acc = train_hits / dataset_sizes['train']
        history['train_loss'].append(epoch_train_loss)
        history['train_acc'].append(epoch_train_acc)

        # ---- validation pass (no gradients, no parameter updates) ----
        model.eval()
        val_loss_sum, val_hits = 0.0, 0

        with torch.no_grad():
            for batch, targets in tqdm(dataloaders['val'], desc="Validation"):
                batch, targets = batch.to(DEVICE), targets.to(DEVICE)
                logits = model(batch)
                batch_loss = criterion(logits, targets)

                val_loss_sum += batch_loss.item() * batch.size(0)
                val_hits += count_hits(logits, targets)

        epoch_val_loss = val_loss_sum / dataset_sizes['val']
        epoch_val_acc = val_hits / dataset_sizes['val']
        history['val_loss'].append(epoch_val_loss)
        history['val_acc'].append(epoch_val_acc)

        print(f"Train Loss: {epoch_train_loss:.4f} Acc: {epoch_train_acc:.4f} | "
              f"Val Loss: {epoch_val_loss:.4f} Acc: {epoch_val_acc:.4f}")

    print("--- Finished Training ---")
    return model, history


def evaluate_model(model, dataloader, device, class_names):
    """
    Evaluate ``model`` on ``dataloader`` and report per-class metrics.

    Prints the scikit-learn classification report and the confusion matrix,
    then shows the confusion matrix as a seaborn heatmap.

    Args:
        model: trained network; it is switched to eval mode.
        dataloader: iterable of (inputs, labels) batches.
        device: device the batches are moved to before the forward pass.
        class_names: class names ordered so that position ``i`` names label ``i``.
    """
    print("\n--- Final Evaluation ---")
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)

            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Pin the label set to every known class. Without it scikit-learn infers the
    # labels from the data, so a class that is missing from both the targets and
    # the predictions makes `target_names` mismatch (ValueError) and shifts the
    # confusion-matrix rows/columns against the tick labels.
    label_ids = list(range(len(class_names)))

    print("\nClassification Report:")
    print(classification_report(all_labels, all_preds, labels=label_ids,
                                target_names=class_names, digits=4))

    print("\nConfusion Matrix:")
    cm = confusion_matrix(all_labels, all_preds, labels=label_ids)
    print(cm)

    # Plot confusion matrix using seaborn
    import seaborn as sns
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()


def plot_history(history, model_name):
    """
    Plot the accuracy and loss curves stored in ``history``.

    Args:
        history: mapping with the keys 'train_acc', 'val_acc', 'train_loss' and
            'val_loss', each a list with one value per epoch.
        model_name: label used in the printed banner and in the plot titles.
    """
    print(f"\n--- Plotting results for {model_name} ---")
    # Take the x-axis from the recorded history rather than the global EPOCHS,
    # so the plot also works when training ran a different number of epochs.
    epochs_range = range(1, len(history['train_loss']) + 1)

    plt.figure(figsize=(12, 5))

    # Accuracy plot
    plt.subplot(1, 2, 1)
    plt.plot(epochs_range, history['train_acc'], label='Training Accuracy')
    plt.plot(epochs_range, history['val_acc'], label='Validation Accuracy')
    plt.legend(loc='lower right')
    plt.title(f'Accuracy ({model_name})')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')

    # Loss plot
    plt.subplot(1, 2, 2)
    plt.plot(epochs_range, history['train_loss'], label='Training Loss')
    plt.plot(epochs_range, history['val_loss'], label='Validation Loss')
    plt.legend(loc='upper right')
    plt.title(f'Loss ({model_name})')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')

    plt.show()

In [None]:
# --- Main Execution ---

# -------------------------------------------------------------------
# Select Model: 'resnet', 'efficientnet', 'custom'
MODEL_NAME_TO_RUN = 'resnet'
# -------------------------------------------------------------------

# 1. Build the data loaders and keep the split sizes used to average the metrics
dataloaders, class_names, train_size, val_size = get_data_loaders(BASE_DIR, BATCH_SIZE, IMAGE_SIZE)
dataset_sizes = dict(train=train_size, val=val_size)

# 2. Create the model together with its loss and optimizer
model, criterion, optimizer = create_model(MODEL_NAME_TO_RUN, len(class_names), LR)

# 3. Train
model, history = train_model(model, criterion, optimizer, dataloaders, dataset_sizes, EPOCHS)

# 4. Plot the training curves
plot_history(history, MODEL_NAME_TO_RUN)

# 5. Evaluate on the validation loader
evaluate_model(model, dataloaders['val'], DEVICE, class_names)

# 6. Save the trained weights
checkpoint_path = f"ham10000_{MODEL_NAME_TO_RUN}.pth"
torch.save(model.state_dict(), checkpoint_path)
print(f"Model saved to {checkpoint_path}")

In [None]:
# Repeat the validation report for the model trained in the previous cell.
evaluate_model(model=model, dataloader=dataloaders['val'], device=DEVICE, class_names=class_names)

## EfficientNet-B0

In [None]:
# --- Main Execution ---

# -------------------------------------------------------------------
# Select Model: 'resnet', 'efficientnet', 'custom'
MODEL_NAME_TO_RUN = 'efficientnet'
# -------------------------------------------------------------------

# 1. Build the data loaders and keep the split sizes used to average the metrics
dataloaders, class_names, train_size, val_size = get_data_loaders(BASE_DIR, BATCH_SIZE, IMAGE_SIZE)
dataset_sizes = dict(train=train_size, val=val_size)

# 2. Create the model together with its loss and optimizer
model, criterion, optimizer = create_model(MODEL_NAME_TO_RUN, len(class_names), LR)

# 3. Train
model, history = train_model(model, criterion, optimizer, dataloaders, dataset_sizes, EPOCHS)

# 4. Plot the training curves
plot_history(history, MODEL_NAME_TO_RUN)

# 5. Evaluate on the validation loader
evaluate_model(model, dataloaders['val'], DEVICE, class_names)

# 6. Save the trained weights
checkpoint_path = f"ham10000_{MODEL_NAME_TO_RUN}.pth"
torch.save(model.state_dict(), checkpoint_path)
print(f"Model saved to {checkpoint_path}")

## Custom CNN

In [None]:
# --- Main Execution ---

# -------------------------------------------------------------------
# Select Model: 'resnet', 'efficientnet', 'custom'
MODEL_NAME_TO_RUN = 'custom'
# -------------------------------------------------------------------

# 1. Build the data loaders and keep the split sizes used to average the metrics
dataloaders, class_names, train_size, val_size = get_data_loaders(BASE_DIR, BATCH_SIZE, IMAGE_SIZE)
dataset_sizes = dict(train=train_size, val=val_size)

# 2. Create the model together with its loss and optimizer
model, criterion, optimizer = create_model(MODEL_NAME_TO_RUN, len(class_names), LR)

# 3. Train
model, history = train_model(model, criterion, optimizer, dataloaders, dataset_sizes, EPOCHS)

# 4. Plot the training curves
plot_history(history, MODEL_NAME_TO_RUN)

# 5. Evaluate on the validation loader
evaluate_model(model, dataloaders['val'], DEVICE, class_names)

# 6. Save the trained weights
checkpoint_path = f"ham10000_{MODEL_NAME_TO_RUN}.pth"
torch.save(model.state_dict(), checkpoint_path)
print(f"Model saved to {checkpoint_path}")