In [6]:
# Diagnostics: run this after selecting the kernel "Python (dental-xray)"
import sys, platform
print("Python:", sys.version)
print("Executable:", sys.executable)
print("Platform:", platform.platform())
try:
    from ultralytics import YOLO
    import torch, numpy as np
    print("ultralytics, torch, numpy OK")
    print("Torch:", torch.__version__)
except Exception as e:
    print("Import error:", e)

Python: 3.13.5 | packaged by Anaconda, Inc. | (main, Jun 12 2025, 11:23:37) [Clang 14.0.6 ]
Executable: /Users/theatulgupta/Desktop/Deep Learning Project/dental-xray-cavity-detection/.venv/bin/python
Platform: macOS-26.0.1-arm64-arm-64bit-Mach-O
ultralytics, torch, numpy OK
Torch: 2.9.0


# Dataset Preparation for Dental X-ray Cavity Detection

## Overview

This notebook prepares the dataset for training YOLO models. It handles:

1. **Data Collection**: Gathers all X-ray images from raw data folder
2. **Dataset Splitting**: Divides images into 70% train, 15% validation, 15% test
3. **YOLO Format**: Ensures labels are in YOLO format (normalized bounding boxes)
4. **Configuration**: Creates `dataset.yaml` file for YOLO training

## Dataset Structure

After running this notebook, you'll have:

```
data/processed/
├── train/
│   ├── images/     # Training X-ray images (70%)
│   └── labels/     # Training YOLO labels (.txt)
├── val/
│   ├── images/     # Validation X-ray images (15%)
│   └── labels/     # Validation YOLO labels (.txt)
├── test/
│   ├── images/     # Test X-ray images (15%)
│   └── labels/     # Test YOLO labels (.txt)
└── dataset.yaml    # YOLO configuration file
```

## Cavity Classes

Our dataset has **4 cavity severity levels**:

- **Class 0**: cavity_class_0 (minimal)
- **Class 1**: cavity_class_1 (mild)
- **Class 2**: cavity_class_2 (moderate)
- **Class 3**: cavity_class_3 (severe)


In [7]:
# Install required dependencies (uncomment if running for the first time)
# !pip install -q ultralytics==8.2.0 opencv-python pillow matplotlib numpy torch torchvision

In [8]:
# ============================================================
# IMPORT LIBRARIES AND SETUP PATHS
# ============================================================

# Import necessary libraries
from ultralytics import YOLO  # For YOLO object detection model
import os, shutil, glob, random, json, math  # Standard Python libraries
from pathlib import Path  # For handling file paths in a cross-platform way
from PIL import Image  # For image processing

# Detect project root correctly when running from notebooks/ folder
# This ensures paths work whether you run from notebooks/ or project root
project_root = (Path('..').resolve() if Path.cwd().name == 'notebooks' else Path('.').resolve())

# Define all important paths for our project
data_raw = project_root / 'data' / 'raw' / 'dental'  # Where raw X-ray images are stored
images_glob = ['*.jpg','*.jpeg','*.png','*.bmp']  # Image file extensions to look for
processed = project_root / 'data' / 'processed'  # Where processed dataset will be saved

# Create separate folders for training, validation, and test data
train_images = processed / 'train' / 'images'  # Training images folder
train_labels = processed / 'train' / 'labels'  # Training labels folder
val_images = processed / 'val' / 'images'  # Validation images folder
val_labels = processed / 'val' / 'labels'  # Validation labels folder
test_images = processed / 'test' / 'images'  # Test images folder
test_labels = processed / 'test' / 'labels'  # Test labels folder

# Create folder for saving trained models
models_dir = project_root / 'models'
models_dir.mkdir(parents=True, exist_ok=True)  # Create if doesn't exist

# Create all required directories
for d in [train_images, train_labels, val_images, val_labels, test_images, test_labels]:
    d.mkdir(parents=True, exist_ok=True)

# Print paths to verify everything is set up correctly
print('Project root:', project_root)
print('Raw data location:', data_raw)

Project root: /Users/theatulgupta/Desktop/Deep Learning Project/dental-xray-cavity-detection
Raw data location: /Users/theatulgupta/Desktop/Deep Learning Project/dental-xray-cavity-detection/data/raw/dental


## Step 1: Split Dataset into Train, Validation, and Test Sets

This section:

- Searches for all X-ray images in the raw data folder
- Locates corresponding YOLO labels (if available)
- Randomly splits images into 70% train, 15% validation, 15% test
- Copies images and labels to appropriate folders


In [9]:
# Step 1.1: Find all images in the raw data directory
# We check both 'images' subfolder and root of 'dental' folder
img_dir_candidates = [data_raw / 'images', data_raw]
all_imgs = []  # List to store all found image paths

# Loop through candidate directories and find all images
for d in img_dir_candidates:
    if d.exists():  # Check if directory exists
        for pat in images_glob:  # Check each image extension
            all_imgs += list(d.rglob(pat))  # Recursively find all matching files

all_imgs = sorted(set(all_imgs))  # Remove duplicates and sort
print('Total images found:', len(all_imgs))

# Step 1.2: Locate YOLO label files (if they exist)
# Labels are typically in 'labels' or 'object_detection_labels' folder
possible_label_dirs = [data_raw / 'labels', data_raw / 'object_detection_labels']
label_root = None  # Variable to store the label directory path

for cand in possible_label_dirs:
    if cand.exists():
        label_root = cand
        break  # Stop when we find the first valid label directory

print('Label directory:', label_root)

# Step 1.3: Perform train/val/test split (70/15/15)
random.seed(42)  # Set random seed for reproducibility (same split every time)
random.shuffle(all_imgs)  # Randomly shuffle the images

# Calculate split points
train_split = int(0.70 * len(all_imgs))  # 70% for training
val_split = int(0.85 * len(all_imgs))    # Next 15% for validation (70% + 15% = 85%)

# Split the data
train_list = all_imgs[:train_split]           # First 70% for training
val_list = all_imgs[train_split:val_split]    # Next 15% for validation
test_list = all_imgs[val_split:]              # Remaining 15% for testing

print(f'Training: {len(train_list)} | Validation: {len(val_list)} | Test: {len(test_list)}')

# Step 1.4: Define helper function to copy images and their labels
def move_with_label(img_paths, out_img_dir, out_lbl_dir):
    """
    Copies images and their corresponding label files to output directories.
    
    Args:
        img_paths: List of image file paths to copy
        out_img_dir: Destination directory for images
        out_lbl_dir: Destination directory for labels
    """
    for ip in img_paths:
        rel = ip.stem  # Get filename without extension (e.g., 'p1' from 'p1.png')
        
        # Copy the image file
        dst_img = out_img_dir / ip.name
        shutil.copy2(ip, dst_img)
        
        # Copy the corresponding label file if it exists
        if label_root is not None:
            cand = label_root / f'{rel}.txt'  # Look for .txt file with same name
            if cand.exists():
                shutil.copy2(cand, out_lbl_dir / cand.name)

# Step 1.5: Clear any previous data from processed folders
for d in [train_images, train_labels, val_images, val_labels, test_images, test_labels]:
    for p in d.glob('*'):
        if p.is_file(): 
            p.unlink()  # Delete the file

# Step 1.6: Copy images and labels to train/val/test folders
move_with_label(train_list, train_images, train_labels)
move_with_label(val_list, val_images, val_labels)
move_with_label(test_list, test_images, test_labels)

# Step 1.7: Verify the dataset was created successfully
print('\n✅ Dataset preparation complete!')
print('Processed dataset location:', processed)
print(f'Training: {len(list(train_images.glob("*")))} images, {len(list(train_labels.glob("*.txt")))} labels')
print(f'Validation: {len(list(val_images.glob("*")))} images, {len(list(val_labels.glob("*.txt")))} labels')
print(f'Test: {len(list(test_images.glob("*")))} images, {len(list(test_labels.glob("*.txt")))} labels')

Total images found: 128
Label directory: /Users/theatulgupta/Desktop/Deep Learning Project/dental-xray-cavity-detection/data/raw/dental/object_detection_labels
Training: 89 | Validation: 19 | Test: 20

✅ Dataset preparation complete!
Processed dataset location: /Users/theatulgupta/Desktop/Deep Learning Project/dental-xray-cavity-detection/data/processed
Training: 58 images, 58 labels
Validation: 17 images, 17 labels
Test: 19 images, 19 labels

✅ Dataset preparation complete!
Processed dataset location: /Users/theatulgupta/Desktop/Deep Learning Project/dental-xray-cavity-detection/data/processed
Training: 58 images, 58 labels
Validation: 17 images, 17 labels
Test: 19 images, 19 labels


## Step 2: Create dataset.yaml Configuration File

YOLO requires a YAML configuration file that specifies:

- Path to training images
- Path to validation images
- Path to test images
- Number of classes (nc)
- Class names

Our dataset has **4 cavity severity classes**: 0, 1, 2, 3


In [10]:
# Create the dataset.yaml configuration file
dataset_yaml = processed / 'dataset.yaml'

# Write the configuration
# - train: absolute path to training images folder
# - val: absolute path to validation images folder
# - test: absolute path to test images folder
# - nc: number of classes (4 for our cavity severity levels)
# - names: list of class names corresponding to class IDs 0, 1, 2, 3
dataset_yaml.write_text(
    f"train: {train_images.resolve()}\n"  # Absolute path to training images
    f"val: {val_images.resolve()}\n"  # Absolute path to validation images
    f"test: {test_images.resolve()}\n"  # Absolute path to test images
    'nc: 4\n'  # Number of classes (0, 1, 2, 3)
    'names: [cavity_class_0, cavity_class_1, cavity_class_2, cavity_class_3]\n'  # Class names
)

# Display the created configuration file
print('✅ dataset.yaml created successfully!\n')
print('Configuration:')
print(dataset_yaml.read_text())

✅ dataset.yaml created successfully!

Configuration:
train: /Users/theatulgupta/Desktop/Deep Learning Project/dental-xray-cavity-detection/data/processed/train/images
val: /Users/theatulgupta/Desktop/Deep Learning Project/dental-xray-cavity-detection/data/processed/val/images
test: /Users/theatulgupta/Desktop/Deep Learning Project/dental-xray-cavity-detection/data/processed/test/images
nc: 4
names: [cavity_class_0, cavity_class_1, cavity_class_2, cavity_class_3]



## ✅ Dataset Preparation Complete!

You can now proceed to train models:

- **YOLOv8**: Open `02_train_yolov8.ipynb`
- **YOLOv12**: Open `03_train_yolov12.ipynb`
