# Dataset Preparation

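Process the local dental X-ray dataset from the `data/raw/` directory: keep the images that have a matching label file, split them into train/val/test sets under `data/processed/`, and write the YOLO `dataset.yaml` config.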


In [7]:
import shutil
from pathlib import Path
from sklearn.model_selection import train_test_split
import yaml

# Locate the project root: when the kernel's working directory is `notebooks/`
# the root is its parent; otherwise the working directory itself is used.
project_root = Path('..').resolve() if Path.cwd().name == 'notebooks' else Path('.').resolve()
raw_dir = project_root / 'data' / 'raw'
processed_dir = project_root / 'data' / 'processed'

# Fail fast BEFORE anything is deleted: if the raw data is not where we expect,
# the project root was most likely misdetected, and wiping/creating
# `data/processed` at that location would be destructive and pointless.
if not raw_dir.is_dir():
    raise FileNotFoundError(
        f"Raw data directory not found: {raw_dir}. "
        "Run this notebook from the project root or from its 'notebooks/' folder."
    )

# Start from a clean slate so files from an earlier split cannot leak into this one
if processed_dir.exists():
    shutil.rmtree(processed_dir)

# Recreate the (now empty) output directory
processed_dir.mkdir(parents=True, exist_ok=True)

print(f"✓ Project root: {project_root}")
print(f"✓ Raw data: {raw_dir}")
print(f"✓ Processed data: {processed_dir}")

✓ Project root: /Users/theatulgupta/Desktop/Deep Learning Project/dental-xray-cavity-detection
✓ Raw data: /Users/theatulgupta/Desktop/Deep Learning Project/dental-xray-cavity-detection/data/raw
✓ Processed data: /Users/theatulgupta/Desktop/Deep Learning Project/dental-xray-cavity-detection/data/processed


In [8]:
# Locations of the raw images and their object-detection label files
images_dir = raw_dir / 'images'
labels_dir = raw_dir / 'object_detection_labels'

# Fail early with a clear message instead of an obscure error further down
for required_dir in (images_dir, labels_dir):
    if not required_dir.is_dir():
        raise FileNotFoundError(f"Required directory not found: {required_dir}")

# Collect image files by extension, case-insensitively, so that e.g. `.PNG` or
# `.JPG` files are not silently skipped (glob patterns are case-sensitive on
# POSIX). Sorting keeps the input order deterministic, which the seeded split
# below relies on for reproducibility.
image_extensions = {'.png', '.jpg', '.jpeg'}
image_files = sorted(
    path for path in images_dir.iterdir()
    if path.is_file() and path.suffix.lower() in image_extensions
)

print(f"Found {len(image_files)} images")

# Keep only the images that have a same-stem `.txt` file in the labels directory
valid_images = [
    img_path for img_path in image_files
    if (labels_dir / f"{img_path.stem}.txt").exists()
]

print(f"Found {len(valid_images)} images with labels")

# Make dropped images visible rather than discarding them silently
unlabeled_count = len(image_files) - len(valid_images)
if unlabeled_count:
    print(f"  Skipped {unlabeled_count} images without a label file")

# Nothing to split: raise a readable error instead of letting the splitter fail
if not valid_images:
    raise ValueError(
        f"No labelled images found: images in {images_dir}, labels expected in {labels_dir}"
    )

# Split dataset: roughly 80% train, 10% val, 10% test (sizes are rounded to
# whole images). The fixed random_state makes the split reproducible.
train_imgs, temp_imgs = train_test_split(valid_images, test_size=0.2, random_state=42)
val_imgs, test_imgs = train_test_split(temp_imgs, test_size=0.5, random_state=42)

print(f"\nSplit:")
print(f"  Train: {len(train_imgs)} images")
print(f"  Val:   {len(val_imgs)} images")
print(f"  Test:  {len(test_imgs)} images")

Found 64 images
Found 64 images with labels

Split:
  Train: 51 images
  Val:   6 images
  Test:  7 images


In [9]:
# Materialise each split on disk as <processed_dir>/<split>/{images,labels}
splits = {'train': train_imgs, 'val': val_imgs, 'test': test_imgs}

for split_name, split_images in splits.items():
    # Output folders for this split (train / val / test)
    images_out = processed_dir / split_name / 'images'
    labels_out = processed_dir / split_name / 'labels'
    for out_dir in (images_out, labels_out):
        out_dir.mkdir(parents=True, exist_ok=True)

    for src_image in split_images:
        # Image keeps its original file name
        shutil.copy(src_image, images_out / src_image.name)

        # The label shares the image's stem; it is copied only when present
        src_label = labels_dir / f"{src_image.stem}.txt"
        if src_label.exists():
            shutil.copy(src_label, labels_out / src_label.name)

    print(f"✓ {split_name}: copied {len(split_images)} images + labels")

print("\n✓ Dataset split complete!")

✓ train: copied 51 images + labels
✓ val: copied 6 images + labels
✓ test: copied 7 images + labels

✓ Dataset split complete!


In [10]:
# Class names, listed in class-index order.
# NOTE(review): these look like placeholder names — confirm the real class
# meanings (and that the label files only use indices 0..3) against the dataset.
class_names = ['cavity_class_0', 'cavity_class_1', 'cavity_class_2', 'cavity_class_3']

# Create dataset.yaml -> YOLO config
# Split paths are relative to 'path', which points at the processed directory.
dataset_config = {
    'path': str(processed_dir.resolve()),
    'train': 'train/images',
    'val': 'val/images',
    'test': 'test/images',
    # Derived from the list so the class count can never drift from the names
    'nc': len(class_names),
    'names': class_names,
}

# Write the YAML file with keys in the order defined above (sort_keys=False);
# the encoding is explicit so the file does not depend on the platform default.
yaml_path = processed_dir / 'dataset.yaml'
with open(yaml_path, 'w', encoding='utf-8') as f:
    yaml.dump(dataset_config, f, sort_keys=False)

print(f"✓ Created {yaml_path}")
print(f"\nDataset configuration:")
print(f"  Path: {dataset_config['path']}")
print(f"  Classes: {dataset_config['nc']}")
print(f"  Train: {len(train_imgs)} images")
print(f"  Val: {len(val_imgs)} images")
print(f"  Test: {len(test_imgs)} images")

# Final sanity check that the config file is on disk before moving on to training
if yaml_path.exists():
    print(f"\n✓ {yaml_path.name} exists")
    print("\n✓ Dataset ready for training!")
else:
    print(f"\n✗ {yaml_path.name} not found")

✓ Created /Users/theatulgupta/Desktop/Deep Learning Project/dental-xray-cavity-detection/data/processed/dataset.yaml

Dataset configuration:
  Path: /Users/theatulgupta/Desktop/Deep Learning Project/dental-xray-cavity-detection/data/processed
  Classes: 4
  Train: 51 images
  Val: 6 images
  Test: 7 images

✓ dataset.yaml exists

✓ Dataset ready for training!


### Next steps

Train models in `02_train_yolov8.ipynb` and `03_train_yolov12.ipynb`
