In [10]:
import torch
import sys

# Report the interpreter and PyTorch versions first, then the best available
# compute backend.
print(f"Python: {sys.version}")
print(f"PyTorch: {torch.__version__}")

# Backend preference: CUDA, then Apple MPS, then plain CPU. MPS is only probed
# when CUDA is unavailable.
if torch.cuda.is_available():
    device_label = 'CUDA'
elif torch.backends.mps.is_available():
    device_label = 'MPS'
else:
    device_label = 'CPU'
print(f"Device: {device_label}")

Python: 3.11.14 | packaged by conda-forge | (main, Oct 22 2025, 22:56:31) [Clang 19.1.7 ]
PyTorch: 2.5.1
Device: MPS


# Dataset Preparation

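Process the local dental X-ray dataset in the `data/raw/` directory: pair each image with its label file, split the pairs into train/val/test sets, and write the result to `data/processed/`.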


In [11]:
import shutil
from pathlib import Path
from sklearn.model_selection import train_test_split
import yaml

# Resolve the project root: when the kernel's working directory is named
# `notebooks`, the root is its parent; otherwise the working directory itself
# is used as the root.
project_root = Path('..').resolve() if Path.cwd().name == 'notebooks' else Path('.').resolve()
raw_dir = project_root / 'data' / 'raw'
processed_dir = project_root / 'data' / 'processed'

# Fail fast BEFORE the destructive step below. If the kernel was started from
# an unexpected directory, `project_root` is wrong; without this check the cell
# would wipe `data/processed` and only fail later, when no raw data is found.
if not raw_dir.is_dir():
    raise FileNotFoundError(
        f"Raw data directory not found: {raw_dir}. "
        "Start the kernel from the project root or from its 'notebooks' folder."
    )

# Rebuild the processed directory from scratch so files from a previous split
# cannot linger alongside the new one.
if processed_dir.exists():
    shutil.rmtree(processed_dir)
processed_dir.mkdir(parents=True, exist_ok=True)

print(f"✓ Project root: {project_root}")
print(f"✓ Raw data: {raw_dir}")
print(f"✓ Processed data: {processed_dir}")

✓ Project root: /Users/richa/Documents/Deep Learning Project/Dental-X-Ray-Cavity-Detection
✓ Raw data: /Users/richa/Documents/Deep Learning Project/Dental-X-Ray-Cavity-Detection/data/raw
✓ Processed data: /Users/richa/Documents/Deep Learning Project/Dental-X-Ray-Cavity-Detection/data/processed


In [12]:
# Locate the raw images and their label files (`<image stem>.txt`).
images_dir = raw_dir / 'images'
labels_dir = raw_dir / 'object_detection_labels'

# A missing input directory should be reported as such, not as "0 images".
for required_dir in (images_dir, labels_dir):
    if not required_dir.is_dir():
        raise FileNotFoundError(f"Expected directory not found: {required_dir}")

# Collect image files, matching the extension case-insensitively so that files
# such as `scan.PNG` or `scan.JPG` are not silently skipped. Sorting keeps the
# input order (and therefore the seeded split below) deterministic.
IMAGE_EXTENSIONS = {'.png', '.jpg', '.jpeg'}
image_files = sorted(
    p for p in images_dir.iterdir()
    if p.is_file() and p.suffix.lower() in IMAGE_EXTENSIONS
)

print(f"Found {len(image_files)} images")

# Keep only images that have a label file with the same stem.
valid_images = [
    img_path for img_path in image_files
    if (labels_dir / f"{img_path.stem}.txt").exists()
]

print(f"Found {len(valid_images)} images with labels")

# Without this guard an empty dataset only surfaces as an error raised from
# inside train_test_split, which does not point at the real cause.
# NOTE(review): very small datasets can still be rejected by train_test_split
# when one of the two splits would end up empty.
if not valid_images:
    raise ValueError(
        f"No labelled images found: looked for images in {images_dir} "
        f"with matching .txt labels in {labels_dir}"
    )

# Split dataset: roughly 80% train, 10% val, 10% test. The 20% hold-out is cut
# in half; the fixed random_state makes both splits reproducible.
train_imgs, temp_imgs = train_test_split(valid_images, test_size=0.2, random_state=42)
val_imgs, test_imgs = train_test_split(temp_imgs, test_size=0.5, random_state=42)

print("\nSplit:")
print(f"  Train: {len(train_imgs)} images")
print(f"  Val:   {len(val_imgs)} images")
print(f"  Test:  {len(test_imgs)} images")

Found 64 images
Found 64 images with labels

Split:
  Train: 51 images
  Val:   6 images
  Test:  7 images


In [13]:
# Materialise each split on disk as <processed_dir>/<split>/{images,labels}
# and copy every image together with its same-stem label file.
for split_name, split_images in (('train', train_imgs), ('val', val_imgs), ('test', test_imgs)):
    split_dir = processed_dir / split_name
    images_out = split_dir / 'images'
    labels_out = split_dir / 'labels'
    images_out.mkdir(parents=True, exist_ok=True)
    labels_out.mkdir(parents=True, exist_ok=True)

    for img_path in split_images:
        # Image keeps its original file name.
        shutil.copy(img_path, images_out / img_path.name)

        # Label is looked up by image stem; a missing label is skipped silently.
        label_path = labels_dir / f"{img_path.stem}.txt"
        if not label_path.exists():
            continue
        shutil.copy(label_path, labels_out / label_path.name)

    print(f"✓ {split_name}: copied {len(split_images)} images + labels")

print("\n✓ Dataset split complete!")

✓ train: copied 51 images + labels
✓ val: copied 6 images + labels
✓ test: copied 7 images + labels

✓ Dataset split complete!


In [14]:
# Describe the processed dataset (absolute root path, per-split image folders
# relative to that root, class count and class names) and save the description
# as dataset.yaml inside the processed directory.
# NOTE(review): the class names look like placeholders — confirm what each
# class id actually means in the label files.
class_names = ['cavity_class_0', 'cavity_class_1', 'cavity_class_2', 'cavity_class_3']
dataset_config = dict(
    path=str(processed_dir.resolve()),
    train='train/images',
    val='val/images',
    test='test/images',
    nc=len(class_names),
    names=class_names,
)

yaml_path = processed_dir / 'dataset.yaml'
# sort_keys=False keeps the keys in the order they were declared above.
with yaml_path.open('w') as yaml_file:
    yaml.dump(dataset_config, yaml_file, sort_keys=False)

print(f"✓ Created {yaml_path}")
print("\nDataset configuration:")
print(f"  Path: {dataset_config['path']}")
print(f"  Classes: {dataset_config['nc']}")
for split_label, split_items in (('Train', train_imgs), ('Val', val_imgs), ('Test', test_imgs)):
    print(f"  {split_label}: {len(split_items)} images")

✓ Created /Users/richa/Documents/Deep Learning Project/Dental-X-Ray-Cavity-Detection/data/processed/dataset.yaml

Dataset configuration:
  Path: /Users/richa/Documents/Deep Learning Project/Dental-X-Ray-Cavity-Detection/data/processed
  Classes: 4
  Train: 51 images
  Val: 6 images
  Test: 7 images


## Verification


In [15]:
# Verify dataset structure: for every split, count images and labels and check
# that they pair up by file stem. The dataset is only reported as ready when
# every split is consistent AND dataset.yaml exists.
print("Dataset structure verification:\n")

# Only real image files are counted, so stray files in an images/ folder
# (e.g. OS metadata files) cannot distort the numbers.
image_suffixes = {'.png', '.jpg', '.jpeg'}
problems = []

for split in ['train', 'val', 'test']:
    image_paths = [
        p for p in (processed_dir / split / 'images').glob('*')
        if p.suffix.lower() in image_suffixes
    ]
    label_paths = list((processed_dir / split / 'labels').glob('*.txt'))
    img_count = len(image_paths)
    label_count = len(label_paths)
    print(f"{split:5s}: {img_count:2d} images, {label_count:2d} labels")

    # Equal counts alone do not prove correct pairing, so compare the stems.
    image_stems = {p.stem for p in image_paths}
    label_stems = {p.stem for p in label_paths}
    unlabeled = sorted(image_stems - label_stems)
    orphaned = sorted(label_stems - image_stems)

    if img_count == 0:
        problems.append(f"{split}: no images found")
    if unlabeled:
        problems.append(
            f"{split}: {len(unlabeled)} image(s) without a label file (e.g. {unlabeled[0]})"
        )
    if orphaned:
        problems.append(
            f"{split}: {len(orphaned)} label file(s) without an image (e.g. {orphaned[0]})"
        )

# Check dataset.yaml exists
yaml_ok = yaml_path.exists()
if yaml_ok:
    print(f"\n✓ {yaml_path.name} exists")
else:
    print(f"\n✗ {yaml_path.name} not found")

if problems:
    print("\n✗ Dataset structure problems:")
    for problem in problems:
        print(f"  - {problem}")
elif yaml_ok:
    print("\n✓ Dataset ready for training!")

Dataset structure verification:

train: 51 images, 51 labels
val  :  6 images,  6 labels
test :  7 images,  7 labels

✓ dataset.yaml exists

✓ Dataset ready for training!


## ✅ Complete

Next: Train models in `02_train_yolov8.ipynb` and `03_train_yolov12.ipynb`
