# Dental X-Ray Dataset Exploration

**Purpose:** Explore and analyze the dental X-ray cavity detection dataset: sample images, class distribution, and bounding-box statistics.


In [None]:
import os
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import cv2


# Make the "src" directory under the parent of the current working directory importable.
src_dir = Path().resolve().parent / "src"
sys.path.append(str(src_dir))

# Global plot appearance; the palette is set after the style, as in the original order.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")


## 1. Dataset Overview


In [None]:
# Dataset layout: PNG images and .txt label files live in sibling sub-folders.
RAW_DATA_DIR = Path("../data/raw/dental")
IMAGES_DIR = RAW_DATA_DIR.joinpath("images")
LABELS_DIR = RAW_DATA_DIR.joinpath("object_detection_labels")

# sorted() consumes the glob generator directly and returns a list.
images = sorted(IMAGES_DIR.glob("*.png"))
labels = sorted(LABELS_DIR.glob("*.txt"))

print(
    f"Total images: {len(images)}",
    f"Total labels: {len(labels)}",
    f"\nDataset location: {RAW_DATA_DIR}",
    sep="\n",
)


## 2. Visualize Sample Images


In [None]:
# Show a fixed spread of samples in a 2x3 grid.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

# Positions (in the sorted image list) of the samples to display.
sample_indices = [0, 10, 20, 30, 40, 50]

for sample_idx, ax in zip(sample_indices, axes):
    # Bounds-check the index actually used for the lookup; panels whose
    # sample index is past the end of the dataset are left blank.
    if sample_idx < len(images):
        img_path = images[sample_idx]
        # The context manager closes the file handle once the pixels are drawn.
        with Image.open(img_path) as img:
            ax.imshow(img, cmap='gray')
        ax.set_title(f"Sample {sample_idx}: {img_path.name}")
    ax.axis('off')

plt.tight_layout()
plt.show()

if images:
    with Image.open(images[0]) as first_img:
        print(f"Image dimensions: {first_img.size}")
else:
    print("Image dimensions: n/a (no images found)")


## 3. Analyze Annotations


In [None]:
# Parse every label file into one row per annotation.
# Each non-blank line is read as: class x_center y_center width height.
annotation_columns = ['file', 'class', 'x_center', 'y_center', 'width', 'height']
annotations_data = []

for label_file in labels:
    with open(label_file, 'r') as f:
        for line_no, line in enumerate(f, start=1):
            parts = line.split()
            if not parts:
                continue  # blank line
            if len(parts) < 5:
                # Fail with the offending location instead of a bare IndexError.
                raise ValueError(
                    f"{label_file}:{line_no}: expected 5 fields, got {len(parts)}"
                )
            annotations_data.append({
                'file': label_file.name,
                'class': int(parts[0]),
                'x_center': float(parts[1]),
                'y_center': float(parts[2]),
                'width': float(parts[3]),
                'height': float(parts[4]),
            })

# Passing the columns explicitly keeps df['class'] etc. available even when
# no annotations were found (an empty list would otherwise give no columns).
df = pd.DataFrame(annotations_data, columns=annotation_columns)
print(f"Total annotations: {len(df)}")
print(f"\nClass distribution:")
print(df['class'].value_counts().sort_index())


In [None]:
# Class balance shown two ways: absolute counts (bar) and shares (pie).
class_counts = df['class'].value_counts().sort_index()
class_ids = class_counts.index

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: instances per class id.
ax1.bar(class_ids, class_counts.values, color=['red', 'green', 'blue', 'orange'])
ax1.set(xlabel='Class ID', ylabel='Number of Instances', title='Class Distribution')
ax1.set_xticks(class_ids)

# Right panel: percentage share per class id.
pie_labels = [f'Class {i}' for i in class_ids]
ax2.pie(class_counts.values, labels=pie_labels, autopct='%1.1f%%', startangle=90)
ax2.set_title('Class Distribution (%)')

plt.tight_layout()
plt.show()


## 4. Bounding Box Statistics


In [None]:
# Box area is the product of the width and height columns.
df['area'] = df['width'] * df['height']

print("Bounding Box Statistics:")
print(df[['width', 'height', 'area']].describe())

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# (column, word used in the axis label and title, bar color).
# None leaves the color choice to matplotlib's default cycle.
hist_specs = [
    ('width', 'Width', None),
    ('height', 'Height', 'green'),
    ('area', 'Area', 'orange'),
]

for ax, (column, label, color) in zip(axes, hist_specs):
    ax.hist(df[column], bins=30, edgecolor='black', color=color)
    ax.set_xlabel(f'{label} (normalized)')
    ax.set_ylabel('Frequency')
    ax.set_title(f'Box {label} Distribution')

plt.tight_layout()
plt.show()


## 5. Visualize Annotations on Images


In [None]:
def visualize_boxes(image_path, label_path, ax):
    """Draw an image's bounding boxes and show the result on ``ax``.

    Each non-blank line of the label file must hold five whitespace-separated
    numbers: ``class_id x_center y_center width height``. The box values are
    scaled by the image width/height to obtain pixel coordinates.

    Args:
        image_path: Path of the image to load with OpenCV.
        label_path: Path of the label file. If the file does not exist, the
            image is shown without any boxes.
        ax: Matplotlib axes on which the annotated image is displayed.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path``.
        ValueError: If a label line does not contain exactly five numbers.
    """
    img = cv2.imread(str(image_path))
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise FileNotFoundError(f"Could not read image: {image_path}")
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    h, w = img.shape[:2]

    # RGB colors indexed by class id; ids beyond the palette wrap around.
    palette = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 165, 0)]

    try:
        with open(label_path, 'r') as f:
            label_lines = [line for line in f if line.strip()]
    except FileNotFoundError:
        # No label file for this image: nothing to draw.
        label_lines = []

    for line in label_lines:
        class_id, x_center, y_center, width, height = map(float, line.split())
        class_id = int(class_id)

        # Convert center/size fractions to pixel corner coordinates.
        x1 = int((x_center - width/2) * w)
        y1 = int((y_center - height/2) * h)
        x2 = int((x_center + width/2) * w)
        y2 = int((y_center + height/2) * h)

        color = palette[class_id % len(palette)]
        cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)
        # The text origin is its bottom-left corner; clamp it so the tag of a
        # box touching the top edge is still drawn inside the image.
        cv2.putText(img, f"C{class_id}", (x1, max(y1 - 10, 15)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

    ax.imshow(img)
    ax.axis('off')


# Show the same sample positions again, this time with their boxes drawn.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

for sample_idx, ax in zip(sample_indices, axes):
    # Bounds-check the index actually used for the lookup; panels whose
    # sample index is past the end of the dataset are left blank.
    if sample_idx >= len(images):
        ax.axis('off')
        continue
    img_path = images[sample_idx]
    # The label file shares the image's stem.
    label_path = LABELS_DIR / f"{img_path.stem}.txt"
    visualize_boxes(img_path, label_path, ax)
    ax.set_title(f"Sample {sample_idx}: {img_path.name}")

plt.tight_layout()
plt.show()


## 6. Summary Statistics


In [None]:
# Final recap of the figures computed above.
num_images = len(images)
num_annotations = len(df)

# Count each class once instead of rescanning the column per class.
# An annotation-free frame may have no 'class' column at all.
if 'class' in df.columns:
    summary_counts = df['class'].value_counts().sort_index()
else:
    summary_counts = {}

# Avoid ZeroDivisionError when no PNG images were found.
avg_per_image = num_annotations / num_images if num_images else 0.0

print("=" * 60)
print("DATASET SUMMARY")
print("=" * 60)
print(f"Total Images: {num_images}")
print(f"Total Annotations: {num_annotations}")
print(f"Number of Classes: {len(summary_counts)}")
print(f"Average Annotations per Image: {avg_per_image:.2f}")
print(f"\nClass Distribution:")
for cls, count in summary_counts.items():
    print(f"  Class {cls}: {count} instances ({count/num_annotations*100:.1f}%)")
print("\n" + "=" * 60)
