In [1]:
# TODO: Import required libraries
import os
import random
import shutil
import matplotlib.pyplot as plt


In [2]:
# ------------------------------------------
# Step 1: Dataset Overview
# ------------------------------------------

# Define dataset root directory (Windows path)
DATASET_ROOT = r"./CarDD_release/CarDD_release/CarDD_COCO"

TRAIN_IMAGES_DIR = os.path.join(DATASET_ROOT, "train2017")
VAL_IMAGES_DIR = os.path.join(DATASET_ROOT, "val2017")
ANNOTATIONS_DIR = os.path.join(DATASET_ROOT, "annotations")

# Inspect available images
train_images = os.listdir(TRAIN_IMAGES_DIR)
val_images = os.listdir(VAL_IMAGES_DIR)

# Inspect annotations
annotations_files = os.listdir(ANNOTATIONS_DIR)

# Print dataset statistics
print(" Dataset Root:", DATASET_ROOT)
print(" Training Images:", len(train_images))
print(" Validation Images:", len(val_images))
print(" Annotation Files:", annotations_files)





 Dataset Root: ./CarDD_release/CarDD_release/CarDD_COCO
 Training Images: 2816
 Validation Images: 810
 Annotation Files: ['image_info.xlsx', 'instances_test2017.json', 'instances_train2017.json', 'instances_val2017.json']


In [3]:
# ------------------------------------------
# Step 2: Subset Selection
# ------------------------------------------

# Number of samples to use
NUM_TRAIN_SAMPLES = 300
NUM_VAL_SAMPLES = 80

# Random seed for reproducibility
random.seed(42)

# Shuffle image lists
random.shuffle(train_images)
random.shuffle(val_images)

# Select subsets
train_subset = train_images[:NUM_TRAIN_SAMPLES]
val_subset = val_images[:NUM_VAL_SAMPLES]

# Store selected file names
print("Selected Training Samples:", len(train_subset))
print("Selected Validation Samples:", len(val_subset))

# Show a few sample names
print("\nSample training images:", train_subset[:5])
print("Sample validation images:", val_subset[:5])


Selected Training Samples: 300
Selected Validation Samples: 80

Sample training images: ['003501.jpg', '003207.jpg', '003104.jpg', '002649.jpg', '000349.jpg']
Sample validation images: ['001494.jpg', '002625.jpg', '000825.jpg', '003127.jpg', '000187.jpg']


In [4]:
# ------------------------------------------
# Step 3: Train / Validation Split
# ------------------------------------------

# Combine all selected images
all_selected_images = train_subset + val_subset

# Shuffle again before split
random.shuffle(all_selected_images)

# 80/20 split
split_ratio = 0.8
split_index = int(len(all_selected_images) * split_ratio)

final_train_images = all_selected_images[:split_index]
final_val_images = all_selected_images[split_index:]

print("Final Training Images:", len(final_train_images))
print("Final Validation Images:", len(final_val_images))

# Preview filenames
print("\nTrain sample:", final_train_images[:5])
print("Val sample:", final_val_images[:5])


Final Training Images: 304
Final Validation Images: 76

Train sample: ['003966.jpg', '003594.jpg', '002499.jpg', '002599.jpg', '002239.jpg']
Val sample: ['000460.jpg', '003207.jpg', '000455.jpg', '003709.jpg', '003180.jpg']


In [5]:
# ------------------------------------------
# Step 4: Folder Structure Creation
# ------------------------------------------

# Output dataset root
OUTPUT_ROOT = "yolo_dataset"

# Define folders
IMAGES_TRAIN_DIR = os.path.join(OUTPUT_ROOT, "images/train")
IMAGES_VAL_DIR = os.path.join(OUTPUT_ROOT, "images/val")
LABELS_TRAIN_DIR = os.path.join(OUTPUT_ROOT, "labels/train")
LABELS_VAL_DIR = os.path.join(OUTPUT_ROOT, "labels/val")

# Create directories
os.makedirs(IMAGES_TRAIN_DIR, exist_ok=True)
os.makedirs(IMAGES_VAL_DIR, exist_ok=True)
os.makedirs(LABELS_TRAIN_DIR, exist_ok=True)
os.makedirs(LABELS_VAL_DIR, exist_ok=True)

print("Created folder structure:")
print(IMAGES_TRAIN_DIR)
print(IMAGES_VAL_DIR)
print(LABELS_TRAIN_DIR)
print(LABELS_VAL_DIR)

# Create lookup sets
train_set = set(train_images)
val_set = set(val_images)

# Copy training images (from correct source)
for img_name in final_train_images:
    if img_name in train_set:
        src_path = os.path.join(TRAIN_IMAGES_DIR, img_name)
    else:
        src_path = os.path.join(VAL_IMAGES_DIR, img_name)

    dst_path = os.path.join(IMAGES_TRAIN_DIR, img_name)
    shutil.copy(src_path, dst_path)

# Copy validation images (from correct source)
for img_name in final_val_images:
    if img_name in train_set:
        src_path = os.path.join(TRAIN_IMAGES_DIR, img_name)
    else:
        src_path = os.path.join(VAL_IMAGES_DIR, img_name)

    dst_path = os.path.join(IMAGES_VAL_DIR, img_name)
    shutil.copy(src_path, dst_path)

print("Images copied successfully!")




Created folder structure:
yolo_dataset\images/train
yolo_dataset\images/val
yolo_dataset\labels/train
yolo_dataset\labels/val
Images copied successfully!


In [16]:
# ------------------------------------------
# Reflection
# ------------------------------------------

"""
Why is data cleanliness important for training?

Data cleanliness is important because the model learns directly from the data.
If the dataset contains noisy, duplicated, corrupted, or wrongly labeled samples,
the model will learn incorrect patterns and perform poorly in real-world scenarios.
Clean data leads to stable training, faster convergence, and better generalization.

In object detection, even small errors in labels can significantly affect model accuracy.


What could go wrong with bad annotations?

Bad annotations can cause:
1. Incorrect bounding boxes, leading to poor object localization and low IoU.
2. Wrong class labels, causing class confusion and misclassification.
3. Missing annotations, which reduce recall because damaged regions are treated as background.
4. Loose or oversized boxes, which include background and reduce detection precision.

A detection model is only as good as its annotations.
"""


'\nWhy is data cleanliness important for training?\n\nData cleanliness is important because the model learns directly from the data.\nIf the dataset contains noisy, duplicated, corrupted, or wrongly labeled samples,\nthe model will learn incorrect patterns and perform poorly in real-world scenarios.\nClean data leads to stable training, faster convergence, and better generalization.\n\nIn object detection, even small errors in labels can significantly affect model accuracy.\n\n\nWhat could go wrong with bad annotations?\n\nBad annotations can cause:\n1. Incorrect bounding boxes, leading to poor object localization and low IoU.\n2. Wrong class labels, causing class confusion and misclassification.\n3. Missing annotations, which reduce recall because damaged regions are treated as background.\n4. Loose or oversized boxes, which include background and reduce detection precision.\n\nA detection model is only as good as its annotations.\n'