In [None]:
from google.colab import drive
# Attach the user's Google Drive under /content/drive so the dataset folders
# referenced by the following cells are reachable as ordinary paths.
# NOTE(review): force_remount=True presumably re-mounts even if the drive is
# already mounted in this session — confirm against the Colab documentation.
drive.mount(
    "/content/drive",
    force_remount=True,
)

Mounted at /content/drive


In [None]:
import os
import shutil

# Locations of the validation images and their label files on the mounted Drive.
images_folder = "/content/drive/MyDrive/kaggle_dataset/val/images"
labels_folder = "/content/drive/MyDrive/kaggle_dataset/val/labels"

# Images that have no label file are moved here instead of being deleted,
# so the clean-up can be undone by moving them back.
backup_folder = "/content/drive/MyDrive/kaggle_dataset/val/images_without_labels"
os.makedirs(backup_folder, exist_ok=True)

# Base names are obtained by cutting off ONLY the trailing extension.
# str.replace(".jpg", "") would delete every occurrence of ".jpg" in the name
# (e.g. "a.jpg.b.jpg" -> "a.b"), producing a base name that no longer maps
# back to the real file when the extension is appended again below.
IMAGE_EXT = ".jpg"
LABEL_EXT = ".txt"
images = {f[:-len(IMAGE_EXT)] for f in os.listdir(images_folder) if f.endswith(IMAGE_EXT)}
labels = {f[:-len(LABEL_EXT)] for f in os.listdir(labels_folder) if f.endswith(LABEL_EXT)}

# Base names that exist as an image but not as a label.
images_without_labels = images - labels

print(f"Found {len(images_without_labels)} images without labels.")

# Count the files that were actually moved: the existence guard below may
# skip entries, so len(images_without_labels) is not a reliable total.
moved_count = 0
for img_name in images_without_labels:
    img_path = os.path.join(images_folder, img_name + IMAGE_EXT)
    dest_path = os.path.join(backup_folder, img_name + IMAGE_EXT)
    if os.path.exists(img_path):
        shutil.move(img_path, dest_path)
        moved_count += 1

print(f"Moved {moved_count} images without labels to backup folder: {backup_folder}")





Found 0 images without labels.
Moved 0 images without labels to backup folder: /content/drive/MyDrive/kaggle_dataset/val/images_without_labels


In [None]:
import os
import shutil
import random

def _index_files_by_stem(directory):
    """Map each base name (filename without extension) in `directory` to its filename.

    The directory is listed once and the names are sorted, so the result does
    not depend on the order in which the filesystem returns entries. When two
    files share a base name (e.g. 'a.jpg' and 'a.png') the first one in sorted
    order is kept and a warning is printed for the other.
    """
    filenames = sorted(
        f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f))
    )
    by_stem = {}
    for filename in filenames:
        stem = os.path.splitext(filename)[0]
        if stem in by_stem:
            print(f"Warning: {filename} shares base name {stem} with {by_stem[stem]}. Skipping.")
            continue
        by_stem[stem] = filename
    return by_stem


def _move_pairs(base_names, image_by_stem, label_by_stem,
                src_img_dir, src_label_dir, dest_img_dir, dest_label_dir):
    """Move each image/label pair named in `base_names`; return how many pairs moved."""
    moved_count = 0
    for base_name in base_names:
        label_name = label_by_stem.get(base_name)
        if label_name is None:
            print(f"Warning: Label file not found for base name {base_name}. Skipping.")
            # We skip the pair if the label isn't found, to keep them matched.
            continue

        img_name = image_by_stem[base_name]
        src_img = os.path.join(src_img_dir, img_name)
        dest_img = os.path.join(dest_img_dir, img_name)
        src_label = os.path.join(src_label_dir, label_name)
        dest_label = os.path.join(dest_label_dir, label_name)

        try:
            shutil.move(src_img, dest_img)
            try:
                shutil.move(src_label, dest_label)
            except FileNotFoundError:
                # The label vanished after indexing: put the image back so the
                # destination never holds an image without its label.
                shutil.move(dest_img, src_img)
                raise
            moved_count += 1
        except FileNotFoundError as e:
            print(f"Error moving files for {base_name}: {e}")

    return moved_count


def split_data(base_dir, img_folder_name="images", label_folder_name="labels", split_ratio=0.5, seed=None):
    """
    Randomly splits image files and their label files into 'valid' and 'test' directories.

    Files are MOVED (not copied) from `base_dir/<img_folder_name>` and
    `base_dir/<label_folder_name>` into `base_dir/valid/...` and
    `base_dir/test/...`. Images and labels are matched by base name
    (filename without extension). An image without a label is reported and
    left in place.

    Args:
        base_dir (str): The path to the folder containing the image and label folders.
        img_folder_name (str): The name of the folder containing images (e.g., 'images').
        label_folder_name (str): The name of the folder containing labels (e.g., 'labels').
        split_ratio (float): Fraction of the images assigned to 'valid'; the rest
            go to 'test' (0.5 for a 50/50 split). Must be within [0, 1].
        seed (int | None): Seed for the shuffle. With a seed, the same directory
            contents always produce the same split. With None (default) the
            module-level `random` generator is used, as before.

    Returns:
        None. Progress, warnings and errors are printed.
    """
    img_dir = os.path.join(base_dir, img_folder_name)
    label_dir = os.path.join(base_dir, label_folder_name)

    # Check if the directories exist
    if not os.path.isdir(img_dir):
        print(f"Error: Image directory not found at {img_dir}")
        return
    if not os.path.isdir(label_dir):
        print(f"Error: Label directory not found at {label_dir}")
        return

    # A ratio outside [0, 1] would silently slice from the wrong end of the list.
    if not 0 <= split_ratio <= 1:
        print(f"Error: split_ratio must be between 0 and 1, got {split_ratio}")
        return

    # List each directory exactly once. Looking files up per base name with a
    # fresh os.listdir call is quadratic and very slow on a mounted Drive.
    image_by_stem = _index_files_by_stem(img_dir)
    label_by_stem = _index_files_by_stem(label_dir)

    # Base names in sorted order (dicts preserve insertion order), so a seeded
    # shuffle starts from a deterministic sequence.
    image_files = list(image_by_stem)
    if not image_files:
        print("No images found to split.")
        return

    # Shuffle the list to ensure random selection
    if seed is None:
        random.shuffle(image_files)
    else:
        random.Random(seed).shuffle(image_files)

    # Determine the split point
    split_point = int(len(image_files) * split_ratio)

    # Split the filenames
    valid_files = image_files[:split_point]
    test_files = image_files[split_point:]

    print(f"Total files found: {len(image_files)}")
    print(f"Files for 'valid' set: {len(valid_files)}")
    print(f"Files for 'test' set: {len(test_files)}")

    # Define the new directories
    valid_img_dir = os.path.join(base_dir, "valid", img_folder_name)
    test_img_dir = os.path.join(base_dir, "test", img_folder_name)
    valid_label_dir = os.path.join(base_dir, "valid", label_folder_name)
    test_label_dir = os.path.join(base_dir, "test", label_folder_name)

    # Create the new directories
    for d in [valid_img_dir, test_img_dir, valid_label_dir, test_label_dir]:
        os.makedirs(d, exist_ok=True)
        print(f"Created directory: {d}")

    # Move files for 'valid' set
    print("\nMoving files to 'valid'...")
    moved_valid = _move_pairs(valid_files, image_by_stem, label_by_stem,
                              img_dir, label_dir, valid_img_dir, valid_label_dir)
    print(f"Successfully moved {moved_valid} pairs to 'valid'.")

    # Move files for 'test' set
    print("\nMoving files to 'test'...")
    moved_test = _move_pairs(test_files, image_by_stem, label_by_stem,
                             img_dir, label_dir, test_img_dir, test_label_dir)
    print(f"Successfully moved {moved_test} pairs to 'test'.")

    print("\nData splitting complete.")

# --- Execution ---
if __name__ == "__main__":
    # Root of the dataset to split. Point this at your own dataset folder;
    # it has to contain the 'images' and 'labels' subdirectories.
    DATASET_BASE_PATH = "/content/drive/MyDrive/kaggle_dataset/val"

    # Folder names and ratio are left at their defaults (50/50 split).
    split_data(base_dir=DATASET_BASE_PATH)

Total files found: 10000
Files for 'valid' set: 5000
Files for 'test' set: 5000
Created directory: /content/drive/MyDrive/kaggle_dataset/val/valid/images
Created directory: /content/drive/MyDrive/kaggle_dataset/val/test/images
Created directory: /content/drive/MyDrive/kaggle_dataset/val/valid/labels
Created directory: /content/drive/MyDrive/kaggle_dataset/val/test/labels

Moving files to 'valid'...
Successfully moved 5000 pairs to 'valid'.

Moving files to 'test'...
Successfully moved 5000 pairs to 'test'.

Data splitting complete.
