In [1]:
import os

In [2]:
labels_dir = "/Volumes/MyDataDrive/thesis/code-2/data/new-data/IIT-CDIP/train/labels"
images_dir = "/Volumes/MyDataDrive/thesis/code-2/data/new-data/IIT-CDIP/train/images"

# Goal of this cell: pair every image with its label file and identify the
# files on either side that have no counterpart. A label file is named
# "votes_<image basename>.<ext>"; an image is named "<image basename>.<ext>".
# Nothing is copied or deleted here; the cell only computes and reports the sets.

LABEL_PREFIX = 'votes_'


def label_basename(filename):
    """Return the image basename a label file refers to, or None if it is not a label file.

    Only the *leading* prefix is removed. Slicing is used instead of
    str.replace because replace() deletes every occurrence of the substring,
    so a name such as "votes_a_votes_b.json" would wrongly map to "a_b"
    instead of "a_votes_b".
    """
    if not filename.startswith(LABEL_PREFIX):
        return None
    stem = os.path.splitext(filename)[0]
    return stem[len(LABEL_PREFIX):]


def image_basename(filename):
    """Return the basename (extension removed) of an image file, or None for hidden files."""
    if filename.startswith('.'):
        return None
    return os.path.splitext(filename)[0]


# Get all files in both directories
label_files = set(os.listdir(labels_dir))
image_files = set(os.listdir(images_dir))

print(f"Total label files: {len(label_files)}")
print(f"Total image files: {len(image_files)}")

# Extract base names for matching; entries the helpers reject (None) are dropped
label_basenames = {name for name in map(label_basename, label_files) if name is not None}
image_basenames = {name for name in map(image_basename, image_files) if name is not None}

print(f"Label basenames: {len(label_basenames)}")
print(f"Image basenames: {len(image_basenames)}")

# Find intersection - basenames that exist in both directories
common_basenames = label_basenames & image_basenames
print(f"Common basenames (files with both image and label): {len(common_basenames)}")

# Keep only the files whose basename has a counterpart on the other side.
# None (returned for non-label / hidden files) is never in common_basenames.
filtered_label_files = {f for f in label_files if label_basename(f) in common_basenames}
filtered_image_files = {f for f in image_files if image_basename(f) in common_basenames}

print(f"Filtered label files: {len(filtered_label_files)}")
print(f"Filtered image files: {len(filtered_image_files)}")

# Show some examples of files that will be kept.
# Lookup tables avoid rescanning the filtered sets for every sample.
label_file_by_basename = {label_basename(f): f for f in filtered_label_files}
image_file_by_basename = {image_basename(f): f for f in filtered_image_files}

print("\nSample files that will be kept:")
for basename in list(common_basenames)[:5]:
    label_file = label_file_by_basename.get(basename)
    image_file = image_file_by_basename.get(basename)
    print(f"  {basename}: {image_file} <-> {label_file}")

# Show files that will be removed
orphaned_labels = label_basenames - common_basenames
orphaned_images = image_basenames - common_basenames

print(f"\nOrphaned labels (no matching image): {len(orphaned_labels)}")
print(f"Orphaned images (no matching label): {len(orphaned_images)}")

if orphaned_labels:
    print("Sample orphaned labels (without votes_ prefix):", list(orphaned_labels)[:5])
if orphaned_images:
    print("Sample orphaned images:", list(orphaned_images)[:5])

# Show actual orphaned files with full names
orphaned_label_files = {f for f in label_files if label_basename(f) in orphaned_labels}
orphaned_image_files = {f for f in image_files if image_basename(f) in orphaned_images}

if orphaned_label_files:
    print("Sample orphaned label files:", list(orphaned_label_files)[:5])
if orphaned_image_files:
    print("Sample orphaned image files:", list(orphaned_image_files)[:5])


Total label files: 2030
Total image files: 2238
Label basenames: 2030
Image basenames: 2238
Common basenames (files with both image and label): 1958
Filtered label files: 1958
Filtered image files: 1958

Sample files that will be kept:
  flgw0204_page2: flgw0204_page2.png <-> votes_flgw0204_page2.json
  fkmf0218_page270: fkmf0218_page270.png <-> votes_fkmf0218_page270.json
  fkmf0218_page261: fkmf0218_page261.png <-> votes_fkmf0218_page261.json
  fjvc0035_page26: fjvc0035_page26.png <-> votes_fjvc0035_page26.json
  fhyn0211_page1: fhyn0211_page1.png <-> votes_fhyn0211_page1.json

Orphaned labels (no matching image): 72
Orphaned images (no matching label): 280
Sample orphaned labels (without votes_ prefix): ['fgwh0222_page3', 'fhnp0058_page1', 'fgxn0164_page9', 'fgjl0125_page1', 'fgvk0135_page1']
Sample orphaned images: ['flxh0049_page2', 'flxx0094_page1', 'flxh0252_page2', 'flpv0200_page10', 'fmbf0021_page1']
Sample orphaned label files: ['votes_fjmw0035_page8.json', 'votes_fkmw0192_pa

In [3]:
import shutil
from pathlib import Path

# Destination directories for the matched image/label pairs
filtered_labels_dir = "/Volumes/MyDataDrive/thesis/code-2/data/new-data/IIT-CDIP/train/labels_filtered"
filtered_images_dir = "/Volumes/MyDataDrive/thesis/code-2/data/new-data/IIT-CDIP/train/images_filtered"


def copy_files(filenames, src_dir, dst_dir):
    """Copy each named file from src_dir to dst_dir with shutil.copy2.

    Any copy error propagates to the caller, so code after this call only
    runs if every file was copied.
    """
    for filename in filenames:
        shutil.copy2(os.path.join(src_dir, filename), os.path.join(dst_dir, filename))


def visible_entries(directory):
    """Return the set of entry names in directory that do not start with '.'."""
    return {name for name in os.listdir(directory) if not name.startswith('.')}


# Create directories if they don't exist (existing directories are reused as-is)
Path(filtered_labels_dir).mkdir(parents=True, exist_ok=True)
Path(filtered_images_dir).mkdir(parents=True, exist_ok=True)

print("Created filtered directories:")
print(f"  Labels: {filtered_labels_dir}")
print(f"  Images: {filtered_images_dir}")

# Copy filtered files to new directories
print("\nCopying filtered files...")

copy_files(filtered_label_files, labels_dir, filtered_labels_dir)
copy_files(filtered_image_files, images_dir, filtered_images_dir)

print(f"Successfully copied {len(filtered_label_files)} label files")
print(f"Successfully copied {len(filtered_image_files)} image files")

# Verify the filtered directories against what was supposed to be copied.
# Comparing raw os.listdir counts to each other is not enough: the
# directories are reused across runs, so they may hold stale files from an
# earlier run or hidden entries (e.g. .DS_Store), and two equal counts say
# nothing about whether the right files are present.
copied_label_files = visible_entries(filtered_labels_dir)
copied_image_files = visible_entries(filtered_images_dir)
filtered_labels_count = len(copied_label_files)
filtered_images_count = len(copied_image_files)

expected_label_files = set(filtered_label_files)
expected_image_files = set(filtered_image_files)
contents_match = (
    copied_label_files == expected_label_files
    and copied_image_files == expected_image_files
    and filtered_labels_count == filtered_images_count
)

print("\nVerification:")
print(f"  Filtered labels directory contains: {filtered_labels_count} files")
print(f"  Filtered images directory contains: {filtered_images_count} files")
print(f"  Match: {contents_match}")

# Extra diagnostics are printed only when something is off, so the output of
# a clean run is unchanged.
checks = (
    ("label", copied_label_files, expected_label_files),
    ("image", copied_image_files, expected_image_files),
)
for kind, copied, expected in checks:
    unexpected = sorted(copied - expected)
    missing = sorted(expected - copied)
    if unexpected:
        print(f"  WARNING: {len(unexpected)} unexpected {kind} files in destination (stale from an earlier run?): {unexpected[:5]}")
    if missing:
        print(f"  WARNING: {len(missing)} expected {kind} files missing from destination: {missing[:5]}")

print(f"\nFiltering complete! Dataset reduced from {len(image_files)} images to {len(filtered_image_files)} matched pairs.")


Created filtered directories:
  Labels: /Volumes/MyDataDrive/thesis/code-2/data/new-data/IIT-CDIP/train/labels_filtered
  Images: /Volumes/MyDataDrive/thesis/code-2/data/new-data/IIT-CDIP/train/images_filtered

Copying filtered files...
Successfully copied 1958 label files
Successfully copied 1958 image files

Verification:
  Filtered labels directory contains: 1958 files
  Filtered images directory contains: 1958 files
  Match: True

Filtering complete! Dataset reduced from 2238 images to 1958 matched pairs.
