In [24]:
import pandas as pd
import os
import itertools

### Get bad Images

In [None]:
bad_images = pd.read_csv('bad_images/Chordata_bad_images.csv')['BadImageRelativePath'].tolist()

# Get all filepaths

In [82]:
import shutil 


# base_directory
base_dir = 'chordata_images'
# get the class directories
class_dir = sorted([d for d in sorted(os.listdir(base_dir)) if os.path.isdir(os.path.join(base_dir, d))])

good_files_paths = [
    os.path.join(base_dir, class_name, file_name) for class_name in class_dir for file_name in os.listdir(os.path.join(base_dir, class_name)) 
        if os.path.join(class_name, file_name) not in bad_images 
]


# 1. Define the name for the new directory
destination_folder = 'cleaned_chordata_images'

# 2. Create the destination directory
# os.makedirs() creates parent directories if needed and doesn't raise an error if it already exists (exist_ok=True)
os.makedirs(destination_folder, exist_ok=True)
print(f"Created or found destination folder: {destination_folder}")

# 3. Loop through the good file paths and copy them
copied_count = 0
for original_file_path in good_files_paths:
    # Check if the source file actually exists before trying to copy
    if not os.path.exists(original_file_path):
        print(f"Warning: Source file not found, skipping: {original_file_path}")
        continue

    try:
        # Extract the relative path part (e.g., 'aves/good_bird.jpg')
        # We assume the structure is like 'some_base_dir/class_name/filename.ext'
        # os.path.normpath handles different OS separators
        # os.path.split splits the path into (head, tail) -> ('chordata_images/aves', 'good_bird.jpg')
        # Then we split the head again -> ('chordata_images', 'aves')
        # We take the second element of the second split, which is the class name
        parts = os.path.normpath(original_file_path).split(os.sep)
        if len(parts) < 3:
                print(f"Warning: Could not determine class directory from path, skipping: {original_file_path}")
                continue
        # Assuming the class name is the second to last part
        class_name = parts[-2]
        file_name = parts[-1] # Same as os.path.basename(original_file_path)`

        # Construct the destination subdirectory path (e.g., 'cleaned_chordata_images/aves')
        destination_subdir = os.path.join(destination_folder, class_name)

        # Create the destination subdirectory if it doesn't exist
        os.makedirs(destination_subdir, exist_ok=True)

        # Construct the full path for the destination file
        destination_file_path = os.path.join(destination_subdir, file_name)

        # Copy the file
        shutil.copy2(original_file_path, destination_file_path)
        copied_count += 1

    except Exception as e:
        print(f"Error processing file {original_file_path}: {e}")
    # Copy the file
    # shutil.copy2 attempts to preserve metadata (like modification time)
    try:
        shutil.copy2(original_file_path, destination_file_path)
        copied_count += 1
    except Exception as e:
        print(f"Error copying file {original_file_path} to {destination_file_path}: {e}")



print(f"\nFinished copying.")
print(f"Successfully copied {copied_count} files.")
print(f"Cleaned images are in '{destination_folder}' with original subdirectory structure.")

Created or found destination folder: cleaned_chordata_images

Finished copying.
Successfully copied 17490 files.
Cleaned images are in 'cleaned_chordata_images' with original subdirectory structure.


In [85]:


# base_directory
base_dir = 'cleaned_chordata_images'
# get the class directories
class_dir = sorted([d for d in sorted(os.listdir(base_dir)) if os.path.isdir(os.path.join(base_dir, d))])

good_files_paths = [
    os.path.join(base_dir, class_name, file_name) for class_name in class_dir for file_name in os.listdir(os.path.join(base_dir, class_name)) 
]

len(good_files_paths)


8745