In [4]:
import os
import pandas as pd

def _parse_image_paths(raw):
    """Decode one ``image_paths`` cell into a list of path strings.

    The cell is first read as a Python literal (e.g. ``"['a.jpg', 'b.jpg']"``
    or ``'["a.jpg"]'``). If that fails, it falls back to stripping brackets
    and quotes and splitting on commas. Empty entries are dropped.
    """
    import ast  # local import keeps this notebook cell self-contained

    text = str(raw).strip()
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        parsed = None

    if isinstance(parsed, str):
        parsed = [parsed]
    if not isinstance(parsed, (list, tuple, set)):
        # Not a literal list: split by hand, tolerating either quote style.
        parsed = [part.strip().strip("'\"") for part in text.strip("[]").split(",")]

    return [str(item) for item in parsed if str(item).strip()]


def delete_unreferenced_images(csv_file, image_folder):
    """Delete files in ``image_folder`` that the dataset does not reference.

    Args:
        csv_file: Path to a CSV file read with ``pandas.read_csv``. Its
            ``image_paths`` column holds stringified lists of image paths.
        image_folder: Directory whose files are compared, by file name,
            against the names referenced in the CSV.

    Returns:
        None. One line is printed per deleted file or failed deletion.

    If the ``image_paths`` column is missing, nothing is deleted: an empty
    reference set would otherwise cause every file in the folder to be removed.
    """
    # Load the dataset
    data = pd.read_csv(csv_file)

    if 'image_paths' not in data.columns:
        print(f"Column 'image_paths' not found in {csv_file}; no images deleted.")
        return

    # Collect the file name of every image mentioned in the dataset
    referenced_images = set()
    for cell in data['image_paths'].dropna():
        for path in _parse_image_paths(cell):
            # Normalise Windows separators so basename works on any platform.
            image_name = os.path.basename(path.replace("\\", "/"))
            if image_name:
                referenced_images.add(image_name)

    # Everything present in the folder but absent from the dataset
    unreferenced_images = set(os.listdir(image_folder)) - referenced_images

    # Sorted so the deletion log is deterministic
    for unreferenced_image in sorted(unreferenced_images):
        file_path = os.path.join(image_folder, unreferenced_image)
        if not os.path.isfile(file_path):
            # Sub-directories (and other non-files) are not images; leave them.
            continue
        try:
            os.remove(file_path)
        except OSError as e:
            print(f"Error deleting {file_path}: {e}")
        else:
            print(f"Deleted: {file_path}")

# Run the cleanup: remove every file under the image directory that the
# cleaned dataset does not list in its image_paths column.
image_folder = "../input/images"
csv_file = "cleaned_data.csv"
delete_unreferenced_images(csv_file=csv_file, image_folder=image_folder)


Deleted: ../input/images\897e065bd-2.jpg
Deleted: ../input/images\706a9cd5c-1.jpg
Deleted: ../input/images\4084b0a93-2.jpg
Deleted: ../input/images\d7185ffc1-3.jpg
Deleted: ../input/images\70a90deaf-3.jpg
Deleted: ../input/images\4bb92c19d-1.jpg
Deleted: ../input/images\ec84f5484-3.jpg
Deleted: ../input/images\4b3efcfe0-3.jpg
Deleted: ../input/images\0f1a3ce5a-2.jpg
Deleted: ../input/images\963520a2c-1.jpg
Deleted: ../input/images\f4948a23d-2.jpg
Deleted: ../input/images\a2b34e983-3.jpg
Deleted: ../input/images\c33c82dd5-2.jpg
Deleted: ../input/images\3859838cf-2.jpg
Deleted: ../input/images\e50d0a67b-1.jpg
Deleted: ../input/images\4518f180e-1.jpg
Deleted: ../input/images\f5d514b10-1.jpg
Deleted: ../input/images\dee525fcd-3.jpg
Deleted: ../input/images\c45e47c4c-2.jpg
Deleted: ../input/images\1e1727250-3.jpg
Deleted: ../input/images\8db729086-3.jpg
Deleted: ../input/images\158bc6f24-2.jpg
Deleted: ../input/images\19287db4e-1.jpg
Deleted: ../input/images\9679275a3-3.jpg
Deleted: ../inpu

In [None]:
import os
import pandas as pd

def _parse_image_paths(raw):
    """Decode one ``image_paths`` cell into a list of path strings.

    The cell is first read as a Python literal (e.g. ``"['a.jpg', 'b.jpg']"``
    or ``'["a.jpg"]'``). If that fails, it falls back to stripping brackets
    and quotes and splitting on commas. Empty entries are dropped.
    """
    import ast  # local import keeps this notebook cell self-contained

    text = str(raw).strip()
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        parsed = None

    if isinstance(parsed, str):
        parsed = [parsed]
    if not isinstance(parsed, (list, tuple, set)):
        # Not a literal list: split by hand, tolerating either quote style.
        parsed = [part.strip().strip("'\"") for part in text.strip("[]").split(",")]

    return [str(item) for item in parsed if str(item).strip()]


def _photo_sort_key(image_name):
    """Order images by the number after the last '-'; unnumbered names go last."""
    suffix = image_name.rsplit("-", 1)[-1].split(".")[0]
    try:
        return (0, int(suffix))
    except ValueError:
        # No numeric suffix: sort after all numbered photos, keeping input order.
        return (1, 0)


def delete_unreferenced_and_extra_images(csv_file, image_folder, max_photos=3):
    """Delete unreferenced images and trim each pet to ``max_photos`` images.

    Args:
        csv_file: Path to a CSV file read with ``pandas.read_csv``. Its
            ``image_paths`` column holds stringified lists of image paths.
        image_folder: Directory containing the image files.
        max_photos: Maximum number of referenced images kept per pet. The pet
            ID is the file name up to its last ``-``; the images with the
            lowest numeric suffix are the ones kept.

    Returns:
        None. One line is printed per deleted file or failed deletion.

    If the ``image_paths`` column is missing, nothing is deleted: an empty
    reference set would otherwise cause every file in the folder to be removed.
    """
    # Load the dataset
    data = pd.read_csv(csv_file)

    if 'image_paths' not in data.columns:
        print(f"Column 'image_paths' not found in {csv_file}; no images deleted.")
        return

    # Collect referenced file names and group them by pet ID
    referenced_images = set()
    pet_image_map = {}
    for cell in data['image_paths'].dropna():
        for path in _parse_image_paths(cell):
            # Normalise Windows separators so basename works on any platform.
            image_name = os.path.basename(path.replace("\\", "/"))
            if not image_name or image_name in referenced_images:
                # Skip blanks and repeats so each image counts once per pet.
                continue
            referenced_images.add(image_name)
            pet_id = image_name.rsplit("-", 1)[0]  # Extract pet ID
            pet_image_map.setdefault(pet_id, []).append(image_name)

    # Everything present in the folder but absent from the dataset
    unreferenced_images = set(os.listdir(image_folder)) - referenced_images

    # Delete unreferenced images (sorted so the log is deterministic)
    for unreferenced_image in sorted(unreferenced_images):
        file_path = os.path.join(image_folder, unreferenced_image)
        if not os.path.isfile(file_path):
            # Sub-directories (and other non-files) are not images; leave them.
            continue
        try:
            os.remove(file_path)
        except OSError as e:
            print(f"Error deleting {file_path}: {e}")
        else:
            print(f"Deleted unreferenced: {file_path}")

    # Delete extra images per pet, keeping the lowest-numbered 'max_photos'
    for images in pet_image_map.values():
        if len(images) <= max_photos:
            continue
        for image_name in sorted(images, key=_photo_sort_key)[max_photos:]:
            file_path = os.path.join(image_folder, image_name)
            try:
                os.remove(file_path)
            except OSError as e:
                print(f"Error deleting {file_path}: {e}")
            else:
                print(f"Deleted extra: {file_path}")

# Run the cleanup: drop files the second cleaned dataset does not reference,
# then keep at most three referenced photos per pet.
image_folder = "../input/images"
csv_file = "cleaned_data2.csv"
delete_unreferenced_and_extra_images(
    csv_file=csv_file,
    image_folder=image_folder,
    max_photos=3,
)
