In [59]:
import os
import shutil
from collections import Counter
from PIL import Image

In [60]:
# Coarse grouping of the plant names into four buckets: maps a group name to the
# list of plant names assigned to it.
# The copy log further down shows 'African Violet (Saintpaulia ionantha)' as a
# sub-folder of source_dir; presumably every entry is such a folder name - TODO confirm.
# Not referenced by any cell visible in this part of the notebook.
# NOTE(review): some assignments look botanically questionable (e.g. 'Money Tree
# (Pachira aquatica)' and 'Venus Flytrap' under 'succulents_and_cacti') - confirm
# they are intentional before using the groups as labels.
categories = {
    'flowering': [
        'African Violet (Saintpaulia ionantha)', 'Anthurium (Anthurium andraeanum)', 'Begonia (Begonia spp.)',
        'Bird of Paradise (Strelitzia reginae)', 'Chrysanthemum', 'Daffodils (Narcissus spp.)', 
        'Hyacinth (Hyacinthus orientalis)', 'Iron Cross begonia (Begonia masoniana)', 'Lilium (Hemerocallis)', 
        'Orchid', 'Poinsettia (Euphorbia pulcherrima)', 'Polka Dot Plant (Hypoestes phyllostachya)', 'Tulip'
    ],
    'palms_and_ferns': [
        'Areca Palm (Dypsis lutescens)', 'Asparagus Fern (Asparagus setaceus)', 'Birds Nest Fern (Asplenium nidus)', 
        'Boston Fern (Nephrolepis exaltata)', 'Parlor Palm (Chamaedorea elegans)', 'Ponytail Palm (Beaucarnea recurvata)', 
        'Sago Palm (Cycas revoluta)'
    ],
    'succulents_and_cacti': [
        'Aloe Vera', 'Jade plant (Crassula ovata)', 'Kalanchoe', 'Money Tree (Pachira aquatica)', 
        'Venus Flytrap', 'Yucca'
    ],
    'foliage': [
        'Calathea', 'Cast Iron Plant (Aspidistra elatior)', 'Chinese Money Plant (Pilea peperomioides)', 
        'Chinese evergreen (Aglaonema)', 'Dracaena', 'Dumb Cane (Dieffenbachia spp.)', 'Elephant Ear (Alocasia spp.)', 
        'English Ivy (Hedera helix)', 'Monstera Deliciosa (Monstera deliciosa)', 'Peace lily', 'Pothos (Ivy arum)', 
        'Prayer Plant (Maranta leuconeura)', 'Rattlesnake Plant (Calathea lancifolia)', 'Rubber Plant (Ficus elastica)', 
        'Schefflera', 'Snake plant (Sanseviera)', 'Tradescantia', 'ZZ Plant (Zamioculcas zamiifolia)'
    ]
}


In [65]:
# Root of the raw dataset; the cells below walk it recursively with os.walk.
source_dir = 'house_plant_species'  # replace with the actual path to your dataset
# Intermediate output root: the copied/converted images are written here, mirroring
# the sub-folder layout found under source_dir.
pre_clean = 'cleaned_dataset'
# Destination for the organised dataset; not referenced by the cells visible in
# this part of the notebook.
target_dir = 'dataset'  # replace with where you'd like to organize the dataset

In [66]:
# Gather the lower-cased extension of every file found anywhere beneath source_dir
file_extensions = []
for _folder, _subfolders, names in os.walk(source_dir):
    file_extensions.extend(os.path.splitext(name)[1].lower() for name in names)

# Tally how often each extension occurs
extension_counts = Counter(file_extensions)

# Report one line per extension, in first-seen order
print("Image format counts:")
for ext, count in extension_counts.items():
    print(f"{ext}: {count} files")

Image format counts:
.jpeg: 799 files
.jpg: 12166 files
.png: 452 files
.webp: 154 files
.jpe: 6 files
.jfif: 6 files
.gif: 4 files


In [67]:
# Formats to keep: extensions accepted into the cleaned dataset without being
# listed for conversion. Lookups are done on the lower-cased extension, so a
# file named 'x.JPG' matches '.jpg'.
keep_formats = ['.jpg', '.jpeg', '.png']

# Formats to convert: extensions that process_image hands to convert_to_jpg.
# Together the two lists cover every extension reported by the scan above.
convert_formats = ['.webp', '.jpe', '.jfif', '.gif']

In [68]:
# Ensure the output root for the cleaned images exists.
# exist_ok=True replaces the separate exists() check: the cell stays safe to
# re-run and there is no window between the check and the creation.
os.makedirs(pre_clean, exist_ok=True)

# Function to convert images to JPEG (if necessary)
def convert_to_jpg(file_path, target_path):
    """Re-encode the image at ``file_path`` as a JPEG written to ``target_path``.

    The source file is left in place. Failures are reported on stdout and
    swallowed so that one unreadable image does not abort a batch run.

    Args:
        file_path: Path of the image to read.
        target_path: Path of the JPEG file to write.

    Returns:
        None.
    """
    try:
        # Image.open keeps the underlying file open until the image is closed;
        # the context manager releases the handle even when convert/save fails.
        with Image.open(file_path) as img:
            # JPEG has no alpha channel or palette mode, so normalise to RGB first.
            rgb_img = img.convert('RGB')
            rgb_img.save(target_path, 'JPEG')
        print(f"Converted {file_path} to {target_path}")
    except Exception as e:
        # Deliberate best-effort: log the failure and let the caller continue.
        print(f"Error converting {file_path}: {e}")

In [69]:
 #Function to move or convert files
def process_image(file_path, root, target_folder):
    """Place a JPEG version of ``file_path`` in ``target_folder``.

    The output is always named ``<original stem>.jpg``. Files that already are
    JPEG ('.jpg' / '.jpeg', any letter case) are copied unchanged; every other
    extension listed in ``keep_formats`` or ``convert_formats`` is re-encoded
    through ``convert_to_jpg`` so that the content matches the '.jpg' name.
    Previously '.png' files were byte-copied under a '.jpg' name.

    Args:
        file_path: Path of the source image.
        root: Directory containing the file. Unused; kept so existing callers
            keep working.
        target_folder: Existing directory that receives the output file.

    Returns:
        None.
    """
    # Only these extensions hold JPEG data and may be copied without re-encoding.
    jpeg_extensions = ('.jpg', '.jpeg')

    file_name, file_extension = os.path.splitext(file_path)
    file_extension = file_extension.lower()

    # Target path for the cleaned file.
    # NOTE(review): sources that differ only by extension (e.g. 'x.jpeg' and
    # 'x.png') map to the same target and the later one overwrites the earlier.
    # The cleaned count printed later in the notebook is lower than the scanned
    # total - possibly for this reason; confirm.
    target_path = os.path.join(target_folder, os.path.basename(file_name) + '.jpg')

    if file_extension in jpeg_extensions:
        # The source is not removed, so this is a copy rather than a move.
        shutil.copy(file_path, target_path)
        print(f"Copied {file_path} to {target_path}")
    elif file_extension in keep_formats or file_extension in convert_formats:
        # Non-JPEG content (including '.png') must be re-encoded, not renamed.
        convert_to_jpg(file_path, target_path)
    else:
        # Make dropped files visible instead of skipping them silently.
        print(f"Skipped {file_path}: unsupported extension '{file_extension}'")


In [70]:
# Walk through the source directory and process the images
for root, dirs, files in os.walk(source_dir):
    if not files:
        # Directories without files are not mirrored in the output tree.
        continue

    # Mirror the source sub-folder layout under pre_clean. This depends only on
    # the directory, so it is done once per directory rather than once per file.
    relative_path = os.path.relpath(root, source_dir)
    target_folder = os.path.join(pre_clean, relative_path)

    # exist_ok=True keeps the cell re-runnable without a separate exists() check.
    os.makedirs(target_folder, exist_ok=True)

    for file in files:
        file_path = os.path.join(root, file)
        # Copy or convert the image into the mirrored folder.
        process_image(file_path, root, target_folder)

Moved house_plant_species\African Violet (Saintpaulia ionantha)\African_1.jpeg to cleaned_dataset\African Violet (Saintpaulia ionantha)\African_1.jpg
Moved house_plant_species\African Violet (Saintpaulia ionantha)\African_10.JPG to cleaned_dataset\African Violet (Saintpaulia ionantha)\African_10.jpg
Moved house_plant_species\African Violet (Saintpaulia ionantha)\African_100.jpg to cleaned_dataset\African Violet (Saintpaulia ionantha)\African_100.jpg
Moved house_plant_species\African Violet (Saintpaulia ionantha)\African_101.jpg to cleaned_dataset\African Violet (Saintpaulia ionantha)\African_101.jpg
Moved house_plant_species\African Violet (Saintpaulia ionantha)\African_102.JPG to cleaned_dataset\African Violet (Saintpaulia ionantha)\African_102.jpg
Moved house_plant_species\African Violet (Saintpaulia ionantha)\African_103.JPG to cleaned_dataset\African Violet (Saintpaulia ionantha)\African_103.jpg
Moved house_plant_species\African Violet (Saintpaulia ionantha)\African_104.jpeg to cle

In [71]:
# Function to count image formats in the target directory
def count_image_formats(directory):
    """Tally the files beneath ``directory`` by lower-cased file extension.

    Args:
        directory: Root folder to scan recursively.

    Returns:
        A ``Counter`` mapping each extension (leading dot included, or an
        empty string when a file has none) to the number of files carrying it.
    """
    return Counter(
        os.path.splitext(name)[1].lower()
        for _folder, _subfolders, names in os.walk(directory)
        for name in names
    )

# Count the image formats in the cleaned directory (pre_clean) and print one
# line per extension, in the order the Counter first encountered them.
print("\nImage format counts in the cleaned directory:")
format_counts = count_image_formats(pre_clean)
for ext, count in format_counts.items():
    print(f"{ext}: {count} files")


Image format counts in the cleaned directory:
.jpg: 13557 files
