In [None]:
import os
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img, array_to_img


# Generate augmented copies of every image in each category folder under
# `base_dir` and write them to the matching folder under `output_base_dir`.
# Per-category counts of originals and generated images are collected in `report`.

# Set the path to your original dataset (update this path as needed)
base_dir = r'C:\Users\91947\Desktop\dap and iot\test\pro'
output_base_dir = r'C:\Users\91947\Desktop\dap and iot\test\result\pro'

# List of categories (subfolder names)
categories = ["Normal", "Osteoporosis", "Osteopenia"]

# Number of augmented images to generate per original image
n_aug_per_image = 5

# Create an instance of ImageDataGenerator with desired augmentation parameters.
datagen = ImageDataGenerator(
    rotation_range=10,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Maps category name -> {"Original Images": int, "Augmented Images": int}
report = {}

for category in categories:
    # Define input and output paths for the current category
    input_path = os.path.join(base_dir, category)
    output_path = os.path.join(output_base_dir, category)

    # A missing category folder should not abort the remaining categories:
    # record it with zero counts and move on.
    if not os.path.isdir(input_path):
        print(f"Input folder not found, skipping category: {input_path}")
        report[category] = {"Original Images": 0, "Augmented Images": 0}
        continue

    # Create output directory if it doesn't exist
    os.makedirs(output_path, exist_ok=True)

    # Get list of image files (jpg/jpeg/png, extension matched case-insensitively).
    # os.listdir returns entries in arbitrary order; sorting keeps the mapping
    # from source image to output file number stable across runs and machines.
    image_files = sorted(
        f for f in os.listdir(input_path)
        if f.lower().endswith(('.jpg', '.jpeg', '.png'))
    )
    original_count = len(image_files)

    # Counter for naming augmented files sequentially (starting at 1)
    counter = 1

    # Process each image file in the folder
    for file_name in image_files:
        file_path = os.path.join(input_path, file_name)

        try:
            # Load image and convert to numpy array
            img = load_img(file_path)
            x = img_to_array(img)
            # Add a leading batch axis: (1, height, width, channels)
            x = np.expand_dims(x, axis=0)

            # Generate augmented images for each original image
            aug_iter = datagen.flow(x, batch_size=1)
            for _ in range(n_aug_per_image):
                # Generate one augmented image
                batch = next(aug_iter)
                aug_img = array_to_img(batch[0])
                # Create file name with the desired format (e.g., Normal_1.jpg, Normal_2.jpg, etc.)
                new_file_name = f"{category}_{counter}.jpg"
                save_path = os.path.join(output_path, new_file_name)
                aug_img.save(save_path)
                # Advance only after a successful save so the numbering stays
                # gapless even when an image fails part-way through.
                counter += 1
        except Exception as e:
            # Best effort: report the failing image and continue with the next one.
            print(f"Error processing {file_path}: {e}")

    # Calculate the number of augmented images generated for this category
    augmented_count = counter - 1  # since counter started at 1
    report[category] = {"Original Images": original_count, "Augmented Images": augmented_count}

# %% [code]
# Summarise the augmentation run: for every category in `report`, show how
# many originals were found, how many augmented images were written, and the
# difference between the two.
print("Data Augmentation Report:")
for label in report:
    stats = report[label]
    n_original = stats['Original Images']
    n_augmented = stats['Augmented Images']
    print(f"\nCategory: {label}")
    print(f" - Original Images: {n_original}")
    print(f" - Augmented Images Generated: {n_augmented}")
    print(f" - Total Additional Images: {n_augmented - n_original}")


In [1]:
import os
import cv2
import hashlib

# Clean the dataset in place: for each class folder under `base_dir`, delete
# images that cannot be read and images whose decoded content duplicates an
# earlier image in the same folder. WARNING: files are permanently removed.

# Path to your dataset directory
base_dir = r'C:\Users\91947\Desktop\dap and iot\test\result\pro'

# List of class folders
categories = ['Normal', 'Osteoporosis', 'Osteopenia']

# Per-class mapping of image hash -> path of the first file seen with that hash
unique_hashes = {cat: {} for cat in categories}

# Counters for reporting
total_files = 0
removed_files = 0


def _remove_file(path):
    """Delete `path`; return True on success, False if the deletion failed.

    A failed deletion (e.g. the file is locked or already gone) is reported
    and does not abort the cleaning run.
    """
    try:
        os.remove(path)
    except OSError as err:
        print(f"Could not remove {path}: {err}")
        return False
    return True


print("Starting data cleaning...")

for cat in categories:
    cat_dir = os.path.join(base_dir, cat)
    print(f"\nProcessing folder: {cat_dir}")

    # A missing class folder should not abort the run after earlier folders
    # have already been modified.
    if not os.path.isdir(cat_dir):
        print(f"Folder not found, skipping: {cat_dir}")
        continue

    # List image files (adjust extensions as needed). os.listdir returns
    # entries in arbitrary order; sorting makes it deterministic which copy
    # of a duplicate is kept (the first by file name).
    image_files = sorted(
        f for f in os.listdir(cat_dir)
        if f.lower().endswith(('.png', '.jpg', '.jpeg'))
    )

    for img_file in image_files:
        total_files += 1
        img_path = os.path.join(cat_dir, img_file)

        # Try to read the image; a None result is treated as corrupt/unreadable
        img = cv2.imread(img_path)
        if img is None:
            print(f"Corrupt or unreadable image: {img_path}. Removing file.")
            if _remove_file(img_path):
                removed_files += 1
            continue

        # Compute hash of the image (using PNG encoding for consistency), so
        # files are compared by decoded content rather than by raw file bytes.
        try:
            success, buffer = cv2.imencode('.png', img)
            if not success:
                print(f"Encoding failed for image: {img_path}. Removing file.")
                if _remove_file(img_path):
                    removed_files += 1
                continue
            img_hash = hashlib.md5(buffer.tobytes()).hexdigest()
        except Exception as e:
            print(f"Error computing hash for {img_path}: {e}. Removing file.")
            if _remove_file(img_path):
                removed_files += 1
            continue

        # Check for duplicates: if the same hash exists, remove the duplicate file.
        if img_hash in unique_hashes[cat]:
            print(f"Duplicate image found: {img_path}. Removing duplicate.")
            if _remove_file(img_path):
                removed_files += 1
        else:
            unique_hashes[cat][img_hash] = img_path

print("\nData Cleaning Completed.")
print(f"Total files processed: {total_files}")
print(f"Total files removed: {removed_files}")


Starting data cleaning...

Processing folder: C:\Users\91947\Desktop\dap and iot\test\result\pro\Normal

Processing folder: C:\Users\91947\Desktop\dap and iot\test\result\pro\Osteoporosis

Processing folder: C:\Users\91947\Desktop\dap and iot\test\result\pro\Osteopenia

Data Cleaning Completed.
Total files processed: 8211
Total files removed: 0
