In [None]:
# Mount Google Drive at /content/drive so the absolute /content/drive/... paths
# used by the rest of this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Change the kernel's working directory to the resized/labelled rubber-leaf dataset folder.
%cd /content/drive/MyDrive/YangBOT/Dataset/DatasetV2_67/Rubber-leaf_resize_label

/content/drive/.shortcut-targets-by-id/14RYTmLbBRnGHmLgU_C0HdMCwRUWiVbaN/YangBOT/Dataset/DatasetV2_67/Rubber-leaf_resize_label


In [None]:
import os
import cv2
import numpy as np
from collections import defaultdict
import shutil

def are_images_identical(image1, image2):
    """Return True when two decoded images match exactly, pixel for pixel.

    Args:
        image1: First image as a NumPy array (for example the result of
            ``cv2.imread``), or ``None`` when the image could not be read.
        image2: Second image, same conventions as ``image1``.

    Returns:
        bool: ``True`` only if both images are present, have the same shape
        and every element is equal; ``False`` otherwise.
    """
    # A missing image (cv2.imread returns None on failure) can never be
    # confirmed as a duplicate, and has no ``.shape`` to inspect.
    if image1 is None or image2 is None:
        return False
    if image1.shape != image2.shape:
        return False
    # Compare element-wise in both directions at once. A one-way
    # cv2.subtract saturates negative results to 0 on uint8 data, so it
    # reports "identical" whenever image1 <= image2 everywhere (e.g. an
    # all-black image compared with any other image of the same size).
    return bool(np.array_equal(image1, image2))

def find_duplicate_images(folder_path):
    """Group the images directly inside ``folder_path`` that are identical.

    Every ``.png``/``.jpg``/``.jpeg`` file (extension matched
    case-insensitively) is decoded with ``cv2.imread`` and compared against
    the others with ``are_images_identical``.

    Args:
        folder_path: Directory to scan. Sub-directories are not traversed.

    Returns:
        dict[str, list[str]]: Maps a group key ``'dup<N>'`` to the file names
        in that group, first-seen file first. Only groups with more than one
        file are returned. ``N`` is incremented for every distinct image
        examined, so the keys of the returned groups need not be contiguous.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')

    images = {}
    for filename in os.listdir(folder_path):
        # Lower-case the name so files such as "IMG_001.JPG" are not missed.
        if not filename.lower().endswith(image_extensions):
            continue
        image = cv2.imread(os.path.join(folder_path, filename))
        if image is None:
            # cv2.imread signals an unreadable/corrupt file by returning None
            # rather than raising; skip it instead of failing during comparison.
            print(f"Warning: could not read image, skipping: {filename}")
            continue
        images[filename] = image

    names = list(images)
    duplicates = defaultdict(list)
    checked_images = set()
    group_id = 1

    for index, image1_name in enumerate(names):
        if image1_name in checked_images:
            continue
        group_key = f'dup{group_id}'
        duplicates[group_key].append(image1_name)
        image1 = images[image1_name]
        # Every name before ``index`` is already in ``checked_images`` (either
        # as a group leader or as a member), so only later names need a look.
        for image2_name in names[index + 1:]:
            if image2_name in checked_images:
                continue
            if are_images_identical(image1, images[image2_name]):
                duplicates[group_key].append(image2_name)
                checked_images.add(image2_name)
        checked_images.add(image1_name)
        group_id += 1

    # Drop singleton groups: an image with no twin is not a duplicate.
    return {key: members for key, members in duplicates.items() if len(members) > 1}

def save_report(duplicates, report_path):
    """Write a plain-text summary of the duplicate groups to ``report_path``.

    For each group the report lists the file names and the number of images
    in the group, followed by the number of groups and the total number of
    images across all groups (the kept first image of each group included).

    Args:
        duplicates: Mapping of group key to list of file names, as returned
            by ``find_duplicate_images``.
        report_path: Destination file; it is overwritten if it exists.
    """
    total_dup = len(duplicates)
    total_duplicates = 0
    # The report contains Thai text, so the encoding is pinned to UTF-8
    # instead of relying on the platform default (which may not be able to
    # encode it and would raise UnicodeEncodeError).
    with open(report_path, 'w', encoding='utf-8') as file:
        for group, images in duplicates.items():
            file.write(f"{group}: {', '.join(images)}\n")
            file.write(f"จำนวนภาพใน {group}: {len(images)}\n")

            total_duplicates += len(images)
        file.write(f"ภาพ dup : {total_dup}\n")
        file.write(f"จำนวนภาพซ้ำทั้งหมด: {total_duplicates}")

def move_duplicate_images_with_label(duplicates, source_folder, destination_folder):
    """Move redundant duplicate images, and their label files, out of the dataset.

    The first image of every group is kept in ``source_folder``; every other
    image of the group is moved to ``destination_folder``. If a ``.txt`` file
    with the same base name as a moved image exists in ``source_folder`` it is
    moved along with it. A ``moved_files_report.txt`` summary is written into
    ``destination_folder``.

    Args:
        duplicates: Mapping of group key to list of file names, as returned
            by ``find_duplicate_images``.
        source_folder: Directory currently holding the images and labels.
        destination_folder: Directory to move duplicates into; created if
            it does not exist.

    Returns:
        list[str]: Names of all moved files (images and labels) in move order.
    """
    os.makedirs(destination_folder, exist_ok=True)

    moved_files = []
    moved_image_count = 0
    moved_label_count = 0

    for group, images in duplicates.items():
        for image in images[1:]:  # Skip the first image: it is the copy that stays
            # Move image file
            source_path = os.path.join(source_folder, image)
            destination_path = os.path.join(destination_folder, image)
            shutil.move(source_path, destination_path)
            moved_files.append(image)
            moved_image_count += 1

            # Move corresponding label file. splitext strips only the final
            # extension, so "leaf.v2.jpg" maps to "leaf.v2.txt"; splitting on
            # the first dot would yield "leaf.txt" and could miss the label
            # or move the label of a different image.
            label_file = os.path.splitext(image)[0] + '.txt'
            source_path_label = os.path.join(source_folder, label_file)
            destination_path_label = os.path.join(destination_folder, label_file)
            if os.path.exists(source_path_label):
                shutil.move(source_path_label, destination_path_label)
                moved_files.append(label_file)
                moved_label_count += 1

    print(f"Moved duplicate images and their labels to {destination_folder}")

    report_path = os.path.join(destination_folder, 'moved_files_report.txt')
    # UTF-8 so that file names with non-ASCII characters can always be written.
    with open(report_path, 'w', encoding='utf-8') as report_file:
        report_file.write("Moved Files:\n")
        for file in moved_files:
            report_file.write(f"{file}\n")
        report_file.write(f"\nTotal moved files: {len(moved_files)}")
        report_file.write(f"\nTotal moved images: {moved_image_count}")
        report_file.write(f"\nTotal moved labels: {moved_label_count}")

    print(f"Report of moved files saved to {report_path}")

    return moved_files


def move_duplicate_images(duplicates, source_folder, destination_folder):
    """Relocate the redundant copies of each duplicate group (images only).

    The first file name of every group stays in ``source_folder``; the
    remaining names are moved to ``destination_folder``, which is created
    when absent. A ``moved_images_report.txt`` listing the moved names and
    their count is written into ``destination_folder``.

    Args:
        duplicates: Mapping of group key to list of image file names.
        source_folder: Directory the images currently live in.
        destination_folder: Directory that receives the redundant copies.

    Returns:
        list[str]: Names of the moved images, in the order they were moved.
    """
    if not os.path.exists(destination_folder):
        os.makedirs(destination_folder)

    moved_images = []
    for group_members in duplicates.values():
        # Index 0 is the copy that is kept; everything after it is redundant.
        for name in group_members[1:]:
            shutil.move(
                os.path.join(source_folder, name),
                os.path.join(destination_folder, name),
            )
            moved_images.append(name)

    print(f"Moved duplicate images to {destination_folder}")

    report_path = os.path.join(destination_folder, 'moved_images_report.txt')
    report_lines = ["Moved Images:\n"]
    report_lines.extend(f"{name}\n" for name in moved_images)
    report_lines.append(f"\nTotal moved images: {len(moved_images)}")
    with open(report_path, 'w') as report_file:
        report_file.writelines(report_lines)

    print(f"Report of moved images saved to {report_path}")

    return moved_images




In [None]:
# Folder to scan for duplicate images, and the file the duplicate-group report is written to.
folder_path = '/content/drive/MyDrive/YangBOT/Dataset/DatasetV2_67/Rubber-leaf_resize_label/65SKA/65SKA/65SKA16_2/65SKA16_2mixLFS_PM'
report_path = '/content/drive/MyDrive/YangBOT/Dataset/DatasetV2_67/Rubber-leaf_resize_label/65SKA/65SKA/65SKA16_2/duplicate_report_65SKA16_2mixLFS_PM.txt'
# Detect groups of identical images and save the report before any file is moved.
duplicate_images = find_duplicate_images(folder_path)
save_report(duplicate_images, report_path)
print(f"Duplicate images report saved to {report_path}")


# Duplicates are moved into a sibling directory of folder_path (same parent folder).
Name_dir_duplicate = 'Duplicate_Images_65SKA16_2mixLFS_PM'
destination_folder = os.path.join(os.path.dirname(folder_path), Name_dir_duplicate)
# Alternative that moves only the images and leaves the .txt labels in place:
#move_duplicate_images(duplicate_images, folder_path, destination_folder)
# NOTE: this moves files on Drive; the first image of each group stays in folder_path.
move_duplicate_images_with_label(duplicate_images, folder_path, destination_folder)

Duplicate images report saved to /content/drive/MyDrive/YangBOT/Dataset/DatasetV2_67/Rubber-leaf_resize_label/65SKA/65SKA/65SKA16_2/duplicate_report_65SKA16_2mixLFS_PM.txt
Moved duplicate images and their labels to /content/drive/MyDrive/YangBOT/Dataset/DatasetV2_67/Rubber-leaf_resize_label/65SKA/65SKA/65SKA16_2/Duplicate_Images_65SKA16_2mixLFS_PM
Report of moved files saved to /content/drive/MyDrive/YangBOT/Dataset/DatasetV2_67/Rubber-leaf_resize_label/65SKA/65SKA/65SKA16_2/Duplicate_Images_65SKA16_2mixLFS_PM/moved_files_report.txt


[]