In [1]:
import hashlib
import os
from collections import defaultdict

In [2]:
def calculate_md5(file_path):
    md5 = hashlib.md5()
    with open(file_path, "rb") as file:
        for chunk in iter(lambda: file.read(4096), b""):
            md5.update(chunk)
    return md5.hexdigest()

In [3]:
def calculate_file_hashes(directory):
    file_hashes = defaultdict(list)

    for root, _, files in os.walk(directory):
        for filename in files:
            if filename.lower().endswith((".png", ".jpg", ".jpeg", ".gif")):
                file_path = os.path.join(root, filename)
                hash_value = calculate_md5(file_path)
                file_hashes[hash_value].append(file_path)

    return file_hashes

In [4]:
def find_duplicate_images(directory):
    image_hashes = calculate_file_hashes(directory)
    duplicate_images = []

    for _, file_paths in image_hashes.items():
        if len(file_paths) > 1:
            duplicate_images.append(file_paths)

    return duplicate_images

In [5]:
parent_directory = "./data/images_cleaned_train"

In [6]:
duplicate_images = find_duplicate_images(parent_directory)

In [9]:
for lst in duplicate_images:
    if l:=len(lst) != 2:
        print(l)

In [14]:
for lst in duplicate_images:
    a, b = lst
    if os.path.dirname(a) != os.path.dirname(b):
        print(a, b)

./data/images_cleaned_train/prunus_subhirtella/17194.jpg ./data/images_cleaned_train/prunus_virginiana/11710.jpg
./data/images_cleaned_train/prunus_subhirtella/5979.jpg ./data/images_cleaned_train/prunus_virginiana/1690.jpg
./data/images_cleaned_train/prunus_subhirtella/7410.jpg ./data/images_cleaned_train/prunus_virginiana/538.jpg
./data/images_cleaned_train/prunus_subhirtella/9471.jpg ./data/images_cleaned_train/prunus_virginiana/12718.jpg
./data/images_cleaned_train/prunus_subhirtella/10692.jpg ./data/images_cleaned_train/prunus_virginiana/4712.jpg
./data/images_cleaned_train/prunus_subhirtella/2041.jpg ./data/images_cleaned_train/prunus_virginiana/1196.jpg
./data/images_cleaned_train/prunus_subhirtella/8941.jpg ./data/images_cleaned_train/prunus_virginiana/3372.jpg
./data/images_cleaned_train/prunus_subhirtella/735.jpg ./data/images_cleaned_train/prunus_virginiana/15958.jpg
./data/images_cleaned_train/prunus_subhirtella/15241.jpg ./data/images_cleaned_train/prunus_virginiana/1553.j

In [15]:
for lst in duplicate_images:
    a, b = lst
    if os.path.dirname(a) == os.path.dirname(b):
        print(a, b)

./data/images_cleaned_train/maclura_pomifera/16247.jpg ./data/images_cleaned_train/maclura_pomifera/14210.jpg
./data/images_cleaned_train/maclura_pomifera/178.jpg ./data/images_cleaned_train/maclura_pomifera/2841.jpg
./data/images_cleaned_train/maclura_pomifera/7736.jpg ./data/images_cleaned_train/maclura_pomifera/6335.jpg
./data/images_cleaned_train/maclura_pomifera/8483.jpg ./data/images_cleaned_train/maclura_pomifera/28.jpg
./data/images_cleaned_train/maclura_pomifera/10534.jpg ./data/images_cleaned_train/maclura_pomifera/14101.jpg
./data/images_cleaned_train/maclura_pomifera/8431.jpg ./data/images_cleaned_train/maclura_pomifera/11279.jpg
./data/images_cleaned_train/maclura_pomifera/12264.jpg ./data/images_cleaned_train/maclura_pomifera/13467.jpg
./data/images_cleaned_train/maclura_pomifera/5350.jpg ./data/images_cleaned_train/maclura_pomifera/11760.jpg
./data/images_cleaned_train/maclura_pomifera/9705.jpg ./data/images_cleaned_train/maclura_pomifera/10.jpg
./data/images_cleaned_tra

In [16]:
for lst in duplicate_images:
    a, b = lst
    if os.path.dirname(a) == os.path.dirname(b):
        os.remove(b)

In [17]:
duplicate_images2 = find_duplicate_images(parent_directory)

In [18]:
for lst in duplicate_images2:
    if l:=len(lst) != 2:
        print(l)

In [19]:
for lst in duplicate_images2:
    a, b = lst
    if os.path.dirname(a) == os.path.dirname(b):
        print(a, b)

In [22]:
len(duplicate_images2)

32

In [28]:
set([(os.path.dirname(a), os.path.dirname(b)) for a, b in duplicate_images2])

{('./data/images_cleaned_train/magnolia_tripetala',
  './data/images_cleaned_train/magnolia_stellata'),
 ('./data/images_cleaned_train/prunus_subhirtella',
  './data/images_cleaned_train/prunus_virginiana')}

In [34]:
magnolia = [lst for lst in duplicate_images2 if os.path.dirname(lst[0]) == './data/images_cleaned_train/magnolia_tripetala']

In [35]:
prunus = [lst for lst in duplicate_images2 if os.path.dirname(lst[0]) == './data/images_cleaned_train/prunus_subhirtella']

In [36]:
len(magnolia), len(prunus)

(11, 21)

In [39]:
magnolia[0]

['./data/images_cleaned_train/magnolia_tripetala/7789.jpg',
 './data/images_cleaned_train/magnolia_stellata/10702.jpg']

In [41]:
from subprocess import Popen

In [42]:
honeyview = "/mnt/c/Program Files/Honeyview/Honeyview.exe"

In [50]:
Popen(f"\"{honeyview}\" {magnolia[0][0]}", shell=True)

<Popen: returncode: None args: '"/mnt/c/Program Files/Honeyview/Honeyview.ex...>

In [48]:
for a, _ in magnolia: Popen(f"\"{honeyview}\" {a}", shell=True)

In [49]:
for a, _ in magnolia: os.remove(a)

In [51]:
Popen(f"\"{honeyview}\" {prunus[0][0]}", shell=True)

<Popen: returncode: None args: '"/mnt/c/Program Files/Honeyview/Honeyview.ex...>

In [52]:
prunus[0]

['./data/images_cleaned_train/prunus_subhirtella/17194.jpg',
 './data/images_cleaned_train/prunus_virginiana/11710.jpg']

In [56]:
Popen(f"\"{honeyview}\" {prunus[0][1]}", shell=True)

<Popen: returncode: None args: '"/mnt/c/Program Files/Honeyview/Honeyview.ex...>

In [54]:
for _, b in prunus: Popen(f"\"{honeyview}\" {b}", shell=True)

In [57]:
for _, b in prunus: os.remove(b)

In [58]:
find_duplicate_images(parent_directory)

[]