### Step 1: Download 3 datasets. Add number labels (1, 2, 3) to each dataset.

### Step 2: Rename the images in each dataset to have the dataset number as a suffix. 
(e.g. cardboard1_1.jpg, cardboard1_2.jpg, cardboard2_1.jpg, etc.)

In [1]:
import os
import shutil

SRC_PATH = "data/archive/wanglei1220_data"
TRG_PATH = "data/archive/wanglei1220_data_renamed"
os.makedirs(TRG_PATH, exist_ok=True)

dataset_index = os.path.basename(SRC_PATH).split("_")[0]
for class_name in os.listdir(SRC_PATH):
    print(class_name)
    os.makedirs(os.path.join(TRG_PATH, class_name), exist_ok=True)
    for i, file_name in enumerate(os.listdir(os.path.join(SRC_PATH, class_name))):
        basename, ext = file_name.split(".")
        new_name = f"{basename}_data{dataset_index}.{ext}"
        shutil.copy(os.path.join(SRC_PATH, class_name, file_name), os.path.join(TRG_PATH, class_name, new_name))

metal


### Step 3: Merge the 3 datasets into a single dataset.
There are 5 classes:
- cardboard_paper
- metal
- plastic
- glass
- others: battery, biological, clothes, shoes, trash

In [21]:
MERGE_PATH = "data/merged"
os.makedirs(MERGE_PATH, exist_ok=True)

classes_list = ["cardboard_paper", "metal", "plastic", "glass", "battery", "biological", "clothes", "shoes", "trash"]
for class_name in classes_list:
    os.makedirs(os.path.join(MERGE_PATH, class_name), exist_ok=True)
    
data_list = [
    "data/1_garbage_classification_renamed",
    "data/2_garbage-dataset_renamed",
    "data/3_TrashType_Image_Dataset_renamed",
    "data/yolo_data"
]

for data_path in data_list:
    print(data_path)
    for class_name in os.listdir(data_path):
        if "glass" in class_name:
            # copy that folder content to `glass` folder
            for file_name in os.listdir(os.path.join(data_path, class_name)):
                shutil.copy(os.path.join(data_path, class_name, file_name), os.path.join(MERGE_PATH, "glass", file_name))
        elif class_name in "cardboard_paper":
            # copy that folder content to `cardboard_paper` folder
            for file_name in os.listdir(os.path.join(data_path, class_name)):
                shutil.copy(os.path.join(data_path, class_name, file_name), os.path.join(MERGE_PATH, "cardboard_paper", file_name))
        else:
            # copy that folder content to corresponding folder
            for file_name in os.listdir(os.path.join(data_path, class_name)):
                shutil.copy(os.path.join(data_path, class_name, file_name), os.path.join(MERGE_PATH, class_name, file_name))

data/1_garbage_classification_renamed
data/2_garbage-dataset_renamed
data/3_TrashType_Image_Dataset_renamed
data/yolo_data


### Step 4: check and remove duplicated images

In [30]:
import os
import imagehash
from PIL import Image

def find_and_remove_duplicates(folder_path):
    # Dictionary to store unique image hashes and their file paths
    hash_dict = {}

    # Iterate over all files in the folder
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        
        # Check if the file is an image (you can add more image types if needed)
        if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif')):
            try:
                # Open the image and calculate its perceptual hash
                with Image.open(file_path) as img:
                    img_hash = imagehash.phash(img)
                
                # If this hash is already in hash_dict, it means we found a duplicate
                if img_hash in hash_dict:
                    print(f"Duplicate found: {filename} is a duplicate of {hash_dict[img_hash]}")
                    os.remove(file_path)  # Remove the duplicate image
                else:
                    # If unique, add the hash and file path to the dictionary
                    hash_dict[img_hash] = file_path
            except Exception as e:
                print(f"Error processing file {filename}: {e}")

# Specify the path to the folder containing images
folder_path = '/home/namdng/Documents/linhtinh/garbage_classifier/data/merged_filter_dups/trash'
find_and_remove_duplicates(folder_path)

Duplicate found: trash186_data1.jpg is a duplicate of /home/namdng/Documents/linhtinh/garbage_classifier/data/merged_filter_dups/trash/trash_734_data2.jpg
Duplicate found: trash278_data1.jpg is a duplicate of /home/namdng/Documents/linhtinh/garbage_classifier/data/merged_filter_dups/trash/trash_280_data2.jpg
Duplicate found: trash_584_data2.jpg is a duplicate of /home/namdng/Documents/linhtinh/garbage_classifier/data/merged_filter_dups/trash/trash62_data1.jpg
Duplicate found: trash_355_data2.jpg is a duplicate of /home/namdng/Documents/linhtinh/garbage_classifier/data/merged_filter_dups/trash/trash453_data1.jpg
Duplicate found: trash_737_data2.jpg is a duplicate of /home/namdng/Documents/linhtinh/garbage_classifier/data/merged_filter_dups/trash/trash64_data1.jpg
Duplicate found: trash_203_data2.jpg is a duplicate of /home/namdng/Documents/linhtinh/garbage_classifier/data/merged_filter_dups/trash/trash250_data1.jpg
Duplicate found: trash157_data1.jpg is a duplicate of /home/namdng/Docum

In [22]:
# check hashing algo
# two very similar images
img1_path = "data/check_imghash/MirOx.png"
img2_path = "data/check_imghash/e3FsG.png"

import imagehash

hash1 = imagehash.phash(Image.open(img1_path))
hash2 = imagehash.phash(Image.open(img2_path))

print(hash1 - hash2)

4


## Add-on codes

### Extract data from YOLO dataset

In [12]:
import os

SRC_PATH = "/home/namdng/Documents/linhtinh/garbage_classifier/GARBAGE CLASSIFICATION 3.v2-gc1.coco/valid"
YOLO_DATA_PATH = "data/yolo_data"
os.makedirs(YOLO_DATA_PATH, exist_ok=True)

yolo_class_names = ["BIODEGRADABLE", "CARDBOARD", "GLASS", "METAL", "PAPER", "PLASTIC"]
yolo_class_names = [name.lower() for name in yolo_class_names]
# create each folder
for class_name in yolo_class_names:
    os.makedirs(os.path.join(YOLO_DATA_PATH, class_name), exist_ok=True)

# copy images to corresponding folder
for file_name in os.listdir(SRC_PATH):
    if file_name.endswith(".jpg"):
        file_name_contain_class = file_name.split("_")[0].lower()
        # choose which class to copy to
        choosen_class = ""
        for class_name in yolo_class_names:
            if class_name in file_name_contain_class:
                choosen_class = class_name
                break

        if choosen_class != "":
            shutil.copy(os.path.join(SRC_PATH, file_name), os.path.join(YOLO_DATA_PATH, choosen_class, file_name))
        else:
            print(f"Cannot find class for {file_name}")
