In [4]:
from pathlib import Path
import hashlib
import shutil
import torch
from PIL import Image
import numpy as np
from transformers import CLIPProcessor, CLIPModel
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Folders that are scanned for candidate images (test split first, then train).
source_folders = [
    Path(raw_folder)
    for raw_folder in (
        "D:\\data_set\\Speed_Bump\\bump_detection_dataset\\test\\bump",
        "D:\\data_set\\Speed_Bump\\bump_detection_dataset\\train\\bump",
    )
]

# Destination folders: first-seen files go to `unique_dir`, byte-identical
# repeats go to `dup_dir`.
unique_dir = Path("D:\\data_set\\final_data_set\\unique")
dup_dir = Path("D:\\data_set\\final_data_set\\duplicates")

# Create both destinations up front; already-existing folders are left alone.
for out_dir in (unique_dir, dup_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

# Digests seen so far, plus running counters used to number the copied files.
hashes = set()
unique_count = dup_count = 0

def get_hash(file, chunk_size=1 << 20):
    """Return the hexadecimal MD5 digest of a file's raw bytes.

    The file is read in blocks of ``chunk_size`` bytes and fed to the hash
    incrementally, so a large file is never held in memory in full. The
    digest is identical to hashing the whole content at once.

    Args:
        file: Path (or path-like / str) of the file to fingerprint.
        chunk_size: Number of bytes read per iteration (default 1 MiB).

    Returns:
        The 32-character lowercase hex MD5 digest of the file content.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.md5()
    with open(file, "rb") as f:
        # iter(callable, sentinel) keeps reading until read() returns b"" (EOF).
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

# Exact-duplicate pass: copy every file whose content hash has not been seen
# yet into `unique_dir`, and every byte-identical repeat into `dup_dir`.
for folder in source_folders:
    if not folder.is_dir():
        # A missing folder would otherwise be skipped without any hint.
        print("Missing source folder:", folder)
        continue

    # Sorted iteration makes the run reproducible: glob order depends on the
    # filesystem, and it decides both which copy counts as the "unique" one
    # and the sequence numbers assigned to the copied files.
    for img in sorted(folder.glob("*.*")):
        # "*.*" also matches directories whose name contains a dot.
        if not img.is_file():
            continue
        try:
            h = get_hash(img)
            if h not in hashes:
                hashes.add(h)
                shutil.copy(img, unique_dir / f"img_{unique_count:05d}{img.suffix}")
                unique_count += 1
            else:
                shutil.copy(img, dup_dir / f"dup_{dup_count:05d}{img.suffix}")
                dup_count += 1
        except OSError as e:
            # Best effort: report unreadable/uncopyable files and keep going.
            print("Error:", img, e)

print("Unique images:", unique_count)
print("Duplicate images:", dup_count)

Unique images: 4178
Duplicate images: 81


In [None]:
# Input folder (result of the exact-hash pass) and the two output folders of
# the CLIP similarity pass.
UNIQUE_EXACT = Path("D:\\data_set\\final_data_set\\unique")
UNIQUE_CLIP = Path("D:\\data_set\\final_data_set\\unique_clip")
DUP_CLIP = Path("D:\\data_set\\final_data_set\\duplicates_clip")

for clip_out_dir in (UNIQUE_CLIP, DUP_CLIP):
    clip_out_dir.mkdir(parents=True, exist_ok=True)

# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

# Load the CLIP weights (moved onto the chosen device) and the matching
# preprocessor from the same checkpoint.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT).to(device)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

print("CLIP model loaded on", device)

# Helper: compute CLIP embedding
def compute_embedding(path):
    """Compute an L2-normalised CLIP image embedding for one image file.

    Uses the module-level ``processor``, ``model`` and ``device``.

    Args:
        path: Location of the image file to embed.

    Returns:
        The first row of the normalised image-feature tensor as a NumPy array
        (presumably a 1-D vector, since a single image is passed in).

    Raises:
        Whatever ``Image.open`` / ``convert`` raise for unreadable or
        corrupt files; callers are expected to handle these.
    """
    # Open inside a context manager so the file handle is released right
    # away; convert() returns an independent, fully loaded RGB image.
    with Image.open(path) as raw:
        img = raw.convert("RGB")

    inputs = processor(images=img, return_tensors="pt").to(device)
    with torch.no_grad():
        emb = model.get_image_features(**inputs)

    # Normalise each row on its own so dot products between embeddings are
    # cosine similarities (identical to the global norm for a single image).
    emb = emb / emb.norm(p=2, dim=-1, keepdim=True)
    return emb.cpu().numpy()[0]


# Greedy near-duplicate filter: walk the images in sorted order and keep an
# image only if its embedding is not too similar to any image kept so far.
image_paths = sorted(UNIQUE_EXACT.glob("*.*"))

print("Total images to check:", len(image_paths))

embeddings = []      # embeddings of the images kept so far
kept_images = []     # source paths of the images kept so far
removed_count = 0    # images classified as visually similar to a kept one
skipped_count = 0    # images that could not be embedded at all

# Dot-product (cosine, since embeddings are unit-normalised) cut-off above
# which two images are treated as the same picture.
# NOTE(review): in the recorded run this value kept only 365 of 4178 images;
# confirm it is not too aggressive for a single-domain dataset.
SIM_THRESHOLD = 0.90

for img_path in tqdm(image_paths, desc="Filtering images"):
    try:
        emb = compute_embedding(img_path)
    except Exception as exc:
        # Report and count failures instead of dropping the image silently.
        # `Exception` (not a bare except) lets Ctrl-C still interrupt the run.
        skipped_count += 1
        tqdm.write(f"Skipping unreadable image: {img_path} ({exc})")
        continue

    # any() stops at the first kept embedding that exceeds the threshold.
    is_near_duplicate = any(
        np.dot(emb, kept_emb) > SIM_THRESHOLD for kept_emb in embeddings
    )

    if is_near_duplicate:
        shutil.copy(img_path, DUP_CLIP / img_path.name)
        removed_count += 1
    else:
        embeddings.append(emb)
        shutil.copy(img_path, UNIQUE_CLIP / img_path.name)
        kept_images.append(img_path)

print("CLIP Similarity Filtering Completed")
print("Unique images kept:", len(kept_images))
print("Visually similar removed:", removed_count)
if skipped_count:
    print("Images skipped due to errors:", skipped_count)


Using device: cuda


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


CLIP model loaded on cuda
Total images to check: 4178


Filtering images: 100%|██████████| 4178/4178 [04:43<00:00, 14.71it/s]

CLIP Similarity Filtering Completed
Unique images kept: 365
Visually similar removed: 3813





In [None]:
# Preprocessing pass: convert every CLIP-filtered image to RGB, resize it to
# 512x512 and save it as a sequentially numbered JPEG.
SOURCE_DIR = Path("D:\\data_set\\final_data_set\\unique_clip")
TARGET_DIR = Path("D:\\data_set\\final_data_set\\processed")

TARGET_DIR.mkdir(parents=True, exist_ok=True)

# List the folder once and sort it, so the reported total matches what is
# processed and the output numbering is reproducible across runs.
source_images = sorted(SOURCE_DIR.glob("*.*"))
print("Source images:", len(source_images))

# Preprocessing Loop (RGB + Resize)

count = 0  # number of images written so far; also the next output index

for img_path in source_images:
    try:
        # The context manager closes the source file as soon as the pixel
        # data has been converted; convert()/resize() return new images.
        with Image.open(img_path) as src:
            # Convert ALL formats to RGB, then resize to 512x512
            # (aspect ratio is not preserved).
            img = src.convert("RGB").resize((512, 512))
        out_path = TARGET_DIR / f"speedbump_{count:05d}.jpg"
        img.save(out_path, quality=95)
        count += 1
    except Exception as e:
        # Best effort: report the failing file and continue with the next.
        print("Error processing:", img_path, e)

print("Preprocessing complete!")
print("Total processed images:", count)

Source images: 365
Preprocessing complete!
Total processed images: 365
