# Data preprocessing

In [7]:
import os
import shutil
import hashlib
import numpy as np
from pathlib import Path
from tqdm import tqdm
from PIL import Image
import cv2
import torch
from transformers import CLIPModel, CLIPProcessor
import warnings
warnings.filterwarnings("ignore")

In [None]:
#CONFIG
# Raw data_set location: every file matching "*.*" in these folders is fed
# to the exact-duplicate filter.
# NOTE(review): these are machine-specific absolute Windows paths; adjust
# them before running the notebook anywhere else.
source_folders = [
    Path(r"D:\data_set\Speed_Bump\bump_detection_dataset\train\bump"),
    Path(r"D:\data_set\Speed_Bump\bump_detection_dataset\test\bump"),
]

#Destination location
BASE_DIR = Path(r"D:\CAP6415_F25_project-Finding-and-solving-hard-to-generate-examples\Data_set")

#Inside the Data_set: one sub-folder per pipeline stage
UNIQUE_EXACT = BASE_DIR.joinpath("unique")               # survivors of the MD5 filter
UNIQUE_CLIP = BASE_DIR.joinpath("unique_clip")           # survivors of the CLIP filter
POOR_DIR = BASE_DIR.joinpath("poor_blur_detection")      # images rejected by the blur check
PROCESSED = BASE_DIR.joinpath("processed")               # resized RGB JPEGs

# Create the whole output tree up front; existing folders are left as they are.
for folder in (UNIQUE_EXACT, UNIQUE_CLIP, POOR_DIR, PROCESSED):
    folder.mkdir(parents=True, exist_ok=True)

## Removal of Duplicate Images 

In [3]:
# REMOVE EXACT DUPLICATES (MD5 HASH)
def get_hash(file):
    """Return the hexadecimal MD5 digest of the raw bytes stored in `file`.

    The file is opened in binary mode and read in one go. The digest is used
    only to spot byte-identical files, not for any security purpose.
    """
    digest = hashlib.md5()
    with open(file, "rb") as handle:
        digest.update(handle.read())
    return digest.hexdigest()

print("Removing exact duplicate images")
hashes = set()      # MD5 digests of the images already copied to UNIQUE_EXACT
unique_count = 0    # number of copies made so far; also the index in the output name

for folder in source_folders:
    # Sort the listing so the img_XXXXX numbering does not depend on the
    # order in which the filesystem happens to return entries.
    for img in tqdm(sorted(folder.glob("*.*"))):
        try:
            h = get_hash(img)
            if h not in hashes:
                shutil.copy(img, UNIQUE_EXACT / f"img_{unique_count:05d}{img.suffix}")
                # Record the digest only after the copy succeeded; otherwise a
                # failed copy would make every later duplicate of this image
                # be skipped too, and the image would be lost entirely.
                hashes.add(h)
                unique_count += 1
        except Exception as e:
            # Best effort: report the unreadable/uncopyable file and move on.
            # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
            print("Error:", img, e)
            continue

print(f"Unique images after Hash filtering: {unique_count}")

Removing exact duplicate images


100%|██████████| 2982/2982 [00:39<00:00, 76.01it/s]
100%|██████████| 1277/1277 [00:19<00:00, 64.12it/s]

Unique images after Hash filtering: 4178





## Removal of Visually Similar Images Using CLIP

In [4]:
# REMOVE VISUALLY SIMILAR (CLIP)
print("CLIP-based similarity filtering")

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Model and image preprocessor are loaded from the same pretrained checkpoint;
# the model weights are then moved to the selected device.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
model = model.to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def compute_embedding(path):
    """Return the L2-normalised CLIP image embedding of the image at `path`.

    The image is decoded as RGB, run through the module-level `processor`
    and `model` on `device` with gradients disabled, and the first row of the
    resulting feature tensor is returned as a numpy array.

    Any error raised while opening or decoding the image propagates to the
    caller.
    """
    # Close the file handle deterministically; `convert` returns an
    # independent, fully loaded image that outlives the `with` block.
    with Image.open(path) as src:
        img = src.convert("RGB")

    inputs = processor(images=img, return_tensors="pt").to(device)
    with torch.no_grad():
        emb = model.get_image_features(**inputs)

    # Normalise each row on its own. For the single image processed here this
    # equals the global norm, but it stays correct if a batch is ever passed.
    emb = emb / emb.norm(p=2, dim=-1, keepdim=True)
    return emb.cpu().numpy()[0]

# Sorted so the greedy filter below visits images in a reproducible order.
image_paths = sorted(list(UNIQUE_EXACT.glob("*.*")))
embeddings = []         # embeddings of the images kept so far
SIM_THRESHOLD = 0.90    # an image is dropped if its dot product with any kept embedding reaches this

for img_path in tqdm(image_paths, desc="CLIP filtering"):
    try:
        emb = compute_embedding(img_path)
    except Exception as err:
        # Best effort: report the image that could not be embedded and skip it.
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
        print("Error:", img_path, err)
        continue

    # Greedy selection: keep the image only if it is below the threshold
    # against every image kept before it.
    if all(np.dot(emb, e) < SIM_THRESHOLD for e in embeddings):
        embeddings.append(emb)
        shutil.copy(img_path, UNIQUE_CLIP / img_path.name)

# Counts everything currently in UNIQUE_CLIP; this cell never clears the
# folder, so files left by an earlier run are included.
print("Remaining images after CLIP:", len(list(UNIQUE_CLIP.glob('*.*'))))

CLIP-based similarity filtering


CLIP filtering: 100%|██████████| 4178/4178 [04:42<00:00, 14.77it/s]

Remaining images after CLIP: 354





## Resize Images for Training

In [5]:
# PREPROCESS (resize + RGB)
print("Preprocess (resize + RGB)")
RES = 512    # output images are RES x RES pixels
count = 0    # images written so far; also the index in the output name

# Sort the listing so the speedbump_XXXXX numbering is reproducible and does
# not depend on the order in which the filesystem returns entries.
for img_path in tqdm(sorted(UNIQUE_CLIP.glob("*.*"))):
    try:
        # Close the source file deterministically; `convert` returns an
        # independent, fully loaded image.
        with Image.open(img_path) as src:
            img = src.convert("RGB")
        # Forces an exact RES x RES size; the aspect ratio is not preserved.
        img = img.resize((RES, RES))
        out_path = PROCESSED / f"speedbump_{count:05d}.jpg"
        img.save(out_path, quality=95)
        count += 1
    except Exception as e:
        # Best effort: report the failing file and continue with the rest.
        print("Error:", img_path, e)

print(f"Processed images saved: {count}")

Preprocess (resize + RGB)


100%|██████████| 354/354 [00:11<00:00, 31.31it/s]

Processed images saved: 354





## Removal of Blurry Images

In [6]:
# BLUR DETECTION (REMOVE POOR IMAGES)
print("Blur Filtering")
def blur_score(path):
    """Return the variance of the Laplacian of the grayscale image at `path`.

    Parameters:
        path: location of the image file, as a string or path-like object.

    Returns:
        The variance of the 64-bit float Laplacian response, or None when
        OpenCV could not read the file.
    """
    # os.fspath lets callers pass pathlib.Path objects as well as strings.
    img = cv2.imread(os.fspath(path), cv2.IMREAD_GRAYSCALE)
    if img is None:
        # imread signals an unreadable file by returning None, not by raising.
        return None
    return cv2.Laplacian(img, cv2.CV_64F).var()

BLUR_THRESHOLD = 120    # scores below this count as blurry
bad = []                # file names (not full paths) of rejected images

# Pass 1: score every processed JPEG and collect the ones to reject.
for fname in os.listdir(PROCESSED):
    if not fname.endswith(".jpg"):
        continue
    path = PROCESSED / fname
    score = blur_score(str(path))
    if score is None:
        # Unreadable files are rejected together with the blurry ones.
        bad.append(fname)
    elif score < BLUR_THRESHOLD:
        bad.append(fname)

print("Blurry images detected:", len(bad))

# Pass 2: move the rejected files out of the training folder.
for fname in bad:
    shutil.move(PROCESSED / fname, POOR_DIR / fname)

remaining = list(PROCESSED.glob('*.*'))
print("Final Training Images:", len(remaining))
print("Poor Images Moved:", len(bad))

Blur Filtering
Blurry images detected: 132
Final Training Images: 222
Poor Images Moved: 132
