In [6]:
import os, re, shutil
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
import joblib
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input
from tensorflow import keras

# ---------- CONFIGURATIONS ----------
# Folder layout.
INPUT_DIR = "data/raw"  # adjust: folder holding the raw images
OUTPUT_DIR = "data/dataset"  # kept images land here (later used for the data split)
QUAR_DIR = "data/quarantine"  # parent folder for rejected images
REPORTS = "data/reports"  # folder for log files
REPORT_CSV = os.path.join(REPORTS, "preprocessing_log.csv")  # per-image log

# Time window, compared as HHMMSS strings against the time in the file name.
# These bounds are the summer values; in winter END should be later.
START = "060000"
END = "062000"

# Colour dominance: limit for the combined share of the two strongest channels.
TWO_CHANNEL_DOMINANCE = 0.90

# Raindrop classifier (SVM) and embedding model.
SVM_PATH = "best models/raindrop models/raindrop_svm.pkl"
# If this file is missing, a MobileNetV2 embedder is built instead.
EMBEDDER_PATH = "best models/raindrop models/mobilenetv2_embedder.keras"
IMG_SIZE = 224  # side length (pixels) images are resized to before embedding


# ---------- UTIL ----------
def ensure_dirs():
    """Create the output, report and quarantine folders if they do not exist.

    The quarantine folder gets one sub-folder per rejection reason.
    """
    quarantine_subdirs = ("time_0600_0620", "raindrops", "dark_or_tinted")
    wanted = [OUTPUT_DIR, REPORTS]
    wanted.extend(os.path.join(QUAR_DIR, name) for name in quarantine_subdirs)
    for path in wanted:
        os.makedirs(path, exist_ok=True)

def iter_images(folder):
    """Yield the names of entries in *folder* that look like image files.

    An entry qualifies when its lower-cased name ends in .jpg, .jpeg or .png.
    Only the bare names are yielded (not full paths), in ``os.listdir`` order.
    """
    image_suffixes = (".jpg", ".jpeg", ".png")
    yield from (
        entry
        for entry in os.listdir(folder)
        if entry.lower().endswith(image_suffixes)
    )

# Check time in file name and if in time window
def time_in_window(filename, start=None, end=None):
    """Report whether the time encoded in *filename* lies inside a window.

    The file name (extension stripped) must start with eight digits, an
    underscore and six digits, e.g. ``20240101_060500``; the six-digit
    group is taken as the time.

    Args:
        filename: File name to inspect.
        start: Inclusive lower bound as a six-digit string. Defaults to the
            module-level ``START``.
        end: Inclusive upper bound as a six-digit string. Defaults to the
            module-level ``END``.

    Returns:
        ``(in_window, time_string)``. When the name does not match the
        expected pattern the result is ``(False, None)``.
    """
    stem = os.path.splitext(filename)[0]
    match = re.match(r"^(\d{8})_(\d{6})", stem)
    if match is None:
        return False, None

    stamp = match.group(2)
    # Fall back to the configured window only when no explicit bound is given,
    # so callers can use a different window (e.g. a later END in winter).
    lower = START if start is None else start
    upper = END if end is None else end
    # Fixed-width, zero-padded digit strings order the same way as the numbers.
    return (lower <= stamp <= upper), stamp


def colour_dominance(img_bgr, threshold=None):
    """Check whether two colour channels dominate an image.

    The per-channel means are normalised to shares that sum to 1; the image
    counts as dominated when the two largest shares together exceed the
    threshold.

    Args:
        img_bgr: Image array; it is averaged over its first two axes.
            Assumes a 3-channel image in B, G, R order as returned by
            ``cv2.imread`` (confirm against the caller).
        threshold: Limit for the combined share of the two strongest
            channels. Defaults to the module-level ``TWO_CHANNEL_DOMINANCE``.

    Returns:
        ``(dominated, ratios)`` where ``dominated`` is a plain ``bool`` and
        ``ratios`` is an ndarray of channel shares in the input's channel
        order. An image whose channel means sum to zero (all black) or to a
        non-finite value is reported as dominated with all-zero ratios.
    """
    mean_channels = np.mean(img_bgr, axis=(0, 1))  # one mean per channel
    total = np.sum(mean_channels)
    if not np.isfinite(total) or total == 0:
        # Shares are undefined here; return zeros as an ndarray so both
        # return paths hand back the same type.
        return True, np.zeros(3)

    ratios = mean_channels / total
    top_two = np.sort(ratios)[::-1][:2].sum()
    limit = TWO_CHANNEL_DOMINANCE if threshold is None else threshold
    return bool(top_two > limit), ratios

def move_file(src, dst):
    """Copy *src* to *dst*, creating the destination folder when needed.

    Despite the name, the source file is left in place: the copy is made
    with ``shutil.copy2``.

    Args:
        src: Path of the existing file.
        dst: Path of the copy to create.
    """
    parent = os.path.dirname(dst)
    # A bare file name has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError; in that case there is nothing to create.
    if parent:
        os.makedirs(parent, exist_ok=True)
    shutil.copy2(src, dst)

# ----- LOAD MODELS -----
def load_svm():
    """Load and return the object stored at ``SVM_PATH`` using joblib."""
    # NOTE(review): joblib files are pickle-based, so only load files from a
    # trusted source.
    classifier = joblib.load(SVM_PATH)
    return classifier

def load_embedder():
    """Return the embedding model.

    The saved model at ``EMBEDDER_PATH`` is used when that file exists;
    otherwise a MobileNetV2 with ImageNet weights, no classification head
    and average pooling is built as a fallback.
    """
    if not os.path.exists(EMBEDDER_PATH):
        # No saved embedder: recreate one from the stock MobileNetV2.
        print("Loading MobileNetV2 from imagenet (fallback)")
        return MobileNetV2(weights="imagenet", include_top=False, pooling="avg")

    print(f"Loading embedder from {EMBEDDER_PATH}")
    return keras.models.load_model(EMBEDDER_PATH)

def svm_predict_raindrop(img_bgr, clf, embedder):
    """Classify one image as raindrop / clean.

    The image is converted to RGB, centre-cropped to a square, resized to
    ``IMG_SIZE`` x ``IMG_SIZE``, passed through the MobileNetV2
    ``preprocess_input`` and embedded with *embedder*; the flattened
    embedding is then classified by *clf*.

    Args:
        img_bgr: Image array in BGR channel order (as read by OpenCV).
        clf: Fitted classifier exposing ``predict`` and ``predict_proba``.
            Label 1 is treated as "raindrop".
        embedder: Model exposing ``predict(batch, verbose=0)``.

    Returns:
        ``(is_raindrop, probability)`` where ``probability`` is the
        classifier's ``predict_proba`` value for the predicted class.
    """
    # BGR -> RGB
    img = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)

    # Centre-crop to a square so the resize does not distort the image.
    h, w = img.shape[:2]
    side = min(h, w)
    y0 = (h - side) // 2
    x0 = (w - side) // 2
    img = img[y0:y0 + side, x0:x0 + side]

    # Resize to the embedder's input size and add a batch axis.
    img = cv2.resize(img, (IMG_SIZE, IMG_SIZE)).astype("float32")
    x = preprocess_input(img[None, ...])

    # Flatten the embedding to one feature vector
    # (presumably 1280 values for MobileNetV2 with average pooling).
    feat = embedder.predict(x, verbose=0).flatten()

    sample = [feat]
    pred_class = clf.predict(sample)[0]  # 0 = clean, 1 = raindrop
    proba = clf.predict_proba(sample)[0]

    # predict_proba columns follow the order of clf.classes_, so find the
    # column of the predicted label there rather than using the label itself
    # as an index. Without classes_, fall back to the label as the index.
    classes = getattr(clf, "classes_", None)
    if classes is None:
        column = int(pred_class)
    else:
        column = int(np.flatnonzero(np.asarray(classes) == pred_class)[0])

    # NOTE(review): if clf is a scikit-learn SVC, predict_proba can
    # occasionally disagree with predict -- confirm this is acceptable.
    return (pred_class == 1), float(proba[column])  # True if raindrop


# ---- MAIN ----
def main():
    """Filter every image in ``INPUT_DIR`` and write a CSV report.

    Each image goes through the checks below in order; the first one that
    fires decides its fate:

    0. capture time inside the configured window -> quarantine
    1. classified as raindrop by the SVM         -> quarantine
    2. two colour channels dominate              -> quarantine
    3. otherwise                                 -> keep in ``OUTPUT_DIR``

    Images that cannot be read are logged as skipped. ``OUTPUT_DIR`` is
    emptied at the start of every run.
    """
    ensure_dirs()

    # Start from an empty output folder.
    if os.path.exists(OUTPUT_DIR):
        shutil.rmtree(OUTPUT_DIR)
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Load the classifier and the embedding model once, up front.
    classifier = load_svm()
    feature_model = load_embedder()

    records = []

    def log(name, action, filter_name, reason, meta):
        # One row of the report per processed file.
        records.append(dict(file=name, action=action, filter=filter_name,
                            reason=reason, meta=meta))

    for name in tqdm(sorted(iter_images(INPUT_DIR)), desc="Preprocessing"):
        source = os.path.join(INPUT_DIR, name)

        # 0) time window -- decided from the file name, before reading pixels
        inside, stamp = time_in_window(name)
        if inside:
            move_file(source, os.path.join(QUAR_DIR, "time_0600_0620", name))
            log(name, "quarantine", "time_window", "time_0600_0620", stamp)
            continue

        image = cv2.imread(source)
        if image is None:
            log(name, "skip", "read", "read_error", "")
            continue

        # 1) raindrop by SVM
        rainy, confidence = svm_predict_raindrop(image, classifier, feature_model)
        if rainy:
            move_file(source, os.path.join(QUAR_DIR, "raindrops", name))
            log(name, "quarantine", "raindrop_svm", "raindrop",
                f"p={confidence:.3f}")
            continue

        # 2) colour dominance
        dominated, bgr_ratios = colour_dominance(image)
        if dominated:
            move_file(source, os.path.join(QUAR_DIR, "dark_or_tinted", name))
            # Ratios arrive in B,G,R order; reverse them to report R,G,B.
            rgb_text = ",".join(f"{value:.3f}" for value in bgr_ratios[::-1])
            log(name, "quarantine", "colour_dominance", "two_channel_dominance",
                f"R,G,B={rgb_text}")
            continue

        # 3) keep
        move_file(source, os.path.join(OUTPUT_DIR, name))
        log(name, "keep", "none", "", "")

    # Write the report and print a summary of the actions taken.
    report = pd.DataFrame(records,
                          columns=["file", "action", "filter", "reason", "meta"])
    os.makedirs(REPORTS, exist_ok=True)
    report.to_csv(REPORT_CSV, index=False)
    print(f"\nReport written to: {REPORT_CSV}")

    actions = report["action"]
    kept = (actions == "keep").sum()
    quarantined = (actions == "quarantine").sum()
    skipped = (actions == "skip").sum()
    print(f"Kept: {kept}, Quarantined: {quarantined}, Skipped: {skipped}")

# Run the pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Loading embedder from mobilenetv2_embedder.keras


Preprocessing: 100%|██████████| 11746/11746 [38:49<00:00,  5.04it/s] 



Report written to: data/reports/preprocessing_log.csv
Kept: 9156, Quarantined: 2590, Skipped: 0
