In [None]:
# Show which GPU (if any) the runtime exposes, plus driver/CUDA versions
!nvidia-smi

# Install/upgrade Ultralytics (YOLO, any 8.x release) and rich.
# torch is not listed here: Colab usually bundles it already.
!pip -q install --upgrade ultralytics==8.* rich

# Mount Google Drive at /content/drive (the dataset paths used later live under it)
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


Sun Nov 23 18:42:51 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   52C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
import os
import io
import sys
import math
import json
import zipfile
import random
from pathlib import Path
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
from PIL import Image, ImageDraw, ImageFont
import matplotlib.pyplot as plt

# -----------------------------
# CONFIG (edit these as needed)
# -----------------------------
# Working directory; all EDA outputs are written beneath it.
BASE = Path.cwd()
# Optional archives extracted by main() before the analysis.
# None means "nothing to extract" (safe_unzip returns immediately).
IMG_ZIP_1 = None
IMG_ZIP_2 = None
TXT_ZIP   = None

# Dataset root on the mounted Drive.
# For a run without Drive, point it at BASE / "dataset" instead.
DATA_ROOT = Path("/content/drive/MyDrive/Faizan/dataset")

# Images, original labels, and pseudo-labels; main() prefers TXT_pseudo
# whenever it contains at least one .txt file.
IMG_DIR        = DATA_ROOT / "IMG"
LBL_DIR        = DATA_ROOT / "TXT"
PSEUDO_LBL_DIR = DATA_ROOT / "TXT_pseudo"

# Output locations (all created by main()).
OUTDIR         = BASE / "eda_outputs"
VIS_AUDIT_DIR  = OUTDIR / "visual_audit_all"
VIS_STRAT_DIR  = OUTDIR / "visual_audit_by_class"
CROPS_DIR      = OUTDIR / "object_crops"
REPORTS_DIR    = OUTDIR / "reports"

# Sampling sizes: images in the mixed audit, images per class in the
# stratified audit, and crops per class (0 disables crop export).
RANDOM_AUDIT_SAMPLES = 30
CLASS_AUDIT_SAMPLES  = 12
MAX_CROPS_PER_CLASS  = 0

# Valid extensions (compared against the lower-cased file suffix)
IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff"}
LABEL_EXTS = {".txt"}

# Class map (as per your separated labels): class id -> display name
CLASS_MAP = {0: "dry", 1: "water-filled"}
# Class id -> RGB colour used when drawing boxes
CLASS_COLORS = {0: (255, 0, 0), 1: (0, 128, 255)}  # red=dry, blue=water-filled

# Random seed for reproducibility (seeds both `random` and NumPy's global RNG)
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# -----------------------------
# Helpers
# -----------------------------
def ensure_dir(p: Path):
    """Create directory ``p`` together with any missing parents.

    Calling it on a directory that already exists is a no-op.
    """
    p.mkdir(exist_ok=True, parents=True)

def safe_unzip(zip_path: Path, dest_dir: Path):
    """Extract ``zip_path`` into ``dest_dir``, preserving the archive's folder structure.

    Args:
        zip_path: Archive to extract. ``None`` or a non-existent path is a no-op.
        dest_dir: Directory that receives the extracted tree (created on demand).

    Behaviour:
        * macOS artifacts (anything under ``__MACOSX`` and ``.DS_Store`` files)
          are skipped, including their directory entries.
        * Members whose resolved target would fall outside ``dest_dir``
          (``../`` components or absolute names) are skipped with a warning
          instead of being written.
        * File contents are streamed to disk rather than loaded into memory.
    """
    import shutil  # local import: only this helper needs it

    if not zip_path or not zip_path.exists():
        return

    dest_root = Path(dest_dir).resolve()
    with zipfile.ZipFile(zip_path, 'r') as z:
        for zi in z.infolist():
            name = zi.filename

            # Filter artifacts first so their directories are never created.
            base_name = Path(name).name
            if "__MACOSX" in name or base_name.startswith(".DS_Store"):
                continue

            # Path-traversal guard: the resolved target must stay inside dest_root.
            target_path = (dest_root / name).resolve()
            if target_path != dest_root and dest_root not in target_path.parents:
                print(f"[WARN] Skipping unsafe zip entry outside destination: {name}")
                continue

            if name.endswith('/'):
                target_path.mkdir(parents=True, exist_ok=True)
                continue

            target_path.parent.mkdir(parents=True, exist_ok=True)
            with z.open(zi, 'r') as src, open(target_path, 'wb') as dst:
                shutil.copyfileobj(src, dst)

def list_files(root: Path, exts):
    """Recursively collect files under ``root`` whose lower-cased suffix is in ``exts``.

    Paths containing ``__MACOSX`` anywhere are left out. A missing ``root``
    yields an empty list.
    """
    if not root.exists():
        return []
    return [
        candidate
        for candidate in root.rglob("*")
        if candidate.is_file()
        and candidate.suffix.lower() in exts
        and "__MACOSX" not in str(candidate)
    ]

def build_stem_map(paths):
    """Group paths by file stem (name without the final suffix).

    Returns a ``defaultdict(list)`` mapping each stem to the paths that share
    it, in the order they were supplied.
    """
    stem_to_paths = defaultdict(list)
    for path in paths:
        bucket = stem_to_paths[path.stem]
        bucket.append(path)
    return stem_to_paths

def try_read_lines(path: Path):
    """Read a label file and return its lines (without line terminators).

    The file is decoded as UTF-8 first; the ``utf-8-sig`` codec is used so a
    leading byte-order mark is dropped instead of sticking to the first token
    (which would make the first label line unparseable). If the bytes are not
    valid UTF-8, the file is re-read as Latin-1, which accepts any byte.
    """
    try:
        return path.read_text(encoding="utf-8-sig").splitlines()
    except UnicodeDecodeError:
        return path.read_text(encoding="latin-1", errors="ignore").splitlines()

def yolo_line_to_vals(line: str):
    """Turn one YOLO label line into ``(class_id, x, y, w, h)``.

    The line must consist of exactly five whitespace-separated tokens. The
    first is read as a float and truncated to an int; the other four are read
    as floats. Any other shape, or any token that fails conversion, yields
    ``None``.
    """
    tokens = line.split()
    if len(tokens) == 5:
        try:
            class_token, *box_tokens = tokens
            class_id = int(float(class_token))
            xc, yc, bw, bh = (float(tok) for tok in box_tokens)
        except Exception:
            return None
        return class_id, xc, yc, bw, bh
    return None

def draw_labeled_boxes(img_path: Path, rows_df: pd.DataFrame, out_path: Path, class_map, thickness=3):
    """Draw class-coloured boxes with name tags on an image and save the result.

    Args:
        img_path: Image to annotate.
        rows_df: One row per box; reads the columns ``bbox_x1``, ``bbox_y1``,
            ``bbox_x2``, ``bbox_y2`` (pixel corners) and ``class_id``.
        out_path: Destination file; its parent directory is created on demand.
        class_map: Mapping class id -> display name. Unknown ids are shown as
            the id itself.
        thickness: Outline width in pixels.

    Box colours come from the module-level ``CLASS_COLORS`` (yellow for ids
    not listed there). The function is best-effort: if the image cannot be
    opened or decoded, it returns without writing anything.
    """
    try:
        # The context manager releases the file handle; convert() returns a
        # loaded copy that stays usable after the source is closed.
        with Image.open(img_path) as src:
            im = src.convert("RGB")
    except Exception:
        return
    draw = ImageDraw.Draw(im)
    try:
        font = ImageFont.load_default()
    except Exception:
        font = None

    W, H = im.size

    def clamp(v, lo, hi):
        return max(lo, min(hi, v))

    for _, r in rows_df.iterrows():
        xa = clamp(float(r["bbox_x1"]), 0, W - 1)
        ya = clamp(float(r["bbox_y1"]), 0, H - 1)
        xb = clamp(float(r["bbox_x2"]), 0, W - 1)
        yb = clamp(float(r["bbox_y2"]), 0, H - 1)
        # Pillow requires x1 >= x0 and y1 >= y0; order the corners so an
        # inverted box is still drawn instead of raising.
        x1, x2 = min(xa, xb), max(xa, xb)
        y1, y2 = min(ya, yb), max(ya, yb)
        cls = int(r["class_id"])
        color = CLASS_COLORS.get(cls, (255, 255, 0))

        # rectangle
        draw.rectangle([x1, y1, x2, y2], outline=color, width=thickness)

        # label background sized with textbbox (rough estimate as fallback)
        label = class_map.get(cls, str(cls))
        try:
            bbox = draw.textbbox((0, 0), label, font=font)
            tw, th = bbox[2] - bbox[0], bbox[3] - bbox[1]
        except Exception:
            tw, th = (len(label) * 6), 10
        bx2 = x1 + tw + 6
        by2 = y1 + th + 6
        draw.rectangle([x1, y1, bx2, by2], fill=color)
        draw.text((x1 + 3, y1 + 3), label, fill=(255, 255, 255), font=font)

    out_path.parent.mkdir(parents=True, exist_ok=True)
    im.save(out_path)

def save_crop(img_path: Path, box, out_path: Path, pad=2):
    """Save a padded crop of ``box`` from the image at ``img_path``.

    Args:
        img_path: Source image.
        box: ``(x1, y1, x2, y2)`` pixel corners; values are truncated to int.
        out_path: Destination file; its parent directory is created on demand.
        pad: Pixels added on every side before clamping to the image.

    Returns without writing anything if the image cannot be opened or if the
    clamped box has no area.
    """
    try:
        # Close the source file handle as soon as the RGB copy exists.
        with Image.open(img_path) as src:
            im = src.convert("RGB")
    except Exception:
        return
    x1, y1, x2, y2 = map(int, box)
    x1 = max(x1 - pad, 0); y1 = max(y1 - pad, 0)
    # crop() treats right/lower as exclusive, so the valid upper limits are
    # the image width/height themselves; clamping to width-1/height-1 would
    # drop the last pixel column/row for boxes touching the image edge.
    x2 = min(x2 + pad, im.width); y2 = min(y2 + pad, im.height)
    if x2 <= x1 or y2 <= y1:
        return
    crop = im.crop((x1, y1, x2, y2))
    out_path.parent.mkdir(parents=True, exist_ok=True)
    crop.save(out_path)

# -----------------------------
# MAIN EDA
# -----------------------------
def _save_fig(fig_path: Path):
    """Tighten the layout, write the current Matplotlib figure to disk, and close it."""
    plt.tight_layout()
    plt.savefig(fig_path, dpi=120)
    plt.close()


def _plot_hist(values, bins, title, xlabel, ylabel, fig_path: Path):
    """Save a single histogram of ``values`` as its own figure."""
    plt.figure()
    plt.hist(values, bins=bins)
    plt.title(title)
    plt.xlabel(xlabel); plt.ylabel(ylabel)
    _save_fig(fig_path)


def _plot_scatter(xs, ys, title, xlabel, ylabel, fig_path: Path):
    """Save a single small-marker scatter plot as its own figure."""
    plt.figure()
    plt.scatter(xs, ys, s=5, alpha=0.5)
    plt.title(title)
    plt.xlabel(xlabel); plt.ylabel(ylabel)
    _save_fig(fig_path)


def _plot_center_heatmap(df: pd.DataFrame, title, fig_path: Path):
    """Save a 64x64 heatmap of box centres scaled from [0, 1] to a 640x640 canvas.

    Reads ``x_center_norm`` / ``y_center_norm``. The plot uses
    ``origin='lower'``, so y increases upwards on the figure.
    """
    xc = (df["x_center_norm"].values * 640.0).astype(float)
    yc = (df["y_center_norm"].values * 640.0).astype(float)
    heat, _, _ = np.histogram2d(xc, yc, bins=[64, 64], range=[[0, 640], [0, 640]])
    plt.figure()
    plt.imshow(heat.T, origin='lower', extent=[0, 640, 0, 640], aspect='equal')
    plt.title(title)
    plt.xlabel("x (0-640)"); plt.ylabel("y (0-640)")
    _save_fig(fig_path)


def _plot_bbox_stats(df: pd.DataFrame, out_dir: Path, title_tag, file_tag, heat_title, heat_file):
    """Save the five box-geometry figures (w/h histograms, w-vs-h, heatmap, h-vs-y) for ``df``."""
    _plot_hist(df["bbox_w"].values, 40, f"Bounding Box Widths (px) - {title_tag}",
               "Width (px)", "Count", out_dir / f"hist_bbox_widths_{file_tag}.png")
    _plot_hist(df["bbox_h"].values, 40, f"Bounding Box Heights (px) - {title_tag}",
               "Height (px)", "Count", out_dir / f"hist_bbox_heights_{file_tag}.png")
    _plot_scatter(df["bbox_w"].values, df["bbox_h"].values,
                  f"bbox_width vs bbox_height - {title_tag}",
                  "bbox_width (px)", "bbox_height (px)",
                  out_dir / f"scatter_bbox_w_vs_h_{file_tag}.png")
    _plot_center_heatmap(df, heat_title, out_dir / heat_file)
    _plot_scatter(df["bbox_yc"].values, df["bbox_h"].values,
                  f"bbox_height vs y_center (px) - {title_tag}",
                  "y_center (px)", "bbox_height (px)",
                  out_dir / f"scatter_bbox_h_vs_y_{file_tag}.png")


def main():
    """Run the dataset EDA end to end and write all artefacts under ``OUTDIR``.

    Steps:
        0. Pick the label directory (``PSEUDO_LBL_DIR`` if it holds any .txt file,
           otherwise ``LBL_DIR``).
        1. Extract the optional archives.
        2. Collect image and label files and pair them by file stem.
        3. Record image sizes/formats.
        4. Parse YOLO labels into one row per box. Out-of-range checks are
           evaluated on the raw label values *before* any clipping, so the
           quality report can actually flag bad labels.
        5-6. Compute quality counts and summary numbers. An image counts as
           "zero-pothole" when no box was parsed for it, whether its label
           file is missing, empty, or contains no valid line.
        7. Write CSVs.
        8. Write figures.
        9. Write visual audits and optional per-object crops.

    Rows coming from labels that have no matching image keep an empty
    ``image_path`` and use a 1x1 stand-in image size, so their pixel columns
    are in normalised units; they are excluded from audits and crops.
    """
    # Prepare dirs
    for d in [DATA_ROOT, IMG_DIR, LBL_DIR, OUTDIR, VIS_AUDIT_DIR, VIS_STRAT_DIR, CROPS_DIR, REPORTS_DIR]:
        ensure_dir(d)

    # 0) Prefer TXT_pseudo if present & non-empty
    if PSEUDO_LBL_DIR.exists() and any(PSEUDO_LBL_DIR.rglob("*.txt")):
        label_root = PSEUDO_LBL_DIR
    else:
        label_root = LBL_DIR
    print(f"[INFO] Using labels from: {label_root}")

    # 1) Extract (if archives provided)
    safe_unzip(IMG_ZIP_1, IMG_DIR)
    safe_unzip(IMG_ZIP_2, IMG_DIR)
    safe_unzip(TXT_ZIP,   label_root)

    # 2) Collect files
    img_files = list_files(IMG_DIR, IMAGE_EXTS)
    lbl_files = list_files(label_root, LABEL_EXTS)

    print(f"Found {len(img_files)} images, {len(lbl_files)} YOLO label files.")

    img_by_stem = build_stem_map(img_files)
    lbl_by_stem = build_stem_map(lbl_files)

    # 3) Image metadata (first file wins when several share a stem)
    image_meta = []
    for stem, files in img_by_stem.items():
        img_path = files[0]
        fmt = (img_path.suffix or "").lower().strip(".")
        try:
            with Image.open(img_path) as im:
                w, h = im.size
        except Exception as e:
            print(f"[WARN] Could not open image {img_path}: {e}")
            continue
        if w and h:
            image_meta.append({
                "stem": stem,
                "image_path": str(img_path),
                "width": w,
                "height": h,
                "aspect_ratio": w / h,
                "format": fmt
            })
    image_df = pd.DataFrame(image_meta)

    # 4) Parse YOLO labels
    rows = []
    empty_label_files = []
    invalid_label_lines = []
    # Tolerance (in pixels) for the pixel-bounds check. Labels rounded to a
    # few decimals can overshoot by a sub-pixel amount; raise this if that
    # makes the report too noisy.
    eps = 1e-6

    labels_without_images = [str(files[0]) for stem, files in lbl_by_stem.items()
                             if stem not in img_by_stem]
    images_without_labels = [str(files[0]) for stem, files in img_by_stem.items()
                             if stem not in lbl_by_stem]

    for stem, files in lbl_by_stem.items():
        lbl_path = files[0]
        size = lbl_path.stat().st_size if lbl_path.exists() else 0
        if size == 0:
            empty_label_files.append(str(lbl_path))
            continue

        try:
            raw_lines = [ln.strip() for ln in try_read_lines(lbl_path) if ln.strip()]
        except Exception as e:
            print(f"[WARN] Could not read label {lbl_path}: {e}")
            continue

        if not raw_lines:
            # Whitespace-only file: carries no labels, so report it as empty.
            empty_label_files.append(str(lbl_path))
            continue

        # image dims
        if stem in img_by_stem:
            img_path = img_by_stem[stem][0]
            try:
                with Image.open(img_path) as im:
                    W, H = im.size
            except Exception as e:
                print(f"[WARN] Could not open image for labels {lbl_path}: {e}")
                continue
        else:
            # No image: fall back to a 1x1 stand-in so the row is still counted.
            W, H = 1, 1
            img_path = None

        for line in raw_lines:
            vals = yolo_line_to_vals(line)
            if vals is None:
                invalid_label_lines.append({"label_path": str(lbl_path), "line": line})
                continue

            cls, x, y, w, h = vals
            # enforce 0/1 only if using TXT_pseudo; otherwise let it be
            if label_root == PSEUDO_LBL_DIR and cls not in (0, 1):
                print(f"[WARN] Non {0,1} class in TXT_pseudo: {lbl_path} -> got {cls}")
            class_name = CLASS_MAP.get(int(cls), str(int(cls)))

            # pixel conversions
            xc_px = x * W
            yc_px = y * H
            bw_px = max(0.0, w * W)
            bh_px = max(0.0, h * H)
            raw_x1 = xc_px - bw_px / 2
            raw_y1 = yc_px - bh_px / 2
            raw_x2 = xc_px + bw_px / 2
            raw_y2 = yc_px + bh_px / 2

            # Quality flags use the raw values; the stored corners below are
            # clamped for drawing/cropping and would hide every violation.
            norm_out_of_range = not all(0.0 <= v <= 1.0 for v in (x, y, w, h))
            pixel_out_of_bounds = (raw_x1 < -eps or raw_y1 < -eps or
                                   raw_x2 > W + eps or raw_y2 > H + eps)

            x1 = max(0.0, raw_x1)
            y1 = max(0.0, raw_y1)
            x2 = min(float(W - 1), raw_x2)
            y2 = min(float(H - 1), raw_y2)

            rows.append({
                "stem": stem,
                "image_path": str(img_path) if img_path is not None else "",
                "label_path": str(lbl_path),
                "class_id": int(cls),
                "class_name": class_name,
                "x_center_norm": float(np.clip(x, 0.0, 1.0)),
                "y_center_norm": float(np.clip(y, 0.0, 1.0)),
                "w_norm": float(np.clip(w, 0.0, 1.0)),
                "h_norm": float(np.clip(h, 0.0, 1.0)),
                "img_width": int(W), "img_height": int(H),
                "bbox_xc": float(xc_px), "bbox_yc": float(yc_px),
                "bbox_w": float(bw_px), "bbox_h": float(bh_px),
                "bbox_x1": float(x1), "bbox_y1": float(y1), "bbox_x2": float(x2), "bbox_y2": float(y2),
                "norm_out_of_range": norm_out_of_range,
                "pixel_out_of_bounds": pixel_out_of_bounds,
            })

    ann_df = pd.DataFrame(rows)

    # 5) Quality checks
    if not ann_df.empty:
        zero_area = ann_df[(ann_df["bbox_w"] <= 0) | (ann_df["bbox_h"] <= 0)]
        out_of_range = ann_df[ann_df["norm_out_of_range"]]
        out_of_bounds = ann_df[ann_df["pixel_out_of_bounds"]]
    else:
        zero_area = ann_df.iloc[0:0]
        out_of_range = ann_df.iloc[0:0]
        out_of_bounds = ann_df.iloc[0:0]

    quality_issues = {
        "images_without_labels_count": len(images_without_labels),
        "labels_without_images_count": len(labels_without_images),
        "empty_label_files_count": len(empty_label_files),
        "invalid_label_lines_count": len(invalid_label_lines),
        "zero_area_boxes_count": int(zero_area.shape[0]),
        "out_of_range_norm_count": int(out_of_range.shape[0]),
        "pixel_out_of_bounds_count": int(out_of_bounds.shape[0]),
    }

    # 6) Summary numbers
    if not ann_df.empty:
        per_image_counts = ann_df.groupby("stem").size().rename("potholes_per_image").reset_index()
        # Mean/max over stems that have at least one parsed box.
        avg_potholes = per_image_counts["potholes_per_image"].mean()
        max_potholes = per_image_counts["potholes_per_image"].max()
    else:
        per_image_counts = pd.DataFrame(columns=["stem", "potholes_per_image"])
        avg_potholes = 0.0
        max_potholes = 0

    # Every image with no parsed box, regardless of why (missing label file,
    # empty label file, or only invalid lines). Shared with the histogram below.
    zero_stems = sorted(set(img_by_stem) - set(per_image_counts["stem"]))
    num_zero_pothole_images = len(zero_stems)

    # class counts (use named & id)
    class_counter = Counter(ann_df["class_id"].astype(int).tolist()) if not ann_df.empty else Counter()

    summary = {
        "num_images": len(img_files),
        "num_labels": len(lbl_files),
        "num_boxes": int(ann_df.shape[0]) if not ann_df.empty else 0,
        "unique_classes_ids": sorted(list(class_counter.keys())),
        "class_counts_id": dict(class_counter),
        "class_counts_named": {CLASS_MAP.get(k, str(k)): v for k, v in sorted(class_counter.items())},
        "avg_potholes_per_image": float(avg_potholes),
        "max_potholes_in_image": int(max_potholes),
        "num_zero_pothole_images": int(num_zero_pothole_images),
        "labels_source": str(label_root)
    }

    print("\n=== SUMMARY ===")
    for k, v in summary.items():
        print(f"{k}: {v}")
    print("=== QUALITY ISSUES ===")
    for k, v in quality_issues.items():
        print(f"{k}: {v}")

    # Per-image presence of each class & co-occurrence
    per_image_presence = None
    both_types = 0
    if not ann_df.empty:
        grp = ann_df.groupby("stem")["class_id"].agg(lambda x: sorted(set(map(int, x)))).reset_index()
        grp["has_dry"] = grp["class_id"].apply(lambda s: 0 in s)
        grp["has_water_filled"] = grp["class_id"].apply(lambda s: 1 in s)
        grp["has_both"] = grp["has_dry"] & grp["has_water_filled"]
        both_types = int(grp["has_both"].sum())
        per_image_presence = grp
        print(f"Images containing BOTH subtypes: {both_types}")

    # 7) Save CSVs
    ensure_dir(OUTDIR)
    image_df.to_csv(OUTDIR / "image_metadata.csv", index=False)
    ann_df.to_csv(OUTDIR / "annotations_expanded.csv", index=False)
    per_image_counts.to_csv(OUTDIR / "per_image_counts.csv", index=False)
    if per_image_presence is not None and len(per_image_presence):
        per_image_presence.to_csv(OUTDIR / "image_class_presence.csv", index=False)

    pd.DataFrame({"images_without_labels": images_without_labels}).to_csv(OUTDIR / "images_without_labels.csv", index=False)
    pd.DataFrame({"labels_without_images": labels_without_images}).to_csv(OUTDIR / "labels_without_images.csv", index=False)
    pd.DataFrame(empty_label_files, columns=["empty_label_files"]).to_csv(OUTDIR / "empty_label_files.csv", index=False)
    pd.DataFrame(invalid_label_lines).to_csv(OUTDIR / "invalid_label_lines.csv", index=False)

    # Summary tables (nice)
    summary_rows = [
        {"Metric": "Total images", "Value": len(img_files)},
        {"Metric": "Total YOLO label files", "Value": len(lbl_files)},
        {"Metric": "Total bounding boxes", "Value": summary["num_boxes"]},
        {"Metric": "Unique classes (IDs)", "Value": ", ".join(map(str, summary["unique_classes_ids"])) if summary["unique_classes_ids"] else "(none)"},
        {"Metric": "Class counts (named)", "Value": json.dumps(summary["class_counts_named"])},
        {"Metric": "Avg potholes per image", "Value": round(summary["avg_potholes_per_image"], 3)},
        {"Metric": "Max potholes in any image", "Value": summary["max_potholes_in_image"]},
        {"Metric": "Zero-pothole images", "Value": summary["num_zero_pothole_images"]},
        {"Metric": "Labels source", "Value": summary["labels_source"]},
        {"Metric": "Images with both subtypes", "Value": both_types},
    ] + [{"Metric": f"Class {k} ({CLASS_MAP.get(k, k)}) count", "Value": v} for k, v in sorted(class_counter.items())]
    pd.DataFrame(summary_rows).to_csv(OUTDIR / "eda_summary.csv", index=False)

    pd.DataFrame(list(quality_issues.items()), columns=["Quality Check", "Count"]).to_csv(
        OUTDIR / "quality_checks_summary.csv", index=False
    )

    # 8) Visualizations (each in its own figure; default Matplotlib styles)
    if not image_df.empty:
        _plot_hist(image_df["width"].values, 30, "Image Width Distribution",
                   "Width (px)", "Count", OUTDIR / "hist_image_widths.png")
        _plot_hist(image_df["height"].values, 30, "Image Height Distribution",
                   "Height (px)", "Count", OUTDIR / "hist_image_heights.png")
        _plot_hist(image_df["aspect_ratio"].dropna().values, 30,
                   "Image Aspect Ratio Distribution (W/H)",
                   "Aspect Ratio", "Count", OUTDIR / "hist_image_aspect_ratios.png")

        # formats
        fmt_counts = image_df["format"].fillna("unknown").value_counts()
        if len(fmt_counts) > 0:
            plt.figure()
            plt.bar(fmt_counts.index.astype(str), fmt_counts.values)
            plt.title("Image Formats")
            plt.xlabel("Format"); plt.ylabel("Count")
            plt.xticks(rotation=45, ha="right")
            _save_fig(OUTDIR / "bar_image_formats.png")

    # class distribution (named)
    if len(class_counter):
        items = sorted(class_counter.items())
        labels = [f'{CLASS_MAP.get(k, str(k))}\n({k})' for k, _ in items]
        vals = [v for _, v in items]
        plt.figure()
        plt.bar(labels, vals)
        plt.title("Class Distribution (dry vs water-filled)")
        plt.xlabel("Class"); plt.ylabel("Count (boxes)")
        _save_fig(OUTDIR / "bar_class_distribution_named.png")

    # potholes per image (including zero boxes)
    if not per_image_counts.empty or zero_stems:
        zero_df = pd.DataFrame({"stem": zero_stems, "potholes_per_image": 0})
        # Concatenate only non-empty frames so the count column keeps an integer dtype.
        frames = [df for df in (per_image_counts, zero_df) if not df.empty]
        combined_counts = pd.concat(frames, ignore_index=True)
        counts = combined_counts["potholes_per_image"].astype(int)

        # One bin per integer count: edges run from min to max + 1.
        bins = np.arange(counts.min(), counts.max() + 2)
        _plot_hist(counts.values, bins, "Number of Potholes per Image",
                   "Potholes per image", "Count of images",
                   OUTDIR / "hist_potholes_per_image.png")

    # multivariate bbox stats (overall + per class)
    if not ann_df.empty:
        _plot_bbox_stats(ann_df, OUTDIR, "All", "all",
                         "Heatmap of Pothole Centers (All, normalized to 640x640)",
                         "heatmap_pothole_centers_all.png")

        # ---- Per-class analyses (identify & localize diverse types)
        for cls_id, cls_name in CLASS_MAP.items():
            sub = ann_df[ann_df["class_id"] == cls_id]
            if sub.empty:
                continue
            _plot_bbox_stats(sub, OUTDIR, cls_name, cls_name,
                             f"Heatmap of Centers - {cls_name} (normalized to 640x640)",
                             f"heatmap_centers_{cls_name}.png")

    # 9) Visual audits & (optional) crops
    if not ann_df.empty:
        # Only rows backed by a real image can be drawn or cropped.
        drawable = ann_df[ann_df["image_path"] != ""]

        # Mixed random audit
        by_img = drawable.groupby("image_path")
        all_imgs_with_boxes = list(by_img.groups.keys())
        sample_paths = random.sample(all_imgs_with_boxes, min(RANDOM_AUDIT_SAMPLES, len(all_imgs_with_boxes)))
        for spath in sample_paths:
            sub = by_img.get_group(spath)
            out_name = VIS_AUDIT_DIR / (Path(spath).stem + "_viz.png")
            draw_labeled_boxes(Path(spath), sub, out_name, CLASS_MAP, thickness=3)

        # Stratified per-class audit (ensure examples of each class)
        for cls_id, cls_name in CLASS_MAP.items():
            sub_cls = drawable[drawable["class_id"] == cls_id]
            if sub_cls.empty:
                continue
            stems = sub_cls["stem"].unique().tolist()
            random.shuffle(stems)
            stems = stems[: min(CLASS_AUDIT_SAMPLES, len(stems))]
            class_dir = VIS_STRAT_DIR / f"{cls_id}_{cls_name}"
            ensure_dir(class_dir)
            for st in stems:
                img_path = sub_cls[sub_cls["stem"] == st]["image_path"].iloc[0]
                rows_this_img = sub_cls[sub_cls["image_path"] == img_path]
                out_name = class_dir / (Path(img_path).stem + f"_viz_{cls_name}.png")
                draw_labeled_boxes(Path(img_path), rows_this_img, out_name, CLASS_MAP, thickness=3)

        # Optional: per-object crops per class
        if MAX_CROPS_PER_CLASS > 0:
            for cls_id, cls_name in CLASS_MAP.items():
                sub_cls = drawable[drawable["class_id"] == cls_id]
                if sub_cls.empty:
                    continue
                class_crop_dir = CROPS_DIR / f"{cls_id}_{cls_name}"
                ensure_dir(class_crop_dir)
                sample_rows = sub_cls.sample(n=min(MAX_CROPS_PER_CLASS, len(sub_cls)), random_state=RANDOM_SEED)
                for i, r in sample_rows.iterrows():
                    img_path = Path(r["image_path"])
                    box = (r["bbox_x1"], r["bbox_y1"], r["bbox_x2"], r["bbox_y2"])
                    crop_out = class_crop_dir / f"{img_path.stem}_{i}.png"
                    save_crop(img_path, box, crop_out, pad=2)

    print("\nAll figures and CSVs saved to:", OUTDIR)
    print("Visual audit (mixed):", VIS_AUDIT_DIR)
    print("Visual audit (stratified by class):", VIS_STRAT_DIR)
    if MAX_CROPS_PER_CLASS > 0:
        print("Object crops:", CROPS_DIR)

if __name__ == "__main__":
    main()


[INFO] Using labels from: /content/drive/MyDrive/Faizan/dataset/TXT_pseudo
Found 713 images, 713 YOLO label files.

=== SUMMARY ===
num_images: 713
num_labels: 713
num_boxes: 1157
unique_classes_ids: [0, 1]
class_counts_id: {0: 656, 1: 501}
class_counts_named: {'dry': 656, 'water-filled': 501}
avg_potholes_per_image: 1.6227208976157084
max_potholes_in_image: 16
num_zero_pothole_images: 0
labels_source: /content/drive/MyDrive/Faizan/dataset/TXT_pseudo
=== QUALITY ISSUES ===
images_without_labels_count: 0
labels_without_images_count: 0
empty_label_files_count: 0
invalid_label_lines_count: 0
zero_area_boxes_count: 0
out_of_range_norm_count: 0
pixel_out_of_bounds_count: 0
Images containing BOTH subtypes: 0

All figures and CSVs saved to: /content/eda_outputs
Visual audit (mixed): /content/eda_outputs/visual_audit_all
Visual audit (stratified by class): /content/eda_outputs/visual_audit_by_class
