In [68]:
from dotenv import load_dotenv
load_dotenv()

import os, json, glob
from tqdm import tqdm
import numpy as np
from PIL import Image, ImageDraw

### **Data preprocessing**

In [None]:
# Functions to remove unwanted labels from annotation JSON files.
DROP_LABELS = {'Bearing', 'EJoint', 'Drainage', 'PEquipment', 'JTape', 'WConccor'}

def clean_annotation(in_json_path: str, out_json_path: str, drop=DROP_LABELS) -> int:
    """Removes shapes whose 'label' is in DROP_LABELS. Returns # removed."""
    with open(in_json_path, "r", encoding="utf-8") as f:
        ann = json.load(f)

    shapes = ann.get("shapes", [])
    kept = [s for s in shapes if s.get("label") not in drop]
    removed = len(shapes) - len(kept)
    ann["shapes"] = kept

    os.makedirs(os.path.dirname(out_json_path), exist_ok=True)
    with open(out_json_path, "w", encoding="utf-8") as f:
        json.dump(ann, f, ensure_ascii=False, indent=2)

    return removed

def batch_clean_annotation(in_ann_dir: str, out_ann_dir: str, pattern="*.json") -> None:
    jsons = glob.glob(os.path.join(in_ann_dir, pattern))
    total_removed = 0
    for jp in tqdm(jsons, desc="Cleaning annotations", total=len(jsons)):
        rel = os.path.relpath(jp, in_ann_dir)
        outp = os.path.join(out_ann_dir, rel)
        total_removed += clean_annotation(jp, outp)
    print(f"Done. Processed {len(jsons)} files. Removed {total_removed} shapes total.")

In [None]:
# Clean dataset annotations
pattern = "*.json"

# Train
in_ann_dir = "/Users/diegovelasco/Desktop/Diego/FING/DLFCV-FinalProject/data/dacl10k_v2_devphase/annotations/train"
out_ann_dir = "/Users/diegovelasco/Desktop/Diego/FING/DLFCV-FinalProject/data_clean/annotations/train"
batch_clean_annotation(in_ann_dir, out_ann_dir, pattern)

# Validation
in_ann_dir = "/Users/diegovelasco/Desktop/Diego/FING/DLFCV-FinalProject/data/dacl10k_v2_devphase/annotations/validation"
out_ann_dir = "/Users/diegovelasco/Desktop/Diego/FING/DLFCV-FinalProject/data_clean/annotations/validation"
batch_clean_annotation(in_ann_dir, out_ann_dir, pattern)

Cleaning annotations: 100%|██████████| 6935/6935 [00:02<00:00, 2351.91it/s]


Done. Processed 6935 files. Removed 5859 shapes total.


Cleaning annotations: 100%|██████████| 975/975 [00:00<00:00, 2085.62it/s]

Done. Processed 975 files. Removed 885 shapes total.





In [None]:
# Function to draw annotations and compute overlaps
def compute_overlap(img_path, ann_path, classes):
    w, h = Image.open(img_path).size
    # one boolean mask per class
    planes = np.zeros((len(classes), h, w), dtype=bool)

    with open(ann_path, "r", encoding="utf-8") as f:
        ann = json.load(f)

    # Loop over shapes (one shape is one mask)
    for shp in ann.get("shapes", []):
        label = shp.get("label", "")

        # Skip unwanted labels (should not be present after cleaning)
        if label not in classes:
            continue

        # Get points for each shape (mask)
        idx = classes.index(label)
        pts = shp.get("points", [])

        # Skip invalid polygons if they exist
        if len(pts) < 3:
            continue

        # Generate mask for this polygon, draw it to a temporary image, and add it to the corresponding plane (accumulating all masks for that class)
        poly = [(float(x), float(y)) for x, y in pts]
        m = Image.new("1", (w, h), 0)
        ImageDraw.Draw(m).polygon(poly, outline=1, fill=1)
        planes[idx] |= np.array(m, dtype=bool)

    # pixels covered by >= 2 classes
    overlap = planes.sum(axis=0) >= 2
    return planes, overlap

# Relevant directories and classes
IMAGES_DIR = "/Users/diegovelasco/Desktop/Diego/FING/DLFCV-FinalProject/data/dacl10k_v2_devphase/images/train"
ANNS_DIR   = "/Users/diegovelasco/Desktop/Diego/FING/DLFCV-FinalProject/data_clean/annotations/train"
DEFECTS = ['Crack', 'ACrack', 'Wetspot', 'Efflorescence', 'Rust', 'Rockpocket', 'Hollowareas', 'Cavity', 'Spalling', 'Graffiti', 'Weathering', 'Restformwork', 'ExposedRebars']

# Get annotations
jsons = sorted(glob.glob(os.path.join(ANNS_DIR, "*.json")))
n_imgs = 0
n_overlap_imgs = 0
overlap_pixels_total = 0

# Compute overlaps
for jp in tqdm(jsons, desc="Computing overlaps", total=len(jsons)):
    stem = os.path.splitext(os.path.basename(jp))[0]
    ip = os.path.join(IMAGES_DIR, f"{stem}.jpg")
    if not os.path.exists(ip):
        continue
    n_imgs += 1
    _, ov = compute_overlap(ip, jp, DEFECTS)
    n = int(ov.sum()) # number of overlapping pixels
    if n > 0:
        n_overlap_imgs += 1
        overlap_pixels_total += n

print(f"Processed images: {n_imgs}")
print(f"Images with overlaps: {n_overlap_imgs} ({100.0*n_overlap_imgs/max(1,n_imgs):.1f}%)")
print(f"Total overlapping pixels: {overlap_pixels_total}")

Computing overlaps: 100%|██████████| 6935/6935 [04:23<00:00, 26.32it/s] 

Processed images: 6935
Images with overlaps: 4042 (58.3%)
Total overlapping pixels: 915524988





**Avg DACL10K image:** $1920×1440 ≈ 2.764.800$ pixels
**Dataset size:** $ 6935$ images  
**Total pixels:** $2.764.800 \times 6935 = 19173888000$ pixels  

**Ratio:** $915.524.988 / 19173888000 ≈ 0.0478 → ~4.8\%$

Because the dataset annotations are multilabel and overlap in ~5% of pixels, we collapse overlaps to a single class using a fixed priority hierarchy. This enables consistent multiclass training and evaluation for U-Net, SegFormer, and Mask2Former. A multilabel experiment will be explored separately.

In [78]:
# Highest priority first (wins on overlap)
PRIORITY = [
    'Crack', 'ExposedRebars', 'Spalling', 'Rust', 'ACrack',
    'Rockpocket', 'Hollowareas', 'Efflorescence', 'Cavity',
    'Wetspot', 'Weathering', 'Restformwork', 'Graffiti',
]

PRIORITY_DICT = {cls: i+1 for i, cls in enumerate(PRIORITY)}  # 0 = background

def rasterize_polygon_mask(w, h, pts):
    m = Image.new('1', (w, h), 0)                   # 1-bit mask
    ImageDraw.Draw(m).polygon([(float(x), float(y)) for x,y in pts],
                              outline=1, fill=1)
    return np.array(m, dtype=bool)

def labelme_to_multiclass_png(json_path, out_png_path):
    with open(json_path, 'r', encoding='utf-8') as f:
        ann = json.load(f)

    # image size
    w, h = int(ann["imageWidth"]), int(ann["imageHeight"])

    # per-class boolean planes (K,H,W)
    planes = {cls: np.zeros((h, w), dtype=bool) for cls in PRIORITY}

    # accumulate polygons into planes
    for shp in ann.get("shapes", []):
        label = shp.get("label", "")
        pts = shp.get("points", [])
        if label in planes and len(pts) >= 3:
            planes[label] |= rasterize_polygon_mask(w, h, pts)

    # collapse to single-channel mask (uint8), background = 0
    Y = np.zeros((h, w), dtype=np.uint8)
    # iterate from LOWEST → HIGHEST so highest priority overwrites last
    for cls in reversed(PRIORITY):
        Y[planes[cls]] = PRIORITY_DICT[cls]

    os.makedirs(os.path.dirname(out_png_path), exist_ok=True)
    Image.fromarray(Y, mode='L').save(out_png_path)   # grayscale PNG

def batch_convert_labelme_to_png(ann_dir, out_dir, pattern="*.json"):
    paths = sorted(glob.glob(os.path.join(ann_dir, pattern)))
    for jp in tqdm(paths, desc="Converting to PNG", total=len(paths)):
        stem = os.path.splitext(os.path.basename(jp))[0]
        outp = os.path.join(out_dir, f"{stem}.png")
        labelme_to_multiclass_png(jp, outp)
    print(f"Done. Wrote {len(paths)} masks to: {out_dir}")


In [79]:
# Train
in_ann_dir = "/Users/diegovelasco/Desktop/Diego/FING/DLFCV-FinalProject/data_clean/annotations/train"
out_mask_dir = "/Users/diegovelasco/Desktop/Diego/FING/DLFCV-FinalProject/data_clean/masks/train"
batch_convert_labelme_to_png(in_ann_dir, out_mask_dir)

# Validation
in_ann_dir = "/Users/diegovelasco/Desktop/Diego/FING/DLFCV-FinalProject/data_clean/annotations/validation"
out_ann_dir = "/Users/diegovelasco/Desktop/Diego/FING/DLFCV-FinalProject/data_clean/masks/validation"
batch_clean_annotation(in_ann_dir, out_ann_dir, pattern)

Converting to PNG: 100%|██████████| 6935/6935 [02:17<00:00, 50.27it/s] 


Done. Wrote 6935 masks to: /Users/diegovelasco/Desktop/Diego/FING/DLFCV-FinalProject/data_clean/masks/train


Cleaning annotations: 100%|██████████| 975/975 [00:00<00:00, 2939.26it/s]

Done. Processed 975 files. Removed 0 shapes total.



