In [1]:
from dotenv import load_dotenv
load_dotenv()

import os, json, glob
from tqdm import tqdm
import numpy as np
from PIL import Image, ImageDraw
import cv2
import random
import shutil

### **Data preprocessing**

In [None]:
# Functions to remove unwanted labels from annotation JSON files.
DROP_LABELS = {'Bearing', 'EJoint', 'Drainage', 'PEquipment', 'JTape', 'WConccor'}

def clean_annotation(in_json_path: str, out_json_path: str, drop=DROP_LABELS) -> int:
    """Removes shapes whose 'label' is in DROP_LABELS. Returns # removed."""
    with open(in_json_path, "r", encoding="utf-8") as f:
        ann = json.load(f)

    shapes = ann.get("shapes", [])
    kept = [s for s in shapes if s.get("label") not in drop]
    removed = len(shapes) - len(kept)
    ann["shapes"] = kept

    os.makedirs(os.path.dirname(out_json_path), exist_ok=True)
    with open(out_json_path, "w", encoding="utf-8") as f:
        json.dump(ann, f, ensure_ascii=False, indent=2)

    return removed

def batch_clean_annotation(in_ann_dir: str, out_ann_dir: str, pattern="*.json") -> None:
    jsons = glob.glob(os.path.join(in_ann_dir, pattern))
    total_removed = 0
    for jp in tqdm(jsons, desc="Cleaning annotations", total=len(jsons)):
        rel = os.path.relpath(jp, in_ann_dir)
        outp = os.path.join(out_ann_dir, rel)
        total_removed += clean_annotation(jp, outp)
    print(f"Done. Processed {len(jsons)} files. Removed {total_removed} shapes total.")

In [None]:
# Clean dataset annotations
pattern = "*.json"

# Train
in_ann_dir = "/Users/diegovelasco/Desktop/Diego/FING/DLFCV-FinalProject/data/annotations/train"
out_ann_dir = "/Users/diegovelasco/Desktop/Diego/FING/DLFCV-FinalProject/data_clean/annotations/train"
batch_clean_annotation(in_ann_dir, out_ann_dir, pattern)

# Validation
in_ann_dir = "/Users/diegovelasco/Desktop/Diego/FING/DLFCV-FinalProject/data/annotations/test"
out_ann_dir = "/Users/diegovelasco/Desktop/Diego/FING/DLFCV-FinalProject/data_clean/annotations/test"
batch_clean_annotation(in_ann_dir, out_ann_dir, pattern)

Cleaning annotations: 100%|██████████| 6935/6935 [00:02<00:00, 2351.91it/s]


Done. Processed 6935 files. Removed 5859 shapes total.


Cleaning annotations: 100%|██████████| 975/975 [00:00<00:00, 2085.62it/s]

Done. Processed 975 files. Removed 885 shapes total.





In [None]:
# Function to draw annotations and compute overlaps
def compute_overlap(img_path, ann_path, classes):
    w, h = Image.open(img_path).size
    # one boolean mask per class
    planes = np.zeros((len(classes), h, w), dtype=bool)

    with open(ann_path, "r", encoding="utf-8") as f:
        ann = json.load(f)

    # Loop over shapes (one shape is one mask)
    for shp in ann.get("shapes", []):
        label = shp.get("label", "")

        # Skip unwanted labels (should not be present after cleaning)
        if label not in classes:
            continue

        # Get points for each shape (mask)
        idx = classes.index(label)
        pts = shp.get("points", [])

        # Skip invalid polygons if they exist
        if len(pts) < 3:
            continue

        # Generate mask for this polygon, draw it to a temporary image, and add it to the corresponding plane (accumulating all masks for that class)
        poly = [(float(x), float(y)) for x, y in pts]
        m = Image.new("1", (w, h), 0)
        ImageDraw.Draw(m).polygon(poly, outline=1, fill=1)
        planes[idx] |= np.array(m, dtype=bool)

    # pixels covered by >= 2 classes
    overlap = planes.sum(axis=0) >= 2
    return planes, overlap

# Relevant directories and classes
IMAGES_DIR = "/Users/diegovelasco/Desktop/Diego/FING/DLFCV-FinalProject/data/images/train"
ANNS_DIR   = "/Users/diegovelasco/Desktop/Diego/FING/DLFCV-FinalProject/data_clean/annotations/train"
DEFECTS = ['Crack', 'ACrack', 'Wetspot', 'Efflorescence', 'Rust', 'Rockpocket', 'Hollowareas', 'Cavity', 'Spalling', 'Graffiti', 'Weathering', 'Restformwork', 'ExposedRebars']

# Get annotations
jsons = sorted(glob.glob(os.path.join(ANNS_DIR, "*.json")))
n_imgs = 0
n_overlap_imgs = 0
overlap_pixels_total = 0

# Compute overlaps
for jp in tqdm(jsons, desc="Computing overlaps", total=len(jsons)):
    stem = os.path.splitext(os.path.basename(jp))[0]
    ip = os.path.join(IMAGES_DIR, f"{stem}.jpg")
    if not os.path.exists(ip):
        continue
    n_imgs += 1
    _, ov = compute_overlap(ip, jp, DEFECTS)
    n = int(ov.sum()) # number of overlapping pixels
    if n > 0:
        n_overlap_imgs += 1
        overlap_pixels_total += n

print(f"Processed images: {n_imgs}")
print(f"Images with overlaps: {n_overlap_imgs} ({100.0*n_overlap_imgs/max(1,n_imgs):.1f}%)")
print(f"Total overlapping pixels: {overlap_pixels_total}")

Computing overlaps: 100%|██████████| 6935/6935 [04:23<00:00, 26.32it/s] 

Processed images: 6935
Images with overlaps: 4042 (58.3%)
Total overlapping pixels: 915524988





In [None]:
# Count total number of pixels to compute ratio
path = "data/images/train"
img_paths = [os.path.join(folder, file) for folder, _, files in os.walk(path) for file in files]
img_shapes = [cv2.imread(img_path).shape for img_path in tqdm(img_paths)]
img_pixels = [shape[0]*shape[1] for shape in tqdm(img_shapes)]

print (f"Total pixels in dataset: {sum(img_pixels)}")
print (f"Ratio of overlapping pixels: {overlap_pixels_total / sum(img_pixels):.4f}")

100%|██████████| 6935/6935 [01:01<00:00, 113.57it/s]
100%|██████████| 6935/6935 [00:00<00:00, 6706824.59it/s]

Total pixels in dataset: 29073367793
Ratio of overlapping pixels: 0.0315





Because the dataset annotations are multilabel and overlap in ~3% of pixels, we collapse overlaps to a single class using a fixed priority hierarchy. This enables consistent multiclass training and evaluation for U-Net, SegFormer, and Mask2Former. A multilabel experiment will be explored separately.

In [78]:
# Highest priority first (wins on overlap)
PRIORITY = [
    'Crack', 'ExposedRebars', 'Spalling', 'Rust', 'ACrack',
    'Rockpocket', 'Hollowareas', 'Efflorescence', 'Cavity',
    'Wetspot', 'Weathering', 'Restformwork', 'Graffiti',
]

PRIORITY_DICT = {cls: i+1 for i, cls in enumerate(PRIORITY)}  # 0 = background

def rasterize_polygon_mask(w, h, pts):
    m = Image.new('1', (w, h), 0)                   # 1-bit mask
    ImageDraw.Draw(m).polygon([(float(x), float(y)) for x,y in pts],
                              outline=1, fill=1)
    return np.array(m, dtype=bool)

def labelme_to_multiclass_png(json_path, out_png_path):
    with open(json_path, 'r', encoding='utf-8') as f:
        ann = json.load(f)

    # image size
    w, h = int(ann["imageWidth"]), int(ann["imageHeight"])

    # per-class boolean planes (K,H,W)
    planes = {cls: np.zeros((h, w), dtype=bool) for cls in PRIORITY}

    # accumulate polygons into planes
    for shp in ann.get("shapes", []):
        label = shp.get("label", "")
        pts = shp.get("points", [])
        if label in planes and len(pts) >= 3:
            planes[label] |= rasterize_polygon_mask(w, h, pts)

    # collapse to single-channel mask (uint8), background = 0
    Y = np.zeros((h, w), dtype=np.uint8)
    # iterate from LOWEST → HIGHEST so highest priority overwrites last
    for cls in reversed(PRIORITY):
        Y[planes[cls]] = PRIORITY_DICT[cls]

    os.makedirs(os.path.dirname(out_png_path), exist_ok=True)
    Image.fromarray(Y, mode='L').save(out_png_path)   # grayscale PNG

def batch_convert_labelme_to_png(ann_dir, out_dir, pattern="*.json"):
    paths = sorted(glob.glob(os.path.join(ann_dir, pattern)))
    for jp in tqdm(paths, desc="Converting to PNG", total=len(paths)):
        stem = os.path.splitext(os.path.basename(jp))[0]
        outp = os.path.join(out_dir, f"{stem}.png")
        labelme_to_multiclass_png(jp, outp)
    print(f"Done. Wrote {len(paths)} masks to: {out_dir}")


In [None]:
# Train
in_ann_dir = "/Users/diegovelasco/Desktop/Diego/FING/DLFCV-FinalProject/data_clean/annotations/train"
out_mask_dir = "/Users/diegovelasco/Desktop/Diego/FING/DLFCV-FinalProject/data_clean/masks/train"
batch_convert_labelme_to_png(in_ann_dir, out_mask_dir)

# Test
in_ann_dir = "/Users/diegovelasco/Desktop/Diego/FING/DLFCV-FinalProject/data_clean/annotations/test"
out_ann_dir = "/Users/diegovelasco/Desktop/Diego/FING/DLFCV-FinalProject/data_clean/masks/test"
batch_clean_annotation(in_ann_dir, out_ann_dir, pattern)

Converting to PNG: 100%|██████████| 6935/6935 [02:17<00:00, 50.27it/s] 


Done. Wrote 6935 masks to: /Users/diegovelasco/Desktop/Diego/FING/DLFCV-FinalProject/data_clean/masks/train


Cleaning annotations: 100%|██████████| 975/975 [00:00<00:00, 2939.26it/s]

Done. Processed 975 files. Removed 0 shapes total.





In [2]:
# Count occurrences of each label
LABELS = [
    'Crack', 'ExposedRebars', 'Spalling', 'Rust', 'ACrack',
    'Rockpocket', 'Hollowareas', 'Efflorescence', 'Cavity',
    'Wetspot', 'Weathering', 'Restformwork', 'Graffiti',
]

# Function to count labels in annotations
def count_labels(ann_paths, LABELS):

    # Initialize counts
    label_counts = {label: 0 for label in LABELS}

    # Loop over annotations and count labels
    for ann_path in tqdm(ann_paths, desc="Counting labels", total=len(ann_paths)):
        with open(ann_path, 'r', encoding='utf-8') as f:
            ann = json.load(f)
        labels_in_img = set()
        for shp in ann.get("shapes", []):
            label = shp.get("label", "")
            if label in LABELS:
                labels_in_img.add(label)
        for label in labels_in_img:
            label_counts[label] += 1
    
    return label_counts

In [3]:
# Train
ann_dir_train = "/Users/diegovelasco/Desktop/Diego/FING/DLFCV-FinalProject/data_clean/annotations/train"

# Get all annotation paths
ann_paths_train = [os.path.join(ann_dir_train, sp) for sp in os.listdir(ann_dir_train)]
label_counts_train = count_labels(ann_paths_train, LABELS)
print("Label counts in training set:")
for label, count in label_counts_train.items():
    print(f"  {label}: {count}")

Counting labels: 100%|██████████| 6935/6935 [00:00<00:00, 11014.28it/s]

Label counts in training set:
  Crack: 1727
  ExposedRebars: 773
  Spalling: 3262
  Rust: 3450
  ACrack: 332
  Rockpocket: 311
  Hollowareas: 1097
  Efflorescence: 1523
  Cavity: 1188
  Wetspot: 965
  Weathering: 2704
  Restformwork: 713
  Graffiti: 795





In [4]:
# Test
ann_dir_test = "/Users/diegovelasco/Desktop/Diego/FING/DLFCV-FinalProject/data_clean/annotations/test"

# Get all annotation paths
ann_paths_test = [os.path.join(ann_dir_test, sp) for sp in os.listdir(ann_dir_test)]
label_counts_test = count_labels(ann_paths_test, LABELS)
print("Label counts in validation set:")
for label, count in label_counts_test.items():
    print(f"  {label}: {count}")


Counting labels: 100%|██████████| 975/975 [00:00<00:00, 8656.21it/s]

Label counts in validation set:
  Crack: 254
  ExposedRebars: 104
  Spalling: 477
  Rust: 465
  ACrack: 42
  Rockpocket: 52
  Hollowareas: 155
  Efflorescence: 206
  Cavity: 167
  Wetspot: 144
  Weathering: 392
  Restformwork: 133
  Graffiti: 146





In [5]:
# Compute ratios of validation to training counts
ratios = {
    label: label_counts_test[label] / max(1, label_counts_train[label])
    for label in LABELS
}

# Show ratios. Split will be done so that validation set has similar distribution.
for label, ratio in ratios.items():
    print(f"{label}: {ratio:.4f}")

Crack: 0.1471
ExposedRebars: 0.1345
Spalling: 0.1462
Rust: 0.1348
ACrack: 0.1265
Rockpocket: 0.1672
Hollowareas: 0.1413
Efflorescence: 0.1353
Cavity: 0.1406
Wetspot: 0.1492
Weathering: 0.1450
Restformwork: 0.1865
Graffiti: 0.1836


In [6]:
# Perform splitting
MIN = 0.13 # min ratio of val to train samples for each class
r = 0.15 # split ratio
n_val = int(len(ann_paths_train) * r)

# Random Split with similar distribution
success = False
while not success:
    # Split randomly
    ann_paths_val = random.sample(ann_paths_train, n_val)
    ann_paths_new_train = [ap for ap in ann_paths_train if ap not in ann_paths_val]

    # Count labels
    label_counts_val = count_labels(ann_paths_val, LABELS)
    label_counts_new_train = count_labels(ann_paths_new_train, LABELS)

    # Compute ratios
    ratios = {
    label: label_counts_val[label] / max(1, label_counts_new_train[label])
    for label in LABELS
    }

    # Check if all ratios are above MIN
    success = all(ratio >= MIN for ratio in ratios.values())

    if success:
        print ("\nRatios for validation to new training set:")
        for label, ratio in ratios.items():
            print(f"  {label}: {ratio:.4f}")

print("\nFinal label counts in new training set:")
for label, count in label_counts_new_train.items():
    print(f"  {label}: {count}")

Counting labels: 100%|██████████| 1040/1040 [00:00<00:00, 23116.71it/s]
Counting labels: 100%|██████████| 5895/5895 [00:00<00:00, 24325.34it/s]


Ratios for validation to new training set:
  Crack: 0.1788
  ExposedRebars: 0.1856
  Spalling: 0.1763
  Rust: 0.1739
  ACrack: 0.2029
  Rockpocket: 0.1604
  Hollowareas: 0.1645
  Efflorescence: 0.1724
  Cavity: 0.1797
  Wetspot: 0.1812
  Weathering: 0.1839
  Restformwork: 0.1593
  Graffiti: 0.1691

Final label counts in new training set:
  Crack: 1465
  ExposedRebars: 652
  Spalling: 2773
  Rust: 2939
  ACrack: 276
  Rockpocket: 268
  Hollowareas: 942
  Efflorescence: 1299
  Cavity: 1007
  Wetspot: 817
  Weathering: 2284
  Restformwork: 615
  Graffiti: 680





In [15]:
# Split images, annotaitions, and masks into new train/val sets
train_samples = [os.path.splitext(os.path.basename(p))[0] for p in ann_paths_new_train]
val_samples = [os.path.splitext(os.path.basename(p))[0] for p in ann_paths_val]

# Create validation directories
ann_dir_val = "/Users/diegovelasco/Desktop/Diego/FING/DLFCV-FinalProject/data_clean/annotations/validation"
mask_dir_val = "/Users/diegovelasco/Desktop/Diego/FING/DLFCV-FinalProject/data_clean/masks/validation"
img_dir_val = "/Users/diegovelasco/Desktop/Diego/FING/DLFCV-FinalProject/data/images/validation"
for dir in [ann_dir_val, mask_dir_val, img_dir_val]:
    os.makedirs(dir, exist_ok=True)

# Train directories
ann_dir_train = "/Users/diegovelasco/Desktop/Diego/FING/DLFCV-FinalProject/data_clean/annotations/train"
mask_dir_train = "/Users/diegovelasco/Desktop/Diego/FING/DLFCV-FinalProject/data_clean/masks/train"
img_dir_train = "/Users/diegovelasco/Desktop/Diego/FING/DLFCV-FinalProject/data/images/train"

# Move validation samples to validation directories
for sample in val_samples:

    # Annotations
    src_ann = os.path.join(ann_dir_train, f"{sample}.json")
    dst_ann = os.path.join(ann_dir_val, f"{sample}.json")
    if os.path.exists(src_ann):
        shutil.move(src_ann, dst_ann)

    # Masks
    src_mask = os.path.join(mask_dir_train, f"{sample}.png")
    dst_mask = os.path.join(mask_dir_val, f"{sample}.png")
    if os.path.exists(src_mask):
        shutil.move(src_mask, dst_mask)

    # Images
    src_img = os.path.join(img_dir_train, f"{sample}.jpg")
    dst_img = os.path.join(img_dir_val, f"{sample}.jpg")
    if os.path.exists(src_img):
        shutil.move(src_img, dst_img)

# Integrity checks
ann_paths_train = [os.path.join(ann_dir_train, sp) for sp in os.listdir(ann_dir_train)]
mask_paths_train = [os.path.join(mask_dir_train, sp) for sp in os.listdir(mask_dir_train)]
img_paths_train = [os.path.join(img_dir_train, sp) for sp in os.listdir(img_dir_train)]

ann_paths_val = [os.path.join(ann_dir_val, sp) for sp in os.listdir(ann_dir_val)]
mask_paths_val = [os.path.join(mask_dir_val, sp) for sp in os.listdir(mask_dir_val)]
img_paths_val = [os.path.join(img_dir_val, sp) for sp in os.listdir(img_dir_val)]

# Check lengths
assert len(ann_paths_train) == len(mask_paths_train) == len(img_paths_train)

# Check that all samples in train set are correct
assert all(os.path.splitext(os.path.basename(p))[0] in train_samples for p in ann_paths_train)
assert all(os.path.splitext(os.path.basename(p))[0] in train_samples for p in mask_paths_train)
assert all(os.path.splitext(os.path.basename(p))[0] in train_samples for p in img_paths_train)

# Check that all samples in validation set are correct
assert all(os.path.splitext(os.path.basename(p))[0] in val_samples for p in ann_paths_val)
assert all(os.path.splitext(os.path.basename(p))[0] in val_samples for p in mask_paths_val)
assert all(os.path.splitext(os.path.basename(p))[0] in val_samples for p in img_paths_val)

In [23]:
# Adjust names for consistency
def adjust_filename(path, old_str, new_str):
    folder = os.path.dirname(path)
    old_name = os.path.basename(path)
    new_name = old_name.replace(old_str, new_str)
    new_path = os.path.join(folder, new_name)
    os.rename(path, new_path)

# Validation
for ann_path, mask_path, img_path in zip(ann_paths_val, mask_paths_val, img_paths_val):
    adjust_filename(ann_path, "train", "validation")
    adjust_filename(mask_path, "train", "validation")
    adjust_filename(img_path, "train", "validation")

# Test
ann_dir_test = "/Users/diegovelasco/Desktop/Diego/FING/DLFCV-FinalProject/data_clean/annotations/test"
mask_dir_test = "/Users/diegovelasco/Desktop/Diego/FING/DLFCV-FinalProject/data_clean/masks/test"
img_dir_test = "/Users/diegovelasco/Desktop/Diego/FING/DLFCV-FinalProject/data/images/test"

ann_paths_test = [os.path.join(ann_dir_test, sp) for sp in os.listdir(ann_dir_test)]
mask_paths_test = [os.path.join(mask_dir_test, sp) for sp in os.listdir(mask_dir_test)]
img_paths_test = [os.path.join(img_dir_test, sp) for sp in os.listdir(img_dir_test)]

for ann_path, mask_path, img_path in zip(ann_paths_test, mask_paths_test, img_paths_test):
    adjust_filename(ann_path, "validation", "test")
    adjust_filename(mask_path, "validation", "test")
    adjust_filename(img_path, "validation", "test")