In [1]:
import os, glob, re, numpy as np, pandas as pd, laspy, h5py
from shapely.geometry import Point, box

# === 1. Paths ===
input_folder = "C:/Users/umair.muhammad/Documents/PhD/Research Work/FedLearn/training/site1/Site1 las"   # folder containing .las/.laz and .csv files
# The H5 output is written into the input folder. Only the file name is joined
# here: passing a second absolute path to os.path.join discards the first
# argument on Windows and yields a nested, invalid path on other platforms.
output_h5 = os.path.join(input_folder, "road_damage_binary_labels.h5")

# === 2. Collect all CSV–LAS pairs and sort by site number ===
# glob returns the matches in arbitrary order; they are sorted further below.
csv_files = glob.glob(os.path.join(input_folder, "*.csv"))

# Sort CSV files by site and file number (e.g., Site1_0, Site1_1, ...)
def sort_key(filepath):
    """Return a ``(site_number, file_number)`` tuple used to order CSV paths.

    The file name (directory and extension stripped) must start with
    ``Site<digits>_<digits>``; anything after that prefix is ignored.
    Names that do not follow the pattern get ``(9999, 9999)`` so they
    sort after the regular files.
    """
    stem, _extension = os.path.splitext(os.path.basename(filepath))
    found = re.match(r'Site(\d+)_(\d+)', stem)
    if found is None:
        # Put files that don't match the pattern at the end
        return (9999, 9999)
    site_num, file_num = (int(digits) for digits in found.groups())
    return (site_num, file_num)

# Order the CSV paths by (site number, file number) as computed by sort_key.
csv_files = sorted(csv_files, key=sort_key)

# Index every .las/.laz file in the input folder by its file name without
# directory or extension, so a CSV can look up its point cloud by base name.
las_files = {}
for las_path in glob.glob(os.path.join(input_folder, "*.la[sz]")):
    las_stem = os.path.splitext(os.path.basename(las_path))[0]
    las_files[las_stem] = las_path

# === 3. Process all pairs ===
# For each CSV with a matching LAS/LAZ file: rows whose "Name" contains a
# damage-type tag (_P, _C or _E) are read two at a time as opposite corners of
# an axis-aligned box, and every point of the cloud inside a box gets label 1.
Z_TOLERANCE = 0.05  # padding applied below/above each box along Z (in the units of the LAS z values)

all_points, all_labels = [], []
total_damage_boxes_processed = 0  # Counter for all damage boxes across files

for csv_file in csv_files:
    # A CSV may carry a "_clean" marker in its name; the LAS/LAZ lookup key does not.
    base = os.path.splitext(os.path.basename(csv_file))[0].replace("_clean", "")
    if base not in las_files:
        print(f"⚠️ Missing LAS/LAZ found for {base}")
        continue

    print(f"🔹 Processing {base}...")
    las = laspy.read(las_files[base])
    pts = np.vstack((las.x, las.y, las.z)).T  # one row per point: x, y, z
    labels = np.zeros(len(pts), dtype=np.uint8)  # 0 = no damage, 1 = damage

    df = pd.read_csv(csv_file)

    # --- Filter for all types (P, C, E) ---
    # na=False: a row with a missing Name would otherwise produce NaN in the
    # mask, and boolean indexing with a NaN-containing mask raises.
    df_damage = df[df["Name"].str.contains(r"_[PCE]", na=False)]

    # Pull names and coordinates out once instead of per-iteration iloc lookups.
    names = df_damage["Name"].tolist()
    corners = df_damage[["X", "Y", "Z"]].to_numpy(dtype=float)

    damage_boxes_count_file = 0  # Counter for damage boxes in the current file
    # --- Loop through start-finish pairs ---
    # Rows are paired strictly by position (0-1, 2-3, ...); a trailing unpaired
    # row is ignored. NOTE(review): a pair is accepted when both names contain
    # "_S" or "_F" in any combination (S-S and F-F included) -- confirm that
    # this matches the marker naming scheme.
    for i in range(0, len(names) - 1, 2):
        n1, n2 = names[i], names[i + 1]
        if not all(any(tag in name for tag in ("_S", "_F")) for name in (n1, n2)):
            continue

        # The two markers may come in either order, so take per-axis min/max.
        lower = np.minimum(corners[i], corners[i + 1])
        upper = np.maximum(corners[i], corners[i + 1])

        # --- Apply rectangular bounding box mask ---
        mask = (
            (pts[:, 0] >= lower[0]) & (pts[:, 0] <= upper[0]) &
            (pts[:, 1] >= lower[1]) & (pts[:, 1] <= upper[1]) &
            (pts[:, 2] >= lower[2] - Z_TOLERANCE) & (pts[:, 2] <= upper[2] + Z_TOLERANCE)
        )
        labels[mask] = 1  # Mark as damage
        damage_boxes_count_file += 1  # Increment count for the current file

    # --- Store results ---
    all_points.append(pts)
    all_labels.append(labels)
    total_damage_boxes_processed += damage_boxes_count_file  # Add file count to total

    print(f"   Total points: {len(pts)} | Damage points: {np.sum(labels==1)} | Defined Damage Regions (S-F pairs): {damage_boxes_count_file}")

# === 4. Combine all and save to H5 ===
# Points and labels of all processed files are concatenated in processing
# order into two datasets: "data" (x, y, z rows) and "label" (0/1 per point).
if all_points:
    all_points = np.vstack(all_points)
    all_labels = np.concatenate(all_labels)
    with h5py.File(output_h5, "w") as f:
        f.create_dataset("data", data=all_points, compression="gzip")
        f.create_dataset("label", data=all_labels, compression="gzip")
    print(f"\n✅ Saved binary damage labels to: {output_h5}")
    print(f"   Total combined points: {len(all_points)} | Total damaged points: {np.sum(all_labels==1)} | Total Defined Damage Regions (S-F pairs): {total_damage_boxes_processed}")
else:
    print("⚠️ No data processed.")

🔹 Processing Site1_0...
   Total points: 1917694 | Damage points: 267 | Defined Damage Regions (S-F pairs): 1
🔹 Processing Site1_1...
   Total points: 306546 | Damage points: 1012 | Defined Damage Regions (S-F pairs): 20
🔹 Processing Site1_2...
   Total points: 356774 | Damage points: 1401 | Defined Damage Regions (S-F pairs): 11
🔹 Processing Site1_3...
   Total points: 373677 | Damage points: 1661 | Defined Damage Regions (S-F pairs): 18
🔹 Processing Site1_4...
   Total points: 336539 | Damage points: 16969 | Defined Damage Regions (S-F pairs): 12
🔹 Processing Site1_5...
   Total points: 315342 | Damage points: 3986 | Defined Damage Regions (S-F pairs): 10
🔹 Processing Site1_6...
   Total points: 254686 | Damage points: 1064 | Defined Damage Regions (S-F pairs): 3
🔹 Processing Site1_7...
   Total points: 300495 | Damage points: 9295 | Defined Damage Regions (S-F pairs): 18
🔹 Processing Site1_8...
   Total points: 438942 | Damage points: 41798 | Defined Damage Regions (S-F pairs): 9
🔹 