In [1]:
import os, glob, re, numpy as np, pandas as pd, laspy, h5py

# === 1. Folder setup ===
# Directory that holds both the annotation CSVs and the LAS/LAZ point clouds.
folder_path = r"C:/Users/umair.muhammad/Documents/PhD/Research Work/FedLearn/training/site1/Site1 las"
# Write the H5 next to the inputs. Join only the file name: joining a second
# absolute path silently discards folder_path on Windows and produces a
# nested, invalid path on other platforms.
output_h5 = os.path.join(folder_path, "road_damage_binary_labels.h5")

# === 2. Collect all CSV–LAS pairs ===
# glob returns matches in arbitrary order; sort so that the row order of the
# combined dataset is reproducible between runs and machines.
csv_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
# Map "file name without extension" -> full path for every .las / .laz file.
las_files = {
    os.path.splitext(os.path.basename(path))[0]: path
    for path in glob.glob(os.path.join(folder_path, "*.la[sz]"))
}

# === 3. Process all pairs ===
# Slack subtracted from / added to the Z extent of every damage box, in the
# same units as the point-cloud coordinates.
Z_TOLERANCE = 0.05
# A row only counts as a box endpoint if its name carries one of these tags.
ENDPOINT_TAGS = ("_S", "_F")

# Per-file (N, 3) XYZ arrays and their matching label vectors.
all_points, all_labels = [], []

for csv_file in csv_files:
    # The CSV "<base>_clean.csv" (or "<base>.csv") pairs with "<base>.las/.laz".
    base = os.path.splitext(os.path.basename(csv_file))[0].replace("_clean","")
    if base not in las_files:
        print(f"⚠️ No LAS/LAZ found for {base}")
        continue

    print(f"🔹 Processing {base}...")
    las = laspy.read(las_files[base])
    pts = np.vstack((las.x, las.y, las.z)).T
    labels = np.zeros(len(pts), dtype=np.uint8)  # 0 = no damage, 1 = damage

    df = pd.read_csv(csv_file)

    # --- Filter for all types (P, C, E) ---
    # na=False: rows with a missing Name are dropped instead of producing a
    # NaN mask entry, which would make the boolean indexing raise.
    df_damage = df[df["Name"].str.contains(r"_[PCE]", na=False)]

    # --- Loop through start-finish pairs ---
    # Rows are consumed two at a time: (0, 1), (2, 3), ...; a trailing
    # unpaired row is ignored.
    for i in range(0, len(df_damage) - 1, 2):
        first, second = df_damage.iloc[i], df_damage.iloc[i + 1]
        n1, n2 = first["Name"], second["Name"]
        # Skip the pair unless both rows are tagged as a box endpoint.
        if not (
            any(tag in n1 for tag in ENDPOINT_TAGS)
            and any(tag in n2 for tag in ENDPOINT_TAGS)
        ):
            continue

        p1 = first[["X", "Y", "Z"]].values
        p2 = second[["X", "Y", "Z"]].values

        # The two endpoints are opposite corners of an axis-aligned box.
        xmin, xmax = sorted([p1[0], p2[0]])
        ymin, ymax = sorted([p1[1], p2[1]])
        zmin, zmax = sorted([p1[2], p2[2]])

        # --- Apply rectangular bounding box mask ---
        mask = (
            (pts[:,0] >= xmin) & (pts[:,0] <= xmax) &
            (pts[:,1] >= ymin) & (pts[:,1] <= ymax) &
            (pts[:,2] >= zmin - Z_TOLERANCE) & (pts[:,2] <= zmax + Z_TOLERANCE)
        )
        labels[mask] = 1  # Mark as damage

    # --- Store results ---
    all_points.append(pts)
    all_labels.append(labels)

    print(f"   Total points: {len(pts)} | Damage: {np.sum(labels==1)} | No damage: {np.sum(labels==0)}")

# === 4. Combine all and save to H5 ===
if not all_points:
    # Nothing was collected in the processing loop, so no file is written.
    print("⚠️ No data processed.")
else:
    # Stack the per-file arrays into one coordinate array and one label vector.
    all_points = np.vstack(all_points)
    all_labels = np.concatenate(all_labels)

    # Both datasets are stored gzip-compressed under the keys "data" and "label".
    with h5py.File(output_h5, "w") as h5_out:
        for key, array in (("data", all_points), ("label", all_labels)):
            h5_out.create_dataset(key, data=array, compression="gzip")

    total_count = len(all_points)
    damage_count = np.sum(all_labels==1)
    clean_count = np.sum(all_labels==0)
    print(f"\n✅ Saved binary damage labels to: {output_h5}")
    print(f"   Total points: {total_count} | Damage: {damage_count} | No damage: {clean_count}")


🔹 Processing Site1_0...
   Total points: 1917694 | Damage: 267 | No damage: 1917427
🔹 Processing Site1_1...
   Total points: 306546 | Damage: 1012 | No damage: 305534
🔹 Processing Site1_10...
   Total points: 497676 | Damage: 7398 | No damage: 490278
🔹 Processing Site1_11...
   Total points: 455059 | Damage: 4162 | No damage: 450897
🔹 Processing Site1_12...
   Total points: 456490 | Damage: 5082 | No damage: 451408
🔹 Processing Site1_13...
   Total points: 698678 | Damage: 1852 | No damage: 696826
🔹 Processing Site1_14...
   Total points: 415143 | Damage: 23168 | No damage: 391975
🔹 Processing Site1_2...
   Total points: 356774 | Damage: 1401 | No damage: 355373
🔹 Processing Site1_3...
   Total points: 373677 | Damage: 1661 | No damage: 372016
🔹 Processing Site1_4...
   Total points: 336539 | Damage: 16969 | No damage: 319570
🔹 Processing Site1_5...
   Total points: 315342 | Damage: 3986 | No damage: 311356
🔹 Processing Site1_6...
   Total points: 254686 | Damage: 1064 | No damage: 253