In [1]:
# --- Imports
import os, json, time, math, random
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# --- Paths
# Project root. Defaults to the original Windows location; set the
# CLUSTERING_TA_BASE environment variable to run the notebook on another machine
# without editing this cell.
BASE = Path(os.environ.get("CLUSTERING_TA_BASE") or r"E:\1.Clustering_TA")
RAW_DIR = BASE / "dataset"          # raw input, one sub-folder per subject
OUT_DIR = BASE / "clustered"        # root output for clustering results
EVAL_DIR = BASE / "eval"            # root output for evaluation metrics
FIG_DIR  = BASE / "figs"            # root output for documentation figures

SUBJECTS = ["Afi", "Kinan", "Miftah"]   # subject folders under RAW_DIR

# --- Algo tag (becomes an output sub-folder)
# Overwritten automatically during HPO; remember to update it by hand whenever
# the parameters below are edited manually.
ALGO_NAME = "vgcc_v08_minpt3_minvox8"

# --- Prefilter thresholds (generic and cheap)
SNR_MIN = 3.0             # drop weak reflections
DOPPLER_ABS_MAX = 6.0     # m/s, drop implausible doppler values

# --- VG parameters (swept during HPO)
VOXEL_SIZE = 0.08                 # metres (about 8 cm)
MIN_POINTS_PER_VOXEL = 3          # drop voxels that are too sparse
MIN_VOXELS_PER_CLUSTER = 8        # minimum cluster size (6-10 is fairly common)

# --- Visual documentation: per file, save a PNG for N example frames
N_VIZ_FRAMES = 4
RANDOM_SEED = 42

# --- Columns every raw CSV must provide
COLS = ["timestamp","frame","x","y","z","doppler","SNR"]

# --- Utility: make sure the output folders exist
for d in [OUT_DIR, EVAL_DIR, FIG_DIR]:
    d.mkdir(parents=True, exist_ok=True)

# Seed both RNGs so the sampled visualisation frames are reproducible.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)


In [2]:
def load_raw_csv(path_csv: Path) -> pd.DataFrame:
    """Read one raw point-cloud CSV and return only its usable rows.

    Parameters
    ----------
    path_csv : Path
        CSV file that must contain every column listed in ``COLS``.

    Returns
    -------
    pd.DataFrame
        The file contents with ``frame``, ``x``, ``y``, ``z``, ``doppler`` and
        ``SNR`` coerced to numeric (unparseable values become NaN), rows lacking
        ``frame``/``x``/``y``/``z`` removed, and a fresh 0..N-1 index.

    Raises
    ------
    AssertionError
        If a required column is missing.
    """
    df = pd.read_csv(path_csv)
    # make sure the required columns are present
    assert all(c in df.columns for c in COLS), f"Kolom wajib tidak lengkap di {path_csv}"
    # coerce numeric
    for c in ["frame","x","y","z","doppler","SNR"]:
        df[c] = pd.to_numeric(df[c], errors="coerce")
    # timestamp is left as the raw string (it is used as a join key)
    # Re-number the rows after dropping: downstream code writes per-row labels
    # into a positional array via the frame's index, so the index must not keep
    # the gaps left behind by the dropped rows.
    return df.dropna(subset=["frame","x","y","z"]).reset_index(drop=True)

def prefilter_points(df_frame: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of the rows that pass the SNR and doppler thresholds.

    A row is kept when its ``SNR`` is at least ``SNR_MIN`` and the absolute value
    of its ``doppler`` does not exceed ``DOPPLER_ABS_MAX``. The index of the
    surviving rows is preserved.
    """
    strong_enough = df_frame["SNR"] >= SNR_MIN
    plausible_speed = df_frame["doppler"].abs() <= DOPPLER_ABS_MAX
    return df_frame[strong_enough & plausible_speed].copy()

def voxelize_points(xyz: np.ndarray, voxel_size: float, min_points_per_voxel=None):
    """Bucket points into a regular voxel grid and drop sparse voxels.

    Parameters
    ----------
    xyz : np.ndarray
        (N, 3) float array of point coordinates.
    voxel_size : float
        Edge length of a voxel; must be strictly positive.
    min_points_per_voxel : int, optional
        Voxels holding fewer points than this are discarded. When omitted, the
        module-level ``MIN_POINTS_PER_VOXEL`` is used.

    Returns
    -------
    vkeys : list of tuple
        Integer (ix, iy, iz) keys of the surviving voxels.
    vdict : dict
        Voxel key -> list of row indices into ``xyz`` of its member points.
    vcenters : dict
        Voxel key -> mean coordinate of its member points.

    Raises
    ------
    ValueError
        If ``voxel_size`` is not strictly positive (checked for non-empty input).
    """
    if xyz.size == 0:
        return [], {}, {}
    # `not > 0` also rejects NaN, which would otherwise yield meaningless keys.
    if not voxel_size > 0:
        raise ValueError(f"voxel_size must be > 0, got {voxel_size!r}")
    if min_points_per_voxel is None:
        min_points_per_voxel = MIN_POINTS_PER_VOXEL

    # integer voxel index; tolist() yields plain Python ints so the keys behave
    # like ordinary tuples when neighbours are derived from them later
    vids = np.floor(xyz / voxel_size).astype(np.int32)
    buckets = {}
    for idx, key in enumerate(map(tuple, vids.tolist())):
        buckets.setdefault(key, []).append(idx)

    # prune voxels by member count
    vdict = {k: v for k, v in buckets.items() if len(v) >= min_points_per_voxel}
    vkeys = list(vdict.keys())
    vcenters = {k: xyz[v].mean(axis=0) for k, v in vdict.items()}
    return vkeys, vdict, vcenters

# 6-neighbourhood adjacency on the grid: only the six axis-aligned unit steps
# (no edge or corner diagonals).
NEIGHBOR_DIRS = [
    (1, 0, 0), (-1, 0, 0),
    (0, 1, 0), (0, -1, 0),
    (0, 0, 1), (0, 0, -1),
]

def connected_components_voxels(vkeys: list):
    """Group voxel keys into face-connected components.

    vkeys: list of 3D integer voxel keys.
    return: list of components, each a list of keys. Components appear in the
            order their first key appears in ``vkeys``; inside a component the
            keys follow the order in which the traversal visited them.
    """
    if not vkeys:
        return []
    occupied = set(vkeys)
    seen = set()
    components = []
    for seed in vkeys:
        if seed in seen:
            continue
        seen.add(seed)
        # Flood fill with a LIFO stack (depth-first order).
        pending = [seed]
        members = []
        while pending:
            current = pending.pop()
            members.append(current)
            vx, vy, vz = current
            for ox, oy, oz in NEIGHBOR_DIRS:
                neighbor = (vx + ox, vy + oy, vz + oz)
                if neighbor not in occupied or neighbor in seen:
                    continue
                seen.add(neighbor)
                pending.append(neighbor)
        components.append(members)
    return components

def choose_main_cluster(components, vdict):
    """Pick the main cluster: the component with the most member points.

    components: list of components, each a list of voxel keys.
    vdict: voxel key -> list of member point indices.
    return: (component, total_points); (None, 0) when there are no components.
            On a tie the earliest component wins.
    """
    if not components:
        return None, 0  # no cluster
    point_totals = [
        sum(len(vdict[key]) for key in members)
        for members in components
    ]
    # max() keeps the first maximal entry, so ties resolve to the earliest one.
    best_idx, best_total = max(enumerate(point_totals), key=lambda pair: pair[1])
    return components[best_idx], best_total

def scatter3_save(points, labels, title, save_path: Path, elev=18, azim=30):
    """Render a 3D scatter of one point cloud and write it to ``save_path``.

    points: (N,3) numpy array
    labels: (N,) int labels, or None to draw every point in one colour;
            with labels, each distinct value gets its own series and legend entry
    title: figure title ("(empty)" is appended on a second line when there are
           no points)
    save_path: output image path; parent folders are created if missing
    elev, azim: camera angles passed to ``view_init``
    """
    fig = plt.figure(figsize=(6,5), dpi=140)
    ax = fig.add_subplot(111, projection='3d')

    if points.size == 0:
        ax.set_title(f"{title}\n(empty)")
    elif labels is None:
        ax.scatter(points[:,0], points[:,1], points[:,2], s=8)
        ax.set_title(title)
    else:
        # one scatter series per distinct label
        label_arr = np.array(labels)
        for cluster_id in np.unique(label_arr):
            sel = (label_arr == cluster_id)
            ax.scatter(points[sel,0], points[sel,1], points[sel,2], s=8, label=f"c{cluster_id}")
        ax.legend(loc="upper right", fontsize=7)
        ax.set_title(title)

    ax.set_xlabel("x [m]")
    ax.set_ylabel("y [m]")
    ax.set_zlabel("z [m]")
    ax.view_init(elev=elev, azim=azim)
    ax.grid(True)
    fig.tight_layout()

    save_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(save_path)
    # close explicitly so figures created in a loop do not accumulate
    plt.close(fig)


In [4]:
def process_one_frame(df_frame: pd.DataFrame, voxel_size: float):
    """Cluster the points of a single frame with voxel-grid connected components.

    Pipeline: prefilter (SNR / doppler) -> voxelize and drop sparse voxels ->
    connected components -> drop components with fewer than
    ``MIN_VOXELS_PER_CLUSTER`` voxels -> pick the component with the most points
    as the main cluster.

    Parameters
    ----------
    df_frame : pd.DataFrame
        Rows that share one ``frame`` value; needs ``x``, ``y``, ``z`` and the
        columns ``prefilter_points`` reads.
    voxel_size : float
        Voxel edge length handed to ``voxelize_points``.

    Returns
    -------
    labels : np.ndarray
        Shape (N,), int32, aligned row-for-row with ``df_frame`` in the order it
        was passed in. ``0`` is the main cluster, ``1..C-1`` the remaining
        clusters, ``-1`` marks noise / discarded points.
    metrics : dict
        ``pc_count``, ``num_clusters``, ``kept_cluster_size`` (points in the main
        cluster), ``valid`` (1 when a main cluster exists, else 0) and
        ``latency_ms``.
    """
    t0 = time.perf_counter()  # monotonic clock: safe for measuring short durations
    pc_count = len(df_frame)
    labels = np.full(pc_count, -1, dtype=np.int32)   # -1 = noise / discarded

    def _result(num_clusters=0, kept_cluster_size=0):
        # Single place that assembles the return value for every exit path.
        return labels, dict(
            pc_count=pc_count,
            num_clusters=num_clusters,
            kept_cluster_size=kept_cluster_size,
            valid=1 if kept_cluster_size > 0 else 0,
            latency_ms=(time.perf_counter() - t0) * 1000,
        )

    # Number the rows positionally BEFORE sorting: after the sort each row's index
    # still records its position in df_frame, so labels can be written back in the
    # caller's row order. The sort itself only makes the voxel / component
    # enumeration independent of the input row order.
    dfF = df_frame.reset_index(drop=True).sort_values(by=["x","y","z"])

    # prefilter
    dfP = prefilter_points(dfF)
    if len(dfP) == 0:
        return _result()
    ptsP = dfP[["x","y","z"]].to_numpy(dtype=np.float32)

    # voxelize (+ prune sparse voxels)
    vkeys, vdict, _ = voxelize_points(ptsP, voxel_size)
    if len(vdict) == 0:
        return _result()

    # connected components, then prune small clusters by VOXEL count (the
    # threshold is expressed in voxels, not points)
    comps = connected_components_voxels(vkeys)
    comps = [c for c in comps if len(c) >= MIN_VOXELS_PER_CLUSTER]
    if not comps:
        return _result()

    # Main component = most member points. Move it to the front so that it always
    # receives label 0, which is what the centroid computation looks for.
    main_comp, kept_size = choose_main_cluster(comps, vdict)
    comps = [main_comp] + [c for c in comps if c is not main_comp]

    # vdict holds row numbers relative to dfP; dfP's index maps them back to
    # positions in df_frame.
    orig_pos = dfP.index.to_numpy()
    for cluster_id, comp in enumerate(comps):
        for vk in comp:
            labels[orig_pos[vdict[vk]]] = cluster_id

    return _result(num_clusters=len(comps), kept_cluster_size=kept_size)


In [5]:
def compute_session_metrics(per_frame_df: pd.DataFrame):
    """Summarise one session from its per-frame metrics table.

    Parameters
    ----------
    per_frame_df : pd.DataFrame
        One row per frame, in temporal order, with at least the columns
        ``pc_count``, ``num_clusters``, ``kept_cluster_size``, ``valid`` and
        ``v_walk``; ``latency_ms`` is optional. The frame is not modified.

    Returns
    -------
    dict
        VR       - valid rate: mean of ``valid``.
        LCR_med  - median of kept_cluster_size / pc_count over valid frames with
                   pc_count > 0.
        MCS_med  - median ``kept_cluster_size`` over valid frames.
        NC_med   - median ``num_clusters`` over all frames.
        median_v - median ``v_walk`` over valid frames (NaN ignored).
        MAD_v    - median absolute deviation of those ``v_walk`` values.
        CDR      - centroid drop rate: among consecutive frame pairs whose first
                   frame is valid, the fraction whose second frame is not valid.
        LAT_med  - median ``latency_ms``.
        Every metric falls back to 0.0 when it has no data to be computed from.
    """
    df = per_frame_df
    n_frames = len(df)
    is_valid = df["valid"] == 1

    # Valid Rate
    VR = float(df["valid"].mean()) if n_frames else 0.0

    # Largest Cluster Ratio (valid frames only; guard against division by zero)
    ratio_mask = is_valid & (df["pc_count"] > 0)
    if ratio_mask.any():
        ratios = df.loc[ratio_mask, "kept_cluster_size"] / df.loc[ratio_mask, "pc_count"]
        LCR_med = float(ratios.median())
    else:
        LCR_med = 0.0

    # Median Cluster Size
    MCS_med = float(df.loc[is_valid, "kept_cluster_size"].median()) if is_valid.any() else 0.0

    # Num-Clusters Median
    NC_med = float(df["num_clusters"].median()) if n_frames else 0.0

    # v_walk stability (MAD)
    v = df.loc[is_valid, "v_walk"].dropna()
    median_v = float(v.median()) if len(v) else 0.0
    mad_v = float((v - median_v).abs().median()) if len(v) else 0.0

    # Centroid Drop Rate. Only valid -> invalid steps count as drops; the
    # denominator is the number of steps that start from a valid frame, so a
    # session that merely recovers (invalid -> valid) is not penalised.
    vals = df["valid"].to_numpy(dtype=int)
    starts_valid = vals[:-1] == 1
    n_tracked = int(starts_valid.sum())
    n_drops = int((starts_valid & (vals[1:] != 1)).sum())
    CDR = n_drops / n_tracked if n_tracked > 0 else 0.0

    # Latency median
    LAT_med = float(df["latency_ms"].median()) if "latency_ms" in df.columns and n_frames else 0.0

    return dict(VR=VR, LCR_med=LCR_med, MCS_med=MCS_med, NC_med=NC_med, median_v=median_v, MAD_v=mad_v, CDR=CDR, LAT_med=LAT_med)

def compute_centroid_and_vwalk(group_df: pd.DataFrame, labels: np.ndarray):
    """Return the centroid (cx, cy, cz) of the points labelled 0.

    Label 0 is treated as the main cluster. ``labels`` is matched to the rows of
    ``group_df`` by position. When no point carries label 0 the result is
    (nan, nan, nan). v_walk is not computed here; it needs dt and is derived in
    the per-session loop.
    """
    main_mask = (labels == 0)
    if main_mask.any():
        xyz_main = group_df[["x","y","z"]].to_numpy()[main_mask]
        centroid = xyz_main.mean(axis=0)
        return tuple(float(coord) for coord in centroid)
    # no main cluster in this frame -> invalid centroid
    return np.nan, np.nan, np.nan


In [6]:
def process_one_file(subject: str, in_csv: Path, algo_name: str, save_figs=True):
    """Cluster every frame of one raw CSV and write all of its artefacts.

    Steps:
      - per-frame clustering (VG-CC) with the current module-level parameters
      - write ``<stem>_clustered_full.csv`` (all points + ``cluster_id``)
      - write per-frame metrics ``<stem>_metrics.csv`` and the session summary
        ``<stem>_session_summary.json``
      - optionally save a few sample figures coloured by ``cluster_id``

    Parameters
    ----------
    subject : str
        Subject name; used as the output sub-folder.
    in_csv : Path
        Raw input CSV.
    algo_name : str
        Tag used as the output sub-folder under the clustered/eval/figs roots.
    save_figs : bool
        When true, save a PNG for up to ``N_VIZ_FRAMES`` randomly sampled frames.

    Returns
    -------
    dict or None
        The session summary, or None when the file has no usable rows.
    """
    df = load_raw_csv(in_csv)
    if df.empty:
        print(f"[WARN] kosong: {in_csv}")
        return None
    # Labels are written into a positional array through the row index, so the
    # index has to be exactly 0..N-1 regardless of what the loader dropped.
    df = df.reset_index(drop=True)

    # Output paths
    out_dir  = OUT_DIR / algo_name / subject
    eval_dir = EVAL_DIR / algo_name / subject
    fig_dir  = FIG_DIR  / algo_name / subject
    for folder in (out_dir, eval_dir, fig_dir):
        folder.mkdir(parents=True, exist_ok=True)

    frames = sorted(df["frame"].unique())
    labels_all = np.full(len(df), -1, dtype=np.int32)
    rows_metrics = []

    # sample the frames to visualise
    viz_frames = set(random.sample(frames, k=min(N_VIZ_FRAMES, len(frames))))

    # One pass over the table instead of a full boolean scan per frame.
    for fr, g in df.groupby("frame", sort=True):
        # The frame column may be float after numeric coercion; the integer form
        # is needed for the metrics row and for the ':05d' file-name format.
        frame_no = int(fr)
        labels, m = process_one_frame(g, VOXEL_SIZE)
        # store the labels at the matching global row positions
        labels_all[g.index.to_numpy()] = labels

        # centroid of the main cluster (v_walk is derived after the loop)
        cx, cy, cz = compute_centroid_and_vwalk(g, labels)
        ts = g["timestamp"].iloc[0]
        rows_metrics.append(dict(
            frame=frame_no,
            timestamp=str(ts),
            pc_count=int(m["pc_count"]),
            num_clusters=int(m["num_clusters"]),
            kept_cluster_size=int(m["kept_cluster_size"]),
            valid=int(m["valid"]),
            cx=cx, cy=cy, cz=cz,
            latency_ms=float(m["latency_ms"])
        ))

        # save sample figure
        if save_figs and fr in viz_frames:
            pts = g[["x","y","z"]].to_numpy()
            ttl = f"{subject} | {in_csv.stem} | frame {frame_no} | clusters={m['num_clusters']}"
            fpath = fig_dir / f"{in_csv.stem}_frame{frame_no:05d}.png"
            scatter3_save(pts, labels, ttl, fpath)

    # write clustered_full.csv (all points + cluster_id)
    df_out = df.copy()
    df_out["cluster_id"] = labels_all
    out_csv = out_dir / f"{in_csv.stem}_clustered_full.csv"
    df_out.to_csv(out_csv, index=False)

    # dt & v_walk at frame level.
    # The timestamp is kept as a string, so dt is a fixed fallback of 1/15 s
    # (about 15 FPS) between consecutive rows. Convert the timestamps to seconds
    # first if real timing is available.
    mdf = pd.DataFrame(rows_metrics).sort_values("frame").reset_index(drop=True)
    DEFAULT_DT = 1/15.0
    dt = np.full(len(mdf), DEFAULT_DT)
    dt[0] = np.nan                      # the first frame has no predecessor
    step_x = mdf["cx"].diff()           # NaN whenever either centroid is missing
    step_y = mdf["cy"].diff()
    mdf["dt"] = dt
    # walking speed from the centroid displacement in the xy plane
    mdf["v_walk"] = np.sqrt(step_x * step_x + step_y * step_y) / dt

    # write per-frame metrics
    per_frame_csv = eval_dir / f"{in_csv.stem}_metrics.csv"
    mdf.to_csv(per_frame_csv, index=False)

    # session summary, tagged with the parameters that produced it
    sess = compute_session_metrics(mdf)
    sess["file"] = in_csv.name
    sess["subject"] = subject
    sess["algo"] = algo_name
    sess["voxel_size"] = VOXEL_SIZE
    sess["min_pts_voxel"] = MIN_POINTS_PER_VOXEL
    sess["min_vox_cluster"] = MIN_VOXELS_PER_CLUSTER
    sess_json = eval_dir / f"{in_csv.stem}_session_summary.json"
    with open(sess_json, "w", encoding="utf-8") as f:
        json.dump(sess, f, indent=2)

    print(f"[OK] {in_csv} → {out_csv.name} & metrics saved")
    return sess


In [7]:
def list_raw_files(subject: str):
    """Return the sorted ``*.csv`` paths directly inside ``RAW_DIR / subject``."""
    subject_dir = RAW_DIR / subject
    csv_paths = list(subject_dir.glob("*.csv"))
    csv_paths.sort()
    return csv_paths

def run_batch_once(algo_name: str):
    """Process every raw file of every subject and write one summary CSV.

    Each file is run through ``process_one_file`` (figures enabled). The session
    summaries it returns are collected into
    ``EVAL_DIR / algo_name / SUMMARY_<algo_name>.csv``. Files that yield no
    summary are skipped; if none yield one, only a warning is printed.
    """
    session_results = (
        process_one_file(subject, csv_path, algo_name, save_figs=True)
        for subject in SUBJECTS
        for csv_path in list_raw_files(subject)
    )
    summaries = [sess for sess in session_results if sess]

    if not summaries:
        print("[WARN] No summaries generated.")
        return

    # fixed column order for the summary table
    ordered_cols = [
        "subject", "file", "algo",
        "VR", "LCR_med", "MCS_med", "NC_med", "median_v", "MAD_v", "CDR", "LAT_med",
        "voxel_size", "min_pts_voxel", "min_vox_cluster",
    ]
    df_sum = pd.DataFrame(summaries)[ordered_cols]
    outf = EVAL_DIR / algo_name / f"SUMMARY_{algo_name}.csv"
    outf.parent.mkdir(parents=True, exist_ok=True)
    df_sum.to_csv(outf, index=False)
    print(f"[OK] SUMMARY saved at {outf}")


In [8]:
# Small, realistic grid; adjust as needed
VOXEL_CAND = [0.06, 0.08, 0.10]            # 6–10 cm
MINPTS_VOX_CAND = [2, 3, 4]
MINVOX_CLU_CAND = [6, 8, 10]

def run_hpo_vgcc():
    """Grid-search the VG-CC parameters and write a ranking of the combinations.

    For every combination of ``VOXEL_CAND`` x ``MINPTS_VOX_CAND`` x
    ``MINVOX_CLU_CAND`` the module-level parameters are set, ``run_batch_once``
    is executed, and the resulting summary CSV is read back and scored. The
    ranking is written to ``EVAL_DIR / "HPO_vgcc_results.csv"`` and its top rows
    are printed.

    The module-level parameters are restored to their pre-sweep values when the
    function exits (also on error), so later cells do not silently run with the
    last grid point.
    """
    global VOXEL_SIZE, MIN_POINTS_PER_VOXEL, MIN_VOXELS_PER_CLUSTER, ALGO_NAME
    saved_params = (VOXEL_SIZE, MIN_POINTS_PER_VOXEL, MIN_VOXELS_PER_CLUSTER, ALGO_NAME)

    results = []
    try:
        for vs in VOXEL_CAND:
            for mpv in MINPTS_VOX_CAND:
                for mvc in MINVOX_CLU_CAND:
                    # parameters are passed through globals (deliberately simple,
                    # so every cell sees the same values)
                    VOXEL_SIZE = vs
                    MIN_POINTS_PER_VOXEL = mpv
                    MIN_VOXELS_PER_CLUSTER = mvc
                    # round(), not int(): vs*100 can land just below the intended
                    # integer (e.g. 28.999...) and int() would truncate it
                    ALGO_NAME = f"vgcc_v{round(vs*100):02d}_minpt{mpv}_minvox{mvc}"

                    print(f"\n=== Running {ALGO_NAME} ===")
                    run_batch_once(ALGO_NAME)

                    # collect the per-file summary for a quick ranking
                    sum_path = EVAL_DIR / ALGO_NAME / f"SUMMARY_{ALGO_NAME}.csv"
                    if sum_path.exists():
                        dfS = pd.read_csv(sum_path)
                        # Simple score: reward high VR and LCR_med, penalise high
                        # MAD_v and LAT_med; averaged over the files
                        dfS["score"] = (dfS["VR"]*0.5 + dfS["LCR_med"]*0.4) - (dfS["MAD_v"]*0.05 + dfS["LAT_med"]*0.0005)
                        avg = dfS["score"].mean()
                        results.append((ALGO_NAME, vs, mpv, mvc, float(avg)))
    finally:
        VOXEL_SIZE, MIN_POINTS_PER_VOXEL, MIN_VOXELS_PER_CLUSTER, ALGO_NAME = saved_params

    if results:
        rank = sorted(results, key=lambda x: x[-1], reverse=True)
        dfR = pd.DataFrame(rank, columns=["algo","voxel_size","min_pts_voxel","min_vox_cluster","score"])
        outf = EVAL_DIR / "HPO_vgcc_results.csv"
        dfR.to_csv(outf, index=False)
        print(f"\n[OK] HPO ranking saved → {outf}")
        print(dfR.head(10))
    else:
        print("[WARN] No HPO results.")
