Cell 1 — Imports & Config

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
from typing import Dict, Tuple, List

# ---- Filesystem layout -------------------------------------------------
# Head-1 CSVs are read from HEAD1_DIR/<subject>/ and sampled copies are
# written below OUT_ROOT.
DATASET_ROOT = Path(r"E:\0.TA_Teguh\GMM Trial 2")
HEAD1_DIR = DATASET_ROOT.joinpath("Head 1")
OUT_ROOT = Path(r"E:\0.TA_Teguh\Sampling\FPS")

# ---- Subjects (A..J) ---------------------------------------------------
SUBJECTS = [*"ABCDEFGHIJ"]

# ---- Candidate points-per-frame targets --------------------------------
N_TARGETS = [48, 56, 64]
# start with one; change to [48,56,64] after sanity check
RUN_N_TARGETS = [64]

# ---- Column contract (must match the Head-1 files) ---------------------
COLS_EXPECTED = [
    "timestamp", "frame",
    "x", "y", "z",
    "doppler", "SNR", "w_snr", "d2",
    "is_inlier",
]
# Independent copy so the output order can never alias the input list.
COLS_OUT = list(COLS_EXPECTED)

# ---- Sampling settings -------------------------------------------------
# NOTE(review): START_MODE is not read by any cell visible here; the sampler
# calls start_index_centroid_nearest directly.
START_MODE = "centroid_nearest"
# Weight sums at or below this threshold are treated as degenerate.
EPS_W = 1e-12

# ---- CSV read/write options --------------------------------------------
CSV_READ_KWARGS = {
    "dtype": {
        "timestamp": str,
        "frame": np.int32,
        "x": np.float32,
        "y": np.float32,
        "z": np.float32,
        "doppler": np.float32,
        "SNR": np.float32,
        "w_snr": np.float32,
        "d2": np.float32,
        "is_inlier": np.int8,
    },
}
CSV_WRITE_KWARGS = {"index": False}

# Echo the effective configuration so it is recorded in the cell output.
print("OK | HEAD1_DIR:", HEAD1_DIR)
print("OK | OUT_ROOT :", OUT_ROOT)
print("RUN_N_TARGETS :", RUN_N_TARGETS)


OK | HEAD1_DIR: E:\0.TA_Teguh\GMM Trial 2\Head 1
OK | OUT_ROOT : E:\0.TA_Teguh\Sampling\FPS
RUN_N_TARGETS : [64]


Cell 2 — Deterministic seed helper

In [2]:
import hashlib

def stable_seed(subject: str, file_name: str, frame: int, n_target: int) -> int:
    """
    Derive a reproducible 32-bit seed for weighted resampling.

    The four arguments are joined with "|" and hashed with SHA-256, so the
    value does not depend on Python's per-process hash randomisation.

    Returns:
        An integer in [0, 2**32) taken from the leading four digest bytes.
    """
    key = "|".join((f"{subject}", f"{file_name}", f"{frame}", f"{n_target}"))
    digest = hashlib.sha256(key.encode("utf-8")).digest()
    # Leading 4 bytes, big-endian == the first 8 hex characters of the digest.
    return int.from_bytes(digest[:4], "big")


Cell 3 — IO helpers (list files, read head1)

In [3]:
def list_jalan_csvs(subject_dir: Path) -> List[Path]:
    """
    List the ``Jalan*.csv`` files directly inside ``subject_dir``.

    Returns:
        The matching paths in sorted order, or an empty list when the
        directory does not exist.
    """
    return sorted(subject_dir.glob("Jalan*.csv")) if subject_dir.exists() else []

def read_head1_csv(path: Path) -> pd.DataFrame:
    """
    Load one Head-1 CSV using the dtypes in CSV_READ_KWARGS.

    Returns:
        A copy containing exactly the COLS_EXPECTED columns, in that order
        (any additional columns in the file are discarded).

    Raises:
        ValueError: if one or more of the expected columns is absent.
    """
    loaded = pd.read_csv(path, **CSV_READ_KWARGS)
    absent = [col for col in COLS_EXPECTED if col not in loaded.columns]
    if absent:
        raise ValueError(f"Missing columns in {path.name}: {absent}")
    # Selecting by the expected list both reorders and drops extras.
    return loaded.loc[:, COLS_EXPECTED].copy()


Cell 4 — Frame extraction & sanitization

In [4]:
def get_frame_df(df: pd.DataFrame, frame_id: int) -> pd.DataFrame:
    """
    Extract the rows of one frame and discard rows with non-finite values.

    A row is kept only if x, y, z, doppler, SNR, w_snr and d2 are all finite
    (the data should be clean already; this is a safety net).

    Returns:
        A copy of the matching rows with their original index labels;
        empty if no row has ``frame == frame_id``.
    """
    rows = df.loc[df["frame"] == frame_id].copy()
    if rows.empty:
        return rows

    checked = ("x", "y", "z", "doppler", "SNR", "w_snr", "d2")
    # One row-wise mask: True where every checked column is finite.
    finite = np.logical_and.reduce([np.isfinite(rows[col].to_numpy()) for col in checked])
    if finite.all():
        return rows
    return rows.loc[finite].copy()

def xyz_array(frame_df: pd.DataFrame) -> np.ndarray:
    """Return the x, y, z columns of ``frame_df`` as a float32 array of shape [M, 3]."""
    coords = frame_df[["x", "y", "z"]]
    return coords.to_numpy(dtype=np.float32, copy=False)


Cell 5 — FPS core (downsample only)

In [5]:
def fps_indices(xyz: np.ndarray, n_select: int, start_idx: int) -> np.ndarray:
    """
    Farthest point sampling over a point set.

    Args:
        xyz: array of shape [M, 3] with point coordinates.
        n_select: number of points to keep.
        start_idx: row index of the first selected point.

    Returns:
        int32 array of row indices into ``xyz``:
        * empty when ``n_select <= 0``;
        * ``arange(M)`` when ``n_select >= M`` (nothing to drop);
        * otherwise ``n_select`` *distinct* indices, beginning with
          ``start_idx``, each subsequent one being the point farthest from
          the already-selected set (first index wins ties).
    """
    M = xyz.shape[0]
    if n_select <= 0:
        # Nothing requested; avoids writing selected[0] into a zero-length array.
        return np.empty((0,), dtype=np.int32)
    if n_select >= M:
        return np.arange(M, dtype=np.int32)

    selected = np.empty(n_select, dtype=np.int32)
    selected[0] = start_idx

    # min squared distance to the selected set for each point
    diff0 = xyz - xyz[start_idx]
    min_dist2 = np.einsum("ij,ij->i", diff0, diff0).astype(np.float32)
    # Mark chosen points with a negative sentinel. Real squared distances are
    # >= 0, so argmax can never return an already-selected index, even when
    # several points share identical coordinates (all remaining distances 0).
    min_dist2[start_idx] = -1.0

    for i in range(1, n_select):
        nxt = int(np.argmax(min_dist2))
        selected[i] = nxt
        diff = xyz - xyz[nxt]
        dist2 = np.einsum("ij,ij->i", diff, diff).astype(np.float32)
        # Sentinel entries stay at -1 because dist2 >= 0.
        min_dist2 = np.minimum(min_dist2, dist2)
        min_dist2[nxt] = -1.0

    return selected


Cell 6 — Start point selection (centroid-nearest)

In [6]:
def start_index_centroid_nearest(xyz: np.ndarray) -> int:
    """
    Return the row index of the point closest to the centroid of ``xyz``.

    The centroid is the per-axis mean; closeness is squared Euclidean
    distance, and the lowest index wins ties.
    """
    offsets = xyz - np.mean(xyz, axis=0)
    sq_dist = np.einsum("ij,ij->i", offsets, offsets)
    return int(sq_dist.argmin())


Cell 7 — Shared upsample (weighted replacement by w_snr)

In [7]:
def weighted_resample_indices(w: np.ndarray, k: int, seed: int) -> np.ndarray:
    """
    Draw ``k`` indices in ``range(len(w))`` with replacement, with
    probability proportional to ``w``.

    Negative weights are clipped to zero. If the weight sum is non-finite or
    not greater than EPS_W, the draw is uniform instead.

    Returns:
        int32 array of length ``k`` (empty when ``k <= 0``). The result is
        fully determined by ``w``, ``k`` and ``seed``.
    """
    if k <= 0:
        return np.empty((0,), dtype=np.int32)

    weights = np.clip(np.asarray(w, dtype=np.float64), 0.0, None)
    total = weights.sum()
    # Usable only when the normalising constant is finite and non-negligible.
    usable = np.isfinite(total) and total > EPS_W
    probs = weights / total if usable else None

    generator = np.random.default_rng(seed)
    picks = generator.choice(len(weights), size=k, replace=True, p=probs)
    return picks.astype(np.int32)


Cell 8 — Per-frame sampler (FPS + shared upsample)

In [8]:
def sample_frame_fps(frame_df: pd.DataFrame, n_target: int, subject: str, file_name: str) -> Tuple[pd.DataFrame, Dict]:
    """
    Bring one frame to exactly ``n_target`` rows.

    * more rows than ``n_target``: keep ``n_target`` rows chosen by farthest
      point sampling, started from the point nearest the centroid;
    * fewer rows: append duplicates drawn with replacement, weighted by
      ``w_snr`` and seeded from (subject, file_name, frame, n_target);
    * equal or empty: returned unchanged.

    Returns:
        (sampled_df, meta) where sampled_df has the COLS_OUT columns and meta
        holds ``M`` (input rows), ``N`` (target), ``action``
        ("empty" / "equal" / "downsample" / "upsample"), ``n_drop`` and
        ``n_fill``.
    """
    n_points = len(frame_df)
    meta = {"M": n_points, "N": n_target, "action": None, "n_drop": 0, "n_fill": 0}

    # Pass-through cases: nothing to sample, or already the right size.
    if n_points == 0 or n_points == n_target:
        meta["action"] = "empty" if n_points == 0 else "equal"
        return frame_df[COLS_OUT].copy(), meta

    surplus = n_points - n_target

    if surplus > 0:
        coords = xyz_array(frame_df)
        keep = fps_indices(coords, n_target, start_index_centroid_nearest(coords))
        meta.update(action="downsample", n_drop=surplus)
        kept_rows = frame_df.iloc[keep].copy()
        return kept_rows[COLS_OUT], meta

    # Fewer points than requested → pad with w_snr-weighted duplicates.
    deficit = -surplus
    frame_id = int(frame_df["frame"].iloc[0])
    extra_positions = weighted_resample_indices(
        frame_df["w_snr"].to_numpy(),
        k=deficit,
        seed=stable_seed(subject, file_name, frame_id, n_target),
    )
    duplicates = frame_df.iloc[extra_positions].copy()
    padded = pd.concat([frame_df, duplicates], axis=0, ignore_index=True)
    meta.update(action="upsample", n_fill=deficit)
    return padded[COLS_OUT], meta


Cell 9 — Per-file processing (loop frames, save CSV)

In [9]:
def process_one_file(head1_path: Path, subject: str, n_target: int) -> Dict:
    """
    Process one Jalan*.csv from Head-1 and write sampled output CSV.

    Every frame of the input is resampled with ``sample_frame_fps`` and the
    results are written, in ascending frame order, to
    ``OUT_ROOT / f"N{n_target}" / subject / <input file name>``.

    Args:
        head1_path: path of the Head-1 CSV to read.
        subject: subject label; used for the output folder and the seed.
        n_target: number of points each non-empty frame should end up with.

    Returns:
        Dict with ``file``, ``subject``, ``N``, ``frames`` (distinct frame ids),
        per-action frame counts ``down`` / ``up`` / ``eq`` / ``empty`` and the
        row totals ``total_drop`` / ``total_fill``.

    Raises:
        ValueError: propagated from ``read_head1_csv`` when columns are missing.
    """
    df = read_head1_csv(head1_path)

    file_name = head1_path.name
    out_dir = OUT_ROOT / f"N{n_target}" / subject
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / file_name

    out_chunks = []
    stats = {"file": file_name, "subject": subject, "N": n_target,
             "frames": int(df["frame"].nunique()), "down": 0, "up": 0, "eq": 0, "empty": 0,
             "total_drop": 0, "total_fill": 0}

    # One grouped pass instead of a full-table boolean scan per frame.
    # sort=True yields frames in ascending id order; rows keep file order.
    for fr, frame_rows in df.groupby("frame", sort=True):
        # get_frame_df is still used for its non-finite row filtering.
        fdf = get_frame_df(frame_rows, int(fr))
        sampled, meta = sample_frame_fps(fdf, n_target=n_target, subject=subject, file_name=file_name)

        out_chunks.append(sampled)

        act = meta["action"]
        if act == "downsample":
            stats["down"] += 1
            stats["total_drop"] += int(meta["n_drop"])
        elif act == "upsample":
            stats["up"] += 1
            stats["total_fill"] += int(meta["n_fill"])
        elif act == "equal":
            stats["eq"] += 1
        else:
            stats["empty"] += 1

    if out_chunks:
        out_df = pd.concat(out_chunks, axis=0, ignore_index=True)
    else:
        # Header-only input: pd.concat([]) would raise, so emit a header-only CSV.
        out_df = pd.DataFrame(columns=COLS_OUT)
    out_df.to_csv(out_path, **CSV_WRITE_KWARGS)
    return stats


Cell 10 — Run all (batch: subject → file), plus summary CSV

In [10]:
# One stats dict per processed (n_target, subject, file) combination.
all_stats = []

for n_target in RUN_N_TARGETS:
    for subject in SUBJECTS:
        subj_dir = HEAD1_DIR / subject
        files = list_jalan_csvs(subj_dir)
        if not files:
            # Missing or empty subject folder: report it and move on.
            print(f"[WARN] No files for subject {subject} in {subj_dir}")
            continue

        print(f"\n=== N={n_target} | Subject {subject} | files={len(files)} ===")
        for p in files:
            st = process_one_file(p, subject=subject, n_target=n_target)
            all_stats.append(st)
            print(f"  {st['file']}: frames={st['frames']}, down={st['down']}, up={st['up']}, eq={st['eq']}, "
                  f"fill={st['total_fill']}, drop={st['total_drop']}")

# Save summary
summary_df = pd.DataFrame(all_stats)
# OUT_ROOT is otherwise only created as a side effect of processing a file;
# create it here so the summary can be written even when no inputs were found.
OUT_ROOT.mkdir(parents=True, exist_ok=True)
summary_path = OUT_ROOT / "fps_sampling_summary.csv"
summary_df.to_csv(summary_path, index=False)
print("\nSaved summary:", summary_path)
summary_df.head()



=== N=64 | Subject A | files=72 ===
  Jalan1.csv: frames=82, down=7, up=74, eq=1, fill=2662, drop=100
  Jalan10.csv: frames=87, down=20, up=66, eq=1, fill=2257, drop=375
  Jalan11.csv: frames=87, down=33, up=54, eq=0, fill=2002, drop=1044
  Jalan12.csv: frames=98, down=34, up=62, eq=2, fill=2085, drop=615
  Jalan13.csv: frames=103, down=28, up=74, eq=1, fill=2137, drop=568
  Jalan14.csv: frames=81, down=3, up=78, eq=0, fill=2965, drop=31
  Jalan15.csv: frames=239, down=40, up=196, eq=3, fill=6940, drop=603
  Jalan16.csv: frames=229, down=45, up=181, eq=3, fill=6298, drop=779
  Jalan17.csv: frames=194, down=67, up=126, eq=1, fill=3688, drop=1635
  Jalan18.csv: frames=166, down=91, up=71, eq=4, fill=1670, drop=3188
  Jalan19.csv: frames=222, down=53, up=169, eq=0, fill=5668, drop=1310
  Jalan2.csv: frames=85, down=44, up=40, eq=1, fill=980, drop=973
  Jalan20.csv: frames=174, down=58, up=115, eq=1, fill=3624, drop=889
  Jalan21.csv: frames=149, down=50, up=98, eq=1, fill=3223, drop=1511

Unnamed: 0,file,subject,N,frames,down,up,eq,empty,total_drop,total_fill
0,Jalan1.csv,A,64,82,7,74,1,0,100,2662
1,Jalan10.csv,A,64,87,20,66,1,0,375,2257
2,Jalan11.csv,A,64,87,33,54,0,0,1044,2002
3,Jalan12.csv,A,64,98,34,62,2,0,615,2085
4,Jalan13.csv,A,64,103,28,74,1,0,568,2137


Cell 11 — Quick sanity checks on outputs (optional)

In [11]:
# Aggregate the per-file stats: first per N, then per (N, subject).
if all_stats:
    df_sum = pd.DataFrame(all_stats)

    # Aggregations shared by both tables (output column -> (source, func)).
    count_aggs = {
        "files": ("file", "count"),
        "frames": ("frames", "sum"),
        "down": ("down", "sum"),
        "up": ("up", "sum"),
        "eq": ("eq", "sum"),
    }
    # Row totals, shown only in the per-N table.
    row_total_aggs = {
        "total_fill": ("total_fill", "sum"),
        "total_drop": ("total_drop", "sum"),
    }

    per_n = df_sum.groupby(["N"]).agg(**count_aggs, **row_total_aggs)
    display(per_n.reset_index())

    per_n_subject = df_sum.groupby(["N", "subject"]).agg(**count_aggs)
    display(per_n_subject.reset_index())


Unnamed: 0,N,files,frames,down,up,eq,total_fill,total_drop
0,64,720,114170,28604,84497,1069,2842765,652901


Unnamed: 0,N,subject,files,frames,down,up,eq
0,64,A,72,10459,2787,7574,98
1,64,B,72,12224,3030,9080,114
2,64,C,72,13890,2692,11087,111
3,64,D,72,11861,2514,9235,112
4,64,E,72,11102,2931,8074,97
5,64,F,72,10424,2682,7639,103
6,64,G,72,10532,2766,7685,81
7,64,H,72,11352,2599,8646,107
8,64,I,72,10993,3589,7277,127
9,64,J,72,11333,3014,8200,119
