Cell 1 — Imports & Config

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
from typing import Dict, Tuple, List
import hashlib

# --- Locations -------------------------------------------------------------
# Head-1 CSVs are read from HEAD1_DIR/<subject>/ and resampled copies are
# written below OUT_ROOT.
DATASET_ROOT = Path(r"E:\0.TA_Teguh\GMM Trial 2")
HEAD1_DIR = DATASET_ROOT.joinpath("Head 1")
OUT_ROOT = Path(r"E:\0.TA_Teguh\Sampling\Weighted")

# --- Subjects (A..J) -------------------------------------------------------
SUBJECTS = [*"ABCDEFGHIJ"]

# --- Target point counts ---------------------------------------------------
# N_TARGETS lists every candidate; RUN_N_TARGETS is what the batch cell runs.
N_TARGETS = [48, 56, 64]
RUN_N_TARGETS = [48]  # start with one; change to [48,56,64] after sanity check

# --- Column contract (must match the Head-1 files) -------------------------
COLS_EXPECTED = [
    "timestamp", "frame",
    "x", "y", "z",
    "doppler", "SNR", "w_snr", "d2",
    "is_inlier",
]
COLS_OUT = list(COLS_EXPECTED)  # independent copy, same order

# --- Sampling --------------------------------------------------------------
# Weight sums at or below this threshold are treated as degenerate.
EPS_W = 1e-12

# --- CSV read/write options ------------------------------------------------
# Key order of the dtype mapping mirrors COLS_EXPECTED.
CSV_READ_KWARGS = {
    "dtype": {
        "timestamp": str,
        "frame": np.int32,
        **{col: np.float32 for col in ("x", "y", "z", "doppler", "SNR", "w_snr", "d2")},
        "is_inlier": np.int8,
    }
}
CSV_WRITE_KWARGS = {"index": False}

print(f"OK | HEAD1_DIR: {HEAD1_DIR}")
print(f"OK | OUT_ROOT : {OUT_ROOT}")
print(f"RUN_N_TARGETS : {RUN_N_TARGETS}")


OK | HEAD1_DIR: E:\0.TA_Teguh\GMM Trial 2\Head 1
OK | OUT_ROOT : E:\0.TA_Teguh\Sampling\Weighted
RUN_N_TARGETS : [48]


Cell 2 — Deterministic seed helper

In [2]:
def stable_seed(subject: str, file_name: str, frame: int, n_target: int) -> int:
    """Derive a reproducible 32-bit seed from the sampling context.

    The four arguments are joined with ``|`` and hashed with SHA-256; the
    leading four digest bytes are read as an unsigned big-endian integer, so
    the same (subject, file, frame, target) always yields the same seed.

    Returns:
        An integer in ``[0, 2**32)``.
    """
    key = "|".join(map(str, (subject, file_name, frame, n_target)))
    digest = hashlib.sha256(key.encode("utf-8")).digest()
    # First 4 bytes, big-endian == first 8 hex characters of the hex digest.
    return int.from_bytes(digest[:4], byteorder="big")


Cell 3 — IO helpers (list files, read head1)

In [3]:
def list_jalan_csvs(subject_dir: Path) -> List[Path]:
    """Return the ``Jalan*.csv`` files directly inside ``subject_dir``, sorted.

    A missing directory yields an empty list rather than an error. Sorting is
    the default ``Path`` ordering, so ``Jalan10.csv`` comes before
    ``Jalan2.csv``.
    """
    matches: List[Path] = []
    if subject_dir.exists():
        matches.extend(subject_dir.glob("Jalan*.csv"))
        matches.sort()
    return matches

def read_head1_csv(path: Path) -> pd.DataFrame:
    """Load one Head-1 CSV and return exactly the expected columns.

    The file is parsed with the options in ``CSV_READ_KWARGS``. The result is
    a fresh copy restricted to ``COLS_EXPECTED``, in that order.

    Raises:
        ValueError: if any column listed in ``COLS_EXPECTED`` is absent; the
            message names the file and the missing columns.
    """
    raw = pd.read_csv(path, **CSV_READ_KWARGS)
    present = set(raw.columns)
    absent = [col for col in COLS_EXPECTED if col not in present]
    if absent:
        raise ValueError(f"Missing columns in {path.name}: {absent}")
    return raw.loc[:, COLS_EXPECTED].copy()


Cell 4 — Frame extraction & sanitization

In [4]:
def get_frame_df(df: pd.DataFrame, frame_id: int) -> pd.DataFrame:
    """Return a copy of the rows of one frame with non-finite points removed.

    Rows whose ``frame`` equals ``frame_id`` are selected; any row holding a
    NaN or +/-inf in x, y, z, doppler, SNR, w_snr or d2 is discarded. The
    original index labels of the surviving rows are kept. If the frame has no
    rows, the (empty) selection is returned as is.
    """
    rows = df.loc[df["frame"] == frame_id].copy()
    if rows.empty:
        return rows

    # One finite-mask per numeric column, AND-ed into a single row mask.
    finite_per_col = [
        np.isfinite(rows[col].to_numpy())
        for col in ("x", "y", "z", "doppler", "SNR", "w_snr", "d2")
    ]
    keep = np.logical_and.reduce(finite_per_col)

    if keep.all():
        return rows
    return rows.loc[keep].copy()


Cell 5 — Core weighted index sampler (with fallback)

In [5]:
def _normalize_probs(w: np.ndarray) -> Tuple[np.ndarray, bool]:
    """Turn raw weights into a probability vector.

    Weights are cast to float64 and negatives are clipped to zero before
    normalising by their sum.

    Returns:
        ``(p, True)`` with ``p`` summing to 1 when the clipped sum is finite
        and greater than ``EPS_W``; otherwise ``(empty float64 array, False)``.
    """
    weights = np.clip(np.asarray(w, dtype=np.float64), 0.0, None)
    total = weights.sum()
    # A NaN/inf sum fails the finiteness test; a tiny sum fails the threshold.
    if np.isfinite(total) and total > EPS_W:
        return weights / total, True
    return np.array([], dtype=np.float64), False

def weighted_choice_indices(w: np.ndarray, k: int, replace: bool, seed: int) -> np.ndarray:
    """
    Draw ``k`` positional indices into ``w`` with probability proportional to ``w``.

    - replace=False for downsample
    - replace=True for upsample

    Weights are normalised by ``_normalize_probs``. If they are degenerate
    (that helper reports failure) the draw falls back to a uniform one.

    Without replacement, when fewer than ``k`` entries carry positive
    probability (and ``k <= len(w)``), every positive-weight index is kept and
    the remainder is drawn uniformly from the zero-weight indices; the result
    is then shuffled. ``Generator.choice`` would raise ``ValueError`` in that
    situation.

    Args:
        w: one weight per candidate item.
        k: number of indices to draw; ``k <= 0`` returns an empty array.
        replace: whether the same index may be drawn more than once.
        seed: seed for ``np.random.default_rng``; same seed, same result.

    Returns:
        int32 array of length ``k`` with values in ``[0, len(w))``.

    Raises:
        ValueError: propagated from ``Generator.choice`` when the request is
            unsatisfiable, e.g. ``len(w) == 0`` or ``k > len(w)`` with
            ``replace=False``.
    """
    if k <= 0:
        return np.empty((0,), dtype=np.int32)

    n = len(w)
    rng = np.random.default_rng(seed)

    p, ok = _normalize_probs(w)
    if not ok:
        # uniform fallback
        return rng.choice(n, size=k, replace=replace).astype(np.int32)

    if not replace:
        positive = np.flatnonzero(p > 0)
        if positive.size < k <= n:
            # Not enough positive-weight items to fill k distinct slots:
            # keep them all, then top up uniformly from zero-weight items.
            zero_weight = np.flatnonzero(p <= 0)
            filler = rng.choice(zero_weight, size=k - positive.size, replace=False)
            picked = np.concatenate([positive, filler])
            # Shuffle so the output order is random, as in the normal path.
            return rng.permutation(picked).astype(np.int32)

    return rng.choice(n, size=k, replace=replace, p=p).astype(np.int32)


Cell 6 — Per-frame weighted sampler (downsample/upsample/equal)

In [6]:
def sample_frame_weighted(frame_df: pd.DataFrame, n_target: int, subject: str, file_name: str) -> Tuple[pd.DataFrame, Dict]:
    """Resample the points of one frame to exactly ``n_target`` rows.

    Sampling weights come from the ``w_snr`` column and the RNG seed from
    ``stable_seed(subject, file_name, frame, n_target)``, where ``frame`` is
    the ``frame`` value of the first row.

    - no rows: returned unchanged, action ``"empty"``
    - rows == n_target: returned unchanged, action ``"equal"``
    - rows > n_target: weighted draw without replacement, action ``"downsample"``
    - rows < n_target: all rows kept plus weighted draws with replacement
      appended after them, action ``"upsample"``

    Returns:
        ``(sampled, meta)`` where ``sampled`` has the ``COLS_OUT`` columns and
        ``meta`` holds ``M`` (input rows), ``N`` (target), ``action``,
        ``n_drop``, ``n_fill`` and ``fallback_uniform`` (1 when the weights
        were degenerate and a uniform draw was used, else 0).
    """
    n_points = len(frame_df)
    meta = {"M": n_points, "N": n_target, "action": None, "n_drop": 0, "n_fill": 0, "fallback_uniform": 0}

    if n_points == 0:
        meta["action"] = "empty"
        return frame_df[COLS_OUT].copy(), meta

    frame_id = int(frame_df["frame"].iloc[0])
    seed = stable_seed(subject, file_name, frame_id, n_target)
    weights = frame_df["w_snr"].to_numpy()

    if n_points == n_target:
        meta["action"] = "equal"
        return frame_df[COLS_OUT].copy(), meta

    # Same degeneracy test the index sampler applies internally; evaluated
    # once here and recorded for whichever branch runs below.
    _, weights_ok = _normalize_probs(weights)
    used_uniform = 0 if weights_ok else 1

    shortfall = n_target - n_points
    if shortfall < 0:
        # Too many points: pick n_target distinct rows.
        keep_idx = weighted_choice_indices(weights, k=n_target, replace=False, seed=seed)
        sampled = frame_df.iloc[keep_idx].copy()
        meta.update(action="downsample", n_drop=-shortfall, fallback_uniform=used_uniform)
        return sampled[COLS_OUT], meta

    # Too few points: keep every original row and append `shortfall` repeats.
    extra_idx = weighted_choice_indices(weights, k=shortfall, replace=True, seed=seed)
    extras = frame_df.iloc[extra_idx].copy()
    sampled = pd.concat([frame_df, extras], axis=0, ignore_index=True)
    meta.update(action="upsample", n_fill=shortfall, fallback_uniform=used_uniform)
    return sampled[COLS_OUT], meta


Cell 7 — Process one file (loop all frames, save CSV)

In [7]:
def process_one_file(head1_path: Path, subject: str, n_target: int) -> Dict:
    """Resample every frame of one Head-1 CSV and write the result to disk.

    The file is read with ``read_head1_csv``, split by ``frame`` (ascending),
    each frame is sanitised with ``get_frame_df`` and resampled with
    ``sample_frame_weighted``, and the concatenated frames are written to
    ``OUT_ROOT / f"N{n_target}" / subject / <same file name>``. The output
    directory is created if needed.

    A file without any rows produces a header-only output CSV instead of
    failing in ``pd.concat``.

    Args:
        head1_path: path of the input CSV.
        subject: subject label; used for the output folder and the seed.
        n_target: number of points each frame is resampled to.

    Returns:
        Per-file counters: ``file``, ``subject``, ``N``, ``frames``, the number
        of frames per action (``down``, ``up``, ``eq``, ``empty``),
        ``total_drop``, ``total_fill`` and ``fallback_uniform_frames``.
    """
    df = read_head1_csv(head1_path)

    file_name = head1_path.name
    out_dir = OUT_ROOT / f"N{n_target}" / subject
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / file_name

    # Single pass split by frame (sorted keys) instead of one boolean scan of
    # the whole table per frame.
    by_frame = df.groupby("frame", sort=True)

    out_chunks = []
    stats = {
        "file": file_name, "subject": subject, "N": n_target,
        "frames": int(by_frame.ngroups),
        "down": 0, "up": 0, "eq": 0, "empty": 0,
        "total_drop": 0, "total_fill": 0,
        "fallback_uniform_frames": 0
    }

    for fr, frame_rows in by_frame:
        # get_frame_df re-selects by frame id (a no-op on this group) and
        # drops rows with non-finite values.
        fdf = get_frame_df(frame_rows, int(fr))
        sampled, meta = sample_frame_weighted(fdf, n_target=n_target, subject=subject, file_name=file_name)

        out_chunks.append(sampled)

        act = meta["action"]
        if act == "downsample":
            stats["down"] += 1
            stats["total_drop"] += int(meta["n_drop"])
        elif act == "upsample":
            stats["up"] += 1
            stats["total_fill"] += int(meta["n_fill"])
        elif act == "equal":
            stats["eq"] += 1
        else:
            # Only "empty" remains: every row of the frame was non-finite.
            stats["empty"] += 1

        if meta.get("fallback_uniform", 0) == 1:
            stats["fallback_uniform_frames"] += 1

    if out_chunks:
        out_df = pd.concat(out_chunks, axis=0, ignore_index=True)
    else:
        # No frames at all: pd.concat([]) would raise, so emit just the header.
        out_df = df.loc[:, COLS_OUT].iloc[:0]
    out_df.to_csv(out_path, **CSV_WRITE_KWARGS)
    return stats


Cell 8 — Run all (batch: subject → file) + summary CSV

In [8]:
# Batch driver: for every target size and subject, resample each Jalan*.csv
# and collect the per-file counters returned by process_one_file.
all_stats = []

for n_target in RUN_N_TARGETS:
    for subject in SUBJECTS:
        subj_dir = HEAD1_DIR / subject
        files = list_jalan_csvs(subj_dir)
        if not files:
            print(f"[WARN] No files for subject {subject} in {subj_dir}")
            continue

        print(f"\n=== N={n_target} | Subject {subject} | files={len(files)} ===")
        for p in files:
            st = process_one_file(p, subject=subject, n_target=n_target)
            all_stats.append(st)
            print(f"  {st['file']}: frames={st['frames']}, down={st['down']}, up={st['up']}, eq={st['eq']}, "
                  f"fill={st['total_fill']}, drop={st['total_drop']}, fallback_uniform={st['fallback_uniform_frames']}")

summary_df = pd.DataFrame(all_stats)
summary_path = OUT_ROOT / "weighted_sampling_summary.csv"
# OUT_ROOT is otherwise only created as a side effect of process_one_file;
# make sure it exists so the summary can be written even when no input file
# was found.
OUT_ROOT.mkdir(parents=True, exist_ok=True)
summary_df.to_csv(summary_path, index=False)
print("\nSaved summary:", summary_path)

summary_df.head()



=== N=48 | Subject A | files=72 ===
  Jalan1.csv: frames=82, down=17, up=63, eq=2, fill=1506, drop=256, fallback_uniform=0
  Jalan10.csv: frames=87, down=35, up=51, eq=1, fill=1289, drop=799, fallback_uniform=0
  Jalan11.csv: frames=87, down=42, up=44, eq=1, fill=1219, drop=1653, fallback_uniform=0
  Jalan12.csv: frames=98, down=51, up=46, eq=1, fill=1205, drop=1303, fallback_uniform=0
  Jalan13.csv: frames=103, down=47, up=53, eq=3, fill=1084, drop=1163, fallback_uniform=0
  Jalan14.csv: frames=81, down=11, up=69, eq=1, fill=1769, drop=131, fallback_uniform=0
  Jalan15.csv: frames=239, down=86, up=152, eq=1, fill=4187, drop=1674, fallback_uniform=0
  Jalan16.csv: frames=229, down=84, up=142, eq=3, fill=3651, drop=1796, fallback_uniform=0
  Jalan17.csv: frames=194, down=97, up=94, eq=3, fill=1869, drop=2920, fallback_uniform=0
  Jalan18.csv: frames=166, down=125, up=40, eq=1, fill=780, drop=4954, fallback_uniform=0
  Jalan19.csv: frames=222, down=86, up=134, eq=2, fill=3192, drop=2386

Unnamed: 0,file,subject,N,frames,down,up,eq,empty,total_drop,total_fill,fallback_uniform_frames
0,Jalan1.csv,A,48,82,17,63,2,0,256,1506,0
1,Jalan10.csv,A,48,87,35,51,1,0,799,1289,0
2,Jalan11.csv,A,48,87,42,44,1,0,1653,1219,0
3,Jalan12.csv,A,48,98,51,46,1,0,1303,1205,0
4,Jalan13.csv,A,48,103,47,53,3,0,1163,1084,0


Cell 9 — Quick aggregate check (optional)

In [11]:
# Roll the per-file counters up per target size, and per target size + subject.
if not all_stats:
    print("No stats collected. Run the batch cell first.")
else:
    df_sum = pd.DataFrame(all_stats)

    # Named aggregations shared by both tables: count the files, sum the rest.
    agg_spec = {
        "files": ("file", "count"),
        **{
            col: (col, "sum")
            for col in (
                "frames", "down", "up", "eq", "empty",
                "total_fill", "total_drop", "fallback_uniform_frames",
            )
        },
    }

    # 1) Aggregate per N (overall)
    keys_N = ["N"]
    agg_N = df_sum.groupby(keys_N, as_index=False).agg(**agg_spec).sort_values(keys_N)

    # 2) Aggregate per N + subject
    keys_NS = ["N", "subject"]
    agg_NS = df_sum.groupby(keys_NS, as_index=False).agg(**agg_spec).sort_values(keys_NS)

    display(agg_N)
    display(agg_NS)


Unnamed: 0,N,files,frames,down,up,eq,empty,total_fill,total_drop,fallback_uniform_frames
0,48,720,114170,47564,65299,1307,0,1627984,1264840,0


Unnamed: 0,N,subject,files,frames,down,up,eq,empty,total_fill,total_drop,fallback_uniform_frames
0,48,A,72,10459,4501,5843,115,0,148631,124116,0
1,48,B,72,12224,4920,7176,128,0,185828,134600,0
2,48,C,72,13890,4972,8772,146,0,221391,115877,0
3,48,D,72,11861,4481,7230,150,0,181994,112098,0
4,48,E,72,11102,4831,6147,124,0,151656,128797,0
5,48,F,72,10424,4396,5892,136,0,146761,117871,0
6,48,G,72,10532,4535,5879,118,0,144576,122809,0
7,48,H,72,11352,4344,6872,136,0,177104,114324,0
8,48,I,72,10993,5530,5337,126,0,125177,162675,0
9,48,J,72,11333,5054,6151,128,0,144866,131673,0
