✅ Cell 1 — Import & konfigurasi (Trial-2)

In [1]:
# Cell 1: Import & konfigurasi dasar (Trial-2)

import glob
import os
import re

import numpy as np
import pandas as pd

# Root folder that holds the Trial-2 GMM output (adjust to your machine).
BASE_GMM_DIR = r"E:\0.TA_Teguh\GMM Trial 2"

# Which output drives the analysis:
# - "head2" (recommended): Head 2 per-frame summary (has valid_minpts, n_inlier)
# - "head1" (fallback): Head 1 point-level rows (only valid frames are written)
ANALYZE_MODE = "head2"

# Subject identifiers "A" through "J".
SUBJECTS = [chr(code) for code in range(ord("A"), ord("J") + 1)]

HEAD1_DIR = os.path.join(BASE_GMM_DIR, "Head 1")
HEAD2_DIR = os.path.join(BASE_GMM_DIR, "Head 2")

# Echo the effective configuration so it is recorded in the cell output.
for _label, _value in (
    ("BASE_GMM_DIR :", BASE_GMM_DIR),
    ("ANALYZE_MODE :", ANALYZE_MODE),
    ("SUBJECTS     :", SUBJECTS),
    ("HEAD1_DIR    :", HEAD1_DIR),
    ("HEAD2_DIR    :", HEAD2_DIR),
):
    print(_label, _value)


BASE_GMM_DIR : E:\0.TA_Teguh\GMM Trial 2
ANALYZE_MODE : head2
SUBJECTS     : ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
HEAD1_DIR    : E:\0.TA_Teguh\GMM Trial 2\Head 1
HEAD2_DIR    : E:\0.TA_Teguh\GMM Trial 2\Head 2


✅ Cell 2 — List file per subjek

In [2]:
# Cell 2: Fungsi list file per subjek (Trial-2)

def list_trial_files_for_subject(subject_name: str, mode: str = "head2"):
    """
    Return the per-trial CSV paths of one subject, in natural trial order.

    Trial-2 layout:
      Head 1/<SUBJECT>/Jalan*.csv
      Head 2/<SUBJECT>/Jalan*.csv

    Args:
        subject_name: Name of the subject sub-folder (e.g. "A").
        mode: "head2" (case-insensitive) reads from HEAD2_DIR; any other
            value reads from HEAD1_DIR.

    Returns:
        list[str]: Matching paths, ordered so that digit runs in the file
        name compare numerically (Jalan2.csv comes before Jalan10.csv).
        Empty when nothing matches.
    """
    root_dir = HEAD2_DIR if mode.lower() == "head2" else HEAD1_DIR
    pattern = os.path.join(root_dir, subject_name, "Jalan*.csv")

    def natural_key(path):
        # re.split with a capturing group alternates text / digit runs, so the
        # odd positions are always the numeric pieces. Each piece becomes a
        # uniformly shaped tuple so text and numbers never compare directly.
        pieces = re.split(r"(\d+)", os.path.basename(path))
        tokens = [
            (1, int(piece), "") if idx % 2 else (0, 0, piece)
            for idx, piece in enumerate(pieces)
        ]
        # The full path breaks ties deterministically (e.g. "Jalan01" vs "Jalan1").
        return tokens, path

    return sorted(glob.glob(pattern), key=natural_key)

# Report how many trial files each subject has, plus one example path.
for subj in SUBJECTS:
    files = list_trial_files_for_subject(subj, mode=ANALYZE_MODE)
    print(f"Subjek {subj}: {len(files)} file ({ANALYZE_MODE})")
    if not files:
        continue
    print("  Contoh:", files[0])


Subjek A: 72 file (head2)
  Contoh: E:\0.TA_Teguh\GMM Trial 2\Head 2\A\Jalan1.csv
Subjek B: 72 file (head2)
  Contoh: E:\0.TA_Teguh\GMM Trial 2\Head 2\B\Jalan1.csv
Subjek C: 72 file (head2)
  Contoh: E:\0.TA_Teguh\GMM Trial 2\Head 2\C\Jalan1.csv
Subjek D: 72 file (head2)
  Contoh: E:\0.TA_Teguh\GMM Trial 2\Head 2\D\Jalan1.csv
Subjek E: 72 file (head2)
  Contoh: E:\0.TA_Teguh\GMM Trial 2\Head 2\E\Jalan1.csv
Subjek F: 72 file (head2)
  Contoh: E:\0.TA_Teguh\GMM Trial 2\Head 2\F\Jalan1.csv
Subjek G: 72 file (head2)
  Contoh: E:\0.TA_Teguh\GMM Trial 2\Head 2\G\Jalan1.csv
Subjek H: 72 file (head2)
  Contoh: E:\0.TA_Teguh\GMM Trial 2\Head 2\H\Jalan1.csv
Subjek I: 72 file (head2)
  Contoh: E:\0.TA_Teguh\GMM Trial 2\Head 2\I\Jalan1.csv
Subjek J: 72 file (head2)
  Contoh: E:\0.TA_Teguh\GMM Trial 2\Head 2\J\Jalan1.csv


✅ Cell 3 — Bangun df_counts: jumlah titik per frame untuk semua file

In [3]:
# Cell 3: Kumpulkan jumlah titik per frame untuk SEMUA subjek & SEMUA trial

def _pick_column(columns, candidates):
    """Return the first column that matches any candidate name, else None.

    Candidates are tried in order; each one is matched exactly first and
    then case-insensitively before moving on to the next candidate.
    """
    columns = list(columns)
    for cand in candidates:
        if cand in columns:
            return cand
        wanted = cand.lower()
        for real in columns:
            if real.lower() == wanted:
                return real
    return None


def _head2_frame_counts(df):
    """Build frame / num_points / valid_minpts from a Head-2 table (one row per frame).

    Returns None when the frame column or the inlier-count column is missing.
    """
    col_frame = _pick_column(df.columns, ["frame"])
    col_valid = _pick_column(df.columns, ["valid_minpts", "valid"])
    col_nin = _pick_column(df.columns, ["n_inlier", "ninlier", "points_after", "num_inlier"])

    if col_frame is None or col_nin is None:
        return None

    out = df[[col_frame, col_nin]].copy()
    out = out.rename(columns={col_frame: "frame", col_nin: "num_points"})

    if col_valid is not None:
        out["valid_minpts"] = df[col_valid].astype(int)
    else:
        # No validity flag in the file: treat any frame with points as valid.
        out["valid_minpts"] = (out["num_points"] > 0).astype(int)
    return out


def _head1_frame_counts(df):
    """Count rows per frame in a Head-1 table (one row per point).

    Returns None when no frame column is present.
    """
    col_frame = _pick_column(df.columns, ["frame"])
    if col_frame is None:
        return None

    counts = (
        df.groupby(col_frame)
          .size()
          .reset_index(name="num_points")
          .rename(columns={col_frame: "frame"})
    )
    # Every frame present is marked valid, on the premise that Head 1 only writes valid frames.
    counts["valid_minpts"] = 1
    return counts


# The mode does not change while the cell runs, so resolve it once.
use_head2 = ANALYZE_MODE.lower() == "head2"

rows = []

for subj in SUBJECTS:
    files = list_trial_files_for_subject(subj, ANALYZE_MODE)
    print(f"Proses subjek {subj}: {len(files)} file")

    for fpath in files:
        fname = os.path.basename(fpath)

        # Best effort: an unreadable file is reported and skipped so that one
        # bad CSV does not abort the whole collection pass.
        try:
            df = pd.read_csv(fpath)
        except Exception as e:
            print(f"  Gagal baca {fname}: {e}")
            continue

        if use_head2:
            counts = _head2_frame_counts(df)
            if counts is None:
                print(f"  WARNING: kolom wajib (frame, n_inlier) tidak lengkap di {fname}, skip.")
                continue
        else:
            counts = _head1_frame_counts(df)
            if counts is None:
                print(f"  WARNING: kolom 'frame' tidak ditemukan di {fname}, skip.")
                continue

        counts["subject"] = subj
        counts["file"] = fname
        rows.append(counts)

if rows:
    df_counts = pd.concat(rows, ignore_index=True)
else:
    # Keep the expected schema even when nothing could be loaded.
    df_counts = pd.DataFrame(columns=["frame", "num_points", "valid_minpts", "subject", "file"])

print("df_counts shape:", df_counts.shape)
df_counts.head()


Proses subjek A: 72 file
Proses subjek B: 72 file
Proses subjek C: 72 file
Proses subjek D: 72 file
Proses subjek E: 72 file
Proses subjek F: 72 file
Proses subjek G: 72 file
Proses subjek H: 72 file
Proses subjek I: 72 file
Proses subjek J: 72 file
df_counts shape: (133517, 5)


Unnamed: 0,frame,num_points,valid_minpts,subject,file
0,1,0,0,A,Jalan1.csv
1,2,0,0,A,Jalan1.csv
2,3,0,0,A,Jalan1.csv
3,4,0,0,A,Jalan1.csv
4,5,8,1,A,Jalan1.csv


✅ Cell 4 — Statistik global & per subjek (valid vs all)

In [4]:
# Cell 4: Statistik global & per subjek

# Stop early when no frame rows were collected; the statistics that follow
# need at least one row in df_counts.
if df_counts.empty:
    raise RuntimeError("df_counts kosong. Cek path output Trial-2, ANALYZE_MODE, dan pola file.")

def describe_array(arr):
    """Summarise a numeric sample as a dict of plain floats.

    Args:
        arr: Array-like of numbers.

    Returns:
        None when the input has length 0; otherwise a dict with the keys
        min, max, mean, median, p25, p50, p75, p90, p95, p99 (in that order).
    """
    values = np.asarray(arr)
    if len(values) == 0:
        return None

    reducers = (
        ("min", np.min),
        ("max", np.max),
        ("mean", np.mean),
        ("median", np.median),
    )
    summary = {label: float(reduce(values)) for label, reduce in reducers}

    # Percentile entries follow the basic statistics, lowest first.
    for q in (25, 50, 75, 90, 95, 99):
        summary[f"p{q}"] = float(np.percentile(values, q))
    return summary

def print_stats(title, stats):
    """Print a titled block of statistics, one indented line per entry.

    Args:
        title: Heading printed first.
        stats: Mapping of label -> number, or None to print a
            "(no data)" placeholder instead.
    """
    lines = [title]
    if stats is None:
        lines.append("  (no data)")
    else:
        # Labels are padded to 7 characters; values use 3 decimals.
        lines.extend(f"  {key:7s}: {value:.3f}" for key, value in stats.items())
    print("\n".join(lines))

# The validity flag is a column-level property, so check for it once.
has_valid_flag = "valid_minpts" in df_counts.columns

# --- Global: every frame ---
nums_all = df_counts["num_points"].values
print_stats("\n=== GLOBAL (ALL frames) ===", describe_array(nums_all))

# --- Global: valid frames only (when the flag exists) ---
if has_valid_flag:
    valid_mask = df_counts["valid_minpts"] == 1
    nums_valid = df_counts.loc[valid_mask, "num_points"].values
    print_stats("\n=== GLOBAL (VALID frames only) ===", describe_array(nums_valid))

# --- Per subject ---
print("\n=== PER SUBJECT (VALID frames only jika tersedia) ===")
for subj in SUBJECTS:
    sub = df_counts[df_counts["subject"] == subj]
    if sub.empty:
        print(f"\nSubjek {subj}: (no data)")
        continue

    if has_valid_flag:
        heading = f"\nSubjek {subj} (VALID)"
        sample = sub.loc[sub["valid_minpts"] == 1, "num_points"]
    else:
        heading = f"\nSubjek {subj} (ALL)"
        sample = sub["num_points"]
    print_stats(heading, describe_array(sample.values))



=== GLOBAL (ALL frames) ===
  min    : 0.000
  max    : 150.000
  mean   : 38.325
  median : 34.000
  p25    : 8.000
  p50    : 34.000
  p75    : 60.000
  p90    : 83.000
  p95    : 98.000
  p99    : 128.000

=== GLOBAL (VALID frames only) ===
  min    : 5.000
  max    : 150.000
  mean   : 44.819
  median : 41.000
  p25    : 19.000
  p50    : 41.000
  p75    : 65.000
  p90    : 87.000
  p95    : 101.000
  p99    : 131.000

=== PER SUBJECT (VALID frames only jika tersedia) ===

Subjek A (VALID)
  min    : 5.000
  max    : 150.000
  mean   : 45.656
  median : 42.000
  p25    : 19.000
  p50    : 42.000
  p75    : 66.000
  p90    : 88.000
  p95    : 104.000
  p99    : 133.000

Subjek B (VALID)
  min    : 5.000
  max    : 150.000
  mean   : 43.809
  median : 39.000
  p25    : 17.000
  p50    : 39.000
  p75    : 64.000
  p90    : 87.000
  p95    : 102.000
  p99    : 133.000

Subjek C (VALID)
  min    : 5.000
  max    : 150.000
  mean   : 40.404
  median : 36.000
  p25    : 17.000
  p50    :

✅ Cell 5 — Simulasi “sampling pressure” untuk kandidat N_target

In [5]:
# Cell 5: Simulasi kandidat N_target (sampling pressure)

def simulate_sampling_pressure(df_counts, n_targets, valid_only=True):
    """
    Measure how much resampling each candidate N_target would require.

    With M the number of points in a frame and N the candidate target:
      - pct_need_fill : % of frames with M < N (need padding/duplication/masking)
      - pct_need_down : % of frames with M > N (need downsampling)
      - avg_deficit   : mean of (N - M) over frames with M < N
      - avg_excess    : mean of (M - N) over frames with M > N
    Counts (n_need_fill, n_equal, n_need_down), pct_equal and the 95th
    percentiles of deficit/excess are reported as well.

    Args:
        df_counts: DataFrame with a "num_points" column and, optionally,
            a "valid_minpts" column.
        n_targets: Iterable of candidate N values; may be empty.
        valid_only: When True and "valid_minpts" exists, only rows with
            valid_minpts == 1 are used.

    Returns:
        DataFrame with one row per candidate, sorted by N_target. When
        n_targets is empty the frame has the same columns and zero rows.
    """
    columns = [
        "N_target", "total_frames_used",
        "n_need_fill", "n_equal", "n_need_down",
        "pct_need_fill", "pct_equal", "pct_need_down",
        "avg_deficit", "p95_deficit", "avg_excess", "p95_excess",
    ]

    # Boolean filtering and astype both return new objects, so the caller's
    # frame is never modified and no defensive copy is needed.
    frames = df_counts
    if valid_only and "valid_minpts" in frames.columns:
        frames = frames[frames["valid_minpts"] == 1]

    counts = frames["num_points"].astype(int).to_numpy()
    total = int(counts.size)

    def pct(n):
        # Guard the division for an empty selection.
        return 100.0 * n / total if total else 0.0

    records = []
    for N in n_targets:
        deficit = N - counts[counts < N]
        excess = counts[counts > N] - N
        n_fill = int(deficit.size)
        n_down = int(excess.size)
        n_eq = int(np.sum(counts == N))

        records.append({
            "N_target": int(N),
            "total_frames_used": total,
            "n_need_fill": n_fill,
            "n_equal": n_eq,
            "n_need_down": n_down,
            "pct_need_fill": pct(n_fill),
            "pct_equal": pct(n_eq),
            "pct_need_down": pct(n_down),
            "avg_deficit": float(np.mean(deficit)) if n_fill else 0.0,
            "p95_deficit": float(np.percentile(deficit, 95)) if n_fill else 0.0,
            "avg_excess": float(np.mean(excess)) if n_down else 0.0,
            "p95_excess": float(np.percentile(excess, 95)) if n_down else 0.0,
        })

    # Passing the column list keeps "N_target" present even with no records,
    # so sorting an empty result no longer raises KeyError.
    result = pd.DataFrame(records, columns=columns)
    return result.sort_values("N_target").reset_index(drop=True)


# Placeholder N_target candidates, to be refined from the Cell 4 statistics:
# steps of 4 from 16 to 32, then steps of 8 from 40 to 64.
N_candidates = [*range(16, 33, 4), *range(40, 65, 8)]

df_sim = simulate_sampling_pressure(df_counts, n_targets=N_candidates, valid_only=True)
print("Simulasi sampling pressure (VALID frames only):")
df_sim


Simulasi sampling pressure (VALID frames only):


Unnamed: 0,N_target,total_frames_used,n_need_fill,n_equal,n_need_down,pct_need_fill,pct_equal,pct_need_down,avg_deficit,p95_deficit,avg_excess,p95_excess
0,16,114170,24121,1131,88918,21.127266,0.990628,77.882106,7.696696,11.0,39.091612,90.0
1,20,114170,28758,1211,84201,25.188754,1.060699,73.750547,10.208881,15.0,37.139737,87.0
2,24,114170,34008,1388,78774,29.787159,1.215731,68.99711,12.392908,19.0,35.524335,84.0
3,28,114170,39139,1181,73850,34.281335,1.034422,64.684243,14.579678,23.0,33.729045,81.0
4,32,114170,43957,1280,68933,38.501358,1.121135,60.377507,16.814842,27.0,31.954303,79.0
5,40,114170,55174,1410,57586,48.32618,1.235,50.438819,20.670062,35.0,29.358976,74.0
6,48,114170,65299,1307,47564,57.194534,1.144784,41.660681,24.931224,43.0,26.592381,69.0
7,56,114170,75568,1221,37381,66.189016,1.069458,32.741526,29.076024,51.0,24.630507,66.0
8,64,114170,84497,1069,28604,74.00981,0.936323,25.053867,33.643384,59.0,22.825514,63.0


✅ Cell 6 — Rekomendasi kandidat N_target dan MIN_POINTS secara data-driven

In [7]:
# Cell 6: Data-driven recommendations (berdasarkan VALID frames)

# Work on valid frames only when the validity flag is available.
if "valid_minpts" in df_counts.columns:
    df_valid = df_counts[df_counts["valid_minpts"] == 1].copy()
else:
    df_valid = df_counts.copy()

nums = df_valid["num_points"].values
if len(nums) == 0:
    raise RuntimeError("Tidak ada VALID frames untuk dianalisis. Cek output Head-2 / Head-1 kamu.")

# Candidate targets: rounded median, 75th and 90th percentile of the point count.
N_med, N_p75, N_p90 = (int(np.round(np.percentile(nums, q))) for q in (50, 75, 90))

print("=== Kandidat N_target dari distribusi VALID frames ===")
print("N_median (p50):", N_med)
print("N_p75        :", N_p75)
print("N_p90        :", N_p90)

# A "reasonable minimum points" figure (insight only), e.g. p10 or p25.
low_p = 10
MIN_POINTS_SUGGESTED = int(np.floor(np.percentile(nums, low_p)))
print(f"\nInsight MIN_POINTS dari p{low_p} VALID num_points:", MIN_POINTS_SUGGESTED)

# Optional: auto-build candidates at each anchor and 4 points either side;
# only the lower neighbour is clamped to at least 1.
N_auto = sorted({
    value
    for center in (N_med, N_p75, N_p90)
    for value in (max(1, center - 4), center, center + 4)
})
print("\nN_candidates auto (sekitar p50/p75/p90):", N_auto)

df_sim_auto = simulate_sampling_pressure(df_counts, N_auto, valid_only=True)
df_sim_auto


=== Kandidat N_target dari distribusi VALID frames ===
N_median (p50): 41
N_p75        : 65
N_p90        : 87

Insight MIN_POINTS dari p10 VALID num_points: 7

N_candidates auto (sekitar p50/p75/p90): [37, 41, 45, 61, 65, 69, 83, 87, 91]


Unnamed: 0,N_target,total_frames_used,n_need_fill,n_equal,n_need_down,pct_need_fill,pct_equal,pct_need_down,avg_deficit,p95_deficit,avg_excess,p95_excess
0,37,114170,50873,1461,61836,44.558991,1.279671,54.161338,19.247223,32.0,30.271848,75.0
1,41,114170,56584,1279,56307,49.561181,1.120259,49.31856,21.154991,36.0,29.003143,73.0
2,45,114170,61567,1183,51420,53.925725,1.036174,45.038101,23.323355,40.0,27.524601,71.0
3,61,114170,81264,1056,31850,71.178068,0.924936,27.896996,31.902048,56.0,23.395102,64.0
4,65,114170,85566,988,27616,74.946133,0.865376,24.188491,34.223068,60.0,22.606351,62.0
5,69,114170,89583,961,23626,78.46457,0.841727,20.693702,36.621446,64.0,22.007322,61.0
6,83,114170,100311,661,13198,87.861084,0.578961,11.559954,46.067729,78.0,19.851947,57.0
7,87,114170,102644,495,11031,89.904528,0.433564,9.661908,48.988202,82.0,19.270329,55.5
8,91,114170,104569,408,9193,91.59061,0.357362,8.052028,52.058985,86.0,18.633961,53.0
