In [4]:
# Cell 1: Import & konfigurasi dasar

import os
import glob
import numpy as np
import pandas as pd

# Base folder holding the torso CSV results.
# To run on another machine without editing the notebook, set the
# BASE_TORSO_DIR environment variable; when it is unset the original
# local path below is used.
BASE_TORSO_DIR = os.environ.get(
    "BASE_TORSO_DIR", r"E:\0.TA_Teguh\dataset2\hasil_torso"
)

# Subjects to process (each name is joined onto BASE_TORSO_DIR as a sub-folder).
SUBJECTS = ["Afi", "Tsamara", "Tsania"]

print("BASE_TORSO_DIR :", BASE_TORSO_DIR)
print("SUBJECTS       :", SUBJECTS)


BASE_TORSO_DIR : E:\0.TA_Teguh\dataset2\hasil_torso
SUBJECTS       : ['Afi', 'Tsamara', 'Tsania']


In [5]:
# Cell 2: Fungsi untuk mencari semua file torso per subjek

def list_torso_files_for_subject(subject_name: str, base_dir=None, pattern: str = "torso_*.csv"):
    """
    Return the sorted list of torso CSV paths for one subject.

    Files are searched as ``<base_dir>/<subject_name>/<pattern>``.

    Parameters
    ----------
    subject_name : str
        Name of the subject's sub-folder.
    base_dir : str or None, optional
        Root folder that contains one sub-folder per subject. When None
        (the default) the module-level ``BASE_TORSO_DIR`` is used.
    pattern : str, optional
        Glob pattern applied to the file names inside the subject folder.
        Defaults to ``"torso_*.csv"`` (e.g. ``torso_Jalan1.csv``).

    Returns
    -------
    list of str
        Matching paths in lexicographic order (so ``torso_Jalan10.csv``
        comes before ``torso_Jalan2.csv``). The list is empty when the
        folder does not exist or nothing matches.
    """
    root = BASE_TORSO_DIR if base_dir is None else base_dir
    subj_dir = os.path.join(root, subject_name)
    # Escape the directory part so glob metacharacters in folder names
    # (e.g. "[" or "]") are matched literally; only `pattern` is a wildcard.
    search = os.path.join(glob.escape(subj_dir), pattern)
    return sorted(glob.glob(search))

# Quick check: report how many torso files each subject has and show the
# first path (if any), followed by a blank separator line.
for subj in SUBJECTS:
    files = list_torso_files_for_subject(subj)
    summary = [f"Subjek: {subj}, jumlah file torso: {len(files)}"]
    if files:
        summary.append(f"  Contoh file: {files[0]}")
    print("\n".join(summary))
    print()


Subjek: Afi, jumlah file torso: 72
  Contoh file: E:\0.TA_Teguh\dataset2\hasil_torso\Afi\torso_Jalan1.csv

Subjek: Tsamara, jumlah file torso: 72
  Contoh file: E:\0.TA_Teguh\dataset2\hasil_torso\Tsamara\torso_Jalan1.csv

Subjek: Tsania, jumlah file torso: 72
  Contoh file: E:\0.TA_Teguh\dataset2\hasil_torso\Tsania\torso_Jalan1.csv



In [6]:
# Cell 3: Collect the number of points per frame for ALL files & ALL subjects

rows = []  # one per-file DataFrame of per-frame counts

for subj in SUBJECTS:
    torso_files = list_torso_files_for_subject(subj)
    print(f"Proses subjek: {subj}, {len(torso_files)} file")

    for fpath in torso_files:
        fname = os.path.basename(fpath)

        # Best-effort read: a file that cannot be parsed is reported and skipped
        # so one bad CSV does not abort the whole scan.
        try:
            df = pd.read_csv(fpath)
        except Exception as e:
            print(f"  Gagal baca {fname}: {e}")
            continue

        if "frame" in df.columns:
            # Number of rows per distinct `frame` value = points in that frame.
            per_frame = df.groupby("frame").size()
            # Tag each count row with its subject and source file name.
            counts = per_frame.reset_index(name="num_points").assign(
                subject=subj,
                file=fname,
            )
            rows.append(counts)
        else:
            print(f"  WARNING: kolom 'frame' tidak ditemukan di {fname}, skip.")

# Merge everything into one large DataFrame (or an empty one with the
# expected columns when nothing could be read).
df_counts = (
    pd.concat(rows, ignore_index=True)
    if rows
    else pd.DataFrame(columns=["frame", "num_points", "subject", "file"])
)

print("df_counts shape:", df_counts.shape)
df_counts.head()


Proses subjek: Afi, 72 file
Proses subjek: Tsamara, 72 file
Proses subjek: Tsania, 72 file
df_counts shape: (30010, 4)


Unnamed: 0,frame,num_points,subject,file
0,1,18,Afi,torso_Jalan1.csv
1,2,2,Afi,torso_Jalan1.csv
2,3,4,Afi,torso_Jalan1.csv
3,4,6,Afi,torso_Jalan1.csv
4,5,2,Afi,torso_Jalan1.csv


In [7]:
# Cell 4: Global and per-subject statistics for num_points

# Only continue when at least one frame count was collected; otherwise stop
# with a hint about the most likely cause (wrong folder or file pattern).
if not df_counts.empty:
    nums = df_counts["num_points"].values
else:
    raise RuntimeError("df_counts kosong. Cek BASE_TORSO_DIR dan pola file torso.")

def describe_array(arr):
    """
    Summarise a numeric array with a fixed set of descriptive statistics.

    Parameters
    ----------
    arr : array-like
        Numeric values. Converted with ``np.asarray``; every element is used
        (the reductions are called without an axis).

    Returns
    -------
    dict
        Keys, in this order: ``min``, ``max``, ``mean``, ``median``, ``p25``,
        ``p75``, ``p90``, ``p95``. Every value is a Python ``float``.

    Raises
    ------
    ValueError
        If ``arr`` contains no elements.
    """
    arr = np.asarray(arr)
    if arr.size == 0:
        # Fail with an explicit message instead of numpy's generic
        # "zero-size array to reduction operation" error.
        raise ValueError("describe_array() requires at least one value (got an empty array)")

    # One vectorised call instead of four separate percentile computations.
    p25, p75, p90, p95 = np.percentile(arr, [25, 75, 90, 95])

    return {
        "min":    float(np.min(arr)),
        "max":    float(np.max(arr)),
        "mean":   float(np.mean(arr)),
        "median": float(np.median(arr)),
        "p25":    float(p25),
        "p75":    float(p75),
        "p90":    float(p90),
        "p95":    float(p95),
    }

# Overall statistics across every subject.
print("=== STATISTIK GLOBAL (SEMUA SUBJEK) ===")
global_stats = describe_array(nums)
for k in global_stats:
    v = global_stats[k]
    print(f"{k:7s} : {v:.3f}")

# Same statistics, restricted to each subject's rows in turn.
print("\n=== STATISTIK PER SUBJEK ===")
for subj in SUBJECTS:
    is_subj = df_counts["subject"] == subj
    sub_nums = df_counts.loc[is_subj, "num_points"].values
    if len(sub_nums) > 0:
        stats = describe_array(sub_nums)
        print(f"\nSubjek: {subj}")
        for k, v in stats.items():
            print(f"  {k:7s} : {v:.3f}")
    else:
        # No frames were collected for this subject; say so instead of failing.
        print(f"\nSubjek {subj}: TIDAK ADA DATA")


=== STATISTIK GLOBAL (SEMUA SUBJEK) ===
min     : 1.000
max     : 145.000
mean    : 28.381
median  : 22.000
p25     : 12.000
p75     : 41.000
p90     : 59.000
p95     : 69.000

=== STATISTIK PER SUBJEK ===

Subjek: Afi
  min     : 1.000
  max     : 139.000
  mean    : 28.550
  median  : 22.000
  p25     : 12.000
  p75     : 41.000
  p90     : 59.000
  p95     : 70.000

Subjek: Tsamara
  min     : 1.000
  max     : 137.000
  mean    : 26.344
  median  : 19.000
  p25     : 11.000
  p75     : 38.000
  p90     : 56.000
  p95     : 65.000

Subjek: Tsania
  min     : 1.000
  max     : 145.000
  mean    : 30.648
  median  : 25.000
  p25     : 13.000
  p75     : 45.000
  p90     : 61.000
  p95     : 71.000


In [24]:
# Cell 5: Fungsi untuk menghitung % upsample / downsample untuk beberapa kandidat N_target

def simulate_bootstrap_stats(df_counts, n_targets):
    """
    Summarise, for each candidate target size, how many frames would need
    resampling to reach it.

    With M the value in the ``num_points`` column of a frame and N a
    candidate from ``n_targets``, each frame falls into one of:
      - M < N  (would need upsampling)
      - M == N (already at the target)
      - M > N  (would need downsampling)

    Returns a DataFrame with one row per N and the columns
    ``N_target``, ``total_frames``, ``n_upsample``, ``n_equal``,
    ``n_downsample``, ``pct_up``, ``pct_eq``, ``pct_down``.
    Percentages are 0.0 when ``df_counts`` has no rows.
    """
    point_counts = df_counts["num_points"].values
    total = len(point_counts)

    def _pct(count):
        # Share of all frames, in percent; guarded against division by zero.
        return 100.0 * count / total if total > 0 else 0.0

    records = []
    for N in n_targets:
        n_up = int(np.sum(point_counts < N))
        n_eq = int(np.sum(point_counts == N))
        n_down = int(np.sum(point_counts > N))

        records.append(
            {
                "N_target": N,
                "total_frames": int(total),
                "n_upsample": n_up,
                "n_equal": n_eq,
                "n_downsample": n_down,
                "pct_up": _pct(n_up),
                "pct_eq": _pct(n_eq),
                "pct_down": _pct(n_down),
            }
        )

    return pd.DataFrame(records)


# Example: pick the initial N_target candidates by hand
# (adjust them after looking at the statistics printed by Cell 4).
#
# They could instead be derived from global_stats, e.g.
#   int(global_stats["median"]), int(global_stats["p75"]), int(global_stats["p90"])

# Initial placeholder values; REPLACE after inspecting the real statistics.
N_candidates = [
    18, 19, 20, 22, 25, 32, 38, 41, 56, 64, 65, 71, 128,
]

df_bootstrap_sim = simulate_bootstrap_stats(df_counts, N_candidates)
print("Simulasi up/down untuk kandidat N_target:")
print(df_bootstrap_sim)


Simulasi up/down untuk kandidat N_target:
    N_target  total_frames  n_upsample  n_equal  n_downsample     pct_up  \
0         18         30010       12912      562         16536  43.025658   
1         19         30010       13474      480         16056  44.898367   
2         20         30010       13954      394         15662  46.497834   
3         22         30010       14788      415         14807  49.276908   
4         25         30010       16132      464         13414  53.755415   
5         32         30010       18853      325         10832  62.822393   
6         38         30010       21125      392          8493  70.393202   
7         41         30010       22236      377          7397  74.095302   
8         56         30010       26294      236          3480  87.617461   
9         64         30010       27808      138          2064  92.662446   
10        65         30010       27946      164          1900  93.122293   
11        71         30010       28697       9

In [14]:
# Cell 6 (optional): Derive MIN_POINTS_TORSO from the data

# Percentile of the per-frame point counts used as the lower bound.
# The value in effect is the 25th percentile.
lower_percentile = 25
percentile_value = np.percentile(nums, lower_percentile)
# Round down so the threshold is a whole number of points.
MIN_POINTS_TORSO = int(np.floor(percentile_value))

print(f"Rekomendasi awal MIN_POINTS_TORSO (p{lower_percentile:.0f}):", MIN_POINTS_TORSO)


Rekomendasi awal MIN_POINTS_TORSO (p25): 12
