🧩 Cell 1 – Import & path konfigurasi dasar

In [8]:
# Cell 1: Import & konfigurasi path V3

import os
import glob
import re
from pathlib import Path

import numpy as np
import pandas as pd

# Input folder: gait_with_steps files (output of Stage 3)
FOLDER_GAIT_WITH_STEPS = r"E:\1.Clustering_TA\dataset\gaitframe_steps"

# Output folder: final gait features, version V3
FOLDER_GAIT_FEATURES_FINAL_ROOT_V3 = r"E:\1.Clustering_TA\dataset\gait_features_final_v3"
# Create the output folder up front; exist_ok avoids an error if it already exists.
os.makedirs(FOLDER_GAIT_FEATURES_FINAL_ROOT_V3, exist_ok=True)

print("Input gait_with_steps  :", FOLDER_GAIT_WITH_STEPS)
print("Output root V3         :", FOLDER_GAIT_FEATURES_FINAL_ROOT_V3)

# Name of the time column (seconds) in the gait_with_steps files.
# If your files use a different name (for example 't_sec'),
# just change it here.
COL_T_SEC = "timestamp_sec"   # change if needed
COL_STEP_FLAG = "step_event"

# Torso position columns (used to compute the 2D distance)
COL_X = "torso_x"
COL_Y = "torso_y"


Input gait_with_steps  : E:\1.Clustering_TA\dataset\gaitframe_steps
Output root V3         : E:\1.Clustering_TA\dataset\gait_features_final_v3


🧩 Cell 2 – Helper: parsing nama file & MAD-based filtering step interval

In [9]:
# Cell 2: Helper parsing nama file & fungsi MAD untuk interval langkah

def parse_subject_trial_from_filename(path):
    """
    Split a gait CSV path into subject, trial name and trial number.

    Example file names:
    - Afi_Jalan1_gait_with_steps.csv
    - Kinan_Jalan10_gait_with_steps.csv

    Returns a tuple ``(subject, trial_name, trial_num)``, e.g.
    ``('Afi', 'Jalan1', 1)``. ``trial_num`` is ``None`` when the trial
    name does not end in digits. A name without any underscore is used
    as both the subject and the trial name.
    """
    # File name without its extension, with '_gait_with_steps' stripped out.
    base = Path(path).stem.replace("_gait_with_steps", "")

    # Text before the first underscore is the subject; the rest is the trial.
    subject, separator, remainder = base.partition("_")
    trial_name = remainder if separator else base

    # Trailing digits of the trial name (e.g. 'Jalan10' -> 10).
    digits = re.search(r"(\d+)$", trial_name)
    trial_num = None if digits is None else int(digits.group(1))

    return subject, trial_name, trial_num


def compute_valid_interval_mask(intervals, min_interval=0.2, max_interval=2.5, mad_k=3.5):
    """
    Flag which step-time intervals look plausible.

    Parameters
    ----------
    intervals : array-like of float
        Step-to-step time intervals in seconds. Any sequence accepted by
        ``np.asarray`` works (list, tuple, ndarray).
    min_interval, max_interval : float
        Loose inclusive range applied first as a physiological filter.
    mad_k : float
        Multiplier on the median absolute deviation (MAD); in-range
        intervals further than ``mad_k`` MADs from the in-range median
        are rejected.

    Returns
    -------
    np.ndarray of bool
        Same length as ``intervals``; True marks an interval kept as
        'healthy'. For non-empty input at least one entry is always True,
        because every filtering stage falls back to the previous one
        instead of rejecting everything.
    """
    # Coerce so that plain Python sequences support the vectorised comparisons.
    intervals = np.asarray(intervals, dtype=float)
    if intervals.size == 0:
        return np.array([], dtype=bool)

    # Loose physiological range filter.
    in_range = (intervals >= min_interval) & (intervals <= max_interval)
    if not in_range.any():
        # Nothing falls inside the range: fall back to accepting every interval.
        return np.ones(intervals.shape, dtype=bool)

    # MAD-based outlier filter, computed only over the in-range intervals.
    candidates = intervals[in_range]
    abs_dev = np.abs(candidates - np.median(candidates))
    mad = np.median(abs_dev)
    if mad == 0:
        # No measurable spread around the median; the range filter is enough.
        return in_range

    # Scatter the per-candidate verdict back onto the full-length mask;
    # out-of-range positions stay False.
    mask = np.zeros(intervals.shape, dtype=bool)
    mask[in_range] = (abs_dev / mad) <= mad_k
    if not mask.any():
        # The MAD filter rejected everything: fall back to the range filter.
        return in_range

    return mask


🧩 Cell 3 – Fungsi inti: hitung fitur gait V3 untuk satu trial

In [10]:
# Cell 3: Fungsi inti menghitung fitur gait V3 untuk satu trial

def _unusable_trial_features(duration_walk=np.nan, distance_2d=np.nan):
    """Return the filtered-feature columns for a trial with no usable steps."""
    return {
        "step_count_valid": 0,
        "mean_step_time": np.nan,
        "step_time_std": np.nan,
        "cadence_spm": np.nan,
        "duration_valid_sec": np.nan,
        "distance_2d_valid_m": np.nan,
        "walking_speed_ms": np.nan,
        "step_length_m": np.nan,
        "ratio_valid_steps": 0.0,
        "is_valid_trial": 0,
        # backward-compatible aliases
        "duration_walk_sec": duration_walk,
        "distance_2d_m": distance_2d,
    }


def _elapsed_or_nan(t_start, t_end):
    """Return ``t_end - t_start`` as a float, or NaN when it is not positive."""
    return float(t_end - t_start) if t_end > t_start else np.nan


def _rate_or_nan(numerator, denominator):
    """Return ``numerator / denominator``, or NaN unless the denominator is > 0."""
    # A NaN denominator compares False here, so it also yields NaN.
    return numerator / denominator if denominator > 0 else np.nan


def _distance_2d_between_rows(df, row_a, row_b, col_x, col_y):
    """Return the straight-line distance between the (x, y) values of two rows."""
    dx = float(df.loc[row_b, col_x]) - float(df.loc[row_a, col_x])
    dy = float(df.loc[row_b, col_y]) - float(df.loc[row_a, col_y])
    return float(np.sqrt(dx**2 + dy**2))


def compute_final_gait_features_v3_for_trial(
    df_trial,
    subject,
    trial_name,
    trial_num,
    col_t=COL_T_SEC,
    col_step=COL_STEP_FLAG,
    col_x=COL_X,
    col_y=COL_Y,
    min_interval=0.2,
    max_interval=2.5,
    mad_k=3.5
):
    """
    Compute the gait features (raw + filtered + quality flag) for one trial.

    Parameters
    ----------
    df_trial : pandas.DataFrame
        Per-frame data of one trial. Must contain the columns named by
        ``col_t``, ``col_step``, ``col_x`` and ``col_y``. Rows whose
        ``col_step`` value equals 1 are treated as step events.
    subject, trial_name, trial_num
        Identifiers copied unchanged into the output row.
    col_t, col_step, col_x, col_y : str
        Names of the time, step-flag and position columns.
    min_interval, max_interval, mad_k : float
        Forwarded to ``compute_valid_interval_mask`` to decide which
        step-to-step intervals are kept.

    Returns
    -------
    dict
        One row for the final feature table. Raw features use the first
        and last step event; filtered features use the steps spanned by
        the first through the last kept interval. ``is_valid_trial`` is 1
        only when every quality rule at the end of the function holds.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``df_trial``.
    """
    row = {
        "subject": subject,
        "trial_name": trial_name,
        "trial_num": trial_num,
    }

    # reset_index returns a new frame, so positional indices from np.where
    # can be used as .loc labels without touching the caller's DataFrame.
    df = df_trial.reset_index(drop=True)

    # Make sure the required columns exist.
    for col in (col_t, col_step, col_x, col_y):
        if col not in df.columns:
            raise KeyError(f"Kolom '{col}' tidak ditemukan di trial {subject} - {trial_name}")

    # Collect the step events: row positions and timestamps.
    step_mask = df[col_step] == 1
    idx_steps = np.where(step_mask)[0]
    ts_steps = df.loc[step_mask, col_t].to_numpy()

    # --- RAW FEATURES (V1-style) ---
    step_count_raw = len(ts_steps)
    row["step_count_raw"] = step_count_raw

    if step_count_raw < 2:
        # Fewer than two steps: no interval exists, so nothing can be computed.
        row.update({
            "duration_raw_sec": np.nan,
            "distance_2d_raw_m": np.nan,
            "speed_raw_ms": np.nan,
        })
        row.update(_unusable_trial_features())
        return row

    # Raw duration, distance and speed between the first and last step event.
    duration_raw = _elapsed_or_nan(ts_steps[0], ts_steps[-1])
    distance_raw = _distance_2d_between_rows(df, idx_steps[0], idx_steps[-1], col_x, col_y)

    row["duration_raw_sec"] = duration_raw
    row["distance_2d_raw_m"] = distance_raw
    row["speed_raw_ms"] = _rate_or_nan(distance_raw, duration_raw)

    # --- FILTERED FEATURES (V3-based) ---

    # Intervals between consecutive steps; at least one exists because
    # step_count_raw >= 2 at this point.
    intervals = np.diff(ts_steps)

    # Mask of 'healthy' intervals (loose range + MAD).
    mask_valid_intervals = compute_valid_interval_mask(
        intervals,
        min_interval=min_interval,
        max_interval=max_interval,
        mad_k=mad_k
    )
    valid_int_indices = np.where(mask_valid_intervals)[0]

    if len(valid_int_indices) == 0:
        # No healthy interval: keep the raw values only in the legacy aliases.
        row.update(_unusable_trial_features(duration_walk=duration_raw, distance_2d=distance_raw))
        return row

    # Interval i joins step i and step i+1, so the valid step span runs from
    # the first kept interval's start step to the last kept interval's end step.
    # NOTE(review): rejected intervals lying between two kept ones stay inside
    # this span, so duration/distance include them while mean_step_time does not.
    first_valid_interval_idx = valid_int_indices[0]
    last_valid_interval_idx = valid_int_indices[-1]
    step_span = slice(first_valid_interval_idx, last_valid_interval_idx + 2)

    ts_steps_valid = ts_steps[step_span]
    idx_steps_valid = idx_steps[step_span]
    step_count_valid = len(ts_steps_valid)

    # Mean and sample std of the step time, over the kept intervals only.
    intervals_valid = intervals[valid_int_indices]
    mean_step_time = float(np.mean(intervals_valid))
    # A single interval has no sample std; it is reported as 0.0.
    step_time_std = float(np.std(intervals_valid, ddof=1)) if len(intervals_valid) > 1 else 0.0

    duration_valid = _elapsed_or_nan(ts_steps_valid[0], ts_steps_valid[-1])
    distance_valid = _distance_2d_between_rows(
        df, idx_steps_valid[0], idx_steps_valid[-1], col_x, col_y
    )
    walking_speed_ms = _rate_or_nan(distance_valid, duration_valid)

    # Cadence in steps per minute.
    cadence_spm = _rate_or_nan(60.0, mean_step_time)

    # Step length in metres per step (step_count_valid is always >= 2 here).
    # NOTE(review): this divides by the number of step events, not by the
    # number of intervals between them (step_count_valid - 1) — confirm the
    # intended definition before changing, since it alters exported values.
    step_length_m = distance_valid / step_count_valid

    ratio_valid_steps = step_count_valid / step_count_raw

    # --- is_valid_trial (V3 rules) ---
    # Compromise rules: not too strict, still physiological. Any NaN value
    # makes its comparison False and therefore invalidates the trial.
    is_valid_trial = int(all((
        step_count_valid >= 4,                # minimum number of valid steps
        duration_valid >= 1.2,                # minimum duration (seconds)
        distance_valid >= 0.3,                # minimum distance (metres)
        0.1 <= walking_speed_ms <= 3.5,       # speed range
        0.2 <= mean_step_time <= 2.0,         # mean step time range (seconds)
    )))

    row.update({
        "step_count_valid": step_count_valid,
        "mean_step_time": mean_step_time,
        "step_time_std": step_time_std,
        "cadence_spm": cadence_spm,
        "duration_valid_sec": duration_valid,
        "distance_2d_valid_m": distance_valid,
        "walking_speed_ms": walking_speed_ms,
        "step_length_m": step_length_m,
        "ratio_valid_steps": ratio_valid_steps,
        "is_valid_trial": is_valid_trial,
        # Aliases for compatibility with V1/V2.
        "duration_walk_sec": duration_valid,
        "distance_2d_m": distance_valid,
    })

    return row


🧩 Cell 4 – Fungsi utama: proses semua file gait_with_steps → df_final_v3

In [11]:
# Cell 4: Proses semua file gait_with_steps untuk membuat fitur V3

def process_all_final_gait_features_v3(
    folder_in=FOLDER_GAIT_WITH_STEPS,
    folder_out=FOLDER_GAIT_FEATURES_FINAL_ROOT_V3,
    min_interval=0.2,
    max_interval=2.5,
    mad_k=3.5,
):
    """
    Build the V3 gait-feature table from every gait_with_steps CSV.

    Parameters
    ----------
    folder_in : str
        Folder scanned for files matching ``*_gait_with_steps.csv``.
    folder_out : str
        Folder that receives the global CSV and one CSV per subject.
        It is created if it does not exist yet.
    min_interval, max_interval, mad_k : float
        Step-interval filter settings forwarded to
        ``compute_final_gait_features_v3_for_trial``.

    Returns
    -------
    pandas.DataFrame or None
        One row per trial, sorted by subject and trial number, or None
        when no input file is found.

    Raises
    ------
    KeyError
        If a file has neither the configured time column nor 't_sec',
        or lacks another required column.
    """
    pattern = os.path.join(folder_in, "*_gait_with_steps.csv")
    files = sorted(glob.glob(pattern))

    if not files:
        print("Tidak ada file gait_with_steps yang ditemukan di:", folder_in)
        return None

    rows = []

    print("Ditemukan", len(files), "file gait_with_steps.")
    for fpath in files:
        subject, trial_name, trial_num = parse_subject_trial_from_filename(fpath)
        print(f"Proses: {Path(fpath).name} -> subject={subject}, trial={trial_name}, trial_num={trial_num}")

        df_trial = pd.read_csv(fpath)

        # Fall back to 't_sec' when the configured time column is absent.
        col_t_used = COL_T_SEC
        if COL_T_SEC not in df_trial.columns:
            if "t_sec" in df_trial.columns:
                col_t_used = "t_sec"
            else:
                raise KeyError(f"Tidak menemukan kolom waktu '{COL_T_SEC}' maupun 't_sec' di {fpath}")

        rows.append(
            compute_final_gait_features_v3_for_trial(
                df_trial,
                subject=subject,
                trial_name=trial_name,
                trial_num=trial_num,
                col_t=col_t_used,
                col_step=COL_STEP_FLAG,
                col_x=COL_X,
                col_y=COL_Y,
                min_interval=min_interval,
                max_interval=max_interval,
                mad_k=mad_k,
            )
        )

    df_all = pd.DataFrame(rows)

    # Global sort: subject first, then trial number (every row carries both keys).
    df_all_sorted = df_all.sort_values(["subject", "trial_num"]).reset_index(drop=True)

    # A caller-supplied output folder may not exist yet; create it before writing.
    os.makedirs(folder_out, exist_ok=True)

    # Save the global table.
    global_out_path = os.path.join(folder_out, "gait_features_final_all_v3.csv")
    df_all_sorted.to_csv(global_out_path, index=False)
    print("\n=== [V3] Global summary disimpan ===")
    print(global_out_path)

    # Save one file per subject (format similar to V1/V2).
    for subject, dsub in df_all_sorted.groupby("subject"):
        fname = f"{subject}_gait_features_5_v3.csv"
        fpath = os.path.join(folder_out, fname)
        dsub.to_csv(fpath, index=False)
        print(f"  [V3] Subject {subject} disimpan di: {fpath}")

    return df_all_sorted


🧩 Cell 5 – Jalankan V3 & sanity check singkat

In [12]:
# Cell 5: Run the V3 pipeline and do a simple sanity check

# NOTE: process_all_final_gait_features_v3 returns None when no input file is
# found; in that case the .head() call below raises AttributeError.
df_final_v3 = process_all_final_gait_features_v3()

df_final_v3.head()


Ditemukan 216 file gait_with_steps.
Proses: Afi_Jalan10_gait_with_steps.csv -> subject=Afi, trial=Jalan10, trial_num=10
Proses: Afi_Jalan11_gait_with_steps.csv -> subject=Afi, trial=Jalan11, trial_num=11
Proses: Afi_Jalan12_gait_with_steps.csv -> subject=Afi, trial=Jalan12, trial_num=12
Proses: Afi_Jalan13_gait_with_steps.csv -> subject=Afi, trial=Jalan13, trial_num=13
Proses: Afi_Jalan14_gait_with_steps.csv -> subject=Afi, trial=Jalan14, trial_num=14
Proses: Afi_Jalan15_gait_with_steps.csv -> subject=Afi, trial=Jalan15, trial_num=15
Proses: Afi_Jalan16_gait_with_steps.csv -> subject=Afi, trial=Jalan16, trial_num=16
Proses: Afi_Jalan17_gait_with_steps.csv -> subject=Afi, trial=Jalan17, trial_num=17
Proses: Afi_Jalan18_gait_with_steps.csv -> subject=Afi, trial=Jalan18, trial_num=18
Proses: Afi_Jalan19_gait_with_steps.csv -> subject=Afi, trial=Jalan19, trial_num=19
Proses: Afi_Jalan1_gait_with_steps.csv -> subject=Afi, trial=Jalan1, trial_num=1
Proses: Afi_Jalan20_gait_with_steps.csv -> 

Unnamed: 0,subject,trial_name,trial_num,step_count_raw,duration_raw_sec,distance_2d_raw_m,speed_raw_ms,step_count_valid,mean_step_time,step_time_std,cadence_spm,duration_valid_sec,distance_2d_valid_m,walking_speed_ms,step_length_m,ratio_valid_steps,is_valid_trial,duration_walk_sec,distance_2d_m
0,Afi,Jalan1,1,3,0.714816,1.180483,1.651451,3,0.357408,0.040571,167.875369,0.714816,1.180483,1.651451,0.393494,1.0,0,0.714816,1.180483
1,Afi,Jalan2,2,6,2.083814,0.732601,0.351567,6,0.304218,0.033932,197.226989,2.083814,0.732601,0.351567,0.1221,1.0,1,2.083814,0.732601
2,Afi,Jalan3,3,5,1.597363,3.081816,1.929315,5,0.399341,0.052022,150.247627,1.597363,3.081816,1.929315,0.616363,1.0,1,1.597363,3.081816
3,Afi,Jalan4,4,2,0.39193,4.584173,11.696408,2,0.39193,0.0,153.088562,0.39193,4.584173,11.696408,2.292087,1.0,0,0.39193,4.584173
4,Afi,Jalan5,5,5,1.971074,2.492903,1.264743,4,0.349715,0.081992,171.568277,1.049145,1.700045,1.62041,0.425011,0.8,0,1.049145,1.700045


In [None]:
# Cell 6: V3 sanity check (similar to V1/V2 but using the new columns)

# Work on a copy of the final feature table.
df = df_final_v3.copy()

def count_true(series):
    """Return, as an int, the sum of ``series`` after replacing missing values with False."""
    without_missing = series.fillna(False)
    total = without_missing.sum()
    return int(total)

# Global counts over every trial. Comparisons against NaN are False, so
# trials whose features could not be computed are not counted in these rows.
print("=== SANITY CHECK GLOBAL (V3) ===")
print("Total trial                    :", len(df))
print("Step_count_raw  < 2            :", count_true(df["step_count_raw"] < 2))
print("Step_count_valid < 4           :", count_true(df["step_count_valid"] < 4))
print("Durasi_valid <= 0 (detik)      :", count_true(df["duration_valid_sec"] <= 0))
print("Distance_valid < 0.2 m         :", count_true(df["distance_2d_valid_m"] < 0.2))
print("Speed_valid > 3.5 m/s          :", count_true(df["walking_speed_ms"] > 3.5))
# The label's en dash was mis-encoded in the source; restored here.
print("Speed_valid 0–0.1 m/s          :", count_true((df["walking_speed_ms"] > 0) & (df["walking_speed_ms"] < 0.1)))
print("Trial dengan is_valid_trial = 0:", count_true(df["is_valid_trial"] == 0))

# The same counts broken down per subject, in order of first appearance.
print("\n=== SANITY CHECK PER SUBJECT (V3) ===")
for subj in df["subject"].unique():
    dsub = df[df["subject"] == subj]
    print(f"\n-- {subj} --")
    print("  Total trial                 :", len(dsub))
    print("  Step_count_raw  < 2         :", count_true(dsub["step_count_raw"] < 2))
    print("  Step_count_valid < 4        :", count_true(dsub["step_count_valid"] < 4))
    print("  Durasi_valid <= 0 (detik)   :", count_true(dsub["duration_valid_sec"] <= 0))
    print("  Distance_valid < 0.2 m      :", count_true(dsub["distance_2d_valid_m"] < 0.2))
    print("  Speed_valid > 3.5 m/s       :", count_true(dsub["walking_speed_ms"] > 3.5))
    print("  is_valid_trial = 0          :", count_true(dsub["is_valid_trial"] == 0))

=== SANITY CHECK GLOBAL (V3) ===
Total trial                    : 216
Step_count_raw  < 2            : 1
Step_count_valid < 4           : 20
Durasi_valid <= 0 (detik)      : 0
Distance_valid < 0.2 m         : 4
Speed_valid > 3.5 m/s          : 8
Speed_valid 0–0.1 m/s          : 15
Trial dengan is_valid_trial = 0: 61

=== SANITY CHECK PER SUBJECT (V3) ===

-- Afi --
  Total trial                 : 72
  Step_count_raw  < 2         : 0
  Step_count_valid < 4        : 10
  Durasi_valid <= 0 (detik)   : 0
  Distance_valid < 0.2 m      : 4
  Speed_valid > 3.5 m/s       : 4
  is_valid_trial = 0          : 30

-- Kinan --
  Total trial                 : 72
  Step_count_raw  < 2         : 0
  Step_count_valid < 4        : 4
  Durasi_valid <= 0 (detik)   : 0
  Distance_valid < 0.2 m      : 0
  Speed_valid > 3.5 m/s       : 2
  is_valid_trial = 0          : 16

-- Miftah --
  Total trial                 : 72
  Step_count_raw  < 2         : 1
  Step_count_valid < 4        : 6
  Durasi_valid <= 0