In [1]:
import os
import glob
import pandas as pd
import numpy as np

# Root folder of the dataset (absolute Windows path; change it for your environment).
BASE_DIR     = r"E:\0.TA_Teguh\dataset2"
# Folder that holds the clustering results, with one sub-folder per subject.
CLUSTER_DIR  = os.path.join(BASE_DIR, "hasil_clustering")
SUBJECTS     = ["Afi", "Tsania", "Tsamara"]  # adjust if your subject folders differ

def list_cluster_files_for_subject(subject_name: str, cluster_dir: str = None):
    r"""
    Return the full paths of every clustering CSV that belongs to one subject.

    Files are looked up as ``<cluster_dir>/<subject_name>/clustering_*.csv``, e.g.
    E:\0.TA_Teguh\dataset2\hasil_clustering\Afi\clustering_Jalan1.csv.
    (The docstring is a raw string so the backslashes in that example path are
    not interpreted as escape sequences.)

    Parameters
    ----------
    subject_name : str
        Name of the subject's sub-folder.
    cluster_dir : str, optional
        Root folder holding one sub-folder per subject. Defaults to the
        module-level ``CLUSTER_DIR``.

    Returns
    -------
    list of str
        Matching paths in plain lexicographic order (so ``Jalan10`` sorts
        before ``Jalan2``); an empty list when nothing matches.
    """
    root = CLUSTER_DIR if cluster_dir is None else cluster_dir
    # Escape the directory part so glob metacharacters in a folder name
    # ("[", "*", "?") are matched literally instead of being read as a pattern.
    subject_dir = glob.escape(os.path.join(root, subject_name))
    return sorted(glob.glob(os.path.join(subject_dir, "clustering_*.csv")))

# Quick sanity check: report how many clustering files each subject has and
# preview the first five file names.
for subj in SUBJECTS:
    files = list_cluster_files_for_subject(subj)
    print(f"Subjek {subj}: {len(files)} file clustering ditemukan")
    preview = [os.path.basename(path) for path in files[:5]]
    for name in preview:
        print("  ", name)
    if len(files) > 5:
        # Signal that the listing above was truncated.
        print("  ...")
    print()


Subjek Afi: 72 file clustering ditemukan
   clustering_Jalan1.csv
   clustering_Jalan10.csv
   clustering_Jalan11.csv
   clustering_Jalan12.csv
   clustering_Jalan13.csv
  ...

Subjek Tsania: 72 file clustering ditemukan
   clustering_Jalan1.csv
   clustering_Jalan10.csv
   clustering_Jalan11.csv
   clustering_Jalan12.csv
   clustering_Jalan13.csv
  ...

Subjek Tsamara: 72 file clustering ditemukan
   clustering_Jalan1.csv
   clustering_Jalan10.csv
   clustering_Jalan11.csv
   clustering_Jalan12.csv
   clustering_Jalan13.csv
  ...



In [2]:
def evaluate_cluster_file(cluster_file_path: str, subject_name: str = None):
    """
    Read one clustering result file (clustering_JalanX.csv) and compute
    DBSCAN quality metrics for it.

    A row is "noise" when its ``cluster_id`` equals -1; every other row
    (including one with a missing ``cluster_id``, since NaN != -1) counts as
    non-noise. Rows with a missing ``frame`` value belong to no frame and are
    ignored.

    Parameters
    ----------
    cluster_file_path : str
        Path to a CSV with at least the columns ``frame`` and ``cluster_id``.
    subject_name : str, optional
        Label copied verbatim into the result under ``"subject"``.

    Returns
    -------
    dict
        - subject, file, path            : identification of the evaluated file
        - n_frames                       : number of unique frames
        - frames_with_non_noise          : frames having at least one non-noise row
        - coverage                       : frames_with_non_noise / n_frames
        - avg_clusters_per_frame         : mean number of distinct non-noise
                                           clusters per frame (frames without
                                           any count as 0)
        - frac_frames_multi_cluster      : share of frames with more than one
                                           non-noise cluster
        - avg_non_noise_points_per_frame : mean number of non-noise rows per
                                           frame (frames without any count as 0)
        All ratios are 0.0 when the file contains no frames.

    Raises
    ------
    ValueError
        If the file lacks the ``frame`` or ``cluster_id`` column.
    """
    df = pd.read_csv(cluster_file_path)

    if "frame" not in df.columns or "cluster_id" not in df.columns:
        raise ValueError(f"File {cluster_file_path} tidak punya kolom 'frame' atau 'cluster_id'.")

    # nunique() skips NaN, matching the number of groups groupby("frame") forms.
    n_frames = int(df["frame"].nunique())

    frames_with_non_noise = 0
    frames_multi_cluster = 0
    total_clusters = 0
    total_non_noise_points = 0

    non_noise = df[df["cluster_id"] != -1]
    if n_frames > 0 and not non_noise.empty:
        # One vectorized aggregation instead of a Python loop over every frame.
        labels_by_frame = non_noise.groupby("frame")["cluster_id"]
        # dropna=False keeps a missing label counted as one distinct value,
        # the same way Series.unique() reports it.
        clusters_per_frame = labels_by_frame.nunique(dropna=False)

        # Frames that have only noise rows never appear in this grouping, so
        # they contribute 0 to every total below.
        frames_with_non_noise = int(len(clusters_per_frame))
        frames_multi_cluster = int((clusters_per_frame > 1).sum())
        total_clusters = int(clusters_per_frame.sum())
        total_non_noise_points = int(labels_by_frame.size().sum())

    if n_frames > 0:
        coverage = frames_with_non_noise / n_frames
        avg_clusters_per_frame = total_clusters / n_frames
        avg_non_noise_points = total_non_noise_points / n_frames
        frac_frames_multi_cluster = frames_multi_cluster / n_frames
    else:
        coverage = 0.0
        avg_clusters_per_frame = 0.0
        avg_non_noise_points = 0.0
        frac_frames_multi_cluster = 0.0

    return {
        "subject": subject_name,
        "file": os.path.basename(cluster_file_path),
        "path": cluster_file_path,
        "n_frames": n_frames,
        "frames_with_non_noise": frames_with_non_noise,
        "coverage": coverage,
        "avg_clusters_per_frame": avg_clusters_per_frame,
        "frac_frames_multi_cluster": frac_frames_multi_cluster,
        "avg_non_noise_points_per_frame": avg_non_noise_points,
    }


In [3]:
def evaluate_all_clusters(subjects=None):
    """
    Evaluate every clustering file of every requested subject.

    Parameters
    ----------
    subjects : iterable of str, optional
        Subject names to process; the module-level ``SUBJECTS`` is used when
        this is None.

    Returns
    -------
    pandas.DataFrame or None
        One row per evaluated file (the dicts produced by
        ``evaluate_cluster_file``), or None when no file was evaluated at all.
    """
    subject_names = SUBJECTS if subjects is None else subjects

    rows = []
    for name in subject_names:
        paths = list_cluster_files_for_subject(name)
        if not paths:
            # Nothing to evaluate for this subject; warn and move on.
            print(f"[WARN] Tidak ada file clustering untuk subjek {name}")
            continue

        print(f"Evaluasi subjek {name} ({len(paths)} file)...")
        rows.extend(evaluate_cluster_file(path, subject_name=name) for path in paths)

    if rows:
        return pd.DataFrame(rows)

    print("Tidak ada hasil evaluasi. Cek kembali folder hasil_clustering.")
    return None

# Run the evaluation. evaluate_all_clusters() returns None when no clustering
# file was found, so the preview is shown only when a table exists; calling
# .head() on None would raise AttributeError.
df_eval = evaluate_all_clusters()
df_eval.head() if df_eval is not None else None


Evaluasi subjek Afi (72 file)...
Evaluasi subjek Tsania (72 file)...
Evaluasi subjek Tsamara (72 file)...


Unnamed: 0,subject,file,path,n_frames,frames_with_non_noise,coverage,avg_clusters_per_frame,frac_frames_multi_cluster,avg_non_noise_points_per_frame
0,Afi,clustering_Jalan1.csv,E:\0.TA_Teguh\dataset2\hasil_clustering\Afi\cl...,99,82,0.828283,0.838384,0.010101,17.111111
1,Afi,clustering_Jalan10.csv,E:\0.TA_Teguh\dataset2\hasil_clustering\Afi\cl...,94,83,0.882979,0.914894,0.031915,21.404255
2,Afi,clustering_Jalan11.csv,E:\0.TA_Teguh\dataset2\hasil_clustering\Afi\cl...,96,87,0.90625,0.90625,0.0,17.708333
3,Afi,clustering_Jalan12.csv,E:\0.TA_Teguh\dataset2\hasil_clustering\Afi\cl...,77,68,0.883117,0.896104,0.012987,26.675325
4,Afi,clustering_Jalan13.csv,E:\0.TA_Teguh\dataset2\hasil_clustering\Afi\cl...,99,87,0.878788,0.888889,0.010101,21.79798


In [4]:
# Metric columns summarised both globally and per subject.
summary_metrics = [
    "coverage",
    "avg_clusters_per_frame",
    "frac_frames_multi_cluster",
    "avg_non_noise_points_per_frame",
]

if df_eval is not None:
    # Descriptive statistics over all evaluated files.
    print("Ringkasan GLOBAL (semua subjek):")
    print(df_eval[summary_metrics].describe())

    # Per-subject mean of every metric, plus the number of files per subject.
    print("\nRingkasan per SUBJEK:")
    aggregations = {column: "mean" for column in summary_metrics}
    aggregations["file"] = "count"
    per_subject = df_eval.groupby("subject").agg(aggregations)
    summary_by_subj = per_subject.rename(columns={"file": "n_files"})
    print(summary_by_subj)
else:
    print("df_eval kosong - cek dulu evaluasi sebelumnya.")


Ringkasan GLOBAL (semua subjek):
         coverage  avg_clusters_per_frame  frac_frames_multi_cluster  \
count  216.000000              216.000000                 216.000000   
mean     0.880348                0.905802                   0.023400   
std      0.043979                0.083138                   0.053641   
min      0.658333                0.761905                   0.000000   
25%      0.856101                0.874731                   0.000000   
50%      0.883436                0.901182                   0.012270   
75%      0.908020                0.926358                   0.028302   
max      0.976744                1.672566                   0.575221   

       avg_non_noise_points_per_frame  
count                      216.000000  
mean                        25.226987  
std                          8.297351  
min                         11.472527  
25%                         20.320119  
50%                         23.856733  
75%                         28.990716 

In [5]:
# Trials below this coverage are listed as low-coverage.
COVERAGE_THRESHOLD = 0.85
# Trials whose share of multi-cluster frames exceeds this value are listed.
MULTI_CLUSTER_THRESHOLD = 0.10


def _show_flagged_trials(flagged, empty_message):
    """Display the report columns of `flagged`, or print `empty_message` when it has no rows."""
    if flagged.empty:
        print(empty_message)
        return
    display(flagged[["subject", "file", "coverage", "avg_clusters_per_frame",
                     "frac_frames_multi_cluster", "avg_non_noise_points_per_frame"]])


if df_eval is not None:
    # Low-coverage trials, worst first.
    print("\nTrial dengan coverage RENDAH (< {:.2f}):".format(COVERAGE_THRESHOLD))
    low_coverage_mask = df_eval["coverage"] < COVERAGE_THRESHOLD
    bad_coverage = df_eval[low_coverage_mask].sort_values("coverage")
    _show_flagged_trials(bad_coverage, "  Tidak ada trial dengan coverage di bawah threshold.")

    # Trials with many multi-cluster frames, highest fraction first.
    print("\nTrial dengan banyak frame multi-cluster (> {:.2f}):".format(MULTI_CLUSTER_THRESHOLD))
    multi_cluster_mask = df_eval["frac_frames_multi_cluster"] > MULTI_CLUSTER_THRESHOLD
    bad_multi = df_eval[multi_cluster_mask].sort_values("frac_frames_multi_cluster", ascending=False)
    _show_flagged_trials(bad_multi, "  Tidak ada trial dengan multi-cluster berlebihan.")



Trial dengan coverage RENDAH (< 0.85):


Unnamed: 0,subject,file,coverage,avg_clusters_per_frame,frac_frames_multi_cluster,avg_non_noise_points_per_frame
54,Afi,clustering_Jalan59.csv,0.658333,0.833333,0.15,20.441667
175,Tsamara,clustering_Jalan38.csv,0.752381,0.761905,0.009524,15.790476
14,Afi,clustering_Jalan22.csv,0.762376,0.792079,0.029703,32.356436
72,Tsania,clustering_Jalan1.csv,0.780488,0.829268,0.04878,24.560976
23,Afi,clustering_Jalan30.csv,0.7875,0.7875,0.0,14.175
156,Tsamara,clustering_Jalan20.csv,0.790076,0.984733,0.175573,31.332061
128,Tsania,clustering_Jalan60.csv,0.795082,0.827869,0.032787,21.860656
24,Afi,clustering_Jalan31.csv,0.797101,0.797101,0.0,21.144928
20,Afi,clustering_Jalan28.csv,0.8,0.8,0.0,12.969231
200,Tsamara,clustering_Jalan60.csv,0.8,0.8,0.0,13.968



Trial dengan banyak frame multi-cluster (> 0.10):


Unnamed: 0,subject,file,coverage,avg_clusters_per_frame,frac_frames_multi_cluster,avg_non_noise_points_per_frame
74,Tsania,clustering_Jalan11.csv,0.955752,1.672566,0.575221,67.0
25,Afi,clustering_Jalan32.csv,0.961039,1.571429,0.467532,62.883117
156,Tsamara,clustering_Jalan20.csv,0.790076,0.984733,0.175573,31.332061
54,Afi,clustering_Jalan59.csv,0.658333,0.833333,0.15,20.441667


In [6]:
# Change these to a subject/file pair that is present in df_eval.
subj = "Afi"
fname = "clustering_Jalan7.csv"

# Select the matching row; .iloc[0] raises IndexError when nothing matches.
is_selected = (df_eval["subject"] == subj) & (df_eval["file"] == fname)
row = df_eval[is_selected].iloc[0]
print(row)

# Reload the trial's own DataFrame from the stored path for closer inspection.
df_trial = pd.read_csv(row["path"])
frame_ids = df_trial["frame"]
print("Total frame:", frame_ids.nunique())
print("Frame range:", frame_ids.min(), "sampai", frame_ids.max())


subject                                                                         Afi
file                                                          clustering_Jalan7.csv
path                              E:\0.TA_Teguh\dataset2\hasil_clustering\Afi\cl...
n_frames                                                                         95
frames_with_non_noise                                                            86
coverage                                                                   0.905263
avg_clusters_per_frame                                                     0.905263
frac_frames_multi_cluster                                                       0.0
avg_non_noise_points_per_frame                                                 25.0
Name: 66, dtype: object
Total frame: 95
Frame range: 1 sampai 95
