# Preprocessing InsectSound (ARFF time-series)
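Gunakan berkas `.arff` dari UEA/UCR, ekstrak fitur statistik + fitur FFT ringkas (termasuk bin magnitudo FFT ternormalisasi), lalu simpan sebagai CSV untuk RandomForest. Label disimpan apa adanya sebagai string (tanpa label encoding).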

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from scipy.stats import skew, kurtosis
from scipy.fft import rfft, rfftfreq
from scipy.io import arff

In [2]:
# Input: location of the InsectSound .arff files from the UCR/UEA archive.
base_dir = Path("./InsectSound")
train_path = base_dir.joinpath("InsectSound_TRAIN.arff")
test_path = base_dir.joinpath("InsectSound_TEST.arff")

# Output directory; created up front if it does not exist yet.
out_dir = Path("./processed")
out_dir.mkdir(exist_ok=True)

# Sampling rate in Hz, per the dataset description.
fs = 6000
# Number of leading (raw) FFT magnitude bins kept as features.
fft_keep = 256

In [3]:
def load_arff_file(path: Path, max_rows: int | None = None):
    """Parse file .arff via scipy.io.arff into list of arrays and labels."""
    data_raw, meta = arff.loadarff(path)
    df = pd.DataFrame(data_raw)
    # Label diasumsikan kolom terakhir
    label_col = df.columns[-1]
    # Decode bytes ke string jika perlu
    if pd.api.types.is_object_dtype(df[label_col]):
        df[label_col] = df[label_col].apply(lambda x: x.decode("utf-8") if isinstance(x, (bytes, bytearray)) else str(x))
    labels = df[label_col].tolist()
    feature_cols = df.columns[:-1]
    if max_rows is not None:
        df = df.iloc[:max_rows]
        labels = labels[:max_rows]
    data = [np.array(row[feature_cols], dtype=np.float32) for _, row in df.iterrows()]
    return data, labels

def summarize_series(series: np.ndarray, fs_hz: int, fft_keep_bins: int) -> dict:
    """Summarize one time series as time-domain statistics plus FFT features.

    Args:
        series: 1-D sequence of samples (any array-like convertible to float64).
        fs_hz: Sampling rate used to build the frequency axis of the spectrum.
        fft_keep_bins: Maximum number of leading normalized FFT magnitude bins
            to emit as ``fft_bin_<i>`` features. Fewer are emitted when the
            spectrum is shorter; none when this is zero or negative.

    Returns:
        A dict, in insertion order: the time-domain statistics (``mean`` ...
        ``len``), the compact spectral descriptors (``fft_dom_freq`` ...
        ``fft_high_energy``) and finally the ``fft_bin_<i>`` entries. The
        ``*_energy`` band features are shares of the summed FFT magnitude
        (not squared magnitude) that fall in each band.

    Raises:
        ValueError: If ``series`` is empty.
    """
    x = np.asarray(series, dtype=np.float64)
    n = len(x)
    if n == 0:
        # min/max/FFT are undefined for an empty series; fail with a clear message.
        raise ValueError("series must contain at least one sample")

    # Linear trend over the sample index; a single sample has no trend.
    slope = np.polyfit(np.arange(n), x, 1)[0] if n > 1 else 0.0
    feats = {
        "mean": float(np.mean(x)),
        "std": float(np.std(x)),
        "min": float(np.min(x)),
        "max": float(np.max(x)),
        "q25": float(np.quantile(x, 0.25)),
        "q50": float(np.quantile(x, 0.50)),
        "q75": float(np.quantile(x, 0.75)),
        "energy": float(np.mean(x ** 2)),
        "skew": float(skew(x)),
        "kurtosis": float(kurtosis(x)),
        "slope": float(slope),
        "len": n,
    }

    # Compact spectrum description based on the one-sided FFT magnitude.
    freqs = rfftfreq(n, d=1.0 / fs_hz)
    mag = np.abs(rfft(x))
    mag_sum = np.sum(mag) + 1e-9  # epsilon guards against division by zero
    dom_idx = int(np.argmax(mag))
    # Computed once and reused for the bandwidth (spread around the centroid).
    centroid = np.sum(freqs * mag) / mag_sum
    bandwidth = np.sqrt(np.sum(((freqs - centroid) ** 2) * mag) / mag_sum)
    feats.update({
        "fft_dom_freq": float(freqs[dom_idx]),
        "fft_dom_mag": float(mag[dom_idx]),
        "fft_centroid": float(centroid),
        "fft_bandwidth": float(bandwidth),
        "fft_low_energy": float(np.sum(mag[freqs <= 500]) / mag_sum),
        "fft_mid_energy": float(np.sum(mag[(freqs > 500) & (freqs <= 1500)]) / mag_sum),
        "fft_high_energy": float(np.sum(mag[freqs > 1500]) / mag_sum),
    })

    # Leading raw FFT bins, normalized by the total magnitude.
    mag_norm = mag / mag_sum
    keep = max(0, min(fft_keep_bins, len(mag_norm)))
    feats.update(
        (f"fft_bin_{i}", float(value)) for i, value in enumerate(mag_norm[:keep])
    )
    return feats

def to_feature_df(data: list, labels: list[str], fs_hz: int, fft_keep_bins: int) -> pd.DataFrame:
    """Build a feature table with one row per (series, label) pair.

    Each series is summarized with ``summarize_series(series, fs_hz,
    fft_keep_bins)`` and its label is stored under the ``"label"`` key of the
    resulting record. ``data`` and ``labels`` are paired positionally with
    ``zip``, so any surplus items in the longer one are ignored.
    """
    records = [
        {**summarize_series(signal, fs_hz, fft_keep_bins), "label": tag}
        for signal, tag in zip(data, labels)
    ]
    return pd.DataFrame(records)

In [4]:
# Load the train/test .arff files, then turn every series into summary
# statistics + FFT features (including the raw FFT bins).
(train_data, train_labels), (test_data, test_labels) = (
    load_arff_file(split_path) for split_path in (train_path, test_path)
)

df_train, df_test = (
    to_feature_df(series_list, label_list, fs, fft_keep)
    for series_list, label_list in (
        (train_data, train_labels),
        (test_data, test_labels),
    )
)
# Combined table: train rows first, then test rows, with a fresh index.
df_all = pd.concat([df_train, df_test], ignore_index=True)
df_train.head()

Unnamed: 0,mean,std,min,max,q25,q50,q75,energy,skew,kurtosis,...,fft_bin_247,fft_bin_248,fft_bin_249,fft_bin_250,fft_bin_251,fft_bin_252,fft_bin_253,fft_bin_254,fft_bin_255,label
0,9e-06,0.096341,-0.471312,0.501054,-0.00264,-9.23188e-07,0.002253,0.009282,0.013708,6.972951,...,0.000578,0.001087,0.001701,0.002359,0.002845,0.002856,0.002495,0.002599,0.003854,Aedes_female
1,-0.000461,0.057711,-0.219536,0.305695,-0.005912,-2.88835e-06,0.005484,0.003331,1.371759,9.330806,...,0.001128,0.000925,0.000681,0.000409,0.000221,0.000115,5.8e-05,2.3e-05,2e-06,Aedes_female
2,-0.00089,0.149547,-0.485221,0.356049,-0.069214,0.000762576,0.105936,0.022365,-0.540432,0.322946,...,5.4e-05,4.5e-05,2.7e-05,5.9e-05,2.6e-05,4.5e-05,0.000135,0.000162,0.000212,Aedes_female
3,-0.000669,0.084268,-0.419142,0.388549,-0.016167,4.804084e-06,0.016174,0.007102,-0.307301,7.231631,...,0.000152,0.000284,0.000395,0.000493,0.000472,0.000529,0.000637,0.000685,0.000641,Aedes_female
4,-8.9e-05,0.058881,-0.29334,0.313444,-0.002565,6.179822e-05,0.009615,0.003467,-0.910162,9.612633,...,0.000569,0.00064,0.000682,0.00067,0.000599,0.000513,0.000437,0.000388,0.000376,Aedes_female


In [5]:
# Report the split sizes, then preview the first five feature names and the
# number of columns other than the single "label" column.
print(
    "Train rows:", len(df_train),
    "Test rows:", len(df_test),
)
print(
    "Feature cols:",
    [name for name in df_train.columns if name != "label"][:5],
    "... total",
    df_train.shape[1] - 1,
)

Train rows: 25000 Test rows: 25000
Feature cols: ['mean', 'std', 'min', 'max', 'q25'] ... total 275


In [None]:
# Save the CSV files without label encoding (the label column is written as-is).
feature_cols = [c for c in df_all.columns if c != "label"]

# Same order as before: combined table, then the train and test splits.
for file_name, frame in (
    ("insectsound_features_full.csv", df_all),
    ("insectsound_train.csv", df_train),
    ("insectsound_test.csv", df_test),
):
    frame.to_csv(out_dir / file_name, index=False)

print("Selesai simpan ke", out_dir.resolve())
print("Total fitur:", len(feature_cols))

NameError: name 'le' is not defined