In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import librosa
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

import mlflow
import mlflow.sklearn


In [2]:
# Point MLflow at the shared mlruns folder under machineLearning/ (do not create a separate one).
# ML_DIR is resolved three directory levels above the current working directory.
ML_DIR = os.path.abspath(os.path.join(os.getcwd(), '..', '..', '..'))

# Path.as_uri() builds a well-formed file URI on every OS (forward slashes,
# percent-encoding, exactly three leading slashes). Pasting the raw OS path
# after "file:///" leaves backslashes on Windows and a fourth slash on POSIX.
mlflow.set_tracking_uri(Path(ML_DIR, "mlruns").as_uri())

# The experiment name is mandated by the assignment instructions.
mlflow.set_experiment("Baseline_Audio_Linear")

  return FileStore(store_uri, store_uri)
2026/02/21 21:19:23 INFO mlflow.tracking.fluent: Experiment with name 'Baseline_Audio_Linear' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///c:\\CAWU4GROUP3\\projects\\projectRoodio\\machineLearning\\mlruns/967775050743225553', creation_time=1771683563950, experiment_id='967775050743225553', last_update_time=1771683563950, lifecycle_stage='active', name='Baseline_Audio_Linear', tags={}>

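Log tersebut menunjukkan proses inisialisasi MLflow dengan penyimpanan berbasis berkas (FileStore) di folder `mlruns`, sesuai tracking URI `file:///` yang diatur pada sel di atas. Konfigurasi ini tidak menggunakan basis data SQLite, sehingga tidak ada proses migrasi skema tabel. Baris `return FileStore(store_uri, store_uri)` tampaknya merupakan potongan pesan peringatan yang muncul saat MLflow membuat FileStore tersebut.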

Selain itu, karena eksperimen "Baseline_Audio_Linear" belum tersedia, sistem secara otomatis membuat eksperimen baru sebagai wadah pencatatan hasil pelatihan model. Proses ini bersifat normal dan bukan merupakan kesalahan.


In [3]:
# Point at the shared data folders machineLearning/data/raw and machineLearning/data/raw2.
ML_DIR = os.path.abspath(os.path.join(os.getcwd(), *(['..'] * 3)))
SOURCE_DIRS = [os.path.join(ML_DIR, 'data', subfolder) for subfolder in ('raw', 'raw2')]

In [4]:
def extract_features(file_path):
    """Summarise one audio file as a flat numeric feature vector.

    The file is loaded with ``librosa.load(file_path, duration=30)`` and every
    descriptor is reduced to its mean, so the result has a fixed length
    regardless of clip length.

    Layout of the returned vector, in order:
        * 13 MFCC means (one per coefficient, averaged along axis 1)
        * chroma means (one per chroma row, averaged along axis 1)
        * mean spectral centroid
        * mean spectral bandwidth
        * mean spectral rolloff
        * mean zero-crossing rate
        * mean RMS energy

    Args:
        file_path: Path of the audio file to analyse.

    Returns:
        1-D ``numpy.ndarray`` containing the features listed above.
    """
    signal, rate = librosa.load(file_path, duration=30)

    # Multi-row descriptors: keep one mean per row.
    mfcc_means = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=13).mean(axis=1)
    chroma_means = librosa.feature.chroma_stft(y=signal, sr=rate).mean(axis=1)

    # Single-row descriptors: collapse each one to a single scalar.
    scalar_means = [
        np.mean(librosa.feature.spectral_centroid(y=signal, sr=rate)),
        np.mean(librosa.feature.spectral_bandwidth(y=signal, sr=rate)),
        np.mean(librosa.feature.spectral_rolloff(y=signal, sr=rate)),
        np.mean(librosa.feature.zero_crossing_rate(signal)),
        np.mean(librosa.feature.rms(y=signal)),
    ]

    return np.array([*mfcc_means, *chroma_means, *scalar_means])

In [5]:
# Build the feature matrix X and label vector y from every class sub-folder
# found in SOURCE_DIRS. The sub-folder name is used as the class label.
AUDIO_EXTENSIONS = (".mp3", ".wav")

feature_rows = []
label_rows = []

print("Extracting features...")

for data_path in SOURCE_DIRS:
    print(f"Scanning: {data_path}")
    if not os.path.exists(data_path):
        print(f"\u26a0\ufe0f Folder tidak ditemukan: {data_path}")
        continue
    # os.listdir returns entries in arbitrary order. Sorting keeps the sample
    # order, and therefore the seeded CV folds built later, identical across
    # machines and reruns.
    for label in sorted(os.listdir(data_path)):
        folder_path = os.path.join(data_path, label)
        if not os.path.isdir(folder_path):
            continue
        for file_name in sorted(os.listdir(folder_path)):
            if file_name.lower().endswith(AUDIO_EXTENSIONS):
                feature_rows.append(extract_features(os.path.join(folder_path, file_name)))
                label_rows.append(label)

# Fail here with a clear message instead of deep inside scikit-learn later on.
if not feature_rows:
    raise RuntimeError(
        f"No .mp3/.wav files found under any of: {SOURCE_DIRS}"
    )

X = np.array(feature_rows)
y = np.array(label_rows)

print("Feature shape:", X.shape)


Extracting features...
Scanning: c:\CAWU4GROUP3\projects\projectRoodio\machineLearning\data\raw
Scanning: c:\CAWU4GROUP3\projects\projectRoodio\machineLearning\data\raw2
Feature shape: (200, 30)


Output tersebut menunjukkan bahwa proses ekstraksi fitur audio berhasil menghasilkan matriks data berukuran (200, 30), yang berarti terdapat 200 sampel lagu dengan 30 fitur numerik untuk setiap sampel. Matriks ini digunakan sebagai representasi input dalam proses pelatihan dan evaluasi model klasifikasi.

In [6]:
# Baseline classifiers to compare. Each key is later used as the MLflow run name.
models = dict(
    SVM_RBF=SVC(kernel="rbf", C=10, gamma="scale"),
    KNN=KNeighborsClassifier(n_neighbors=7),
    Logistic_Regression=LogisticRegression(max_iter=5000),
)

In [7]:
# 5-fold stratified split, shuffled with a fixed seed so the folds are reproducible.
cv_settings = {"n_splits": 5, "shuffle": True, "random_state": 42}
skf = StratifiedKFold(**cv_settings)


In [8]:
# Train, evaluate and log every baseline model in its own MLflow run.

# Hyperparameters to record per model. The values are read from the estimator
# itself so the MLflow record cannot drift from the configuration in `models`.
LOGGED_PARAMS = {
    "SVM_RBF": ("kernel", "C", "gamma"),
    "KNN": ("n_neighbors",),
    "Logistic_Regression": ("max_iter",),
}

# Fixed label order shared by the confusion matrix and its axis ticks.
class_labels = np.unique(y)

for model_name, model in models.items():

    with mlflow.start_run(run_name=model_name):

        pipeline = Pipeline([
            ("scaler", StandardScaler()),
            ("model", model)
        ])

        # Out-of-fold predictions: each sample is predicted by a model that
        # never saw it during training.
        y_pred = cross_val_predict(pipeline, X, y, cv=skf)

        # Per-fold accuracy from those same predictions. skf has a fixed
        # random_state, so split() reproduces the folds used above. This
        # avoids the second full round of fitting cross_val_score would run.
        cv_scores = [
            accuracy_score(y[test_idx], y_pred[test_idx])
            for _, test_idx in skf.split(X, y)
        ]

        # Pooled metrics over all out-of-fold predictions.
        acc = accuracy_score(y, y_pred)
        precision = precision_score(y, y_pred, average="macro")
        recall = recall_score(y, y_pred, average="macro")
        f1 = f1_score(y, y_pred, average="macro")

        # =========================
        # LOG PARAMETERS
        # =========================
        mlflow.log_param("model_name", model_name)

        estimator_params = model.get_params()
        for param_name in LOGGED_PARAMS.get(model_name, ()):
            mlflow.log_param(param_name, estimator_params[param_name])

        # =========================
        # LOG METRICS
        # =========================
        mlflow.log_metric("cv_accuracy_mean", np.mean(cv_scores))
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("precision_macro", precision)
        mlflow.log_metric("recall_macro", recall)
        mlflow.log_metric("f1_macro", f1)

        # =========================
        # CONFUSION MATRIX
        # =========================
        cm = confusion_matrix(y, y_pred, labels=class_labels)

        fig, ax = plt.subplots(figsize=(6, 5))
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                    xticklabels=class_labels,
                    yticklabels=class_labels,
                    ax=ax)
        ax.set_title(f"Confusion Matrix - {model_name}")
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")

        cm_filename = f"confusion_matrix_{model_name}.png"
        fig.savefig(cm_filename)
        plt.close(fig)  # release the figure so the loop does not accumulate them

        # Log artifact
        mlflow.log_artifact(cm_filename)

        # cross_val_predict only fits clones, so `pipeline` itself is still
        # unfitted at this point. Fit it on the full dataset so the model
        # stored in MLflow can actually be loaded and used for prediction.
        pipeline.fit(X, y)

        # Log model
        mlflow.sklearn.log_model(pipeline, model_name)

        print(f"{model_name} done. Accuracy: {acc:.4f}")

print("Training selesai.")



SVM_RBF done. Accuracy: 0.3350




KNN done. Accuracy: 0.3400




Logistic_Regression done. Accuracy: 0.3450
Training selesai.


Log tersebut menunjukkan bahwa proses pelatihan dan evaluasi tiga model klasifikasi (SVM RBF, KNN, dan Logistic Regression) telah berhasil diselesaikan. Nilai akurasi masing-masing model adalah 33,5% (SVM), 34,0% (KNN), dan 34,5% (Logistic Regression), sehingga Logistic Regression menunjukkan performa terbaik pada eksperimen baseline ini, meskipun selisih antar model sangat kecil.

Pesan warning terkait artifact_path mengindikasikan bahwa parameter tersebut telah didepresiasi pada versi MLflow terbaru dan disarankan menggunakan parameter name. Namun, peringatan ini tidak memengaruhi hasil pelatihan maupun proses logging model.