# Modeling

In [71]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import pandas as pd
from scipy.io import arff
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint


In [None]:
data, meta = arff.loadarff('./InsectSound/InsectSound.arff')


In [None]:
df = pd.DataFrame(data)

In [None]:
df['target'] = df['target'].str.decode('utf-8')

In [None]:
# Data
X = df.drop('target', axis=1)
Y = df['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)


In [None]:
# Data
X = df.drop('target', axis=1)
Y = df['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('rf', RandomForestClassifier())
])

# Parameter untuk Random Search
param_dist = {
    'pca__n_components': [20, 30, 40, 50, 60, 80, 90, 100, 120],             # Sesuaikan jumlah fitur
    'rf__n_estimators': [100, 200, 300, 400, 500],
    'rf__max_depth': [5, 10, 15, 20, 25, None],
    'rf__min_samples_split': randint(2, 10),
    'rf__min_samples_leaf': randint(1, 5),
    'rf__max_features': ['sqrt', 'log2', None],
    'rf__class_weight': [None, 'balanced']
}

# Randomized Search
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=40,               # 30â€“50 cukup bagus
    cv=3,                    # 3-fold cross validation
    verbose=2,
    n_jobs=-1,               # paralel
    random_state=42
)

# Train
random_search.fit(X_train, y_train)

# Best Model
best_model = random_search.best_estimator_
print("Best params:", random_search.best_params_)

# Evaluate
pred = best_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, pred))

Fitting 3 folds for each of 40 candidates, totalling 120 fits
Best params: {'pca__n_components': 100, 'rf__class_weight': 'balanced', 'rf__max_depth': None, 'rf__max_features': 'log2', 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 9, 'rf__n_estimators': 400}
Accuracy: 0.5662
