In [1]:
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Read the preprocessed dataset (path is relative to the notebook's working directory).
data = pd.read_csv('../data/processed/processed_data.csv')

# The 'target' column is the label; every remaining column is used as a feature.
y = data['target']
X = data.drop(columns='target')

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Reference point: an untuned random forest (100 trees, max_depth=None) fitted on the
# training split. `fit` returns the estimator itself, so the call can be chained.
baseline_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    random_state=42,
).fit(X_train, y_train)

# Score the baseline on the held-out test split.
baseline_predictions = baseline_model.predict(X_test)
baseline_accuracy = accuracy_score(y_test, baseline_predictions)
print(f'Baseline model accuracy: {baseline_accuracy:.4f}')

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold CV accuracy of a random forest.

    Reads ``X_train`` and ``y_train`` from the notebook's global scope, so the
    held-out test split is never touched during the search.

    Args:
        trial: Optuna trial object used to sample the hyperparameters.

    Returns:
        Mean accuracy over the 5 cross-validation folds (the value to maximize).
    """
    # Search space for the key hyperparameters. Dict order matches the order in
    # which the values are requested from the trial.
    sampled_params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500, step=50),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
    }

    # Build the candidate model; seed fixed, all CPU cores used for fitting.
    model = RandomForestClassifier(random_state=42, n_jobs=-1, **sampled_params)

    # Cross-validate on the training split only, for a more stable estimate
    # than a single train/validation split.
    fold_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    return fold_scores.mean()

In [None]:
# Seed the sampler so the hyperparameter search is reproducible under
# Restart & Run All. Without an explicit seed, Optuna's default sampler draws
# different trials on every run, so best_params and the reported accuracy drift.
# Reproducibility holds here because study.optimize runs trials sequentially
# (its default n_jobs is 1).
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50, show_progress_bar=True)

print(f"Best Accuracy (CV Score): {study.best_value:.4f}")
print(f"Best Parameters: {study.best_params}")

# Refit a model with the best hyperparameters on the full training split.
# n_jobs=-1 matches the configuration used inside the objective and only affects
# fitting speed.
best_params = study.best_params
optimized_model = RandomForestClassifier(**best_params, random_state=42, n_jobs=-1)
optimized_model.fit(X_train, y_train)

# Predict on the held-out test split (X_test) and compute accuracy.
optimized_predictions = optimized_model.predict(X_test)
optimized_accuracy = accuracy_score(y_test, optimized_predictions)

# Side-by-side comparison; baseline_accuracy comes from the baseline cell.
print(f"1. Accuracy (Baseline): {baseline_accuracy:.4f}")
print(f"2. Accuracy (Optuna): {optimized_accuracy:.4f}")