# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Train, tune, persist and evaluate a random-forest classifier.
#
# Pipeline: load the reduced-feature CSV -> split into train/test ->
# grid-search the hyperparameters with 3-fold CV -> save the best model ->
# report accuracy on the held-out test set.

# Load the data; the 'Label' column is the target, all other columns are features.
data = pd.read_csv('../data/processed/reduced_features_data.csv')

X = data.drop('Label', axis=1)
y = data['Label']

# Hold out 20% of the rows for testing (fixed seed so the split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base random-forest model whose hyperparameters are tuned below.
clf = RandomForestClassifier(random_state=42)

# Hyperparameter grid explored by the grid search (3 * 3 * 3 = 27 candidates).
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(clf, param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best model. GridSearchCV uses refit=True by default, so best_estimator_ has
# already been retrained on the full training set with the best parameters;
# fitting it again would only repeat that work and yield the same model.
best_clf = grid_search.best_estimator_

# Save the model. Create the target directory first so that a missing
# '../models' folder does not discard the result of the whole search.
import os
import joblib
model_path = '../models/random_forest_model.pkl'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(best_clf, model_path)

# Evaluate the model on the held-out test set.
y_pred = best_clf.predict(X_test)

# Compute and report the accuracy.
accuracy = accuracy_score(y_test, y_pred)
print(f"模型准确率: {accuracy:.4f}")
