In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split, StratifiedKFold
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                            f1_score, roc_auc_score, confusion_matrix, 
                            classification_report, roc_curve, precision_recall_curve)
from scipy.stats import randint, uniform
import matplotlib.pyplot as plt
import seaborn as sns

# Hold out 20% of the samples as a test set. The split is stratified on the
# labels, and the sample identifiers are split alongside the features so that
# each prediction can be traced back to its ID afterwards.
(X_train, X_test,
 y_train, y_test,
 id_train, id_test) = train_test_split(
    X,
    y,
    ids,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 3. Handle class imbalance with SMOTE oversampling.
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

print("\n原始训练集类别分布:", np.bincount(y_train))
smote = SMOTE(random_state=42)
# This resampled copy is only used to report the class distribution (and is
# kept for any later code that reads X_train_res / y_train_res). It must NOT be
# fed to cross-validation: synthetic points interpolated from a sample would
# land in both the training and the validation folds and inflate the CV AUC.
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
print("过采样后训练集类别分布:", np.bincount(y_train_res))

# 4. Base random forest, wrapped in an imblearn pipeline so that SMOTE is
#    re-fit on the training part of every CV fold and never touches the
#    validation part. Samplers are skipped at predict time.
rf = RandomForestClassifier(random_state=42, class_weight='balanced')
search_pipeline = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('rf', rf),
])

# 5. Parameter distributions for the randomized search.
#    `max_samples` is only valid together with bootstrap=True; combining it
#    with bootstrap=False makes the forest raise and the candidate is wasted.
#    The space is therefore split into two dicts (one is picked uniformly per
#    candidate, so bootstrap is still True/False with equal probability).
rf_common_space = {
    'rf__n_estimators': randint(50, 500),
    'rf__max_depth': [None] + list(np.arange(5, 30)),
    'rf__min_samples_split': randint(2, 20),
    'rf__min_samples_leaf': randint(1, 20),
    'rf__max_features': ['sqrt', 'log2', None] + list(np.linspace(0.1, 1.0, 10)),
    'rf__criterion': ['gini', 'entropy'],
}
param_dist = [
    {
        **rf_common_space,
        'rf__bootstrap': [True],
        'rf__max_samples': [None] + list(np.linspace(0.1, 1.0, 10)),
    },
    {
        **rf_common_space,
        'rf__bootstrap': [False],
        'rf__max_samples': [None],
    },
]

# 6. Stratified cross-validation.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 7. Randomized search, scored by ROC AUC.
random_search = RandomizedSearchCV(
    estimator=search_pipeline,
    param_distributions=param_dist,
    n_iter=100,
    cv=cv,
    scoring='roc_auc',  # threshold-independent metric, suitable for imbalanced data
    random_state=42,
    n_jobs=-1,
    verbose=1
)

# 8. Run the search on the ORIGINAL training data; oversampling happens inside
#    the pipeline. The final refit applies SMOTE to the full training set, so
#    the returned forest is trained on the same kind of data as before.
print("\n开始随机搜索调参...")
random_search.fit(X_train, y_train)

# 9. Report the best parameters (pipeline step prefix stripped so the keys are
#    plain RandomForestClassifier argument names).
print("\n最佳参数组合:")
best_rf_params = {
    name.split('__', 1)[1]: value
    for name, value in random_search.best_params_.items()
}
print(best_rf_params)
print(f"最佳交叉验证AUC: {random_search.best_score_:.4f}")

# 10. Predict with the best model. `best_rf` is the fitted forest itself (not
#     the pipeline), so attributes such as feature_importances_ stay available.
best_rf = random_search.best_estimator_.named_steps['rf']
y_pred = best_rf.predict(X_test)
y_pred_proba = best_rf.predict_proba(X_test)[:, 1]  # probability of the positive class

# 11. Evaluate on the held-out test set.
# Each row is (printed label, metric function, predictions it is scored on):
# the threshold metrics use the hard labels, ROC AUC uses the probabilities.
print("\n测试集性能指标:")
test_metric_rows = (
    ("准确率", accuracy_score, y_pred),
    ("精确率", precision_score, y_pred),
    ("召回率", recall_score, y_pred),
    ("F1分数", f1_score, y_pred),
    ("ROC AUC", roc_auc_score, y_pred_proba),
)
for metric_label, metric_fn, scored_values in test_metric_rows:
    print(f"{metric_label}: {metric_fn(y_test, scored_values):.4f}")

# 12. Print the classification report and draw the confusion matrix.
print("\n分类报告:")
report_text = classification_report(y_test, y_pred)
print(report_text)

print("\n混淆矩阵:")
cm = confusion_matrix(y_test, y_pred)
# heatmap draws on the current axes and returns them; label those axes directly.
cm_axes = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
cm_axes.set_xlabel('预测标签')
cm_axes.set_ylabel('真实标签')
cm_axes.set_title('混淆矩阵')
plt.show()
# 13. Plot the ROC curve and the precision-recall curve side by side.
# average_precision_score is not part of the file's top-level sklearn.metrics
# import, so it is imported here; without it the PR legend raises NameError.
from sklearn.metrics import average_precision_score

plt.figure(figsize=(12, 5))

# ROC curve (left panel), with the chance diagonal for reference.
plt.subplot(1, 2, 1)
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
plt.plot(fpr, tpr, label=f'AUC = {roc_auc_score(y_test, y_pred_proba):.2f}')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('假正率')
plt.ylabel('真正率')
plt.title('ROC曲线')
plt.legend()

# Precision-recall curve (right panel), labelled with average precision.
plt.subplot(1, 2, 2)
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
plt.plot(recall, precision, label=f'AP = {average_precision_score(y_test, y_pred_proba):.2f}')
plt.xlabel('召回率')
plt.ylabel('精确率')
plt.title('PR曲线')
plt.legend()

plt.tight_layout()
plt.show()

# 14. Assemble the per-sample prediction table: ID, true label, predicted
#     label, positive-class probability, and whether the prediction was right.
#     (Only the table is built here; no file write appears in this section.)
prediction_is_correct = (y_test == y_pred)
result_columns = {
    'ID': id_test,
    'True_Label': y_test,
    'Predicted_Label': y_pred,
    'Predicted_Probability': y_pred_proba,
    'Correct': prediction_is_correct,
}
results = pd.DataFrame(result_columns)
