# In [None]:
# -*- coding: utf-8 -*-
# 优化随机森林模板（带特征重要性和模型评估）
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (accuracy_score, precision_score,
                           recall_score, f1_score, roc_auc_score,
                           confusion_matrix, classification_report)
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Load the data.
# NOTE(review): the training and test frames are read from the same CSV file;
# confirm whether a separate test file should be used for `test_data`.
data_path = '/kaggle/input/houkong-moai/customer_churn.csv'
train_data = pd.read_csv(data_path)
test_data = pd.read_csv(data_path)

# 2. Column roles (adjust to the actual dataset).
target_col = "churn"  # name of the label column
used_features = [      # columns passed to the model as features
    "age",
    "subscription_length",
    "monthly_bill",
    "total_usage",
    "service_complaints",
]

# Show the relative frequency of each value in the target column.
print("\n目标列分布:")
class_ratio = train_data[target_col].value_counts(normalize=True)
print(class_ratio)

# 3. Split into training and validation sets: 20% held out, stratified on
# the target, with a fixed seed so the split is reproducible.
X, y = train_data[used_features], train_data[target_col]
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

# 4. Train the tuned random-forest model.
# The hyper-parameters are gathered in one mapping so they can be reviewed
# (or logged) in a single place before the estimator is built.
rf_params = {
    "n_estimators": 200,         # number of trees
    "max_depth": 8,              # maximum depth of each tree
    "min_samples_split": 10,     # minimum samples required to split a node
    "min_samples_leaf": 4,       # minimum samples required at a leaf
    "max_features": 'sqrt',      # number of features considered per split
    "class_weight": 'balanced',  # compensate for class imbalance
    "bootstrap": True,           # use bootstrap sampling
    "oob_score": True,           # compute the out-of-bag score
    "random_state": 42,
    "n_jobs": -1,                # use all CPU cores
}
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)

# 5. Model evaluation.
print("\n===== 模型评估 =====")
print(f"OOB分数: {model.oob_score_:.4f}")

# 5-fold cross-validated accuracy on the training split.
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
cv_mean, cv_std = cv_scores.mean(), cv_scores.std()
print(f"交叉验证准确率: {cv_mean:.4f} (±{cv_std:.4f})")

# Validation-set predictions: hard labels, plus the probability in column 1
# of predict_proba (treated as the positive class).
val_pred = model.predict(X_val)
val_proba = model.predict_proba(X_val)[:, 1]

print("\n验证集性能:")
# Label-based metrics share one call signature, so print them in order
# from a single table; AUC is handled separately because it needs scores.
label_metrics = (
    ("准确率", accuracy_score),
    ("精确率", precision_score),
    ("召回率", recall_score),
    ("F1分数", f1_score),
)
for metric_name, metric_fn in label_metrics:
    print(f"{metric_name}: {metric_fn(y_val, val_pred):.4f}")
print(f"AUC分数: {roc_auc_score(y_val, val_proba):.4f}")

# Per-class classification report.
print("\n分类报告:")
val_report = classification_report(y_val, val_pred)
print(val_report)

# Confusion-matrix heatmap (rows: actual labels, columns: predicted labels).
plt.figure(figsize=(6, 4))
val_cm = confusion_matrix(y_val, val_pred)
heatmap_options = {
    "annot": True,
    "fmt": "d",
    "cmap": "Blues",
    "xticklabels": ["预测0", "预测1"],
    "yticklabels": ["实际0", "实际1"],
}
sns.heatmap(val_cm, **heatmap_options)
plt.title("混淆矩阵")
plt.show()

# 6. Feature-importance analysis.
# Pair each feature name with the importance reported by the fitted model,
# then order the table from most to least important.
importance_table = pd.DataFrame(
    {'Feature': used_features, 'Importance': model.feature_importances_}
)
feature_importance = importance_table.sort_values(
    by='Importance', ascending=False
)

print("\n特征重要性:")
print(feature_importance)

# Horizontal bar chart of the sorted importances.
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance, x='Importance', y='Feature')
plt.title('特征重要性排序')
plt.show()

# 7. Predict on the test set and save the results.
test_ids = test_data["customer_id"].values
test_features = test_data[used_features]  # slice once, reuse for both calls
predictions = model.predict(test_features)
# Column-1 class probabilities; kept for inspection, not written to the CSV.
probabilities = model.predict_proba(test_features)[:, 1]

# Submission file: one row per test record with its ID and predicted label.
result = pd.DataFrame({
    "ID": test_ids,
    "prediction": predictions,
})
result.to_csv('submission.csv', index=False)

# Columns of the test data that are neither features, the target, nor the ID.
# The script previously printed a `useless_cols` name that was never defined,
# which raised NameError right after the submission had been written.
reserved_cols = set(used_features) | {target_col, "customer_id"}
useless_cols = [col for col in test_data.columns if col not in reserved_cols]

print("\n预测完成！结果已保存到 submission.csv")
print("使用的特征列:", used_features)
print("删除的无用列:", useless_cols)
