# In [None]:
# -*- coding: utf-8 -*-
# 优化决策树模板（带特征重要性和模型评估）
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# 1. Load the data
# NOTE(review): the training and test frames are read from the same CSV, so
# the final "test" predictions include the rows the model is trained on --
# confirm whether a separate test file should be used here.
DATA_PATH = '/kaggle/input/houkong-moai/customer_churn.csv'
train_data = pd.read_csv(DATA_PATH)
test_data = pd.read_csv(DATA_PATH)

# 2. Column roles (adjust to match the actual data)
target_col = "churn"  # name of the label column
used_features = [     # columns fed to the model as features
    "age",
    "subscription_length",
    "monthly_bill",
    "total_usage",
    "service_complaints",
]

# 3. Split into training and validation sets: 20% is held out for
#    validation, stratified on the label, with a fixed seed.
y = train_data[target_col]
X = train_data[used_features]
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# 4. Train the tuned decision tree model
tree_params = {
    "max_depth": 5,               # limit the depth of the tree
    "min_samples_split": 10,      # minimum samples required to split a node
    "min_samples_leaf": 5,        # minimum samples required in a leaf
    "max_features": 'sqrt',       # number of features considered per split
    "class_weight": 'balanced',   # compensate for class imbalance
    "random_state": 42,
}
model = DecisionTreeClassifier(**tree_params)
model.fit(X_train, y_train)

# 5. Evaluate the model on the held-out validation set
print("\n===== 验证集评估 =====")
val_pred = model.predict(X_val)
val_report = classification_report(y_val, val_pred)
print(val_report)

# 6. Feature importance, listed from most to least important
importance_table = pd.DataFrame(
    {'Feature': used_features, 'Importance': model.feature_importances_}
)
feature_importance = importance_table.sort_values(
    by='Importance', ascending=False
)
print("\n特征重要性:")
print(feature_importance)

# 7. Visualize the decision tree (drawing limited to depth 2)
# class_names is passed as a plain list of strings taken from the fitted
# model: some scikit-learn releases validate this argument as a list and
# reject NumPy arrays, and model.classes_ is the label order the tree uses.
tree_class_names = model.classes_.astype(str).tolist()

plt.figure(figsize=(15, 8))
plot_tree(
    model,
    feature_names=used_features,
    class_names=tree_class_names,
    filled=True,    # colour the nodes
    max_depth=2,    # draw only the upper part of the tree
    fontsize=10,
)
# NOTE(review): the title contains CJK characters; presumably the active
# matplotlib font must provide those glyphs for it to render -- confirm.
plt.title("决策树结构（前2层）")
plt.show()

# 8. Predict on the test set and save the results
test_ids = test_data["customer_id"].values
predictions = model.predict(test_data[used_features])

# Write one row per test record: its ID and the predicted label.
submission = pd.DataFrame({"ID": test_ids, "prediction": predictions})
submission.to_csv('submission.csv', index=False)

# The original script printed `useless_cols` without ever defining it, which
# raised NameError after the CSV was written. It is derived here as the test
# columns that are neither model features, the target, nor the ID column.
# NOTE(review): this reading of "useless columns" is an assumption -- confirm.
reserved_cols = set(used_features) | {target_col, "customer_id"}
useless_cols = [col for col in test_data.columns if col not in reserved_cols]

print("\n预测完成！结果已保存到 submission.csv")
print("使用的特征列:", used_features)
print("删除的无用列:", useless_cols)
