# In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (accuracy_score, classification_report,
                           confusion_matrix, f1_score)
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Load the data and take a first look
# NOTE(review): the training and test frames are read from the same CSV, so
# the later "test" predictions are made on the same rows the model is built
# from -- confirm whether a separate test file was intended.
csv_path = "/kaggle/input/houkong-moai/customer_churn.csv"
train_data = pd.read_csv(csv_path)
test_data = pd.read_csv(csv_path)

print("数据加载完成")
print(f"训练集形状: {train_data.shape}, 测试集形状: {test_data.shape}")
# Header line followed by the first five rows of the training frame.
print("\n训练集前5行:", train_data.head(), sep="\n")

# 2. Preprocessing configuration
# Label column, and the columns that are fed to the model as features.
target_col = "churn"
used_features = [
    "age",
    "subscription_length",
    "monthly_bill",
    "total_usage",
    "service_complaints",
]

# Show the relative frequency of each value in the target column.
print("\n目标列分布:")
target_share = train_data[target_col].value_counts(normalize=True)
print(target_share)

# 3. Feature preparation
# Hold out 20% of the rows for validation, stratified on the target.
X, y = train_data[used_features], train_data[target_col]
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardise the features (the original note stresses that KNN requires it).
# The scaler is fitted on the training split only and then reused for the
# validation and test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(test_data[used_features])

# 4. Model training and hyper-parameter tuning
# Base KNN classifier whose hyper-parameters are searched below.
knn = KNeighborsClassifier()

# Search space for the grid search.
param_grid = dict(
    n_neighbors=[3, 5, 7, 9, 11],
    weights=['uniform', 'distance'],
    p=[1, 2],  # 1: Manhattan distance, 2: Euclidean distance
)

# 5-fold cross-validated grid search, scored by weighted F1.
grid_search = GridSearchCV(
    knn,
    param_grid,
    scoring='f1_weighted',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)

# Keep the best estimator found by the search and report its parameters.
best_knn = grid_search.best_estimator_
print("\n最佳参数组合:", grid_search.best_params_)

# 5. Model evaluation
# Predict on the validation split.
val_pred = best_knn.predict(X_val_scaled)

print("\n===== 验证集评估 =====")
print(f"准确率: {accuracy_score(y_val, val_pred):.4f}")
print(f"F1分数: {f1_score(y_val, val_pred, average='weighted'):.4f}")

# Per-class classification report.
val_report = classification_report(y_val, val_pred)
print("\n分类报告:")
print(val_report)

# Confusion matrix drawn as an annotated heatmap; both axes are labelled
# with the distinct values found in y_val.
class_labels = np.unique(y_val)
val_cm = confusion_matrix(y_val, val_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(
    val_cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title("混淆矩阵")
plt.show()

# 6. Predict on the test set
test_ids = test_data["customer_id"].values
test_pred = best_knn.predict(X_test_scaled)

# 7. Save the results
# One row per test record: its customer_id (as "ID") and the predicted label.
result_df = pd.DataFrame({
    "ID": test_ids,
    "prediction": test_pred
})
result_df.to_csv("submit.csv", index=False)

# Columns of the data that are neither model features nor the target.
# `useless_cols` was printed below without ever being defined, which raised
# NameError right after the submission file had been written; derive it from
# the columns actually present instead.
useless_cols = [
    col for col in test_data.columns
    if col not in used_features and col != target_col
]

print("\n预测完成！结果已保存至 submit.csv")
print("使用特征:", used_features)
print("删除列:", useless_cols)

# 8. Feature importance analysis
# (KNN exposes no importances directly, so permutation importance is used.)
from sklearn.inspection import permutation_importance

# Evaluate on the scaled validation split with 10 repeats and a fixed seed.
result = permutation_importance(
    best_knn,
    X_val_scaled,
    y_val,
    n_repeats=10,
    random_state=42,
)

# Pair each feature name with its mean importance, most important first.
importance_df = pd.DataFrame(
    {'Feature': used_features, 'Importance': result.importances_mean}
)
importance_df = importance_df.sort_values('Importance', ascending=False)

print("\n特征重要性（排列重要性）:")
print(importance_df)

# Horizontal bar chart of the ranked importances.
plt.figure(figsize=(8, 4))
sns.barplot(data=importance_df, x='Importance', y='Feature')
plt.title('特征重要性排序')
plt.show()
