In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [None]:
# Read the customer-churn table and discard every row that contains a missing value.
df = pd.read_csv('/kaggle/input/houkong-moai/customer_churn.csv').dropna()

In [None]:
# Label-encode the text (object-dtype) columns so the model receives numeric inputs.
# 'customer_id' is deliberately skipped: it is dropped from the feature matrix, and its
# original values are written to the submission file, so it must not be replaced by codes.
categorical_cols = (
    df.select_dtypes(include=['object'])
    .columns
    .drop('customer_id', errors='ignore')  # no-op when the ID column is already numeric
)
for col in categorical_cols:
    # A fresh encoder per column; each column gets its own integer code space.
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

In [None]:
# Target: the churn label. Features: every column except the customer ID and the label.
y = df['churn']
X = df.drop(columns=['customer_id', 'churn'])

In [None]:
# Standardise the feature matrix with a StandardScaler fitted on all rows of X.
# NOTE(review): the scaler is fitted before the train/test split, so test rows contribute
# to the scaling statistics - confirm this is acceptable for the model being trained.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
split_parts = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Random forest classifier: randomized hyper-parameter search, run once per target metric.

# Search space shared by both searches below.
# NOTE(review): `param_dist` was referenced but never defined in this notebook; the
# ranges here are a starting point and should be tuned for the dataset.
param_dist = {
    'n_estimators': randint(50, 500),
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    'max_features': ['sqrt', 'log2', None],
}


def _build_random_search(scoring):
    """Return an unfitted randomized search over `param_dist`.

    Parameters
    ----------
    scoring : str
        Name of the scikit-learn scorer used to rank candidates.

    Returns
    -------
    RandomizedSearchCV
        Search that samples 50 parameter sets and evaluates each with 5-fold CV.
    """
    return RandomizedSearchCV(
        # Seed the forest as well as the sampler so repeated runs give the same result.
        RandomForestClassifier(random_state=42),
        param_distributions=param_dist,
        n_iter=50,  # number of parameter sets sampled
        scoring=scoring,
        cv=5,
        n_jobs=-1,
        verbose=2,
        random_state=42,
    )


# One search ranks candidates by accuracy, the other by F1.
random_search_acc = _build_random_search('accuracy')
random_search_f1 = _build_random_search('f1')

# Fit the accuracy-optimised search.
print("开始准确度优化搜索...")
random_search_acc.fit(X_train, y_train)

# Fit the F1-optimised search.
print("\n开始F1分数优化搜索...")
random_search_f1.fit(X_train, y_train)

# Report the best parameters and cross-validated score of each search.
print("\n准确度优化结果:")
print(f"最佳参数: {random_search_acc.best_params_}")
print(f"最佳准确度: {random_search_acc.best_score_:.4f}")

print("\nF1分数优化结果:")
print(f"最佳参数: {random_search_f1.best_params_}")
print(f"最佳F1分数: {random_search_f1.best_score_:.4f}")

# Keep the refitted best estimators for later prediction.
best_model_acc = random_search_acc.best_estimator_
best_model_f1 = random_search_f1.best_estimator_

In [None]:
# Search space for the random forest classifier.
param_dist_clf = {
    'n_estimators': randint(50, 500),
    # Split criteria specific to classification.
    'criterion': ['gini', 'entropy'],
    # Unlimited depth plus ten depths drawn from [1, 20). The draw is seeded so the
    # candidate list is identical on every run, and the values are plain Python ints.
    'max_depth': [None] + [int(depth) for depth in randint(1, 20).rvs(10, random_state=42)],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    # 'auto' is omitted: recent scikit-learn releases reject it for max_features, and
    # for classifiers it was an alias of 'sqrt'.
    'max_features': ['sqrt', 'log2', None],
    # Class-weighting options specific to classification.
    'class_weight': [None, 'balanced', 'balanced_subsample'],
}
# Scorers for hyper-parameter tuning, keyed by the scorer's own name.
# 'f1', 'precision' and 'recall' use binary averaging; 'roc_auc' is likewise meant for
# binary targets.
_binary_scorers = (
    'accuracy',
    'f1',
    'precision',
    'recall',
    'roc_auc',
    'balanced_accuracy',
)
scoring_metrics = {scorer: scorer for scorer in _binary_scorers}

# Multiclass F1 needs an explicit averaging strategy, selected through the scorer name.
_multiclass_scorers = ('f1_macro', 'f1_micro', 'f1_weighted')
multiclass_scoring = {scorer: scorer for scorer in _multiclass_scorers}

# Randomized search for the classifier, evaluated on several metrics at once.
# Shuffled, stratified folds with a fixed seed give repeatable splits.
stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# A single search replaces the previous pair: the first one was fitted and then
# overwritten by a second one that was never fitted, so `search_clf.best_estimator_`
# did not exist when the evaluation code asked for it.
search_clf = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=param_dist_clf,
    n_iter=50,
    scoring=scoring_metrics,
    refit='f1',  # the best F1 candidate is refitted and exposed as best_estimator_
    cv=stratified_cv,
    n_jobs=-1,
    return_train_score=True,
    random_state=42,  # seed the parameter sampling
)

search_clf.fit(X_train, y_train)


# Two ways to handle class imbalance.
# Option 1: let the forest reweight the classes.
rf_balanced = RandomForestClassifier(class_weight='balanced')

# Option 2: oversample with SMOTE in front of the forest.
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

oversampler = SMOTE(random_state=42)
pipeline = make_pipeline(oversampler, RandomForestClassifier())

# Re-key the classifier search space so every parameter carries the
# 'randomforestclassifier__' prefix used to address the forest step of the pipeline.
step_prefix = 'randomforestclassifier__'
param_dist_pipeline = {
    step_prefix + param_name: candidates
    for param_name, candidates in param_dist_clf.items()
}
# Report: evaluate the tuned classifier on the held-out test set.
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_curve, precision_recall_curve

best_clf = search_clf.best_estimator_
y_pred = best_clf.predict(X_test)
# Second probability column, i.e. the class listed second in the estimator's classes_.
y_proba = best_clf.predict_proba(X_test)[:, 1]

print("分类报告:")
report_text = classification_report(y_test, y_pred)
print(report_text)

print("\n混淆矩阵:")
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

# ROC curve computed from the predicted probabilities (thresholds are not needed).
fpr, tpr, _ = roc_curve(y_test, y_proba)
plt.plot(fpr, tpr)
plt.title('ROC曲线')
plt.show()

# Feature-importance analysis of the tuned forest, most important feature first.
importances = search_clf.best_estimator_.feature_importances_
# Descending order of importance; the array's own argsort avoids needing numpy's namespace.
indices = importances.argsort()[::-1]
# Label the bars with column names instead of opaque positional indices.
feature_names = X.columns[indices]

n_features = X.shape[1]
plt.figure(figsize=(10, 6))
plt.title("特征重要性")
plt.bar(range(n_features), importances[indices], align="center")
plt.xticks(range(n_features), feature_names, rotation=90)
plt.xlim([-1, n_features])
plt.tight_layout()  # keep the rotated labels inside the figure
plt.show()

In [None]:
# Evaluate the test-set predictions with four metrics, each called with its defaults.
metric_functions = (accuracy_score, precision_score, recall_score, f1_score)
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred) for metric in metric_functions
)

print(f'模型准确率: {accuracy}')
print(f'模型精确率: {precision}')
print(f'模型召回率: {recall}')
print(f'模型 F1 分数: {f1}')

In [None]:
# Write the submission file: one row per customer with the predicted churn label.

# `full_predictions` was referenced but never computed. Predict for every row of the
# cleaned dataset with the tuned classifier; X_scaled is derived from df without
# reordering, so its rows line up with df['customer_id'].
# NOTE(review): assumes the tuned `search_clf` model is the one intended for submission.
full_predictions = search_clf.best_estimator_.predict(X_scaled)

results = {
    'ID': df['customer_id'],
    'Churn': full_predictions
}
results_df = pd.DataFrame(results)
results_df.to_csv('/kaggle/working/submission.csv', index=False)