In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Build a synthetic binary-classification dataset.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_classes=2,
    random_state=42,
)

# Hold out 30% of the samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Baseline gradient-boosting model.
gbc = GradientBoostingClassifier(
    random_state=42,
    max_depth=3,        # maximum depth of each tree
    learning_rate=0.1,  # shrinkage applied to each tree's contribution
    n_estimators=100,   # number of boosting stages (trees)
)
gbc.fit(X_train, y_train)

test_accuracy = gbc.score(X_test, y_test)
print("测试集准确率:", test_accuracy)

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Wrap the arrays in DMatrix, the container used by XGBoost's native API.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Booster configuration.
params = {
    'eval_metric': 'logloss',
    'objective': 'binary:logistic',
    'eta': 0.1,  # learning rate
    'max_depth': 3,
}

# Train for a fixed number of boosting rounds.
model = xgb.train(params=params, dtrain=dtrain, num_boost_round=100)

# Predict on the test set, then threshold the scores at 0.5 to obtain
# hard 0/1 labels.
y_pred = model.predict(dtest)
y_pred_binary = [int(p > 0.5) for p in y_pred]
xgb_accuracy = accuracy_score(y_test, y_pred_binary)
print("XGBoost准确率:", xgb_accuracy)

In [None]:
4. 关键参数说明
参数	说明	典型值
n_estimators	树的数量	50-500
learning_rate	学习率/收缩系数	0.01-0.2
max_depth	单棵树的最大深度	3-8
min_samples_split	分裂节点所需最小样本数	2-10
min_samples_leaf	叶节点最小样本数	1-5
subsample	样本采样比例	0.5-1.0
max_features	特征采样比例	0.5-1.0


In [None]:
# 网格搜索
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyper-parameter; every combination is tried.
param_grid = {
    'min_samples_split': [2, 5, 10],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
}

# Exhaustive 5-fold cross-validated search using all available cores.
base_estimator = GradientBoostingClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=base_estimator,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("最佳参数:", best_params)

In [None]:
# Early stopping
#
# GradientBoostingClassifier drives early stopping from an internal hold-out
# set that it carves out of the data passed to fit() (validation_fraction),
# so the stopping rule itself needs no manual validation split.
#
# The explicit split below is kept as an independent check of the stopped
# model. It is bound to new names (X_fit / y_fit) instead of overwriting
# X_train / y_train: rebinding them made this cell non-idempotent, shrinking
# the training set by a further 20% on every re-run.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

gbc = GradientBoostingClassifier(
    n_estimators=1000,        # upper bound; early stopping usually halts sooner
    learning_rate=0.1,
    max_depth=3,
    validation_fraction=0.2,  # share of the fit data held out internally
    n_iter_no_change=10,      # stop after 10 rounds without improvement
    tol=1e-4,                 # minimum improvement that counts as progress
    random_state=42
)

gbc.fit(X_fit, y_fit)

# n_estimators_ is the number of boosting stages actually kept after
# early stopping.
print("实际使用的树的数量:", gbc.n_estimators_)
# Score on the explicit validation split, which the model never saw.
print("验证集准确率:", gbc.score(X_val, y_val))

In [None]:
#特征重要性分析

import matplotlib.pyplot as plt
import numpy as np

# Read the fitted model's importances and order feature indices from the
# most to the least important.
importances = gbc.feature_importances_
indices = np.argsort(importances)[::-1]

# Bar chart: one bar per feature, tick labels are the original feature indices.
n_features = X.shape[1]
bar_positions = range(n_features)

fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("特征重要性")
ax.bar(bar_positions, importances[indices], align="center")
ax.set_xticks(bar_positions)
ax.set_xticklabels(indices)
ax.set_xlim([-1, n_features])
plt.show()

In [None]:
#多分类问题

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Build a synthetic 3-class dataset with 4 informative features.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=4,
    n_classes=3,
    random_state=42,
)

# Hold out 30% of the samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Multiclass gradient-boosting model; the estimator is constructed exactly
# as in the binary case.
gbc = GradientBoostingClassifier(
    random_state=42,
    max_depth=3,
    learning_rate=0.1,
    n_estimators=100,
)
gbc.fit(X_train, y_train)

test_accuracy = gbc.score(X_test, y_test)
print("测试集准确率:", test_accuracy)

In [None]:
2.2 关键参数说明
对于多分类问题，以下参数特别重要：

参数	说明	多分类注意事项
loss	损失函数	多分类时使用'log_loss'(多项式对数损失；该损失在 scikit-learn 1.1 之前名为'deviance'，旧名称已在 1.3 中移除)
n_classes_	类别数量	拟合后自动获取
init	初始预测	多分类时通常为类别先验概率

In [None]:
# XGBoost

import xgboost as xgb
from sklearn.metrics import accuracy_score

# Booster configuration for a 3-class problem.
params = {
    'eval_metric': 'mlogloss',     # multiclass log loss
    'objective': 'multi:softmax',  # multiclass objective
    'num_class': 3,                # number of classes
    'eta': 0.1,
    'max_depth': 3,
}

# Train on the DMatrix-wrapped training data.
dtrain = xgb.DMatrix(X_train, label=y_train)
model = xgb.train(params=params, dtrain=dtrain, num_boost_round=100)

# Predict on the test features; the predictions are compared with y_test
# directly, without any post-processing.
dtest = xgb.DMatrix(X_test)
y_pred = model.predict(dtest)
xgb_accuracy = accuracy_score(y_test, y_pred)
print("XGBoost准确率:", xgb_accuracy)

In [None]:
# LightGBM

import lightgbm as lgb

# Wrap the training arrays in LightGBM's Dataset container.
train_data = lgb.Dataset(X_train, label=y_train)

# Booster configuration for a 3-class problem.
params = {
    'objective': 'multiclass',
    'num_class': 3,
    'metric': 'multi_logloss',
    'max_depth': 3,
    'learning_rate': 0.1
}

# Train for a fixed number of boosting rounds.
model = lgb.train(params, train_data, num_boost_round=100)

# Predict: each row of y_pred holds one score per class.
y_pred = model.predict(X_test)
# Take the index of the highest score in each row as the predicted label.
# The vectorised argmax replaces a per-row Python list scan and, like it,
# returns the first index when scores tie.
y_pred_class = y_pred.argmax(axis=1)
print("LightGBM准确率:", accuracy_score(y_test, y_pred_class))

In [None]:
#多分类指标
from sklearn.metrics import (classification_report, 
                           confusion_matrix,
                           accuracy_score,
                           f1_score)

# Predict test-set labels with the fitted gradient-boosting model.
y_pred = gbc.predict(X_test)

# Compute each metric first, then print them in a fixed order.
report_text = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
overall_accuracy = accuracy_score(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')

print("分类报告:\n", report_text)
print("混淆矩阵:\n", conf_matrix)
print("准确率:", overall_accuracy)
print("宏平均F1:", macro_f1)

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyper-parameter; every combination is tried.
param_grid = {
    'min_samples_split': [2, 5],
    'max_depth': [3, 5],
    'learning_rate': [0.05, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
}

# 5-fold cross-validated search scored by accuracy; a multiclass-aware
# metric such as 'f1_macro' can be used instead.
base_estimator = GradientBoostingClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=base_estimator,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("最佳参数:", best_params)

In [None]:
# Handling class imbalance.
#
# GradientBoostingClassifier has no `class_weight` constructor argument, so
# passing class_weight='balanced' raises a TypeError. The same effect is
# obtained by handing per-sample weights to fit(): compute_sample_weight with
# 'balanced' weights each sample inversely to its class frequency.
from sklearn.utils.class_weight import compute_sample_weight

balanced_weights = compute_sample_weight(class_weight='balanced', y=y_train)

# random_state is fixed for reproducibility, matching the other cells.
gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(X_train, y_train, sample_weight=balanced_weights)