## 深入理解xgboost十四

### 基于特征重要性的特征选择

In [1]:
import xgboost as xgb

import numpy as np
import pandas as pd

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import SelectFromModel

In [2]:
cancer = datasets.load_breast_cancer()
X = cancer.data
y = cancer.target

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

In [4]:
model = xgb.XGBClassifier(max_depth=6, learning_rate=0.05, n_estimators=100, objective="binary:logistic", booster="gbtree")
model.fit(X=X_train, y=y_train)

In [5]:
# 对测试集进行预测，并计算AUC
y_pred = model.predict(X=X_test)
auc = roc_auc_score(y_true=y_test, y_score=y_pred)
print("AUC 得分: %.2f" % (auc))

AUC 得分: 0.98


In [6]:
# 获取特征重要性
importance = model.feature_importances_
importance

array([0.00920666, 0.02353375, 0.00494846, 0.01797036, 0.01272686,
       0.0105133 , 0.01288558, 0.24048333, 0.00342244, 0.00464765,
       0.01402702, 0.0038198 , 0.00987051, 0.01139465, 0.00236183,
       0.00699555, 0.007381  , 0.00685163, 0.00614328, 0.00478291,
       0.01611804, 0.01893223, 0.15763223, 0.0549845 , 0.00794952,
       0.00309393, 0.02123242, 0.28822008, 0.00493709, 0.01293348],
      dtype=float32)

In [7]:
# 对特征重要性去重后作为候选阈值
thresholds = []
for imp in importance:
    if imp not in thresholds:
        thresholds.append(imp)
        
# 候选阈值排序
thresholds = sorted(thresholds)
thresholds

[0.0023618278,
 0.0030939253,
 0.0034224368,
 0.003819798,
 0.004647645,
 0.004782911,
 0.0049370904,
 0.0049484647,
 0.0061432756,
 0.0068516335,
 0.006995549,
 0.0073810015,
 0.007949515,
 0.009206663,
 0.009870507,
 0.010513297,
 0.011394647,
 0.012726858,
 0.012885581,
 0.012933484,
 0.014027023,
 0.016118037,
 0.017970355,
 0.01893223,
 0.021232419,
 0.023533752,
 0.0549845,
 0.15763223,
 0.24048333,
 0.28822008]

In [8]:
# 遍历候选阈值
for threshold in thresholds:
    # 通过threshold进行特征选择
    selection = SelectFromModel(model, threshold=threshold, prefit=True)
    selection_X_train = selection.transform(X_train)
    print(selection.get_support(True))
    
    # 训练模型
    selection_model = xgb.XGBClassifier(max_depth=5, learning_rate=0.05, n_estimators=100, objective="binary:logistic", booster="gbtree")
    selection_model.fit(selection_X_train, y_train)
    
    # 评估模型
    selection_X_test = selection.transform(X_test)
    y_pred = selection_model.predict(selection_X_test)
    auc = roc_auc_score(y_true=y_test, y_score=y_pred)
    print("阈值: %.3f, 特征数量: %d, AUC得分: %.2f" % (threshold, selection_X_train.shape[1], auc))

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29]
阈值: 0.002, 特征数量: 30, AUC得分: 0.98
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29]
阈值: 0.003, 特征数量: 29, AUC得分: 0.98
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 15 16 17 18 19 20 21 22 23 24
 26 27 28 29]
阈值: 0.003, 特征数量: 28, AUC得分: 0.98
[ 0  1  2  3  4  5  6  7  9 10 11 12 13 15 16 17 18 19 20 21 22 23 24 26
 27 28 29]
阈值: 0.004, 特征数量: 27, AUC得分: 0.98
[ 0  1  2  3  4  5  6  7  9 10 12 13 15 16 17 18 19 20 21 22 23 24 26 27
 28 29]
阈值: 0.005, 特征数量: 26, AUC得分: 0.98
[ 0  1  2  3  4  5  6  7 10 12 13 15 16 17 18 19 20 21 22 23 24 26 27 28
 29]
阈值: 0.005, 特征数量: 25, AUC得分: 0.98
[ 0  1  2  3  4  5  6  7 10 12 13 15 16 17 18 20 21 22 23 24 26 27 28 29]
阈值: 0.005, 特征数量: 24, AUC得分: 0.98
[ 0  1  2  3  4  5  6  7 10 12 13 15 16 17 18 20 21 22 23 24 26 27 29]
阈值: 0.005, 特征数量: 23, AUC得分: 0.98
[ 0  1  3  4  5  6  7 10 12 13 15 16 17 18 20 21 22 23 24 26 27 29]
阈值: 0.006,