In [2]:
import numpy as np

from sklearn.preprocessing import PolynomialFeatures

In [3]:
X = np.arange(6).reshape(3,2)
X

array([[0, 1],
       [2, 3],
       [4, 5]])

In [4]:
# 多项式扩展2维，输入：a,b--> 1, a, b, a^2, ab, b^2
poly1 = PolynomialFeatures(2)
poly1.fit(X)
print(poly1)
print(poly1.transform(X))

PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)
[[ 1.  0.  1.  0.  0.  1.]
 [ 1.  2.  3.  4.  6.  9.]
 [ 1.  4.  5. 16. 20. 25.]]


In [5]:
# interaction_only当设置为true的时候，表示不使用单个变量的多次项扩充维度
poly2 = PolynomialFeatures(interaction_only=True)
poly2.fit(X)
print(poly2)
print(poly2.transform(X))

PolynomialFeatures(degree=2, include_bias=True, interaction_only=True)
[[ 1.  0.  1.  0.]
 [ 1.  2.  3.  6.]
 [ 1.  4.  5. 20.]]


In [6]:
# include_bias当设置为true的时候，表示不加入常数项1
poly3 = PolynomialFeatures(include_bias=False)
poly3.fit(X)
print(poly3)
print(poly3.transform(X))

PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
[[ 0.  1.  0.  0.  1.]
 [ 2.  3.  4.  6.  9.]
 [ 4.  5. 16. 20. 25.]]


In [7]:
# GBDT+LR
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc

# 弱分类器的数目
n_estimator = 500
# 随机生成分类数据。
X, y = make_classification(n_samples=80000)  
# 切分为测试集和训练集，比例0.5
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
# 将训练集切分为两部分，一部分用于训练GBDT模型，
# 另一部分输入到训练好的GBDT模型生成GBDT特征，然后作为LR的特征。
# 这样分成两部分是为了防止过拟合。
X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train, y_train, test_size=0.5)
# 调用GBDT分类模型。
grd = GradientBoostingClassifier(n_estimators=n_estimator)
# 调用one-hot编码。
grd_enc = OneHotEncoder()
# 调用LR分类模型。
grd_lm = LogisticRegression()

  from collections import Iterable
  from numpy.core.umath_tests import inner1d


In [8]:
'''使用X_train训练GBDT模型，后面用此模型构造特征'''
grd.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=500,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [9]:
# grd.apply方法给定的是：样本在算法中落在第几个叶子上，
# 在GBDT中该方法返回的格式为: [n_samples, n_estimator, n_classes]
grd.apply(X_train)[:, :, :].shape

(20000, 500, 1)

In [10]:
# fit one-hot编码器
grd_enc.fit(grd.apply(X_train)[:, :, 0])

OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [11]:
''' 
使用训练好的GBDT模型构建特征，然后将特征经过one-hot编码作为新的特征输入到LR模型训练。
'''
grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [12]:
# 用训练好的LR模型多X_test做预测
y_pred_grd_lm = grd_lm.predict_proba(grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]
# 根据预测结果输出
fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)
print("AUC:{}".format(auc(fpr_grd_lm, tpr_grd_lm)))

AUC:0.9879890020314991
