In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.ensemble import VotingClassifier # 投票法的融合（sklearn专用）
from sklearn.ensemble import GradientBoostingClassifier as GBC # 梯度提升数
from sklearn.ensemble import RandomForestClassifier as RFC     # 随机森林
import xgboost as xgb
from sklearn.model_selection import KFold     # K折交叉验证
from sklearn.model_selection import cross_val_score # 计算交叉验证分数
%matplotlib inline
plt.rcParams['font.sans-serif'] = ['Simhei']
plt.rcParams['axes.unicode_minus'] = False

In [36]:
train = pd.read_csv('train_bf6.csv',index_col=0)
test = pd.read_csv('test_bf6.csv',index_col=0)

In [37]:
train.head()

Unnamed: 0,异常,订单ID,商品一级类别,商品所属渠道,商品ID,品牌,订单金额,商品销售数量,订单渠道,支付方式,...,商品一级类别异常率,订单渠道异常率,商品所属渠道异常率,用户异常率,商品异常率,品牌异常率,金额分箱,分箱均值,分箱异常率,销量分箱
0,1,4283851335,5,1,13080,199,766000.0,200,1,0,...,0.261565,0.167849,0.294727,0.133333,1.0,0.340804,9.0,766000.0,1.0,1
1,1,4281111595,2,1,15075,1856,100.0,100,1,0,...,0.26284,0.167849,0.294727,0.333333,0.333333,0.25,0.0,136.243951,0.203899,1
2,0,4106833871,3,1,15212,676,8800.0,100,1,0,...,0.235079,0.167849,0.294727,0.0,0.238462,0.286842,3.0,9128.895833,0.264881,1
3,0,4253622967,10,1,11454,1568,880.0,100,1,0,...,0.215029,0.167849,0.294727,-1.0,0.296296,0.298387,0.0,136.243951,0.203899,1
4,0,4276159555,3,0,4800,262,4900.0,100,1,0,...,0.235079,0.167849,0.178732,0.133333,0.119565,0.179739,2.0,4165.485313,0.326309,1


In [38]:
test.head()

Unnamed: 0,异常,订单ID,商品一级类别,商品所属渠道,商品ID,品牌,订单金额,商品销售数量,订单渠道,支付方式,...,商品一级类别异常率,订单渠道异常率,商品所属渠道异常率,用户异常率,商品异常率,品牌异常率,金额分箱,分箱均值,分箱异常率,销量分箱
0,1,4276537082,11,1,12689.0,1406.0,19900.0,100,1,0,...,0.201848,0.167849,0.294727,-1.0,0.0,0.083333,4.0,16872.602041,0.336735,1
1,0,3977175284,5,1,15601.0,339.0,990.0,100,1,0,...,0.261565,0.167849,0.294727,-1.0,0.153846,0.372263,1.0,1775.061962,0.246125,1
2,0,4245023523,2,1,15075.0,1856.0,100.0,100,1,0,...,0.26284,0.167849,0.294727,0.333333,0.333333,0.25,0.0,136.243951,0.203899,1
3,0,4284515355,9,1,,393.0,21450.0,50,1,0,...,0.193856,0.167849,0.294727,-1.0,-1.0,0.162162,4.0,16872.602041,0.336735,1
4,0,4284735147,3,1,15111.0,797.0,1950.0,50,1,0,...,0.235079,0.167849,0.294727,-1.0,0.248485,0.240741,1.0,1775.061962,0.246125,1


In [39]:
Xtrain = train.iloc[:,1:]
Xtest = test.iloc[:,1:]
Ytrain = train.iloc[:,0]
Ytest = test.iloc[:,0]

In [40]:
dtrain = xgb.DMatrix(Xtrain, Ytrain)
dtest = xgb.DMatrix(Xtest, Ytest)

In [41]:
# 使用随机森林与GBDC分别进行预测观察结果

In [42]:
rf = RFC(n_estimators=200, random_state=1412)
gbdc = GBC(n_estimators=200, random_state=1412)

In [43]:
# 依然使用和建立benchmark时一模一样的最简单的KFold交叉验证，随机数种子也保持一致
cv = KFold(n_splits=5, shuffle=True, random_state=1412)

In [44]:
result_gbdc_cv = cross_val_score(gbdc, Xtrain, Ytrain, cv=cv)

In [45]:
result_gbdc_cv.mean()

0.8797078430222396

In [46]:
result_gbdc_cv.var()

1.5357936616575313e-06

In [47]:
result_rf_cv = cross_val_score(rf, Xtrain, Ytrain, cv=cv)

In [48]:
result_rf_cv.mean()

0.8753973350431382

In [49]:
result_rf_cv.var()

1.3331735987407713e-06

In [50]:
# 使用XGBoost观察结果

In [51]:
param = {
    'objective': 'binary:logistic', #交叉熵损失
    'eval_metric': 'error', # 不支持准确率，在这里使用的是错误率
    'seed': 1412
}

In [52]:
xgbcv = xgb.cv(param, dtrain, num_boost_round=200,#xgboost中的树
              nfold=5, seed = 1412, shuffle=True)

In [53]:
1 - xgbcv.loc[199,'test-error-mean']

0.8760504418818318

可以看到，比起benchmark的平均准确率83.104%，方差2.45*10^-6，现在的模型无论是准确率均值还是方差都得到了极大地提升。现在再观察三个模型的AUC分数如何

In [54]:
# 每次训练完毕后需要重新设置类，否则会在原来的基础上进行训练
rf = RFC(n_estimators=200, random_state=1412)
gbdc = GBC(n_estimators=200, random_state=1412)
cv = KFold(n_splits=5, shuffle=True, random_state=1412)

In [55]:
result_gbdc_cv = cross_val_score(gbdc, Xtrain, Ytrain, cv=cv, scoring='roc_auc')

In [56]:
result_gbdc_cv.mean()

0.9350844719547844

In [57]:
result_gbdc_cv.var()

4.918121526815208e-07

In [58]:
result_rf_cv = cross_val_score(rf, Xtrain, Ytrain, cv=cv, scoring='roc_auc')

In [59]:
param2 = {
    'objective': 'binary:logistic', #交叉熵损失
    'eval_metric': 'auc', # 模型评估指标
    'seed': 1412
}

In [60]:
xgbcv = xgb.cv(param2, dtrain, num_boost_round=200, nfold=5, seed=1412, shuffle=True)

In [61]:
xgbcv.loc[199, 'test-auc-mean']

0.932221733732602

In [62]:
xgbcv.loc[199, 'test-auc-std']**2

1.2155223567341144e-06

benchmark的交叉验证平均为0.8589，方差1.76*10^-6，现在三个模型的AUC分数也得到了大幅提升，方差变小了8到10倍，可见特征工程是成功的。现在基于目前的水平，可以在此基础上继续调参，来观察模型的结果

### 依照AUC，使用学习曲线进行调参

集成模型的默认参数都是基于经验精心设计的，当我们使用默认参数时，模型就已经达到了某种上限，因此调参在集成模型上基本是对结果进行‘微调’,集成模型调参之后的表现不会与默认参数的表现差距太大。因此，为了大幅提升模型的效果，我们需要考虑一些激进的方案

对于一个不均衡的样本来说，处理样本不均衡问题往往能大幅提升模型auc和准确率。

In [63]:
(Ytrain==1).sum() / Ytrain.shape[0]

0.2167784212130448

对随机森林来说，我们可以试着调用参数class_weight。在class_weight中输入{类别1：占比，类别2：占比}就可以设定训练中两个标签类别的比例，或者使用'balance'模式。

在balance模式下：
    0的比例为 样本量/（类别总量*标签中为0的样本量）
    1的比例为 样本量 / （类别总量*标签中为1的样本量）

In [64]:
for weights in [{0:0.5, 1:0.5},{0:0.5, 1:1}, 'balanced', {0:0.5, 1:2}, {0:0.5, 1: 2.5}]:
    cv = KFold(n_splits=5, shuffle=True, random_state=1412)
    rf = RFC(n_estimators=5, random_state=1412, class_weight=weights)
    rfcv = cross_val_score(rf, Xtrain, Ytrain, cv = cv, scoring='roc_auc')
    
    print(weights)
    print('\t rf_mean:{:.5f}'.format(rfcv.mean()))
    print('\t rf_var:{}'.format(rfcv.var()))

{0: 0.5, 1: 0.5}
	 rf_mean:0.87692
	 rf_var:4.788624387804161e-06
{0: 0.5, 1: 1}
	 rf_mean:0.87771
	 rf_var:6.217835483804769e-06
balanced
	 rf_mean:0.87602
	 rf_var:5.909270038385735e-06
{0: 0.5, 1: 2}
	 rf_mean:0.87701
	 rf_var:6.4150115691402e-06
{0: 0.5, 1: 2.5}
	 rf_mean:0.87439
	 rf_var:6.80003211246821e-06


In [66]:
for weights in [0.5, 0.3, 0.1]:
    num_round = 200
    param = {'objective':'binary:logistic','eval_metric':'auc','scale_pos_weight':weights}
    xgbcv = xgb.cv(param, dtrain, num_boost_round=num_round, nfold=5, seed=1442, shuffle=True)
    
    print(weights)
    print('\t xgb_mean:{:.5f}'.format(xgbcv.loc[num_round-1, 'test-auc-mean']))
    print('\t xgb_std:{}'.format((xgbcv.loc[num_round-1,'test-auc-std'])**2))

0.5
	 xgb_mean:0.93248
	 xgb_std:1.3052234454075425e-06
0.3
	 xgb_mean:0.93308
	 xgb_std:1.9070475933052777e-06
0.1
	 xgb_mean:0.93412
	 xgb_std:2.91245161035778e-06


In [67]:
for weights in [0.05, 0.03, 0.01]:
    num_round = 200
    param = {'objective':'binary:logistic','eval_metric':'auc','scale_pos_weight':weights}
    xgbcv = xgb.cv(param, dtrain, num_boost_round=num_round, nfold=5, seed=1442, shuffle=True)
    
    print(weights)
    print('\t xgb_mean:{:.5f}'.format(xgbcv.loc[num_round-1, 'test-auc-mean']))
    print('\t xgb_std:{}'.format((xgbcv.loc[num_round-1,'test-auc-std'])**2))

0.05
	 xgb_mean:0.93463
	 xgb_std:2.1563705777309015e-06
0.03
	 xgb_mean:0.93469
	 xgb_std:2.7568053242766273e-06
0.01
	 xgb_mean:0.93458
	 xgb_std:2.6796508301997995e-06


#### 树的数量

随机森林调参

In [68]:
for num_round in [25, 50, 100, 200]:
    rf = RFC(n_estimators=num_round, random_state=1412, class_weight={0:1, 1:5})
    cv = KFold(n_splits=5, shuffle=True, random_state=1412)
    rf_cv = cross_val_score(rf, Xtrain, Ytrain, cv=cv, scoring='roc_auc')
    
    print(num_round)
    print('\t rf_mean:{:.5f}'.format(rf_cv.mean()))
    print('\t rf_var:{}'.format(rf_cv.var()))

25
	 rf_mean:0.91957
	 rf_var:2.960612488993737e-06
50
	 rf_mean:0.92477
	 rf_var:2.280498676755502e-06
100
	 rf_mean:0.92747
	 rf_var:1.7989561191375065e-06
200
	 rf_mean:0.92875
	 rf_var:2.1791149965394454e-06


In [69]:
for num_round in [300, 400, 500]:
    cv = KFold(n_splits=5, shuffle=True, random_state=1412)
    rf = RFC(n_estimators=num_round, random_state=1412, class_weight={0:1, 1:5})
    rf_cv = cross_val_score(rf, Xtrain, Ytrain, cv=cv, scoring='roc_auc')
    
    print(num_round)
    print('\t rf_mean:{:.5f}'.format(rf_cv.mean()))
    print('\t rf_var:{}'.format(rf_cv.var()))

300
	 rf_mean:0.92907
	 rf_var:2.0491863316897384e-06
400
	 rf_mean:0.92931
	 rf_var:1.973028169956946e-06
500
	 rf_mean:0.92947
	 rf_var:2.0358324266086187e-06


In [70]:
rf = RFC(n_estimators=500, random_state=1412, class_weight={0:1,1:5})

#### GBDT调参

In [71]:
for num_round in [25, 50, 100, 200, 300, 400, 500]:
    cv = KFold(n_splits=5, shuffle=True, random_state=1412)
    gbdt = GBC(n_estimators=num_round, random_state=1412)
    gbdt_cv = cross_val_score(gbdt, Xtrain, Ytrain, cv=cv, scoring='roc_auc')
    
    print(num_round)
    print('\t gbdt_mean:{:.5f}'.format(gbdt_cv.mean()))
    print('\t gbdt_var:{}'.format(gbdt_cv.var()))

25
	 gbdt_mean:0.92294
	 gbdt_var:8.344639149241532e-07
50
	 gbdt_mean:0.92930
	 gbdt_var:4.104757449478226e-07
100
	 gbdt_mean:0.93333
	 gbdt_var:3.4205404458299343e-07
200
	 gbdt_mean:0.93508
	 gbdt_var:4.918121526815208e-07
300
	 gbdt_mean:0.93599
	 gbdt_var:5.262058356572777e-07
400
	 gbdt_mean:0.93639
	 gbdt_var:5.762201477156985e-07
500
	 gbdt_mean:0.93656
	 gbdt_var:5.344904573177395e-07


GBDT的AUC随着树的增多而变大，但考虑到后续可使用学习率调参，为了后续节省时间，n_estimators = 200

In [72]:
gbdt = GBC(n_estimators=200, random_state=1412)

#### XGBoost调参

In [73]:
for num_round in [40, 60, 80]:
    param = {"objective":'binary:logistic',"eval_metric": "auc","scale_pos_weight": 0.5}
    xgbcv = xgb.cv(param, dtrain, num_boost_round=num_round, nfold=5, seed=1412, shuffle=True)
    print(num_round, "xgb:{:.5f}".format(xgbcv.loc[num_round-1,"test-auc-mean"]))

40 xgb:0.93675
60 xgb:0.93667
80 xgb:0.93638


### 学习率

随机森林没有调整学习率的参数，XGBoost与GBDT拥有，其中GBDT的参数是learning_rate（默认0.1），XGBoost的参数是eta（默认0.3）,尝试将学习率的取值向两边拓展

随机森林（无法调参）

GBDT

In [75]:
for lr in [0.05,0.1,0.3,0.5]:
    cv = KFold(n_splits=5, shuffle=True, random_state=1412)
    gbdt = GBC(n_estimators=200, random_state=1412, learning_rate=lr)
    gbdt_cv = cross_val_score(gbdt, Xtrain, Ytrain, cv=cv, scoring='roc_auc')
    
    print(lr)
    print('\t gbdt_mean:{:.5f}'.format(gbdt_cv.mean()))
    print('\t gbdt_var:{}'.format(gbdt_cv.var()))

0.05
	 gbdt_mean:0.93331
	 gbdt_var:3.8601450408924676e-07
0.1
	 gbdt_mean:0.93508
	 gbdt_var:4.918121526815208e-07
0.3
	 gbdt_mean:0.93576
	 gbdt_var:7.4610850412776e-07
0.5
	 gbdt_mean:0.93435
	 gbdt_var:1.169508246046095e-06


In [76]:
for lr in [0.25,0.27,0.31,0.33, 0.35]:
    cv = KFold(n_splits=5, shuffle=True, random_state=1412)
    gbdt = GBC(n_estimators=200, random_state=1412, learning_rate=lr)
    gbdt_cv = cross_val_score(gbdt, Xtrain, Ytrain, cv=cv, scoring='roc_auc')
    
    print(lr)
    print('\t gbdt_mean:{:.5f}'.format(gbdt_cv.mean()))
    print('\t gbdt_var:{}'.format(gbdt_cv.var()))

0.25
	 gbdt_mean:0.93616
	 gbdt_var:3.8304005825755627e-07
0.27
	 gbdt_mean:0.93619
	 gbdt_var:9.093100214831573e-07
0.31
	 gbdt_mean:0.93603
	 gbdt_var:5.083136173604246e-07
0.33
	 gbdt_mean:0.93618
	 gbdt_var:1.151476272575224e-06
0.35
	 gbdt_mean:0.93563
	 gbdt_var:7.378146262465456e-07


GBDT的学习率采用learning_rate=0.27：

In [77]:
gbdt = GBC(n_estimators=200, random_state=1412, learning_rate=0.27)

#### XGBoost

In [78]:
for lr in [0.05,0.1,0.3,0.5]:
    param = {"objective":'binary:logistic', "eval_metric": "auc", "scale_pos_weight": 0.5, "eta":lr}
    xgbcv = xgb.cv(param, dtrain , num_boost_round=40, nfold=5, seed=1412, shuffle=True)
    print(lr, "xgb:{:.5f}".format(xgbcv.loc[40-1,"test-auc-mean"]))

0.05 xgb:0.93261
0.1 xgb:0.93447
0.3 xgb:0.93675
0.5 xgb:0.93485


In [79]:
for lr in [0.25,0.28,0.32,0.35]:
    param = {"objective":'binary:logistic', "eval_metric": "auc", "scale_pos_weight": 0.5, "eta":lr}    
    xgbcv = xgb.cv(param, dtrain , num_boost_round=40, nfold=5, seed=1412, shuffle=True)    
    print(lr, "xgb:{:.5f}".format(xgbcv.loc[40-1,"test-auc-mean"]))

0.25 xgb:0.93656
0.28 xgb:0.93700
0.32 xgb:0.93681
0.35 xgb:0.93641


XGBoost的学习率采用"eta":0.30

In [80]:
param = {"objective":'binary:logistic', "eval_metric": "auc", "scale_pos_weight": 0.5, "eta":0.30}

#### 对抗过拟合(max_depth)

In [81]:
for max_depth in [5,10,15,20,25]:
    cv = KFold(n_splits=5, shuffle=True, random_state=1412)
    
    rf = RFC(n_estimators = 500,random_state=1412 ,class_weight = {0:1, 1:5}, max_depth = max_depth)
    gbdt = GBC(n_estimators = 200, random_state = 1412, learning_rate = 0.27, max_depth = max_depth)
    param = {"objective":'binary:logistic', "eval_metric": "auc", "scale_pos_weight": 0.5, "eta":0.3, "max_depth" : max_depth}
    
    rf_cv = cross_val_score(rf, Xtrain, Ytrain, cv = cv, scoring='roc_auc')
    gbdt_cv = cross_val_score(gbdt, Xtrain, Ytrain, cv = cv, scoring='roc_auc')
    xgbcv = xgb.cv(param, dtrain, num_boost_round=40, nfold=5, seed = 1412, shuffle=True)
    
    print(max_depth)
    print("\t rf:{:.5f}".format(rf_cv.mean()))
    print("\t gbdt:{:.5f}".format(gbdt_cv.mean()))
    print("\t xgb:{:.5f}".format(xgbcv.loc[40-1,"test-auc-mean"]))

5
	 rf:0.90850
	 gbdt:0.93514
	 xgb:0.93614
10
	 rf:0.93042
	 gbdt:0.92789
	 xgb:0.93482
15
	 rf:0.93154
	 gbdt:0.92594
	 xgb:0.92982
20
	 rf:0.92966
	 gbdt:0.92670
	 xgb:0.92901
25
	 rf:0.92954
	 gbdt:0.92529
	 xgb:0.92968


In [86]:
rf = RFC(n_estimators = 500, random_state=1412, class_weight = {0:1, 1:5}, max_depth = 13)

查看各个特征从重要性：

In [87]:
FeatureImportance = pd.concat([pd.DataFrame(Xtrain.columns), pd.DataFrame(rf.feature_importances_), pd.DataFrame(gbdt.feature_importances_)], axis =1)
FeatureImportance.columns = ['特征', '随机森林', 'GBDT']
FeatureImportance.sort_values('随机森林', ascending = False)

NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

### 3. 投票法进行模型融合

In [88]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.ensemble import RandomForestClassifier as RFC
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score, accuracy_score

In [89]:
train=pd.read_csv('train_bf6.csv', index_col = 0)
test =pd.read_csv('test_bf6.csv' , index_col = 0)

In [90]:
Xtrain = train.iloc[:, 1:]
Xtest  = test.iloc[:, 1:]
Ytrain = train.iloc[:, 0]
Ytest  = test.iloc[:, 0]

dtrain = xgb.DMatrix(Xtrain,Ytrain)
dtest = xgb.DMatrix(Xtest,Ytest)

硬投票：适用于默认参数的模型，返回的是预测的标签；
软投票：适用于精调过的模型，返回的是预测的标签的概率；此项目使用软投票

应用软投票进行模型融合

In [91]:
#调参后的模型
rf = RFC(n_estimators=500, random_state=1412, class_weight={0:1,1:5}, max_depth=13)
gbdt = GBC(n_estimators=200, random_state=1412, learning_rate=0.27)
param = {"objective":'binary:logistic', "eval_metric": "auc", "scale_pos_weight": 0.5, "eta":0.3, "seed":1412}

In [92]:
rf = rf.fit(Xtrain, Ytrain)
gbdt = gbdt.fit(Xtrain, Ytrain)
xgbc = xgb.train(param, dtrain, num_boost_round=40)

In [95]:
#在训练集上输出训练概率
rf_prob_train = rf.predict_proba(Xtrain)
gbdt_prob_train = gbdt.predict_proba(Xtrain)
xgb_prob_train  = xgbc.predict(dtrain)

In [96]:
prob_train = pd.concat([pd.DataFrame(rf_prob_train,columns=["rf0","rf1"])
                        ,pd.DataFrame(gbdt_prob_train,columns=["gbc0","gbc1"])
                        ,pd.DataFrame(xgb_prob_train,columns=["xgb1"])]
                                      ,axis=1)

In [97]:
prob_train.head()

Unnamed: 0,rf0,rf1,gbc0,gbc1,xgb1
0,0.325029,0.674971,0.083014,0.916986,0.65124
1,0.374572,0.625428,0.671889,0.328111,0.211359
2,0.856583,0.143417,0.991961,0.008039,0.007826
3,0.534456,0.465544,0.809238,0.190762,0.083628
4,0.733234,0.266766,0.679392,0.320608,0.069288


In [98]:
prob_train['xgb0'] = 1 - prob_train['xgb1']
prob_train.head()

Unnamed: 0,rf0,rf1,gbc0,gbc1,xgb1,xgb0
0,0.325029,0.674971,0.083014,0.916986,0.65124,0.34876
1,0.374572,0.625428,0.671889,0.328111,0.211359,0.788641
2,0.856583,0.143417,0.991961,0.008039,0.007826,0.992174
3,0.534456,0.465544,0.809238,0.190762,0.083628,0.916372
4,0.733234,0.266766,0.679392,0.320608,0.069288,0.930712


In [99]:
rf_num, gbdt_num, xgb_num = 4, 0.8, 0.2
#将所有样本为0的概率按权重加和
prob_train.loc[:,"0"] = rf_num * prob_train.loc[:,'rf0'] + gbdt_num * prob_train.loc[:,'gbc0'] + xgb_num * prob_train.loc[:,"xgb0"]
#将所有样本为0的概率按权重加和
prob_train.loc[:,"1"] = rf_num * prob_train.loc[:,"rf1"] + gbdt_num * prob_train.loc[:,"gbc1"] + xgb_num * prob_train.loc[:,"xgb1"]

prob_train.head()

Unnamed: 0,rf0,rf1,gbc0,gbc1,xgb1,xgb0,0,1
0,0.325029,0.674971,0.083014,0.916986,0.65124,0.34876,1.43628,3.56372
1,0.374572,0.625428,0.671889,0.328111,0.211359,0.788641,2.193526,2.806474
2,0.856583,0.143417,0.991961,0.008039,0.007826,0.992174,4.418335,0.581665
3,0.534456,0.465544,0.809238,0.190762,0.083628,0.916372,2.968488,2.031512
4,0.733234,0.266766,0.679392,0.320608,0.069288,0.930712,3.662593,1.337407


In [100]:
#将为1的概率压缩到[0,1]之间，当做最终的概率来输入auc计算函数
prob_train["adjusted1"] = prob_train['1'] / (prob_train['0']+prob_train['1'])
prob_train.head()

Unnamed: 0,rf0,rf1,gbc0,gbc1,xgb1,xgb0,0,1,adjusted1
0,0.325029,0.674971,0.083014,0.916986,0.65124,0.34876,1.43628,3.56372,0.712744
1,0.374572,0.625428,0.671889,0.328111,0.211359,0.788641,2.193526,2.806474,0.561295
2,0.856583,0.143417,0.991961,0.008039,0.007826,0.992174,4.418335,0.581665,0.116333
3,0.534456,0.465544,0.809238,0.190762,0.083628,0.916372,2.968488,2.031512,0.406302
4,0.733234,0.266766,0.679392,0.320608,0.069288,0.930712,3.662593,1.337407,0.267481


将上述代码打包成函数：

In [101]:
def PredictProb(Xtrain, Ytrain, dtrain, TrainOrTest1, TrainOrTest2):                   
    # 调参后的模型
    rf = RFC(n_estimators = 500,random_state=1412 ,class_weight = {0:1, 1:5}, max_depth = 13)
    gbdt = GBC(n_estimators = 200, random_state = 1412, learning_rate = 0.27)
    param = {"objective":'binary:logistic', "eval_metric": "auc", "scale_pos_weight": 0.5, "eta":0.3, "seed":1412} 

    # 在训练集上进行训练
    rf = rf.fit(Xtrain, Ytrain)
    gbdt = gbdt.fit(Xtrain, Ytrain)
    xgbc = xgb.train(param, dtrain, num_boost_round=40)

    # 在XX集上输出训练概率
    rf_prob   = rf.predict_proba(TrainOrTest1)
    gbdt_prob = gbdt.predict_proba(TrainOrTest1)
    xgb_prob  = xgbc.predict(TrainOrTest2)

    prob = pd.concat([pd.DataFrame(rf_prob,columns=["rf0","rf1"])
                            ,pd.DataFrame(gbdt_prob,columns=["gbc0","gbc1"])
                            ,pd.DataFrame(xgb_prob,columns=["xgb1"])]
                                        ,axis=1)
    prob['xgb0'] = 1 - prob['xgb1']
    rf_num, gbdt_num, xgb_num = 4, 0.8, 0.2
    prob.loc[:,"0"] = rf_num * prob.loc[:,"rf0"] + gbdt_num * prob.loc[:,"gbc0"] + xgb_num * prob.loc[:,"xgb0"]
    prob.loc[:,"1"] = rf_num * prob.loc[:,"rf1"] + gbdt_num * prob.loc[:,"gbc1"] + xgb_num * prob.loc[:,"xgb1"]
    prob["adjusted1"] = prob["1"]/(prob["0"] + prob["1"])

    return prob

#### 模型融合后的训练集AUC和ACC

In [102]:
#计算AUC
roc_auc_score(Ytrain, prob_train['adjusted1'])

0.9609215934499938

In [103]:
# 计算准确率
Ytrain_pred = ((prob_train["adjusted1"]) > 0.5).astype("int")
accuracy_score(Ytrain,Ytrain_pred)

0.8785866678277529

In [104]:
Ytrain_pred = ((prob_train["adjusted1"]) > 0.7).astype("int")
accuracy_score(Ytrain,Ytrain_pred)

0.9049832368180433

In [105]:
# 模型融合后的测试集AUC和ACC
prob_test = PredictProb(Xtrain, Ytrain, dtrain, Xtest, dtest)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').