In [1]:
# 首先加载需要用到的库
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics
from sklearn.grid_search import GridSearchCV


# Plotting libraries and notebook-wide figure settings
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output
%matplotlib inline
from matplotlib.pylab import rcParams
# Default figure size, given as (width, height)
rcParams['figure.figsize'] = 12, 4

## 载入数据

In [2]:
# Read the modified train/test CSV files; the paths are relative to the
# notebook's working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../Kaggle_data/xgboost/train_modified.csv",
        "../../Kaggle_data/xgboost/test_modified.csv",
    )
)

In [3]:
# Display the shape of each frame (train first, then test) as the cell output.
tuple(frame.shape for frame in (train, test))

((87020, 51), (37717, 50))

In [4]:
# Names of the label column and the row-identifier column; both are excluded
# from the feature list later in the notebook.
target, IDcol = 'Disbursed', 'ID'

In [5]:
# Count how many rows carry each label value to see the class balance.
train.loc[:, 'Disbursed'].value_counts()

0.0    85747
1.0     1273
Name: Disbursed, dtype: int64

## 构建一个函数

要求能有以下几点功能：  
    1.数据建模  
    2.求训练准确率  
    3.求训练集AUC  
    4.根据xgboost交叉验证更新n_estimators  
    5.画出特征的重要度  

In [23]:
def modelfit(alg, dtrain, dtest, predictiors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit ``alg`` on ``dtrain`` and print its training-set accuracy and AUC.

    When ``useTrainCV`` is true, ``xgb.cv`` is run first and ``alg``'s
    ``n_estimators`` is overwritten with the number of rows in the returned
    CV result, so the final fit uses that many boosting rounds.

    Parameters
    ----------
    alg : estimator
        Must expose ``get_xgb_params``, ``get_params``, ``set_params``,
        ``fit``, ``predict`` and ``predict_proba`` (the notebook passes an
        ``XGBClassifier``). It is mutated in place.
    dtrain : DataFrame-like
        Training data; must contain the feature columns and the label column
        named by the module-level ``target``.
    dtest : DataFrame-like
        Not used by this function; kept so existing positional callers keep
        working.
    predictiors : list of str
        Feature column names. The misspelled parameter name is preserved for
        backward compatibility with keyword callers.
    useTrainCV : bool
        Whether to pick ``n_estimators`` with ``xgb.cv`` before fitting.
    cv_folds : int
        Passed to ``xgb.cv`` as ``nfold``.
    early_stopping_rounds : int
        Passed to ``xgb.cv`` as ``early_stopping_rounds``.

    Returns
    -------
    None
        Results are printed; the fitted model is ``alg`` itself.
    """
    # Use the argument the caller passed. Previously the body read a global
    # named `predictors`, so the parameter was silently ignored and the
    # function failed with NameError when no such global existed.
    feature_cols = predictiors
    label_col = target  # module-level label column name

    if useTrainCV:
        booster_params = alg.get_xgb_params()
        # Convert the training data to the DMatrix format xgboost's native API expects.
        cv_matrix = xgb.DMatrix(dtrain[feature_cols].values, label=dtrain[label_col].values)
        # NOTE(review): no `metrics` is passed here, so CV/early stopping uses
        # xgboost's default metric for the objective, while the final fit
        # below asks for 'auc' -- confirm this is intended.
        cvresult = xgb.cv(
            booster_params,
            cv_matrix,
            num_boost_round=alg.get_params()['n_estimators'],
            nfold=cv_folds,
            early_stopping_rounds=early_stopping_rounds,
            fpreproc=None,
            as_pandas=True,
            verbose_eval=None,
            show_stdv=True,
            seed=0,
            callbacks=None,
            shuffle=True,
        )
        # One row per boosting round kept by xgb.cv.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the model on the full training set.
    alg.fit(dtrain[feature_cols], dtrain[label_col], eval_metric='auc')

    # Predict on the training set itself to report training-set metrics.
    dtrain_predictions = alg.predict(dtrain[feature_cols])
    dtrain_predprob = alg.predict_proba(dtrain[feature_cols])[:, 1]

    # Print the report.
    print("\n关于现在这个模型")
    print("准确率 : %.4g" % metrics.accuracy_score(dtrain[label_col].values, dtrain_predictions))
    print("AUC 得分 (训练集): %f" % metrics.roc_auc_score(dtrain[label_col], dtrain_predprob))
    

In [24]:
# Every column except the label and the ID column is used as a feature.
predictors = [col for col in train.columns if col != target and col != IDcol]

# Baseline classifier. n_estimators is only a starting value here: modelfit
# (useTrainCV defaults to True) resets it from the xgb.cv result before fitting.
xgb1 = XGBClassifier(
    learning_rate=0.1, n_estimators=1000,
    max_depth=5, min_child_weight=1, gamma=0,
    subsample=0.8, colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4, scale_pos_weight=1, seed=27,
)
modelfit(xgb1, train, test, predictors)


关于现在这个模型
准确率 : 0.9854
AUC 得分 (训练集): 0.806919


In [None]:
# Grid-search max_depth and min_child_weight (the other hyper-parameters stay
# fixed) using 5-fold cross-validation scored by ROC AUC.
param_test1 = {
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5],
}

gsearch1 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=140,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test1,
    scoring='roc_auc',
    n_jobs=4,
    iid=False,
    cv=5,
)
gsearch1.fit(train[predictors], train[target])