In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

In [2]:
%%time
# 导入训练数据集
data_date = pd.read_csv('../../preprocess_data/train_x_date.csv').drop(columns=['id','loan_hour'])
data_raw = joblib.load('../../preprocess_data_new/train_ax_nodup.lz4').head(33465).drop(columns=['loan_dt','id','tag'])
data_null = pd.read_csv('../../preprocess_data_new/train_ax_row_null.csv',nrows=33465).drop(columns=['id'])
data_tag = pd.read_csv('../../preprocess_data/train_x_33465.csv',usecols=['tag'])
week_df = joblib.load('../../preprocess_data_discrete/week_df.lz4').head(33465)
data = pd.concat([data_date,data_raw,data_null,data_tag,week_df],axis=1,copy=False)
data_label = pd.read_csv('../../preprocess_data/train_y_33465.csv',usecols=['label'])
x = data.fillna(-1).values
y = data_label.values.ravel()
x.shape

CPU times: user 25.4 s, sys: 5.34 s, total: 30.7 s
Wall time: 30.7 s


#### num_boost_round

In [None]:
%%time
param_test1 = {'n_estimators':range(150,300,10)}
xgb = XGBClassifier(max_depth=5,
                      learning_rate =0.05, 
                      booster='gbtree',
                      objective='binary:logistic',
                      early_stopping_rounds=100,
                      scale_pos_weight=float(len(y)-np.sum(y))/float(np.sum(y)),
                      eval_metric='auc',
                      gamma=1,
                      reg_lambda=1,
                      subsample=0.9,
                      min_child_weight=1,
                      seed=2018,
                      silent=False,
                      n_jobs=24
                             )
gsearch1 = GridSearchCV(estimator = xgb, param_grid = param_test1, scoring='roc_auc',cv=5,n_jobs=8)
gsearch1.fit(x,y)

In [None]:
import joblib
joblib.dump(gsearch1,'./xgb/gsearch1')

In [8]:
gsearch1.best_params_,gsearch1.best_score_

({'n_estimators': 220}, 0.8262723491029047)

#### max_depth

In [9]:
param_test2 = {'max_depth':range(3,8,1)}

In [10]:
%%time
xgb = XGBClassifier(n_estimators = 220,
                      learning_rate =0.05, 
                      booster='gbtree',
                      objective='binary:logistic',
                      early_stopping_rounds=100,
                      scale_pos_weight=float(len(y)-np.sum(y))/float(np.sum(y)),
                      eval_metric='auc',
                      gamma=1,
                      reg_lambda=1,
                      subsample=0.9,
                      min_child_weight=1,
                      seed=2018,
                      silent=False,
                      n_jobs=24
                             )
gsearch2 = GridSearchCV(estimator = xgb, param_grid = param_test2, scoring='roc_auc',cv=5,n_jobs=8)
gsearch2.fit(x,y)

CPU times: user 29min 42s, sys: 10 s, total: 29min 52s
Wall time: 41min 7s


In [11]:
import joblib
joblib.dump(gsearch2,'./xgb/gsearch2')

['./xgb/gsearch2']

In [12]:
gsearch2.best_params_,gsearch2.best_score_

({'max_depth': 3}, 0.8313512310612271)

### min_child_weight

In [13]:
param_test3 = {'min_child_weight':range(1,6,1)}
xgb = XGBClassifier(n_estimators = 220,
                      max_depth = 3,
                      learning_rate =0.05, 
                      booster='gbtree',
                      objective='binary:logistic',
                      early_stopping_rounds=100,
                      scale_pos_weight=float(len(y)-np.sum(y))/float(np.sum(y)),
                      eval_metric='auc',
                      gamma=1,
                      reg_lambda=1,
                      subsample=0.9,
                      seed=2018,
                      silent=False,
                      n_jobs=24
                             )
gsearch3 = GridSearchCV(estimator = xgb, param_grid = param_test3, scoring='roc_auc',cv=5,n_jobs=8)
gsearch3.fit(x,y)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=100, eval_metric='auc',
       gamma=1, learning_rate=0.05, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=220, n_jobs=24,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=14.22520473157416,
       seed=2018, silent=False, subsample=0.9),
       fit_params=None, iid='warn', n_jobs=8,
       param_grid={'min_child_weight': range(1, 6)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [14]:
import joblib
joblib.dump(gsearch3,'./xgb/gsearch3')

['./xgb/gsearch3']

In [15]:
gsearch3.best_params_,gsearch3.best_score_

({'min_child_weight': 2}, 0.8314357495296593)

### subsample

In [20]:
%%time
param_test4 = {
 'subsample':[i/10.0 for i in range(6,10,1)],
#  'colsample_bytree':[i/10.0 for i in range(6,10,1)]
}
xgb = XGBClassifier(n_estimators = 220,
                      max_depth = 3,
                      min_child_weight = 2,
                      learning_rate =0.05, 
                      booster='gbtree',
                      objective='binary:logistic',
                      early_stopping_rounds=100,
                      scale_pos_weight=float(len(y)-np.sum(y))/float(np.sum(y)),
                      eval_metric='auc',
                      gamma=1,
                      reg_lambda=1,
                      seed=2018,
                      silent=False,
                      n_jobs=24
                             )
gsearch4 = GridSearchCV(estimator = xgb, param_grid = param_test4, scoring='roc_auc',cv=5,n_jobs=1,pre_dispatch=8)
gsearch4.fit(x,y)

CPU times: user 9h 53min 49s, sys: 8min 26s, total: 10h 2min 15s
Wall time: 31min 31s


In [21]:
import joblib
joblib.dump(gsearch4,'./xgb/gsearch4')

['./xgb/gsearch4']

In [25]:
gsearch4.best_params_,gsearch4.best_score_

({'subsample': 0.9}, 0.8314357495296593)

### 综合

In [None]:
xgb = XGBClassifier(n_estimators = 220,
                      max_depth = 3,
                      min_child_weight = 2,
                      learning_rate =0.05, 
                      subsample = 0.9,
                      booster='gbtree',
                      objective='binary:logistic',
                      early_stopping_rounds=100,
                      scale_pos_weight=float(len(y)-np.sum(y))/float(np.sum(y)),
                      eval_metric='auc',
                      gamma=1,
                      reg_lambda=1,
                      seed=2018,
                      silent=False,
                      n_jobs=24
                             )