| #!/usr/bin/python | |
| import sys | |
| import numpy as np | |
| sys.path.append('../../wrapper') | |
| import xgboost as xgb | |
### load data and do training
def _label_to_int(token):
    """Return 1 when the label field is 's', otherwise 0.

    np.loadtxt passes converters either bytes or str depending on the NumPy
    version and its `encoding` argument. Comparing only against b's' turns
    every label into 0 when str is passed, so normalise to str first.
    """
    if isinstance(token, bytes):
        token = token.decode('utf-8')
    return int(token == 's')


# skiprows=1 skips the first row of the CSV; column 32 is converted to 0/1.
train = np.loadtxt('./data/training.csv', delimiter=',', skiprows=1,
                   converters={32: _label_to_int})
# Column layout used below: 1..30 -> features, 31 -> weight, 32 -> label
# (column 0 is not used).
label = train[:, 32]
data = train[:, 1:31]
weight = train[:, 31]
# -999.0 is passed as the value xgboost should treat as missing.
dtrain = xgb.DMatrix(data, label=label, missing=-999.0, weight=weight)
param = {'max_depth': 6, 'eta': 0.1, 'silent': 1,
         'objective': 'binary:logitraw', 'nthread': 4}
num_round = 120
print('running cross validation, with preprocessing function')
| # define the preprocessing function | |
| # used to return the preprocessed training, test data, and parameter | |
| # we can use this to do weight rescale, etc. | |
| # as an example, we try to set scale_pos_weight | |
def fpreproc(dtrain, dtest, param):
    """Preprocess one cross-validation fold.

    Sets ``scale_pos_weight`` to the negative/positive label ratio of the
    training split, and rescales the instance weights of both splits so that
    each split's weights sum to the combined weight of the two splits.

    Args:
        dtrain: training split; must provide ``get_label``, ``get_weight``
            and ``set_weight``.
        dtest: evaluation split; must provide ``get_weight`` and
            ``set_weight``.
        param: booster parameters as a dict. It is not modified; an updated
            copy is returned instead.

    Returns:
        A ``(dtrain, dtest, param)`` tuple: the same two matrix objects (with
        their weights updated in place) and the per-fold parameter dict.
    """
    # Work on a copy so that one fold's ratio never leaks into the caller's
    # dict or into another fold.
    param = dict(param)

    label = dtrain.get_label()
    ratio = float(np.sum(label == 0)) / np.sum(label == 1)
    param['scale_pos_weight'] = ratio

    wtrain = dtrain.get_weight()
    wtest = dtest.get_weight()
    # np.sum runs in native code (the builtin sum iterates element by
    # element); accumulating in float64 limits rounding error on large arrays.
    train_total = float(np.sum(wtrain, dtype=np.float64))
    test_total = float(np.sum(wtest, dtype=np.float64))
    combined = train_total + test_total

    # A split whose weights sum to zero (e.g. no weights were set) cannot be
    # rescaled; leave it untouched instead of dividing by zero.
    if train_total:
        dtrain.set_weight(wtrain * (combined / train_total))
    if test_total:
        dtest.set_weight(wtest * (combined / test_total))
    return (dtrain, dtest, param)
| # do cross validation, for each fold | |
| # the dtrain, dtest, param will be passed into fpreproc | |
| # then the return value of fpreproc will be used to generate | |
| # results of that fold | |
# metrics is passed as a list rather than a set: set iteration order depends
# on string hashing, so the metric order could differ from run to run.
xgb.cv(param, dtrain, num_round, nfold=5,
       metrics=['ams@0.15', 'auc'], seed=0, fpreproc=fpreproc)