In [9]:
import pandas as pd
import xgboost as xgb
import pickle
import os
import sys,random

In [2]:
# Make sure the folders the training pipeline writes into exist.
# os.makedirs(..., exist_ok=True) is a no-op for an existing directory, so the
# cell can be re-run safely and there is no gap between "check" and "create".
for out_dir in ('featurescore', 'model', 'preds'):
    os.makedirs(out_dir, exist_ok=True)

In [6]:
%%time
# Load the pre-processed data.
# Rank features have to be normalised before use: every frame is divided by
# its own number of rows.
train_x = pd.read_csv("../../preprocess_data/train_x_float_rank.csv").drop(columns=["id"])
train_x = train_x / len(train_x)
# NOTE(review): the whole label frame is passed as `label` -- confirm that the
# CSV holds nothing but the target column.
train_y = pd.read_csv("../../preprocess_data/train_y_33465.csv")
dtrain = xgb.DMatrix(train_x, label=train_y)

valid = pd.read_csv("../../preprocess_data/valid_float_rank.csv")
valid_id = valid.pop("id")  # take the id column out of the feature frame, keeping it for later
valid = valid / len(valid)
dvalid = xgb.DMatrix(valid)

CPU times: user 40.6 s, sys: 2.85 s, total: 43.4 s
Wall time: 42 s


In [14]:
def pipeline(dtrain,dtest,test_id,iteration,random_seed,gamma,max_depth,lambd,subsample,colsample_bytree,min_child_weight,num_boost_round=1350,eta=0.04):
    """Train one XGBoost model and write its model file, predictions and feature scores.

    Parameters
    ----------
    dtrain
        Training matrix handed to ``xgb.train``; it is also the only entry of
        the evaluation watchlist.
    dtest
        Matrix handed to ``model.predict``.
    test_id
        Ids stored in the ``id`` column of the prediction file, one per
        predicted row.
    iteration
        Value inserted into the three output file names.
    random_seed, gamma, max_depth, lambd, subsample, colsample_bytree, min_child_weight
        Forwarded to the booster as ``seed``, ``gamma``, ``max_depth``,
        ``lambda``, ``subsample``, ``colsample_bytree`` and
        ``min_child_weight``.
    num_boost_round
        Number of boosting rounds (default 1350).
    eta
        Value of the ``eta`` booster parameter (default 0.04).

    Returns
    -------
    None
        Results are written to ``./model/xgb{iteration}.model``,
        ``./preds/xgb{iteration}.csv`` and
        ``./featurescore/feature_score_{iteration}.csv``.
    """
    params={
        'booster':'gbtree',
        'objective': 'binary:logistic',
        # NOTE: 'scale_pos_weight' is deliberately left unset here.
        'eval_metric': 'auc',
        'gamma':gamma,
        'max_depth':max_depth,
        'lambda':lambd,
        'subsample':subsample,
        'colsample_bytree':colsample_bytree,
        'min_child_weight':min_child_weight,
        'eta': eta,
        'seed':random_seed,
        'nthread':-1
        }

    # The function writes into these folders; create them if they are missing
    # so it does not depend on another cell having been run first.
    for out_dir in ('./model', './preds', './featurescore'):
        os.makedirs(out_dir, exist_ok=True)

    watchlist  = [(dtrain,'train')]
    model = xgb.train(params,dtrain,num_boost_round=num_boost_round,evals=watchlist)
    model.save_model('./model/xgb{0}.model'.format(iteration))

    # Predict the test set and store one "id,score" row per prediction.
    test_y = model.predict(dtest)
    test_result = pd.DataFrame({"id": test_id, "score": test_y})
    test_result.to_csv("./preds/xgb{0}.csv".format(iteration),index=False,encoding='utf-8')

    # Save the feature scores, highest score first.
    feature_score = sorted(model.get_fscore().items(), key=lambda item: item[1], reverse=True)
    with open('./featurescore/feature_score_{0}.csv'.format(iteration),'w') as f:
        f.write("feature,score\n")
        f.writelines("{0},{1}\n".format(name, score) for name, score in feature_score)

In [15]:
# Candidate values for every tunable hyper-parameter.
random_seed = list(range(1000, 2000, 10))
gamma = [step / 1000.0 for step in range(100, 200)]
max_depth = [6, 7, 8]
lambd = list(range(100, 200))
subsample = [step / 1000.0 for step in range(500, 700, 2)]
colsample_bytree = [step / 1000.0 for step in range(250, 350)]
min_child_weight = [step / 1000.0 for step in range(200, 300)]

# Shuffle every list in place, independently of the others. The shuffles draw
# from the global `random` state; no seed is set in the visible cells, so the
# order differs from run to run.
for candidates in (random_seed, gamma, max_depth, lambd,
                   subsample, colsample_bytree, min_child_weight):
    random.shuffle(candidates)

In [1]:
%%time
# Save the shuffled hyper-parameter lists so this run can be reproduced.
with open('params.pkl','wb') as f:
    pickle.dump((random_seed,gamma,max_depth,lambd,subsample,colsample_bytree,min_child_weight),f)

# To reproduce an earlier result, set this to the pickle that run saved,
# e.g. 'params_for_reproducing.pkl'. Leave it as None to use the lists above.
REPRODUCE_PARAMS_FILE = None
if REPRODUCE_PARAMS_FILE is not None:
    # Pickles must be opened in binary mode ('rb'); text mode fails on Python 3.
    # SECURITY: unpickling can execute arbitrary code -- only load files you trust.
    with open(REPRODUCE_PARAMS_FILE,'rb') as f:
        random_seed,gamma,max_depth,lambd,subsample,colsample_bytree,min_child_weight = pickle.load(f)

# Train N_MODELS xgb models, one per position in the hyper-parameter lists.
N_MODELS = 100
for i in range(N_MODELS):
    pipeline(
        dtrain, dvalid, valid_id, i,
        random_seed[i],
        gamma[i],
        max_depth[i % len(max_depth)],  # fewer depths than models: cycle through them
        lambd[i],
        subsample[i],
        colsample_bytree[i],
        min_child_weight[i],
    )