In [1]:
import pandas as pd
import xgboost as xgb
import sys,random
import pickle
import os

In [2]:
# Output folders for feature scores, saved models and predictions.
# makedirs(exist_ok=True) creates a folder only when it is missing, which
# avoids the separate exists() check that can race with another process.
for _out_dir in ('featurescore', 'model', 'preds'):
    os.makedirs(_out_dir, exist_ok=True)

In [3]:
%%time
# Features: the rank_d and rank_nd tables, joined on the shared 'id' column.

# --- training set ---
train_x_d = pd.read_csv('../../preprocess_data/train_x_float_rank_d.csv')
train_x_nd = pd.read_csv('../../preprocess_data/train_x_float_rank_nd.csv')
train_y = pd.read_csv('../../preprocess_data/train_y_33465.csv')
# 'id' is only the join key, so it is dropped before building the DMatrix.
train_x = train_x_d.merge(train_x_nd, on='id').drop('id', axis=1)
dtrain = xgb.DMatrix(train_x, label=train_y)

# --- validation set (no labels are attached) ---
valid_d = pd.read_csv('../../preprocess_data/valid_float_rank_d.csv')
valid_nd = pd.read_csv('../../preprocess_data/valid_float_rank_nd.csv')
valid = valid_d.merge(valid_nd, on='id')
# Keep the ids aside so predictions can be written next to them later.
valid_id = valid['id']
valid = valid.drop('id', axis=1)
dvalid = xgb.DMatrix(valid)

CPU times: user 35 s, sys: 1min 23s, total: 1min 58s
Wall time: 2min


In [6]:
def pipeline(dtrain,dtest,test_id,iteration,random_seed,gamma,max_depth,lambd,subsample,colsample_bytree,min_child_weight):
    """Train one XGBoost model and write its model, predictions and feature scores.

    Three files are produced, all tagged with ``iteration``:
    ``./model/xgb{iteration}.model``, ``./preds/xgb{iteration}.csv`` and
    ``./featurescore/feature_score_{iteration}.csv``.

    Parameters
    ----------
    dtrain : training data handed to ``xgb.train`` (also used as the only
        entry of the evaluation watchlist).
    dtest : data handed to ``model.predict``.
    test_id : ids written to the ``id`` column of the prediction file;
        presumably a pandas Series named ``id`` -- verify against the caller.
    iteration : value interpolated into the three output file names.
    random_seed : value of the ``seed`` training parameter.
    gamma, lambd, subsample, colsample_bytree, min_child_weight : passed
        through as the XGBoost parameters ``gamma``, ``lambda``,
        ``subsample``, ``colsample_bytree`` and ``min_child_weight``.
    max_depth : tree depth; must be 6, 7 or 8 because the number of boosting
        rounds is chosen from it.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``max_depth`` is not 6, 7 or 8. Raised before any training starts.
    """
    # The number of boosting rounds is tied to the tree depth.
    rounds_by_depth = {6: 3000, 7: 2500, 8: 2000}
    if max_depth not in rounds_by_depth:
        # Previously an unsupported depth left num_boost_round unassigned and
        # failed later with an UnboundLocalError.
        raise ValueError(
            "max_depth must be one of {0}, got {1!r}".format(sorted(rounds_by_depth), max_depth))
    num_boost_round = rounds_by_depth[max_depth]

    params={
    'booster':'gbtree',
    'objective': 'binary:logistic',
#     'scale_pos_weight': float(len(y)-sum(y))/float(sum(y)),
    'eval_metric': 'auc',
    'gamma':gamma,
    'max_depth':max_depth,
    'lambda':lambd,
    'subsample':subsample,
    'colsample_bytree':colsample_bytree,
    'min_child_weight':min_child_weight,
    'eta': 0.04,
    'seed':random_seed,
    'nthread':16
        }
    # Only the training set is monitored, so the logged AUC is the training AUC.
    watchlist  = [(dtrain,'train')]
    model = xgb.train(params,dtrain,num_boost_round=num_boost_round,evals=watchlist)
    model.save_model('./model/xgb{0}.model'.format(iteration))

    #predict test set
    test_y = model.predict(dtest)
    test_result = pd.DataFrame(test_id,columns=["id"])
    test_result['score'] = test_y
    test_result.to_csv("./preds/xgb{0}.csv".format(iteration),index=False,encoding='utf-8')

    #get feature score, highest score first
    feature_score = sorted(model.get_fscore().items(), key=lambda item: item[1], reverse=True)

    with open('./featurescore/feature_score_{0}.csv'.format(iteration),'w') as f:
        # write() for the single header string; writelines() is for the row list
        f.write("feature,score\n")
        f.writelines("{0},{1}\n".format(key,value) for (key,value) in feature_score)
    

In [None]:
%%time
# Candidate values for every hyper-parameter. Ranges given in thousandths
# are scaled down to floats.
random_seed = list(range(1000,2000,20))
max_depth = [6,7,8]
lambd = list(range(200,400,2))
gamma = [step/1000.0 for step in range(100,200,2)]
subsample = [step/1000.0 for step in range(600,700,2)]
colsample_bytree = [step/1000.0 for step in range(250,350,2)]
min_child_weight = [step/1000.0 for step in range(200,300,2)]

# Shuffle each list in place so that position k holds a random candidate.
for _candidates in (random_seed, gamma, max_depth, lambd,
                    subsample, colsample_bytree, min_child_weight):
    random.shuffle(_candidates)

# Save the shuffled lists so this exact run can be repeated later.
with open('params.pkl','wb') as f:
    pickle.dump(
        (random_seed,gamma,max_depth,lambd,subsample,colsample_bytree,min_child_weight),
        f)

#to reproduce my result, uncomment following lines
"""
with open('params_for_reproducing.pkl','rb') as f:
    random_seed,gamma,max_depth,lambd,subsample,colsample_bytree,min_child_weight = pickle.load(f)    
"""

# Train 36 models. The depth cycles through the three shuffled depths and
# every other parameter is taken from position i of its list.
for i in range(36):
    pipeline(
        dtrain,
        dvalid,
        valid_id,
        iteration=i,
        random_seed=random_seed[i],
        gamma=gamma[i],
        max_depth=max_depth[i%3],
        lambd=lambd[i],
        subsample=subsample[i],
        colsample_bytree=colsample_bytree[i],
        min_child_weight=min_child_weight[i],
    )

[0]	train-auc:0.50083
[1]	train-auc:0.5016
[2]	train-auc:0.502593
[3]	train-auc:0.502594
[4]	train-auc:0.502694
[5]	train-auc:0.502694
[6]	train-auc:0.502694
[7]	train-auc:0.502694
[8]	train-auc:0.502989
[9]	train-auc:0.503438
[10]	train-auc:0.503554
[11]	train-auc:0.505637
[12]	train-auc:0.506291
[13]	train-auc:0.506523
[14]	train-auc:0.506414
[15]	train-auc:0.506927
[16]	train-auc:0.507174
[17]	train-auc:0.507174
[18]	train-auc:0.507468
[19]	train-auc:0.507473
[20]	train-auc:0.509148
[21]	train-auc:0.511196
[22]	train-auc:0.511692
[23]	train-auc:0.511695
[24]	train-auc:0.512015
[25]	train-auc:0.512
[26]	train-auc:0.560574
[27]	train-auc:0.561206
[28]	train-auc:0.561583
[29]	train-auc:0.631795
[30]	train-auc:0.631964
[31]	train-auc:0.63254
[32]	train-auc:0.632704
[33]	train-auc:0.67143
[34]	train-auc:0.671533
[35]	train-auc:0.670726
[36]	train-auc:0.687185
[37]	train-auc:0.691081
[38]	train-auc:0.693926
[39]	train-auc:0.700047
[40]	train-auc:0.709362
[41]	train-auc:0.711017
[42]	train

[334]	train-auc:0.865277
[335]	train-auc:0.865517
[336]	train-auc:0.865771
[337]	train-auc:0.866015
[338]	train-auc:0.866212
[339]	train-auc:0.866405
[340]	train-auc:0.866625
[341]	train-auc:0.866845
[342]	train-auc:0.867095
[343]	train-auc:0.867284
[344]	train-auc:0.8676
[345]	train-auc:0.867779
[346]	train-auc:0.868015
[347]	train-auc:0.868252
[348]	train-auc:0.868473
[349]	train-auc:0.868646
[350]	train-auc:0.868813
[351]	train-auc:0.868983
[352]	train-auc:0.869195
[353]	train-auc:0.869434
[354]	train-auc:0.869673
[355]	train-auc:0.869865
[356]	train-auc:0.869969
[357]	train-auc:0.870178
[358]	train-auc:0.870379
[359]	train-auc:0.870509
[360]	train-auc:0.870726
[361]	train-auc:0.870959
[362]	train-auc:0.871159
[363]	train-auc:0.871325
[364]	train-auc:0.871554
[365]	train-auc:0.871734
[366]	train-auc:0.871971
[367]	train-auc:0.872182
[368]	train-auc:0.872375
[369]	train-auc:0.872631
[370]	train-auc:0.872872
[371]	train-auc:0.873125
[372]	train-auc:0.87336
[373]	train-auc:0.873546
[37

[664]	train-auc:0.920141
[665]	train-auc:0.920357
[666]	train-auc:0.92044
[667]	train-auc:0.920609
[668]	train-auc:0.920743
[669]	train-auc:0.920872
[670]	train-auc:0.920988
[671]	train-auc:0.921061
[672]	train-auc:0.921157
[673]	train-auc:0.921276
[674]	train-auc:0.921353
[675]	train-auc:0.921497
[676]	train-auc:0.921585
[677]	train-auc:0.921743
[678]	train-auc:0.92188
[679]	train-auc:0.922034
[680]	train-auc:0.922118
[681]	train-auc:0.922225
[682]	train-auc:0.922292
[683]	train-auc:0.922455
[684]	train-auc:0.922512
[685]	train-auc:0.922631
[686]	train-auc:0.922749
[687]	train-auc:0.922864
[688]	train-auc:0.92296
[689]	train-auc:0.923033
[690]	train-auc:0.923186
[691]	train-auc:0.923323
[692]	train-auc:0.923446
[693]	train-auc:0.923634
[694]	train-auc:0.923713
[695]	train-auc:0.923886
[696]	train-auc:0.923996
[697]	train-auc:0.924084
[698]	train-auc:0.924221
[699]	train-auc:0.924307
[700]	train-auc:0.92442
[701]	train-auc:0.924557
[702]	train-auc:0.92473
[703]	train-auc:0.924825
[704]