In [1]:
import pandas as pd
import xgboost as xgb
import sys,random
import pickle
import os
import numpy as np

In [2]:
# Create the output folders used later in the notebook (feature scores, saved
# models, prediction files). exist_ok=True makes the cell safe to re-run and
# avoids the check-then-create race of `if not exists: mkdir`.
for output_dir in ('featurescore', 'model', 'preds'):
    os.makedirs(output_dir, exist_ok=True)

In [3]:
# Read the rank_d feature-score table and keep the names found in positional
# rows 10..809 of its `feature` column. The resulting list is what the
# data-loading cell passes as `usecols` for the rank_d CSV files.
rank_d_scores = pd.read_csv('./rank_d_feature_score.csv')
train_x_d_cols = list(rank_d_scores['feature'].iloc[10:810])

In [8]:
%%time
#use rank_d  and rank_nd feature
#load data
train_x_date_w = pd.read_csv('../../preprocess_data/train_x_date_w.csv').drop(columns=['id'])
train_x_raw = pd.read_csv('../../preprocess_data/train_x_filter_null.csv').drop(columns=['id','tag'])
train_x_null = pd.read_csv('../../preprocess_data/train_x_null.csv').drop(columns=['id'])
train_x_int = pd.read_csv('../../preprocess_data/train_x_int_filter.csv').drop(columns=['id']) # valid中无‘tag’
train_x_d = pd.read_csv('../../preprocess_data/train_x_float_rank_d.csv',usecols=train_x_d_cols)
train_x_nd = pd.read_csv('../../preprocess_data/train_x_float_rank_nd.csv').drop(columns=['id'])


train_x = pd.concat([train_x_date_w,train_x_null,train_x_int,train_x_d,train_x_nd],axis=1,ignore_index=True,copy=False)
train_y = pd.read_csv('../../preprocess_data/train_y_33465.csv')
print('正负样本不均衡',train_y.label.value_counts().to_dict())
dtrain = xgb.DMatrix(train_x.values, label=train_y.values)

valid_date_w = pd.read_csv('../../preprocess_data/valid_date_w.csv')
valid_raw = pd.read_csv('../../preprocess_data/valid_filter_null.csv').drop(columns=['id'])
valid_null = pd.read_csv('../../preprocess_data/valid_null.csv').drop(columns=['id'])
valid_int = pd.read_csv('../../preprocess_data/valid_int_filter.csv').drop(columns=['id'])
valid_d = pd.read_csv('../../preprocess_data/valid_float_rank_d.csv',usecols=train_x_d_cols)
valid_nd = pd.read_csv('../../preprocess_data/valid_float_rank_nd.csv').drop(columns=['id'])
valid = pd.concat([valid_date_w,valid_null,valid_int,valid_d,valid_nd],axis=1,ignore_index=True,copy=False)
# pd.concat后特征名没有了，变成数字了
valid_id = valid.iloc[:,0].values
valid = valid.drop(0,axis=1)
dvalid = xgb.DMatrix(valid.values)

正负样本不均衡 {0.0: 31267, 1.0: 2198}
CPU times: user 51.6 s, sys: 2.06 s, total: 53.6 s
Wall time: 53.7 s


In [9]:
def pipeline(dtrain,dtest,test_id,iteration,random_seed,gamma,max_depth,lambd,subsample,colsample_bytree,min_child_weight):
    """Train one XGBoost model and persist its model, predictions and feature scores.

    Parameters
    ----------
    dtrain : xgb.DMatrix
        Labelled training matrix. Its labels are also used to derive
        ``scale_pos_weight`` (negatives / positives).
    dtest : xgb.DMatrix
        Matrix to score with the trained model.
    test_id : array-like
        Row ids written next to the scores; must line up with ``dtest`` rows.
    iteration : int
        Run index, used only to name the three output files.
    random_seed, gamma, max_depth, lambd, subsample, colsample_bytree, min_child_weight
        Booster hyper-parameters forwarded to ``xgb.train`` (``lambd`` is
        passed as ``lambda``).

    Raises
    ------
    ValueError
        If ``max_depth`` is not one of the depths with a configured number of
        boosting rounds (6, 7 or 8).

    Side effects
    ------------
    Writes ``./model/xgb{iteration}.model``, ``./preds/xgb{iteration}.csv`` and
    ``./featurescore/feature_score_{iteration}.csv``; the three directories
    must already exist.
    """
    # Deeper trees get fewer boosting rounds.
    rounds_by_depth = {6: 1200, 7: 1000, 8: 600}
    if max_depth not in rounds_by_depth:
        raise ValueError(
            "max_depth must be one of {0}, got {1!r}".format(sorted(rounds_by_depth), max_depth)
        )
    num_boost_round = rounds_by_depth[max_depth]

    # Derive the class ratio from the matrix that is actually trained on,
    # rather than from a notebook-level global.
    labels = np.asarray(dtrain.get_label(), dtype=np.float64)
    n_positive = float(labels.sum())
    n_negative = float(labels.size) - n_positive

    # 'early_stopping_rounds' is a keyword of xgb.train, not a booster
    # parameter, so the entry that used to sit in this dict had no effect.
    # It is intentionally not enabled: the only eval set is the training data.
    params={
    'booster':'gbtree',
    'objective': 'binary:logistic',
    'scale_pos_weight': n_negative/n_positive,  # negatives divided by positives
    'eval_metric': 'auc',
    'gamma':gamma,
    'max_depth':max_depth,
    'lambda':lambd,
    'subsample':subsample,
    'colsample_bytree':colsample_bytree,
    'min_child_weight':min_child_weight,
    'eta': 0.04,
    'seed':random_seed,
    'nthread':16
        }
    watchlist  = [(dtrain,'train')]
    model = xgb.train(params,dtrain,num_boost_round=num_boost_round,evals=watchlist)
    model.save_model('./model/xgb{0}.model'.format(iteration))

    # Predict the test set and store one score per id.
    test_y = model.predict(dtest)
    test_result = pd.DataFrame(test_id,columns=["id"])
    test_result['score'] = test_y
    test_result.to_csv("./preds/xgb{0}.csv".format(iteration),index=False,encoding='utf-8')

    # Dump the feature scores reported by the booster, highest score first.
    feature_score = model.get_fscore()
    feature_score = sorted(feature_score.items(), key=lambda x:x[1],reverse=True)
    fs = ["{0},{1}\n".format(key,value) for (key,value) in feature_score]

    with open('./featurescore/feature_score_{0}.csv'.format(iteration),'w') as f:
        f.write("feature,score\n")
        f.writelines(fs)
    

In [10]:
%%time
random_seed = list(range(0,50000,100))
gamma = [i/1000.0 for i in range(100,200,2)]
max_depth = [6,7,8]
lambd = list(range(200,400,2))
subsample = [i/1000.0 for i in range(600,700,2)]
colsample_bytree = [i/1000.0 for i in range(250,350,2)]
min_child_weight = [i/1000.0 for i in range(200,300,2)]
random.shuffle(random_seed)
random.shuffle(gamma)
random.shuffle(max_depth)
random.shuffle(lambd)
random.shuffle(subsample)
random.shuffle(colsample_bytree)
random.shuffle(min_child_weight)

with open('params.pkl','wb') as f:
    pickle.dump((random_seed,gamma,max_depth,lambd,subsample,colsample_bytree,min_child_weight),f)

#to reproduce my result, uncomment following lines
"""
with open('params_for_reproducing.pkl','rb') as f:
    random_seed,gamma,max_depth,lambd,subsample,colsample_bytree,min_child_weight = pickle.load(f)    
"""

for i in range(36):
    pipeline(dtrain,dvalid,valid_id,i,random_seed[i],gamma[i],max_depth[i%3],lambd[i],subsample[i],colsample_bytree[i],min_child_weight[i])

[0]	train-auc:0.751637
[1]	train-auc:0.775147
[2]	train-auc:0.788818
[3]	train-auc:0.79652
[4]	train-auc:0.802171
[5]	train-auc:0.802356
[6]	train-auc:0.805375
[7]	train-auc:0.8072
[8]	train-auc:0.808065
[9]	train-auc:0.810724
[10]	train-auc:0.811375
[11]	train-auc:0.812377
[12]	train-auc:0.81353
[13]	train-auc:0.815212
[14]	train-auc:0.816118
[15]	train-auc:0.817151
[16]	train-auc:0.818983
[17]	train-auc:0.820531
[18]	train-auc:0.820584
[19]	train-auc:0.821889
[20]	train-auc:0.823032
[21]	train-auc:0.824735
[22]	train-auc:0.826782
[23]	train-auc:0.828339
[24]	train-auc:0.829237
[25]	train-auc:0.830585
[26]	train-auc:0.83163
[27]	train-auc:0.833115
[28]	train-auc:0.834173
[29]	train-auc:0.835742
[30]	train-auc:0.837033
[31]	train-auc:0.838052
[32]	train-auc:0.839319
[33]	train-auc:0.840112
[34]	train-auc:0.840832
[35]	train-auc:0.841543
[36]	train-auc:0.84272
[37]	train-auc:0.84332
[38]	train-auc:0.844737
[39]	train-auc:0.845652
[40]	train-auc:0.846315
[41]	train-auc:0.847458
[42]	trai