In [17]:
import pandas as pd
import xgboost as xgb
import sys,random
import pickle
import os
import numpy as np

In [18]:
# Create the output directories for feature scores, saved models and
# per-model predictions. exist_ok=True makes the cell safe to re-run and
# avoids the check-then-create race of os.path.exists() + os.mkdir().
for out_dir in ('train_30465_featurescore', 'train_30465_model', 'train_30465_preds'):
    os.makedirs(out_dir, exist_ok=True)

In [19]:
# Read the rank_d feature-score table and keep the `feature` names found in
# rows 10..809 (800 names). These names are later passed as `usecols` when the
# rank_d feature files are loaded.
# NOTE(review): presumably the score file is ordered by importance and the
# first 10 rows are skipped on purpose -- confirm.
_rank_d_scores = pd.read_csv('./rank_d_feature_score.csv')
train_x_d_cols = _rank_d_scores['feature'].iloc[10:810].tolist()

In [20]:
%%time
# Build the training and prediction matrices from the preprocessed feature
# files, using the selected rank_d columns plus all rank_nd columns.
# Load the training feature groups; the 'id' column is dropped from each group
# it is read with so that only feature columns remain.
train_x_date = pd.read_csv('../../preprocess_data/train_x_date.csv').drop(columns=['id'])
train_x_null = pd.read_csv('../../preprocess_data/train_x_null.csv').drop(columns=['id'])
train_x_int = pd.read_csv('../../preprocess_data/train_x_int.csv').drop(columns=['id','tag']) # the valid set has no 'tag' column
# Only the rank_d columns chosen in train_x_d_cols are read.
train_x_d = pd.read_csv('../../preprocess_data/train_x_float_rank_d.csv',usecols=train_x_d_cols)
train_x_nd = pd.read_csv('../../preprocess_data/train_x_float_rank_nd.csv').drop(columns=['id'])

# Column-wise concatenation; ignore_index=True relabels the columns 0..N-1,
# so the original feature names are not kept.
train_x = pd.concat([train_x_date,train_x_null,train_x_int,train_x_d,train_x_nd],axis=1,ignore_index=True,copy=False)
# Keep the first 30465 rows only.
# NOTE(review): presumably the remaining rows are held out -- confirm.
train_x = train_x.iloc[:30465]
 
train_y = pd.read_csv('../../preprocess_data/train_y_33465.csv')
train_y = train_y.iloc[:30465]
# Print the label counts; the message text means "positive/negative samples are imbalanced".
print('正负样本不均衡',train_y.label.value_counts().to_dict())
# NOTE(review): every column of train_y is passed as the label -- this assumes
# train_y holds only the 'label' column; confirm.
dtrain = xgb.DMatrix(train_x.values, label=train_y.values)

# Same feature groups for the unlabelled data. 'id' is deliberately NOT dropped
# from unlabel_date: it becomes column 0 after the concat and is split off below.
unlabel_date = pd.read_csv('../../preprocess_data/unlabel_x_date.csv')
unlabel_null = pd.read_csv('../../preprocess_data/unlabel_x_null.csv').drop(columns=['id'])
unlabel_int = pd.read_csv('../../preprocess_data/unlabel_x_int.csv').drop(columns=['id','tag'])
unlabel_d = pd.read_csv('../../preprocess_data/unlabel_x_float_rank_d.csv',usecols=train_x_d_cols)
unlabel_nd = pd.read_csv('../../preprocess_data/unlabel_x_float_rank_nd.csv').drop(columns=['id'])
unlabel = pd.concat([unlabel_date,unlabel_null,unlabel_int,unlabel_d,unlabel_nd],axis=1,ignore_index=True,copy=False)
# After pd.concat the feature names are gone; the columns are now integers.
# Column 0 is taken as the ids; the remaining columns are the features.
test_id = unlabel.iloc[:,0].values
test = unlabel.drop(0,axis=1)
dtest = xgb.DMatrix(test.values)

正负样本不均衡 {0.0: 28628, 1.0: 1837}
CPU times: user 23.3 s, sys: 2.03 s, total: 25.3 s
Wall time: 25.7 s


In [21]:
# Sanity check: display (rows, feature columns) of the assembled training frame.
train_x.shape

(30465, 1305)

In [22]:
def pipeline(dtrain,dtest,test_id,iteration,random_seed,gamma,max_depth,lambd,subsample,colsample_bytree,min_child_weight):
    """Train one XGBoost model and persist the model, its predictions and feature scores.

    Outputs (all indexed by ``iteration``):
      * ``./train_30465_model/xgb{iteration}.model``            -- the trained booster
      * ``./train_30465_preds/xgb{iteration}.csv``              -- columns ``id,score`` for ``dtest``
      * ``./train_30465_featurescore/feature_score_{iteration}.csv`` -- columns ``feature,score``

    Parameters
    ----------
    dtrain : xgb.DMatrix
        Labelled training matrix; its labels are also used to compute
        ``scale_pos_weight`` (negatives / positives).
    dtest : xgb.DMatrix
        Matrix to predict on.
    test_id : array-like
        Ids written next to the predictions, in the same row order as ``dtest``.
    iteration : int
        Index of this model, used in the output file names.
    random_seed : int
        Passed to XGBoost as ``seed``.
    gamma, max_depth, lambd, subsample, colsample_bytree, min_child_weight
        Passed to XGBoost under the same names (``lambd`` as ``lambda``).
        ``max_depth`` must be 6, 7 or 8 because it selects the number of
        boosting rounds.

    Raises
    ------
    ValueError
        If ``max_depth`` is not one of 6, 7, 8.
    """
    # Deeper trees get fewer boosting rounds. Validate up front instead of
    # failing later with an unbound `num_boost_round`.
    rounds_by_depth = {6: 1200, 7: 1000, 8: 600}
    if max_depth not in rounds_by_depth:
        raise ValueError(
            "unsupported max_depth {0!r}; expected one of {1}".format(
                max_depth, sorted(rounds_by_depth)))
    num_boost_round = rounds_by_depth[max_depth]

    # Class-imbalance weight = negatives / positives, computed from the labels
    # actually stored in dtrain rather than from a notebook-level global.
    labels = dtrain.get_label()
    n_pos = float(np.sum(labels))
    n_neg = float(len(labels)) - n_pos

    # NOTE: 'early_stopping_rounds' was previously listed here, but it is an
    # argument of xgb.train(), not a booster parameter, so it had no effect.
    # It is left out on purpose: the only eval set is the training data, so
    # early stopping would not be meaningful and all rounds are run as before.
    params={
    'booster':'gbtree',
    'objective': 'binary:logistic',
    'scale_pos_weight': n_neg/n_pos,
    'eval_metric': 'auc',
    'gamma':gamma,
    'max_depth':max_depth,
    'lambda':lambd,
    'subsample':subsample,
    'colsample_bytree':colsample_bytree,
    'min_child_weight':min_child_weight,
    'eta': 0.04,
    'seed':random_seed,
    'nthread':16
        }
    # Training AUC is reported for monitoring only.
    watchlist  = [(dtrain,'train')]
    model = xgb.train(params,dtrain,num_boost_round=num_boost_round,evals=watchlist)
    model.save_model('./train_30465_model/xgb{0}.model'.format(iteration))

    # Predict the test set and store id/score pairs.
    test_y = model.predict(dtest)
    test_result = pd.DataFrame(test_id,columns=["id"])
    test_result['score'] = test_y
    test_result.to_csv("./train_30465_preds/xgb{0}.csv".format(iteration),index=False,encoding='utf-8')

    # Feature scores, highest first.
    feature_score = sorted(model.get_fscore().items(), key=lambda x:x[1],reverse=True)

    with open('./train_30465_featurescore/feature_score_{0}.csv'.format(iteration),'w') as f:
        f.write("feature,score\n")
        f.writelines("{0},{1}\n".format(key,value) for (key,value) in feature_score)
    
    

In [1]:
%%time
def _milli_range(start, stop, step=2):
    """Return range(start, stop, step) with every value divided by 1000.0."""
    return [value / 1000.0 for value in range(start, stop, step)]


# Candidate values for every hyper-parameter.
random_seed = list(range(0, 1000, 25))
gamma = _milli_range(100, 200)
max_depth = [6, 7, 8]
lambd = list(range(200, 400, 2))
subsample = _milli_range(600, 700)
colsample_bytree = _milli_range(250, 350)
min_child_weight = _milli_range(200, 300)

# Shuffle each candidate list in place; model i then uses element i of each list.
_search_space = (random_seed, gamma, max_depth, lambd, subsample, colsample_bytree, min_child_weight)
for _candidates in _search_space:
    random.shuffle(_candidates)

# Persist the shuffled lists so this exact run can be repeated.
with open('params.pkl', 'wb') as f:
    pickle.dump(_search_space, f)

# To reproduce the author's result, turn the string below back into code.
"""
with open('params_for_reproducing.pkl','rb') as f:
    random_seed,gamma,max_depth,lambd,subsample,colsample_bytree,min_child_weight = pickle.load(f)    
"""

# Train 36 models; max_depth cycles through the three shuffled depths.
for i in range(36):
    pipeline(
        dtrain,
        dtest,
        test_id,
        iteration=i,
        random_seed=random_seed[i],
        gamma=gamma[i],
        max_depth=max_depth[i % 3],
        lambd=lambd[i],
        subsample=subsample[i],
        colsample_bytree=colsample_bytree[i],
        min_child_weight=min_child_weight[i],
    )

In [None]:
# Average the `score` column of every per-model prediction file.
# All files are written by pipeline() with the same test_id order, so the
# scores are summed row by row.
pred_dir = 'train_30465_preds'
# Only CSV files count (the directory may also hold e.g. checkpoint folders);
# sorting makes the processing order deterministic.
files = sorted(name for name in os.listdir(pred_dir) if name.endswith('.csv'))
if not files:
    raise FileNotFoundError('no prediction CSV files found in %s' % pred_dir)

pred = pd.read_csv(os.path.join(pred_dir, files[0]))
score = pred['score'].copy()
for file in files[1:]:
    # Read the current file. Re-reading files[0] here would make the
    # "average" identical to the first model's scores.
    score += pd.read_csv(os.path.join(pred_dir, file))['score']
score /= len(files)

# Build the result directly instead of going through a variable named `id`,
# which would shadow the builtin.
avg_pred = pd.DataFrame({'id': pred['id'], 'score': score})

avg_pred.to_csv('./train_30465_avg_pred.csv', index=False)