In [1]:
import xgboost as xgb
import sys,random
import pickle
import os
import numpy as np

In [2]:
# import modin.pandas as pd
import pandas as pd

In [3]:
if not os.path.exists('featurescore'):
    os.mkdir('featurescore')
if not os.path.exists('model'):
    os.mkdir('model')
if not os.path.exists('preds'):
    os.mkdir('preds')

In [4]:
%%time
data_date = pd.read_csv('../../preprocess_data_new/train_ax_date.csv',nrows=33465).drop(columns=['id'])
data_row_null = pd.read_csv('../../preprocess_data_new/train_ax_row_null.csv',nrows=33465).drop(columns=['id'])
data_cat = pd.read_csv('../../preprocess_data_new/train_ax_cat.csv',nrows=33465).drop(columns=['id']) 
data_cont = pd.read_csv('../../preprocess_data_new/train_ax_cont.csv',nrows=33465) # 忘记导入id这一列
data_tag = pd.read_csv('../../preprocess_data_new/train_ax1.csv',nrows=33465,usecols=['tag'])

CPU times: user 57.3 s, sys: 2.4 s, total: 59.7 s
Wall time: 59.7 s


In [5]:
%%time
data = pd.concat([data_date,data_row_null,data_cat,data_cont,data_tag],axis=1)

CPU times: user 4.99 s, sys: 104 ms, total: 5.1 s
Wall time: 5.1 s


In [6]:
%%time
data_label = pd.read_csv('../../preprocess_data/train_y_33465.csv',usecols=['label'])

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 4.63 ms


## 交叉验证

In [49]:
from sklearn.model_selection import train_test_split # 分割数据模块
from sklearn.metrics import roc_curve
from sklearn import metrics

#1.分割数据
feature_names = feature_rank[:2000]
feature_names = [x for x in feature_names if x!='tag']
X_train, X_test, y_train, y_test = train_test_split(data[feature_names].values,data_label.values,test_size=0.3, random_state=2018)
#2.建立模型
params={
    'booster':'gbtree',
    'objective': 'binary:logistic',
    'early_stopping_rounds':100,
    'scale_pos_weight': float(len(data_tag)-np.sum(data_tag.values))/float(np.sum(data_tag.values)),  # 负例样本除以正例样本
    'eval_metric': 'auc',
    'gamma':1,
    'max_depth':6,
    'lambda':1,
    'subsample':0.9,
    'colsample_bytree':0.9,
    'min_child_weight':1, 
    'eta': 0.04,
    'seed':2010,
    'nthread':16
        }
#3.训练模型
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=feature_names)
# watchlist  = [(dtrain,'train')]
print('开始训练！')
model  = xgb.train(params,dtrain,num_boost_round=300)

开始训练！


In [50]:
# 4.预测结果
print('开始预测！')
dtest = xgb.DMatrix(X_test, feature_names=feature_names)
y_pre = model.predict(dtest)
auc = metrics.roc_auc_score(y_test, y_pre)
print('AUC:',auc)

开始预测！
AUC: 0.5056060506711306


In [44]:
feature_score  = model.get_fscore()
feature_rank = sorted(feature_score,key=lambda x:x[1],reverse=True)
import pickle
pickle.dump(feature_rank,open('./cv_feat_rank.pkl','wb'))

In [17]:
import gc
del dtrain,dtest,model
gc.collect()

NameError: name 'dtest' is not defined

## 训练与预测

In [7]:
%%time
# data_tag = pd.read_csv('../predict_tag/tag.csv',usecols=['tag'])
valid_date = pd.read_csv('../../preprocess_data_new/valid_date.csv').drop(columns=['id'])
valid_row_null = pd.read_csv('../../preprocess_data_new/valid_row_null.csv').drop(columns=['id'])
valid_cat = pd.read_csv('../../preprocess_data_new/valid_cat.csv').drop(columns=['id']) 
valid_cont = pd.read_csv('../../preprocess_data_new/valid_cont.csv') # 忘记导入id这一列

CPU times: user 29.3 s, sys: 4.82 s, total: 34.1 s
Wall time: 36.7 s


In [8]:
%%time
valid = pd.concat([valid_date,valid_row_null,valid_cat,valid_cont],axis=1)

CPU times: user 2.2 s, sys: 76 ms, total: 2.28 s
Wall time: 2.28 s


In [9]:
dtrain = xgb.DMatrix(data.values, data_tag.values, feature_names=list(data.columns))
dtest = xgb.DMatrix(valid.values, feature_names=list(valid.columns))

In [10]:
test_id = pd.read_csv('../../preprocess_data_new/valid_date.csv',usecols=['id']).values

In [11]:
def pipeline(dtrain,dtest,test_id,iteration,random_seed,gamma,max_depth,lambd,subsample,colsample_bytree,min_child_weight):
    if max_depth==6:
        num_boost_round = 800
    elif max_depth==7:
        num_boost_round = 650
    elif max_depth==8:
        num_boost_round = 500
    
    params={
    'booster':'gbtree',
    'objective': 'binary:logistic',
    'early_stopping_rounds':100,
    'scale_pos_weight': float(len(data_tag)-np.sum(data_tag.values))/float(np.sum(data_tag.values)),  # 负例样本除以正例样本
    'eval_metric': 'auc',
    'gamma':gamma,
    'max_depth':max_depth,
    'lambda':lambd,
    'subsample':subsample,
    'colsample_bytree':colsample_bytree,
    'min_child_weight':min_child_weight, 
    'eta': 0.04,
    'seed':random_seed,
    'nthread':16
        }
    watchlist  = [(dtrain,'train')]
    model = xgb.train(params,dtrain,num_boost_round=num_boost_round,evals=watchlist)
    model.save_model('./model/xgb{0}.model'.format(iteration))
    
    #predict test set
    test_y = model.predict(dtest)
    test_result = pd.DataFrame(test_id,columns=["id"])
    test_result['score'] = test_y
    test_result.to_csv("./preds/xgb{0}.csv".format(iteration),index=None,encoding='utf-8')
    
    #get feature score
    feature_score = model.get_fscore()
    feature_score = sorted(feature_score.items(), key=lambda x:x[1],reverse=True)
    fs = []
    for (key,value) in feature_score:
        fs.append("{0},{1}\n".format(key,value))
    
    with open('./featurescore/feature_score_{0}.csv'.format(iteration),'w') as f:
        f.writelines("feature,score\n")
        f.writelines(fs)
    

In [12]:
%%time
random_seed = list(range(1000,2000,20))
gamma = [i/1000.0 for i in range(100,200,2)]
max_depth = [6,7,8]
lambd = list(range(200,400,2))
subsample = [i/1000.0 for i in range(600,700,2)]
colsample_bytree = [i/1000.0 for i in range(250,350,2)]
min_child_weight = [i/1000.0 for i in range(200,300,2)]
random.shuffle(random_seed)
random.shuffle(gamma)
random.shuffle(max_depth)
random.shuffle(lambd)
random.shuffle(subsample)
random.shuffle(colsample_bytree)
random.shuffle(min_child_weight)

with open('params.pkl','wb') as f:
    pickle.dump((random_seed,gamma,max_depth,lambd,subsample,colsample_bytree,min_child_weight),f)

#to reproduce my result, uncomment following lines
"""
with open('params_for_reproducing.pkl','rb') as f:
    random_seed,gamma,max_depth,lambd,subsample,colsample_bytree,min_child_weight = pickle.load(f)    
"""

for i in range(8):
    pipeline(dtrain,dtest,test_id,i,random_seed[i],gamma[i],max_depth[i%3],lambd[i],subsample[i],colsample_bytree[i],min_child_weight[i])

[0]	train-auc:0.830037
[1]	train-auc:0.842366
[2]	train-auc:0.851326
[3]	train-auc:0.855948
[4]	train-auc:0.857697
[5]	train-auc:0.862939
[6]	train-auc:0.865729
[7]	train-auc:0.866534
[8]	train-auc:0.870627
[9]	train-auc:0.873351
[10]	train-auc:0.873308
[11]	train-auc:0.874684
[12]	train-auc:0.874293
[13]	train-auc:0.874862
[14]	train-auc:0.874427
[15]	train-auc:0.874668
[16]	train-auc:0.874728
[17]	train-auc:0.875412
[18]	train-auc:0.875661
[19]	train-auc:0.876461
[20]	train-auc:0.876542
[21]	train-auc:0.879569
[22]	train-auc:0.880344
[23]	train-auc:0.880609
[24]	train-auc:0.880736
[25]	train-auc:0.88207
[26]	train-auc:0.882342
[27]	train-auc:0.882981
[28]	train-auc:0.882929
[29]	train-auc:0.883151
[30]	train-auc:0.883683
[31]	train-auc:0.885271
[32]	train-auc:0.887139
[33]	train-auc:0.88698
[34]	train-auc:0.8873
[35]	train-auc:0.887508
[36]	train-auc:0.887907
[37]	train-auc:0.888091
[38]	train-auc:0.889881
[39]	train-auc:0.89082
[40]	train-auc:0.891275
[41]	train-auc:0.891892
[42]	tr