In [9]:
import os
import pickle
import random
import sys

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import metrics
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split

In [2]:
# Create the folders that later cells write into (feature scores, saved
# models, prediction files) when they are not there yet.
for out_dir in ('featurescore', 'model', 'preds'):
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

In [6]:
%%time
input_dir = '../../preprocess_data_new/'
# Only the first 33465 rows of each feature file are kept; presumably these
# are the rows that have a label in 'train_y_33465.lz4' -- TODO confirm.
n_train_rows = 33465

data_date = joblib.load(input_dir + 'train_ax_date.lz4')[:n_train_rows]
data_null = joblib.load(input_dir + 'train_ax_null.lz4')[:n_train_rows]

nodup_with_tag = joblib.load(input_dir + 'train_ax_nodup.lz4').drop(columns=['id', 'loan_dt'])[:n_train_rows]
# The 'tag' column is split off and appended last so that the column layout
# matches the valid set, where the tag feature is appended at the end.
data_tag = nodup_with_tag[['tag']]
data_nodup = nodup_with_tag.drop(columns=['tag'])

data = pd.concat([data_date, data_nodup, data_null, data_tag], axis=1)
feature_names = list(data.columns)

# Missing values are encoded as -1 before being handed to xgboost.
x = data.fillna(-1).values

data_label = joblib.load(input_dir + 'train_y_33465.lz4')
y = data_label['label'].values

In [8]:
# Display the (rows, columns) shape of the concatenated training frame.
data.shape

(33465, 4807)

### 注意:data_nodup发生了一些变化

代码整理之前：使用旧data_nodup（近似data_raw）,拼接后数据维度为：6702     
代码整理之后：使用新data_nodup,拼接后数据维度为：4807

## 本地验证

In [10]:
# 1. Split the data: hold out 30% of the rows for local validation.
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3, random_state=3096)

# 2. Build the model configuration.
# Class balance is taken from the label vector `y` (the 'label' column of
# data_label) rather than from every cell of the data_label frame.
n_pos = float(np.sum(y))
n_neg = float(len(y)) - n_pos

# NOTE: 'early_stopping_rounds' was removed from this dict. It is a keyword
# argument of xgb.train(), not a booster parameter, so it had no effect here.
# Early stopping would also need a held-out set in `evals`.
params={
    'booster':'gbtree',
    'objective': 'binary:logistic',
    'scale_pos_weight': n_neg / n_pos,  # negative samples divided by positive samples
    'eval_metric': 'auc',
    'gamma':1,
    'max_depth':6,
    'lambda':1,
    'subsample':0.9,
    'colsample_bytree':0.9,
    'min_child_weight':1,
    'eta': 0.04,
    'seed':2010,
    'nthread':32
        }


In [14]:
# 3. Train the model on the training split; AUC on that same split is
#    logged every 50 rounds.
dtrain = xgb.DMatrix(data=x_train, label=y_train, feature_names=feature_names)
watchlist = [(dtrain, 'train')]
print('开始训练！')
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=300,
    evals=watchlist,
    verbose_eval=50,
)

开始训练！
[0]	train-auc:0.824189
[50]	train-auc:0.958696
[100]	train-auc:0.986746
[150]	train-auc:0.995115
[200]	train-auc:0.998245
[250]	train-auc:0.999418
[299]	train-auc:0.999776


In [16]:
# 4. Score the held-out split and report its ROC AUC.
print('开始预测！')
dtest = xgb.DMatrix(data=x_test, feature_names=feature_names)
y_pre = model.predict(dtest)
auc = metrics.roc_auc_score(y_true=y_test, y_score=y_pre)
print('AUC:', auc)

开始预测！
AUC: 0.8289777507133729


#### 1.（整理代码之前，结果不能复现）使用data_raw（这里的data_nodup较之data_raw，减少了100多维）作为主干特征
#### 用于反映不同特征组合的auc变化，不和第2点作比较，因为随机数种子不一样
raw AUC: 0.8130921791239684  
raw 去除nan列 AUC: 0.8125236195981279  
raw 去除nan列 + 统计null AUC: 0.8263001992035767   
nodup + null AUC: 0.8261527677046496  
nodup + null + tag AUC:0.8296696198580618 
 
nodup + null + tag (前4000维)AUC: 0.8251877022906782    
nodup + null + tag (前5990维) AUC: 0.821991126741565  
nodup + null + tag (前3000维) AUC: 0.8275583356303323  
nodup + null + tag (前2000维) AUC: 0.8249837418081847

nodup + null + tag (PCA降至8维) AUC: 0.5830606257872517  
nodup + null + tag (fillna(-1))AUC: 0.8313812914152184  

#### 2.使用data_nodup作为主干特征
nodup + null + tag (fillna(-1))： AUC: 0.8289777507133729

## 训练与预测

#### 1.读取数据

In [21]:
%%time
# NOTE(review): two of these files carry a '.csv' suffix but are read with
# joblib.load -- presumably they were written with joblib.dump; confirm.
valid_date = joblib.load(input_dir + 'valid_date.csv')
valid_null = joblib.load(input_dir + 'valid_row_null.csv').drop(columns=['id'])
valid_tag = pd.read_csv('../../2.explore_data/6.explore_tag/predict_tag/valid_tag.csv', usecols=['tag'])

# Keep the ids for the submission files before dropping the non-feature columns.
valid_nodup = joblib.load(input_dir + 'valid_nodup.lz4')
valid_id = valid_nodup['id'].values
valid_nodup = valid_nodup.drop(columns=['loan_dt', 'id'])

valid = pd.concat([valid_date, valid_nodup, valid_null, valid_tag], axis=1)

# Full training matrix (with labels) and the valid matrix to be scored;
# missing values are encoded as -1 in both.
dtrain = xgb.DMatrix(
    data.fillna(-1).values,
    data_label.values,
    feature_names=list(data.columns),
)
dtest = xgb.DMatrix(
    valid.fillna(-1).values,
    feature_names=list(valid.columns),
)

#### 2.训练并保存模型

In [25]:
def pipeline(dtrain,dtest,test_id,iteration,random_seed,gamma,max_depth,lambd,subsample,colsample_bytree,min_child_weight):
    """Train one XGBoost model and write its model, predictions and feature scores to disk.

    Parameters
    ----------
    dtrain : xgb.DMatrix
        Labelled training matrix. Its labels are also used to derive
        ``scale_pos_weight`` (negatives / positives).
    dtest : xgb.DMatrix
        Matrix to score with the trained model.
    test_id : array-like
        Values written to the ``id`` column of the prediction file, next to
        the scores of ``dtest``.
    iteration : int
        Suffix used in the three output file names.
    random_seed, gamma, lambd, subsample, colsample_bytree, min_child_weight
        Forwarded to xgboost as ``seed``, ``gamma``, ``lambda``,
        ``subsample``, ``colsample_bytree`` and ``min_child_weight``.
    max_depth : int
        Tree depth; must be 6, 7 or 8. It also selects the number of boosting
        rounds (500, 400 and 300 respectively).

    Raises
    ------
    ValueError
        If ``max_depth`` is not one of the supported depths.

    Side effects
    ------------
    Writes ``./model/xgb{iteration}.model``, ``./preds/xgb{iteration}.csv``
    and ``./featurescore/feature_score_{iteration}.csv``.
    """
    # Deeper trees are trained for fewer rounds. Validate up front so an
    # unsupported depth fails with a clear message instead of leaving
    # num_boost_round unbound.
    rounds_by_depth = {6: 500, 7: 400, 8: 300}
    if max_depth not in rounds_by_depth:
        raise ValueError(
            'max_depth must be one of {0}, got {1!r}'.format(sorted(rounds_by_depth), max_depth)
        )
    num_boost_round = rounds_by_depth[max_depth]

    # Class balance comes from the matrix actually being trained on, not from
    # a notebook-level global. Summing in float64 keeps the count exact.
    labels = dtrain.get_label()
    n_pos = float(labels.sum(dtype='float64'))
    n_neg = float(len(labels)) - n_pos

    # 'early_stopping_rounds' is deliberately absent: it is an argument of
    # xgb.train(), not a booster parameter, so it had no effect in this dict.
    params={
    'booster':'gbtree',
    'objective': 'binary:logistic',
    'scale_pos_weight': n_neg / n_pos,  # negative samples divided by positive samples
    'eval_metric': 'auc',
    'gamma':gamma,
    'max_depth':max_depth,
    'lambda':lambd,
    'subsample':subsample,
    'colsample_bytree':colsample_bytree,
    'min_child_weight':min_child_weight,
    'eta': 0.04,
    'seed':random_seed,
    'nthread':16
        }
    watchlist  = [(dtrain,'train')]
    model = xgb.train(params,dtrain,num_boost_round=num_boost_round,evals=watchlist)
    model.save_model('./model/xgb{0}.model'.format(iteration))

    # Predict the test set and store one score per id.
    test_result = pd.DataFrame(test_id,columns=["id"])
    test_result['score'] = model.predict(dtest)
    test_result.to_csv("./preds/xgb{0}.csv".format(iteration),index=False,encoding='utf-8')

    # Feature scores as reported by get_fscore(), highest first.
    feature_score = sorted(model.get_fscore().items(), key=lambda kv: kv[1], reverse=True)
    with open('./featurescore/feature_score_{0}.csv'.format(iteration),'w') as f:
        f.write("feature,score\n")
        f.writelines("{0},{1}\n".format(name, score) for name, score in feature_score)
    

In [1]:
%%time
# Number of models to train. Every candidate list below (except max_depth,
# which is cycled) holds at least this many values.
n_models = 36

# Candidate hyper-parameter values; each list is shuffled and then consumed
# position by position, one value per model.
random_seed = list(range(1000,2000,20))
gamma = [i/1000.0 for i in range(100,200,2)]
max_depth = [6,7,8]
lambd = list(range(200,400,2))
subsample = [i/1000.0 for i in range(600,700,2)]
colsample_bytree = [i/1000.0 for i in range(250,350,2)]
min_child_weight = [i/1000.0 for i in range(200,300,2)]

# The shuffles are unseeded, so the drawn order differs on every run.
for candidates in (random_seed, gamma, max_depth, lambd, subsample, colsample_bytree, min_child_weight):
    random.shuffle(candidates)

# Persist the drawn order so this exact run can be repeated later.
with open('params.pkl','wb') as f:
    pickle.dump((random_seed,gamma,max_depth,lambd,subsample,colsample_bytree,min_child_weight),f)

# To reproduce a previous result, uncomment the following lines.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
# with open('params_for_reproducing.pkl','rb') as f:
#     random_seed,gamma,max_depth,lambd,subsample,colsample_bytree,min_child_weight = pickle.load(f)

for i in range(n_models):
    pipeline(
        dtrain, dtest, valid_id, i,
        random_seed[i],
        gamma[i],
        max_depth[i % len(max_depth)],  # cycle through the three depths
        lambd[i],
        subsample[i],
        colsample_bytree[i],
        min_child_weight[i],
    )

## 线上valid-auc分数
raw 去除nan列 + 统计null AUC: 0.82751113123698  
nodup + null + tag AUC: 0.82803008109167  
nodup + null + tag（rank融合）AUC:0.8279914450872  
nodup + null + tag (fillna(-1)) AUC:0.82979480823375