In [1]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split




In [2]:
seed = 71

np.random.seed(seed)
valid_size = 0.2
LOOP = 1
ESR = 40
# XGB param
nround = 1500
#nround = 10

param = {'max_depth':5, # 基准是5 
         'eta':0.05,
         'gamma ':0.1,
         'colsample_bytree':0.8, # old 0.8
         'subsample':0.8,
         'silent':1,
         'eval_metric':'auc',
         'objective':'binary:logistic',
#          'scale_pos_weight':6
        }


In [3]:
df_train = pd.read_csv('../data/train/orderFuture_train.csv')
df_test = pd.read_csv('../data/test/orderFuture_test.csv')

df_train_t = pd.read_csv('../data/dataSet/df_train_12.csv')
df_test_t = pd.read_csv('../data/dataSet/df_test_12.csv')


df_train = pd.merge(df_train,df_train_t,how='left',on='userid')
df_test = pd.merge(df_test,df_test_t,how='left',on='userid')



In [4]:
# 设置特征数据，去除id数据，不能进行预测
features = df_test.columns[1:]

label = 'orderType'

len(features)


280

In [5]:
def split_train_valid(df_train,test_size=0.2):
    '''
    k-fold交叉验证,默认k=10
    df_train:训练数据
    '''
    X_train, X_vali, y_train, y_vali = train_test_split(df_train[features], df_train[label], test_size=test_size, random_state=40000)
    #added some parameters
    
#     dtrain = df_train.iloc[train_list]
#     dvali =  df_train.iloc[vali_list]
    
    dtrain = xgb.DMatrix(X_train,label=y_train)
    dvalid = xgb.DMatrix(X_vali,label=y_vali)
    watchlist = [(dtrain, 'train'),(dvalid, 'valid')]
    
    return dtrain, dvalid, watchlist

In [6]:
models = []
for i in range(LOOP):
    print('LOOP',i)
#     dbuild, dvalid, watchlist = split_build_valid(df_train)
    dbuild, dvalid, watchlist = split_train_valid(df_train,test_size=0.2)
        
    model = xgb.train(param, dbuild, nround, watchlist,early_stopping_rounds=ESR,verbose_eval=5)
    models.append(model)
    # VALID
    valid_yhat = model.predict(dvalid,ntree_limit=model.best_iteration)
    print('Valid Mean:---------------------->', np.mean(valid_yhat))
    del dbuild, dvalid, watchlist



LOOP 0
[0]	train-auc:0.88365	valid-auc:0.87685
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 40 rounds.
[5]	train-auc:0.906968	valid-auc:0.902974
[10]	train-auc:0.912855	valid-auc:0.90744
[15]	train-auc:0.915326	valid-auc:0.910254
[20]	train-auc:0.919385	valid-auc:0.913798
[25]	train-auc:0.921706	valid-auc:0.916951
[30]	train-auc:0.924758	valid-auc:0.919604
[35]	train-auc:0.927336	valid-auc:0.921821
[40]	train-auc:0.934202	valid-auc:0.928141
[45]	train-auc:0.938446	valid-auc:0.932254
[50]	train-auc:0.942456	valid-auc:0.935537
[55]	train-auc:0.944699	valid-auc:0.937398
[60]	train-auc:0.947321	valid-auc:0.939272
[65]	train-auc:0.949797	valid-auc:0.941705
[70]	train-auc:0.951975	valid-auc:0.943549
[75]	train-auc:0.954065	valid-auc:0.945349
[80]	train-auc:0.955666	valid-auc:0.946249
[85]	train-auc:0.957408	valid-auc:0.947286
[90]	train-auc:0.958998	valid-auc:0.948594
[95]	train-auc:0.960478	valid-auc:0.94

In [7]:
# 计算特征重要程度
import operator
importance = model.get_fscore()
importance = sorted(importance.items(), key=operator.itemgetter(1),reverse=True)

print(len(importance))
importance[:110]

feature100 = [f[0] for f in importance[:110] ]


269


In [8]:
df_train.head()

Unnamed: 0,userid,orderType,gender,province,age,gender__0,gender__1,gender__2,age__0,age__1,...,orderType_max_0,orderType_max_1,rating_mean,userComment_cnt_all,tags_long_mean,wrds_long_mean,long_mean_all,orderHistory_ctr,actiontime_orderTime_diff,userComment_rate
0,100000000013,0,2,0,1,0,0,1,0,1,...,2.0,-1.0,4.0,1.0,1.0,19.0,20.0,0.013986,1278697.0,0.5
1,100000000111,0,0,1,5,1,0,0,0,0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,100000000127,0,0,1,5,1,0,0,0,0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,100000000231,0,2,4,2,0,0,1,0,0,...,-1.0,-1.0,5.0,1.0,9.0,10.0,19.0,-1.0,-1.0,-1.0
4,100000000379,0,2,4,5,0,0,1,0,0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [9]:
# 将验证得到的df_train_12最重要的100个特征作为训练集存储
feature100.append('userid')

df_train[feature100].to_csv('../data/dataSet/df_train_12_top110.csv',index=False)
df_test[feature100].to_csv('../data/dataSet/df_test_12_top110.csv',index=False)


