In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics
from sklearn.grid_search import GridSearchCV



In [2]:
train = pd.read_csv('../data/train_featureV1.csv')
test = pd.read_csv('../data/test_featureV2.csv')

In [3]:
#dtrain = lgb.Dataset(train.drop(['uid','label'],axis=1),label=train.label)
#dtest = lgb.Dataset(test.drop(['uid'],axis=1))

In [4]:
def evalMetric(preds,dtrain):
    
    label = dtrain.get_label()
    
    
    pre = pd.DataFrame({'preds':preds,'label':label})
    pre= pre.sort_values(by='preds',ascending=False)
    
    auc = metrics.roc_auc_score(pre.label,pre.preds)

    pre.preds=pre.preds.map(lambda x: 1 if x>=0.5 else 0)

    f1 = metrics.f1_score(pre.label,pre.preds)
    
    
    res = 0.6*auc +0.4*f1
    
    return 'res',res

In [5]:
xgb_params = {
    'n_estimators':800,
    'booster':'gbtree',
    'objective':'binary:logistic',
    'max_depth':8,
    'gamma':0.1,
    'subsample':0.8,
    'colsample_bytree':0.8,
    'lambda':1,
    'eta':0.05,
    'seed':2000,
    'silent':0,
    'max_delta_step':5,
    #'eval_metric':evalMetric,
    #'metrics':['evalMetric'],
    #'num_class':4,
    #'num_boost_round':250,
    #'feval':evalMetric,
    #'early_stopping_rounds':150,
    'min_child_weight':1
}

### 本地CV

In [6]:
dtrain = xgb.DMatrix(train.drop(['uid','label'],axis=1),label=train.label)
xgb.cv(xgb_params,dtrain,feval=evalMetric,early_stopping_rounds=100,verbose_eval=5,num_boost_round=10000,nfold=3)

[0]	train-error:0.106621+0.00422142	train-res:0.816335+0.0099503	test-error:0.173032+0.0080425	test-res:0.675892+0.00360159
[5]	train-error:0.076615+0.00333978	train-res:0.879454+0.00710133	test-error:0.146228+0.00587316	test-res:0.723344+0.0187501
[10]	train-error:0.0661133+0.00569737	train-res:0.900299+0.0101555	test-error:0.145828+0.00574687	test-res:0.728915+0.0152333
[15]	train-error:0.0583117+0.00296063	train-res:0.914887+0.00317454	test-error:0.143828+0.00322316	test-res:0.736733+0.00892657
[20]	train-error:0.05141+0.00225921	train-res:0.927339+0.00182266	test-error:0.144828+0.00352215	test-res:0.735381+0.0115138
[25]	train-error:0.0449087+0.000289442	train-res:0.938248+0.00117795	test-error:0.142828+0.00555082	test-res:0.74016+0.0124516
[30]	train-error:0.0381077+0.000845947	train-res:0.948723+0.00131879	test-error:0.143028+0.00563858	test-res:0.743519+0.0146927
[35]	train-error:0.0361073+0.00163253	train-res:0.952281+0.00230624	test-error:0.141628+0.00509286	test-res:0.745724+

Unnamed: 0,train-error-mean,train-error-std,train-res-mean,train-res-std,test-error-mean,test-error-std,test-res-mean,test-res-std
0,0.106621,0.004221,0.816335,0.00995,0.173032,0.008043,0.675892,0.003602


In [118]:
#lgb.cv(lgb_params,dtrain,feval=evalMetric,early_stopping_rounds=100,verbose_eval=5,num_boost_round=10000,nfold=3,metrics=['evalMetric'])

## 训练

In [7]:
model=xgb.train(xgb_params,dtrain=dtrain)

In [8]:
dtest = xgb.DMatrix(test.drop(['uid'],axis=1))
pre=model.predict(dtest)

In [9]:
res =pd.DataFrame({'uid':test.uid,'label':pre})


In [10]:
res=res.sort_values(by='label',ascending=False)
res.label=res.label.map(lambda x: 1 if x>=0.5 else 0)
res.label = res.label.map(lambda x: int(x))

In [11]:
res.to_csv('../resultB/xgb.csv',index=False,header=False,sep=',',columns=['uid','label'])