In [24]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics
from sklearn.grid_search import GridSearchCV

import matplotlib.pylab as plt
from matplotlib import rcParams
rcParams['figure.figsize'] = 12, 4

train = pd.read_csv('../data/train_featureV1.csv')
target = 'label'
IDcol = 'uid'

In [79]:
ttrain = train.drop(['uid','label'],axis=1)
train.head(5)

Unnamed: 0,uid,label,voice_opp_num_unique_count,voice_opp_num_count,voice_opp_head_unique_count,voice_opp_len_3,voice_opp_len_5,voice_opp_len_6,voice_opp_len_7,voice_opp_len_8,...,wa_up_flow_min,wa_up_flow_median,wa_up_flow_mean,wa_up_flow_sum,wa_down_flow_std,wa_down_flow_max,wa_down_flow_min,wa_down_flow_median,wa_down_flow_mean,wa_down_flow_sum
0,u0001,0,22.0,79.0,17.0,0.0,3.0,0.0,0.0,0.0,...,0.0,6067.0,172279.362771,198982664.0,4243115.0,58773144.0,0.0,9683.0,832540.6,961584400.0
1,u0002,0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2453.0,4433.551724,128573.0,3908.229,15438.0,0.0,1656.0,3267.207,94749.0
2,u0003,0,15.0,21.0,10.0,0.0,1.0,0.0,0.0,2.0,...,0.0,3633.0,60843.138817,71003943.0,14454210.0,457021859.0,0.0,5632.0,1115881.0,1302233000.0
3,u0004,0,77.0,254.0,31.0,0.0,1.0,0.0,0.0,12.0,...,0.0,5785.5,181739.734524,305322754.0,15649550.0,513377344.0,0.0,14205.0,2082341.0,3498333000.0
4,u0005,0,55.0,401.0,28.0,0.0,4.0,0.0,0.0,0.0,...,0.0,4101.0,63241.5117,40537809.0,1617302.0,28858654.0,0.0,6916.0,355616.3,227950000.0


In [19]:
def evalMetric(preds,dtrain):
    
    label = dtrain.get_label()
    
    
    pre = pd.DataFrame({'preds':preds,'label':label})
    pre= pre.sort_values(by='preds',ascending=False)
    
    auc = metrics.roc_auc_score(pre.label,pre.preds)

    pre.preds=pre.preds.map(lambda x: 1 if x>=0.5 else 0)

    f1 = metrics.f1_score(pre.label,pre.preds)
    
    
    res = 0.6*auc +0.4*f1
    
    return 'res',res,True

In [85]:
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, train.label)
        cvresult = xgb.cv(xgb_param,xgtrain)
        alg.set_params(n_estimators=cvresult.shape[0])
        
    alg.fit(dtrain[predictors], dtrain.label)
    
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
    
    print ("Model Report")
    print (cvresult)
    print ("\nAccuracy : %.4g" % metrics.accuracy_score(dtrain['label'].values, dtrain_predictions))
    print ("ACU Score (Train): %f" % metrics.roc_auc_score(dtrain['label'], dtrain_predprob))

In [130]:
predictors = [x for x in train.columns if x not in [target, IDcol]]
xgb1 = XGBClassifier(
    learning_rate = 0.03,
    n_estimators = 700,
    max_depth = 6,
    min_child_weight = 2,
    gamma = 0.1,
    subsample = 0.8,
    colsample_bytree = 0.8,
    objective = 'binary:logistic',
    scale_pos_weight = 1,
    seed = 27,
    num_boost_round=200,
    nfold=5,
    metrics='auc',
    feval=evalMetric,
    early_stopping_rounds=100
)
modelfit(xgb1, train, predictors)
#Accuracy : 0.881  ACU Score (Train): 0.932427

Model Report
   train-error-mean  train-error-std  test-error-mean  test-error-std
0          0.123225         0.001673         0.161630        0.008566
1          0.111622         0.005180         0.153830        0.004239
2          0.107621         0.005917         0.151629        0.005332
3          0.104520         0.006803         0.146429        0.001924
4          0.105220         0.006904         0.145228        0.003175
5          0.103420         0.004323         0.144828        0.006090
6          0.102020         0.007498         0.145827        0.006327
7          0.101920         0.007414         0.145628        0.003503
8          0.102520         0.006295         0.143628        0.002787
9          0.101020         0.005647         0.142627        0.005438

Accuracy : 0.885
ACU Score (Train): 0.938475


  if diff:


In [None]:
## 调参 n_estimators

In [111]:
param_test1 = {
    'n_estimators': list(range(50, 111, 10))
}
estimator1 = XGBClassifier(
    learning_rate = 0.1,
    n_estimators = 70,
    max_depth = 6,
    min_child_weight = 1,
    gamma = 0,
    subsample = 0.8,
    colsample_bytree = 0.8,
    objective = 'binary:logistic',
    scale_pos_weight = 1,
    seed = 27
)
gsearch1 = GridSearchCV(estimator1, param_grid=param_test1,scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch1.fit(train[predictors], train[target])
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_
#{'n_estimators': 70}, 0.8948812601129674)

([mean: 0.89472, std: 0.02197, params: {'n_estimators': 50},
  mean: 0.89433, std: 0.02182, params: {'n_estimators': 60},
  mean: 0.89488, std: 0.02027, params: {'n_estimators': 70},
  mean: 0.89401, std: 0.01981, params: {'n_estimators': 80},
  mean: 0.89268, std: 0.01990, params: {'n_estimators': 90},
  mean: 0.89273, std: 0.01993, params: {'n_estimators': 100},
  mean: 0.89245, std: 0.01972, params: {'n_estimators': 110}],
 {'n_estimators': 70},
 0.8948812601129674)

In [113]:
param_test2 = {
    'max_depth': [4, 5, 6],
    'min_child_weight': [1, 2, 3]
}
estimator2 = XGBClassifier(
    learning_rate = 0.1,
    n_estimators = 70,
    max_depth = 5,
    min_child_weight = 2,
    gamma = 0,
    subsample = 0.8,
    colsample_bytree = 0.8,
    objective = 'binary:logistic',
    scale_pos_weight = 1,
    seed = 27
)
gsearch2 = GridSearchCV(estimator2, param_grid=param_test2,scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch2.fit(train[predictors], train[target])
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_
#{'max_depth': 5, 'min_child_weight': 1}, 0.8956325398479871)
#{'max_depth': 5, 'min_child_weight': 2}, 0.8962341046487389)

([mean: 0.89598, std: 0.02280, params: {'max_depth': 4, 'min_child_weight': 1},
  mean: 0.89498, std: 0.02266, params: {'max_depth': 4, 'min_child_weight': 2},
  mean: 0.89525, std: 0.02299, params: {'max_depth': 4, 'min_child_weight': 3},
  mean: 0.89563, std: 0.02504, params: {'max_depth': 5, 'min_child_weight': 1},
  mean: 0.89623, std: 0.02253, params: {'max_depth': 5, 'min_child_weight': 2},
  mean: 0.89463, std: 0.02223, params: {'max_depth': 5, 'min_child_weight': 3},
  mean: 0.89488, std: 0.02027, params: {'max_depth': 6, 'min_child_weight': 1},
  mean: 0.89602, std: 0.02060, params: {'max_depth': 6, 'min_child_weight': 2},
  mean: 0.89472, std: 0.02263, params: {'max_depth': 6, 'min_child_weight': 3}],
 {'max_depth': 5, 'min_child_weight': 2},
 0.8962341046487389)

In [114]:
param_test3 = {
    'gamma': [i / 10.0 for i in range(0, 5)]
}
estimator3 = XGBClassifier(
    learning_rate = 0.1,
    n_estimators = 70,
    max_depth = 5,
    min_child_weight = 2,
    gamma = 0.1,
    subsample = 0.8,
    colsample_bytree = 0.8,
    objective = 'binary:logistic',
    scale_pos_weight = 1,
    seed = 27
)
gsearch3 = GridSearchCV(estimator1, param_grid=param_test3,scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch3.fit(train[predictors], train[target])
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_
#{'gamma': 0.1}, 0.8953036802101844)

([mean: 0.89401, std: 0.01981, params: {'gamma': 0.0},
  mean: 0.89530, std: 0.02080, params: {'gamma': 0.1},
  mean: 0.89463, std: 0.02139, params: {'gamma': 0.2},
  mean: 0.89512, std: 0.02246, params: {'gamma': 0.3},
  mean: 0.89476, std: 0.01984, params: {'gamma': 0.4}],
 {'gamma': 0.1},
 0.8953036802101844)

In [131]:
model=xgb1.fit(train[predictors], train.label)

In [133]:
test = pd.read_csv('../data/test_featureV1.csv')
preds = model.predict(test[predictors])

  if diff:


In [134]:
preds

array([0, 0, 0, ..., 0, 0, 0])

In [135]:
res =pd.DataFrame({'uid':test.uid,'label':preds})

In [136]:
res=res.sort_values(by='label',ascending=False)

In [138]:
res.to_csv('../result/xgb-baseline2.csv',index=False,header=False,sep=',',columns=['uid','label'])