In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.cross_validation import KFold
from sklearn.cross_validation import StratifiedKFold



In [2]:
# Load the raw training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [3]:
# Raw columns used as model inputs; cat_20 and cat_21 are merged into one
# summed feature by _select_features below.
new_cols=['cont_2','cont_3','cont_13','cont_14','cat_2','cat_9','cat_20','cat_21','cat_22']


def _select_features(frame):
    """Build the model feature table from a raw train/test frame.

    Keeps the columns listed in ``new_cols``, adds ``cat_20_21`` as the sum of
    ``cat_20`` and ``cat_21``, and drops the two source columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Raw table containing every column named in ``new_cols``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``frame`` itself is not modified.
    """
    # Explicit copy so the column assignment below never writes into (or
    # warns about) a slice of the caller's frame.
    feats = frame.loc[:, new_cols].copy()
    feats['cat_20_21'] = feats['cat_20'] + feats['cat_21']
    return feats.drop(['cat_20', 'cat_21'], axis=1)


# Train and test go through the same helper so their columns stay aligned.
x_train = _select_features(train)
y_train = np.ravel(train["target"])

x_test = _select_features(test)

In [93]:
# Booster settings handed to xgb.train for the 3-class softmax model.
# NOTE(review): 'n_estimators' is the scikit-learn wrapper's name for the
# number of trees; with xgb.train the round count is a separate argument, so
# this key probably has no effect there -- confirm before relying on it.
params = dict(
    objective="multi:softmax",
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    max_depth=6,
    silent=1,
    nthread=3,
    num_class=3,
    seed=1330,
    n_estimators=100,
)

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.cross_validation import KFold,StratifiedKFold
# 80/20 hold-out split of the training features and labels, with a fixed seed.
# The four results are not referenced by any other cell visible in this
# notebook excerpt.
x, xx, y, yy = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=98,
)

In [52]:
# Rebind the feature tables and the label vector as plain NumPy arrays.
x_train, y_train, x_test = (
    np.array(table) for table in (x_train, y_train, x_test)
)

In [94]:
def _predict_at_best_round(booster, dmatrix):
    """Predict using only the trees up to the best early-stopping round.

    xgb.train hands back the booster from the *last* round it ran, which is
    up to `early_stopping_rounds` rounds past the best validation score.
    When the booster exposes ``best_ntree_limit`` it is passed to ``predict``
    so those extra trees are ignored.

    Parameters
    ----------
    booster : xgboost.Booster
        Model returned by ``xgb.train``.
    dmatrix : xgboost.DMatrix
        Rows to predict.

    Returns
    -------
    numpy.ndarray
        Whatever ``booster.predict`` returns for ``dmatrix``.
    """
    best_limit = getattr(booster, "best_ntree_limit", None)
    if best_limit is None:
        # NOTE(review): releases without this attribute are expected to pick
        # the best round on their own -- confirm for the installed version.
        return booster.predict(dmatrix)
    return booster.predict(dmatrix, ntree_limit=best_limit)


# NOTE(review): shuffle is not enabled, so random_state should have no effect
# and the folds are taken in row order -- confirm against the KFold docs.
kf=KFold(n=len(x_train),n_folds=10,random_state=100)

# The test matrix is the same for every fold, so build it once up front.
dtest=xgb.DMatrix(x_test)

scores=[]        # out-of-fold predictions for the training rows, fold by fold
predictions=[]   # one test-set prediction vector per fold
for trainkf,testkf in kf:
    train_data_x,train_data_y=x_train[trainkf],y_train[trainkf]
    valid_data_x,valid_data_y=x_train[testkf],y_train[testkf]

    dtrain=xgb.DMatrix(train_data_x,train_data_y)
    dvalid=xgb.DMatrix(valid_data_x,valid_data_y)
    # The last watchlist entry ('valid') is the one driving early stopping
    # (see "'valid-merror' will be used for early stopping" in the log).
    watchlist=[(dtrain,'train'),(dvalid,'valid')]
    gbm=xgb.train(params,dtrain,10000,evals=watchlist,early_stopping_rounds=30,verbose_eval=1)
    cv=_predict_at_best_round(gbm,dvalid)
    lst=_predict_at_best_round(gbm,dtest)
    scores.extend(cv)
    predictions.append(lst)

[0]	train-merror:0.220428	valid-merror:0.223791
Multiple eval metrics have been passed: 'valid-merror' will be used for early stopping.

Will train until valid-merror hasn't improved in 30 rounds.
[1]	train-merror:0.219818	valid-merror:0.223495
[2]	train-merror:0.2197	valid-merror:0.2232
[3]	train-merror:0.219647	valid-merror:0.223082
[4]	train-merror:0.219693	valid-merror:0.223082
[5]	train-merror:0.219726	valid-merror:0.223141
[6]	train-merror:0.21968	valid-merror:0.223141
[7]	train-merror:0.219588	valid-merror:0.223141
[8]	train-merror:0.219569	valid-merror:0.223082
[9]	train-merror:0.219483	valid-merror:0.223023
[10]	train-merror:0.219044	valid-merror:0.222728
[11]	train-merror:0.218971	valid-merror:0.222668
[12]	train-merror:0.219044	valid-merror:0.222728
[13]	train-merror:0.218919	valid-merror:0.222668
[14]	train-merror:0.218965	valid-merror:0.222668
[15]	train-merror:0.218926	valid-merror:0.222609
[16]	train-merror:0.218932	valid-merror:0.222609
[17]	train-merror:0.218873	valid-

[53]	train-merror:0.218538	valid-merror:0.220956
[54]	train-merror:0.218538	valid-merror:0.220956
[55]	train-merror:0.218519	valid-merror:0.220956
[56]	train-merror:0.218506	valid-merror:0.220956
[57]	train-merror:0.218486	valid-merror:0.220956
[58]	train-merror:0.218486	valid-merror:0.220956
[59]	train-merror:0.218479	valid-merror:0.220956
[60]	train-merror:0.218473	valid-merror:0.220956
[61]	train-merror:0.218473	valid-merror:0.220956
[62]	train-merror:0.218466	valid-merror:0.220956
[63]	train-merror:0.218453	valid-merror:0.221015
[64]	train-merror:0.21846	valid-merror:0.221015
[65]	train-merror:0.21846	valid-merror:0.221015
[66]	train-merror:0.21844	valid-merror:0.221015
[67]	train-merror:0.218446	valid-merror:0.221015
[68]	train-merror:0.21844	valid-merror:0.221015
[69]	train-merror:0.218427	valid-merror:0.221015
[70]	train-merror:0.21842	valid-merror:0.221015
[71]	train-merror:0.21842	valid-merror:0.220956
[72]	train-merror:0.218407	valid-merror:0.220956
[73]	train-merror:0.218394

[1]	train-merror:0.220684	valid-merror:0.218652
[2]	train-merror:0.220448	valid-merror:0.218593
[3]	train-merror:0.220172	valid-merror:0.218121
[4]	train-merror:0.220258	valid-merror:0.21818
[5]	train-merror:0.220277	valid-merror:0.218121
[6]	train-merror:0.220225	valid-merror:0.218002
[7]	train-merror:0.220225	valid-merror:0.218062
[8]	train-merror:0.220225	valid-merror:0.218121
[9]	train-merror:0.220127	valid-merror:0.218062
[10]	train-merror:0.219792	valid-merror:0.217825
[11]	train-merror:0.219562	valid-merror:0.217766
[12]	train-merror:0.219556	valid-merror:0.217707
[13]	train-merror:0.219516	valid-merror:0.217707
[14]	train-merror:0.219582	valid-merror:0.217884
[15]	train-merror:0.219516	valid-merror:0.217766
[16]	train-merror:0.219424	valid-merror:0.217589
[17]	train-merror:0.219457	valid-merror:0.217648
[18]	train-merror:0.219411	valid-merror:0.217648
[19]	train-merror:0.219385	valid-merror:0.21753
[20]	train-merror:0.219359	valid-merror:0.217589
[21]	train-merror:0.219372	vali

[53]	train-merror:0.219542	valid-merror:0.211506
[54]	train-merror:0.219536	valid-merror:0.211506
[55]	train-merror:0.219536	valid-merror:0.211506
[56]	train-merror:0.219523	valid-merror:0.211506
[57]	train-merror:0.219523	valid-merror:0.211506
[58]	train-merror:0.219516	valid-merror:0.211506
[59]	train-merror:0.219503	valid-merror:0.211624
[60]	train-merror:0.219497	valid-merror:0.211624
[61]	train-merror:0.219477	valid-merror:0.211624
[62]	train-merror:0.21947	valid-merror:0.211624
[63]	train-merror:0.219464	valid-merror:0.211624
[64]	train-merror:0.219457	valid-merror:0.211624
[65]	train-merror:0.219444	valid-merror:0.211683
[66]	train-merror:0.219437	valid-merror:0.211683
[67]	train-merror:0.219424	valid-merror:0.211742
[68]	train-merror:0.219424	valid-merror:0.211742
[69]	train-merror:0.219418	valid-merror:0.211742
[70]	train-merror:0.219411	valid-merror:0.211801
[71]	train-merror:0.219411	valid-merror:0.211801
Stopping. Best iteration:
[41]	train-merror:0.219707	valid-merror:0.21

[7]	train-merror:0.219633	valid-merror:0.221028
[8]	train-merror:0.219377	valid-merror:0.220555
[9]	train-merror:0.21939	valid-merror:0.220614
[10]	train-merror:0.219639	valid-merror:0.221087
[11]	train-merror:0.219633	valid-merror:0.221146
[12]	train-merror:0.219653	valid-merror:0.221205
[13]	train-merror:0.219639	valid-merror:0.221146
[14]	train-merror:0.219397	valid-merror:0.220732
[15]	train-merror:0.219403	valid-merror:0.220673
[16]	train-merror:0.21939	valid-merror:0.220732
[17]	train-merror:0.219364	valid-merror:0.220614
[18]	train-merror:0.219318	valid-merror:0.220496
[19]	train-merror:0.219272	valid-merror:0.220496
[20]	train-merror:0.219259	valid-merror:0.220496
[21]	train-merror:0.219259	valid-merror:0.220496
[22]	train-merror:0.2192	valid-merror:0.220378
[23]	train-merror:0.219239	valid-merror:0.220437
[24]	train-merror:0.2192	valid-merror:0.220437
[25]	train-merror:0.219206	valid-merror:0.220437
[26]	train-merror:0.21918	valid-merror:0.220437
[27]	train-merror:0.219154	val

In [99]:
from scipy.stats import mode

# Majority vote across folds: stack the per-fold test predictions into a
# (n_folds, n_test) array and take the most frequent label in each column.
fold_votes = np.asarray(predictions)

# Element 0 of the result holds the modal values. Depending on the SciPy
# release it is shaped (1, n_test) or (n_test,); np.ravel flattens either one
# to a 1-D vector, so no fixed number of leading axes is indexed away.
pred = np.ravel(mode(fold_votes, axis=0)[0])

In [100]:
# Display the voted test-set labels (NumPy abbreviates the long array).
pred

array([ 1.,  0.,  0., ...,  0.,  0.,  1.], dtype=float32)

In [101]:
# Start from the sample submission, overwrite its 'target' column with the
# voted labels cast to integers, then write the file without the index.
sub = (
    pd.read_csv('sample_submission.csv')
    .assign(target=pred)
    .astype({'target': int})
)
sub.to_csv('sub_day4_2.csv', index=False)

In [102]:
# Sanity check: read the written submission back and show its row count.
s = pd.read_csv('sub_day4_2.csv')
s.shape[0]

91166

In [46]:
# Grid for the scikit-learn XGBoost wrapper. Only max_depth varies; every
# other entry is a single-value list so GridSearchCV passes it through as-is.
# NOTE(review): the target looks multi-class (num_class=3 in the xgb.train
# params, labels 0 and 2 in train.head()), so 'binary:logistic' is suspicious
# -- confirm whether XGBClassifier replaces it when fitting.
# NOTE(review): missing=-999 marks -999 as the missing value; none of the
# visible preprocessing writes -999 -- confirm the raw data uses it.
parameters = {
    'nthread': [4],  # when use hyperthread, xgboost may become slower
    'objective': ['binary:logistic'],
    'learning_rate': [0.05],  # so called `eta` value
    'max_depth': [5, 6, 7, 8, 9, 10],
    'min_child_weight': [11],
    'silent': [1],
    'subsample': [0.8],
    'colsample_bytree': [0.7],
    'n_estimators': [5],  # number of trees, change it to 1000 for better results
    'missing': [-999],
    'seed': [1337],
}

xgb_model = xgb.XGBClassifier()

# Shuffled 5-fold CV. random_state pins the shuffle so that fold membership,
# and therefore the selected best_params_, is reproducible across runs.
clf = GridSearchCV(
    xgb_model,
    parameters,
    cv=KFold(len(x_train), n_folds=5, shuffle=True, random_state=1337),
    verbose=2,
    refit=True,
)



In [47]:
# Run the grid search on the training arrays: 6 candidates x 5 folds = 30 fits
# according to the log printed by this cell.
clf.fit(x_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] colsample_bytree=0.7, silent=1, missing=-999, learning_rate=0.05, nthread=4, min_child_weight=11, n_estimators=5, subsample=0.8, seed=1337, objective=binary:logistic, max_depth=5 
[CV]  colsample_bytree=0.7, silent=1, missing=-999, learning_rate=0.05, nthread=4, min_child_weight=11, n_estimators=5, subsample=0.8, seed=1337, objective=binary:logistic, max_depth=5, total=   0.6s
[CV] colsample_bytree=0.7, silent=1, missing=-999, learning_rate=0.05, nthread=4, min_child_weight=11, n_estimators=5, subsample=0.8, seed=1337, objective=binary:logistic, max_depth=5 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s


[CV]  colsample_bytree=0.7, silent=1, missing=-999, learning_rate=0.05, nthread=4, min_child_weight=11, n_estimators=5, subsample=0.8, seed=1337, objective=binary:logistic, max_depth=5, total=   0.6s
[CV] colsample_bytree=0.7, silent=1, missing=-999, learning_rate=0.05, nthread=4, min_child_weight=11, n_estimators=5, subsample=0.8, seed=1337, objective=binary:logistic, max_depth=5 
[CV]  colsample_bytree=0.7, silent=1, missing=-999, learning_rate=0.05, nthread=4, min_child_weight=11, n_estimators=5, subsample=0.8, seed=1337, objective=binary:logistic, max_depth=5, total=   0.7s
[CV] colsample_bytree=0.7, silent=1, missing=-999, learning_rate=0.05, nthread=4, min_child_weight=11, n_estimators=5, subsample=0.8, seed=1337, objective=binary:logistic, max_depth=5 
[CV]  colsample_bytree=0.7, silent=1, missing=-999, learning_rate=0.05, nthread=4, min_child_weight=11, n_estimators=5, subsample=0.8, seed=1337, objective=binary:logistic, max_depth=5, total=   0.6s
[CV] colsample_bytree=0.7, sil

[CV]  colsample_bytree=0.7, silent=1, missing=-999, learning_rate=0.05, nthread=4, min_child_weight=11, n_estimators=5, subsample=0.8, seed=1337, objective=binary:logistic, max_depth=9, total=   0.8s
[CV] colsample_bytree=0.7, silent=1, missing=-999, learning_rate=0.05, nthread=4, min_child_weight=11, n_estimators=5, subsample=0.8, seed=1337, objective=binary:logistic, max_depth=9 
[CV]  colsample_bytree=0.7, silent=1, missing=-999, learning_rate=0.05, nthread=4, min_child_weight=11, n_estimators=5, subsample=0.8, seed=1337, objective=binary:logistic, max_depth=9, total=   0.9s
[CV] colsample_bytree=0.7, silent=1, missing=-999, learning_rate=0.05, nthread=4, min_child_weight=11, n_estimators=5, subsample=0.8, seed=1337, objective=binary:logistic, max_depth=10 
[CV]  colsample_bytree=0.7, silent=1, missing=-999, learning_rate=0.05, nthread=4, min_child_weight=11, n_estimators=5, subsample=0.8, seed=1337, objective=binary:logistic, max_depth=10, total=   1.0s
[CV] colsample_bytree=0.7, s

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   24.6s finished


GridSearchCV(cv=sklearn.cross_validation.KFold(n=169307, n_folds=5, shuffle=True, random_state=None),
       error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'missing': [-999], 'learning_rate': [0.05], 'n_estimators': [5], 'seed': [1337], 'colsample_bytree': [0.7], 'silent': [1], 'nthread': [4], 'min_child_weight': [11], 'subsample': [0.8], 'objective': ['binary:logistic'], 'max_depth': [5, 6, 7, 8, 9, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [49]:
# Show the parameter combination selected by the grid search.
clf.best_params_

{'colsample_bytree': 0.7,
 'learning_rate': 0.05,
 'max_depth': 10,
 'min_child_weight': 11,
 'missing': -999,
 'n_estimators': 5,
 'nthread': 4,
 'objective': 'binary:logistic',
 'seed': 1337,
 'silent': 1,
 'subsample': 0.8}

In [7]:
# Preview the first rows of the raw training table.
train.head()

Unnamed: 0,connection_id,cont_1,cont_2,cont_3,cont_4,cont_5,cont_6,cont_7,cont_8,cont_9,...,cat_15,cat_16,cat_17,cat_18,cat_19,cat_20,cat_21,cat_22,cat_23,target
0,cxcon_1,0,1032,0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,511,511,255,255,2
1,cxcon_4,0,520,0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,511,511,255,255,0
2,cxcon_7,0,1032,0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,511,511,255,255,0
3,cxcon_10,0,1032,0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,511,511,255,255,0
4,cxcon_13,0,1032,0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,511,511,255,255,2
