##Grid search refined.
###In the last search, the best AUC is 0.773. Considering changing eta and num_boost_round

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from scipy import stats,sparse
from sklearn.base import TransformerMixin
from datetime import datetime as dt
from math import isnan
from numpy import ma
import cPickle as pickle
import xgboost as xgb
import time
from pandas import *
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier 

In [2]:
import json
from sklearn.metrics import roc_curve, auc
from re import sub
from collections import defaultdict

In [3]:
from sklearn.cross_validation import StratifiedKFold,cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report,confusion_matrix

In [4]:
class XGBoostClassifier():
    def __init__(self, num_boost_round=40, **params):
        self.clf = None
        self.num_boost_round = num_boost_round
        self.params = params
        self.params.update({'objective': 'binary:logistic'})
 
    def fit(self, X, y, num_boost_round=None):
        num_boost_round = num_boost_round or self.num_boost_round
        dtrain = xgb.DMatrix(X, label=y)
        self.clf = xgb.train(params=self.params, dtrain=dtrain, num_boost_round=num_boost_round)
        return self
 
    def predict(self, X):
        Y = self.predict_proba(X)
        Y = np.argmax(Y, axis=1)
        return Y
 
    def predict_proba(self, X):
        ypreds = np.zeros((X.shape[0],2))
        dtest = xgb.DMatrix(X)
        ypreds[:,1] = self.clf.predict(dtest)
        ypreds[:,0] = 1- ypreds[:,1]        # return the proba for both classes
        return ypreds
 
    def score(self, X, y):
        Y = self.predict_proba(X)
        return 1 / self.logloss(y, Y)
 
    def get_params(self, deep=True):
        return self.params
 
    def set_params(self, **params):
    #    if 'num_boost_round' in params:
    #        self.num_boost_round = params.pop('num_boost_round')
        if 'objective' in params:
            del params['objective']
        self.params.update(params)
        return self
    
    def logloss(self,y_true, Y_pred):
        label2num = dict((name, i) for i, name in enumerate(sorted(set(y_true))))
        return -1 * sum(math.log(y[label2num[label]]) if y[label2num[label]] > 0 else -np.inf for y, label in zip(Y_pred, y_true)) / len(Y_pred)

## 0. Load data

In [5]:
LocalTest=False           # whether to do a local test
SelectedFeature=True    # whether to use selected features
njobs = 1
nrows=5000   #this is for local test only

In [6]:
if LocalTest:
    trainfile = 'C:/Huaixiu/Kaggle/GridSearch/data/train-5000.csv'
    xtrain = read_csv(trainfile,nrows=nrows)
    ytrain = xtrain['target']
    
    xtrain = xtrain.ix[:,1:-1]

else:    
    X=np.load('pickledata/nxtrain_standard_original0.npy')
    X1= np.load('pickledata/nxtrain_standard_derived0.npy')
    X2=pickle.load(open("pickledata/time_series_derived_standard_train2.dat","rb"))
    X3=pickle.load(open("pickledata/time_series_original_standard_train2.dat","rb"))
    X4=pickle.load(open("pickledata/cat_numeric_th60_standard_train2.dat","rb"))
    X5=pickle.load(open("pickledata/cat_le_train2.dat","rb"))#label encoded categorical data 15 in total
    ytrain=pickle.load(open("pickledata/ytrain2.dat","rb"))
    
    xtrain=np.hstack((X,X1,X2,X3,X4, X5))
    
    if SelectedFeature:
        with open('features/XGB_335Features Oct172015_044255_AUC_0p76115.p', 'rb') as fid:
            xgb_goodfeat1 = pickle.load(fid)
        with open('features/XGB_482Features_backward_selection_XX1X2X3X4.p', 'rb') as fid2:
            xgb_goodfeat2 = pickle.load(fid2)
        with open('features/XGB_5features_backward_selection_X5.p', 'rb') as fid3:
            xgb_goodfeat3 = pickle.load(fid3)
        with open('features/SDGRF_Oct172015125634_AUC_0.755531555054_with_cat_variables.p', 'rb') as fid4:
            xgb_goodfeat4 = pickle.load(fid4)

        
        good_features=list(set(list(xgb_goodfeat1) + xgb_goodfeat2 + xgb_goodfeat3 + list(xgb_goodfeat4)))
        xtrain = xtrain[:,good_features]

In [7]:
print(xtrain.shape, ytrain.shape)

((145231L, 933L), (145231L,))


## 1. Grid search: XGBoost

In [8]:
if LocalTest:
    param_xgb = {
        'num_boost_round': [2],
        'eta': [0.3],
        'max_depth': [1],
        'subsample': [0.9],
        'colsample_bytree': [0.9],
        'min_child_weight':[1],
        'gamma':[10],
        'objective':['binary:logistic'],
        'eval_metric': ['auc']
    }
else:    
    param_xgb = {
        'num_boost_round': [300],   #originally 40
        'eta': [0.01],       #originally 0.1
        'max_depth': [14],  #originally 12, 14, opt 12. then 10. Goes down at 9 or 8.
        'subsample': [1.0],  #originally 1
        'colsample_bytree': [0.9],  #originally 0.6, 0.7, opt 0.6. Better at 0.5. 0.4. Goes down at 0.3.
        'min_child_weight':[3],   #originally 5, 6, 7. opt 7. Better at 8. 9, goes down at 10.
        'gamma':[0],               #originally 1
        'max_delta_step' :[0],     #default 0. 
        'lambda': [0],           #l1 vs l2  #1 is better. 
        'objective':['binary:logistic'],
        'eval_metric': ['auc']
    }

In [9]:
start_time=time.clock()

print('Starting GridSearch using XGBoost...')
clf_xgb = XGBoostClassifier()
gs_xgb = GridSearchCV(clf_xgb,param_grid = param_xgb,cv = StratifiedKFold(ytrain,n_folds = 3),scoring='roc_auc', n_jobs = njobs,verbose = 2)
gs_xgb.fit(xtrain,ytrain)

total_time=time.clock()-start_time
print('Completed GridSearch using XGBoost')
print('Total running time is %d seconds\n' %total_time)

Starting GridSearch using XGBoost...
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] colsample_bytree=0.9, eval_metric=auc, max_delta_step=0, min_child_weight=3, subsample=1.0, eta=0.01, objective=binary:logistic, num_boost_round=300, max_depth=14, gamma=0, lambda=0 
[CV]  colsample_bytree=0.9, eval_metric=auc, max_delta_step=0, min_child_weight=3, subsample=1.0, eta=0.01, objective=binary:logistic, num_boost_round=300, max_depth=14, gamma=0, lambda=0 - 2.3min
[CV] colsample_bytree=0.9, eval_metric=auc, max_delta_step=0, min_child_weight=3, subsample=1.0, eta=0.01, objective=binary:logistic, num_boost_round=300, max_depth=14, gamma=0, lambda=0 
[CV]  colsample_bytree=0.9, eval_metric=auc, max_delta_step=0, min_child_weight=3, subsample=1.0, eta=0.01, objective=binary:logistic, num_boost_round=300, max_depth=14, gamma=0, lambda=0 - 2.3min
[CV] colsample_bytree=0.9, eval_metric=auc, max_delta_step=0, min_child_weight=3, subsample=1.0, eta=0.01, objective=binary:logistic, 

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:  2.3min
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  7.0min finished



Completed GridSearch using XGBoost
Total running time is 625 seconds



In [16]:
print 'Best AUC Score of XGB is {}'.format(gs_xgb.best_score_)
print 'Best parameters set of XGB:'
best_param_xgb = gs_xgb.best_estimator_.get_params()
for param_name in sorted(best_param_xgb.keys()):
    print '\t%s: %r' % (param_name,best_param_xgb[param_name])

Best AUC Score of XGB is 0.759530365161
Best parameters set of XGB:
	colsample_bytree: 0.9
	eta: 0.01
	eval_metric: 'auc'
	gamma: 0
	lambda: 0
	max_delta_step: 0
	max_depth: 14
	min_child_weight: 3
	num_boost_round: 500
	objective: 'binary:logistic'
	subsample: 1.0


In [11]:
xgb_grid_scores = gs_xgb.grid_scores_
str1=time.strftime("%b%d%Y_%H%M%S", time.localtime())
with open(('XGB_grid_score_' +str1 + '_AUC_' + '0p'+ str(int(gs_xgb.best_score_*1e5))  +'.p'), 'wb') as f:
    pickle.dump(xgb_grid_scores, f, protocol =2)

###dump the model to pickle 

In [12]:
xgb_opt = gs_xgb.best_estimator_

with open(('XGB_best_estimator_' +str1 + '_AUC_' + '0p'+ str(int(gs_xgb.best_score_*1e5))  +'.p'), 'wb') as f2:
    pickle.dump(xgb_opt, f2,protocol = 2)
    
with open(('XGB_best_param_' +str1 + '_AUC_' + '0p'+ str(int(gs_xgb.best_score_*1e5))  +'.p'), 'wb') as f3:
    pickle.dump(best_param_xgb, f3,protocol = 2)    

In [13]:
xgb_grid_scores

[mean: 0.75911, std: 0.00275, params: {'colsample_bytree': 0.9, 'eval_metric': 'auc', 'max_delta_step': 0, 'min_child_weight': 3, 'subsample': 1.0, 'eta': 0.01, 'objective': 'binary:logistic', 'num_boost_round': 40, 'max_depth': 14, 'gamma': 0, 'lambda': 1},
 mean: 0.75911, std: 0.00275, params: {'colsample_bytree': 0.9, 'eval_metric': 'auc', 'max_delta_step': 0, 'min_child_weight': 3, 'subsample': 1.0, 'eta': 0.01, 'objective': 'binary:logistic', 'num_boost_round': 500, 'max_depth': 14, 'gamma': 0, 'lambda': 1},
 mean: 0.76943, std: 0.00052, params: {'colsample_bytree': 0.9, 'eval_metric': 'auc', 'max_delta_step': 0, 'min_child_weight': 3, 'subsample': 1.0, 'eta': 0.1, 'objective': 'binary:logistic', 'num_boost_round': 40, 'max_depth': 14, 'gamma': 0, 'lambda': 1},
 mean: 0.76943, std: 0.00052, params: {'colsample_bytree': 0.9, 'eval_metric': 'auc', 'max_delta_step': 0, 'min_child_weight': 3, 'subsample': 1.0, 'eta': 0.1, 'objective': 'binary:logistic', 'num_boost_round': 500, 'max_de

##2. GridSearch using RandomForest

In [16]:
if LocalTest:
    param_rf = {'n_estimators': [200],'max_depth':[20],'n_jobs': [1],'max_features':['auto'],'min_samples_leaf':[1,3]}        
else:    
    param_rf = {'n_estimators': [200,500],
            'max_depth':[20,None],
         'n_jobs': [-1],
         'max_features':['auto'],
        'min_samples_leaf':[1,3,5]}

In [17]:
xtrain=np.hstack((X,X1,X2,X3,X4, X5))

with open('features/XGB_335Features Oct172015_044255_AUC_0p76115.p', 'rb') as fid:
    rf_goodfeat1 = pickle.load(fid)

good_features=list(rf_goodfeat1)
xtrain = xtrain[:,good_features]

In [18]:
start_time=time.clock()

print('Starting GridSearch using RandomForest...')
clf_rf = RandomForestClassifier(random_state =100)
gs_rf = GridSearchCV(clf_rf,param_grid = param_rf,cv = StratifiedKFold(ytrain,n_folds = 3),scoring='roc_auc', n_jobs = njobs,verbose = 2)
gs_rf.fit(xtrain,ytrain)

total_time=time.clock()-start_time
print('Completed GridSearch using RandomForest')
print('Total running time is %d seconds\n' %total_time)

Starting GridSearch using RandomForest...
Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] max_features=auto, n_estimators=200, n_jobs=-1, max_depth=20, min_samples_leaf=1 
[CV]  max_features=auto, n_estimators=200, n_jobs=-1, max_depth=20, min_samples_leaf=1 -  40.9s
[CV] max_features=auto, n_estimators=200, n_jobs=-1, max_depth=20, min_samples_leaf=1 
[CV]  max_features=auto, n_estimators=200, n_jobs=-1, max_depth=20, min_samples_leaf=1 -  41.1s
[CV] max_features=auto, n_estimators=200, n_jobs=-1, max_depth=20, min_samples_leaf=1 
[CV]  max_features=auto, n_estimators=200, n_jobs=-1, max_depth=20, min_samples_leaf=1 -  40.8s
[CV] max_features=auto, n_estimators=500, n_jobs=-1, max_depth=20, min_samples_leaf=1 
[CV]  max_features=auto, n_estimators=500, n_jobs=-1, max_depth=20, min_samples_leaf=1 - 1.7min
[CV] max_features=auto, n_estimators=500, n_jobs=-1, max_depth=20, min_samples_leaf=1 
[CV]  max_features=auto, n_estimators=500, n_jobs=-1, max_depth=20, min_sample

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:   40.9s
[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed: 44.1min finished



Completed GridSearch using RandomForest
Total running time is 2815 seconds



In [19]:
print 'Best AUC Score of RF is {}'.format(gs_rf.best_score_)
print 'Best parameters set of RF:'
best_param_rf = gs_rf.best_estimator_.get_params()
for param_name in sorted(best_param_rf.keys()):
    print '\t%s: %r' % (param_name,best_param_rf[param_name])

Best AUC Score of RF is 0.76943287915
Best parameters set of RF:
	bootstrap: True
	class_weight: None
	criterion: 'gini'
	max_depth: None
	max_features: 'auto'
	max_leaf_nodes: None
	min_samples_leaf: 5
	min_samples_split: 2
	min_weight_fraction_leaf: 0.0
	n_estimators: 500
	n_jobs: -1
	oob_score: False
	random_state: 100
	verbose: 0
	warm_start: False


###dump the model into pickle

In [20]:
rf_opt = gs_rf.best_estimator_

str1=time.strftime("%b%d%Y_%H%M%S", time.localtime())
with open(('rf_opt_best_estimator' +str1 + '_AUC_' + '0p'+ str(int(gs_rf.best_score_*1e5))  +'.p'), 'wb') as f:
    pickle.dump(rf_opt, f, protocol = 2)
    
with open(('rf_opt_best_param' +str1 + '_AUC_' + '0p'+ str(int(gs_rf.best_score_*1e5))  +'.p'), 'wb') as f:
    pickle.dump(best_param_rf, f, protocol = 2) 

##3. GridSearch using RandomForest using another set of features

In [21]:
xtrain=np.hstack((X,X1,X2,X3,X4, X5))

with open('features/SDGRF_Oct172015125634_AUC_0.755531555054_with_cat_variables.p', 'rb') as fid:
    rf_goodfeat2 = pickle.load(fid)

good_features=list(rf_goodfeat2)
xtrain = xtrain[:,good_features]
print(xtrain.shape)

(145231L, 799L)


In [22]:
start_time=time.clock()

print('Starting GridSearch using RandomForest...')
clf_rf = RandomForestClassifier(random_state =100)
gs_rf = GridSearchCV(clf_rf,param_grid = param_rf,cv = StratifiedKFold(ytrain,n_folds = 3),scoring='roc_auc', n_jobs = njobs,verbose = 2)
gs_rf.fit(xtrain,ytrain)

total_time=time.clock()-start_time
print('Completed GridSearch using RandomForest')
print('Total running time is %d seconds\n' %total_time)

Starting GridSearch using RandomForest...
Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] max_features=auto, n_estimators=200, n_jobs=-1, max_depth=20, min_samples_leaf=1 
[CV]  max_features=auto, n_estimators=200, n_jobs=-1, max_depth=20, min_samples_leaf=1 - 1.2min
[CV] max_features=auto, n_estimators=200, n_jobs=-1, max_depth=20, min_samples_leaf=1 
[CV]  max_features=auto, n_estimators=200, n_jobs=-1, max_depth=20, min_samples_leaf=1 - 1.2min
[CV] max_features=auto, n_estimators=200, n_jobs=-1, max_depth=20, min_samples_leaf=1 
[CV]  max_features=auto, n_estimators=200, n_jobs=-1, max_depth=20, min_samples_leaf=1 - 1.2min
[CV] max_features=auto, n_estimators=500, n_jobs=-1, max_depth=20, min_samples_leaf=1 
[CV]  max_features=auto, n_estimators=500, n_jobs=-1, max_depth=20, min_samples_leaf=1 - 3.0min
[CV] max_features=auto, n_estimators=500, n_jobs=-1, max_depth=20, min_samples_leaf=1 
[CV]  max_features=auto, n_estimators=500, n_jobs=-1, max_depth=20, min_sample

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:  1.2min
[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed: 82.1min finished



Completed GridSearch using RandomForest
Total running time is 5222 seconds



In [23]:
print 'Best AUC Score of RF is {}'.format(gs_rf.best_score_)
print 'Best parameters set of RF:'
best_param_rf = gs_rf.best_estimator_.get_params()
for param_name in sorted(best_param_rf.keys()):
    print '\t%s: %r' % (param_name,best_param_rf[param_name])

Best AUC Score of RF is 0.768240704571
Best parameters set of RF:
	bootstrap: True
	class_weight: None
	criterion: 'gini'
	max_depth: None
	max_features: 'auto'
	max_leaf_nodes: None
	min_samples_leaf: 5
	min_samples_split: 2
	min_weight_fraction_leaf: 0.0
	n_estimators: 500
	n_jobs: -1
	oob_score: False
	random_state: 100
	verbose: 0
	warm_start: False


In [24]:
rf_opt = gs_rf.best_estimator_

str1=time.strftime("%b%d%Y_%H%M%S", time.localtime())
with open(('rf_900_opt_best_estimator' +str1 + '_AUC_' + '0p'+ str(int(gs_rf.best_score_*1e5))  +'.p'), 'wb') as f:
    pickle.dump(rf_opt, f, protocol = 2)
    
with open(('rf_900_opt_best_param' +str1 + '_AUC_' + '0p'+ str(int(gs_rf.best_score_*1e5))  +'.p'), 'wb') as f:
    pickle.dump(best_param_rf, f, protocol = 2) 