In [23]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from scipy import stats,sparse
from sklearn.base import TransformerMixin
from datetime import datetime as dt
from math import isnan
from numpy import ma
import cPickle as pickle
import xgboost as xgb
import time
from pandas import *
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier 

In [24]:
import json
from sklearn.metrics import roc_curve, auc
from re import sub
from collections import defaultdict

In [25]:
from sklearn.cross_validation import StratifiedKFold,cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report,confusion_matrix

In [26]:
class XGBoostClassifier(object):
    """Small scikit-learn style wrapper around ``xgb.train`` for binary classification.

    The objective is always forced to ``'binary:logistic'``. ``num_boost_round``
    is kept as an attribute (it is an argument of ``xgb.train``, not a booster
    parameter); every other keyword is stored in ``self.params`` and handed to
    ``xgb.train`` unchanged.

    Inputs ``X`` may be pandas objects (their ``.values`` array is used) or
    anything ``xgb.DMatrix`` accepts directly, such as a NumPy array.
    """

    def __init__(self, num_boost_round=40, **params):
        self.clf = None
        self.num_boost_round = num_boost_round
        self.params = params
        self.params.update({'objective': 'binary:logistic'})

    @staticmethod
    def _as_array(X):
        """Return ``X.values`` when X exposes a non-callable ``values`` attribute, else X itself."""
        values = getattr(X, 'values', None)
        if values is None or callable(values):
            return X
        return values

    def fit(self, X, y, num_boost_round=None):
        """Train the booster on (X, y) and return ``self``.

        ``num_boost_round`` overrides the instance-level setting for this call
        when it is truthy.
        """
        num_boost_round = num_boost_round or self.num_boost_round
        dtrain = xgb.DMatrix(self._as_array(X), label=y)
        self.clf = xgb.train(params=self.params, dtrain=dtrain, num_boost_round=num_boost_round)
        return self

    def predict(self, X):
        """Return the index (0 or 1) of the more probable class for every row of X."""
        # Hand X over untouched: predict_proba does the pandas -> array
        # conversion itself, so it must not be unwrapped twice.
        proba = self.predict_proba(X)
        return np.argmax(proba, axis=1)

    def predict_proba(self, X):
        """Return an (n_rows, 2) array; column 1 is the booster output, column 0 its complement."""
        data = self._as_array(X)
        ypreds = np.zeros((data.shape[0], 2))
        dtest = xgb.DMatrix(data)
        ypreds[:, 1] = self.clf.predict(dtest)
        ypreds[:, 0] = 1 - ypreds[:, 1]        # return the proba for both classes
        return ypreds

    def score(self, X, y):
        """Return the inverse of the log loss on (X, y), so that larger is better."""
        loss = self.logloss(y, self.predict_proba(X))
        if loss == 0:
            # A perfect fit has zero loss; report it as the best possible score
            # instead of dividing by zero.
            return float('inf')
        return 1 / loss

    def get_params(self, deep=True):
        """Return a copy of the booster parameters plus ``num_boost_round``.

        Including ``num_boost_round`` lets ``XGBoostClassifier(**est.get_params())``
        rebuild an equivalent estimator. ``deep`` is accepted for API
        compatibility and ignored.
        """
        params = dict(self.params)
        params['num_boost_round'] = self.num_boost_round
        return params

    def set_params(self, **params):
        """Update parameters in place and return ``self``.

        ``objective`` is ignored (it stays ``'binary:logistic'``) and
        ``num_boost_round`` updates the attribute used by ``fit`` instead of
        being stored among the booster parameters.
        """
        params.pop('objective', None)
        if 'num_boost_round' in params:
            self.num_boost_round = params.pop('num_boost_round')
        self.params.update(params)
        return self

    def logloss(self, y_true, Y_pred):
        """Mean negative log-probability assigned to the true label of each row.

        Labels are mapped to columns of ``Y_pred`` by their rank among the
        sorted distinct values of ``y_true``, so ``y_true`` has to contain
        every class for the mapping to line up with the columns.
        A non-positive probability for the true label contributes ``-inf`` to
        the log sum, which makes the loss ``+inf``.

        Raises ValueError on empty input or when the two arguments differ in length.
        """
        labels = list(y_true)
        probs = np.asarray(Y_pred, dtype=float)
        if len(probs) == 0:
            raise ValueError('logloss requires at least one prediction')
        if len(labels) != len(probs):
            raise ValueError('y_true has %d entries but Y_pred has %d rows'
                             % (len(labels), len(probs)))
        label2num = dict((name, i) for i, name in enumerate(sorted(set(labels))))
        cols = np.array([label2num[label] for label in labels])
        picked = probs[np.arange(len(cols)), cols]
        # Only take the log where it is defined; everything else stays at -inf.
        positive = picked > 0
        logs = np.full(picked.shape, -np.inf)
        logs[positive] = np.log(picked[positive])
        return float(-logs.sum() / len(probs))

## 0. Load data

In [27]:
# Run-wide switches read by the loading, grid-search and prediction cells.
LocalTest=True           # True: small CSV sample with single-candidate grids; False: pre-built feature arrays with the full grids
SelectedFeature=False    # whether to restrict the pre-built arrays to a pickled list of feature indices
njobs = 1                # passed as n_jobs to both GridSearchCV calls
nrows=5000               # row limit given to read_csv (and used to trim test_ID) when LocalTest is True

In [28]:
if LocalTest:
    # NOTE: machine-specific absolute path; adjust when running elsewhere.
    trainfile = 'C:/Huaixiu/Kaggle/GridSearch/data/train-5000.csv'
    xtrain = read_csv(trainfile, nrows=nrows)
    ytrain = xtrain['target']

    # Drop the first and the last column by position (.iloc replaces the
    # deprecated .ix indexer, which behaved positionally for this slice).
    # NOTE(review): this only removes 'target' if it is the last column of the
    # CSV; the near-perfect AUCs reported further down make that worth
    # confirming.
    xtrain = xtrain.iloc[:, 1:-1]

else:
    # Feature blocks are stacked column-wise in exactly this order.
    feature_blocks = [
        np.load('data/nxtrain_standard_original0.npy'),
        np.load('data/nxtrain_standard_derived0.npy'),
    ]
    for block_path in ("data/time_series_derived_train2.dat",
                       "data/time_series_original_train2.dat",
                       "data/cat_numeric_th60_train2.dat"):
        # 'with' closes every file handle instead of leaking it.
        with open(block_path, "rb") as fid:
            feature_blocks.append(pickle.load(fid))
    with open("data/ytrain2.dat", "rb") as fid:
        ytrain = pickle.load(fid)
    xtrain = np.hstack(feature_blocks)
    del feature_blocks

    if SelectedFeature:
        with open('data/XGB_80Features Oct142015_221850_AUC_0p76183.p', 'rb') as fid:
            xgb_goodfeat = pickle.load(fid)

        # Keep only the columns listed in the pickled feature selection.
        good_features = list(xgb_goodfeat)
        xtrain = xtrain[:, good_features]

In [29]:
# Sanity check: dimensions of the feature matrix and of the label vector.
print(xtrain.shape, ytrain.shape)

((5000, 1709), (5000L,))


## 1. Grid search: XGBoost

In [30]:
if LocalTest:
    # Single-candidate grid: just enough to exercise the pipeline quickly.
    param_xgb = {
        'num_boost_round': [2],
        'eta': [0.3],
        'max_depth': [1],
        'subsample': [0.9],
        'colsample_bytree': [0.9],
        'min_child_weight':[1],
        'gamma':[10],
        'objective':['binary:logistic'],
        'eval_metric': ['auc']
    }
else:
    param_xgb = {
        'num_boost_round': [40,100],
        'eta': [0.01,0.1,0.3],
        # Keep exactly one 'max_depth' entry: a dict literal retains only the
        # last duplicate key, so a second entry silently replaces the first.
        # [14] is the value that was effectively searched; add further depths
        # (e.g. 6, 12) to this list to widen the search.
        'max_depth': [14],
        'subsample': [1.0],
        'colsample_bytree': [0.9, 1.0],
        'min_child_weight':[1,3],
        'gamma':[0,1],
        'objective':['binary:logistic'],
        'eval_metric': ['auc']
    }

In [31]:
# Wall-clock timing: time.clock() measures CPU time on some platforms and no
# longer exists in recent Python versions, time.time() behaves the same everywhere.
start_time = time.time()

print('Starting GridSearch using XGBoost...')
clf_xgb = XGBoostClassifier()
# 3-fold stratified cross-validation, candidates ranked by ROC AUC.
gs_xgb = GridSearchCV(
    clf_xgb,
    param_grid=param_xgb,
    cv=StratifiedKFold(ytrain, n_folds=3),
    scoring='roc_auc',
    n_jobs=njobs,
    verbose=2,
)
gs_xgb.fit(xtrain, ytrain)

total_time = time.time() - start_time
print('Completed GridSearch using XGBoost')
print('Total running time is %d seconds\n' % total_time)

Starting GridSearch using XGBoost...
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] colsample_bytree=0.9, eval_metric=auc, min_child_weight=1, subsample=0.9, eta=0.3, objective=binary:logistic, num_boost_round=2, max_depth=1, gamma=10 
[CV]  colsample_bytree=0.9, eval_metric=auc, min_child_weight=1, subsample=0.9, eta=0.3, objective=binary:logistic, num_boost_round=2, max_depth=1, gamma=10 -   3.3s
[CV] colsample_bytree=0.9, eval_metric=auc, min_child_weight=1, subsample=0.9, eta=0.3, objective=binary:logistic, num_boost_round=2, max_depth=1, gamma=10 
[CV]  colsample_bytree=0.9, eval_metric=auc, min_child_weight=1, subsample=0.9, eta=0.3, objective=binary:logistic, num_boost_round=2, max_depth=1, gamma=10 -   3.9s
[CV] colsample_bytree=0.9, eval_metric=auc, min_child_weight=1, subsample=0.9, eta=0.3, objective=binary:logistic, num_boost_round=2, max_depth=1, gamma=10 
[CV]  colsample_bytree=0.9, eval_metric=auc, min_child_weight=1, subsample=0.9, eta=0.3, objective=bi

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    3.3s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   11.3s finished



Completed GridSearch using XGBoost
Total running time is 16 seconds



In [32]:
# Report the best cross-validated AUC and the winning parameter set.
# print(...) with a single argument behaves identically under Python 2 and 3
# and matches the call form used by the other cells.
print('Best AUC Score of XGB is {}'.format(gs_xgb.best_score_))
print('Best parameters set of XGB:')
best_param_xgb = gs_xgb.best_estimator_.get_params()
for param_name in sorted(best_param_xgb.keys()):
    print('\t%s: %r' % (param_name, best_param_xgb[param_name]))

Best AUC Score of XGB is 1.0
Best parameters set of XGB:
	colsample_bytree: 0.9
	eta: 0.3
	eval_metric: 'auc'
	gamma: 10
	max_depth: 1
	min_child_weight: 1
	num_boost_round: 2
	objective: 'binary:logistic'
	subsample: 0.9


### Dump the model to pickle

In [11]:
xgb_opt = gs_xgb.best_estimator_

# Persist the best estimator and its parameter dict, one pickle file each.
for payload, pkl_name in ((xgb_opt, 'xgb_opt.pkl'),
                          (best_param_xgb, 'xgb_best_param.pkl')):
    with open(pkl_name, 'wb') as fid:
        pickle.dump(payload, fid, protocol=2)

In [12]:
# Read the stored parameters back as a round-trip check; the 'with' block
# closes the file handle, which a bare open() inside pickle.load() never does.
with open("xgb_best_param.pkl", "rb") as fid:
    bestparam = pickle.load(fid)
bestparam

{'colsample_bytree': 0.9,
 'eta': 0.3,
 'eval_metric': 'auc',
 'gamma': 0,
 'max_depth': 5,
 'min_child_weight': 1,
 'num_boost_round': 10,
 'objective': 'binary:logistic',
 'subsample': 0.9}

## 2. Grid search: RandomForest

In [62]:
# Random-forest search space: two candidates for the local smoke test,
# the wider grid otherwise.
if LocalTest:
    param_rf = dict(
        n_estimators=[200],
        max_depth=[20],
        n_jobs=[1],
        max_features=['auto'],
        min_samples_leaf=[1, 3],
    )
else:
    param_rf = dict(
        n_estimators=[200, 500],
        max_depth=[20, None],
        n_jobs=[-1],
        max_features=['auto'],
        min_samples_leaf=[1, 3, 5],
    )

In [63]:
# Wall-clock timing: time.clock() measures CPU time on some platforms and no
# longer exists in recent Python versions, time.time() behaves the same everywhere.
start_time = time.time()

print('Starting GridSearch using RandomForest...')
clf_rf = RandomForestClassifier(random_state=100)
# 3-fold stratified cross-validation, candidates ranked by ROC AUC.
gs_rf = GridSearchCV(
    clf_rf,
    param_grid=param_rf,
    cv=StratifiedKFold(ytrain, n_folds=3),
    scoring='roc_auc',
    n_jobs=njobs,
    verbose=2,
)
gs_rf.fit(xtrain, ytrain)

total_time = time.time() - start_time
print('Completed GridSearch using RandomForest')
print('Total running time is %d seconds\n' % total_time)

Starting GridSearch using RandomForest...
Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] max_features=auto, n_estimators=200, n_jobs=1, max_depth=20, min_samples_leaf=1 
[CV]  max_features=auto, n_estimators=200, n_jobs=1, max_depth=20, min_samples_leaf=1 -  12.0s
[CV] max_features=auto, n_estimators=200, n_jobs=1, max_depth=20, min_samples_leaf=1 
[CV]  max_features=auto, n_estimators=200, n_jobs=1, max_depth=20, min_samples_leaf=1 -  18.8s
[CV] max_features=auto, n_estimators=200, n_jobs=1, max_depth=20, min_samples_leaf=1 
[CV]  max_features=auto, n_estimators=200, n_jobs=1, max_depth=20, min_samples_leaf=1 -  15.1s
[CV] max_features=auto, n_estimators=200, n_jobs=1, max_depth=20, min_samples_leaf=3 
[CV]  max_features=auto, n_estimators=200, n_jobs=1, max_depth=20, min_samples_leaf=3 -  17.1s
[CV] max_features=auto, n_estimators=200, n_jobs=1, max_depth=20, min_samples_leaf=3 
[CV]  max_features=auto, n_estimators=200, n_jobs=1, max_depth=20, min_samples_leaf=3 -  

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:   12.1s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  1.6min finished



Completed GridSearch using RandomForest
Total running time is 131 seconds



In [64]:
# Report the best cross-validated AUC and the winning parameter set.
# print(...) with a single argument behaves identically under Python 2 and 3
# and matches the call form used by the other cells.
print('Best AUC Score of RF is {}'.format(gs_rf.best_score_))
print('Best parameters set of RF:')
best_param_rf = gs_rf.best_estimator_.get_params()
for param_name in sorted(best_param_rf.keys()):
    print('\t%s: %r' % (param_name, best_param_rf[param_name]))

Best AUC Score of RF is 0.994065966861
Best parameters set of RF:
	bootstrap: True
	class_weight: None
	criterion: 'gini'
	max_depth: 20
	max_features: 'auto'
	max_leaf_nodes: None
	min_samples_leaf: 1
	min_samples_split: 2
	min_weight_fraction_leaf: 0.0
	n_estimators: 200
	n_jobs: 1
	oob_score: False
	random_state: 100
	verbose: 0
	warm_start: False


### Dump the model to pickle

In [65]:
rf_opt = gs_rf.best_estimator_

# Persist the best estimator and its parameter dict, one pickle file each.
for payload, pkl_name in ((rf_opt, 'rf_opt.pkl'),
                          (best_param_rf, 'rf_best_param.pkl')):
    with open(pkl_name, 'wb') as fid:
        pickle.dump(payload, fid, protocol=2)

In [66]:
# Read the stored parameters back as a round-trip check; the 'with' block
# closes the file handle, which a bare open() inside pickle.load() never does.
with open("rf_best_param.pkl", "rb") as fid:
    bestparam = pickle.load(fid)
bestparam

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 20,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': 100,
 'verbose': 0,
 'warm_start': False}

## 3. Make predictions on the test set

In [67]:
# Refit both models on the complete training set with the best parameters
# found by the grid searches.
clf0 = XGBoostClassifier()
clf0.set_params(**best_param_xgb)
XGB = clf0.fit(xtrain, ytrain)


clf1 = RandomForestClassifier(random_state=100)
clf1.set_params(**best_param_rf)
RF = clf1.fit(xtrain, ytrain)

# Release the training data. After this point the cell cannot be re-run
# without executing the data-loading cell again.
del xtrain
del ytrain

with open('data/xtest_ID.pkl', 'rb') as fid:
    test_ID = pickle.load(fid)

# load test data set
if LocalTest:
    # The local smoke test scores the training sample itself.
    # NOTE: machine-specific absolute path; adjust when running elsewhere.
    testfile = 'C:/Huaixiu/Kaggle/GridSearch/data/train-5000.csv'
    xtest = read_csv(testfile, nrows=nrows)
    # Same positional column slice as used for training (.iloc replaces the
    # deprecated .ix indexer).
    xtest = xtest.iloc[:, 1:-1]

    test_ID = test_ID[:nrows]

else:
    # Feature blocks are stacked column-wise in the same order as for training.
    feature_blocks = [
        np.load('data/nxtest_standard_original0.npy'),
        np.load('data/nxtest_standard_derived0.npy'),
    ]
    for block_path in ("data/time_series_derived_test2.dat",
                       "data/time_series_original_test2.dat",
                       "data/cat_numeric_th60_test2.dat"):
        # 'with' closes every file handle instead of leaking it.
        with open(block_path, "rb") as fid:
            feature_blocks.append(pickle.load(fid))
    xtest = np.hstack(feature_blocks)
    del feature_blocks

    if SelectedFeature:
        # The test columns must be selected with the same feature list that
        # was applied to the training matrix, otherwise the models are scored
        # on columns they were not trained on.
        with open('data/XGB_80Features Oct142015_221850_AUC_0p76183.p', 'rb') as fid:
            xgb_goodfeat = pickle.load(fid)

        good_features = list(xgb_goodfeat)
        xtest = xtest[:, good_features]

# make final predictions (probability of the positive class)
ypreds_xgb = XGB.predict_proba(xtest)[:,1]
ypreds_rf = RF.predict_proba(xtest)[:,1]

In [68]:
# Store the raw probability vectors of both models, one pickle file each.
for payload, pkl_name in ((ypreds_xgb, 'ypreds_xgb_gs_basic.pkl'),
                          (ypreds_rf, 'ypreds_rf_gs_basic.pkl')):
    with open(pkl_name, 'wb') as fid:
        pickle.dump(payload, fid, protocol=2)

In [69]:
# generate submission files

def save_results(test_ID, predictions, filename):
    """Write one ``ID,target`` row per prediction to ``filename`` in CSV format.

    Parameters
    ----------
    test_ID : sequence of integer-like row identifiers.
    predictions : sequence of numbers, aligned position by position with ``test_ID``.
    filename : path of the CSV file to create (an existing file is overwritten).

    Raises
    ------
    ValueError
        If ``test_ID`` and ``predictions`` differ in length. The check runs
        before the file is opened, so a mismatch never leaves a truncated or
        misaligned file behind.
    """
    n_ids, n_preds = len(test_ID), len(predictions)
    if n_ids != n_preds:
        raise ValueError("test_ID has %d entries but predictions has %d"
                         % (n_ids, n_preds))
    with open(filename, 'w') as f:
        f.write("ID,target\n")
        # Pair the values by position rather than by index lookup.
        for row_id, prob in zip(test_ID, predictions):
            f.write("%d,%f\n" % (row_id, prob))
    
# One submission file per model, both keyed by the same test IDs.
for model_preds, csv_name in ((ypreds_xgb, 'ypreds_xgb_gs_basic.csv'),
                              (ypreds_rf, 'ypreds_rf_gs_basic.csv')):
    save_results(test_ID, model_preds, csv_name)