In [34]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.feature_selection import VarianceThreshold
from sklearn import preprocessing
from sklearn import cross_validation
from sklearn import linear_model
import warnings
import mytimer
import time
from sklearn.feature_extraction import DictVectorizer
from scipy import sparse
import pickle
import xgboost as xgb
# Rebind the name `mytimer` from the imported module to a Timer instance;
# after this line the module object is no longer reachable under that name.
mytimer=mytimer.Timer()
# Install blanket "ignore" warning filters. Both calls add an "ignore" filter
# with default arguments, so the second call is redundant with the first.
warnings.simplefilter("ignore")
warnings.filterwarnings("ignore")
from datetime import datetime
%matplotlib inline

In [36]:
# Directory that holds every input file read by this cell.
INPUT_DIR = './mostrecentinput/'


def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    loading finishes (the previous ``pickle.load(open(...))`` form left it open).
    """
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file -- only point this at files you produced yourself.
    with open(path, "rb") as fh:
        return pickle.load(fh)


# Column-wise concatenation (np.hstack) of the five arrays, in the same order
# as before. The parts are passed as temporaries, so no X1..X4 names are left
# behind and nothing needs to be deleted afterwards.
X = np.hstack((
    np.load(INPUT_DIR + 'nxtrain_standard_original0.npy'),
    np.load(INPUT_DIR + 'nxtrain_standard_derived0.npy'),
    _load_pickle(INPUT_DIR + "time_series_derived_train2.dat"),
    _load_pickle(INPUT_DIR + "time_series_original_train2.dat"),
    _load_pickle(INPUT_DIR + "cat_numeric_th60_train2.dat"),
))

# Target vector; presumably one label per row of X -- TODO confirm.
y = _load_pickle(INPUT_DIR + "ytrain2.dat")

In [38]:
def cv_loop(X, y, model, N, SEED=40, diagnostics=False, randomsplit=False):
    """Score ``model`` on ONE train/validation split of ``(X, y)``.

    Despite the name, no loop over folds takes place: exactly one split is
    evaluated and the "standard deviation" that is returned is a fixed
    placeholder. This keeps each call cheap for the greedy feature search.

    Parameters
    ----------
    X : array indexable as ``X[rows, :]``
    y : array indexable as ``y[rows]``
    model : object exposing ``fit_N_Predict(x_train, y_train, x_test, y_test)``
        whose first return value is the validation score.
    N : int
        Only used when ``randomsplit`` is true: the validation share of the
        random split is ``1/N``.
    SEED : int
        ``random_state`` for the random split.
    diagnostics : bool
        Accepted for interface compatibility; not used.
    randomsplit : bool
        False -> use the first fold of a 3-fold ``StratifiedKFold`` on ``y``.
        True  -> use a random ``train_test_split`` of the row indices.

    Returns
    -------
    (mean_auc, std_auc) : tuple
        ``mean_auc`` is the score reported by ``model.fit_N_Predict`` for the
        single split; ``std_auc`` is always the placeholder ``0.01``.
    """
    if randomsplit:
        # The previous code used `i*SEED` with `i` never defined, which raised
        # NameError as soon as randomsplit=True was requested. Only one split
        # is drawn, so SEED itself is the random state.
        train, cv = cross_validation.train_test_split(
            np.arange(len(y)), test_size=1.0 / float(N), random_state=SEED)
    else:
        # Take the first stratified fold only (equivalent to the old
        # for-loop that broke out after its first iteration).
        skf = cross_validation.StratifiedKFold(y, n_folds=3)
        train, cv = next(iter(skf))

    mean_auc, preds_cv = model.fit_N_Predict(X[train, :], y[train], X[cv, :], y[cv])
    std_auc = 0.01  # placeholder: a single split has no spread to report
    return mean_auc, std_auc

In [39]:
def feature_selection(model, params, Xtrain, ytrain, diagonistics=False, nth=5, features=None,
                      dump_dir="C:\\Users\\Wei\\Dropbox\\Bio_Physics_JailBreak\\Kaggle\\pickledump\\",
                      n_add=5):
    """Greedy forward feature selection driven by ``cv_loop`` scores.

    Each round scores every column that is not yet selected (current
    selection + that one column), then adds the ``n_add`` best-scoring columns
    to the selection. Rounds continue until at least two rounds have run and
    ``nth`` of them failed to beat the best score seen so far, or until no
    unselected column is left.

    Parameters
    ----------
    model : object with ``set_params(**kw)`` returning the model to score;
        it is handed to ``cv_loop`` unchanged otherwise.
    params : unused; kept so existing calls keep working.
    Xtrain : 2-D array indexable as ``Xtrain[:, cols]``.
    ytrain : targets passed to ``cv_loop`` (previously the global ``y`` was
        used by mistake and this argument was ignored).
    diagonistics : bool
        If true, plot the best AUC of every round and save it to
        ``plots/featsel_curve.png``.
    nth : int
        Number of non-improving rounds tolerated before stopping.
    features : iterable of int or None
        Column indices to start from. The argument is copied, never mutated.
    dump_dir : str or None
        Directory receiving one pickle of the current selection per round;
        ``None`` disables the checkpoints. The default is the original
        hard-coded location (an absolute, machine-specific path).
    n_add : int
        How many top-scoring columns are added per round.

    Returns
    -------
    list of int
        Sorted column indices selected up to and including the round that
        produced the highest score; columns added by later, non-improving
        rounds are dropped.
    """
    import os  # local import: only needed to build the checkpoint path

    print ("Performing greedy feature selection...")
    # Private copy: the old `features=set([])` default was shared between
    # calls and the caller's set was modified in place.
    good_features = set() if features is None else set(int(f) for f in features)
    best_features = set(good_features)
    score_hist = []
    SEED = 40
    count = 0
    highest_score = -9999
    MODEL = model.set_params(random_state=SEED)
    n_columns = Xtrain.shape[1]

    while len(score_hist) < 2 or count < nth:
        scores = []
        bestAUC = 0  # running best of this round, for progress output only
        print('%s\t ' % (datetime.now()))
        for f in range(n_columns):
            if f in good_features:
                continue
            feats = list(good_features) + [f]
            score = cv_loop(Xtrain[:, feats], ytrain, MODEL, N=2)
            if score[0] > bestAUC:
                bestAUC = score[0]
                print('\t\t\t\t feature #: %d\tcurrent AUC: %f' % (f, bestAUC))
            scores.append((score, f))

        if not scores:
            # Every column is already selected; nothing left to evaluate.
            break

        # Best candidates first. Only real candidates are ranked, so a round
        # with fewer than n_add candidates can no longer pull in padding.
        ranked = sorted(scores, reverse=True)
        best_score = ranked[0]
        best_score0 = best_score[0][0]

        improved = best_score0 > highest_score
        if improved:
            highest_score = best_score0
        else:
            count += 1
        print(highest_score)

        for _, col in ranked[:n_add]:
            good_features.add(int(col))
        if improved:
            # Remember the selection belonging to the best score so that the
            # trailing non-improving rounds can be discarded at the end.
            best_features = set(good_features)

        score_hist.append(best_score)
        print ("Current features: %s" % sorted(list(good_features)))
        print('Feature selection: %d features selected' % len(good_features))

        if dump_dir is not None:
            str1 = time.strftime("%b%d%Y_%H%M%S", time.localtime())
            # Score encoded as "0p" + first five decimals, e.g. 0.91234 -> 0p91234.
            str2 = '_AUC_0p' + str(int(highest_score * 1e5))
            fn = os.path.join(
                dump_dir,
                'XGB_' + str(len(good_features)) + 'Features ' + str1 + str2 + '.p')
            with open(fn, 'wb') as fh:
                pickle.dump(good_features, fh, protocol=2)

    # A set cannot be sliced (the old `good_features[0:nfeatures]` raised
    # TypeError); return the snapshot taken at the best round instead.
    selected = sorted(best_features)
    print('Feature selection: %d features selected' % len(selected))

    if diagonistics:
        import matplotlib.pyplot as plt  # was used without ever being imported
        plt.clf()
        # score_hist holds ((auc, std), feature) tuples; plot the AUC values.
        plt.plot(range(len(score_hist)), [s[0][0] for s in score_hist], 'ro--')
        plt.xlabel('number of selected features')
        plt.ylabel('AUC score')
        plt.title('Feature-selection curve')
        plt.savefig('plots/featsel_curve.png')

    return selected

In [40]:
class XGBClassfier():
    """Small adapter that trains an xgboost booster and reports validation AUC.

    NOTE(review): as written, training does not depend on any instance state:
    ``num_boost_round`` is accepted but never stored, ``niter`` is read but
    never used, and ``params`` (including anything added via ``set_params``)
    is never forwarded to xgboost. Training always runs 50 boosting rounds
    with the fixed settings in ``fit_N_Predict``.
    """

    def __init__(self, num_boost_round=10, **params):
        # `clf` is initialised here and not assigned anywhere else in this class.
        self.clf, self.params, self.niter = None, params, 100

    def fit_N_Predict(self, x_train,y_train,x_test,y_test):
        """Train on the first pair, predict the second, return ``(AUC, preds)``.

        ``AUC`` is ``metrics.roc_auc_score(y_test, preds)`` and ``preds`` is
        the booster's prediction for ``x_test``.
        """
        unused_rounds = self.niter  # read for parity with the attribute; not used below

        booster_cfg = dict(
            max_depth=14,
            eta=0.01,
            objective='binary:logistic',
            subsample=0.6,
            colsample_bytree=0.6,
            eval_metric='auc',
        )

        train_matrix = xgb.DMatrix(x_train, label=y_train)
        test_matrix = xgb.DMatrix(x_test, label=y_test)

        booster = xgb.train(booster_cfg, train_matrix, 50)
        predictions = booster.predict(test_matrix)
        return metrics.roc_auc_score(y_test, predictions), predictions

    def get_params(self, deep=True):
        """Return the stored keyword-parameter dict (``deep`` is ignored)."""
        return self.params

    def set_params(self, **params):
        """Merge ``params`` into the stored dict and return ``self`` for chaining."""
        for key, value in params.items():
            self.params[key] = value
        return self

In [None]:
# Column indices used as the starting selection for the greedy search.
features0 = {
    43, 49, 129, 130, 131, 161, 163, 170, 188, 215,
    218, 246, 286, 305, 316, 396, 427, 460, 485, 489,
    552, 555, 558, 595, 709, 735, 748, 751, 752, 774,
    778, 801, 810, 832, 839, 916, 917, 918, 950, 967,
    969, 982, 983, 985, 1019, 1022, 1027, 1050, 1075, 1116,
    1142, 1144, 1162, 1176, 1254, 1278, 1299, 1306, 1356, 1365,
    1372, 1395, 1398, 1402, 1436,
}

In [None]:
# Run the greedy search on (X, y), starting from the indices in features0 and
# tolerating up to 20 non-improving rounds.
clf = XGBClassfier()
params = 0  # placeholder value for the `params` argument
good_features = feature_selection(
    clf,
    params,
    X,
    y,
    diagonistics=False,
    nth=20,
    features=features0,
)