In [8]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.feature_selection import VarianceThreshold
from sklearn import preprocessing
from sklearn import cross_validation
from sklearn import linear_model
import warnings

import time
from sklearn.feature_extraction import DictVectorizer
from scipy import sparse
import pickle

# Suppress every warning for the rest of the session; both calls register an
# "ignore" action, so library deprecation notices will not be shown.
warnings.simplefilter("ignore")
warnings.filterwarnings("ignore")
from datetime import datetime
# IPython magic: render matplotlib figures inside the notebook output.
%matplotlib inline
# Wall-clock reference taken when this cell runs; elapsed minutes can be
# derived from it as shown in the commented expression below.
t = time.time()
#elapsed = (time.time() - t)/60

In [38]:
# Build the training matrix by stacking four feature blocks column-wise:
# two .npy arrays followed by two pickled objects read with pandas.
# NOTE(review): np.hstack requires all four blocks to have the same number of
# rows in the same order -- confirm against the code that produced the files.
X = np.hstack((
    np.load('./mostrecentinput/nxtrain_standard_original0.npy'),
    np.load('./mostrecentinput/nxtrain_standard_derived0.npy'),
    pd.read_pickle("./mostrecentinput/time_series_derived_standard_train2.dat"),
    pd.read_pickle("./mostrecentinput/time_series_original_standard_train2.dat"),
))

# Labels. The file is opened in a `with` block so the handle is closed
# deterministically instead of being left to the garbage collector.
# SECURITY: unpickling executes arbitrary code -- only load files you produced.
with open("./mostrecentinput/ytrain2.dat", "rb") as labels_file:
    y = pickle.load(labels_file)

## Defining cross validation and feature selection functions

In [23]:
# Three stratified, shuffled folds with a fixed seed (legacy sklearn API).
kf = cross_validation.StratifiedKFold(y, n_folds=3, shuffle=True, random_state=42)

# Only the first fold is used: it provides one fixed train / hold-out split
# that the evaluation cells further down share.
train_ind, test_ind = next(iter(kf))
x_train, x_test = X[train_ind, :], X[test_ind, :]
y_train, y_test = y[train_ind], y[test_ind]

In [41]:
def cv_loop(X, y, model, N, SEED=40, diagnostics=False, randomsplit=False, method="Logistic"):
    """Cross-validate ``model`` on ``(X, y)`` and summarise the per-fold ROC AUC.

    Parameters
    ----------
    X : array indexable as ``X[rows, :]`` (e.g. a 2-D numpy array)
        Feature matrix.
    y : array indexable by an index array
        Class labels, one per row of ``X``.
    model : object
        For ``method="Logistic"`` it must provide ``fit`` and
        ``predict_proba``. For ``method="Aggressive"`` or ``"SVM"`` it must
        provide ``fit_N_Predict(x_train, y_train, x_test, y_test)`` returning
        ``(auc, predictions)``.
    N : int
        Number of folds. With ``randomsplit=True`` this is the number of
        random splits, each holding out ``1/N`` of the rows.
    SEED : int
        Base seed; random split ``i`` uses ``random_state = i * SEED``.
    diagnostics : bool
        If true, plot the ROC curve and the learning curve for the first fold.
        This calls ``plot_roc`` and ``learning_curve``, which are not defined
        in the visible part of this notebook and must exist in the namespace.
    randomsplit : bool
        Use a random train/test split per iteration instead of the stratified
        K-fold partition.
    method : {"Logistic", "Aggressive", "SVM"}
        Selects how ``model`` is fitted and scored (see ``model``).

    Returns
    -------
    (mean_auc, std_auc) : tuple of float
        Mean and standard deviation of the AUC over all ``N`` folds.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method not in ("Logistic", "Aggressive", "SVM"):
        raise ValueError("Unknown method %r; expected 'Logistic', 'Aggressive' or 'SVM'" % (method,))

    AUC = np.zeros(N)
    # Legacy sklearn API: the splitter takes the labels and is directly iterable.
    skf = cross_validation.StratifiedKFold(y, n_folds=N)

    for i, (train, cv) in enumerate(skf):
        if randomsplit:
            # Replace the stratified fold by a random split of the row index.
            train, cv = cross_validation.train_test_split(
                range(len(y)), test_size=1.0 / float(N), random_state=i * SEED)

        fpr = tpr = None
        if method == "Logistic":
            fitted = model.fit(X[train, :], y[train])
            preds_cv = fitted.predict_proba(X[cv, :])[:, 1]
            fpr, tpr, _ = metrics.roc_curve(y[cv], preds_cv)
            roc_auc = metrics.auc(fpr, tpr)
        else:
            # Ensemble wrappers fit and score in a single call.
            roc_auc, preds_cv = model.fit_N_Predict(X[train, :], y[train], X[cv, :], y[cv])

        AUC[i] = roc_auc

        # Plot learning curve and ROC curve for diagnostic purposes.
        if diagnostics and i == 0:  # only plot for the first fold
            if fpr is None:
                # The ensemble branch does not produce ROC points itself.
                fpr, tpr, _ = metrics.roc_curve(y[cv], preds_cv)
            print("plotting ROC curve")
            plot_roc(fpr, tpr)
            print("plotting learning curve")
            learning_curve(model, X[train, :], y[train], X[cv, :], y[cv])

    return float(AUC.mean()), float(AUC.std())

In [44]:
def feature_selection(model, params, Xtrain, ytrain, diagonistics=False, 
                      nth=2,FeatsPerRun=20,good_features = set([]),ModelName="Logistic"):
    """Greedy forward feature selection that adds a batch of columns per round.

    Each round scores every column that is not yet selected by calling
    ``cv_loop`` on the currently selected columns plus that one candidate,
    then adds the ``FeatsPerRun`` best-scoring candidates to the selection.
    The search runs at least two rounds and stops once ``nth`` rounds have
    failed to beat the best single-candidate score seen so far, or when no
    unselected columns remain. After every round the current selection is
    pickled to the working directory as ``Feats<count><timestamp>_AUC_0p<score>.p``.

    Parameters
    ----------
    model : object
        Estimator handed to ``cv_loop``; must provide
        ``set_params(random_state=...)`` returning the estimator.
    params : any
        Accepted for interface compatibility; not used.
    Xtrain : 2-D array indexable as ``Xtrain[:, columns]``
    ytrain : label vector passed through to ``cv_loop``
    diagonistics : bool
        If true, save a plot of the best score per round to
        ``plots/featsel_curve.png`` (the directory must already exist).
    nth : int
        Number of non-improving rounds tolerated before stopping.
    FeatsPerRun : int
        Maximum number of columns added per round.
    good_features : iterable of int
        Column indices to start from. The argument is copied, so neither the
        caller's collection nor the shared default is modified.
    ModelName : str
        Forwarded to ``cv_loop`` as ``method``.

    Returns
    -------
    list of int
        Sorted indices of every selected column, including the starting ones.
        NOTE(review): columns added in non-improving rounds are kept; trim
        them from the per-round checkpoints if that is not wanted.
    """
    print ("Performing greedy feature selection...")
    SEED = 40
    started = time.time()  # local clock, so the elapsed report needs no notebook global
    n_cols = Xtrain.shape[1]
    progress_every = max(1, int(np.round(n_cols / 20.0)))  # roughly 20 progress lines per round
    selected = set(int(f) for f in good_features)  # private copy of the seed set
    score_hist = []
    count = 0
    highest_score = -9999
    MODEL = model.set_params(random_state=SEED)

    while len(score_hist) < 2 or count < nth:
        candidates = [f for f in range(n_cols) if f not in selected]
        if not candidates:
            print('All %d features are already selected; stopping.' % n_cols)
            break

        print('%s\t ' % (datetime.now()))
        base = list(selected)
        scores = []
        bestAUC = 0
        for f in candidates:
            score = cv_loop(Xtrain[:, base + [f]], ytrain, MODEL, N=2, method=ModelName)
            if score[0] > bestAUC:
                bestAUC = score[0]
                print('\t\t\t\t feature #: %d\t\tcurrent AUC: %f' % (f, bestAUC))
            elif f % progress_every == 0:
                print('\t\t\t\t feature #: %d\t\t: ' % (f))
            scores.append((score, f))

        # Entries are ((mean_auc, std_auc), feature); tuple ordering ranks by mean AUC first.
        best_score = max(scores)
        best_score0 = best_score[0][0]

        if best_score0 > highest_score:
            highest_score = best_score0
        else:
            count += 1
        print(highest_score)

        # Add the top FeatsPerRun candidates (fewer if not that many remain).
        ranked = np.argsort([entry[0][0] for entry in scores])[::-1]
        for k in ranked[:FeatsPerRun]:
            selected.add(candidates[k])

        # Report the score of the enlarged selection as a whole.
        score = cv_loop(Xtrain[:, list(selected)], ytrain, MODEL, N=2, method=ModelName)
        print (score)

        score_hist.append(best_score)

        print ("Current features: %s" % list(selected))
        print('Feature selection: %d features selected' % len(selected))

        # Checkpoint the current selection so a long run can be resumed.
        str1 = time.strftime("%b%d%Y_%H%M%S", time.localtime())
        fn = 'Feats' + str(len(selected)) + str1 + '_AUC_' + '0p' + str(int(highest_score*1e5)) + '.p'
        with open(fn, 'wb') as checkpoint:
            pickle.dump(selected, checkpoint, protocol=2)
        elapsed = (time.time() - started)/60
        print(elapsed)

    result = sorted(selected)
    print('Feature selection: %d features selected' % len(result))

    if diagonistics:
        # Imported here because the notebook never imports pyplot at the top.
        import matplotlib.pyplot as plt
        plt.clf()
        plt.plot(range(len(score_hist)), [entry[0][0] for entry in score_hist], 'ro--')
        plt.xlabel('number of selected features')
        plt.ylabel('AUC score')
        plt.title('Feature-selection curve')
        plt.savefig('plots/featsel_curve.png')

    return result

## SGD - Logistic regression

In [46]:
# Seed column indices handed to feature_selection() as its starting selection.
# NOTE(review): presumably the result of an earlier selection run -- confirm provenance.
good_features = {
    1024, 1027, 1030, 11, 534, 1049, 1050, 551, 552, 43,
    555, 558, 49, 1075, 64, 595, 88, 1116, 93, 1123,
    1126, 1142, 1144, 636, 637, 129, 130, 131, 1154, 134,
    647, 1162, 1164, 141, 1176, 161, 163, 165, 170, 178,
    693, 188, 709, 198, 723, 215, 218, 735, 739, 1253,
    1254, 746, 1258, 748, 751, 752, 1265, 246, 1272, 1278,
    769, 260, 774, 1286, 778, 781, 270, 1299, 277, 1306,
    286, 801, 291, 810, 1326, 305, 309, 311, 1336, 316,
    832, 834, 839, 1356, 1365, 341, 342, 856, 1372, 860,
    363, 1389, 1395, 884, 1398, 1401, 1402, 901, 396, 913,
    916, 917, 918, 919, 410, 1436, 415, 929, 427, 950,
    960, 967, 969, 460, 1489, 982, 983, 1496, 985, 1498,
    485, 998, 489, 1019, 1022,
}

In [None]:
# Greedy feature selection scored with an averaged-SGD logistic regression,
# starting from the seed set and adding up to 30 columns per round.
params = 0
clf = linear_model.SGDClassifier(loss='log', average=True, n_iter=20)
good_features1 = feature_selection(
    clf, params, X, y,
    good_features=good_features,
    ModelName="Logistic",
    FeatsPerRun=30,
    nth=10,
    diagonistics=False
)

In [None]:
# Hold-out evaluation of an averaged-SGD logistic regression on the fixed split.
print ('start Logistic regression using Scikit learn')

# NOTE(review): `mytimer` is not defined in the visible part of this notebook --
# confirm it is created before this cell runs.
mytimer.startTimer()
clf = linear_model.SGDClassifier(loss='log', average=True, n_iter=100)
clf.fit(x_train, y_train)
preds = clf.predict_proba(x_test)[:, 1]  # probability of the positive class

# Score once and reuse the value for both the report and the file name.
AUC_Logistic = metrics.roc_auc_score(y_test, preds)
print(AUC_Logistic)
print ('Logistic regression using Scikit learn completed')
ElapsedTime, sec = mytimer.endTimer(display=True)

# Persist the hold-out predictions; timestamp, AUC and run time go into the name.
str1 = time.strftime("%b%d%Y_%H%M%S", time.localtime())
preds.dump(
    './MetaResults/Logistic_' + str1
    + '_AUC_' + '0p' + str(int(AUC_Logistic*1e5))
    + 'Time' + str(ElapsedTime)
    + '_WithCleanedData.p'
)


## Passive Aggressive Classifier

In [None]:
class AggressiveClassfier():
    """Average the predictions of several independently fitted passive-aggressive models.

    The wrapper exposes ``fit_N_Predict`` plus minimal ``get_params`` /
    ``set_params`` methods; it does not implement ``fit`` or ``predict_proba``.

    Recognised keyword parameters (all stored in ``self.params``):

    * ``niter`` -- number of models to fit and average (default 100).
    * ``random_state`` -- optional integer seed; member ``i`` is seeded with
      ``random_state + i`` so a run is repeatable while members still differ.
      Any other value is passed to every member unchanged.
    """

    def __init__(self, num_boost_round=10, **params):
        # `num_boost_round` is accepted for interface compatibility but not used.
        self.clf = None
        self.params = params
        self.niter = int(params.get('niter', 100))
 
    def fit_N_Predict(self, x_train,y_train,x_test,y_test):
        """Fit ``niter`` models on the training data and score their averaged prediction.

        Returns ``(AUC, preds)`` where ``preds`` is the mean over all members of
        ``predict(x_test)`` and ``AUC`` is ``roc_auc_score(y_test, preds)``.
        ``predict`` yields class labels, so ``preds`` is an average of labels
        (presumably a vote fraction for 0/1 labels -- confirm label encoding).

        Raises ``ValueError`` if ``niter`` is smaller than 1.
        """
        niter = self.niter
        if niter < 1:
            raise ValueError("niter must be a positive integer, got %r" % (niter,))
        seed = self.params.get('random_state')
        seed_is_int = isinstance(seed, (int, np.integer))
        preds = 0
        for i in range(niter):
            member_seed = seed + i if seed_is_int else seed
            clf1 = linear_model.PassiveAggressiveClassifier(loss='squared_hinge', C=1.0,
                                                            random_state=member_seed)
            clf1.fit(x_train, y_train)
            preds = preds + clf1.predict(x_test)
        preds = preds / float(niter)
        AUC = metrics.roc_auc_score(y_test, preds)
        return AUC, preds
 
    def get_params(self, deep=True):
        """Return the keyword-parameter dictionary (``deep`` is ignored)."""
        return self.params
 
    def set_params(self, **params):
        """Update stored parameters, keep ``niter`` in sync, and return ``self``."""
        self.params.update(params)
        if 'niter' in params:
            self.niter = int(params['niter'])
        return self

In [None]:
# Greedy feature selection scored with the passive-aggressive ensemble.
clf = AggressiveClassfier(niter=100)
params = 0
# A copy of the seed collection is passed; the name is then rebound to the result.
good_features = feature_selection(clf, params, X, y, diagonistics=False, nth=2,
                                  ModelName="Aggressive",
                                  good_features=set(good_features))
with open('good_features_SDG_Aggressive.dat', 'wb') as selection_file:
    pickle.dump(good_features, selection_file, protocol=2)

# Hold-out evaluation of the passive-aggressive ensemble on the fixed split.
# (This cell previously trained an SGD logistic regression here, so the file
# saved under the "Aggressive_" name did not come from this model.)
print ('start Aggressive using Scikit learn')

# NOTE(review): `mytimer` is not defined in the visible part of this notebook --
# confirm it is created before this cell runs.
mytimer.startTimer()
AUC_Aggressive, preds = clf.fit_N_Predict(x_train, y_train, x_test, y_test)

print(AUC_Aggressive)
print ('Aggressive using Scikit learn completed')
ElapsedTime,sec=mytimer.endTimer(display=True)

# Persist the hold-out predictions; timestamp, AUC and run time go into the name.
str1=time.strftime("%b%d%Y_%H%M%S", time.localtime())
preds.dump('./MetaResults/Aggressive_'+str1 + '_AUC_' + '0p'+ str(int(AUC_Aggressive*1e5)) 
           +'Time' +str(ElapsedTime) + '_WithCleanedData.p')

## SGD SVM

In [None]:
class SVMClassifier():
    """Average the predictions of several independently fitted default ``SGDClassifier`` models.

    The wrapper exposes ``fit_N_Predict`` plus minimal ``get_params`` /
    ``set_params`` methods; it does not implement ``fit`` or ``predict_proba``.

    Recognised keyword parameters (all stored in ``self.params``):

    * ``niter`` -- number of models to fit and average (default 100).
    * ``random_state`` -- optional integer seed; member ``i`` is seeded with
      ``random_state + i`` so a run is repeatable while members still differ.
      Any other value is passed to every member unchanged.

    Other keywords (for example ``n_iter``) are stored but not forwarded to the
    underlying classifier.
    """

    def __init__(self, num_boost_round=10, **params):
        # `num_boost_round` is accepted for interface compatibility but not used.
        self.clf = None
        self.params = params
        self.niter = int(params.get('niter', 100))
 
    def fit_N_Predict(self, x_train,y_train,x_test,y_test):
        """Fit ``niter`` models on the training data and score their averaged prediction.

        Returns ``(AUC, preds)`` where ``preds`` is the mean over all members of
        ``predict(x_test)`` and ``AUC`` is ``roc_auc_score(y_test, preds)``.
        ``predict`` yields class labels, so ``preds`` is an average of labels
        (presumably a vote fraction for 0/1 labels -- confirm label encoding).

        Raises ``ValueError`` if ``niter`` is smaller than 1.
        """
        niter = self.niter
        if niter < 1:
            raise ValueError("niter must be a positive integer, got %r" % (niter,))
        seed = self.params.get('random_state')
        seed_is_int = isinstance(seed, (int, np.integer))
        preds = 0
        for i in range(niter):
            member_seed = seed + i if seed_is_int else seed
            clf1 = linear_model.SGDClassifier(random_state=member_seed)
            clf1.fit(x_train, y_train)
            preds = preds + clf1.predict(x_test)
        preds = preds / float(niter)
        AUC = metrics.roc_auc_score(y_test, preds)
        return AUC, preds
 
    def get_params(self, deep=True):
        """Return the keyword-parameter dictionary (``deep`` is ignored)."""
        return self.params
 
    def set_params(self, **params):
        """Update stored parameters, keep ``niter`` in sync, and return ``self``."""
        self.params.update(params)
        if 'niter' in params:
            self.niter = int(params['niter'])
        return self

In [None]:
# Greedy feature selection scored with the SGD (linear SVM) ensemble.
clf = SVMClassifier(niter=100)
params = 0
# `feature_selectionSVM` does not exist; the shared feature_selection() routine
# is used with ModelName="SVM". `good_features` may be a list by now (it is
# rebound to a function result earlier), so a set copy is passed.
good_features = feature_selection(clf, params, X, y, diagonistics=False, nth=2,
                                  ModelName="SVM",
                                  good_features=set(good_features))

# Hold-out evaluation of the SVM ensemble on the fixed split.
# (This cell previously trained an SGD logistic regression here, so the file
# saved under the "SVM_" name did not come from this model.)
print ('start SVM using Scikit learn')
# NOTE(review): `mytimer` is not defined in the visible part of this notebook --
# confirm it is created before this cell runs.
mytimer.startTimer()
AUC_SVM, preds = clf.fit_N_Predict(x_train, y_train, x_test, y_test)

print(AUC_SVM)
print ('SVM using Scikit learn completed')
ElapsedTime,sec=mytimer.endTimer(display=True)

# Persist the hold-out predictions; timestamp, AUC and run time go into the name.
str1=time.strftime("%b%d%Y_%H%M%S", time.localtime())
preds.dump('./MetaResults/SVM_'+str1 + '_AUC_' + '0p'+ str(int(AUC_SVM*1e5)) 
           +'Time' +str(ElapsedTime) + '_WithCleanedData.p')
