# ML Feature engineering - SelectKBest

In [143]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Run the project helper scripts so that the names they define are available here.
# NOTE(review): later cells use `datetime`, `generate_ticker_data`, `evaluate_profit`
# and `EnsembleClassifier` without importing them in this notebook -- presumably
# they come from these two scripts; confirm.
%run talibref.py
%run ensemble.py
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
# Widen the pandas display so the many feature columns are visible in previews.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
# Plot styling used for every figure in the notebook.
sns.set_style("whitegrid")
sns.set_context("poster")

### Get data

In [179]:
#df=pd.read_csv("data/IYZ.csv")
# Ticker symbol and date window of the data set under study.
ticker = 'ITB'
startdate=datetime.date(2010, 1, 1)
# NOTE(review): the end date is "today", so re-running the notebook on a later day
# requests a different date range than the one shown in the saved outputs.
enddate=datetime.date.today()
# generate_ticker_data is not defined in this notebook (presumably talibref.py -- confirm).
df = generate_ticker_data(ticker, startdate, enddate)

Available data: Ticker(ITB) from 2006.05.05 to 2015.11.27
Usable data: Ticker(ITB) from 2007.02.22 to 2015.11.27 
Returned data: Ticker(ITB) from 2010.01.04 to 2015.11.27 
Save path: data/ITB_from_2010.01.04_2015.11.27.csv


In [180]:
# Work on a copy: `dftouse` is modified below (extra columns, scaling) while the raw
# frame `df` is still needed unscaled later on (evaluate() reads df[~mask]).
dftouse=df.copy()

### Feature Engineering

In [181]:
# Columns that must never be used as features: the date, the target `results`, and the
# result_/close_/perf_ columns (in the data preview close_1 equals the next row's close,
# so these look like look-ahead values -- confirm for the _14 variants).
IGNORE = ['date', 'result_1','close_1','perf_1','result_14','close_14','perf_14','results']

In [182]:
# Indicator columns: every non-ignored column of the raw frame that takes
# at most 10 distinct values (flags and discrete signals).
INDICATORS = [col for col in df.columns
              if len(df[col].unique()) <= 10 and col not in IGNORE]

In [183]:
# Columns to standardize: whatever is neither an indicator nor ignored.
STANDARDIZABLE = [col for col in df.columns
                  if not (col in INDICATORS or col in IGNORE)]

In [184]:
################################################
# Code to add signals from other tickers
# (market indices used as extra features for the ticker under study)
################################################

# Additional tickers to consider
moreTickers = ['^GSPC', #S&P500
               '^dji',  #Dow Jones Industrials
               '^IXIC' #Nasdaq
              ]

# Signals for additional tickers: columns of each extra ticker's frame to copy over.
# Names in STANDARDIZABLE_TO_ADD are registered as standardizable features,
# names in INDICATORS_TO_ADD as indicator features.
STANDARDIZABLE_TO_ADD = ['roc']
INDICATORS_TO_ADD = ['sar_signal']


def addDataFromOtherTicker(tickerPlus):
    dfPlus = generate_ticker_data(tickerPlus, startdate, enddate)
    if df.shape == dfPlus.shape:
        dfPluss = dfPlus.copy()
        dfPluss = dfPluss[STANDARDIZABLE_TO_ADD+INDICATORS_TO_ADD]
        STANDARDIZABLE2 = STANDARDIZABLE_TO_ADD[:]
        INDICATORS2 = INDICATORS_TO_ADD[:]
        for p in STANDARDIZABLE2:
            renamed = p+'_'+tickerPlus
            dftouse[renamed]=dfPluss[p]*1.0
            STANDARDIZABLE.append(renamed)
        for p in INDICATORS2:
            renamed = p+'_'+tickerPlus
            dftouse[renamed]=dfPluss[p]
            INDICATORS.append(renamed)
    else:
        print "Data for additional tickers doesn't match data frame for sector of interest."

# Load every additional ticker and merge its signals into dftouse; the short sleep
# spaces out consecutive loads (presumably to be gentle on the data source -- confirm).
for tick in moreTickers:
    print "####### Loading data for "+tick+"...."
    addDataFromOtherTicker(tick)    
    time.sleep(0.5)
    
dftouse.head()

####### Loading data for ^GSPC....
Available data: Ticker(^GSPC) from 1950.01.03 to 2015.11.27
Usable data: Ticker(^GSPC) from 1962.05.28 to 2015.11.27 
Returned data: Ticker(^GSPC) from 2010.01.04 to 2015.11.27 
Save path: data/^GSPC_from_2010.01.04_2015.11.27.csv
####### Loading data for ^dji....
Available data: Ticker(^dji) from 1985.01.29 to 2015.11.27
Usable data: Ticker(^dji) from 1987.10.15 to 2015.11.27 
Returned data: Ticker(^dji) from 2010.01.04 to 2015.11.27 
Save path: data/^dji_from_2010.01.04_2015.11.27.csv
####### Loading data for ^IXIC....
Available data: Ticker(^IXIC) from 1971.02.05 to 2015.11.27
Usable data: Ticker(^IXIC) from 1973.11.26 to 2015.11.27 
Returned data: Ticker(^IXIC) from 2010.01.04 to 2015.11.27 
Save path: data/^IXIC_from_2010.01.04_2015.11.27.csv


Unnamed: 0,date,open,high,low,close,volume,close_1,result_1,perf_1,close_14,result_14,perf_14,results,bb_upper,bb_middle,bb_lower,bb_pct,bb_bandwidth,bb_squeeze,bb_signalup,bb_signaldn,bb_signal,ema50,ema150,ema200,ema_signal1,ema_signal2,kama50,kama150,kama200,kama_signal1,kama_signal2,sar,sar_signal,adx,plus_di,minus_di,adx_trend,adx_direction,adx_signal,aroon_osc,aroon_signal,cci,cci_signal,macd,macd_sigline,macd_hist,macd_signal,ppo,ppo_signal,mfi,mfi_signal,roc,roc_signal,rsi,rsi_signal,ult_osc,ult_signal,willr,wr_signal,ad_osc,ad_signal,stoch_slowk,stoch_slowd,sslow_signal,stoch_fastk,stoch_fastd,srsi_signal,trix,trix_signal,sr_pivotpts,sr_res1,sr_sup1,sr_res2,sr_sup2,sr_res3,sr_sup3,cv_signal,roc_^GSPC,sar_signal_^GSPC,roc_^dji,sar_signal_^dji,roc_^IXIC,sar_signal_^IXIC
0,2010-01-04,12.03,12.18,11.93,12.16,1025900,12.27,True,0.009046,12.15,False,-0.000822,1,12.439338,11.741,11.042662,0.799998,11.895716,False,False,False,1,11.857245,11.678119,11.722784,1,1,12.171537,11.754141,11.399259,1,1,11.730824,1,15.432722,21.225016,15.569395,False,True,0,48,1,53.921827,0,0.119657,0.06353,0.056127,1,2.431366,1,90.134229,0,4.288165,0,59.016992,0,59.069754,0,-21.495327,0,801373.144197,1,30.620155,37.389168,0,100.0,35.736614,0,-0.071497,0,12.09,13.02,11.79,13.32,10.86,14.25,10.56,0,3.367458,1,2.674556,1,5.888391,1
1,2010-01-05,12.2,12.29,11.92,12.27,365500,12.26,False,-0.000815,12.16,False,-0.008965,1,12.505505,11.778,11.050495,0.838142,12.353631,False,False,False,1,11.873431,11.685959,11.728229,1,1,12.172204,11.759592,11.412275,1,1,11.796742,1,15.929102,22.08019,14.003845,False,True,0,48,1,65.357945,0,0.134463,0.077716,0.056746,1,2.62372,1,90.482696,0,5.775862,0,61.48924,0,63.418986,0,-12.244898,0,895704.419706,1,52.074167,38.09829,0,100.0,66.666667,0,-0.064882,0,12.16,13.16,11.93,13.39,10.93,14.39,10.7,0,3.088524,1,2.353882,1,4.386692,1
2,2010-01-06,12.27,12.33,12.18,12.26,169800,12.97,True,0.057912,12.2,False,-0.004894,1,12.557483,11.8195,11.081517,0.798449,12.487551,False,False,False,1,11.888591,11.693562,11.733521,1,1,12.172613,11.765721,11.42263,1,1,11.856068,1,16.564244,22.273071,13.414924,False,True,0,48,1,90.614887,0,0.143732,0.090919,0.052813,1,2.787234,1,90.456035,0,4.340426,0,61.128227,0,62.01733,0,-13.684211,0,856468.630623,1,75.840475,52.844932,0,94.080229,98.026743,0,-0.057946,0,12.256667,13.353333,12.123333,13.486667,11.026667,14.583333,10.893333,0,2.072615,1,1.531956,1,2.834666,1
3,2010-01-07,12.42,13.06,12.42,12.97,1418100,12.98,True,0.000771,12.34,False,-0.048574,0,12.752054,11.905,11.057946,1.12865,14.230218,False,True,False,1,11.930999,11.710468,11.745824,1,1,12.187862,11.784507,11.445005,1,1,11.909461,1,19.204247,35.692555,10.805046,False,True,0,76,1,251.903114,0,0.205995,0.113935,0.092061,1,3.245174,1,91.950153,0,6.661184,0,73.171839,0,73.594548,0,-5.806452,0,1086866.642533,1,89.875562,72.596735,0,100.0,98.026743,0,-0.049214,0,12.816667,14.473333,12.573333,14.716667,10.916667,16.373333,10.673333,0,2.117129,1,1.356251,1,2.103288,1
4,2010-01-08,12.97,13.02,12.83,12.98,1121700,12.95,False,-0.002311,12.35,False,-0.048536,0,12.905241,11.991,11.076759,1.040886,15.248781,False,True,False,1,11.972136,11.727283,11.758104,1,1,12.205716,11.80383,11.465788,1,1,12.047526,1,21.655678,34.000682,10.292873,False,True,0,76,1,225.761841,0,0.253227,0.141793,0.111434,1,3.660764,1,92.819927,0,6.568144,0,73.297328,0,74.96407,0,-5.16129,0,1288943.263634,1,89.338183,85.018073,0,100.0,98.026743,0,-0.039076,0,12.943333,14.726667,12.826667,14.843333,11.043333,16.626667,10.926667,1,2.176533,1,1.449872,1,2.094166,1


In [185]:
# Parse the dates, then mark every row before 2015 as training data
# (rows from 2015 onwards form the test set).
dftouse['date'] = pd.to_datetime(dftouse['date'])
mask = (dftouse['date'] < '2015-01-01').values
(mask.shape, mask.sum())

((1487,), 1258)

#### 1.2 Standardize the data

Use the mask to compute the training and test parts of the dataframe. Use `StandardScaler` from `sklearn.preprocessing` to "fit" the columns in `STANDARDIZABLE` on the training set. Then use the resultant estimator to transform both the training and the test parts of each of the columns in the dataframe, replacing the old unstandardized values in the `STANDARDIZABLE` columns of `dftouse` by the new standardized ones.

In [186]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics on the training rows only, then apply them to all
# rows so that no information from the test period enters the scaling.
scaler = StandardScaler().fit(dftouse.loc[mask, STANDARDIZABLE])
dftouse[STANDARDIZABLE] = scaler.transform(dftouse[STANDARDIZABLE])
dftouse.head()

Unnamed: 0,date,open,high,low,close,volume,close_1,result_1,perf_1,close_14,result_14,perf_14,results,bb_upper,bb_middle,bb_lower,bb_pct,bb_bandwidth,bb_squeeze,bb_signalup,bb_signaldn,bb_signal,ema50,ema150,ema200,ema_signal1,ema_signal2,kama50,kama150,kama200,kama_signal1,kama_signal2,sar,sar_signal,adx,plus_di,minus_di,adx_trend,adx_direction,adx_signal,aroon_osc,aroon_signal,cci,cci_signal,macd,macd_sigline,macd_hist,macd_signal,ppo,ppo_signal,mfi,mfi_signal,roc,roc_signal,rsi,rsi_signal,ult_osc,ult_signal,willr,wr_signal,ad_osc,ad_signal,stoch_slowk,stoch_slowd,sslow_signal,stoch_fastk,stoch_fastd,srsi_signal,trix,trix_signal,sr_pivotpts,sr_res1,sr_sup1,sr_res2,sr_sup2,sr_res3,sr_sup3,cv_signal,roc_^GSPC,sar_signal_^GSPC,roc_^dji,sar_signal_^dji,roc_^IXIC,sar_signal_^IXIC
0,2010-01-04,-1.055232,-1.056582,-1.042337,-1.03079,-0.61643,12.27,True,0.009046,12.15,False,-0.000822,1,-1.112213,-1.099606,-1.080306,0.800161,0.088232,False,False,False,1,-1.064873,-1.053642,-1.02823,1,1,-0.996363,-1.036326,-1.118015,1,1,-1.087456,1,-0.763132,-0.447355,-0.952848,False,True,0,0.571202,1,0.41771,0,0.158613,-0.039646,0.587714,1,0.770558,1,2.155632,0,0.666324,0,0.576373,0,0.772608,0,0.76445,0,0.361341,1,-0.865794,-0.655599,0,1.18005,-0.41578,0,-0.699566,0,-1.043468,-1.054074,-0.909982,-1.166004,-0.876922,-1.160626,-0.723936,0,1.022709,1,0.854181,1,1.648852,1
1,2010-01-05,-1.023336,-1.036093,-1.044227,-1.010147,-0.878797,12.26,False,-0.000815,12.16,False,-0.008965,1,-1.10009,-1.092604,-1.078782,0.917897,0.17355,False,False,False,1,-1.061762,-1.052063,-1.0271,1,1,-0.996236,-1.035286,-1.1155,1,1,-1.07517,1,-0.704336,-0.317347,-1.181922,False,True,0,0.571202,1,0.525116,0,0.210364,0.013771,0.594222,1,0.844688,1,2.176495,0,0.945762,0,0.798681,0,1.218399,0,1.068674,0,0.409022,1,-0.031562,-0.625546,0,1.18005,0.501045,0,-0.663429,0,-1.030332,-1.029722,-0.883741,-1.15394,-0.862829,-1.13826,-0.696166,0,0.922154,1,0.730416,1,1.176547,1
2,2010-01-06,-1.010202,-1.028642,-0.995105,-1.012024,-0.956545,12.97,True,0.057912,12.2,False,-0.004894,1,-1.090567,-1.084751,-1.072744,0.795378,0.198502,False,False,False,1,-1.058849,-1.050532,-1.026002,1,1,-0.996158,-1.034116,-1.113499,1,1,-1.064112,1,-0.629103,-0.288024,-1.268094,False,True,0,0.571202,1,0.762325,0,0.242766,0.063485,0.552872,1,0.907703,1,2.174899,0,0.67614,0,0.766219,0,1.074731,0,1.021338,0,0.38919,1,0.892584,-0.000569,0,1.040781,1.430617,0,-0.625537,0,-1.012193,-0.996094,-0.847503,-1.137281,-0.843366,-1.107375,-0.657817,0,0.555921,1,0.413191,1,0.688413,1
3,2010-01-07,-0.982058,-0.892667,-0.949762,-0.87878,-0.460616,12.98,True,0.000771,12.34,False,-0.048574,0,-1.054919,-1.06857,-1.077332,1.814589,0.523193,False,True,False,1,-1.0507,-1.047127,-1.023448,1,1,-0.993258,-1.030532,-1.109175,1,1,-1.05416,1,-0.316392,1.752071,-1.649977,False,True,0,1.023908,1,2.27712,0,0.460397,0.150145,0.965456,1,1.084186,1,2.264354,0,1.112054,0,1.8492,0,2.261381,0,1.280418,0,0.505646,1,1.438334,0.836531,0,1.18005,1.430617,0,-0.577834,0,-0.907109,-0.801281,-0.763156,-0.925308,-0.865513,-0.821418,-0.701455,0,0.571968,1,0.345377,1,0.458384,1
4,2010-01-08,-0.878864,-0.900118,-0.872301,-0.876903,-0.578371,12.95,False,-0.002311,12.35,False,-0.048536,0,-1.026854,-1.052295,-1.07367,1.543695,0.712971,False,True,False,1,-1.042794,-1.04374,-1.0209,1,1,-0.989863,-1.026844,-1.105159,1,1,-1.028427,1,-0.026017,1.494864,-1.724919,False,True,0,1.023908,1,2.031605,0,0.625489,0.255041,1.169113,1,1.244347,1,2.316429,0,1.094578,0,1.860484,0,2.401756,0,1.301636,0,0.607788,1,1.417438,1.362959,0,1.18005,1.430617,0,-0.522454,0,-0.88334,-0.757217,-0.715671,-0.903479,-0.840011,-0.780947,-0.651205,1,0.593383,1,0.381511,1,0.455515,1


We create a list `lcols` of the columns we will use in our classifier. This list must not contain the response column `results` (nor any other column listed in `IGNORE`). How many features do we have?

In [187]:
#lcols=list(dftouse.columns)
#lcols.remove(u'results')
lcols=[]
for c in list(dftouse.columns):
    if c not in IGNORE: 
        lcols.append(c)
print len(lcols)

76


### EDA for the data

We create a variable `ccols` which contains all variables not in our indicators list

In [188]:
ccols=[]
for c in lcols:
    if c not in INDICATORS and c not in IGNORE:
        ccols.append(c)
print len(ccols), len(INDICATORS)

47 29


In [189]:
def cv_optimize(clf, parameters, X, y, n_folds, score_func):
    """Grid-search ``parameters`` for ``clf`` and return the best estimator.

    Parameters
    ----------
    clf : estimator to tune.
    parameters : dict used as the ``param_grid`` of the search.
    X, y : training features and labels the search is fitted on.
    n_folds : passed as ``cv`` to the search.
    score_func : passed as ``scoring`` to the search.

    Returns
    -------
    The ``best_estimator_`` of the fitted ``GridSearchCV``.
    """
    search = GridSearchCV(
        clf,
        param_grid=parameters,
        cv=n_folds,
        scoring=score_func,
    )
    return search.fit(X, y).best_estimator_

In [190]:
from sklearn.metrics import confusion_matrix
def do_classify(clf, parameters, indf, featurenames, targetname, target1val, mask=None, reuse_split=None, score_func=None, n_folds=5):
    """Optionally tune ``clf`` by grid search, fit it on the training split and return it.

    Parameters
    ----------
    clf : estimator exposing ``fit``.
    parameters : dict or None
        Parameter grid handed to ``cv_optimize``; when falsy no search is run.
    indf : DataFrame containing the feature and target columns.
    featurenames : list of column names used as features.
    targetname : name of the target column.
    target1val : value of the target column that is encoded as class 1
        (every other value becomes 0).
    mask : boolean array-like, optional
        True for training rows and False for test rows of ``indf``.
    reuse_split : dict, optional
        Precomputed split with keys 'Xtrain', 'Xtest', 'ytrain', 'ytest'.
        Takes precedence over ``mask`` when both are supplied.
    score_func, n_folds : forwarded to ``cv_optimize``.

    Returns
    -------
    tuple ``(clf, Xtrain, ytrain, Xtest, ytest)`` with the fitted estimator and
    the split that was used.

    Raises
    ------
    ValueError
        If neither ``mask`` nor ``reuse_split`` is given.
    """
    # Identity checks against None: `mask != None` on a NumPy array is an
    # elementwise comparison and cannot be used as an `if` condition.
    if reuse_split is not None:
        Xtrain, Xtest = reuse_split['Xtrain'], reuse_split['Xtest']
        ytrain, ytest = reuse_split['ytrain'], reuse_split['ytest']
    elif mask is not None:
        X = indf[featurenames].values
        y = (indf[targetname].values == target1val) * 1
        # accept lists as well as arrays; `~` needs a boolean array
        mask = np.asarray(mask, dtype=bool)
        Xtrain, Xtest, ytrain, ytest = X[mask], X[~mask], y[mask], y[~mask]
    else:
        raise ValueError("do_classify needs either `mask` or `reuse_split`")

    if parameters:
        clf = cv_optimize(clf, parameters, Xtrain, ytrain, n_folds=n_folds, score_func=score_func)
    clf = clf.fit(Xtrain, ytrain)
    return clf, Xtrain, ytrain, Xtest, ytest

In [191]:
# Feature matrix and target vector, split chronologically with the date mask.
X = dftouse[lcols].values
y = dftouse['results'].values
Xtrain, ytrain = X[mask], y[mask]
Xtest, ytest = X[~mask], y[~mask]
# Bundle the split so every classifier is trained and tested on the same rows.
reuse_split = {'Xtrain': Xtrain, 'Xtest': Xtest, 'ytrain': ytrain, 'ytest': ytest}

In [192]:
print "whole data set", dftouse['results'].mean()
print "training set", dftouse['results'][mask].mean(), "test set", dftouse['results'][~mask].mean()

whole data set 0.486213853396
training set 0.484101748808 test set 0.497816593886


##### Our data is not very asymmetric, but we might still want to balance the training set

### Balancing the classes of the training set (the test set is left untouched),
### intended to be used with SVM only

In [193]:
jtrain=np.arange(0, ytrain.shape[0])
n_pos=len(jtrain[ytrain==1])
n_neg=len(jtrain[ytrain==0])
print n_pos, n_neg

ineg = np.random.choice(jtrain[ytrain==0], n_pos, replace=False)
alli=np.concatenate((jtrain[ytrain==1], ineg))
Xtrain_new = Xtrain[alli]
ytrain_new = ytrain[alli]
print Xtrain_new.shape, ytrain_new.shape

reuse_split_balanced=dict(Xtrain=Xtrain_new, Xtest=Xtest, ytrain=ytrain_new, ytest=ytest)

609 649
(1218, 76) (1218,)


## Test all classifiers using SelectKBest

In [194]:
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def evaluate(clf):
    """Fit ``clf`` on the global training split and score accuracy and simulated ROI.

    The classifier is (re)fitted on ``Xtrain``/``ytrain`` every time this is called.
    Test-set predictions are computed once and reused for both the accuracy and
    the profit simulation.

    Parameters
    ----------
    clf : estimator exposing ``fit`` and ``predict``.

    Returns
    -------
    tuple ``(training_accuracy, test_accuracy, ROI_base, ROI_long, ROI_lgst)``
    where the three ROI values are the third element returned by
    ``evaluate_profit`` for, respectively, an all-ones signal, the predictions
    with ``[1]`` and the predictions with ``[1,0]`` as last argument
    (presumably long-only vs. long-short trading -- confirm in evaluate_profit).
    """
    clf.fit(Xtrain,ytrain)
    test_pred = clf.predict(Xtest)
    training_accuracy = accuracy_score(ytrain, clf.predict(Xtrain))
    test_accuracy = accuracy_score(ytest, test_pred)

    # Use the raw (unscaled) frame for the simulation: it needs real 'close' prices.
    df_pred = df[~mask].reset_index(drop=True)
    df_pred['pred_result'] = test_pred
    # Baseline signal: 1 on every row of the test period.
    df_pred['result_baseline'] = np.ones(df_pred.shape[0])
    _,_,ROI_base,_,_ = evaluate_profit(df_pred, startdate, enddate, 10000, 'result_baseline', 'close', False, [1])
    _,_,ROI_long,_,_ = evaluate_profit(df_pred, startdate, enddate, 10000, 'pred_result', 'close', False, [1])
    _,_,ROI_lgst,_,_ = evaluate_profit(df_pred, startdate, enddate, 10000, 'pred_result', 'close', False, [1,0])
    return training_accuracy, test_accuracy, ROI_base, ROI_long, ROI_lgst

def print_result(clfpipe):     
    print "Number of features: {0}".format(clfpipe.get_params()['selectk__k'])
    print "Features: {0}".format(np.array(lcols)[clfpipe.named_steps['selectk'].get_support()].tolist())
    r = evaluate(clfpipe)
    print "train accuracy: {0}".format(r[0])
    print "test accuracy: {0}".format(r[1])
    print "ROI baseline: {0}".format(r[2])
    print "ROI long-only: {0}".format(r[3])
    print "ROI long-short: {0}".format(r[4])

In [195]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import f1_score

In [196]:
%%time
max_number_featrues = 20
Long_ROI_result={}
Test_set_accuracies={}
print "#############====================== Log Regression =====================#############"
selectk = SelectKBest(score_func=f_regression)
pipeLR = Pipeline([('selectk', selectk), ('LR', LogisticRegression(penalty="l1"))])
pipeLR, _,_,_,_  = do_classify(pipeLR, {"selectk__k":range(1,max_number_featrues+1), 
                                        "LR__C": [1e-3, 1e-2, 1e-1, 1, 1e2],
                                        "LR__random_state": [111]}, 
                               dftouse,lcols, u'results',1, reuse_split=reuse_split)

print pipeLR.get_params()['LR__C']
print_result(pipeLR)
Long_ROI_result["log_regression"] = evaluate(pipeLR)[3]
Test_set_accuracies["log_regression"] = evaluate(pipeLR)[1]

1
Number of features: 7
Features: ['bb_pct', 'cci', 'rsi', 'ult_osc', 'willr', 'stoch_slowk', 'stoch_slowd']
train accuracy: 0.671701112878
test accuracy: 0.637554585153
ROI baseline: 0.123313
ROI long-only: 0.2855679895
ROI long-short: 0.4437769646
CPU times: user 11.1 s, sys: 261 ms, total: 11.3 s
Wall time: 11.4 s


In [209]:
%%time
print "#############====================== RBF SVM ===========================#############"
pipesvm2 = Pipeline([('selectk', selectk), ('svm2', SVC())])
pipesvm2,_,_,_,_  = do_classify(pipesvm2, {"selectk__k":[2,3,4,5,6,7,9,12,15,18,25], 
                                            "svm2__C": [1e6,1e7, 1e8,1e9], 
                                            "svm2__gamma": [ 1e-7,1e-8, 1e-9, 1e-10],
                                            "svm2__random_state": [111]}, 
                                 dftouse,lcols, u'results',1, reuse_split=reuse_split_balanced)
print pipesvm2.get_params()['svm2__C'], pipesvm2.get_params()['svm2__gamma']
print_result(pipesvm2)
Long_ROI_result["RBF_SVM"] = evaluate(pipesvm2)[3]
Test_set_accuracies["RBF_SVM"] = evaluate(pipesvm2)[1]

100000000.0 1e-08
Number of features: 4
Features: ['bb_pct', 'cci', 'willr', 'stoch_slowk']
train accuracy: 0.652623211447
test accuracy: 0.655021834061
ROI baseline: 0.123313
ROI long-only: 0.2110288426
ROI long-short: 0.2853026858
CPU times: user 49.4 s, sys: 346 ms, total: 49.7 s
Wall time: 50 s


In [198]:
%%time
print "#############====================== Random Forest =====================#############"
pipeRF = Pipeline([('selectk', selectk), ('RF', RandomForestClassifier())])
pipeRF,_,_,_,_  = do_classify(pipeRF, {"selectk__k": [5,6,7,8,9,10],
                                       "RF__max_depth": [3,5,7], 
                                       "RF__n_estimators": [5,10,20],
                                       "RF__max_features": [1,2,3],
                                       "RF__random_state": [111]}, 
                              dftouse, lcols, u'results', 1, reuse_split=reuse_split)

print pipeRF.get_params()['RF__max_depth'], pipeRF.get_params()['RF__n_estimators'], pipeRF.get_params()['RF__max_features']
print_result(pipeRF)
Long_ROI_result["Random_forest"] = evaluate(pipeRF)[3]
Test_set_accuracies["Random_forest"] = evaluate(pipeRF)[1]

5 5 1
Number of features: 10
Features: ['bb_pct', 'plus_di', 'cci', 'macd_hist', 'rsi', 'ult_osc', 'willr', 'stoch_slowk', 'stoch_slowd', 'stoch_fastd']
train accuracy: 0.717806041335
test accuracy: 0.615720524017
ROI baseline: 0.123313
ROI long-only: 0.246537133
ROI long-short: 0.3571452916
CPU times: user 32.1 s, sys: 466 ms, total: 32.5 s
Wall time: 32.8 s


In [199]:
%%time
print "#############====================== Extra Trees= =====================#############"
pipeET = Pipeline([('selectk', selectk), ('ET', ExtraTreesClassifier())])
pipeET, _,_,_,_  = do_classify(pipeET, {"selectk__k": [5,6,7,8,9,10],
                                        "ET__max_depth": [3,5,7,10,15], 
                                        "ET__n_estimators": [5,10,20],
                                        "ET__max_features": [1,2,3,4,5],
                                        "ET__random_state": [111]}, 
                               dftouse, lcols, u'results', 1, reuse_split=reuse_split)
print "ET__max_depth: {0}".format(pipeET.get_params()['ET__max_depth']) 
print "ET__n_estimators: {0}".format(pipeET.get_params()['ET__n_estimators']) 
print "ET__max_features: {0}".format(pipeET.get_params()['ET__max_features']) 
print_result(pipeET)
Long_ROI_result["Extra_Trees"] = evaluate(pipeET)[3]
Test_set_accuracies["Extra_Trees"] = evaluate(pipeET)[1]

ET__max_depth: 10
ET__n_estimators: 40
ET__max_features: 1
Number of features: 10
Features: ['bb_pct', 'plus_di', 'cci', 'macd_hist', 'rsi', 'ult_osc', 'willr', 'stoch_slowk', 'stoch_slowd', 'stoch_fastd']
train accuracy: 0.804451510334
test accuracy: 0.646288209607
ROI baseline: 0.123313
ROI long-only: 0.2464681766
ROI long-short: 0.353990396
CPU times: user 3min 10s, sys: 2.96 s, total: 3min 13s
Wall time: 3min 16s


In [200]:
%%time
print "#############====================== Gaussian NB ==========================#############"
pipeNB = Pipeline([('selectk', selectk), ('NB', GaussianNB())])
pipeNB,_,_,_,_ = do_classify(pipeNB, {"selectk__k":range(1,max_number_featrues+1)}, 
                             dftouse, lcols, u'results',1, reuse_split=reuse_split)
print_result(pipeNB)
Long_ROI_result["Gaussian_NB"] = evaluate(pipeNB)[3]
Test_set_accuracies["Gaussian_NB"] = evaluate(pipeNB)[1]

Number of features: 2
Features: ['bb_pct', 'stoch_slowk']
train accuracy: 0.662162162162
test accuracy: 0.650655021834
ROI baseline: 0.123313
ROI long-only: 0.2850048209
ROI long-short: 0.4429055967
CPU times: user 2.47 s, sys: 65.1 ms, total: 2.54 s
Wall time: 2.92 s


In [201]:
#%%time
#print "#############====================== Gradient Boosting ====================#############"
#pipeGB = Pipeline([('selectk', selectk), ('GB', GradientBoostingClassifier())])
#pipeGB, _,_,_,_  = do_classify(pipeGB, {"selectk__k":range(1,max_number_featrues+1), 
#                                        "GB__n_estimators": [5,10,20,40],
#                                        "GB__learning_rate": [0.1,0.5,1.0],
#                                        "GB__random_state": [111]}, 
#                               dftouse,lcols, u'results',1, reuse_split=reuse_split)
#print pipeGB.get_params()['GB__n_estimators'], pipeGB.get_params()['GB__learning_rate']
#print_result(pipeGB)
#Long_ROI_result["Gradient_bossting"] = evaluate(pipeGB)[3]
#Test_set_accuracies["Gradient_bossting"] = evaluate(pipeGB)[1]

In [210]:
eclf = EnsembleClassifier(clfs=[pipeLR, pipesvm2, pipeRF, pipeET, pipeNB], voting='hard')
r = evaluate(eclf)
Long_ROI_result["Ensemble"] = r[3]
Test_set_accuracies["Ensemble"] = r[1]
print "train accuracy: {0}".format(r[0])
print "test accuracy: {0}".format(r[1])
print "ROI baseline: {0}".format(r[2])
print "ROI long-only: {0}".format(r[3])
print "ROI long-short: {0}".format(r[4])

train accuracy: 0.685214626391
test accuracy: 0.655021834061
ROI baseline: 0.123313
ROI long-only: 0.2848998651
ROI long-short: 0.4493206961


In [211]:
# Summary: test-set accuracy of every classifier, keyed by the name each cell stored it under.
print "---Test accuracy results---"
Test_set_accuracies

---Test accuracy results---


{'Ensemble': 0.65502183406113534,
 'Extra_Trees': 0.64628820960698685,
 'Gaussian_NB': 0.6506550218340611,
 'RBF_SVM': 0.65502183406113534,
 'Random_forest': 0.61572052401746724,
 'log_regression': 0.63755458515283847}

In [212]:
print "ROI baseline: {0}".format(evaluate(pipeGB)[2])
print "---Long ROI results---"
Long_ROI_result

ROI baseline: 0.123313
---Long ROI results---


{'Ensemble': 0.28489986509999943,
 'Extra_Trees': 0.24646817659999962,
 'Gaussian_NB': 0.28500482089999968,
 'RBF_SVM': 0.21102884259999827,
 'Random_forest': 0.24653713299999927,
 'log_regression': 0.28556798950000029}