# Visualize Trading Signal for different classifiers

In [2]:
%matplotlib inline
%run talibref.py
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
import time, datetime

# Load the daily data for one ticker. generate_ticker_data is expected to be
# provided by talibref.py (pulled in with %run above).
ticker = 'IYZ'
startdate = datetime.date(2010, 1, 1)
enddate = datetime.date(2015, 12, 1)
df = generate_ticker_data(ticker, startdate, enddate)
# df is left untouched from here on; dftouse is the copy that gets standardized.
dftouse = df.copy()

# Columns that must never be used as model features
# (presumably the date plus the outcome/label columns -- confirm in talibref.py).
IGNORE = ['date', 'result_1','close_1','perf_1','result_14','close_14','perf_14','results']

# Columns with at most 10 distinct values are treated as indicator flags.
INDICATORS = [
    col for col in df.columns
    if len(df[col].unique()) <= 10 and col not in IGNORE
]

# Every remaining non-ignored column gets standardized.
STANDARDIZABLE = [
    col for col in df.columns
    if not (col in INDICATORS or col in IGNORE)
]

dftouse['date'] = pd.to_datetime(dftouse['date'])
# Boolean training mask: rows dated before 2015 train, the rest are held out.
mask = (dftouse.date < '2015-01-01').values

from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training rows only, then apply it to every row.
scaler = StandardScaler()
scaler.fit(dftouse.loc[mask, STANDARDIZABLE])
dftouse[STANDARDIZABLE] = scaler.transform(dftouse[STANDARDIZABLE])

# Feature columns handed to the classifiers: everything that is not ignored.
lcols = [col for col in list(dftouse.columns) if col not in IGNORE]
def cv_optimize(clf, parameters, X, y, n_folds, score_func):
    """Grid-search ``parameters`` for ``clf`` and return the best estimator.

    ``n_folds`` is forwarded as the search's ``cv`` argument and
    ``score_func`` as its ``scoring`` argument.  GridSearchCV must already
    be imported when this function is called.
    """
    search = GridSearchCV(
        clf,
        param_grid=parameters,
        cv=n_folds,
        scoring=score_func,
    )
    search.fit(X, y)
    best = search.best_estimator_
    return best

def do_classify(clf, parameters, indf, featurenames, targetname, target1val, mask=None, reuse_split=None, score_func=None, n_folds=5):
    """Fit ``clf`` on the training data and return the fitted estimator.

    The training set comes from one of two sources:

    * ``reuse_split`` -- a dict whose ``'Xtrain'`` and ``'ytrain'`` entries
      are used as-is (the targets are NOT re-encoded).  It takes precedence
      when both arguments are supplied.
    * ``mask`` -- a boolean array; rows of ``indf`` where it is true are the
      training rows.  Features are ``indf[featurenames]`` and the target is
      1 where ``indf[targetname] == target1val`` and 0 elsewhere.

    When ``parameters`` is non-empty the estimator is first tuned with
    ``cv_optimize`` (using ``n_folds`` and ``score_func``); the resulting
    estimator is then fitted on the full training set.

    Raises:
        ValueError: if neither ``mask`` nor ``reuse_split`` is given, since
            there would be no training data to fit on.
    """
    if mask is None and reuse_split is None:
        raise ValueError(
            "do_classify needs either `mask` or `reuse_split` to define the training set")

    if reuse_split is not None:
        Xtrain, ytrain = reuse_split['Xtrain'], reuse_split['ytrain']
    else:
        features = indf[featurenames].values
        # Binary 0/1 target: 1 where the column equals target1val.
        target = (indf[targetname].values == target1val) * 1
        Xtrain, ytrain = features[mask], target[mask]

    if parameters:
        clf = cv_optimize(clf, parameters, Xtrain, ytrain, n_folds=n_folds, score_func=score_func)
    clf = clf.fit(Xtrain, ytrain)
    return clf

# Module-level train/test arrays shared by the evaluation helpers below.
# Note that y holds the raw 'results' values here (no 0/1 re-encoding).
X = dftouse[lcols].values
y = dftouse['results'].values
Xtrain, ytrain = X[mask], y[mask]
Xtest, ytest = X[~mask], y[~mask]
reuse_split = {
    'Xtrain': Xtrain,
    'Xtest': Xtest,
    'ytrain': ytrain,
    'ytest': ytest,
}

Available data: Ticker(IYZ) from 2000-05-26 to 2015-12-08
Usable data: Ticker(IYZ) from 2001-03-14 to 2015-12-08 
Returned data: Ticker(IYZ) from 2010-01-04 to 2015-12-01 
Save path: data/IYZ_from_2010-01-04_2015-12-01.csv


## Test all classifiers using SelectKBest

In [7]:
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LogisticRegression

def evaluate_performance(clf, signal_col='pred', strategy=None, commission=False, price_column='close'):
    """Back-test a signal column on the held-out rows and return the outcome.

    Builds a frame from the module-level ``df`` restricted to ``~mask`` and
    adds two columns: ``'baseline'`` (all ones) and ``'pred'``
    (``clf.predict(Xtest)``).  The frame is then handed to
    ``evaluate_profit`` together with ``signal_col``, ``price_column``,
    ``commission`` and ``strategy``.

    Args:
        clf: fitted estimator; only its ``predict`` method is used.
        signal_col: name of the column in the frame to trade on.
        strategy: list forwarded to ``evaluate_profit``; defaults to ``[1]``.
            Callers in this notebook use ``[1]``, ``[0]`` and ``[1, 0]`` for
            long-only, short-only and long-short respectively.
        commission: forwarded unchanged to ``evaluate_profit``.
        price_column: forwarded unchanged to ``evaluate_profit``.

    Returns:
        ``(ROI, balance_over_time, signals)`` -- the last three items of the
        five-item result of ``evaluate_profit``.
    """
    # A None sentinel avoids sharing one mutable default list across calls.
    if strategy is None:
        strategy = [1]

    df_pred = df[~mask].reset_index(drop=True)
    df_pred['baseline'] = np.ones(df_pred.shape[0])
    df_pred['pred'] = clf.predict(Xtest)

    # 10000 is presumably the starting balance -- confirm against talibref.py.
    _, _, ROI, balance_over_time, signals = evaluate_profit(
        df_pred, firstday(df_pred), lastday(df_pred), 10000, signal_col,
        price_column, commission, strategy)
    return ROI, balance_over_time, signals

def evaluate_features(clfpipe):
    """Return ``(k, feature_names)`` for the ``'selectk'`` step of ``clfpipe``.

    ``k`` is read from the pipeline parameter ``selectk__k``; the names are
    the entries of the module-level ``lcols`` picked out by the step's
    ``get_support()`` result.
    """
    chosen_k = clfpipe.get_params()['selectk__k']
    support = clfpipe.named_steps['selectk'].get_support()
    kept_names = np.array(lcols)[support].tolist()
    return chosen_k, kept_names

def _signal_points(points):
    """Split a sequence of ``(day, price, ...)`` entries into two lists."""
    days = [point[0] for point in points]
    prices = [point[1] for point in points]
    return days, prices


def plot_signal(title, signals):
    """Plot the held-out closing prices with long/short signals on top.

    Args:
        title: text used as the axes title.
        signals: indexable pair where ``signals[0]`` holds the short entries
            and ``signals[1]`` the long entries; each entry starts with
            ``(day, price)``.  Either list may be empty, in which case no
            markers of that kind are drawn.

    Reads the module-level ``df`` and ``mask`` for the price line.
    """
    fig, ax0 = plt.subplots(nrows=1, ncols=1, figsize=(20, 8))
    ax0.plot(df.date[~mask], df.close[~mask], lw=1)

    # Explicit extraction instead of zip(*x)[i]: that form raises IndexError
    # on an empty signal list and is not subscriptable on Python 3.
    short_days, short_prices = _signal_points(signals[0])
    long_days, long_prices = _signal_points(signals[1])

    ax0.scatter(long_days, long_prices, color='g', label="Long Signals")
    ax0.scatter(short_days, short_prices, color='r', label="Short Signals")
    ax0.legend(frameon=False, loc='upper right')
    ax0.set_title(title)

def print_result(clf, name):
    """Print accuracy, selected features and back-test ROIs; record the ROIs.

    Scores ``clf`` on the module-level train and test arrays, prints the
    result of ``evaluate_features`` and then runs ``evaluate_performance``
    four times (baseline column, long-only, short-only, long-short).  The
    three strategy ROIs are stored under ``name`` in the module-level dicts
    ``Long_ROI_result``, ``Short_ROI_result`` and ``LS_ROI_result``.
    """
    # Each print takes exactly one argument, so the parenthesised form
    # produces the same output as the print statement.
    train_acc = clf.score(Xtrain, ytrain)
    print("train accuracy: {0}".format(train_acc))
    test_acc = clf.score(Xtest, ytest)
    print("test accuracy: {0}".format(test_acc))
    print(evaluate_features(clf))

    baseline_roi = evaluate_performance(clf, strategy=[1], signal_col="baseline")[0]
    print("ROI baseline:{0}".format(baseline_roi))
    long_roi = evaluate_performance(clf, strategy=[1])[0]
    print("ROI long-only: {0}".format(long_roi))
    short_roi = evaluate_performance(clf, strategy=[0])[0]
    print("ROI short-only: {0}".format(short_roi))
    long_short_roi = evaluate_performance(clf, strategy=[1, 0])[0]
    print("ROI long-short: {0}".format(long_short_roi))

    Long_ROI_result[name] = long_roi
    Short_ROI_result[name] = short_roi
    LS_ROI_result[name] = long_short_roi

In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import f1_score

## Run all classifiers using SelectKBest

In [9]:
# Name of the dataframe column the classifiers below are trained to predict.
Signal_Column = u'results'

# ROI per classifier name for each strategy; filled in by print_result.
Long_ROI_result = dict()
Short_ROI_result = dict()
LS_ROI_result = dict()

# Univariate feature selector used as the first step of the pipelines.
selectk = SelectKBest(score_func=f_regression)

In [10]:
print("#############====================== Log Regression =====================#############")
selectk = SelectKBest(score_func=f_regression)
pipeLR = Pipeline([
    ('selectk', selectk),
    ('LR', LogisticRegression(penalty="l1")),
])
# Grid-search the number of kept features and the regularisation strength,
# then fit on the rows selected by mask.
pipeLR = do_classify(
    pipeLR,
    {
        "selectk__k": range(1, 11),
        "LR__C": [1e-3, 1e-2, 1e-1, 1, 1e2],
    },
    dftouse, lcols, Signal_Column, 1,
    mask=mask,
)

# Chosen regularisation strength.
print(pipeLR.get_params()['LR__C'])
print_result(pipeLR, "Log Regression")

0.1
train accuracy: 0.693958664547
test accuracy: 0.714285714286
(4, ['bb_pct', 'cci', 'rsi', 'stoch_slowk'])
ROI baseline:0.018359966
ROI long-only: 0.1681285893
ROI short-only: 0.1419936364
ROI long-short: 0.3340181447


In [21]:
%%time
print("#############====================== RBF SVM ===========================#############")
pipesvm2 = Pipeline([
    ('selectk', selectk),
    ('svm2', SVC()),
])
# Grid-search the number of kept features together with the SVC's C and gamma.
pipesvm2 = do_classify(
    pipesvm2,
    {
        "selectk__k": range(1,11),
        "svm2__C": [1e-100, 1e-10, 1e-1, 1, 1e10],
        "svm2__gamma": [ 1e-9, 1e-10, 1e-11],
    },
    dftouse, lcols, Signal_Column, 1,
    mask=mask,
)

# Chosen C and gamma, space separated on one line.
print(" ".join(str(value) for value in (
    pipesvm2.get_params()['svm2__C'],
    pipesvm2.get_params()['svm2__gamma'],
)))
print_result(pipesvm2, "RBF_SVM")

10000000000.0 1e-11
train accuracy: 0.688941925219
test accuracy: 0.676724137931
(5, ['bb_pct', 'plus_di', 'cci', 'rsi', 'stoch_slowk'])
ROI baseline:0.0208009318
ROI long-only: 0.0961226444
ROI short-only: 0.0702277201
ROI long-short: 0.1730073331
Wall time: 32.1 s


In [22]:
%%time
print("#############====================== Random Forest =====================#############")
pipeRF = Pipeline([
    ('selectk', selectk),
    ('RF', RandomForestClassifier()),
])
# Grid-search the number of kept features and the forest's size/shape settings.
pipeRF = do_classify(
    pipeRF,
    {
        "selectk__k": [5,6,7,8,9,10],
        "RF__max_depth": [3,5,7,10],
        "RF__n_estimators": [5,10,20,40],
        "RF__max_features": [1,2,3,4,5],
    },
    dftouse, lcols, Signal_Column, 1,
    mask=mask,
)

# Chosen max_depth, n_estimators and max_features, space separated on one line.
print(" ".join(str(value) for value in (
    pipeRF.get_params()['RF__max_depth'],
    pipeRF.get_params()['RF__n_estimators'],
    pipeRF.get_params()['RF__max_features'],
)))
print_result(pipeRF, "Random_forest")

3 20 2
train accuracy: 0.703261734288
test accuracy: 0.685344827586
(9, ['bb_pct', 'plus_di', 'cci', 'rsi', 'ult_osc', 'willr', 'stoch_slowk', 'stoch_slowd', 'stoch_fastd'])
ROI baseline:0.0208009318
ROI long-only: 0.1380506379
ROI short-only: 0.1108427158
ROI long-short: 0.2644123104
Wall time: 1min 34s


In [14]:
%%time
print("#############====================== Gaussian NB ==========================#############")
pipeNB = Pipeline([
    ('selectk', selectk),
    ('NB', GaussianNB()),
])
# Only the number of kept features is tuned for this classifier.
pipeNB = do_classify(
    pipeNB,
    {"selectk__k": range(1,10)},
    dftouse, lcols, Signal_Column, 1,
    mask=mask,
)

print_result(pipeNB, "Gaussian_NB")

train accuracy: 0.690779014308
test accuracy: 0.679653679654
(5, ['bb_pct', 'plus_di', 'cci', 'rsi', 'stoch_slowk'])
ROI baseline:0.018359966
ROI long-only: 0.0934196795
ROI short-only: 0.0702277201
ROI long-short: 0.170224369
Wall time: 599 ms


In [None]:
# Summary: baseline ROI and the mean long-only ROI over all classifiers run so far.
print("Ticker: " + ticker)
print("ROI baseline: {0}".format(
    evaluate_performance(pipeNB, strategy=[1], signal_col="baseline")[0]))
print("AVG ROI: {0}".format(
    np.average(list(Long_ROI_result.values()))))
# Left as the last expression so the notebook displays the dict.
Long_ROI_result

## Visualize trading signals assuming 100% accurate predictions

In [None]:
# Trade directly on the 'result_1' column (not on model predictions),
# using the [1, 0] strategy that print_result labels long-short.
ROI, BOT, signals = evaluate_performance(
    pipeLR,
    signal_col='result_1',
    strategy=[1,0],
)
plot_signal(
    'IYZ(Dow Jones US Telecom), trading signal based on 1 day performance, ROI: {0:.2f}%'.format(ROI*100),
    signals,
)

In [None]:
# Trade directly on the 'result_14' column (not on model predictions),
# with evaluate_performance's default strategy.
ROI, BOT, signals = evaluate_performance(
    pipeLR,
    signal_col='result_14',
)
plot_signal(
    'IYZ(Dow Jones US Telecom), trading signal based on 14 days performance, ROI: {0:.2f}%'.format(ROI*100),
    signals,
)

In [None]:
# Trade directly on the 'results' column (not on model predictions),
# using the [1] strategy that print_result labels long-only.
ROI, BOT, signals = evaluate_performance(
    pipeLR,
    signal_col='results',
    strategy=[1],
)
plot_signal(
    'IYZ(Dow Jones US Telecom), trading signal based on moving average, ROI: {0:.2f}%'.format(ROI*100),
    signals,
)

## Visualize trading signal generated by ML classifiers

In [None]:
## view trading signals of results
# Train a logistic-regression pipeline on the 'results' column, then plot
# the signals produced by its predictions on the held-out rows.
pipeLR = Pipeline([('selectk', selectk), ('LR', LogisticRegression(penalty="l1"))])
pipeLR = do_classify(pipeLR, {"selectk__k":range(1, 11), 
                              "LR__C": [1e-3, 1e-2, 1e-1, 1, 1e2]}, 
                     dftouse, lcols, u'results', 1, mask=mask)
ROI, BOT, signals = evaluate_performance(pipeLR)
# Title previously read "RIO"; the value shown is the ROI.
plot_signal('Logistic Regression, results, Long-only, ROI={0}'.format(ROI), signals)

In [None]:
## view trading signals of result_14
# Train a logistic-regression pipeline on the 'result_14' column, then plot
# the signals produced by its predictions on the held-out rows.
pipeLR = Pipeline([('selectk', selectk), ('LR', LogisticRegression(penalty="l1"))])
pipeLR = do_classify(pipeLR, {"selectk__k":range(1, 11), 
                              "LR__C": [1e-3, 1e-2, 1e-1, 1, 1e2]}, 
                     dftouse, lcols, u'result_14', 1, mask=mask)
ROI, BOT, signals = evaluate_performance(pipeLR)
# Title previously read "RIO"; the value shown is the ROI.
plot_signal('Logistic Regression, result_14, Long-only, ROI={0}'.format(ROI), signals)

In [None]:
## view trading signals of result_1
# Train a logistic-regression pipeline on the 'result_1' column, then plot
# the signals produced by its predictions on the held-out rows.
pipeLR = Pipeline([('selectk', selectk), ('LR', LogisticRegression(penalty="l1"))])
pipeLR = do_classify(pipeLR, {"selectk__k":range(1, 11), 
                              "LR__C": [1e-3, 1e-2, 1e-1, 1, 1e2]}, 
                     dftouse, lcols, u'result_1', 1, mask=mask)
ROI, BOT, signals = evaluate_performance(pipeLR)
# Title previously read "RIO"; the value shown is the ROI.
plot_signal('Logistic Regression, result_1, Long-only, ROI={0}'.format(ROI), signals)