# Exploratory analysis (con'td)


## Feature analysis and initial ML implementation

In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

### Feature engineering

Let's first read in data with a few features we want to use.  For the sake of this example `result_14` is the actual result (False means stock is down in 14 days, True means stock is up).  The features we will use are:

* `ema_signal1` (signal from exponential moving average)
* `kama_signal1` (signal from adaptive moving average)
* `sar_signal` (signal from parabolic SAR)
* `cv_signal`
* `trix_signal`
* `ad_signal`


In [2]:
df=pd.read_csv("./data/UXI.csv")


# Variables with binary signals 
SIGNAL_VARS = ['ema_signal1', 'kama_signal1','sar_signal','cv_signal','trix_signal',
               'bb_signal','ad_signal','aroon_signal','adx_signal','cci_signal']

dftouse = df.copy()
dftouse = dftouse[SIGNAL_VARS]
dftouse['result_14'] = df['result_14'] 
print dftouse.dtypes
print dftouse.shape

ema_signal1      bool
kama_signal1     bool
sar_signal       bool
cv_signal        bool
trix_signal      bool
bb_signal       int64
ad_signal        bool
aroon_signal     bool
adx_signal      int64
cci_signal      int64
result_14        bool
dtype: object
(1258, 11)


### Correlation analysis

In [3]:
# Code adapted from HW3 code
from scipy.stats.stats import pearsonr
correlations=[]
for col in SIGNAL_VARS:
    r=pearsonr(dftouse[col], dftouse['result_14'])
    r_val=r[0]
    p_val=r[1]
    correlations.append(dict(feature=col,corr=r_val, abscorr=np.abs(r_val),p_val=p_val))

bpdf=pd.DataFrame(correlations).sort('abscorr', ascending=False)
bpdf.set_index(['feature'], inplace=True)
bpdf.head(25)

Unnamed: 0_level_0,abscorr,corr,p_val
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ema_signal1,0.074386,-0.074386,0.008305
aroon_signal,0.06282,-0.06282,0.025873
ad_signal,0.061355,-0.061355,0.029551
kama_signal1,0.054071,-0.054071,0.055201
sar_signal,0.047943,0.047943,0.089181
bb_signal,0.03606,-0.03606,0.201205
adx_signal,0.023721,-0.023721,0.400553
trix_signal,0.021312,0.021312,0.450117
cv_signal,0.020341,0.020341,0.471014
cci_signal,0.015993,0.015993,0.570916


As we can see, most of the indicators have very high p-values and may turn out to be useless.  I will try the same analysis, but using quantitative indicators that I will normalize.

In [4]:
# Variables with binary signals 
QUANT_VARS = ['ema50', 'kama50','sar','trix','ad_osc','aroon_osc','adx','cci']

dftouse = df.copy()
dftouse = dftouse[QUANT_VARS]
dftouse['perf_14'] = df['perf_14'] 
print dftouse.dtypes
print dftouse.shape

ema50        float64
kama50       float64
sar          float64
trix         float64
ad_osc       float64
aroon_osc    float64
adx          float64
cci          float64
perf_14      float64
dtype: object
(1258, 9)


In [5]:
from sklearn.preprocessing import StandardScaler

dfTemp = dftouse[QUANT_VARS].copy() # Temporary data frame with only standardizable data

# Perform scaling 
dfTemp = StandardScaler().fit_transform(dfTemp)

dftouse[QUANT_VARS] = dfTemp 

In [6]:
# Code adapted from HW3 code
from scipy.stats.stats import pearsonr
correlations=[]
for col in QUANT_VARS:
    r=pearsonr(dftouse[col], dftouse['perf_14'])
    r_val=r[0]
    p_val=r[1]
    correlations.append(dict(feature=col,corr=r_val, abscorr=np.abs(r_val),p_val=p_val))

bpdf=pd.DataFrame(correlations).sort('abscorr', ascending=False)
bpdf.set_index(['feature'], inplace=True)
bpdf.head(25)

Unnamed: 0_level_0,abscorr,corr,p_val
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
sar,0.157397,-0.157397,1.999204e-08
kama50,0.145411,-0.145411,2.21994e-07
ema50,0.144496,-0.144496,2.647119e-07
adx,0.10251,0.10251,0.0002706859
trix,0.07992,-0.07992,0.004563108
aroon_osc,0.062322,-0.062322,0.02707711
ad_osc,0.061914,-0.061914,0.02809861
cci,0.059511,-0.059511,0.03481235


Conclusion: better used quantitative indicators whenever we can!

In [8]:
from sklearn.cross_validation import KFold
from sklearn.metrics import accuracy_score
from sklearn.grid_search import GridSearchCV## Trying a classifier

In [9]:

"""
Function
--------
cv_optimize

Inputs
------
clf : an instance of a scikit-learn classifier
parameters: a parameter grid dictionary thats passed to GridSearchCV (see above)
X: a samples-features matrix in the scikit-learn style
y: the response vectors of 1s and 0s (+ives and -ives)
n_folds: the number of cross-validation folds (default 5)
score_func: a score function we might want to pass (default python None)
   
Returns
-------
The best estimator from the GridSearchCV, after the GridSearchCV has been used to
fit the model.
     
Notes
-----
see do_classify and the code below for an example of how this is used
"""

# Adapted from lab code
def cv_optimize(clf, parameters, X, y, n_folds=5,score_func=None):
    if score_func is None:
        score_func = "accuracy"
    gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, scoring=score_func, verbose=1)
    gs.fit(X,y)
    print "BEST PARAMS", gs.best_params_
    best = gs.best_estimator_
    return best

In [10]:
# Adapted from HW3 code
from sklearn.metrics import confusion_matrix
def do_classify(clf, parameters, indf, featurenames, targetname, target1val, mask=None, reuse_split=None, score_func=None, n_folds=5):
    subdf=indf[featurenames]
    X=subdf.values
    y=(indf[targetname].values==target1val)*1
    if mask !=None:
        print "using mask"
        Xtrain, Xtest, ytrain, ytest = X[mask], X[~mask], y[mask], y[~mask]
    if reuse_split !=None:
        print "using reuse split"
        Xtrain, Xtest, ytrain, ytest = reuse_split['Xtrain'], reuse_split['Xtest'], reuse_split['ytrain'], reuse_split['ytest']
    if parameters:
        clf = cv_optimize(clf, parameters, Xtrain, ytrain, n_folds=n_folds, score_func=score_func)
    clf=clf.fit(Xtrain, ytrain)
    training_accuracy = clf.score(Xtrain, ytrain)
    test_accuracy = clf.score(Xtest, ytest)
    print "############# based on standard predict ################"
    print "Accuracy on training data: %0.2f" % (training_accuracy)
    print "Accuracy on test data:     %0.2f" % (test_accuracy)
    print confusion_matrix(ytest, clf.predict(Xtest))
    print "########################################################"
    return clf, Xtrain, ytrain, Xtest, ytest

In [11]:
dftouse['result_14'] = df['result_14'] 
del dftouse['perf_14']
dftouse.head()

Unnamed: 0,ema50,kama50,sar,trix,ad_osc,aroon_osc,adx,cci,result_14
0,-1.199394,-1.266628,-1.101811,0.704543,0.026528,0.939467,-1.36868,0.46337,False
1,-1.1962,-1.26457,-1.153055,0.700694,0.391518,0.939467,-1.391379,0.914802,False
2,-1.192901,-1.261528,-1.15204,0.698001,0.314637,1.266125,-1.381587,1.115226,False
3,-1.188704,-1.256352,-1.149746,0.698782,1.275523,1.266125,-1.291869,1.297294,False
4,-1.183413,-1.247987,-1.145148,0.705502,2.355499,0.939467,-1.110247,2.008873,False


In [13]:
%%time
from sklearn.svm import LinearSVC
clfsvm, Xtrain, ytrain, Xtest, ytest = do_classify(LinearSVC(loss="hinge"), {"C": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]}, dftouse,QUANT_VARS, 'result_14',True)

UnboundLocalError: local variable 'Xtrain' referenced before assignment