In [2]:
import pandas as pd
import numpy as np
import talib as ta

import sys, os
sys.path.append('..')
from Data.TimeSeries import *
from ETF.AAA import *

from Data import factors
import Quandl
import pandas as pd
import matplotlib

import cvxopt as opt
from cvxopt import blas, solvers
import math

%matplotlib inline

from Data.TimeSeries import *
from Data.StockDataManager import  *


tickers = ['GOOG/NYSE_SPY']

settings = Settings()
dp = TimeSeries(settings).get_ETF_data(tickers)

df = dp[tickers[0]][['Open', 'High', 'Low', 'Close', 'volume']]
df = df['2001-01-01'::].dropna()

The pandas.io.data module is moved to a separate package (pandas-datareader) and will be removed from pandas in a future version.
After installing the pandas-datareader package (https://github.com/pydata/pandas-datareader), you can change the import ``from pandas.io import data, wb`` to ``from pandas_datareader import data, wb``.


# Target Variables

In [3]:
def next_day_atr_return_distance(df, win=250):
    delta_o = np.array(df['Open'].shift(-2) - df['Open'].shift(-1))
    atr = ta.ATR(df['High'].values, df['Low'].values, df['Close'].values, win)
    if win == 1:
        v = delta_o
    else :
        v = delta_o / atr

    ndard = pd.DataFrame(data=v, index=df.index, columns=['ndard'])
    return ndard

def subsequent_day_atr_return_distance(df, seq, win=250) :
    n = next_day_atr_return_distance(df, win)
    n_s = n.shift(-1*seq)
    col_new = 'ndard_{s}'.format(s=seq)
    n_s.columns = [col_new]
    return n_s

def next_month_atr_return_distance(df, win = 250):
    df_o = df.copy()

    ## get the first trading day of the month
    b = df_o.index[0:(len(df_o)- 1)]
    b = b.insert(0, df_o.index[0])
    a = df_o.index

    ## get the delta of opening price, between month(1) and month(2)
    df_f_month = df_o.loc[a[a.month <> b.month]]
    df_f_month['delta_o'] = np.array(df_f_month['Open'].shift(-1) - df_f_month['Open'])

    df_o['delta_o'] = df_f_month['delta_o']
    df_o = df_o.fillna(method='bfill')
    df_o['delta_o'] = df_o['delta_o'].shift(-1)

    ## get ATR in the window 
    df_o['atr'] = ta.ATR(df_o['High'].values, df_o['Low'].values, df_o['Close'].values, win)
    if win == 1:
        v = df_o['delta_o']
    else :
        v = df_o['delta_o'] / df_o['atr']

    nmatd = pd.DataFrame(data=v, index=df_o.index, columns=['nmatd'])
    return nmatd

def hit_or_miss_up_down_cutoff_atr(df, up=2, down=5, cutoff = 40, atrdist=250):
    list_up_down = []

    price = df['Open']
    for i in range(len(price)) :
        if i > len(price) - cutoff - 1:
            list_up_down.append(0)
        else :   
            price_co = price[(i+1):(i+1+cutoff)]
            v_max = max(price_co)
            v_min = min(price_co)

            i_max = price_co.argmax()
            i_min = price_co.argmin()

            # check the upper bound
            atr = df_o['atr'][i]
            max_reach = 0
            min_reach = 0

            if (v_max - price_co[0]) > up * atr:
                max_reach = 1
            if (price_co[0] - v_min) > down * atr:
                min_reach = 1

            # normalize the returns with ATR
            if math.isnan(atr) :
                up_down = np.NAN
            else:

                if atr <> 0:
                    up_down = (price_co[-1] - price_co[0]) / atr
                else :
                    up_down = price_co[-1] - price_co[0]


                if max_reach == 1 and min_reach == 0:
                    up_down = up
                elif min_reach == 1 and max_reach == 0:
                    up_down = -1 * down
                elif max_reach == 1 and min_reach == 1:
                    if i_max > i_min:
                        up_down = up
                    if i_max < i_min:
                        up_down = -1*down
    #             else :
    #                 print "max/min no reaches"
    #                 print "max_reach={a}:min_reach={b}".format(a=max_reach, b=min_reach)
    #                 print "price_0={a}:price_1={b}".format(a=price_co[0], b=price_co[-1])
    #                 print "up_ratio = {a}".format(a = (v_max - price_co[0])/atr)
    #                 print "down_ratio = {a}".format(a = (price_co[0] - v_min)/atr)


            list_up_down.append(up_down)
    #         print "max={max}:min={min}:price={price}:atr={atr}:hit={hit}".format(
    #                 max=v_max, min=v_min, price=price[i], atr=atr, hit=up_down
    #             )
    #         print "---------------------------------------------------------------"

    hmatr = pd.DataFrame(data=list_up_down, index=df.index, columns=['hmatr'])
    return hmatr

## Features



In [5]:
# 1-day log return
def log_return(df) :
    df_price = np.log(df[['Close']])
    l_ret = df_price - df_price.shift(1)
    return pd.DataFrame(data=l_ret.values, index=df.index, columns=['logret'])

# n-day close to close (log return)
def close_to_close(df, histLen=5) :
    df_price = df['Close']
    return np.log(df_price / df_price.shift(histLen))

# ATR
def atr(df, atrLen = 10):
    import talib as ta
    l_atr = ta.ATR(df['High'].values, df['Low'].values, df['Close'].values, atrLen)
    return pd.DataFrame(data=l_atr, index=df.index, columns=['atr_{n}'.format(n=atrLen)])

# Moving Average 
def sma(df, maLen=10) :
    if maLen == 1:
        l_sma = df[['Close']].values
    else:
        l_sma = ta.SMA(df['Close'].values, maLen)
    return pd.DataFrame(data=l_sma, index=df.index, columns=['ma_{n}'.format(n=maLen)])

# Close - MovingAverage 
# Normlaized by Log ATR
def close_ma_ATR_log(df, histLen = 5, atrLen = 30) :
    df_price = df[['Close']]
    df_ma = sma(df, histLen)
    df_atr = atr(df,  atrLen)

    v1 = np.log(df_price.values / df_ma.values) #/ np.sqrt(histLen) 
    v2 =  np.log(df_atr.values)
    v = v1/v2
    
    return pd.DataFrame(data=v, index=df.index, columns=['cmatr_{n}_{m}'.format(n=histLen, m=atrLen)])

# MA different, normalized by ATR
# Params: ShortLen, LongLen, Lag
# ShortLen - the lenght for the short MA
# longLen - the length for the long MA
# lag - the long MA will apply with a lag. If the lag >= shortLen, the long and the short MA 
# will by apply for the 2 separated windows
# The delta of the 2 MAs will be normalized with the ATR with len = Lag + LongLength
def ma_diff_ATR_log(df, shortLen = 5, longLen = 51, lag = 5) :
    df_short_ma = sma(df, shortLen)
    df_long_ma = sma(df, longLen).shift(lag)
    df_atr = atr(df, longLen + lag)

    delta = np.log(df_short_ma.values) - np.log(df_long_ma.values)

    o = delta / np.log(df_atr.values)
    return pd.DataFrame(data=o, index=df.index, columns=['mdatr'])

def ma_diff_ATR(df, shortLen = 5, longLen = 51, lag = 5) :
    df_short_ma = sma(df, shortLen)
    df_long_ma = sma(df, longLen).shift(lag)
    df_atr = atr(df, longLen + lag)

    delta = df_short_ma.values - df_long_ma.values
    o = delta / df_atr.values
    return pd.DataFrame(data=o, index=df.index, columns=['mdatr'])

'''
# ABS Price Change Oscillator
# Using the absolute log dialy price changes (abs daily log return ). 
# The short MA using shortLen, and the long MA using shortLen*multiplier.
# the difference is normalized by ATR using 
'''
shortLength = 5
multiplier = 5
df_abs_logret = np.abs(log_return(df).values)

l_ma = ta.SMA(df_abs_logret.ravel(), shortLength)
l_ma_long = ta.SMA(df_abs_logret.ravel(), shortLength * multiplier)
delta = l_ma - l_ma_long
l_atr = np.log(atr(df, shortLength*multiplier))
a = atr(df, shortLength * multiplier)


'''
Line Per ATR
calcuate the least-square deviation line of the data - using the 
mean(sum(high+low+open+close)) with the window. the slope of the line divided 
by the ATR.
parameters
- histLen - window for the slope estimation
- atrLen - ATR window
output:
DataFrame containing two columns
- 'lpatr' line slope adjusted by ATR
- 'slope_predict' the next day predict using the line slope
- 'delta_predict' the 
'''
def line_per_atr(df, histLen = 50, atrLen = 200) :
    l_mean = df[['Open', 'High','Low', 'Close']].mean(axis=1)
    nr = len(l_mean)

    l_slope = []
    l_predict = []
    l_delta = []
    
    # sklearn linear model
    from sklearn import linear_model
    regr = linear_model.LinearRegression()
    
    for i in range(nr) :
        if i < histLen:
            l_slope.append(np.nan)
            l_predict.append(np.nan)
            l_delta.append(np.nan)
        else :
            y = np.reshape(l_mean[(i-histLen):i].values, [histLen, 1])
            x = np.reshape(np.arange(histLen), [histLen, 1])
            regr.fit(x,y)
            slope = regr.coef_[0,0]
            l_slope.append(slope)
            l_predict.append(l_mean[i] + slope)

    df_atr = atr(df, atrLen)

    df_slope = pd.DataFrame(data=l_slope, index=df.index, columns=['slope'])
    df_slope['lpatr'] = l_slope / df_atr.values.ravel()
    #df_slope['close'] = df['Close']
    df_slope['slope_predict'] = l_predict
    df_slope['delta_predict'] = df_slope['slope_predict'].shift(1) - df['close']
    
    df_slope.columns = ['lpatr_{m}_{n}'.format(m=histLen, n=atrLen), 
                       'slope_predict_{m}'.format(m=histLen), 
                       'delta_predict_{m}'.format(m=histLen)]
    return df_slope

# data preparation


In [110]:
# df_target = next_day_atr_return_distance(df, win=10)
df_target = log_return(df)
df_target.columns = ['target']

df_target[df_target > 0.04] = 1
df_target[df_target < 0.04] = 0

list_df_features = []
list_df_features.append(log_return(df))
list_df_features.append(atr(df, atrLen=10))
list_df_features.append(atr(df, atrLen=5))
list_df_features.append(close_ma_ATR_log(df))

df_features = pd.DataFrame()
for l in list_df_features:
    df_features = df_features.join(l, how='outer')
df_features = df_features.join(df_target, how='outer')
df_features.dropna(inplace=True)

In [111]:
df_target[df_target==1].sum()

target    23
dtype: float64

In [101]:
df_features.iloc[:, :].head()

Unnamed: 0_level_0,logret,atr_10,atr_5,cmatr_5_30,target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2001-02-14 00:00:00+00:00,-0.001513,2.123884,1.991586,-0.004019,0
2001-02-15 00:00:00+00:00,0.009646,2.064495,1.899269,0.006701,0
2001-02-16 00:00:00+00:00,-0.022296,2.262046,2.327415,-0.016148,0
2001-02-20 00:00:00+00:00,-0.015534,2.339841,2.469932,-0.024935,0
2001-02-21 00:00:00+00:00,-0.021811,2.439857,2.643946,-0.037432,0


# XGBOOST

In [115]:
import numpy
import xgboost as xgb


nr_training = int(0.5*len(df_features))
df_1 = df_features.iloc[:nr_training, :]
df_2 = df_features.iloc[(nr_training +1):, :]
dtrain = xgb.DMatrix(df_1.iloc[:, :-1], label=df_1['target'].values)
dtest = xgb.DMatrix(df_2.iloc[:, :-1], label=df_2['target'].values)

# specify parameters via map, definition are same as c++ version
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }

# specify validations set to watch performance
watchlist  = [(dtest,'eval'), (dtrain,'train')]
num_round = 2


bst = xgb.cv(param, dtrain, num_boost_round=100)
# this is prediction
#preds = bst.predict(dtest)
labels = dtest.get_label()
print ('error=%f' % ( sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds))))
#xgb.plot_importance(bst)

error=0.719809


In [117]:
type(bst)

pandas.core.frame.DataFrame

In [88]:
clf = xgb.XGBRegressor()
clf.fit(df_1.iloc[:, :-1], df_1['target'])
predictions = clf.predict(df_1.iloc[:, :-1])

In [91]:
from sklearn.metrics import confusion_matrix, mean_absolute_error, mean_squared_error
print(mean_squared_error(np.array(df_1['target']), predictions))

0.333377835754


In [92]:
# Load evals result by calling the evals_result() function
evals_result = clf.evals_result()

print('Access logloss metric directly from validation_0:')
print(evals_result['validation_0']['logloss'])

print('')
print('Access metrics through a loop:')
for e_name, e_mtrs in evals_result.items():
    print('- {}'.format(e_name))
    for e_mtr_name, e_mtr_vals in e_mtrs.items():
        print('   - {}'.format(e_mtr_name))
        print('      - {}'.format(e_mtr_vals))
 
print('')
print('Access complete dict:')
print(evals_result)

AttributeError: 'XGBRegressor' object has no attribute 'evals_result_'