In [1]:
import pandas as pd # 0.21.0
import numpy as np
from functools import reduce
from time import time

import scipy
from scipy import stats
import matplotlib.pyplot as plt

import quandl

In [2]:
np.random.seed(203)

### NOTE: 
1. Created base case predictions for comparison.
2. Using the previous day direction results in around 50%.
3. Using random prediction results in around 50%.

In [3]:
import talib as ta
ta.get_function_groups

<function talib.get_function_groups>

In [4]:
np.random.seed(100)
quandl.ApiConfig.api_key = "eFys5fhhsTNVrsJEcQ5y"

In [5]:
df = quandl.get("WIKI/"+"AAPL", start_date="10-01-09", end_date="17-10-17")
df.head(3)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Ex-Dividend,Split Ratio,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2010-01-11,212.8,213.0,208.45,210.11,16508200.0,0.0,1.0,27.347766,27.373469,26.78873,27.002063,115557400.0
2010-01-12,209.19,209.77,206.42,207.72,21230700.0,0.0,1.0,26.883831,26.958369,26.527847,26.694915,148614900.0
2010-01-13,207.87,210.93,204.1,210.65,21639000.0,0.0,1.0,26.714192,27.107445,26.229695,27.071461,151473000.0


In [82]:
def get_features(ticker, start, end):
    try:
        df = quandl.get("WIKI/"+ticker, start_date=start, end_date=end)
    except Exception as e:
        print("Error in getting stock data: {}".format(e))

    log_ret_df = get_log_returns(df['Adj. Close'])
    # Handle look ahead-bias
    shifted_df = df.shift(1)
    momentum_df = get_momentum_indicators(shifted_df)
    hist_vol_df = get_hist_vol_indicators(shifted_df)
    pattern_df = get_pattern_recognition_indicators(shifted_df)
    cycle_df = get_cycle_indicators(shifted_df)
    overlap_df = get_overlap_indicators(shifted_df)
    dfs = [shifted_df, log_ret_df,momentum_df, hist_vol_df, pattern_df, cycle_df, overlap_df]
    df_final = reduce(lambda left,right: pd.merge(left,right,left_index=True, right_index=True), dfs).dropna()
    return df_final

def get_log_returns(df):
    df_log =  pd.DataFrame(np.log(df/df.shift(1))).dropna()
    df_log = df_log.rename(index=str, columns={"Adj. Close": "log_ret"})
    return df_log

def get_momentum_indicators(df):
    # http://mrjbq7.github.io/ta-lib/func_groups/momentum_indicators.html
    df_index = df.index
    high = np.array(df['Adj. High'], dtype='f8') 
    low = np.array(df['Adj. Low'], dtype='f8')
    close = np.array(df['Adj. Close'], dtype='f8')
    
    momentum = dict()
    days = [7,14,28]
    for t in days:
        momentum['adx_{}'.format(t)] = ta.ADX(high, low, close, timeperiod=t)
        momentum['adxr_{}'.format(t)] = ta.ADXR(high, low, close, timeperiod=t)
        momentum['aroondown_{}'.format(t)], momentum['aroonup_{}'.format(t)] = ta.AROON(high, low, timeperiod=t)
        momentum['aroon_{}'.format(t)] = ta.AROONOSC(high, low, timeperiod=t)
        momentum['rsi_{}'.format(t)] = ta.RSI(close, timeperiod=t)
        momentum['mom_{}'.format(t)] = ta.MOM(close, timeperiod=t)
        momentum['roc_{}'.format(t)] = ta.ROC(close, timeperiod=t)
        momentum['willr_{}'.format(t)] = ta.WILLR(high, low, close, timeperiod=t)
        momentum['trix_{}'.format(t)] = ta.TRIX(close, timeperiod=t)
        
        
    momentum['apo'] = ta.APO(close, fastperiod=12, slowperiod=26, matype=0)
    momentum['macd'], momentum['macdsignal'], momentum['macdhist'] = ta.MACD(close, fastperiod=12, slowperiod=26, signalperiod=9)
    momentum['ppo'] = ta.PPO(close, fastperiod=12, slowperiod=26, matype=0)
    momentum['slowk'], momentum['slowd'] = ta.STOCH(high, low, close, fastk_period=5, slowk_period=3, slowk_matype=0, slowd_period=3, slowd_matype=0)
    momentum['fastk'], momentum['fastd'] = ta.STOCHF(high, low, close, fastk_period=5, fastd_period=3, fastd_matype=0)
    momentum['fastkrsi'], momentum['fastdrsi'] = ta.STOCHRSI(close, timeperiod=14, fastk_period=5, fastd_period=3, fastd_matype=0)
    
    
    df = pd.DataFrame.from_dict(momentum)
    df = df.set_index(df_index)
    return df

def get_hist_vol_indicators(df):
    df_index = df.index
    high = np.array(df['Adj. High'], dtype='f8') 
    low = np.array(df['Adj. Low'], dtype='f8')
    close = np.array(df['Adj. Close'], dtype='f8')
    
    hist_vol = dict()
    days = [7,14,28]
    for t in days:
        hist_vol['atr_{}'.format(t)] = ta.ATR(high, low, close, timeperiod=t)
        hist_vol['natr_{}'.format(t)] = ta.NATR(high, low, close, timeperiod=t)
        
    hist_vol['trange'] = ta.TRANGE(high, low, close)
    df = pd.DataFrame.from_dict(hist_vol)
    df = df.set_index(df_index)
    return df

def get_pattern_recognition_indicators(df):
    df_index = df.index
    open = np.array(df['Adj. Open'], dtype='f8') 
    high = np.array(df['Adj. High'], dtype='f8') 
    low = np.array(df['Adj. Low'], dtype='f8')
    close = np.array(df['Adj. Close'], dtype='f8')
    
    patterns = dict()
    patterns['CDL2CROWS'] = ta.CDL2CROWS(open, high, low, close)
    patterns['CDL3BLACKCROWS'] = ta.CDL3BLACKCROWS(open, high, low, close)
    patterns['CDL3INSIDE'] = ta.CDL3INSIDE(open, high, low, close)
    patterns['CDL3LINESTRIKE'] = ta.CDL3LINESTRIKE(open, high, low, close)
    patterns['CDL3OUTSIDE'] = ta.CDL3OUTSIDE(open, high, low, close)
    patterns['CDL3STARSINSOUTH'] = ta.CDL3STARSINSOUTH(open, high, low, close)
    patterns['CDL3WHITESOLDIERS'] = ta.CDL3WHITESOLDIERS(open, high, low, close)
    patterns['CDLABANDONEDBABY'] = ta.CDLABANDONEDBABY(open, high, low, close, penetration=0)
    patterns['CDLADVANCEBLOCK'] = ta.CDLADVANCEBLOCK(open, high, low, close)
    patterns['CDLBELTHOLD'] = ta.CDLBELTHOLD(open, high, low, close)
    patterns['CDLBREAKAWAY'] = ta.CDLBREAKAWAY(open, high, low, close)
    patterns['CDLCLOSINGMARUBOZU'] = ta.CDLCLOSINGMARUBOZU(open, high, low, close)
    patterns['CDLCONCEALBABYSWALL'] = ta.CDLCONCEALBABYSWALL(open, high, low, close)
    patterns['CDLCOUNTERATTACK'] = ta.CDLCOUNTERATTACK(open, high, low, close)
    patterns['CDLDARKCLOUDCOVER'] = ta.CDLDARKCLOUDCOVER(open, high, low, close, penetration=0)
    patterns['CDLDOJI'] = ta.CDLDOJI(open, high, low, close)
    patterns['CDLDOJISTAR'] = ta.CDLDOJISTAR(open, high, low, close)
    patterns['CDLDRAGONFLYDOJI'] = ta.CDLDRAGONFLYDOJI(open, high, low, close)
    patterns['CDLENGULFING'] = ta.CDLENGULFING(open, high, low, close)
    patterns['CDLEVENINGDOJISTAR'] = ta.CDLEVENINGDOJISTAR(open, high, low, close, penetration=0)
    patterns['CDLEVENINGSTAR'] = ta.CDLEVENINGSTAR(open, high, low, close, penetration=0)
    patterns['CDLGAPSIDESIDEWHITE'] = ta.CDLGAPSIDESIDEWHITE(open, high, low, close)
    patterns['CDLGRAVESTONEDOJI'] = ta.CDLGRAVESTONEDOJI(open, high, low, close)
    patterns['CDLHAMMER'] = ta.CDLHAMMER(open, high, low, close)
    patterns['CDLHANGINGMAN'] = ta.CDLHANGINGMAN(open, high, low, close)
    patterns['CDLHARAMI'] = ta.CDLHARAMI(open, high, low, close)
    patterns['CDLHARAMICROSS'] = ta.CDLHARAMICROSS(open, high, low, close)
    patterns['CDLHIGHWAVE'] = ta.CDLHIGHWAVE(open, high, low, close)
    patterns['CDLHIKKAKE'] = ta.CDLHIKKAKE(open, high, low, close)
    patterns['CDLHIKKAKEMOD'] = ta.CDLHIKKAKEMOD(open, high, low, close)
    patterns['CDLHOMINGPIGEON'] = ta.CDLHOMINGPIGEON(open, high, low, close)
    patterns['CDLIDENTICAL3CROWS'] = ta.CDLIDENTICAL3CROWS(open, high, low, close)
    patterns['CDLINNECK'] = ta.CDLINNECK(open, high, low, close)
    patterns['CDLINVERTEDHAMMER'] = ta.CDLINVERTEDHAMMER(open, high, low, close)
    patterns['CDLKICKING'] = ta.CDLKICKING(open, high, low, close)
    patterns['CDLKICKINGBYLENGTH'] = ta.CDLKICKINGBYLENGTH(open, high, low, close)
    patterns['CDLLADDERBOTTOM'] = ta.CDLLADDERBOTTOM(open, high, low, close)
    patterns['CDLLONGLEGGEDDOJI'] = ta.CDLLONGLEGGEDDOJI(open, high, low, close)
    patterns['CDLLONGLINE'] = ta.CDLLONGLINE(open, high, low, close)
    patterns['CDLMARUBOZU'] = ta.CDLMARUBOZU(open, high, low, close)
    patterns['CDLMATCHINGLOW'] = ta.CDLMATCHINGLOW(open, high, low, close)
    patterns['CDLMATHOLD'] = ta.CDLMATHOLD(open, high, low, close, penetration=0)
    patterns['CDLMORNINGDOJISTAR'] = ta.CDLMORNINGDOJISTAR(open, high, low, close, penetration=0)
    patterns['CDLMORNINGSTAR'] = ta.CDLMORNINGSTAR(open, high, low, close, penetration=0)
    patterns['CDLONNECK'] = ta.CDLONNECK(open, high, low, close)
    patterns['CDLPIERCING'] = ta.CDLPIERCING(open, high, low, close)
    patterns['CDLRICKSHAWMAN'] = ta.CDLRICKSHAWMAN(open, high, low, close)
    patterns['CDLRISEFALL3METHODS'] = ta.CDLRISEFALL3METHODS(open, high, low, close)
    patterns['CDLSEPARATINGLINE'] = ta.CDLSEPARATINGLINES(open, high, low, close)
    patterns['CDLSHOOTINGSTAR'] = ta.CDLSHOOTINGSTAR(open, high, low, close)
    patterns['CDLSHORTLINE'] = ta.CDLSHORTLINE(open, high, low, close)
    patterns['CDLSPINNINGTOP'] = ta.CDLSPINNINGTOP(open, high, low, close)
    patterns['CDLSTALLEDPATTERN'] = ta.CDLSTALLEDPATTERN(open, high, low, close)
    patterns['CDLSTICKSANDWICH'] = ta.CDLSTICKSANDWICH(open, high, low, close)
    patterns['CDLTAKURI'] = ta.CDLTAKURI(open, high, low, close)
    patterns['CDLTASUKIGAP'] = ta.CDLTASUKIGAP(open, high, low, close)
    patterns['CDLTHRUSTING'] = ta.CDLTHRUSTING(open, high, low, close)
    patterns['CDLTRISTAR'] = ta.CDLTRISTAR(open, high, low, close)
    patterns['CDLUNIQUE3RIVER'] = ta.CDLUNIQUE3RIVER(open, high, low, close)
    patterns['CDLUPSIDEGAP2CROWS'] = ta.CDLUPSIDEGAP2CROWS(open, high, low, close)
    patterns['DLXSIDEGAP3METHODS'] = ta.CDLXSIDEGAP3METHODS(open, high, low, close)
    df = pd.DataFrame.from_dict(patterns)
    df = df.set_index(df_index)
    return df


def get_cycle_indicators(df):
    df_index = df.index
    open = np.array(df['Adj. Open'], dtype='f8') 
    high = np.array(df['Adj. High'], dtype='f8') 
    low = np.array(df['Adj. Low'], dtype='f8')
    close = np.array(df['Adj. Close'], dtype='f8')
    
    cycle = dict()
    cycle['HT_DCPERIOD'] = ta.HT_DCPERIOD(close)
    cycle['HT_DCPHASE'] = ta.HT_DCPHASE(close)
    cycle['HT_DCPERIOD'] = ta.HT_DCPERIOD(close)
    cycle['inphase'], cycle['quadrature'] = ta.HT_PHASOR(close)
    cycle['sine'], cycle['leadsine'] = ta.HT_SINE(close)
    cycle['integer'] = ta.HT_TRENDMODE(close)
    
    df = pd.DataFrame.from_dict(cycle)
    df = df.set_index(df_index)
    return df

def get_overlap_indicators(df):
    df_index = df.index
    open = np.array(df['Adj. Open'], dtype='f8') 
    high = np.array(df['Adj. High'], dtype='f8') 
    low = np.array(df['Adj. Low'], dtype='f8')
    close = np.array(df['Adj. Close'], dtype='f8')
    
    overlap = dict()
    days = [7, 28, 56]
    for t in days:
        bbands = ta.BBANDS(close, timeperiod=t, nbdevup=2, nbdevdn=2, matype=0)
        overlap['bb_upperband_{}'.format(t)], overlap['bb_middleband_{}'.format(t)], overlap['bb_lowerband_{}'.format(t)] = bbands
            
        overlap['DEMA_{}'.format(t)] = ta.DEMA(close, timeperiod=t)
        overlap['EMA_{}'.format(t)] = ta.EMA(close, timeperiod=t)
        
        overlap['KAMA_{}'.format(t)] = ta.KAMA(close, timeperiod=t)
        overlap['MA_{}'.format(t)] = ta.MA(close, timeperiod=t, matype=0)
        overlap['MIDPOINT_{}'.format(t)] = ta.MIDPOINT(close, timeperiod=t)
        overlap['MIDPRICE_{}'.format(t)] = ta.MIDPRICE(high, low, timeperiod=t)
        overlap['SMA_{}'.format(t)] = ta.SMA(close, timeperiod=t)
        overlap['T3_{}'.format(t)] = ta.T3(close, timeperiod=t, vfactor=0)
        overlap['TEMA(_{}'.format(t)] = ta.TEMA(close, timeperiod=t)
        overlap['TRIMA_{}'.format(t)] = ta.TRIMA(close, timeperiod=t)
        overlap['WMA_{}'.format(t)] = ta.WMA(close, timeperiod=t)
        
        
        overlap['HT_TRENDLINE'] = ta.HT_TRENDLINE(close)
        overlap['mama'], overlap['fama'] = ta.MAMA(close, fastlimit=0.9, slowlimit=0.1)
        overlap['SAR'] = ta.SAR(high, low, acceleration=0, maximum=0)
        overlap['SAREXT'] = ta.SAREXT(high, low, startvalue=0, offsetonreverse=0, accelerationinitlong=0, accelerationlong=0, accelerationmaxlong=0, accelerationinitshort=0, accelerationshort=0, accelerationmaxshort=0)
    
    df = pd.DataFrame.from_dict(overlap)
    df = df.set_index(df_index)
    return df

In [83]:
df = get_features("AAPL", "09-01-09", "17-10-17")

In [84]:
print("Any Nans: {}".format(df.isnull().values.any()))
print("Nan count: {}".format(df.isnull().sum().sum()))

Any Nans: False
Nan count: 0


In [85]:
del df['Open']
del df['High']
del df['Low']
del df['Close']
del df['Ex-Dividend']
del df['Split Ratio']



In [86]:
categorical_features = df.select_dtypes(include = ["object"]).columns
numerical_features = df.select_dtypes(exclude = ["object"]).columns

print("Numerical features : " + str(len(numerical_features)))
print("Categorical features : " + str(len(categorical_features)))
train_num = df[numerical_features]
train_cat = df[categorical_features]

Numerical features : 170
Categorical features : 0


### Date, Time features

In [87]:
df['Date'] = df.index
#df[['year','month','day']] = df.Date.str.split('-',expand=True)
df['month'] = df.Date.dt.month
df['day'] = df.Date.dt.day
df.head(3)

Unnamed: 0_level_0,Volume,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume,log_ret,adx_14,adx_28,adx_7,...,bb_middleband_56,bb_middleband_7,bb_upperband_28,bb_upperband_56,bb_upperband_7,fama,mama,Date,month,day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-05-05,25850700.0,33.785029,33.836435,32.995953,33.243985,180954900.0,-0.010473,45.13626,29.777463,40.395487,...,29.72636,33.925844,35.284471,35.121029,34.914997,33.579632,33.771089,2010-05-05,5,5
2010-05-06,31539400.0,32.517882,33.174588,31.965272,32.89764,220775800.0,-0.038772,42.009384,29.185824,38.810615,...,29.853968,33.67772,35.253894,35.204408,34.702076,33.312041,32.984985,2010-05-06,5,6
2010-05-07,45923600.0,32.620693,33.188724,25.606402,31.646557,321465200.0,-0.043109,42.763943,29.243772,44.091913,...,29.952304,33.387829,35.192517,35.227971,35.140158,33.288996,32.851142,2010-05-07,5,7


In [88]:
del df['Date']
df.head(3)

Unnamed: 0_level_0,Volume,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume,log_ret,adx_14,adx_28,adx_7,...,bb_middleband_28,bb_middleband_56,bb_middleband_7,bb_upperband_28,bb_upperband_56,bb_upperband_7,fama,mama,month,day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-05-05,25850700.0,33.785029,33.836435,32.995953,33.243985,180954900.0,-0.010473,45.13626,29.777463,40.395487,...,31.944592,29.72636,33.925844,35.284471,35.121029,34.914997,33.579632,33.771089,5,5
2010-05-06,31539400.0,32.517882,33.174588,31.965272,32.89764,220775800.0,-0.038772,42.009384,29.185824,38.810615,...,32.079234,29.853968,33.67772,35.253894,35.204408,34.702076,33.312041,32.984985,5,6
2010-05-07,45923600.0,32.620693,33.188724,25.606402,31.646557,321465200.0,-0.043109,42.763943,29.243772,44.091913,...,32.149687,29.952304,33.387829,35.192517,35.227971,35.140158,33.288996,32.851142,5,7


### Create lagged features

In [89]:
steps = 20
lags_df = (pd.concat(
    [df.log_ret.shift(i) for i in range(steps)],
    axis=1,
    keys=['log_ret'] + ['log_ret_L%s' % i for i in range(1, steps)]
).dropna()
)
# To ensure that only past frames are used to predict current steering.
# (e.g to predict target at t, we use t-1,t-2, etc.)
lags_df = lags_df.drop(['log_ret'], axis=1)
final_df = pd.merge(df, lags_df, left_index=True, right_index=True)

In [90]:

final_df.head(3)

Unnamed: 0_level_0,Volume,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume,log_ret,adx_14,adx_28,adx_7,...,log_ret_L10,log_ret_L11,log_ret_L12,log_ret_L13,log_ret_L14,log_ret_L15,log_ret_L16,log_ret_L17,log_ret_L18,log_ret_L19
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-06-02,31302600.0,33.373784,34.176997,33.279969,33.52029,219118200.0,0.011891,35.027386,25.911122,33.266021,...,-0.007343,0.001575,-0.017729,-0.014334,0.021481,0.009912,0.074056,-0.043109,-0.038772,-0.010473
2010-06-03,24591000.0,33.997077,34.030491,33.456676,33.921254,172137000.0,-0.003149,33.270956,25.407941,29.872039,...,-0.016058,-0.007343,0.001575,-0.017729,-0.014334,0.021481,0.009912,0.074056,-0.043109,-0.038772
2010-06-04,23218100.0,34.079326,34.126876,33.466315,33.814587,162526700.0,-0.027569,31.553821,24.898773,27.283272,...,-0.043537,-0.016058,-0.007343,0.001575,-0.017729,-0.014334,0.021481,0.009912,0.074056,-0.043109


In [91]:
print("Any Nans: {}".format(final_df.isnull().values.any()))
print("Nan count: {}".format(final_df.isnull().sum().sum()))

Any Nans: False
Nan count: 0


In [92]:
# Convert return into binary to format for classification
final_df["y"] = final_df.apply(lambda x: 1 if x['log_ret'] > 0 else 0 , axis=1)


In [93]:
# Remove log_ret which can lead to look ahead effects
del final_df["log_ret"]

In [94]:
print("shape: {}".format(final_df.shape))


shape: (1858, 191)


### Base case using prior day direction.

In [95]:
final_df.head(3)

Unnamed: 0_level_0,Volume,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume,adx_14,adx_28,adx_7,adxr_14,...,log_ret_L11,log_ret_L12,log_ret_L13,log_ret_L14,log_ret_L15,log_ret_L16,log_ret_L17,log_ret_L18,log_ret_L19,y
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-06-02,31302600.0,33.373784,34.176997,33.279969,33.52029,219118200.0,35.027386,25.911122,33.266021,38.053326,...,0.001575,-0.017729,-0.014334,0.021481,0.009912,0.074056,-0.043109,-0.038772,-0.010473,1
2010-06-03,24591000.0,33.997077,34.030491,33.456676,33.921254,172137000.0,33.270956,25.407941,29.872039,36.704011,...,-0.007343,0.001575,-0.017729,-0.014334,0.021481,0.009912,0.074056,-0.043109,-0.038772,0
2010-06-04,23218100.0,34.079326,34.126876,33.466315,33.814587,162526700.0,31.553821,24.898773,27.283272,35.621771,...,-0.016058,-0.007343,0.001575,-0.017729,-0.014334,0.021481,0.009912,0.074056,-0.043109,0


In [96]:
y_lag = pd.DataFrame(final_df.y.shift(1).dropna())
y_lag.rename(columns={'y':'y_lag'}, inplace=True)

df = pd.merge(final_df, y_lag, left_index=True, right_index=True)
temp = df[['y','y_lag']]
temp.head()

Unnamed: 0_level_0,y,y_lag
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-06-03,0,1.0
2010-06-04,0,0.0
2010-06-07,0,0.0
2010-06-08,0,0.0
2010-06-09,0,0.0


In [97]:
accuracy = sum(temp['y']==temp['y_lag']) / len(temp.index)
print("Using the previous day direction for prediction: {:0.2f}% accuracy".format(accuracy * 100))

Using the previous day direction for prediction: 50.46% accuracy


### Base case using random direction.

In [111]:
final_df.head(3)

Unnamed: 0_level_0,Volume,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume,adx_14,adx_28,adx_7,adxr_14,...,log_ret_L11,log_ret_L12,log_ret_L13,log_ret_L14,log_ret_L15,log_ret_L16,log_ret_L17,log_ret_L18,log_ret_L19,y
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-06-02,31302600.0,33.373784,34.176997,33.279969,33.52029,219118200.0,35.027386,25.911122,33.266021,38.053326,...,0.001575,-0.017729,-0.014334,0.021481,0.009912,0.074056,-0.043109,-0.038772,-0.010473,1
2010-06-03,24591000.0,33.997077,34.030491,33.456676,33.921254,172137000.0,33.270956,25.407941,29.872039,36.704011,...,-0.007343,0.001575,-0.017729,-0.014334,0.021481,0.009912,0.074056,-0.043109,-0.038772,0
2010-06-04,23218100.0,34.079326,34.126876,33.466315,33.814587,162526700.0,31.553821,24.898773,27.283272,35.621771,...,-0.016058,-0.007343,0.001575,-0.017729,-0.014334,0.021481,0.009912,0.074056,-0.043109,0


In [112]:
y_random = pd.DataFrame(final_df.y.apply(lambda x: np.random.choice(2)))
y_random.rename(columns={'y':'y_random'}, inplace=True)
temp = pd.merge(final_df, y_random, left_index=True, right_index=True)
temp.head()

Unnamed: 0_level_0,Volume,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume,adx_14,adx_28,adx_7,adxr_14,...,log_ret_L12,log_ret_L13,log_ret_L14,log_ret_L15,log_ret_L16,log_ret_L17,log_ret_L18,log_ret_L19,y,y_random
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-06-02,31302600.0,33.373784,34.176997,33.279969,33.52029,219118200.0,35.027386,25.911122,33.266021,38.053326,...,-0.017729,-0.014334,0.021481,0.009912,0.074056,-0.043109,-0.038772,-0.010473,1,0
2010-06-03,24591000.0,33.997077,34.030491,33.456676,33.921254,172137000.0,33.270956,25.407941,29.872039,36.704011,...,0.001575,-0.017729,-0.014334,0.021481,0.009912,0.074056,-0.043109,-0.038772,0,0
2010-06-04,23218100.0,34.079326,34.126876,33.466315,33.814587,162526700.0,31.553821,24.898773,27.283272,35.621771,...,-0.007343,0.001575,-0.017729,-0.014334,0.021481,0.009912,0.074056,-0.043109,0,0
2010-06-07,27082300.0,33.183584,33.6578,32.723504,32.89507,189576100.0,30.498241,24.552963,24.615916,34.939811,...,-0.016058,-0.007343,0.001575,-0.017729,-0.014334,0.021481,0.009912,0.074056,0,1
2010-06-08,31676500.0,33.193865,33.304387,32.199167,32.249288,221735500.0,29.871349,24.317626,24.03307,34.331855,...,-0.043537,-0.016058,-0.007343,0.001575,-0.017729,-0.014334,0.021481,0.009912,0,0


In [113]:
accuracy = sum(temp['y']==temp['y_random']) / len(temp.index)
print("Using random direction for prediction: {:0.2f}% accuracy".format(accuracy * 100))

Using random direction for prediction: 48.76% accuracy
