In [21]:
#i mport libraries
from marketstackAPI import Marketstack
import pandas as pd
import numpy as np
import cufflinks as cf
import ta
import holidays
import matplotlib as plt
from datetime import datetime
from datetime import timedelta
from imblearn.over_sampling import SMOTE
from IPython.core.display import display, HTML

In [22]:
# jupyter notebook settings and chart size configs
display(HTML("<style>.container { width:100% !important; }</style>"))
plt.rcParams['figure.figsize'] = [12, 5]
plt.rcParams['figure.dpi'] = 200
pd.options.plotting.backend = "plotly"

In [23]:
# initialize and set parameters
MS = Marketstack()
cf.set_config_file(theme='henanigans',sharing='public',offline=True)

In [85]:
def raw_data_preprocessing(raw_data):
    """
    Clean raw_data by removing extra columns, renaming columns, order by date in descending order, reset index number.
    this data format will be used as the standard format for all other feature engineering related function calls.
    
    Parameters
    ----------
    raw_data : pandas dataframe that contains ['date','adj_high','adj_low','adj_close','adj_open','adj_volume'] columns, ordered by date in ascending order.
    
    Return:
    ----------
    standard_data: pandas dataframe that contains ['date','high','low','close','open','volume'] columns, ordered by date in descending order.
    
    """
    data = raw_data[:]
    data = data[['date','adj_high','adj_low','adj_close','adj_open','adj_volume']]
    data.columns = ['date','high','low','close','open','volume']
    data = data[::-1]
    data.reset_index(inplace=True, drop=True)
    return data

In [124]:
def get_ta_indicators(standard_data, prefix = ''):
    """
    Compute technical indicators for every period, each row within standard_data is a period.
    
    Parameters
    ----------
    standard_data : pandas dataframe that contains ['date','high','low','close','open','volume'] columns, ordered by date in descending order.
    prefix: string that will be concatennated to before all technical indicators names
    
    Return:
    ----------
    data: pandas dataframe that contains computed technical indicators with corrsponding name
    
    """
    data = standard_data[:]
    df = pd.DataFrame()
    df.insert(0, prefix+'_stochrsi_14' if prefix else 'stochrsi_14', ta.momentum.stochrsi(close = data.close)/100) # range 0 to 100 rescaled to 0 to 1
    df.insert(0, prefix+'_mfi_14' if prefix else 'mfi_14', ta.volume.money_flow_index(high = data.high, low = data.low, close = data.close, volume= data.volume)/100) # range 0 to 100 rescaled to 0 to 1
    df.insert(0, prefix+'_adx_14' if prefix else 'adx_14', ta.trend.adx(high = data.high, low = data.low, close = data.close)/100) # range 0 to 100 rescaled to 0 to 1
    df.insert(0, prefix+'_adx_neg_14' if prefix else 'adx_neg_14', ta.trend.adx_neg(high = data.high, low = data.low, close = data.close)/100) # range 0 to 100 rescaled to 0 to 1
    df.insert(0, prefix+'_adx_pos_14' if prefix else 'adx_pos_14', ta.trend.adx_pos(high = data.high, low = data.low, close = data.close)/100) # range 0 to 100 rescaled to 0 to 1
    df.insert(0, prefix+'_aroon_up_25' if prefix else 'aroon_up_25', ta.trend.aroon_up(close = data.close)/100) # range 0 to 100 rescaled to 0 to 1
    df.insert(0, prefix+'_aroon_down_25' if prefix else 'aroon_down_25', ta.trend.aroon_down(close = data.close)/100) # range 0 to 100 rescaled to 0 to 1
    df.insert(0, prefix+'_aroon_25' if prefix else 'aroon_25', (ta.trend.aroon_up(close = data.close) - ta.trend.aroon_down(close = data.close))/100) # range 0 to 100 rescaled to 0 to 1
    
    return df

In [60]:
def get_percent_changes(standard_data, prefix = ''):
    """
    Compute basic % changes
    
    Parameters
    ----------
    standard_data : pandas dataframe that contains ['date','high','low','close','open','volume'] columns, ordered by date in descending order.
    prefix: string that will be concatennated to before all technical indicators names
    
    Return:
    ----------
    data: pandas dataframe that contains computed % changes indicators with corrsponding name
    
    """
    data = standard_data[:]
    df = pd.DataFrame()
    #add volume % change from yesterday to today
    df.insert(0,'volume_change',data.volume/data.volume.shift(1)-1)
    #add price % change from yesterday to today
    df.insert(0,'price_change',data.close/data.close.shift(1)-1)
    return df

In [75]:
def get_target_variable(standard_data):
    """
    Compute target variable.
    the target variable indicates three classes.
    1 : next day is going up significantly
    -1: next day is going down significantly
    0 : no significant movement for the next day.
    
    How significant change is defined using more than 1% change at the moment. could be changing to something else.
    
    
    Parameters
    ----------
    standard_data : pandas dataframe that contains ['date','high','low','close','open','volume'] columns, ordered by date in descending order.
    
    Return:
    ----------
    data: pandas dataframe that contains target variable
    
    """
    data = standard_data[:]
    #creating Y
    #calculate daily % change using daily close using the NEXT day close / today close
    df = pd.DataFrame()
    target = data.close.shift(-1)/data.close-1
    target[target > 0.01] = 1
    target[target < -0.01] = -1
    target[(target < 1) & (target > -1)] = 0
    df.insert(0,'target',target)
    return df

In [25]:
def plot(standard_data):
    """
    Draw interactive candle stick chart OHLC Volume
    
    Parameters
    ----------
    standard_data : pandas dataframe that contains ['date','high','low','close','open','volume'] columns.
    
    """
    qf = cf.QuantFig(standard_data,legend='bottom')
    qf.add_volume()
    qf.iplot()

In [61]:
def hist(data):
    """
    Draw histogram with 100 bins.
    
    Parameters
    ----------
    data : pandas series
    
    """
    fig = data.hist()
    fig.update_traces(xbins=dict( # bins used for histogram
        start=min(data),
        end=max(data),
        size=(max(data)-min(data))/100
    ))
    fig.show()

In [80]:
# get raw data
raw_data = MS.get('NDAQ')

In [86]:
# data prep
data = raw_data_preprocessing(raw_data)
target = get_target_variable(data)

In [126]:
indicators = get_ta_indicators(data, 'daily')


invalid value encountered in double_scalars


invalid value encountered in double_scalars



In [129]:
full_data = indicators

In [130]:
full_data.tail()

Unnamed: 0,daily_aroon_25,daily_aroon_down_25,daily_aroon_up_25,daily_adx_pos_14,daily_adx_neg_14,daily_adx_14,daily_mfi_14,daily_stochrsi_14
2512,0.64,0.04,0.68,0.199109,0.234348,0.263674,0.440764,0.002351
2513,0.6,0.04,0.64,0.265529,0.21207,0.252835,0.439239,0.004555
2514,0.56,0.04,0.6,0.253531,0.202487,0.242771,0.419931,0.00443
2515,0.52,0.04,0.56,0.285683,0.188221,0.24012,0.476818,0.005735
2516,0.4,0.12,0.52,0.269465,0.185157,0.236215,0.548719,0.004814


In [12]:
# holiday info
min_year = int(min(data['date'])[:4])-2
max_year = int(max(data['date'])[:4])+2
min_date = str(min_year)+'-01-01'
max_date = str(max_year)+'-12-31'
dates = pd.date_range(min_date,max_date).values
holidays = holidays.UnitedStates(years=range(min_year,max_year))