In [82]:
#import libraries
from marketstackAPI import Marketstack
import pandas as pd
import numpy as np
import cufflinks as cf
import ta
import holidays
import matplotlib as plt
from datetime import datetime
from datetime import timedelta
from imblearn.over_sampling import SMOTE
from IPython.core.display import display, HTML

In [33]:
plt.rcParams['figure.figsize'] = [12, 5]
plt.rcParams['figure.dpi'] = 200
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.options.plotting.backend = "plotly"

In [2]:
#initialize and set parameters
MS = Marketstack()
cf.set_config_file(theme='henanigans',sharing='public',offline=True)

In [134]:
#function for data prepration
def raw_data_preprocessing(data):
    #drop extra columns from data source
    data = data[['date','adj_high','adj_low','adj_close','adj_open','adj_volume']]
    data.columns = ['date','high','low','close','open','volume']
    
    #add common technical indicators into dataframe also reverse the data into descending order.
    data = ta.add_all_ta_features(data[::-1], open="open", high="high", low="low", close="close", volume="volume",)
    
    #add volume % change from yesterday to today
    data.insert(1,'volume_change',data.volume/data.volume.shift(1)-1)

    #add price % change from yesterday to today
    data.insert(1,'price_change',data.close/data.close.shift(1)-1)

    
    #creating Y
    #calculate daily % change using daily close using the NEXT day close / today close
    data.insert(1,'next_close_change',data.close.shift(-1)/data.close-1)
    
    #reset index
    data.reset_index(inplace=True, drop=True)
    return data

In [135]:
def plot(df):
    qf = cf.QuantFig(data,legend='bottom')
    qf.add_volume()
#     qf.add_rsi()
#     qf.add_adx()
    qf.iplot()

In [136]:
def summary(indicator):
#     print('min:', min(indicator))
#     print('max:', max(indicator))
    fig = indicator.hist()
    fig.update_traces(xbins=dict( # bins used for histogram
        start=min(indicator),
        end=max(indicator),
        size=(max(indicator)-min(indicator))/100
    ))
    fig.show()

In [137]:
# get data
data = MS.get('SPY')
data = raw_data_preprocessing(data)

70273756d5520f9fe1c43de060cf78ba



invalid value encountered in double_scalars


invalid value encountered in double_scalars



In [138]:
data.columns

Index(['date', 'next_close_change', 'price_change', 'volume_change', 'high',
       'low', 'close', 'open', 'volume', 'volume_adi', 'volume_obv',
       'volume_cmf', 'volume_fi', 'volume_mfi', 'volume_em', 'volume_sma_em',
       'volume_vpt', 'volume_nvi', 'volume_vwap', 'volatility_atr',
       'volatility_bbm', 'volatility_bbh', 'volatility_bbl', 'volatility_bbw',
       'volatility_bbp', 'volatility_bbhi', 'volatility_bbli',
       'volatility_kcc', 'volatility_kch', 'volatility_kcl', 'volatility_kcw',
       'volatility_kcp', 'volatility_kchi', 'volatility_kcli',
       'volatility_dcl', 'volatility_dch', 'volatility_dcm', 'volatility_dcw',
       'volatility_dcp', 'volatility_ui', 'trend_macd', 'trend_macd_signal',
       'trend_macd_diff', 'trend_sma_fast', 'trend_sma_slow', 'trend_ema_fast',
       'trend_ema_slow', 'trend_adx', 'trend_adx_pos', 'trend_adx_neg',
       'trend_vortex_ind_pos', 'trend_vortex_ind_neg', 'trend_vortex_ind_diff',
       'trend_trix', 'trend_mass_ind

In [None]:
full_basic_data = data[['date']]

In [124]:
summary(data['volume_change'])

In [28]:
plot(data)

In [8]:
min_year = int(min(data['date'])[:4])-2
max_year = int(max(data['date'])[:4])+2
min_date = str(min_year)+'-01-01'
max_date = str(max_year)+'-12-31'
dates = pd.date_range(min_date,max_date).values
holidays = holidays.UnitedStates(years=range(min_year,max_year))

In [51]:
fig = data.change.hist()
fig.update_traces(xbins=dict( # bins used for histogram
        start=-1,
        end=1,
        size=0.001
    ))
fig.show()

In [77]:
x = data.close_change[:]

In [78]:
x[x>0.01] = 1
x[x<-0.01] = -1

In [79]:
x[(x > -1) & (x < 1)] = 0

In [84]:
smote = SMOTE(random_state = 1)

In [85]:
x.hist()

In [98]:
#oversampling to balance data
X_res, y_res = smote.fit_resample([int(i) for i in x[:-1]], data[:-1])

TypeError: '<' not supported between instances of 'float' and 'str'

[1,
 1,
 0,
 0,
 -1,
 1,
 0,
 1,
 0,
 0,
 -1,
 0,
 0,
 -1,
 1,
 -1,
 1,
 1,
 0,
 0,
 1,
 -1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 -1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 -1,
 0,
 -1,
 1,
 -1,
 0,
 1,
 0,
 0,
 0,
 -1,
 -1,
 1,
 -1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 -1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 -1,
 0,
 0,
 0,
 0,
 -1,
 0,
 -1,
 1,
 -1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 -1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 -1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 -1,
 0,
 -1,
 1,
 1,
 0,
 1,
 0]

0      1.0
1      1.0
2      0.0
3      0.0
4     -1.0
      ... 
165    1.0
166    1.0
167    0.0
168    1.0
169    0.0
Name: change, Length: 170, dtype: float64

In [139]:
data.head()

Unnamed: 0,date,next_close_change,price_change,volume_change,high,low,close,open,volume,volume_adi,...,momentum_wr,momentum_ao,momentum_kama,momentum_roc,momentum_ppo,momentum_ppo_signal,momentum_ppo_hist,others_dr,others_dlr,others_cr
0,2020-06-02T00:00:00+0000,0.013308,,,308.13,305.1,308.08,306.55,73635043.0,71204840.0,...,,,,,,,,-10.555622,,0.0
1,2020-06-03T00:00:00+0000,0.0353,0.013308,0.246011,313.22,309.94,312.18,310.24,91750087.0,104771900.0,...,,,,,,,,1.330823,1.322046,1.330823
2,2020-06-08T00:00:00+0000,-0.007457,0.0353,-0.200979,323.41,319.63,323.2,320.22,73310274.0,169936600.0,...,,,,,,,,3.530015,3.469138,4.907816
3,2020-06-09T00:00:00+0000,-0.00558,-0.007457,0.052713,323.2849,319.36,320.79,320.3,77174695.0,148997700.0,...,,,,,,,,-0.745668,-0.748462,4.125552
4,2020-06-10T00:00:00+0000,-0.057649,-0.00558,0.2173,322.39,318.2209,319.0,321.42,93944722.0,90164770.0,...,,,,,,,,-0.557997,-0.55956,3.544534
