In [1]:
import pandas as pd
import numpy as np
from datetime import datetime as dt
import json

<h5>Cleaning up Yahoo data</h5>

In [2]:
def clean_yahoo_df(path):
    ticker = path.split('/')[-1].split('.')[0]
    df = pd.read_csv(path)
    
    df.drop(['High','Low','Open','Close'], axis=1, inplace=True)
    df['Adj Close'] = df['Adj Close'].round(2)
    #df['Volume'] = df['Volume'].astype('int32')
    df.rename(columns={'Volume': f'{ticker}_volume', 'Adj Close': ticker}, inplace=True)
    df.set_index('Date', inplace=True)
    
    return df

In [3]:
main_df = clean_yahoo_df('final_dfs/spy.csv')
main_df.head()

Unnamed: 0_level_0,spy_volume,spy
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1993-01-29,1003200.0,26.3
1993-02-01,480500.0,26.49
1993-02-02,201300.0,26.54
1993-02-03,529400.0,26.82
1993-02-04,531500.0,26.94


A few economic indicators are updated weekly, monthly, etc. Some of those days are on the weekend when equity markets are not opened. For that, we will have to outer join those dataframes with the main dataframes, forward fill the NaN data, and remove weekend dates. Below is the list of SPY dates that will be used later.

In [4]:
spy_dates = list(main_df.index)

Below are all tickers pulled from Yahoo and saved locally. I will need to find a way to automate the refresh daily but for now will use the data I have to optimize the model.

Also note, for the individual sector ETFs, I will use Vanguard ETFs instead of SPDR even if SPY is an SPDR ETFs. Even though most of the SPDR ETFs are larger and more liquid I find the Vanguard ETFs more accurately reflects their respective idustries as the portfolios include small and mid caps. SPDR ETFs are heavily concentrated in stocks already included in SPY.

In [5]:
# note: leaving out USO becuase we have /CL

etfs = ['qqq', 'iwm']

sector_etf_path = 'sector etfs - Vanguard'
sector_etf = ['vaw', 'vcr', 'vdc', 'vde', 'vfh', 'vgt', 'vht', 'vis', 'vnq', 'vox', 'vpu']

futures_path = 'futures'
futures = ['cl=f', 'gc=f']

bonds_etf_path = 'bonds etfs'
bonds = ['ief', 'tlt']

In [6]:
for ticker in etfs:
    temp_df = clean_yahoo_df(f'final_dfs/{ticker}.csv')
    main_df = main_df.join(temp_df, how='left')

for ticker in sector_etf:
    temp_df = clean_yahoo_df(f'final_dfs/{sector_etf_path}/{ticker}.csv')
    main_df = main_df.join(temp_df, how='left')

for ticker in futures:
    temp_df = clean_yahoo_df(f'final_dfs/{futures_path}/{ticker}.csv')
    main_df = main_df.join(temp_df, how='left')
    
for ticker in bonds:
    temp_df = clean_yahoo_df(f'final_dfs/{bonds_etf_path}/{ticker}.csv')
    main_df = main_df.join(temp_df, how='left')

In [7]:
main_df.head()

Unnamed: 0_level_0,spy_volume,spy,qqq_volume,qqq,iwm_volume,iwm,vaw_volume,vaw,vcr_volume,vcr,...,vpu_volume,vpu,cl=f_volume,cl=f,gc=f_volume,gc=f,ief_volume,ief,tlt_volume,tlt
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1993-01-29,1003200.0,26.3,,,,,,,,,...,,,,,,,,,,
1993-02-01,480500.0,26.49,,,,,,,,,...,,,,,,,,,,
1993-02-02,201300.0,26.54,,,,,,,,,...,,,,,,,,,,
1993-02-03,529400.0,26.82,,,,,,,,,...,,,,,,,,,,
1993-02-04,531500.0,26.94,,,,,,,,,...,,,,,,,,,,


<h5>Other charts (Treasure bond yields, Libor rate, Mortgage rate, M1 & M2 money supply, VIX, DIX)</h5>
<br>
Data pulled from various sources.

In [8]:
# maybe include VVIX in the future?
# Notes:
# 30 year fixed mortgage rate is updated monthly (need may numbers)
# money supply data is updated weekly

economics_path = 'economics'
economics = ['CPIAUCSL', 'DFF', 'DGS10', 'DGS30', 'M1', 'M2', 'MORTGAGE15US', 'MORTGAGE30US',
             'T10YIE', 'UNRATE', 'USD1MTD156N', 'USD6MTD156N', 'USD12MD156N']
economic_names = ['CPI', 'fed_funds_rate', '10_yr_treasury', '30_yr_treasury', 'M1_supply',
                  'M2_supply', '15_yr_mortgage', '30_yr_mortgage', '10_yr_inflation',
                  'unemployment', '1m_libor', '3m_libor', '12m_libor']

currency_path = 'currency'
currency = ['dxy']

vix = '^vix'
dix = 'DIX'

In [9]:
# economic data
for code, name in zip(economics, economic_names):
    try:
        economics_df = pd.read_csv(f'final_dfs/{economics_path}/{code}.csv')
        economics_df = economics_df.loc[economics_df['DATE'] >= '1993-01-01']
        economics_df.rename(columns={code: name}, inplace=True)
        economics_df.set_index('DATE', inplace=True)
        if code in ['M1_supply', 'M2_supply']:
            economics_df = economics_df * 1000000000
        elif code == 'GFDEBTN':
            economics_df = economics_df * 1000000
        elif code == 'CPIAUCSL':
            economics_df
        else:
            economics_df = economics_df / 100
    except:
        economics_df.replace('.', np.nan, inplace=True)
        economics_df.ffill(inplace=True)
        economics_df = economics_df.astype(float)
        economics_df = economics_df / 100
    main_df = main_df.join(economics_df, how='outer')

# currency (check date format)
currency_df = pd.read_csv(f'final_dfs/{currency_path}/{currency[0]}.csv')
currency_df.drop([' Open',' High',' Low'], axis=1, inplace=True)
currency_df.rename(columns={' Close': currency[0]}, inplace=True)
currency_df['Date'] = currency_df['Date'].apply(lambda x: dt.strptime(x, '%m/%d/%y').strftime('%Y-%m-%d'))
currency_df.set_index('Date', inplace=True)
main_df = main_df.join(currency_df, how='left')

# vix
vix_df = clean_yahoo_df(f'final_dfs/{vix}.csv')
vix_df.drop('^vix_volume', axis=1, inplace=True)
main_df = main_df.join(vix_df, how='left')

# dix
dix_df = pd.read_csv(f'final_dfs/{dix}.csv')
dix_df.drop('price', axis=1, inplace=True)
dix_df.set_index('date', inplace=True)
main_df = main_df.join(dix_df, how='left')

In [10]:
main_df.head(3)

Unnamed: 0,spy_volume,spy,qqq_volume,qqq,iwm_volume,iwm,vaw_volume,vaw,vcr_volume,vcr,...,30_yr_mortgage,10_yr_inflation,unemployment,1m_libor,3m_libor,12m_libor,dxy,^vix,dix,gex
1993-01-01,,,,,,,,,,,...,,,0.073,,,,,,,
1993-01-02,,,,,,,,,,,...,,,,,,,,,,
1993-01-03,,,,,,,,,,,...,,,,,,,,,,


Forward fill any data that is updated weekly, monthly, etc. Remove all dates when market is closed (weekends, holidays).

In [11]:
main_df.ffill(inplace=True)

In [12]:
main_df = main_df[main_df.index.isin(spy_dates)]
main_df.head()

Unnamed: 0,spy_volume,spy,qqq_volume,qqq,iwm_volume,iwm,vaw_volume,vaw,vcr_volume,vcr,...,30_yr_mortgage,10_yr_inflation,unemployment,1m_libor,3m_libor,12m_libor,dxy,^vix,dix,gex
1993-01-29,1003200.0,26.3,,,,,,,,,...,0.0786,,0.073,0.03125,0.033672,0.036875,92.48,12.42,,
1993-02-01,480500.0,26.49,,,,,,,,,...,0.0786,,0.071,0.03125,0.03375,0.036875,93.39,12.33,,
1993-02-02,201300.0,26.54,,,,,,,,,...,0.0786,,0.071,0.031875,0.034375,0.0375,93.94,12.25,,
1993-02-03,529400.0,26.82,,,,,,,,,...,0.0786,,0.071,0.031875,0.034375,0.0375,94.33,12.12,,
1993-02-04,531500.0,26.94,,,,,,,,,...,0.0786,,0.071,0.031875,0.034375,0.0375,94.33,12.29,,


<h5>Handling Null Values</h5>
<br>
I have decided to drop any rows with NaN. DIX/GEX data had the most NaN values since it was not recorded until 2011. I could just remove the columns but the main purpose of this project was to see how accurate the DIX index was at predicting SPY prices so I left it in.
<br>
Will do another analysis without DIX/GEX. When that happens we will have data from ~2004

In [13]:
main_df.isna().sum().sort_values(ascending=False).head(5)

gex           4598
dix           4598
vox           2939
vox_volume    2939
vnq           2939
dtype: int64

In [14]:
main_df = main_df.dropna(axis=0)
main_df.index = pd.to_datetime(main_df.index,format='%Y-%m-%d')
main_df.head()

Unnamed: 0,spy_volume,spy,qqq_volume,qqq,iwm_volume,iwm,vaw_volume,vaw,vcr_volume,vcr,...,30_yr_mortgage,10_yr_inflation,unemployment,1m_libor,3m_libor,12m_libor,dxy,^vix,dix,gex
2011-05-02,126278700.0,113.55,48149400.0,53.57,48962400.0,74.88,59000.0,74.33,119000.0,58.26,...,0.0478,0.0256,0.09,0.002102,0.004305,0.00761,72.95,15.99,0.378842,1897313000.0
2011-05-03,138375000.0,113.14,38100000.0,53.31,56070000.0,73.94,65800.0,73.38,40400.0,58.0,...,0.0478,0.0255,0.09,0.002095,0.004302,0.007578,73.14,16.7,0.383411,1859731000.0
2011-05-04,182678500.0,112.39,53097200.0,53.23,64754600.0,72.98,89000.0,72.09,26000.0,57.78,...,0.0478,0.0255,0.09,0.00209,0.004285,0.007562,73.03,17.08,0.392122,1717764000.0
2011-05-05,226900000.0,111.37,72200000.0,52.94,91858900.0,72.65,89400.0,71.42,26300.0,57.68,...,0.0471,0.0248,0.09,0.002062,0.00427,0.00752,74.19,18.2,0.405457,1361864000.0
2011-05-06,222787200.0,111.86,72507000.0,53.11,76446200.0,73.03,101100.0,71.95,23800.0,57.8,...,0.0471,0.0248,0.09,0.002038,0.00425,0.00749,74.84,18.4,0.418649,1490329000.0


In [15]:
cols_to_normalize = []
for i in main_df.columns:
    if main_df[i].iloc[0] >= 1:
        cols_to_normalize.append(i)
    #print(i, main_df[i].iloc[0])
cols_to_normalize.remove('spy')

<h5>Last Value Method</h5>
We will be shifiting the target value up by one day, meaning we will set the current adjusted closing price as the previous day's adjusted closing price

In [16]:
main_df['spy_closed'] = main_df['spy'].shift(-1)
main_df.drop(main_df.tail(1).index, inplace=True)

Normalizing Data

In [17]:
spy_price = main_df['spy_closed']
temp_df = main_df[cols_to_normalize].copy()
tickers_df = main_df.drop(['spy','spy_closed'], axis=1).copy()
tickers_df = tickers_df.drop(cols_to_normalize, axis=1).copy()

In [18]:
from sklearn.preprocessing import MinMaxScaler

scalar = MinMaxScaler()
temp_df = pd.DataFrame(scalar.fit_transform(temp_df), columns=cols_to_normalize)
tickers_df = tickers_df.join(temp_df.set_index(main_df.index))
tickers_df = tickers_df.join(spy_price)
tickers_df.head()

Unnamed: 0,fed_funds_rate,10_yr_treasury,30_yr_treasury,15_yr_mortgage,30_yr_mortgage,10_yr_inflation,unemployment,1m_libor,3m_libor,12m_libor,...,ief,tlt_volume,tlt,CPI,M1_supply,M2_supply,dxy,^vix,gex,spy_closed
2011-05-02,0.0009,0.0331,0.0438,0.0397,0.0478,0.0256,0.09,0.002102,0.004305,0.00761,...,0.0,0.057076,0.0,0.0,0.0,0.00017,0.0,0.093134,0.334306,113.14
2011-05-03,0.0009,0.0328,0.0436,0.0397,0.0478,0.0255,0.09,0.002095,0.004302,0.007578,...,0.00485,0.073032,0.004318,0.0,0.0,0.00017,0.006271,0.102787,0.331719,112.39
2011-05-04,0.0009,0.0325,0.0433,0.0397,0.0478,0.0255,0.09,0.00209,0.004285,0.007562,...,0.009469,0.085246,0.007711,0.0,0.0,0.00017,0.00264,0.107954,0.321944,111.37
2011-05-05,0.0009,0.0318,0.0426,0.0389,0.0471,0.0248,0.09,0.002062,0.00427,0.00752,...,0.018707,0.114378,0.015011,0.0,0.0,0.00017,0.040924,0.123182,0.297442,111.86
2011-05-06,0.0009,0.0319,0.0429,0.0389,0.0471,0.0248,0.09,0.002038,0.00425,0.00749,...,0.01963,0.112406,0.011618,0.0,0.0,0.00017,0.062376,0.125901,0.306286,112.3


In [19]:
tickers_df.columns

Index(['fed_funds_rate', '10_yr_treasury', '30_yr_treasury', '15_yr_mortgage',
       '30_yr_mortgage', '10_yr_inflation', 'unemployment', '1m_libor',
       '3m_libor', '12m_libor', 'dix', 'spy_volume', 'qqq_volume', 'qqq',
       'iwm_volume', 'iwm', 'vaw_volume', 'vaw', 'vcr_volume', 'vcr',
       'vdc_volume', 'vdc', 'vde_volume', 'vde', 'vfh_volume', 'vfh',
       'vgt_volume', 'vgt', 'vht_volume', 'vht', 'vis_volume', 'vis',
       'vnq_volume', 'vnq', 'vox_volume', 'vox', 'vpu_volume', 'vpu',
       'cl=f_volume', 'cl=f', 'gc=f_volume', 'gc=f', 'ief_volume', 'ief',
       'tlt_volume', 'tlt', 'CPI', 'M1_supply', 'M2_supply', 'dxy', '^vix',
       'gex', 'spy_closed'],
      dtype='object')

In [44]:
valid_test_size = int(len(tickers_df) * 0.2)
train_size = len(tickers_df) - valid_test_size - valid_test_size

# train = tickers_df[:train_size].copy()
valid = tickers_df.iloc[train_size:train_size+valid_test_size].copy()
train_valid = tickers_df.iloc[:train_size+valid_test_size].copy()
test = tickers_df.iloc[train_size+valid_test_size:].copy()

Training model with Time Series Cross Validation using Linear Regression.

Will start with forward CV for now.

In [83]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

def train_test_time_split(df, train_size, target_col):
    target = df[target_col]
    df = df.drop(target_col, axis=1)
    
    #test_size = len(df) - int(len(df) * train_size)
    train_size = int(len(df) * train_size)
    
    prediction_list = []
    for i in range(train_size, len(df)):
        X_train, X_test = df.iloc[:i], np.array(df.iloc[i]).reshape(1,-1)
        y_train = target.iloc[:i]
        
        model = LinearRegression()
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        prediction_list.append(pred[0])
    
    return prediction_list

# train_test_time_split(train_valid, 0.75, 'spy_closed')

In [89]:
sample = train_test_time_split(train_valid, 0.75, 'spy_closed')

In [90]:
from sklearn.metrics import mean_squared_error
import math

math.sqrt(mean_squared_error(valid['spy_closed'], sample))

1.7182848961430208

In [23]:
# notes
# CPI needs April data
# MI & M2 missing last week of April data
# explore adding options data if possible
# add industry ticker data and volume
# build funciton to find optimal parameters
# experiment with a few models (xgboost, decision tree)
# find a way to predict trend data (trend line for week/month out)