In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import json

<h5>Cleaning up Yahoo data</h5>

In [2]:
def clean_yahoo_df(path):
    """Load one Yahoo price-history CSV and keep only adjusted close and volume.

    The ticker is taken from the file name: everything before the final
    extension, so ``'final_dfs/spy.csv'`` yields ``'spy'``.

    Parameters
    ----------
    path : str
        '/'-separated path to a CSV containing the columns ``Date``, ``High``,
        ``Low``, ``Open``, ``Close``, ``Volume`` and ``Adj Close``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Date``; ``Volume`` is renamed to ``<ticker>_volume`` and
        ``Adj Close`` (rounded to 2 decimals) is renamed to ``<ticker>``.
    """
    # Strip only the last extension: a file name such as 'bp.l.csv' must give
    # the ticker 'bp.l', whereas split('.')[0] would truncate it to 'bp'.
    ticker = path.split('/')[-1].rsplit('.', 1)[0]
    raw = pd.read_csv(path)

    # Chained, non-mutating transforms instead of a series of inplace edits.
    return (
        raw.drop(columns=['High', 'Low', 'Open', 'Close'])
           .assign(**{'Adj Close': lambda frame: frame['Adj Close'].round(2)})
           .rename(columns={'Volume': f'{ticker}_volume', 'Adj Close': ticker})
           .set_index('Date')
    )

In [3]:
# SPY is the base frame; the other series are left-joined onto its dates further down.
main_df = clean_yahoo_df(path='final_dfs/spy.csv')
main_df.head(5)

Unnamed: 0_level_0,spy_volume,spy
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1993-01-29,1003200.0,26.3
1993-02-01,480500.0,26.49
1993-02-02,201300.0,26.54
1993-02-03,529400.0,26.82
1993-02-04,531500.0,26.94


Below are all the tickers pulled from Yahoo and saved locally. I will need to find a way to automate the daily refresh, but for now I will use the data I have to optimize the model.

Also note: for the individual sector ETFs, I will use Vanguard ETFs instead of SPDR ones, even though SPY is itself an SPDR ETF. Although most of the SPDR ETFs are larger and more liquid, I find that the Vanguard ETFs more accurately reflect their respective industries, as their portfolios include small and mid caps. SPDR ETFs are heavily concentrated in stocks already included in SPY.

In [4]:
# Yahoo tickers to load, grouped by the sub-folder of final_dfs/ that holds their CSVs.
# Note: USO is left out because we already have /CL.

# stored directly under final_dfs/
etfs = ['qqq', 'iwm']

sector_etf_path = 'sector etfs - Vanguard'
sector_etf = [
    'vaw', 'vcr', 'vdc', 'vde',
    'vfh', 'vgt', 'vht', 'vis',
    'vnq', 'vox', 'vpu',
]

futures_path = 'futures'
futures = ['cl=f', 'gc=f']

bonds_etf_path = 'bonds etfs'
bonds = ['ief', 'tlt']

In [5]:
# Left-join every Yahoo series onto main_df, one (folder, tickers) group at a time.
# The group order and the order within each list determine the column order.
yahoo_sources = [
    ('final_dfs', etfs),
    (f'final_dfs/{sector_etf_path}', sector_etf),
    (f'final_dfs/{futures_path}', futures),
    (f'final_dfs/{bonds_etf_path}', bonds),
]

for folder, tickers in yahoo_sources:
    for ticker in tickers:
        # Already joined (the cell was re-run): joining again would raise a
        # ValueError on the overlapping column names, so skip it instead.
        if ticker in main_df.columns:
            continue
        temp_df = clean_yahoo_df(f'{folder}/{ticker}.csv')
        main_df = main_df.join(temp_df, how='left')

In [6]:
# The left join keeps main_df's dates, so series with no data for these first rows show NaN.
main_df.head(5)

Unnamed: 0_level_0,spy_volume,spy,qqq_volume,qqq,iwm_volume,iwm,vaw_volume,vaw,vcr_volume,vcr,...,vpu_volume,vpu,cl=f_volume,cl=f,gc=f_volume,gc=f,ief_volume,ief,tlt_volume,tlt
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1993-01-29,1003200.0,26.3,,,,,,,,,...,,,,,,,,,,
1993-02-01,480500.0,26.49,,,,,,,,,...,,,,,,,,,,
1993-02-02,201300.0,26.54,,,,,,,,,...,,,,,,,,,,
1993-02-03,529400.0,26.82,,,,,,,,,...,,,,,,,,,,
1993-02-04,531500.0,26.94,,,,,,,,,...,,,,,,,,,,


Other charts (Treasury bond yields, Libor rate, Mortgage rate, M1 & M2 money supply, VIX, DIX)

In [7]:
# Non-Yahoo series: CSV base names (without '.csv') and their sub-folders of final_dfs/.
# Notes:
# - the 30-year fixed mortgage rate is updated monthly (need May numbers)
# - money supply data is updated weekly
# - maybe include VVIX in the future?

yields_path = 'yields'
treasury_yields = [
    '10-year-treasury-bond-rate-yield-chart',
    '30-year-fixed-mortgage-rate-chart',
    '30-year-treasury-bond-rate-yield-chart',
    'fed-funds-rate-historical-chart',
]
libor_yields = ['historical-libor-rates-chart']

money_supply_path = 'money supply'
money_supply = ['M1', 'M2']

currency_path = 'currency'
currency = ['dxy']

# single files, read directly from final_dfs/
vix, dix = '^vix', 'DIX'

In [22]:
# # treasury/mortgage rate
# for i in treasury_yields:
#     bonds_df = pd.read_csv(f'final_dfs/{yields_path}/{i}.csv', skiprows=15)
#     bonds_df[' value'] = bonds_df[' value'] / 100
#     bonds_df.rename(columns={' value': f'{i}'.replace('-chart', '')}, inplace=True)
#     bonds_df.set_index('date', inplace=True)
#     main_df = main_df.join(bonds_df, how='left')
    
# libor rate
# Read the file (skipping its first 15 lines) and suffix each tenor column with '_libor_rate'.
libor_df = pd.read_csv(
    f'final_dfs/{yields_path}/{libor_yields[0]}.csv',
    skiprows=15,
).rename(
    columns={tenor: f'{tenor}_libor_rate' for tenor in ('1M', '3M', '6M', '12M')}
)
# NOTE(review): the 'date' values shown in this cell's output look like '01/01/1986',
# while main_df is indexed by 'YYYY-MM-DD' strings, so the join below cannot match rows
# until the format is converted (TODO confirm whether the file is MM/DD or DD/MM).
#libor_df.set_index('date', inplace=True)
#libor_df = libor_df / 100
# main_df = main_df.join(libor_df, how='left')
libor_df.head()

# money supply
# for i in money_supply:
#     money_supply_df = pd.read_csv(f'final_dfs/{money_supply_path}/{i}.csv')
#     money_supply_df.rename(columns={'M1': 'M1_money', 'M2': 'M2_money'}, inplace=True)
#     money_supply_df.set_index('DATE', inplace=True)
#     money_supply_df = money_supply_df * 1000000000
#     main_df = main_df.join(money_supply_df, how='left')

# # currency (check date format)
# currency_df = pd.read_csv(f'final_dfs/{currency_path}/{currency[0]}.csv')
# currency_df.drop([' Open',' High',' Low'], axis=1, inplace=True)
# currency_df.rename(columns={' Close': currency[0]}, inplace=True)
# currency_df.set_index('Date', inplace=True)
# main_df = main_df.join(currency_df, how='left')

# # vix
# vix_df = clean_yahoo_df(f'final_dfs/{vix}.csv')
# vix_df.drop('^vix_volume', axis=1, inplace=True)
# main_df = main_df.join(vix_df, how='left')

# # dix
# dix_df = pd.read_csv(f'final_dfs/{dix}.csv')
# dix_df.drop('price', axis=1, inplace=True)
# dix_df.set_index('date', inplace=True)
# main_df = main_df.join(dix_df, how='left')

Unnamed: 0,date,1M_libor_rate,3M_libor_rate,6M_libor_rate,12M_libor_rate
0,01/01/1986,8.25,8.25,8.25,8.5
1,02/01/1986,7.94,7.88,7.88,7.94
2,03/01/1986,7.5,7.44,7.44,7.44
3,04/01/1986,6.94,6.88,6.88,6.94
4,05/01/1986,7.13,7.13,7.19,7.38


In [19]:
# Last 30 rows, columns from position 35 onward (the displayed slice starts at 'tlt').
main_df.tail(30).iloc[:, 35:]

Unnamed: 0_level_0,tlt,10-year-treasury-bond-rate-yield,30-year-fixed-mortgage-rate,30-year-treasury-bond-rate-yield,fed-funds-rate-historical,1M_libor_rate,3M_libor_rate,6M_libor_rate,12M_libor_rate,M1_money,M2_money,dxy,^vix,dix,gex
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2020-03-20,158.99,0.0092,,0.0155,0.0015,,,,,,,,66.04,0.411404,-641778600.0
2020-03-23,165.55,0.0076,,0.0133,0.0015,,,,,4323700000000.0,16270400000000.0,,61.59,0.366571,-67803560.0
2020-03-24,162.46,0.0084,,0.0139,0.0012,,,,,,,,61.67,0.402208,600220000.0
2020-03-25,162.1,0.0088,,0.0145,0.001,,,,,,,,63.95,0.461185,-157646700.0
2020-03-26,162.89,0.0083,,0.0142,0.001,,,,,,,,61.0,0.480941,1106730000.0
2020-03-27,167.24,0.0072,,0.0129,0.001,,,,,,,,65.54,0.482826,-289385500.0
2020-03-30,165.86,0.007,,0.0131,0.0009,,,,,4525800000000.0,16666000000000.0,,57.08,0.47274,98620250.0
2020-03-31,164.52,0.007,,0.0135,0.0008,,,,,,,,53.54,0.497417,-148210800.0
2020-04-01,166.82,0.0062,0.0323,0.0127,0.0006,,,,,,,,57.06,0.511401,-772850000.0
2020-04-02,167.88,0.0063,,0.0126,0.0005,,,,,,,,50.91,0.485221,-507398400.0


In [None]:
# TODO: convert the libor 'date' column (e.g. '01/01/1986') to the 'YYYY-MM-DD' format used by main_df's index