In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import json

<h5>Cleaning up Yahoo data</h5>

In [49]:
def clean_yahoo_df(path):
    ticker = path.split('/')[-1].split('.')[0]
    df = pd.read_csv(path)
    
    df.drop(['High','Low','Open','Close'], axis=1, inplace=True)
    df['Adj Close'] = df['Adj Close'].round(2)
    #df['Volume'] = df['Volume'].astype('int32')
    df.rename(columns={'Volume': f'{ticker}_volume', 'Adj Close': ticker}, inplace=True)
    df.set_index('Date', inplace=True)
    
    return df

In [50]:
main_df = clean_yahoo_df('final_dfs/spy.csv')
main_df.head()

Unnamed: 0_level_0,spy_volume,spy
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1993-01-29,1003200.0,26.3
1993-02-01,480500.0,26.49
1993-02-02,201300.0,26.54
1993-02-03,529400.0,26.82
1993-02-04,531500.0,26.94


Below are all tickers pulled from Yahoo and saved locally. I will need to find a way to automate the refresh daily but for now will use the data I have to optimize the model.

Also note, for the individual sector ETFs, I will use Vanguard ETFs instead of SPDR even if SPY is an SPDR ETFs. Even though most of the SPDR ETFs are larger and more liquid I find the Vanguard ETFs more accurately reflects their respective idustries as the portfolios include small and mid caps. SPDR ETFs are heavily concentrated in stocks already included in SPY.

In [4]:
# note: leaving out USO becuase we have /CL

etfs = ['qqq', 'iwm']

sector_etf_path = 'sector etfs - Vanguard'
sector_etf = ['vaw', 'vcr', 'vdc', 'vde', 'vfh', 'vgt', 'vht', 'vis', 'vnq', 'vox', 'vpu']

futures_path = 'futures'
futures = ['cl=f', 'gc=f']

bonds_etf_path = 'bonds etfs'
bonds = ['ief', 'tlt']

In [5]:
for ticker in etfs:
    temp_df = clean_yahoo_df(f'final_dfs/{ticker}.csv')
    main_df = main_df.join(temp_df, how='left')

for ticker in sector_etf:
    temp_df = clean_yahoo_df(f'final_dfs/{sector_etf_path}/{ticker}.csv')
    main_df = main_df.join(temp_df, how='left')

for ticker in futures:
    temp_df = clean_yahoo_df(f'final_dfs/{futures_path}/{ticker}.csv')
    main_df = main_df.join(temp_df, how='left')
    
for ticker in bonds:
    temp_df = clean_yahoo_df(f'final_dfs/{bonds_etf_path}/{ticker}.csv')
    main_df = main_df.join(temp_df, how='left')

In [6]:
main_df.tail()

Unnamed: 0_level_0,spy_volume,spy,qqq_volume,qqq,iwm_volume,iwm,vaw_volume,vaw,vcr_volume,vcr,...,vpu_volume,vpu,cl=f_volume,cl=f,gc=f_volume,gc=f,ief_volume,ief,tlt_volume,tlt
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-04-27,77896600.0,287.05,36451700.0,215.56,39021300.0,127.26,83200.0,109.09,200600.0,173.22,...,185500.0,129.43,282967221.0,12.34,80903668.0,1720.3,2508100.0,121.36,9039500.0,167.44
2020-04-28,105270000.0,285.73,46862800.0,211.5,47292700.0,129.11,56100.0,111.16,75600.0,173.41,...,165500.0,130.01,411961998.0,13.4,96211082.0,1721.0,2847600.0,121.89,9905400.0,169.37
2020-04-29,118745600.0,293.21,48716400.0,219.0,50390500.0,135.46,106800.0,114.6,73400.0,178.42,...,244700.0,129.28,510049.0,12.34,156725.0,1722.2,2600100.0,121.83,8619600.0,168.49
2020-04-30,122901700.0,290.48,42955500.0,218.91,43727500.0,130.31,87100.0,110.97,73500.0,177.22,...,188200.0,125.87,501420.0,15.06,110088974.0,1695.4,5371500.0,121.54,11606100.0,166.52
2020-05-01,125063900.0,282.79,48748200.0,212.74,42651700.0,125.14,46100.0,108.4,88900.0,169.61,...,197300.0,122.93,177199414.0,19.69,86315473.0,1710.2,8357700.0,121.71,10208600.0,167.95


Other charts (Treasure bond yields, Libor rate, Mortgage rate, M1 & M2 money supply, VIX, DIX)

In [18]:
# maybe include VVIX in the future?

yields_path = 'yields'
treasury_yields = ['10-year-treasury-bond-rate-yield-chart', '30-year-fixed-mortgage-rate-chart',
                   '30-year-treasury-bond-rate-yield-chart', 'fed-funds-rate-historical-chart']
libor_yields = ['historical-libor-rates-chart']

money_supply_path = 'money supply'
money_supply = ['M1', 'M2']

currency_path = 'currency'
currency = ['dxy']

vix = '^vix'
dix = 'DIX'

In [37]:
# treasury/mortgage rate
for i in treasury_yields:
    bonds_df = pd.read_csv(f'final_dfs/{yields_path}/{i}.csv', skiprows=15)
    bonds_df[' value'] = bonds_df[' value'] / 100
    bonds_df.rename(columns={' value': f'{i}'.replace('-chart', '')}, inplace=True)
    bonds_df.set_index('date', inplace=True)
    main_df = main_df.join(bonds_df, how='left')
    
# # libor rate
# libor_df = pd.read_csv(f'final_dfs/{yields_path}/{libor_yields[0]}.csv', skiprows=15)
# libor_df.rename(columns={'1M': '1M_libor_rate',
#                          '3M': '3M_libor_rate',
#                          '6M': '6M_libor_rate',
#                          '12M': '12M_libor_rate'},
#                inplace=True)
# libor_df.set_index('date', inplace=True)
# libor_df = libor_df / 100
# main_df = main_df.join(libor_df, how='left')

# # money supply
# for i in money_supply:
#     money_supply_df = pd.read_csv(f'final_dfs/{money_supply_path}/{i}.csv')
#     money_supply_df.rename(columns={'M1': 'M1_money', 'M2': 'M2_money'}, inplace=True)
#     money_supply_df.set_index('DATE', inplace=True)
#     money_supply_df = money_supply_df * 1000000000
#     main_df = main_df.join(money_supply_df, how='left')

# # currency (check date format)
# currency_df = pd.read_csv(f'final_dfs/{currency_path}/{currency[0]}.csv')
# currency_df.drop([' Open',' High',' Low'], axis=1, inplace=True)
# currency_df.rename(columns={' Close': currency[0]}, inplace=True)
# currency_df.set_index('Date', inplace=True)
# main_df = main_df.join(currency_df, how='left')

# # vix
# vix_df = clean_yahoo_df(f'final_dfs/{vix}.csv')
# vix_df.drop('^vix_volume', axis=1, inplace=True)
# main_df = main_df.join(vix_df, how='left')

# # dix
# dix_df = pd.read_csv(f'final_dfs/{dix}.csv')
# dix_df.drop('price', axis=1, inplace=True)
# dix_df.set_index('date', inplace=True)
# main_df = main_df.join(dix_df, how='left')

ValueError: columns overlap but no suffix specified: Index(['10-year-treasury-bond-rate-yield'], dtype='object')

In [53]:
# treasury/mortgage rate
for i in treasury_yields:
    bonds_df = pd.read_csv(f'final_dfs/{yields_path}/{i}.csv', skiprows=15)
    bonds_df[' value'] = bonds_df[' value'] / 100
    bonds_df.rename(columns={' value': f'{i}'.replace('-chart', '')}, inplace=True)
    #bonds_df.set_index('date', inplace=True)
    #main_df = main_df.join(bonds_df, how='left')
    print(bonds_df.tail(100))
    break

             date  10-year-treasury-bond-rate-yield
14507  2020-01-09                            0.0185
14508  2020-01-10                            0.0183
14509  2020-01-13                            0.0185
14510  2020-01-14                            0.0182
14511  2020-01-15                            0.0179
...           ...                               ...
14602  2020-05-18                               NaN
14603  2020-05-19                               NaN
14604  2020-05-20                               NaN
14605  2020-05-21                               NaN
14606  2020-05-22                               NaN

[100 rows x 2 columns]
