In [1]:
import pandas as pd
import numpy as np
from datetime import datetime as dt
import json

<h5>Cleaning up Yahoo data</h5>

In [2]:
def clean_yahoo_df(path):
    ticker = path.split('/')[-1].split('.')[0]
    df = pd.read_csv(path)
    
    df.drop(['High','Low','Open','Close'], axis=1, inplace=True)
    df['Adj Close'] = df['Adj Close'].round(2)
    #df['Volume'] = df['Volume'].astype('int32')
    df.rename(columns={'Volume': f'{ticker}_volume', 'Adj Close': ticker}, inplace=True)
    df.set_index('Date', inplace=True)
    
    return df

In [3]:
main_df = clean_yahoo_df('final_dfs/spy.csv')
main_df.head()

Unnamed: 0_level_0,spy_volume,spy
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1993-01-29,1003200.0,26.3
1993-02-01,480500.0,26.49
1993-02-02,201300.0,26.54
1993-02-03,529400.0,26.82
1993-02-04,531500.0,26.94


A few economic indicators are updated weekly, monthly, etc. Some of those days are on the weekend when equity markets are not opened. For that, we will have to outer join those dataframes with the main dataframes, forward fill the NaN data, and remove weekend dates. Below is the list of SPY dates that will be used later.

In [6]:
spy_dates = list(main_df.index)

Below are all tickers pulled from Yahoo and saved locally. I will need to find a way to automate the refresh daily but for now will use the data I have to optimize the model.

Also note, for the individual sector ETFs, I will use Vanguard ETFs instead of SPDR even if SPY is an SPDR ETFs. Even though most of the SPDR ETFs are larger and more liquid I find the Vanguard ETFs more accurately reflects their respective idustries as the portfolios include small and mid caps. SPDR ETFs are heavily concentrated in stocks already included in SPY.

In [7]:
# note: leaving out USO becuase we have /CL

etfs = ['qqq', 'iwm']

sector_etf_path = 'sector etfs - Vanguard'
sector_etf = ['vaw', 'vcr', 'vdc', 'vde', 'vfh', 'vgt', 'vht', 'vis', 'vnq', 'vox', 'vpu']

futures_path = 'futures'
futures = ['cl=f', 'gc=f']

bonds_etf_path = 'bonds etfs'
bonds = ['ief', 'tlt']

In [8]:
for ticker in etfs:
    temp_df = clean_yahoo_df(f'final_dfs/{ticker}.csv')
    main_df = main_df.join(temp_df, how='left')

for ticker in sector_etf:
    temp_df = clean_yahoo_df(f'final_dfs/{sector_etf_path}/{ticker}.csv')
    main_df = main_df.join(temp_df, how='left')

for ticker in futures:
    temp_df = clean_yahoo_df(f'final_dfs/{futures_path}/{ticker}.csv')
    main_df = main_df.join(temp_df, how='left')
    
for ticker in bonds:
    temp_df = clean_yahoo_df(f'final_dfs/{bonds_etf_path}/{ticker}.csv')
    main_df = main_df.join(temp_df, how='left')

In [9]:
main_df.head()

Unnamed: 0_level_0,spy_volume,spy,qqq_volume,qqq,iwm_volume,iwm,vaw_volume,vaw,vcr_volume,vcr,...,vpu_volume,vpu,cl=f_volume,cl=f,gc=f_volume,gc=f,ief_volume,ief,tlt_volume,tlt
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1993-01-29,1003200.0,26.3,,,,,,,,,...,,,,,,,,,,
1993-02-01,480500.0,26.49,,,,,,,,,...,,,,,,,,,,
1993-02-02,201300.0,26.54,,,,,,,,,...,,,,,,,,,,
1993-02-03,529400.0,26.82,,,,,,,,,...,,,,,,,,,,
1993-02-04,531500.0,26.94,,,,,,,,,...,,,,,,,,,,


Other charts (Treasure bond yields, Libor rate, Mortgage rate, M1 & M2 money supply, VIX, DIX)

In [10]:
# maybe include VVIX in the future?
# Notes:
# 30 year fixed mortgage rate is updated monthly (need may numbers)
# money supply data is updated weekly

economics_path = 'economics'
economics = ['CPIAUCSL', 'DFF', 'DGS10', 'DGS30', 'M1', 'M2', 'MORTGAGE15US', 'MORTGAGE30US',
             'T10YIE', 'UNRATE', 'USD1MTD156N', 'USD6MTD156N', 'USD12MD156N']
economic_names = ['CPI', 'fed_funds_rate', '10_yr_treasury', '30_yr_treasury', 'M1_supply',
                  'M2_supply', '15_yr_mortgage', '30_yr_mortgage', '10_yr_inflation',
                  'unemployment', '1m_libor', '3m_libor', '12m_libor']

currency_path = 'currency'
currency = ['dxy']

vix = '^vix'
dix = 'DIX'

In [11]:
# economic data
for code, name in zip(economics, economic_names):
    try:
        economics_df = pd.read_csv(f'final_dfs/{economics_path}/{code}.csv')
        economics_df = economics_df.loc[economics_df['DATE'] >= '1993-01-29']
        economics_df.rename(columns={code: name}, inplace=True)
        economics_df.set_index('DATE', inplace=True)
        if code in ['M1_supply', 'M2_supply']:
            economics_df = economics_df * 1000000000
        elif code == 'GFDEBTN':
            economics_df = economics_df * 1000000
        elif code == 'CPIAUCSL':
            economics_df
        else:
            economics_df = economics_df / 100
    except:
        economics_df.replace('.', np.nan, inplace=True)
        economics_df.ffill(inplace=True)
    main_df = main_df.join(economics_df, how='outer')

# currency (check date format)
currency_df = pd.read_csv(f'final_dfs/{currency_path}/{currency[0]}.csv')
currency_df.drop([' Open',' High',' Low'], axis=1, inplace=True)
currency_df.rename(columns={' Close': currency[0]}, inplace=True)
currency_df['Date'] = currency_df['Date'].apply(lambda x: dt.strptime(x, '%m/%d/%y').strftime('%Y-%m-%d'))
currency_df.set_index('Date', inplace=True)
main_df = main_df.join(currency_df, how='left')

# vix
vix_df = clean_yahoo_df(f'final_dfs/{vix}.csv')
vix_df.drop('^vix_volume', axis=1, inplace=True)
main_df = main_df.join(vix_df, how='left')

# dix
dix_df = pd.read_csv(f'final_dfs/{dix}.csv')
dix_df.drop('price', axis=1, inplace=True)
dix_df.set_index('date', inplace=True)
main_df = main_df.join(dix_df, how='left')

In [18]:
main_df.head()

Unnamed: 0,spy_volume,spy,qqq_volume,qqq,iwm_volume,iwm,vaw_volume,vaw,vcr_volume,vcr,...,30_yr_mortgage,10_yr_inflation,unemployment,1m_libor,3m_libor,12m_libor,dxy,^vix,dix,gex
1993-01-29,1003200.0,26.3,,,,,,,,,...,,,,3.125,3.36719,3.6875,92.48,12.42,,
1993-01-30,,,,,,,,,,,...,,,,,,,,,,
1993-01-31,,,,,,,,,,,...,,,,,,,,,,
1993-02-01,480500.0,26.49,,,,,,,,,...,,,,3.125,3.375,3.6875,93.39,12.33,,
1993-02-02,201300.0,26.54,,,,,,,,,...,,,,3.1875,3.4375,3.75,93.94,12.25,,


In [None]:
# CPI needs April data
# MI & M2 missing last week of April data
# full outer join, then ffill, then remove all dates not in SPY df