In [1]:
#Usual dataframe and plotting libraries
import pandas as pd
from pandas.api.types import CategoricalDtype
import matplotlib.pyplot as plt

#API libraries
from fredapi import Fred
import yfinance as yf

#Maths and date/time libraries
import numpy as np
from datetime import datetime

In [2]:
# Small starter universe: used to check that the pipeline works and the model is accurate enough
# before moving on to the full list below.
# NOTE(review): 'SKY' is grouped with the S&P 500 / total-market trackers in ticker_list2 -
# confirm it is the intended symbol and not a typo for 'SPY'.
ticker_list1 = ["QQQ", "QQQE", "SKY", "VOO", "IVV", "VTI", "ITOT"]

# Full universe: used once the starter list works, to check that several ETF models can be run.
ticker_list2 = (
    ['SKY', 'VOO', 'IVV', 'VTI', 'ITOT']    # Core S&P/Total Market
    + ['QQQ', 'QQQE']                       # Nasdaq/Growth
    + ['IWM', 'IWF', 'IWD', 'MDY']          # Small/Mid/Growth/Value
    + ['XLK', 'XLF', 'XLE', 'XLV']          # Tech/Finance/Energy
    + ['TLT', 'BND', 'HYG']                 # Bonds
    + ['VEU', 'EFA']                        # International (US-listed)
)

# Date window shared by both API extractions.
# NOTE(review): confirm whether each data source treats end_date as inclusive or exclusive.
start_date, end_date = "2020-01-01", "2025-12-31"

# Bar size; only required for the Yahoo Finance extraction.
interval = "1d"

In [None]:
def fred_data_extract(obs_start, obs_end, api_key=None):
    """Download a fixed set of FRED series, align them on one date index and save them as CSV.

    Every series listed in ``series_dict`` is requested for the given observation window.
    The series are combined into one DataFrame, forward-filled so that dates missing from
    a series take that series' previous value, and any row that still contains a missing
    value is dropped. Columns are renamed from FRED IDs to the labels in ``series_dict``
    and the result is written to the FRED raw-files CSV with the index labelled ``Date``.

    Parameters
    ----------
    obs_start, obs_end
        Passed to ``Fred.get_series`` as ``observation_start`` / ``observation_end``.
    api_key : str, optional
        FRED API key. When omitted, the ``FRED_API_KEY`` environment variable is used.

    Raises
    ------
    RuntimeError
        If no API key is supplied and ``FRED_API_KEY`` is not set.
    """
    import os  # local import so this cell does not depend on an edit to the imports cell

    # Credentials must not live in the notebook: take the key from the caller or the environment.
    if api_key is None:
        api_key = os.environ.get("FRED_API_KEY")
    if not api_key:
        raise RuntimeError(
            "No FRED API key available: pass api_key=... or set the FRED_API_KEY environment variable."
        )
    fred = Fred(api_key=api_key)

    # FRED series ID -> column label used in the saved CSV.
    # NOTE(review): the labels were not checked against the FRED catalogue; a few look like they
    # may not describe their series ID (e.g. 'T5YIFR', 'BAA10Y', 'WPSFD49207', 'EMVMACROINTEREST').
    # Confirm before relying on the names; they are kept unchanged here because they are the
    # column names written to disk.
    series_dict = {
        # A mixed bag of macroeconomic indicators, to see whether the model uses them (if at all)
        'GDP': 'GDP',
        'CPIAUCSL': 'CPI_Inflation',
        'APU0000708111': 'Average_price_of_eggs',
        'APU000072610': 'Average_price_of_electricity',
        'USSLIND': 'Leading_Index_for_US',
        'USREC': 'NBER_Recession_Indicator',
        'PPIACO': 'Producer_Price_Index',
        'T5YIFR': '5-Year_Breakeven_Inflation',
        'NROU': 'Natural_Rate__of_Unemployment_(est.)',
        'CIVPART': 'Civilian_Labor_Force_Participation',
        'BAMLC0A4CBBB': 'BBB_Corporate_Spread',
        # Growth & Business Cycle
        'GDPC1': 'Real_Gross_Domestic_Product',
        'PCE': 'Personal_Consumption_Expenditures',
        'INDPRO': 'Industrial_Production_Index',
        'RECPROUSM156N': 'Recession_Probabilities',
        # Inflation & Real Rates
        'CPILFESL': 'Core_CPI_(ex_Food/Energy)',
        'T10YIE': '10-Year_Breakeven_Inflation',
        'DFII10': '10-Year_TIPS_Yield',
        'DGS10': '10-Year_Treasury_Yield',
        # Labour Market
        'UNRATE': 'Unemployment_Rate',
        'PAYEMS': 'Nonfarm_Payrolls',
        'JTSJOL': 'Job_Openings_(JOLTS)',
        # Rates & Financial Conditions
        'FEDFUNDS': 'Effective_Fed_Funds_Rate',
        'DGS2': '2-Year_Treasury_Yield',
        'T10Y2Y': '10Y-2Y_Treasury_Spread',
        'BAMLH0A0HYM2': 'High_Yield_OAS',
        # Tech/Growth Specific
        'EMVMACROINTEREST': 'Equity_Mkt_Vol:_Macro_News_Tracker',
        'WLEMUINDXD': 'Equity_Market_Uncertainty_Index',
        'MICH': 'U_Mich_Inflation_Expectations',
        # Housing & Consumer Credit
        'HOUST': 'Housing_Starts:_Total',
        'PERMIT': 'New_Private_Housing_Units_Authorized',
        'MORTGAGE30US': '30-Year_Fixed_Mortgage_Rate',
        # Money Supply & Credit
        'M2SL': 'M2_Money_Stock',
        'RRPONTSYD': 'ON_RRP_(Reverse_Repo)_Balance',
        'BOGMBASE': 'Monetary_Base',
        'BUSLOANS': 'Commercial_&_Industrial_Loans',
        # Consumer & Business Sentiment
        'UMCSENT': 'U_Mich_Consumer_Sentiment',
        'CFNAI': 'Chicago_Fed_National_Activity_Index',
        # Additional Volatility & Financial Conditions
        'VIXCLS': 'CBOE_Volatility_Index (VIX)',
        'NFCI': 'Chicago_Fed_National_Financial_Conditions',
        'EMVMACROBROAD': 'Equity_Vol_Tracker:_Broad Macro',
        'T10Y3M': '10Y-3M_Treasury_Spread',
        'BAA10Y': 'Moodys_BAA_Corporate_Bond_Yield',
        # High-Frequency & Forward-Looking
        'WPSFD49207': 'Capacity_Utilization:_Manufacturing',
        'DGS1MO': '1-Month_Treasury_Bill',
        'DCOILWTICO': 'WTI_Crude_Oil_Price'
    }

    # One request per series, keyed by FRED ID so the columns can be renamed afterwards.
    data = {
        series_id: fred.get_series(series_id,
                                   observation_start=obs_start,
                                   observation_end=obs_end)
        for series_id in series_dict
    }

    # Forward-fill first so each series carries its last value onto dates it has no
    # observation for; dropna then removes rows where some series has no value yet.
    df = pd.DataFrame(data).ffill().dropna()

    df = df.rename(columns=series_dict)

    print(f"Observation Start/end: {obs_start}/{obs_end} | Shape: {df.shape}\n")
    print(df.head(2))
    print(df.tail(2),"\n")

    # Name the index so the CSV has a 'Date' header that later steps read back.
    df = df.rename_axis("Date")
    df.to_csv(r'..\resources\raw_files\fred\fred_macro_data.csv', index=True)

In [None]:
def yahoo_fin_fetch_list(tickers,start,end,interval):
    """Download price history from Yahoo Finance and write one raw CSV per ticker.

    Parameters
    ----------
    tickers : str or iterable of str
        One symbol or several; a bare string is treated as a one-item list.
    start, end, interval
        Forwarded unchanged to ``yf.Ticker(ticker).history``.

    For every ticker the history index is replaced by a leading ``Date`` column holding
    plain date objects, a short preview is printed, and the frame is saved as
    ``<ticker>_<interval>_data.csv`` in the yfinance raw-files folder.
    """
    if isinstance(tickers, str): # a bare symbol becomes a one-item list
        tickers = [tickers]

    for ticker in tickers:
        df = yf.Ticker(ticker).history(start=start, end=end, interval=interval)

        # Swap the timestamp index for a first-position 'Date' column (YYYY-MM-DD date objects).
        df.insert(0, 'Date', df.index.date)
        df = df.reset_index(drop=True)

        print(f"Ticker: {ticker} | Shape: {df.shape}\n")
        print(df.head(2))
        print(df.tail(2),"\n")

        # Raw *f*-string: backslashes stay literal and {ticker}/{interval} are substituted.
        # A plain raw string here would send every ticker to the same literal
        # "{ticker}_{interval}_data.csv" file, each download overwriting the previous one.
        filename = rf"..\resources\raw_files\yfinance\{ticker}_{interval}_data.csv"
        df.to_csv(filename, index=False)


def _build_ticker_features(df, ticker):
    """Add the ticker label plus price-movement, calendar, lag and cyclical columns to `df`."""
    df.insert(1, 'Ticker', ticker)

    # Intraday move in price units and as a percentage of the open.
    df['Price_Movement'] = df['Close'] - df['Open']
    df['Price_Movement_Pct'] = ((df['Close'] - df['Open'])/df['Open']) * 100

    df['Date'] = pd.to_datetime(df['Date'])
    date_parts = df['Date'].dt
    df['Year'] = date_parts.year
    df['Month'] = date_parts.month
    df['Day'] = date_parts.day
    df['Day_of_Year'] = date_parts.dayofyear
    df['Day_of_Week'] = date_parts.dayofweek
    df['quarter'] = date_parts.quarter

    # Lags are counted in rows (shift), not calendar days.
    # NOTE(review): with one row per trading day, lag 365 is 365 rows back rather than one year.
    for lag in (1, 3, 5, 7, 14, 21, 28, 365):
        df[f'Close_lag{lag}'] = df['Close'].shift(lag)

    # Cyclical encodings so the last month/weekday sits next to the first.
    df['Month_Sin'] = np.sin(2 * np.pi * df['Month'] / 12).round(6)
    df['Month_Cos'] = np.cos(2 * np.pi * df['Month'] / 12).round(6)

    df['Dayofweek_Sin'] = np.sin(2 * np.pi * df['Day_of_Week'] / 7)
    df['Dayofweek_Cos'] = np.cos(2 * np.pi * df['Day_of_Week'] / 7)

    # Rows without enough history for a lag get 0 instead of NaN.
    df = df.fillna(0)

    # Direction label: 'high' for a positive move, 'low' for a negative one, otherwise 'same'.
    movement = df['Price_Movement'].to_numpy()
    labels = np.select([movement > 0, movement < 0], ['high', 'low'], default='same')

    # Ordered categories fix the integer codes: high=0, same=1, low=2.
    movement_type = CategoricalDtype(categories=['high', 'same', 'low'], ordered=True)
    df['Price_Movement_Category'] = pd.Categorical(labels, dtype=movement_type)
    df['Price_Movement_Code'] = df['Price_Movement_Category'].cat.codes

    return df


def add_features(tickers, interval):
    """Read each ticker's raw CSV, add engineered features and save a features CSV per ticker.

    Parameters
    ----------
    tickers : str or iterable of str
        One symbol or several; a bare string is treated as a one-item list.
    interval
        Interval label used in the input and output file names.

    The input is the ``<ticker>_<interval>_data.csv`` file written by
    ``yahoo_fin_fetch_list``; the output is ``<ticker>_<interval>_features.csv`` in the
    featured-files folder. A short preview of each result is printed.
    """
    if isinstance(tickers, str): # a bare symbol becomes a one-item list
        tickers = [tickers]

    for ticker in tickers:
        # Raw f-string so each ticker reads its own file (see yahoo_fin_fetch_list).
        filename_in = rf"..\resources\raw_files\yfinance\{ticker}_{interval}_data.csv"
        df = _build_ticker_features(pd.read_csv(filename_in), ticker)

        print(f"Ticker: {ticker} | Shape: {df.shape}\n")
        print(df.head(2))
        print(df.tail(2),"\n")

        # Raw f-string: same path as before, without relying on unrecognised escape sequences.
        filename_out = rf"..\Resources\Featured_Files\YFINANCE\{ticker}_{interval}_features.csv"
        df.to_csv(filename_out, index=False)

In [6]:
def load_and_stack_tickerdf(tickers, interval):
    """Read the per-ticker feature CSVs and stack them into one long DataFrame.

    Parameters
    ----------
    tickers : str or iterable of str
        One symbol or several; a bare string is treated as a one-item list.
    interval
        Interval label used in the feature file names.

    Returns
    -------
    pandas.DataFrame
        All tickers concatenated row-wise with a fresh 0..n-1 index, ``Date`` as a regular
        column and ``Ticker`` set to the symbol each row was loaded for.
    """
    if isinstance(tickers, str): # a bare symbol becomes a one-item list
        tickers = [tickers]

    frames = {}
    for ticker in tickers:
        # Raw f-string: the backslashes are literal path separators, not escape sequences
        # (the non-raw form relied on "\R", "\F", "\Y" being unrecognised escapes).
        filename_in = rf"..\Resources\Featured_Files\YFINANCE\{ticker}_{interval}_features.csv"
        frames[ticker] = pd.read_csv(filename_in, parse_dates=['Date'], index_col='Date')
        print(f"Loaded {ticker}: {frames[ticker].shape}")

    # Move 'Date' back to a column and label every row with the ticker it was loaded for.
    stacked = [
        frame.reset_index().assign(Ticker=ticker)
        for ticker, frame in frames.items()
    ]
    return pd.concat(stacked, axis=0, ignore_index=True)

In [None]:
def make_stack1(ticker, interval):
    """Stack the feature files for `ticker`, print a summary and save the first stacked CSV.

    `ticker` and `interval` are handed straight to ``load_and_stack_tickerdf``. The printed
    summary is the frame's shape, the row count per ticker, the first rows and the column
    index. Nothing is returned.
    """
    stacked = load_and_stack_tickerdf(ticker, interval)

    summary = (
        f"Shape: {stacked.shape}",
        stacked['Ticker'].value_counts(),
        stacked.head(),
        stacked.columns,
    )
    for item in summary:
        print(item)

    stacked.to_csv(r"..\resources\stacked_etf_files\ticker_data_stacked1.csv", index = False)

In [None]:
def make_stack2(ticker, interval):
    """Stack the feature files for `ticker`, save them, and save a Date x Ticker pivot of Close.

    `ticker` and `interval` are handed straight to ``load_and_stack_tickerdf``. Two files
    are written: the second stacked CSV, and a pivot with one row per ``Date``, one column
    per ``Ticker`` and the ``Close`` price as the value. Nothing is returned.

    ``DataFrame.pivot`` raises ``ValueError`` if the stacked data contains the same
    (Date, Ticker) pair more than once.
    """
    df_stacked2 = load_and_stack_tickerdf(ticker, interval)

    print(f"Shape: {df_stacked2.shape}")
    print(df_stacked2['Ticker'].value_counts())
    print(df_stacked2.head())
    print(df_stacked2.columns)

    df_stacked2.to_csv(r"..\resources\stacked_etf_files\ticker_data_stacked2.csv",
                       index = False)

    close_pivot = df_stacked2.pivot(index="Date",
                                    columns="Ticker",
                                    values="Close")

    # A bare `head(), tail()` expression displays nothing inside a function, so print the
    # preview explicitly like the other pipeline steps do.
    print(close_pivot.head())
    print(close_pivot.tail())

    close_pivot.to_csv(r"..\resources\pivoted_files\close_pivot.csv",
                       index = True)

In [None]:
def load_and_merge_etf_econ_data():
    """Join each stacked ETF file to the FRED macro data on ``Date`` and save the results.

    Reads the two stacked ETF CSVs and the FRED macro CSV, parses their ``Date`` columns,
    inner-joins each ETF frame to the macro frame on ``Date`` (rows whose date is missing
    from either side are dropped), replaces any remaining missing values with 0 and writes
    ``merged1.csv`` and ``merged2.csv``. Takes no arguments and returns nothing.
    """
    def _read_with_dates(path):
        """Load a CSV and convert its 'Date' column to datetimes."""
        frame = pd.read_csv(path)
        # NOTE(review): dayfirst=True only matters for ambiguous day/month strings; confirm it
        # is intended if these files are ever re-saved in a non-ISO date format.
        frame["Date"] = pd.to_datetime(frame["Date"],
                                       format='mixed',
                                       dayfirst=True)
        return frame

    # Read all inputs before writing anything, so a missing input leaves no partial output.
    etf_stack1 = _read_with_dates(r'..\resources\stacked_etf_files\ticker_data_stacked1.csv')
    etf_stack2 = _read_with_dates(r'..\resources\stacked_etf_files\ticker_data_stacked2.csv')
    macro = _read_with_dates(r'..\resources\raw_files\fred\fred_macro_data.csv')

    # Raw strings: the backslashes are path separators, not escape sequences
    # (the non-raw form relied on "\R", "\M" and "\m" being unrecognised escapes).
    outputs = (
        (etf_stack1, r"..\Resources\Merged_Files\merged1.csv"),
        (etf_stack2, r"..\Resources\Merged_Files\merged2.csv"),
    )
    merged_frames = [
        (etf.merge(macro, on="Date", how="inner").fillna(0), out_path)
        for etf, out_path in outputs
    ]

    for merged, out_path in merged_frames:
        merged.to_csv(out_path, index=False)

In [10]:
def etf_pipeline(ticker1, ticker2, start, end, interval):
    """Run every pipeline stage in order: extract, build features, stack and merge.

    Only `ticker2` is downloaded and given features; `ticker1` is used solely for the first
    stacked file, so its feature files must already exist (for example because every
    `ticker1` symbol is also in `ticker2`). `start`/`end` bound both API extractions and
    `interval` is used by the Yahoo Finance step and in the file names. Returns nothing.
    """
    # (stage, positional arguments) pairs, listed in the order they must run.
    stages = (
        (fred_data_extract, (start, end)),
        (yahoo_fin_fetch_list, (ticker2, start, end, interval)),
        (add_features, (ticker2, interval)),
        (make_stack1, (ticker1, interval)),
        (make_stack2, (ticker2, interval)),
        (load_and_merge_etf_econ_data, ()),
    )
    for stage, stage_args in stages:
        stage(*stage_args)

In [11]:
# Run the whole pipeline in order (extract, add features, stack, merge); each stage saves its
# files to the relevant resources folder for later analysis.
# etf_pipeline downloads and builds features for the second list only, so every symbol in
# ticker_list1 must also be in ticker_list2 (or already have a features file on disk).
etf_pipeline(ticker_list1, ticker_list2, start_date, end_date, interval)

Observation Start/end: 2020-01-01/2025-12-31 | Shape: (2190, 46)

                  GDP  CPI_Inflation  Average_price_of_eggs  \
2020-01-03  21751.238        259.127                  1.461   
2020-01-04  21751.238        259.127                  1.461   

            Average_price_of_electricity  Leading_Index_for_US  \
2020-01-03                         0.134                  1.57   
2020-01-04                         0.134                  1.57   

            NBER_Recession_Indicator  Producer_Price_Index  \
2020-01-03                       0.0                 199.3   
2020-01-04                       0.0                 199.3   

            5-Year_Breakeven_Inflation  Natural_Rate__of_Unemployment_(est.)  \
2020-01-03                        1.85                              4.403836   
2020-01-04                        1.85                              4.403836   

            Civilian_Labor_Force_Participation  ...  \
2020-01-03                                63.3  ...   
2020-01