# Notebook 0(a): Builds a quick sample file (stacked ETF prices merged with FRED macro data) to use for EDA

In [26]:
from pathlib import Path

import pandas as pd

In [27]:
# Small smoke-test universe: run the pipeline on these few tickers first to
# confirm everything works and the model is accurate enough before moving on
# to the full list below.
# NOTE(review): "SKY" looks like a typo for "SPY" (it sits with the core
# S&P funds and is the only ticker whose raw file loads with 7 columns
# instead of 8) - confirm before relying on it.
ticker_list1 = ["QQQ", "QQQE", "SKY", "VOO", "IVV", "VTI", "ITOT"]

# Full universe: use once the smoke test passes, to check that several ETF
# models can be run successfully.
ticker_list2 = [
    # Core S&P / total market
    "SKY", "VOO", "IVV", "VTI", "ITOT",
    # Nasdaq / growth
    "QQQ", "QQQE",
    # Small / mid / growth / value
    "IWM", "IWF", "IWD", "MDY",
    # Sector funds: tech / financials / energy / health care
    "XLK", "XLF", "XLE", "XLV",
    # Bonds
    "TLT", "BND", "HYG",
    # International (US-listed)
    "VEU", "EFA",
]

# Date window needed by both API data extractions
start_date, end_date = "2020-01-01", "2025-12-31"

# Bar interval; only required for the Yahoo Finance data extraction
interval = "1d"

In [28]:
def load_and_stack_raw_tickerdf(tickers, interval, raw_dir=None):
    """Load the raw per-ticker CSV files and stack them into one long frame.

    Each ticker is read from ``<raw_dir>/<ticker>_<interval>_data.csv``, which
    must contain a ``Date`` column. A ``Ticker`` column holding the symbol is
    added to every frame before the frames are concatenated row-wise.

    Parameters
    ----------
    tickers : str or iterable of str
        A single symbol or a collection of symbols. Repeated symbols are
        loaded and stacked only once.
    interval : str
        Interval tag embedded in the file name (e.g. ``"1d"``).
    raw_dir : str or path-like, optional
        Directory holding the CSV files. Defaults to
        ``../resources/raw_files/yfinance`` relative to the working directory.

    Returns
    -------
    pandas.DataFrame
        All tickers stacked with a fresh 0..n-1 index, ``Date`` as the first
        column and ``Ticker`` identifying the source file of every row.
        Columns missing from some files are left as NaN for those rows.

    Raises
    ------
    ValueError
        If ``tickers`` is empty.
    """
    if isinstance(tickers, str):  # allow a single symbol to be passed bare
        tickers = [tickers]

    # dict.fromkeys drops repeated symbols while keeping first-seen order.
    unique_tickers = list(dict.fromkeys(tickers))
    if not unique_tickers:
        raise ValueError("tickers must contain at least one symbol")

    # pathlib avoids the invalid "\y" / "\{" escapes a backslash-separated
    # string literal needs, and resolves on every operating system.
    if raw_dir is None:
        raw_dir = Path("..") / "resources" / "raw_files" / "yfinance"
    raw_dir = Path(raw_dir)

    frames = []
    for ticker in unique_tickers:
        filename_in = raw_dir / f"{ticker}_{interval}_data.csv"

        ticker_df = pd.read_csv(filename_in,
                                parse_dates=['Date'],
                                index_col='Date')

        # Shape is reported with Date as the index, i.e. excluding Date/Ticker.
        print(f"Loaded {ticker}: {ticker_df.shape}")

        # Bring Date back as a regular column and tag the rows with the symbol.
        frames.append(ticker_df.reset_index().assign(Ticker=ticker))

    return pd.concat(frames, axis=0, ignore_index=True)

In [29]:
def stack_and_merge_raw_etf_econ_data(ticker, interval):
    """Stack the raw ticker files, merge them with the macro data and save the result.

    The stacked ticker frame (from ``load_and_stack_raw_tickerdf``) is
    inner-joined on ``Date`` with ``../resources/raw_files/fred/fred_macro_data.csv``.
    The merged frame is written to ``../resources/raw_merged/raw_merged.csv``
    with ``Date`` and ``Ticker`` as the first two columns. The shape of each
    intermediate frame is printed along the way.

    Parameters
    ----------
    ticker : str or iterable of str
        Symbol(s) forwarded to ``load_and_stack_raw_tickerdf``.
    interval : str
        Interval tag forwarded to ``load_and_stack_raw_tickerdf``.

    Returns
    -------
    None
        The result is only written to disk.
    """
    raw_stacked_df = load_and_stack_raw_tickerdf(ticker, interval)
    print(f"\nFinal stacked raw dataframe shape: {raw_stacked_df.shape}")

    # pathlib keeps the locations free of backslash escapes and OS-independent.
    resources_dir = Path("..") / "resources"
    macro_path = resources_dir / "raw_files" / "fred" / "fred_macro_data.csv"
    output_path = resources_dir / "raw_merged" / "raw_merged.csv"

    macro_df = pd.read_csv(macro_path)

    print('\nMacro file shape:',
          macro_df.shape)

    # Normalise both join keys to datetimes so the merge compares like with like.
    # NOTE(review): dayfirst=True only matters for ambiguous day/month strings;
    # confirm it matches the date format actually stored in the files.
    raw_stacked_df["Date"] = pd.to_datetime(raw_stacked_df["Date"],
                                            format='mixed',
                                            dayfirst=True)

    macro_df["Date"] = pd.to_datetime(macro_df["Date"],
                                      format='mixed',
                                      dayfirst=True)

    # Inner join: dates absent from either source are dropped.
    # NOTE(review): fillna(0) makes missing values indistinguishable from
    # genuine zeros (including columns absent for some tickers) - confirm
    # this is intended for the downstream EDA.
    merged1 = raw_stacked_df.merge(macro_df,
                                   on="Date",
                                   how="inner").fillna(0)

    # Move the Ticker column to the second position, right after Date.
    col = merged1.pop("Ticker")
    merged1.insert(1,
                   "Ticker",
                   col)

    print('\nMerged file shape:',
          merged1.shape)

    # Create the output folder on first run instead of failing on a missing directory.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    merged1.to_csv(output_path,
                   index=False)

In [30]:
# Build and save the merged sample file for the full ticker universe.
stack_and_merge_raw_etf_econ_data(ticker=ticker_list2, interval=interval)

Loaded SKY: (1507, 7)
Loaded VOO: (1507, 8)
Loaded IVV: (1507, 8)
Loaded VTI: (1507, 8)
Loaded ITOT: (1507, 8)
Loaded QQQ: (1507, 8)
Loaded QQQE: (1507, 8)
Loaded IWM: (1507, 8)
Loaded IWF: (1507, 8)
Loaded IWD: (1507, 8)
Loaded MDY: (1507, 8)
Loaded XLK: (1507, 8)
Loaded XLF: (1507, 8)
Loaded XLE: (1507, 8)
Loaded XLV: (1507, 8)
Loaded TLT: (1507, 8)
Loaded BND: (1507, 8)
Loaded HYG: (1507, 8)
Loaded VEU: (1507, 8)
Loaded EFA: (1507, 8)

Final stacked raw dataframe shape: (30140, 10)

Macro file shape: (2190, 47)

Merged file shape: (30120, 56)
