# Economic Data Exploratory Data Analysis

## Setup

In [None]:
# imports
import os
import time
import pandas as pd
import numpy as np
import requests
import yfinance as yf

In [None]:
# Root data directories (relative to the notebook's working directory)
INPUT_FOLDER = "../Data/Input"
OUTPUT_FOLDER = "../Data/Output"

# Economic-data subdirectories
OUTPUT_ECON_FOLDER = os.path.join(OUTPUT_FOLDER, "Economic")
INPUT_ECON_FOLDER = os.path.join(INPUT_FOLDER, "Economic")

# Merged dataset written by the merging section
econ_output_path = os.path.join(OUTPUT_ECON_FOLDER, "economic_data.csv")

# Raw per-source downloads written by the fetch sections
yahoo_input_path = os.path.join(INPUT_ECON_FOLDER, "yahoo_economic_data.csv")
bls_input_path = os.path.join(INPUT_ECON_FOLDER, "bls_economic_data.csv")
fred_input_path = os.path.join(INPUT_ECON_FOLDER, "fred_economic_data.csv")

# Sanity check: list what the output folder currently holds
output_contents = os.listdir(OUTPUT_FOLDER)
print(output_contents)

In [None]:
# API credentials, read from the process environment (None when a variable is unset)
FRED_API_KEY = os.environ.get("FRED_API_KEY")
BLS_API_KEY = os.environ.get("BLS_API_KEY")

## Sources

### FRED Data (API)
https://fred.stlouisfed.org/docs/api/fred/

In [None]:
# Retrieve FRED API key from environment variables
FRED_API_KEY = os.getenv("FRED_API_KEY")

# Fail early: every FRED request below needs the key
if not FRED_API_KEY:
    raise ValueError("API key not found. Please set FRED_API_KEY in your .env file.")

# Confirm success without echoing the secret: printed cell output is stored in
# the notebook file and would leak the key if the notebook is shared or committed.
print("FRED API key loaded.")

# Endpoint that returns the observations of a single FRED series
FRED_BASE_URL = "https://api.stlouisfed.org/fred/series/observations"

# Maps the column name used in the combined FRED table to the FRED series ID
# requested for it. Insertion order determines fetch order and column order.
FRED_SERIES = {
    "gdp_growth": "A191RL1Q225SBEA",          # Real GDP growth rate (Quarterly)
    "interest_rate_fed_funds": "FEDFUNDS",    # Federal Funds Rate (Monthly)
    "interest_rate_10y_treasury": "GS10",     # 10-Year Treasury Yield (NOTE(review): GS10 looks like FRED's monthly series, DGS10 the daily one - confirm)
    "money_supply_m2": "M2SL",                # M2 Money Supply (Monthly)
    "yield_curve_10y_2y": "T10Y2Y",           # 10Y-2Y Yield Curve Spread (Daily)
    "unemployment_rate": "UNRATE",            # Unemployment Rate (Monthly)
    "cpi_inflation": "CPIAUCSL",              # Consumer Price Index (Monthly) (NOTE(review): CPIAUCSL appears to be an index level, not an inflation rate - confirm)
    "consumer_sentiment": "UMCSENT",          # Consumer Sentiment Index (Monthly)
    "leading_economic_index": "USSLIND",      # Leading Economic Index (Monthly)
    "corporate_bond_yield": "BAA",            # Corporate Bond Yield (Monthly)
    "vix_index": "VIXCLS"                     # VIX Index (Daily)
}

In [None]:
# Function to fetch data from FRED API
def fetch_fred_series(series_id, start_date="2004-01-01", end_date="2025-01-01"):
    """Fetch a single time series from the FRED API.

    Args:
        series_id: FRED series identifier, e.g. ``"UNRATE"``.
        start_date: First observation date to request (``YYYY-MM-DD``).
        end_date: Last observation date to request (``YYYY-MM-DD``).

    Returns:
        A ``pandas.Series`` named ``"value"`` indexed by a datetime index
        named ``"date"``, in ascending date order. Observations that are not
        numeric are converted to ``NaN``. The series is empty when the
        response contains no observations.

    Raises:
        requests.HTTPError: If FRED answers with an HTTP error status.
        requests.RequestException: On timeouts or connection failures.
    """
    params = {
        "series_id": series_id,
        "api_key": FRED_API_KEY,
        "file_type": "json",   # Use JSON for easier parsing
        "sort_order": "asc",   # Sort data in ascending order by date
        # The observations endpoint names its date filters observation_start /
        # observation_end. The former "start_date" / "end_data" keys are not
        # FRED parameters, so the intended date window was never applied.
        "observation_start": start_date,
        "observation_end": end_date,
    }

    response = requests.get(FRED_BASE_URL, params=params, timeout=10)
    response.raise_for_status()  # Raise an exception for HTTP errors

    observations = response.json().get("observations", [])
    if not observations:
        # Keep the return shape stable so callers can still build a DataFrame
        return pd.Series(
            dtype="float64",
            name="value",
            index=pd.DatetimeIndex([], name="date"),
        )

    df = pd.DataFrame(observations)
    df["date"] = pd.to_datetime(df["date"])
    # Values arrive as strings; anything non-numeric becomes NaN
    df["value"] = pd.to_numeric(df["value"], errors="coerce")
    return df.set_index("date")["value"]


In [None]:
def get_all_fred_data():
    """Download every series listed in FRED_SERIES and return them as one table.

    Returns:
        A DataFrame with one column per FRED_SERIES key, built from the
        series returned by fetch_fred_series.
    """
    columns = {}
    for label, fred_id in FRED_SERIES.items():
        # Progress output: one line per requested series
        print(f"Fetching {label}, {fred_id}...")
        columns[label] = fetch_fred_series(fred_id)

    return pd.DataFrame(columns)

In [None]:
def save_fred_data(df_fred):
    """Write the combined FRED table to ``fred_input_path`` and print a preview.

    Args:
        df_fred: DataFrame of FRED series indexed by observation date.
    """
    # to_csv does not create missing folders; make sure the target exists so a
    # finished download is not lost to a missing directory.
    target_folder = os.path.dirname(fred_input_path)
    if target_folder:
        os.makedirs(target_folder, exist_ok=True)

    # The merging section reads the dates back from a column called "date"
    df_fred.to_csv(fred_input_path, index=True, index_label="date")
    print(f"FRED data saved to {fred_input_path}\n")
    print(df_fred.head())

In [None]:
# Download every configured FRED series, then write the combined table to fred_input_path
df_fred = get_all_fred_data()
save_fred_data(df_fred)

### BLS Data (API)
https://www.bls.gov/developers/home.htm

In [None]:
# BLS time-series endpoint (the URL targets version 2 of the public API);
# fetch_bls_series POSTs its JSON payload here
BLS_API_URL = "https://api.bls.gov/publicAPI/v2/timeseries/data/"

# Maps the column name used in the combined BLS table to the BLS series ID
# requested for it
BLS_SERIES = {
    "avg_hourly_earnings": "CES0500000003",  # Average hourly earnings (Operational cost pressure)
    "ppi_final_demand": "WPSFD49207",  # Producer Price Index (Final Demand)
    "job_openings": "JTS000000000000000JOL",  # Job Openings (Labor market demand and hiring competition)
    "labor_force_participation_rate": "LNS11300000",  # Labor Force Participation Rate (Workforce availability and long-term growth potential)
}

In [None]:
# Number of years requested per call; matches the window the pagination loop
# has always used (20 years per request).
_BLS_YEARS_PER_REQUEST = 20


def _parse_bls_observations(data_points):
    """Convert raw BLS observation dicts into a float Series of monthly values."""
    monthly_values = {}
    for item in data_points:
        period = item["period"]
        # Monthly periods are M01..M12. M13 also starts with "M" but denotes the
        # annual average and has no calendar month, so skip it explicitly.
        if not period.startswith("M") or period == "M13":
            continue

        timestamp = pd.Timestamp(year=int(item["year"]), month=int(period[1:]), day=1)
        try:
            monthly_values[timestamp] = float(item["value"])
        except ValueError:
            # Placeholder for an unavailable observation (e.g. "-")
            monthly_values[timestamp] = float("nan")

    return pd.Series(monthly_values, dtype="float64")


def fetch_bls_series(series_id, start_year=2004, end_year=2025):
    """Fetch one monthly BLS series, requesting the year range in chunks.

    Args:
        series_id: BLS series identifier, e.g. ``"LNS11300000"``.
        start_year: First year to request (inclusive).
        end_year: Last year to request (inclusive).

    Returns:
        A float ``pandas.Series`` sorted by date, indexed by the first day of
        each month, with the index named ``"date"``. Empty when nothing was
        returned or when ``start_year`` is after ``end_year``.

    Raises:
        RuntimeError: If the API answers with a non-200 HTTP status.
        requests.RequestException: On timeouts or connection failures.
    """
    print(f"Fetching series: {series_id}")

    chunks = []
    for year in range(start_year, end_year + 1, _BLS_YEARS_PER_REQUEST):
        chunk_end = min(year + _BLS_YEARS_PER_REQUEST - 1, end_year)
        payload = {
            "seriesid": [series_id],
            "startyear": str(year),
            "endyear": str(chunk_end),
            "registrationkey": BLS_API_KEY,
        }

        # Bounded wait, consistent with the FRED requests in this notebook
        response = requests.post(BLS_API_URL, json=payload, timeout=10)
        if response.status_code != 200:
            raise RuntimeError(f"API request failed with status code {response.status_code}")

        json_data = response.json()

        # The response body can carry its own status; report a failed request
        # instead of silently returning an empty series.
        status = json_data.get("status")
        if status and status != "REQUEST_SUCCEEDED":
            print(
                f"BLS request for {series_id} ({year}-{chunk_end}) returned "
                f"{status}: {json_data.get('message')}"
            )

        results = json_data.get("Results")
        series_list = results.get("series") if isinstance(results, dict) else None
        if not series_list:
            continue

        chunk = _parse_bls_observations(series_list[0].get("data") or [])
        if not chunk.empty:
            chunks.append(chunk)

    if not chunks:
        return pd.Series(dtype="float64", index=pd.DatetimeIndex([], name="date"))

    # Name the index so a CSV written from this data gets a "date" column
    return pd.concat(chunks).sort_index().rename_axis("date")


In [None]:
def fetch_bls_data():
    """Download every series listed in BLS_SERIES and return them as one table.

    Returns:
        A DataFrame with one column per BLS_SERIES key, built from the series
        returned by fetch_bls_series.
    """
    return pd.DataFrame(
        {label: fetch_bls_series(bls_id) for label, bls_id in BLS_SERIES.items()}
    )

In [None]:
def save_bls_data(df_bls):
    """Write the combined BLS table to ``bls_input_path`` and print a preview.

    Args:
        df_bls: DataFrame of BLS series indexed by observation date.
    """
    # to_csv does not create missing folders
    target_folder = os.path.dirname(bls_input_path)
    if target_folder:
        os.makedirs(target_folder, exist_ok=True)

    # Label the index column explicitly: the merging section reads the dates
    # from a column called "date", and an unnamed index would be read back as
    # "Unnamed: 0" instead.
    df_bls.to_csv(bls_input_path, index_label="date")
    print(f"BLS data saved to {bls_input_path}")
    print()
    print(df_bls.head())

In [None]:
# Download every configured BLS series, then write the combined table to bls_input_path
df_bls = fetch_bls_data()
save_bls_data(df_bls)

### Yahoo Finance Data (yfinance)
https://github.com/ranaroussi/yfinance

In [None]:
# Yahoo Finance tickers to download; one request is made per symbol, in this order.
# Mostly ETFs; "^GSPC" is the S&P 500 index ticker rather than a fund.
ETF_SYMBOLS = [
    "SPY",
    "^GSPC",
    "XLK",
    "VHT",
    "PBW",
    "XLY",
    "IYT",
    "VOX",
    "IYZ",
    "VNQ",
    "FINX",
]

In [None]:
def fetch_with_retries(symbol, max_retries=3):
    """Download price history for one symbol, retrying on failure.

    Args:
        symbol: Yahoo Finance ticker, e.g. ``"SPY"``.
        max_retries: Maximum number of download attempts.

    Returns:
        The non-empty DataFrame returned by ``yf.download``, or ``None`` when
        every attempt raised or produced no rows.
    """
    for attempt in range(max_retries):
        try:
            print(f"Fetching {symbol} (Attempt {attempt + 1})...")
            data = yf.download(
                symbol,
                start="2004-01-01",
                end="2025-12-31",
                auto_adjust=False,
                progress=False,
            )
        except Exception as e:  # yfinance surfaces several unrelated error types
            print(f"Error fetching {symbol}: {e}")
        else:
            # A failed download can come back as an empty frame instead of an
            # exception; treat that as a failed attempt so it is retried too.
            if data is not None and not data.empty:
                return data
            print(f"No data returned for {symbol}.")

        # Exponential backoff, but only when another attempt will follow
        if attempt < max_retries - 1:
            time.sleep(2 ** attempt)

    print(f"Skipping {symbol} after {max_retries} failed attempts.")
    return None

In [None]:
def process_etf_data(etf_symbols):
    """Download each symbol and derive return, volatility, momentum and trend metrics.

    Args:
        etf_symbols: Iterable of Yahoo Finance tickers.

    Returns:
        A long-format DataFrame with one row per symbol and date. Columns are
        ``ETF``, ``Date``, ``Daily Return``, ``Volatility`` (30-row rolling
        standard deviation of the daily return), ``Momentum`` (14-row rolling
        mean of the 14-row percentage change), ``50-Day MA``, ``200-Day MA``
        and one ``Adj Close <symbol>`` column per downloaded symbol. Symbols
        whose download failed are skipped. When nothing could be downloaded,
        an empty DataFrame with the shared columns is returned.
    """
    shared_columns = [
        "ETF", "Date", "Daily Return", "Volatility", "Momentum", "50-Day MA", "200-Day MA",
    ]
    etf_frames = []

    for symbol in etf_symbols:
        prices = fetch_with_retries(symbol)
        if prices is None or prices.empty:
            continue

        # yf.download may return (field, ticker) MultiIndex columns even for a
        # single symbol; keep only the field level so "Adj Close" is a Series.
        if isinstance(prices.columns, pd.MultiIndex):
            prices = prices.copy()
            prices.columns = prices.columns.get_level_values(0)

        adj_close = prices["Adj Close"]
        daily_return = adj_close.pct_change(fill_method=None)

        # Build a fresh frame rather than adding columns to the downloaded one
        metrics = pd.DataFrame(
            {
                "Daily Return": daily_return,
                "Volatility": daily_return.rolling(30).std(),
                "Momentum": adj_close.pct_change(14, fill_method=None).rolling(14).mean(),
                "50-Day MA": adj_close.rolling(50).mean(),
                "200-Day MA": adj_close.rolling(200).mean(),
                f"Adj Close {symbol}": adj_close,
            }
        )

        # Turn the date index into a regular "Date" column
        metrics = metrics.rename_axis("Date").reset_index()

        # Insert ETF column at position 0 to label each row with its symbol
        metrics.insert(0, "ETF", symbol)
        etf_frames.append(metrics)

        # Pause between symbols to stay gentle on the data source
        time.sleep(2)

    if not etf_frames:
        # pd.concat raises on an empty list; return an empty table instead
        return pd.DataFrame(columns=shared_columns)

    return pd.concat(etf_frames, ignore_index=True)

In [None]:
def save_yf_etf_data(df_etf):
    """Write the ETF metrics table to ``yahoo_input_path`` and print a preview.

    Args:
        df_etf: Long-format DataFrame of ETF metrics; its index is not written.
    """
    # to_csv does not create missing folders
    target_folder = os.path.dirname(yahoo_input_path)
    if target_folder:
        os.makedirs(target_folder, exist_ok=True)

    df_etf.to_csv(yahoo_input_path, index=False)
    print(f"ETF data saved to {yahoo_input_path}")
    print()
    print(df_etf.head())

In [None]:
# Download and process every symbol in ETF_SYMBOLS, then write the table to yahoo_input_path
df_etf = process_etf_data(ETF_SYMBOLS)
save_yf_etf_data(df_etf)


## Merging

### Preprocessing

In [None]:
def _load_source_csv(path):
    """Read one source CSV, standardise its column names and parse its dates."""
    frame = pd.read_csv(path)

    # Standardizing column names: lowercase and replace spaces with underscores
    frame.columns = frame.columns.str.lower().str.replace(" ", "_")

    # A CSV written from an unnamed index has its first column read back as
    # "Unnamed: 0"; treat that column as the date column.
    if (
        "date" not in frame.columns
        and len(frame.columns) > 0
        and frame.columns[0].startswith("unnamed")
    ):
        frame = frame.rename(columns={frame.columns[0]: "date"})

    # Convert the date column to datetime format
    frame["date"] = pd.to_datetime(frame["date"])
    return frame


# Define datasets
fred = _load_source_csv(fred_input_path)
bls = _load_source_csv(bls_input_path)
yfinance = _load_source_csv(yahoo_input_path)

### Merging

In [None]:
# Merge FRED and BLS on "date"
macro_df = pd.merge(fred, bls, on="date", how="outer")
print("Merged FRED and BLS data")

# Merge YahooFinance on "date": every ETF row receives that date's macro values
merged_df = pd.merge(macro_df, yfinance, on="date", how="outer")
print("Merged YFinance data")

# Drop any column whose name appears more than once (keeps the first occurrence)
merged_df = merged_df.loc[:, ~merged_df.columns.duplicated()]

# Forward-filling is only meaningful in chronological order. The stable sort
# keeps the relative order of rows that share a date.
merged_df = merged_df.sort_values("date", kind="stable").reset_index(drop=True)

# Macro indicators hold one value per date, so carrying the latest known value
# forward through time is safe across all rows.
macro_columns = [
    col for col in macro_df.columns if col != "date" and col in merged_df.columns
]
if macro_columns:
    merged_df[macro_columns] = merged_df[macro_columns].ffill()

# ETF metrics must only be carried forward within the same ETF. A plain ffill
# over the long-format table would copy one fund's values into another fund's
# rows and stamp ETF labels onto macro-only dates.
etf_columns = [
    col
    for col in merged_df.columns
    if col not in macro_columns and col not in ("date", "etf")
]
if etf_columns:
    if "etf" in merged_df.columns:
        merged_df[etf_columns] = merged_df.groupby("etf")[etf_columns].ffill()
    else:
        merged_df[etf_columns] = merged_df[etf_columns].ffill()

# Save the cleaned dataset (to_csv does not create missing folders)
output_folder = os.path.dirname(econ_output_path)
if output_folder:
    os.makedirs(output_folder, exist_ok=True)
merged_df.to_csv(econ_output_path, index=False)
print(f"Cleaned economic data saved to {econ_output_path}")

In [None]:
# Reload the merged dataset as a sanity check. Parse "date" on load so info()
# reports it as a datetime column rather than a generic object column.
df = pd.read_csv(econ_output_path, parse_dates=["date"])
df.info()

### Preview Data

In [None]:
# Compile summary of all dataframes and output to txt file
def compile_summary(filename, output_name, tail_rows=50):
    """Write a plain-text summary of a CSV file into the Preview output folder.

    The summary contains the row and column counts, the ``DataFrame.info()``
    report and the last ``tail_rows`` rows of the table.

    Args:
        filename: Path of the CSV file to summarise.
        output_name: File name of the summary, created inside
            ``<OUTPUT_FOLDER>/Preview``.
        tail_rows: Number of trailing rows to include in the summary.
    """
    df = pd.read_csv(filename)

    preview_folder = os.path.join(OUTPUT_FOLDER, "Preview")
    # open() does not create missing folders
    os.makedirs(preview_folder, exist_ok=True)
    output_file_path = os.path.join(preview_folder, output_name)

    # Explicit encoding so the file does not depend on the platform default
    with open(output_file_path, "w", encoding="utf-8") as f:
        f.write("DataFrame Summary\n")
        f.write(f"Number of rows: {df.shape[0]}\n")
        f.write(f"Number of columns: {df.shape[1]}\n")
        f.write("Column names and info:\n")
        df.info(buf=f)
        f.write(f"\nLast {tail_rows} rows:\n")
        f.write(df.tail(n=tail_rows).to_string())
        f.write("\n\n" + "=" * 80 + "\n\n")

    print(f"Dataframe information saved to {output_file_path}")

In [None]:
# Write a text summary of the merged dataset to the Preview folder under OUTPUT_FOLDER
compile_summary(econ_output_path, "economic_data_summary.txt")