In [1]:
# Federal Reserve Policy Predictor - Data Collection
# Author: Vanessa Quintero
# Universidad de Granada
# Master's in Economics


# Cell 1

# Notebook Setup - Ensure proper working directory

import os
import sys
from pathlib import Path

# Get the directory where this notebook is located
notebook_path = Path().absolute()
print(f"Notebook location: {notebook_path}")

# Change working directory to notebook location
os.chdir(notebook_path)
print(f"Working directory set to: {os.getcwd()}")

# Verify we're in the correct project folder
project_name = "Federal Reserve Policy Predictor"
if project_name in str(notebook_path):
    print(f"✓ Running in correct project: {project_name}")
else:
    print(f"WARNING: Not in {project_name} folder!")
    print(f"Current location: {notebook_path}")

# Create data directories if they don't exist
data_dirs = ['data', 'data/raw', 'data/processed', 'outputs', 'outputs/figures']
for dir_name in data_dirs:
    Path(dir_name).mkdir(parents=True, exist_ok=True)

print("✓ Setup complete - files will save in correct location")

Notebook location: /Users/vanessaquintero/Federal Reserve Policy Predictor 
Working directory set to: /Users/vanessaquintero/Federal Reserve Policy Predictor 
✓ Running in correct project: Federal Reserve Policy Predictor
✓ Setup complete - files will save in correct location


In [2]:
# Cell 2 

# Set up and Installations and Imports 

!pip install fredapi yfinance beautifulsoup4 requests pandas numpy matplotlib seaborn plotly

import pandas as pd
import numpy as np
import requests
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# APIs
from fredapi import Fred
import yfinance as yf

# Web scraping
from bs4 import BeautifulSoup
import time

# Data processing
import re
import json

print("Libraries imported successfully")
print(f"Data collection started: {datetime.now()}")

Libraries imported successfully
Data collection started: 2025-06-24 12:44:09.645717


In [3]:
# Cell 3

# FRED API Setup

FRED_API_KEY = '4e3aceae8782aca0024c5f51615c5d86'  # Replace with your actual key
fred = Fred(api_key=FRED_API_KEY)

# Test API connection
try:
    test_data = fred.get_series('FEDFUNDS', limit=5)
    print("FRED API connected successfully")
    print(f"Latest Fed Funds Rate: {test_data.iloc[-1]:.2f}%")
except Exception as e:
    print(f"FRED API connection failed: {e}")

FRED API connected successfully
Latest Fed Funds Rate: 0.83%


In [4]:
# Cell 4

# Economic indicators to collect
indicators = {
    'FEDFUNDS': 'Federal Funds Rate',
    'UNRATE': 'Unemployment Rate', 
    'CPIAUCSL': 'Consumer Price Index',
    'GDP': 'Gross Domestic Product',
    'PAYEMS': 'Nonfarm Payrolls',
    'INDPRO': 'Industrial Production',
    'HOUST': 'Housing Starts',
    'DEXUSEU': 'USD/EUR Exchange Rate',
    'DGS10': '10-Year Treasury Rate',
    'VIXCLS': 'VIX Volatility Index'
}

def collect_economic_data(start_date='1990-01-01', end_date=None):
    """
    Collect economic indicators from FRED
    
    Parameters:
    start_date (str): Start date in 'YYYY-MM-DD' format
    end_date (str): End date in 'YYYY-MM-DD' format (None for most recent)
    
    Returns:
    pd.DataFrame: Combined economic indicators
    """
    
    if end_date is None:
        end_date = datetime.now().strftime('%Y-%m-%d')
    
    print("Collecting economic indicators from 1990 through June 2025...")
    print(f"Date range: {start_date} to {end_date}")
    
    economic_data = pd.DataFrame()
    
    for code, description in indicators.items():
        try:
            print(f"   Downloading {description}...")
            data = fred.get_series(code, start=start_date, end=end_date)
            economic_data[code] = data
            print(f"   Retrieved {len(data)} observations")
            time.sleep(0.1)  # Be nice to the API
            
        except Exception as e:
            print(f"   Warning: Failed to get {description}: {e}")
    
    # Show data range actually collected
    if not economic_data.empty:
        print(f"\nData collected successfully:")
        print(f"   Shape: {economic_data.shape}")
        print(f"   Date range: {economic_data.index.min()} to {economic_data.index.max()}")
        print(f"   Latest month: {economic_data.index.max().strftime('%B %Y')}")
    
    return economic_data

# Collect data from 1990 through current date (June 2025)
economic_df = collect_economic_data(start_date='1990-01-01')
print("\nFirst 5 rows:")
print(economic_df.head())
print("\nLast 5 rows (most recent data):")
print(economic_df.tail())

Collecting economic indicators from 1990 through June 2025...
Date range: 1990-01-01 to 2025-06-24
   Downloading Federal Funds Rate...
   Retrieved 851 observations
   Downloading Unemployment Rate...
   Retrieved 929 observations
   Downloading Consumer Price Index...
   Retrieved 941 observations
   Downloading Gross Domestic Product...
   Retrieved 317 observations
   Downloading Nonfarm Payrolls...
   Retrieved 1037 observations
   Downloading Industrial Production...
   Retrieved 1277 observations
   Downloading Housing Starts...
   Retrieved 797 observations
   Downloading USD/EUR Exchange Rate...
   Retrieved 6905 observations
   Downloading 10-Year Treasury Rate...
   Retrieved 16559 observations
   Downloading VIX Volatility Index...
   Retrieved 9254 observations

Data collected successfully:
   Shape: (851, 10)
   Date range: 1954-07-01 00:00:00 to 2025-05-01 00:00:00
   Latest month: May 2025

First 5 rows:
            FEDFUNDS  UNRATE  CPIAUCSL      GDP   PAYEMS   INDPRO 

In [5]:
# Cell 5

def get_fomc_meeting_dates():
    """
    Get historical FOMC meeting dates (one date per meeting - decision day)
    FOMC meetings are typically 2 days, we use the second day (decision announcement)
    """
    
    print("Collecting historical FOMC meeting dates (decision days only)...")
    
    # One date per meeting (typically the second/final day when decision is announced)
    meeting_dates = []
    
    # 2025 meetings that have already happened (decision days)
    meetings_2025_completed = [
        '2025-01-29',  # January meeting decision day
        '2025-03-19',  # March meeting decision day
        '2025-04-30',  # April/May meeting decision day
        '2025-06-11',  # June meeting decision day
    ]
    
    # 2024 meetings (decision days)
    meetings_2024 = [
        '2024-01-31', '2024-03-20', '2024-05-01', '2024-06-12',
        '2024-07-31', '2024-09-18', '2024-11-07', '2024-12-18'
    ]
    
    # 2023 meetings (decision days)
    meetings_2023 = [
        '2023-02-01', '2023-03-22', '2023-05-03', '2023-06-14',
        '2023-07-26', '2023-09-20', '2023-11-01', '2023-12-13'
    ]
    
    # Historical meetings from crisis periods (decision days)
    sample_historical = [
        # Financial Crisis period (2008-2009) - more frequent meetings
        '2008-01-30', '2008-03-18', '2008-04-29', '2008-06-24', '2008-08-05', 
        '2008-09-16', '2008-10-28', '2008-12-16',
        '2009-01-28', '2009-03-18', '2009-04-28', '2009-06-24', 
        '2009-08-12', '2009-09-22', '2009-11-04', '2009-12-16',
        
        # Dot-com period (2000-2002)
        '2000-02-02', '2000-03-21', '2000-05-16', '2000-06-27',
        '2000-08-22', '2000-10-03', '2000-12-19',
        '2001-01-03', '2001-01-31', '2001-03-20', '2001-04-18', '2001-05-15',
        '2001-06-26', '2001-08-21', '2001-09-17', '2001-10-02', '2001-11-06', '2001-12-11',
        '2002-01-29', '2002-03-19', '2002-05-07', '2002-06-25', '2002-08-13', 
        '2002-09-24', '2002-11-06', '2002-12-10'
    ]
    
    # Combine all meeting dates (one per decision)
    all_meetings = meetings_2025_completed + meetings_2024 + meetings_2023 + sample_historical
    
    # Create DataFrame
    fomc_df = pd.DataFrame({
        'meeting_date': pd.to_datetime(all_meetings),
    })
    
    # Add additional columns
    fomc_df['year'] = fomc_df['meeting_date'].dt.year
    fomc_df['month'] = fomc_df['meeting_date'].dt.month
    fomc_df['quarter'] = fomc_df['meeting_date'].dt.quarter
    
    # Sort by date
    fomc_df = fomc_df.sort_values('meeting_date').reset_index(drop=True)
    
    # Filter to only past meetings (extra safety check)
    current_date = pd.Timestamp.now()
    fomc_df = fomc_df[fomc_df['meeting_date'] <= current_date].copy()
    
    print(f"FOMC decisions collected: {len(fomc_df)} meetings")
    print(f"Date range: {fomc_df['meeting_date'].min().date()} to {fomc_df['meeting_date'].max().date()}")
    print(f"Years covered: {fomc_df['year'].min()}-{fomc_df['year'].max()}")
    print("(One date per meeting - decision announcement day)")
    
    # Show meeting frequency by year
    meetings_per_year = fomc_df.groupby('year').size()
    print(f"\nDecisions per year (recent):")
    print(meetings_per_year.tail(10))
    
    return fomc_df

fomc_meetings = get_fomc_meeting_dates()
print("\nFirst 10 meetings:")
print(fomc_meetings.head(10))
print("\nMost recent meetings:")
print(fomc_meetings.tail(10))
    
  

Collecting historical FOMC meeting dates (decision days only)...
FOMC decisions collected: 62 meetings
Date range: 2000-02-02 to 2025-06-11
Years covered: 2000-2025
(One date per meeting - decision announcement day)

Decisions per year (recent):
year
2000     7
2001    11
2002     8
2008     8
2009     8
2023     8
2024     8
2025     4
dtype: int64

First 10 meetings:
  meeting_date  year  month  quarter
0   2000-02-02  2000      2        1
1   2000-03-21  2000      3        1
2   2000-05-16  2000      5        2
3   2000-06-27  2000      6        2
4   2000-08-22  2000      8        3
5   2000-10-03  2000     10        4
6   2000-12-19  2000     12        4
7   2001-01-03  2001      1        1
8   2001-01-31  2001      1        1
9   2001-03-20  2001      3        1

Most recent meetings:
   meeting_date  year  month  quarter
52   2024-05-01  2024      5        2
53   2024-06-12  2024      6        2
54   2024-07-31  2024      7        3
55   2024-09-18  2024      9        3
56   202

In [6]:
# Cell 6

# Rate Decision History

def get_rate_decisions():
    """
    Create historical rate decision dataset
    
    Returns:
    pd.DataFrame: Rate decisions with changes
    """
    
    print("Processing rate decision history...")
    
    # Get Federal Funds Rate data
    fed_funds = fred.get_series('FEDFUNDS', start='1990-01-01')
    
    # Calculate rate changes
    rate_decisions = pd.DataFrame({
        'date': fed_funds.index,
        'rate': fed_funds.values
    })
    
    # Calculate rate changes
    rate_decisions['rate_change'] = rate_decisions['rate'].diff()
    rate_decisions['rate_change_bps'] = rate_decisions['rate_change'] * 100  # basis points
    
    # Classify decisions
    def classify_decision(change):
        if pd.isna(change):
            return 'HOLD'
        elif change > 0.05:  # More than 5 basis points
            return 'HIKE'
        elif change < -0.05:  # Less than -5 basis points
            return 'CUT'
        else:
            return 'HOLD'
    
    rate_decisions['decision'] = rate_decisions['rate_change'].apply(classify_decision)
    
    print(f"Rate decisions processed: {len(rate_decisions)} observations")
    print("\nDecision distribution:")
    print(rate_decisions['decision'].value_counts())
    
    return rate_decisions

rate_history = get_rate_decisions()
print(rate_history.tail())

Processing rate decision history...
Rate decisions processed: 851 observations

Decision distribution:
decision
HOLD    340
HIKE    289
CUT     222
Name: count, dtype: int64
          date  rate  rate_change  rate_change_bps decision
846 2025-01-01  4.33        -0.15            -15.0      CUT
847 2025-02-01  4.33         0.00              0.0     HOLD
848 2025-03-01  4.33         0.00              0.0     HOLD
849 2025-04-01  4.33         0.00              0.0     HOLD
850 2025-05-01  4.33         0.00              0.0     HOLD


In [7]:
# Cell 7

# Data Quality Checks 


def validate_data(economic_df, rate_history):
    """
    Perform data quality checks
    
    Parameters:
    economic_df (pd.DataFrame): Economic indicators
    rate_history (pd.DataFrame): Rate decision history
    """
    
    print("Performing data quality checks...")
    
    # Check for missing values
    print("\nMissing values in economic data:")
    missing_data = economic_df.isnull().sum()
    print(missing_data[missing_data > 0])
    
    # Check date ranges
    print(f"\nEconomic data range: {economic_df.index.min()} to {economic_df.index.max()}")
    print(f"Rate data range: {rate_history['date'].min()} to {rate_history['date'].max()}")
    
    # Check for outliers
    print("\nData summary statistics:")
    print(economic_df.describe())
    
    # Data completeness
    total_months = len(economic_df)
    complete_months = len(economic_df.dropna())
    completeness = (complete_months / total_months) * 100
    
    print(f"\nData completeness: {completeness:.1f}%")
    
    return True

# Run validation
validate_data(economic_df, rate_history)

Performing data quality checks...

Missing values in economic data:
GDP        568
HOUST       54
DEXUSEU    646
DGS10      364
VIXCLS     579
dtype: int64

Economic data range: 1954-07-01 00:00:00 to 2025-05-01 00:00:00
Rate data range: 1954-07-01 00:00:00 to 2025-05-01 00:00:00

Data summary statistics:
         FEDFUNDS      UNRATE    CPIAUCSL           GDP         PAYEMS  \
count  851.000000  851.000000  851.000000    283.000000     851.000000   
mean     4.607709    5.814454  132.899793   8287.190261  104024.779083   
std      3.557148    1.670580   86.358772   7898.658322   33322.479533   
min      0.050000    3.400000   26.710000    390.996000   48826.000000   
25%      1.855000    4.500000   41.450000   1248.489000   73053.000000   
50%      4.330000    5.600000  126.300000   5747.237000  108300.000000   
75%      6.140000    6.900000  208.107000  14391.659000  132733.500000   
max     19.100000   14.800000  320.580000  29976.638000  159561.000000   

           INDPRO        H

True

In [8]:
# Cell 8

# Create data directory and save raw data (as-is with missing values)

# Create data directories in current folder (more reliable)
import os

# Create data structure in current directory
data_dirs = ['data', 'data/raw', 'data/processed']
for dir_name in data_dirs:
    os.makedirs(dir_name, exist_ok=True)
    print(f"Directory created/verified: {dir_name}")

# Save raw data (using relative path from current directory)
print("\nSaving raw data...")

# Save economic indicators
economic_df.to_csv('data/raw/economic_indicators.csv')
print("Economic indicators saved")

# Save rate decisions
rate_history.to_csv('data/raw/rate_decisions.csv', index=False)
print("Rate decisions saved")

# Save FOMC meeting dates
fomc_meetings.to_csv('data/raw/fomc_meetings.csv', index=False)
print("FOMC meetings saved")

print("\nData collection complete!")
print("Files saved in ./data/raw/")

# Important notes for next steps
print("\n" + "="*60)
print("IMPORTANT NOTES FOR NOTEBOOK 2:")
print("="*60)
print("Core Fed Model Indicators to use:")
print("  COMPLETE: UNRATE (unemployment) - Complete data")
print("  COMPLETE: CPIAUCSL (inflation) - Complete data") 
print("  COMPLETE: PAYEMS (employment) - Complete data")
print("  COMPLETE: FEDFUNDS (current rate) - Complete data")
print("  MISSING: GDP (growth) - 568 missing values to handle")
print("  MISSING: DGS10 (10Y treasury) - 364 missing values to handle")
print("\nMissing Data Strategy for Notebook 2:")
print("  - Focus on 6 core indicators only")
print("  - Handle GDP missing values (forward fill quarterly data)")
print("  - Handle DGS10 missing values (interpolation)")  
print("  - Drop non-essential indicators (HOUST, DEXUSEU, VIXCLS)")
print("  - Target: Improve data completeness from 6.3% to 70%+")
print("\nNext: Run 02_exploratory_analysis.ipynb")
print("="*60)

Directory created/verified: data
Directory created/verified: data/raw
Directory created/verified: data/processed

Saving raw data...
Economic indicators saved
Rate decisions saved
FOMC meetings saved

Data collection complete!
Files saved in ./data/raw/

IMPORTANT NOTES FOR NOTEBOOK 2:
Core Fed Model Indicators to use:
  COMPLETE: UNRATE (unemployment) - Complete data
  COMPLETE: CPIAUCSL (inflation) - Complete data
  COMPLETE: PAYEMS (employment) - Complete data
  COMPLETE: FEDFUNDS (current rate) - Complete data
  MISSING: GDP (growth) - 568 missing values to handle
  MISSING: DGS10 (10Y treasury) - 364 missing values to handle

Missing Data Strategy for Notebook 2:
  - Focus on 6 core indicators only
  - Handle GDP missing values (forward fill quarterly data)
  - Handle DGS10 missing values (interpolation)
  - Drop non-essential indicators (HOUST, DEXUSEU, VIXCLS)
  - Target: Improve data completeness from 6.3% to 70%+

Next: Run 02_exploratory_analysis.ipynb
