
# Federal Open Market Committee (FOMC) Document Scraping Pipeline

This notebook scrapes PDF documents from the Federal Reserve website, downloads them, and extracts their text content for analysis.

The pipeline is broken down into the following steps:
1.  **Configuration**: Set all parameters for the scrape.
2.  **Core Functions**: Define the helper functions for the pipeline.
3.  **Step 1: Create Directories**: Set up the folder structure.
4.  **Step 2: Generate Yearly URLs**: Build the list of yearly historical FOMC pages to scrape.
5.  **Step 3: Find PDF Links**: Scrape each yearly page to find all target PDF URLs.
6.  **Step 4: Download PDFs**: Download the discovered PDFs to a local directory.
7.  **Step 5: Extract Text**: Process the downloaded PDFs to extract and save the text.

In [None]:
import os
import re
import sys
from urllib.parse import urljoin

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pdfplumber
import requests
from bs4 import BeautifulSoup

# Working directory of the kernel; the notebook's relative data paths are built from it.
notebook_dir = os.getcwd()

# Put the parent directory on the import path. The membership check keeps a
# re-run of this cell from appending the same entry a second time.
if os.path.join(notebook_dir, '../') not in sys.path:
    sys.path.append(os.path.join(notebook_dir, '../'))

In [4]:
# First and last year (both inclusive) whose yearly pages are scraped.
START_YEAR_CONFIG = 2018
END_YEAR_CONFIG = 2019
# URL of one yearly page; generate_yearly_urls() fills in the '{year}' placeholder.
URL_TEMPLATE_CONFIG = "https://www.federalreserve.gov/monetarypolicy/fomchistorical{year}.htm"
# Passed as base_url to find_pdf_links(), which joins it with each matching href.
BASE_URL_CONFIG = "https://www.federalreserve.gov"
# Regex searched (case-insensitively) in each link's href to select the target PDFs.
KEYWORD_PATTERN_CONFIG = r'fomc\d{8}tealbook[ab]\d{8}'
# Passed as num_to_download to download_pdfs(): an int caps how many files are
# processed; any non-int value means every discovered link is processed.
DOWNLOAD_LIMIT_CONFIG = 3

In [5]:
def create_directories(pdf_dir: str, text_dir: str) -> None:
    """
    Make sure both output folders are present, creating any that are missing.

    Parameters
    ----------
    pdf_dir : str
        Folder that will hold the downloaded PDF files.
    text_dir : str
        Folder that will hold the text extracted from those PDFs.
    """
    for target in (pdf_dir, text_dir):
        # exist_ok=True turns this into a no-op for folders that are already there.
        os.makedirs(target, exist_ok=True)
    print("Ensured directories exist: '{}/' and '{}/'".format(pdf_dir, text_dir))

In [6]:
print("--- Step 1: Creating Directories ---")
BASE_DATA_PATH = os.path.join(notebook_dir, '../data')
TEALBOOKS = "tealbooks"
# The PDF and text folders sit side by side under the same dataset/document-type directory.
PDF_PATH_CONFIG, TEXT_PATH_CONFIG = (
    os.path.join(BASE_DATA_PATH, 'federal_open_market_committee_dataset', TEALBOOKS, leaf)
    for leaf in ('pdfs', 'text')
)
create_directories(PDF_PATH_CONFIG, TEXT_PATH_CONFIG)

--- Step 1: Creating Directories ---
Ensured directories exist: '/Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/notebook_experiments/../data/federal_open_market_committee_dataset/tealbooks/pdfs/' and '/Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/notebook_experiments/../data/federal_open_market_committee_dataset/tealbooks/text/'


In [7]:
def generate_yearly_urls(start_year: int, end_year: int, url_template: str) -> list:
    """
    Build one historical FOMC page URL per year by filling in a template.

    Parameters
    ----------
    start_year : int
        First year to include.
    end_year : int
        Last year to include (the range is inclusive on both ends).
    url_template : str
        URL containing a '{year}' placeholder that is substituted for each year.

    Returns
    -------
    list
        The formatted URLs in ascending year order; empty when
        ``start_year`` is greater than ``end_year``.
    """
    years = range(start_year, end_year + 1)
    urls = [url_template.format(year=y) for y in years]
    print(f"Generated {len(urls)} yearly pages to scrape from {start_year} to {end_year}.")
    return urls

# --- Run Step 2 ---
print("\n--- Step 2: Generating Yearly URLs ---")
yearly_pages_to_scrape = generate_yearly_urls(START_YEAR_CONFIG, END_YEAR_CONFIG, URL_TEMPLATE_CONFIG)
print("Pages to scrape:", yearly_pages_to_scrape)


--- Step 2: Generating Yearly URLs ---
Generated 2 yearly pages to scrape from 2018 to 2019.
Pages to scrape: ['https://www.federalreserve.gov/monetarypolicy/fomchistorical2018.htm', 'https://www.federalreserve.gov/monetarypolicy/fomchistorical2019.htm']


In [8]:
def find_pdf_links(start_url: str, base_url: str, doc_keyword_pattern: str, timeout: float = 30) -> set:
    """
    Finds all unique PDF links on a single page that match a regex pattern.

    Parameters
    ----------
    start_url : str
        The URL of the page to scrape for links.
    base_url : str
        The base URL each matching href is joined with to form an absolute URL.
    doc_keyword_pattern : str
        A regex pattern searched (case-insensitively) in each href.
    timeout : float, optional
        Seconds to wait for the page request before giving up. Defaults to 30,
        the same value download_pdfs uses for its requests.

    Returns
    -------
    set
        A set of unique, absolute URLs found on the page. Empty when the page
        could not be fetched.
    """
    print(f"Fetching page: {start_url}")
    try:
        # Without a timeout a stalled connection would block the notebook indefinitely.
        response = requests.get(start_url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return set()

    # Compile once instead of re-parsing the pattern for every anchor on the page.
    keyword_regex = re.compile(doc_keyword_pattern, re.IGNORECASE)
    soup = BeautifulSoup(response.content, 'html.parser')
    pdf_links = set()
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        # Compare the extension case-insensitively so it agrees with the
        # case-insensitive pattern match ('.PDF' links are kept too).
        if href.lower().endswith('.pdf') and keyword_regex.search(href):
            pdf_links.add(urljoin(base_url, href))

    print(f"-> Found {len(pdf_links)} links matching pattern on this page.")
    return pdf_links

In [9]:
print("\n--- Step 3: Finding PDF Links ---")
all_pdf_links = set()
for page_url in yearly_pages_to_scrape:
    # Merge each page's matches; the set drops any URL seen on more than one page.
    links_from_page = find_pdf_links(page_url, BASE_URL_CONFIG, KEYWORD_PATTERN_CONFIG)
    all_pdf_links |= links_from_page
print(f"\nTotal unique links found across all years: {len(all_pdf_links)}")



--- Step 3: Finding PDF Links ---
Fetching page: https://www.federalreserve.gov/monetarypolicy/fomchistorical2018.htm
-> Found 16 links matching pattern on this page.
Fetching page: https://www.federalreserve.gov/monetarypolicy/fomchistorical2019.htm
-> Found 16 links matching pattern on this page.

Total unique links found across all years: 32


In [10]:
def download_pdfs(pdf_links: set, pdf_dir: str, num_to_download: int | str = "all") -> list:
    """
    Downloads a set of PDFs from a list of URLs to a specified directory.

    Parameters
    ----------
    pdf_links : set
        A set of absolute URLs for the PDF files to be downloaded.
    pdf_dir : str
        The local directory where the downloaded PDFs will be saved.
    num_to_download : int or str, optional
        The number of PDFs to download.
    """
    local_pdf_paths = []
    sorted_links = sorted(list(pdf_links), reverse=True)
    
    links_to_process = []
    if isinstance(num_to_download, int):
        print(f"Limiting download to the newest {num_to_download} files.")
        links_to_process = sorted_links[:num_to_download]
    else:
        print("Preparing to download all found files.")
        links_to_process = sorted_links
        
    for url in links_to_process:
        filename = url.split('/')[-1]
        pdf_path = os.path.join(pdf_dir, filename)
        
        if os.path.exists(pdf_path):
            print(f"Skipping (already exists): {filename}")
        else:
            print(f"Downloading: {filename}...")
            try:
                pdf_response = requests.get(url, timeout=30)
                pdf_response.raise_for_status()
                with open(pdf_path, 'wb') as f:
                    f.write(pdf_response.content)
            except requests.exceptions.RequestException as e:
                print(f"  -> Failed to download {url}: {e}")
                continue
        
        local_pdf_paths.append(pdf_path)
    return local_pdf_paths

# --- Run Step 4 ---
print("\n--- Step 4: Downloading PDFs ---")
local_pdf_paths = download_pdfs(all_pdf_links, PDF_PATH_CONFIG, DOWNLOAD_LIMIT_CONFIG)


--- Step 4: Downloading PDFs ---
Limiting download to the newest 3 files.
Downloading: FOMC20191211tealbookb20191205.pdf...
Downloading: FOMC20191211tealbooka20191126.pdf...
Downloading: FOMC20191030tealbookb20191024.pdf...


In [11]:
def extract_text_from_pdfs(local_pdf_paths: list, text_dir: str) -> None:
    """
    Extracts text from a list of local PDF files and saves it to a directory.

    Each PDF produces one UTF-8 text file named after the PDF with its
    extension replaced by ``.txt``. Every page that yields text is followed by
    an ``--- End of Page N ---`` marker; pages with no extractable text are
    skipped. A PDF whose text file already exists is not processed again.
    Any error while reading a PDF or writing its text file is printed and the
    loop moves on to the next PDF.

    Parameters
    ----------
    local_pdf_paths : list
        A list of local file paths to the PDF files that will be processed.
    text_dir : str
        The local directory where the extracted text files will be saved.
    """
    for pdf_path in local_pdf_paths:
        filename = os.path.basename(pdf_path)
        # splitext swaps only the real extension, whatever its case; a plain
        # str.replace('.pdf', ...) misses '.PDF' and also rewrites '.pdf'
        # occurring elsewhere in the name.
        stem, _ext = os.path.splitext(filename)
        text_path = os.path.join(text_dir, stem + '.txt')

        if os.path.exists(text_path):
            print(f"Skipping (text already extracted): {filename}")
            continue

        print(f"Extracting text from: {filename}...")
        # Collect per-page chunks and join once rather than growing a string page by page.
        page_chunks = []
        try:
            with pdfplumber.open(pdf_path) as pdf:
                for page_number, page in enumerate(pdf.pages, start=1):
                    page_text = page.extract_text()
                    if page_text:
                        page_chunks.append(page_text + f"\n\n--- End of Page {page_number} ---\n\n")

            with open(text_path, 'w', encoding='utf-8') as f:
                f.write("".join(page_chunks))

        except Exception as e:
            print(f"  -> Could not process {pdf_path}: {e}")

# --- Run Step 5 ---
print("\n--- Step 5: Extracting Text ---")
extract_text_from_pdfs(local_pdf_paths, TEXT_PATH_CONFIG)
print("\n--- Pipeline Complete ---")


--- Step 5: Extracting Text ---
Extracting text from: FOMC20191211tealbookb20191205.pdf...
Extracting text from: FOMC20191211tealbooka20191126.pdf...
Extracting text from: FOMC20191030tealbookb20191024.pdf...

--- Pipeline Complete ---
