# Executive Order Scraping

Executive orders are pulled from https://www.presidency.ucsb.edu/documents/app-categories/written-presidential-orders/presidential/executive-orders, which seems more complete than https://www.federalregister.gov/documents/search and provides older EOs as text rather than as scans of the source documents.

In [51]:
%run notebooks/Setup.ipynb

import requests
import pandas
from bs4 import BeautifulSoup
import json
from pathlib import Path

In [None]:
# scheme + hostname of the site being scraped; the relative links collected below are appended to it
host = "https://www.presidency.ucsb.edu"

def get_executive_order_links(url: str) -> list[str]:
    """
    Fetch one listing page of executive orders and extract links to individual orders.

    Args:
        url: Absolute URL of the listing page to fetch.

    Returns:
        The ``href`` values of the order links, in page order (relative paths,
        since the site uses relative links). An empty list is returned when the
        response status is not 200 or when the page has no ``view-content``
        container, so callers always receive a list.
    """
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")

    # the div with class "view-content" wraps the list of executive orders
    view_content_div = soup.find("div", class_="view-content")
    if view_content_div is None:
        # same "nothing found" result as the HTTP-failure branch above
        return []

    eo_links = []
    for field_title_div in view_content_div.find_all("div", class_="field-title"):
        # href=True only matches anchors that actually carry a link target,
        # so the subscript below cannot raise KeyError
        anchor_tag = field_title_div.find("a", href=True)
        if anchor_tag is not None:
            eo_links.append(anchor_tag["href"])

    return eo_links

def get_eo_links(num_pages: int) -> list[str]:
    """
    Collect executive order links from the first ``num_pages`` listing pages.

    Pages are requested one at a time with ``page=0 .. num_pages - 1`` and
    100 items per page; each page is parsed by ``get_executive_order_links``.

    Args:
        num_pages: Number of listing pages to request, starting at page 0.

    Returns:
        All links found, concatenated in page order. Pages that yield no
        links contribute nothing.
    """
    base_url = host + "/documents/app-categories/written-presidential-orders/presidential/executive-orders?items_per_page=100"
    eo_links = []
    for i in range(num_pages):
        # base_url already contains a query string, so the page number has to be
        # joined with "&"; a second "?" would be read as part of the
        # items_per_page value and the page parameter would never be sent
        url = f"{base_url}&page={i}"
        page_links = get_executive_order_links(url)
        if page_links:
            eo_links.extend(page_links)

    return eo_links

def get_executive_order(url: str):
    """
    Fetch a single executive order page and extract its fields.

    Args:
        url: Absolute URL of the executive order page.

    Returns:
        A dict with the keys ``url``, ``doc`` (last path segment of the URL),
        ``president``, ``president_byline``, ``title``, ``date``, ``content``
        (list of paragraph strings) and ``citation``. ``None`` is returned when
        the response status is not 200 or when any expected element is missing
        from the page; a message is printed in both cases.
    """
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    def clean_text(text):
        # normalise non-breaking spaces and em dashes, then trim
        return text.replace("\xa0", " ").replace("\u2014", "-").strip()

    def text_of(parent, css_class):
        # cleaned text of the first matching div under parent, or None if
        # either the parent or the div is absent
        if parent is None:
            return None
        node = parent.find("div", class_=css_class)
        return None if node is None else clean_text(node.text)

    # person details and the document title live inside the same container
    person_div = soup.find("div", class_="field-docs-person")

    # the body is collected paragraph by paragraph as a list of strings
    content_div = soup.find("div", class_="field-docs-content")
    content = None
    if content_div is not None:
        content = [clean_text(paragraph.text) for paragraph in content_div.find_all("p")]

    eo = {
        "url": url,
        "doc": url.split("/")[-1],
        "president": text_of(person_div, "field-title"),
        "president_byline": text_of(person_div, "field-ds-byline"),
        "title": text_of(person_div, "field-ds-doc-title"),
        "date": text_of(soup, "field-docs-start-date-time"),
        "content": content,
        "citation": text_of(soup, "field-prez-document-citation"),
    }

    # a page with an unexpected layout is reported and skipped instead of
    # raising AttributeError and aborting the caller's whole batch
    missing = [key for key, value in eo.items() if value is None]
    if missing:
        print(f"Failed to parse {url}. Missing fields: {', '.join(missing)}")
        return None

    return eo

def save_executive_orders(eos_to_fetch: list[str], skip_existing: bool = True):
    """
    Fetch each executive order and write it to its own JSON file.

    Files are written under ``data/executive_orders/raw`` and named after the
    last path segment of the link (no extension is added). One progress line
    is printed per link: "Skipping", "Fetched" or "Failed to fetch".

    Args:
        eos_to_fetch: Relative links of the orders to fetch; each is appended
            to ``host`` to build the request URL.
        skip_existing: When true, links whose output file already exists are
            not fetched again.
    """
    raw_dir = Path("data/executive_orders/raw")

    for eos_link in eos_to_fetch:
        eo_path = raw_dir.joinpath(eos_link.split("/")[-1])

        # see if we need to skip
        if skip_existing and eo_path.exists():
            print(f"Skipping {eos_link}")
            continue

        eo = get_executive_order(host + eos_link)
        if not eo:
            print(f"Failed to fetch {eos_link}")
            continue

        # create the output directory on demand so a first run on a clean
        # checkout does not fail with FileNotFoundError
        eo_path.parent.mkdir(parents=True, exist_ok=True)
        with open(eo_path, "w") as f:
            json.dump(eo, f, indent=4)

        print(f"Fetched {eos_link}")

In [None]:
# first full crawl: walk 200 listing pages and gather every order link
all_eo_links = get_eo_links(200)

# persist the gathered links as a one-column csv
pandas.DataFrame({"link": all_eo_links}).to_csv(
    "data/executive_orders/links.csv",
    index=False,
)

In [None]:
# first full download: read the saved link list and fetch every order on it
eos_to_fetch = (
    pandas.read_csv("data/executive_orders/links.csv")
    .loc[:, "link"]
    .tolist()
)
save_executive_orders(eos_to_fetch)

In [49]:
# update with recent eos: re-scrape only the first two listing pages
new_eo_links = get_eo_links(2)

# diff the new eos with those saved
prior_eos = pandas.read_csv("data/executive_orders/links.csv")["link"].tolist()
known_eos = set(prior_eos)
# dict.fromkeys drops duplicates while keeping the order in which the links
# were scraped; a plain set difference would return them in arbitrary order
new_eos = [link for link in dict.fromkeys(new_eo_links) if link not in known_eos]

# merge the new eos to the top of the links and save
all_eo_links = new_eos + prior_eos
pandas.DataFrame(all_eo_links, columns=["link"]) \
    .to_csv("data/executive_orders/links.csv", index=False)

# last expression of the cell: display the newly discovered links
new_eos

['/documents/executive-order-14213-establishing-the-national-energy-dominance-council',
 '/documents/executive-order-implementing-the-presidents-department-government-efficiency-cost']

In [54]:
# download the executive orders newly discovered by the update cell
save_executive_orders(new_eos)

Skipping /documents/executive-order-14213-establishing-the-national-energy-dominance-council
Fetched /documents/executive-order-14213-establishing-the-national-energy-dominance-council
Skipping /documents/executive-order-implementing-the-presidents-department-government-efficiency-cost
Fetched /documents/executive-order-implementing-the-presidents-department-government-efficiency-cost
