# Fetch EIS Records via EPA E-NEPA API

This notebook replaces the web scraping approach (`scrape_record_set_V2.R` and `scrape_record_details_V2.R`) with the official EPA E-NEPA API.

**API Documentation:** https://cdxapps.epa.gov/cdx-enepa-II/apidocs/index.html

## Outputs
- `eis_record_api.parquet` - All EIS records with full metadata
- `eis_document_record_api.parquet` - All document/attachment records

These files are also saved as `.pkl` for easy Python reloading and can be converted to R-compatible formats.

In [1]:
# Install required packages if needed
# !pip install requests pandas pyarrow tqdm

In [2]:
import requests
import pandas as pd
import json
import time
import os
from datetime import datetime, timedelta
from pathlib import Path
from tqdm.notebook import tqdm
import logging

# Setup logging
# Configure logging so INFO-and-above messages are emitted with a timestamp and level prefix.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by the API, load/save, and fetch helpers defined in later cells.
logger = logging.getLogger(__name__)

In [3]:
# Configuration
# --- API endpoints ---
BASE_URL = "https://cdxapps.epa.gov/cdx-enepa-II/rest"
SEARCH_ENDPOINT = BASE_URL + "/public/v1/eis/search"

# --- Output locations (resolved relative to the notebook's working directory) ---
REPO_ROOT = Path("..").resolve()
METADATA_DIR = REPO_ROOT.joinpath("metadata")

# Create the metadata directory on first run; no-op if it is already there
METADATA_DIR.mkdir(exist_ok=True)

# Each dataset is written twice: parquet (portable) and pickle (fast reload)
EIS_RECORD_FILE = METADATA_DIR.joinpath("eis_record_api.parquet")
EIS_RECORD_PKL = EIS_RECORD_FILE.with_suffix(".pkl")
DOC_RECORD_FILE = METADATA_DIR.joinpath("eis_document_record_api.parquet")
DOC_RECORD_PKL = DOC_RECORD_FILE.with_suffix(".pkl")

# --- Fetch window: first year of interest through the current calendar year ---
START_YEAR = 1987
END_YEAR = datetime.now().year

# --- Rate limiting: pause (in seconds) between consecutive API requests ---
REQUEST_DELAY = 0.5

print(f"Repository root: {REPO_ROOT}")
print(f"Metadata directory: {METADATA_DIR}")
print(f"Will fetch records from {START_YEAR} to {END_YEAR}")

Repository root: /Users/admin-tascott/Documents/GitHub/eis_documents/enepa_repository
Metadata directory: /Users/admin-tascott/Documents/GitHub/eis_documents/enepa_repository/metadata
Will fetch records from 1987 to 2026


## API Helper Functions

In [4]:
def search_eis_by_date_range(start_date: str, end_date: str, max_retries: int = 3) -> list:
    """
    Search for EIS records within a Federal Register date range.

    The request is attempted up to ``max_retries`` times with exponential
    backoff (1s, 2s, 4s, ...) between attempts. An attempt counts as failed
    on a network/HTTP error, on a body that cannot be decoded as JSON, or on
    a JSON payload that is not a list.

    Args:
        start_date: Start date in MM/dd/yyyy format (sent as ``startFRDate``)
        end_date: End date in MM/dd/yyyy format (sent as ``endFRDate``)
        max_retries: Number of attempts before giving up

    Returns:
        List of EIS record dictionaries. An empty list is returned both when
        the API reports no matches and when every attempt failed; the latter
        is logged at ERROR level.
    """
    params = {
        "startFRDate": start_date,
        "endFRDate": end_date
    }

    for attempt in range(max_retries):
        try:
            response = requests.get(SEARCH_ENDPOINT, params=params, timeout=60)
            response.raise_for_status()
            payload = response.json()
            if payload is None:
                # A JSON `null` body is treated the same as "no matches"
                return []
            if not isinstance(payload, list):
                # Callers use len()/extend() on the result, so anything but a
                # list (e.g. an error object) must not be passed through.
                raise ValueError(f"expected a JSON list, got {type(payload).__name__}")
            return payload
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers JSON decode errors on requests versions where
            # they do not derive from RequestException, plus the check above.
            logger.warning(f"Attempt {attempt + 1}/{max_retries} failed: {e}")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # Exponential backoff
            else:
                logger.error(f"Failed to fetch records for {start_date} to {end_date}")
                return []
    return []


def search_eis_by_id(eis_id: str, max_retries: int = 3) -> dict:
    """
    Search for a specific EIS record by ID.

    The request is attempted up to ``max_retries`` times with exponential
    backoff between attempts. An attempt counts as failed on a network/HTTP
    error, on a body that cannot be decoded as JSON, or on a JSON payload
    that is not a list.

    Args:
        eis_id: The EIS ID (sent as the ``eisId`` query parameter)
        max_retries: Number of attempts before giving up

    Returns:
        The first matching EIS record dictionary, or None when there is no
        match or every attempt failed
    """
    params = {"eisId": eis_id}

    for attempt in range(max_retries):
        try:
            response = requests.get(SEARCH_ENDPOINT, params=params, timeout=60)
            response.raise_for_status()
            results = response.json()
            if results is None:
                # A JSON `null` body is treated the same as "no match"
                return None
            if not isinstance(results, list):
                # Indexing a dict payload with [0] would raise KeyError, so
                # reject non-list payloads and let the retry logic handle it.
                raise ValueError(f"expected a JSON list, got {type(results).__name__}")
            return results[0] if results else None
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers JSON decode errors on requests versions where
            # they do not derive from RequestException, plus the check above.
            logger.warning(f"Attempt {attempt + 1}/{max_retries} failed for EIS {eis_id}: {e}")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # Exponential backoff
            else:
                return None
    return None

In [5]:
def flatten_eis_record(record: dict) -> dict:
    """
    Flatten a nested EIS record into a flat dictionary for DataFrame storage.

    Scalar fields are copied as-is (missing keys become None). Nested lists
    are summarized: state and cooperating-agency names are joined with
    ", ", attachments are counted, and the raw ``attachments`` and
    ``zipLinkMetadata`` structures are kept as JSON strings. Nested fields
    that are missing, null, or empty are all treated as empty.

    Args:
        record: Raw EIS record from API

    Returns:
        Flattened dictionary; key order is stable so DataFrame columns are too
    """
    # Scalar fields copied verbatim; the "ammended*" spelling matches the keys
    # this notebook reads from the API payload.
    scalar_fields = (
        "eisId",
        "title",
        "ceqNumber",
        "type",
        "filedDate",
        "commentLetterDate",
        "federalRegisterReportDate",
        "uniqueIdentificationNumber",
        "leadAgency",
        "dueDate",
        "ammendedNoticeDate",
        "ammendedNoticeText",
        "supplementalInformation",
        "noticeOfIntent",
        "rating",
        "status",
    )
    flat = {field: record.get(field) for field in scalar_fields}

    # `or []` also covers keys that are present with an explicit null value,
    # which `.get(key, [])` would pass through as None.
    states = record.get("states") or []
    flat["states"] = ", ".join(s.get("name") or "" for s in states) if states else None
    flat["primaryState"] = next((s.get("name") for s in states if s.get("primary")), None)

    # Extract cooperating agencies
    coop_agencies = record.get("cooperatingAgency") or []
    flat["cooperatingAgencies"] = (
        ", ".join(a.get("name") or "" for a in coop_agencies) if coop_agencies else None
    )

    # Count attachments
    attachments = record.get("attachments") or []
    flat["attachmentCount"] = len(attachments)

    # Store raw JSON for attachments and zipLinkMetadata for later processing
    flat["_attachments_json"] = json.dumps(attachments) if attachments else None
    zip_link_metadata = record.get("zipLinkMetadata")
    flat["_zipLinkMetadata_json"] = json.dumps(zip_link_metadata) if zip_link_metadata else None

    return flat


def extract_attachments(record: dict) -> list:
    """
    Extract attachment/document records from an EIS record.

    Each attachment becomes one flat dictionary tagged with the parent
    record's ``eisId`` and ``ceqNumber``. A record whose ``attachments`` key
    is missing, null, or empty yields an empty list.

    Args:
        record: Raw EIS record from API

    Returns:
        List of attachment dictionaries
    """
    # Note: API uses "eisId" (lowercase 'd'), not "eisID"
    eis_id = record.get("eisId")
    ceq_number = record.get("ceqNumber")
    # `or []` also covers an explicit null, which `.get(key, [])` would return as None
    attachments = record.get("attachments") or []

    return [
        {
            "eisId": eis_id,
            "ceqNumber": ceq_number,
            "attachmentId": att.get("id"),  # renamed so it cannot be confused with eisId
            "name": att.get("name"),
            "title": att.get("title"),
            "fileNameForDownload": att.get("fileNameForDownload"),
            "type": att.get("type"),
            "size": att.get("size"),
            "sizeKb": att.get("sizeKb"),
            "pages": att.get("pages"),
        }
        for att in attachments
    ]

## Fetch All Records

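The API is queried one calendar year at a time, most recent year first. Because the API returns at most 500 records per query, any date range that hits that limit is split in half and each half is queried again. Results are accumulated across years and written to disk periodically (every `save_interval` years) and once more at the end.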

In [6]:
def load_existing_records():
    """
    Read back previously saved EIS and document records from their pickle files.

    A file that does not exist, or that cannot be read, yields None for that
    slot; read failures are logged as warnings rather than raised.

    Returns:
        Tuple of (eis_records_df, doc_records_df); either element may be None
    """
    def _read_if_present(pkl_path, label):
        # One pickle -> one DataFrame (or None when absent/unreadable)
        frame = None
        if pkl_path.exists():
            try:
                frame = pd.read_pickle(pkl_path)
                logger.info(f"Loaded {len(frame)} existing {label} records")
            except Exception as e:
                logger.warning(f"Could not load existing {label} records: {e}")
        return frame

    return (
        _read_if_present(EIS_RECORD_PKL, "EIS"),
        _read_if_present(DOC_RECORD_PKL, "document"),
    )


def save_records(eis_df: pd.DataFrame, doc_df: pd.DataFrame):
    """
    Persist both record tables, each as a pickle and as a parquet file.

    Pickles are written first (EIS, then documents), followed by the parquet
    files in the same order, and a single INFO line reports the row counts.
    """
    frames = (eis_df, doc_df)

    # Pickle: fast Python-native serialization
    for frame, pkl_path in zip(frames, (EIS_RECORD_PKL, DOC_RECORD_PKL)):
        frame.to_pickle(pkl_path)

    # Parquet: compact and readable from other languages; row index is dropped
    for frame, parquet_path in zip(frames, (EIS_RECORD_FILE, DOC_RECORD_FILE)):
        frame.to_parquet(parquet_path, index=False)

    logger.info(f"Saved {len(eis_df)} EIS records and {len(doc_df)} document records")

In [7]:
def fetch_all_records(start_year: int = START_YEAR, end_year: int = END_YEAR,
                      overwrite: bool = False, save_interval: int = 5):
    """
    Fetch all EIS records from the API, one calendar year at a time.

    The API has a 500 record limit per query. Whenever a date range returns
    that many records, the range is bisected and both halves are re-queried
    (repeatedly, down to single days if necessary) so that no records are
    silently dropped. Every request is followed by a REQUEST_DELAY pause.

    Records are de-duplicated on ``eisId``: an ID that is already known
    (loaded from disk in incremental mode, or seen earlier in this run) is
    skipped.

    Args:
        start_year: First year to fetch
        end_year: Last year to fetch
        overwrite: If True, fetch all records regardless of existing data.
                   If False, only fetch records not already in the database.
        save_interval: Save progress every N years; 0 disables the periodic
                       saves (the final save still happens)

    Returns:
        Tuple of (eis_records_df, doc_records_df)
    """
    API_LIMIT = 500  # EPA API returns max 500 records per query
    DATE_FMT = "%m/%d/%Y"

    # Load existing records
    existing_eis_df, existing_doc_df = load_existing_records()

    # An existing table without an eisId column cannot be used for
    # de-duplication, so it is treated like "no existing data".
    have_existing = existing_eis_df is not None and "eisId" in existing_eis_df.columns

    if overwrite or not have_existing:
        all_eis_records = []
        all_doc_records = []
        existing_ids = set()
    else:
        all_eis_records = existing_eis_df.to_dict('records')
        all_doc_records = existing_doc_df.to_dict('records') if existing_doc_df is not None else []
        existing_ids = set(existing_eis_df['eisId'].astype(str))
        logger.info(f"Starting with {len(existing_ids)} existing records")

    years = list(range(end_year, start_year - 1, -1))  # Most recent first

    for i, year in enumerate(tqdm(years, desc="Fetching years")):
        # Start with full year query; sub-ranges are queued here when needed
        date_ranges = [(f"01/01/{year}", f"12/31/{year}")]

        year_records = []

        while date_ranges:
            start_date, end_date = date_ranges.pop(0)

            logger.info(f"Fetching records for {start_date} to {end_date}...")
            records = search_eis_by_date_range(start_date, end_date) or []

            # Pause after every request, including ones whose results are
            # discarded because the range has to be split.
            time.sleep(REQUEST_DELAY)

            # A full page means the result may have been truncated
            if len(records) >= API_LIMIT:
                start_dt = datetime.strptime(start_date, DATE_FMT)
                end_dt = datetime.strptime(end_date, DATE_FMT)
                span_days = (end_dt - start_dt).days

                # Any range covering at least two calendar days can be split
                if span_days >= 1:
                    logger.warning(f"Hit API limit ({API_LIMIT}) for {start_date}-{end_date}, splitting into sub-ranges")
                    mid_dt = start_dt + timedelta(days=span_days // 2)
                    # First half: start to mid (inclusive)
                    date_ranges.append((start_date, mid_dt.strftime(DATE_FMT)))
                    # Second half: day after mid to end
                    next_day = mid_dt + timedelta(days=1)
                    date_ranges.append((next_day.strftime(DATE_FMT), end_date))
                    continue  # Don't process these results, fetch sub-ranges instead
                else:
                    logger.warning(f"Cannot split further for {start_date}-{end_date}, using {len(records)} records")

            year_records.extend(records)

        # Process collected records for this year
        new_count = 0
        for record in year_records:
            eis_id = str(record.get("eisId"))

            # In overwrite mode existing_ids starts empty, so this only drops
            # IDs already collected earlier in the same run.
            if eis_id in existing_ids:
                continue

            # Flatten and store EIS record
            all_eis_records.append(flatten_eis_record(record))
            existing_ids.add(eis_id)

            # Extract and store attachments
            all_doc_records.extend(extract_attachments(record))

            new_count += 1

        logger.info(f"Year {year}: Found {len(year_records)} records, {new_count} new")

        # Save progress periodically (skipped when save_interval is 0)
        if save_interval and (i + 1) % save_interval == 0:
            save_records(pd.DataFrame(all_eis_records), pd.DataFrame(all_doc_records))

        time.sleep(REQUEST_DELAY)

    # Final save
    eis_df = pd.DataFrame(all_eis_records)
    doc_df = pd.DataFrame(all_doc_records)
    save_records(eis_df, doc_df)

    return eis_df, doc_df

## Fetch Configuration

Configure the fetch settings before running.

In [None]:
# ============================================
# FETCH SETTINGS - MODIFY AS NEEDED
# ============================================

# True  -> discard what is on disk and re-fetch every record from scratch
# False -> incremental update: only records not already stored are added
OVERWRITE = False

# Inclusive range of years to query; narrow it to fetch only specific years
FETCH_START_YEAR = 1987  # Earliest year in the database
FETCH_END_YEAR = datetime.now().year  # Current year

# Number of years processed between progress saves
# (smaller = safer against interruption, slightly slower)
SAVE_INTERVAL = 1

print("=== Fetch Configuration ===")
print(f"  OVERWRITE: {OVERWRITE}")
print(f"  Year range: {FETCH_START_YEAR} to {FETCH_END_YEAR}")
print(f"  Save interval: every {SAVE_INTERVAL} years")
print()
print(
    "WARNING: OVERWRITE=True will re-fetch ALL records and replace existing data!"
    if OVERWRITE
    else "Mode: Incremental update (will skip existing records)"
)

## Run the Fetch

Execute the cell below to start fetching records. Progress will be saved incrementally.

In [9]:
# Kick off the fetch with the settings chosen in the configuration cell
eis_df, doc_df = fetch_all_records(
    FETCH_START_YEAR,
    FETCH_END_YEAR,
    overwrite=OVERWRITE,
    save_interval=SAVE_INTERVAL,
)

2026-01-27 15:22:16,928 - INFO - Loaded 16784 existing EIS records
2026-01-27 15:22:16,947 - INFO - Loaded 45704 existing document records


Fetching years:   0%|          | 0/40 [00:00<?, ?it/s]

2026-01-27 15:22:16,971 - INFO - Fetching records for 01/01/2026 to 12/31/2026...
2026-01-27 15:22:18,525 - INFO - Year 2026: Found 17 records, 17 new
2026-01-27 15:22:18,559 - INFO - Saved 17 EIS records and 273 document records
2026-01-27 15:22:19,070 - INFO - Fetching records for 01/01/2025 to 12/31/2025...
2026-01-27 15:22:26,844 - INFO - Year 2025: Found 172 records, 172 new
2026-01-27 15:22:26,877 - INFO - Saved 189 EIS records and 1938 document records
2026-01-27 15:22:27,389 - INFO - Fetching records for 01/01/2024 to 12/31/2024...
2026-01-27 15:22:35,837 - INFO - Year 2024: Found 238 records, 238 new
2026-01-27 15:22:35,863 - INFO - Saved 427 EIS records and 4436 document records
2026-01-27 15:22:36,369 - INFO - Fetching records for 01/01/2023 to 12/31/2023...
2026-01-27 15:22:39,764 - INFO - Year 2023: Found 182 records, 182 new
2026-01-27 15:22:39,798 - INFO - Saved 609 EIS records and 6069 document records
2026-01-27 15:22:40,310 - INFO - Fetching records for 01/01/2022 to 

In [10]:
# Headline counts for both tables
print("\n=== Summary ===")
print(f"Total EIS records: {len(eis_df)}")
print(f"Total document records: {len(doc_df)}")

# Derive the year from the first four characters of ceqNumber
# (more reliable than federalRegisterReportDate); this adds a column to eis_df
eis_df['year'] = eis_df['ceqNumber'].astype(str).str.slice(stop=4)

print("\nEIS records by year:")
year_counts = eis_df['year'].value_counts().sort_index()
print(year_counts)

# Years at or above the API's 500-record page size are listed so they can be
# checked for truncation; with range splitting in the fetch they should be complete
potential_truncated = year_counts.loc[year_counts >= 500]
if not potential_truncated.empty:
    print("\nNote: The following years have 500+ records. With the API limit handling,")
    print("these should now be complete. If you see exactly 500, re-run the fetch.")
    print(potential_truncated)


=== Summary ===
Total EIS records: 16784
Total document records: 45704

EIS records by year:
year
1987    431
1988    435
1989    370
1990    477
1991    457
1992    512
1993    466
1994    530
1995    606
1996    603
1997    498
1998    527
1999    502
2000    474
2001    496
2002    535
2003    594
2004    599
2005    557
2006    550
2007    563
2008    550
2009    455
2010    497
2011    443
2012    418
2013    398
2014    393
2015    379
2016    330
2017    248
2018    336
2019    302
2020    265
2021    186
2022    193
2023    182
2024    238
2025    189
Name: count, dtype: int64

Note: The following years have 500+ records. With the API limit handling,
these should now be complete. If you see exactly 500, re-run the fetch.
year
1992    512
1994    530
1995    606
1996    603
1998    527
1999    502
2002    535
2003    594
2004    599
2005    557
2006    550
2007    563
2008    550
Name: count, dtype: int64


In [11]:
# Show the first ten EIS records, limited to the most informative columns
print("\n=== EIS Records Sample ===")
display(
    eis_df.loc[
        :,
        ['eisId', 'title', 'leadAgency', 'states', 'federalRegisterReportDate', 'attachmentCount'],
    ].head(10)
)


=== EIS Records Sample ===


Unnamed: 0,eisId,title,leadAgency,states,federalRegisterReportDate,attachmentCount
0,531723,Lower Missouri River Flood Risk and Resiliency...,U.S. Army Corps of Engineers,"MO, IA, KS, NE",01/23/2026,14
1,543266,Chesapeake Bay Crossing Study Tier 2 EIS,Federal Highway Administration,MD,01/23/2026,50
2,544683,Draft Supplemental Environmental Impact Statem...,Federal Energy Regulatory Commission,"OR, ID",01/23/2026,2
3,545169,Rattlesnake Creek Watershed Plan-EIS,Natural Resource Conservation Service,KS,01/23/2026,1
4,545269,South Revilla Integrated Resource Project,Forest Service,AK,01/23/2026,4
5,545421,Blue Water Offshore Port Deepwater Port Applic...,Maritime Administration,TX,01/23/2026,29
6,546302,ADOPTION - Warrior Met Coal Mines,Office of Surface Mining Reclamation and Enfor...,AL,01/23/2026,14
7,543136,Master Plan and Installation Development at Ne...,United States Air Force,NV,01/16/2026,3
8,543233,Austin Light Rail Phase 1 Project,Federal Transit Administration,TX,01/16/2026,64
9,543261,Bakersfield Field Office Oil and Gas Leasing a...,Bureau of Land Management,CA,01/16/2026,1


In [12]:
# Display the EIS table, including the `year` column added in the summary cell
eis_df

Unnamed: 0,eisId,title,ceqNumber,type,filedDate,commentLetterDate,federalRegisterReportDate,uniqueIdentificationNumber,leadAgency,dueDate,...,noticeOfIntent,rating,status,states,primaryState,cooperatingAgencies,attachmentCount,_attachments_json,_zipLinkMetadata_json,year
0,531723,Lower Missouri River Flood Risk and Resiliency...,20250186,Draft,01/15/2026,,01/23/2026,PEIS-202-00-G5P-1728988668,U.S. Army Corps of Engineers,03/09/2026,...,04/28/2025,,Signed,"MO, IA, KS, NE",MO,"EPA, FEMA, USFWS, NRCS, USACE",14,"[{""id"": 544805, ""name"": ""LoMo FRR Comprehensiv...","{""eisDocument"": [{""set"": 1, ""eisId"": 531723, ""...",2025
1,543266,Chesapeake Bay Crossing Study Tier 2 EIS,20250190,Draft,01/21/2026,,01/23/2026,EISX-021-15-XMD-1729253019,Federal Highway Administration,03/09/2026,...,11/15/2024,,Signed,MD,MD,"Maryland Transportation Authority (MDTA), Stat...",50,"[{""id"": 545002, ""name"": ""BCST2_Draft EIS_Janua...","{""eisDocument"": [{""set"": 1, ""eisId"": 543266, ""...",2025
2,544683,Draft Supplemental Environmental Impact Statem...,20250185,Draft Supplement,01/14/2026,,01/23/2026,,Federal Energy Regulatory Commission,03/02/2026,...,06/13/2022,,Signed,"OR, ID",OR,FERC,2,"[{""id"": 544690, ""name"": ""Hells Canyon Draft SE...","{""eisDocument"": [{""set"": 1, ""eisId"": 544683, ""...",2025
3,545169,Rattlesnake Creek Watershed Plan-EIS,20250191,Final,01/21/2026,,01/23/2026,EISX-005-53-000-1768489172,Natural Resource Conservation Service,02/23/2026,...,12/28/2023,,Signed,KS,KS,"Kansas Department of Health and Environment\r,...",1,"[{""id"": 546408, ""name"": ""Rattlesnake Creek Wat...","{""eisDocument"": [{""set"": 1, ""eisId"": 545169, ""...",2025
4,545269,South Revilla Integrated Resource Project,20250187,Final,01/16/2026,,01/23/2026,,Forest Service,03/09/2026,...,08/08/2018,,Signed,AK,AK,"Ketchikan Indian Community, USFS",4,"[{""id"": 545274, ""name"": ""1_SRevilla_FEIS_Jan.p...","{""eisDocument"": [{""set"": 1, ""eisId"": 545269, ""...",2025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16779,78828,Blankerbaker Road/I-64 Interchange Improvement...,19870003,Final,01/07/1987,03/04/1987,01/16/1987,,Federal Highway Administration,02/18/1987,...,,EC2,Signed,KY,KY,FHWA,0,,,1987
16780,83301,Lower San Joaquin River and Tributaries Flood ...,19870009,Revised Draft,01/09/1987,03/06/1987,01/16/1987,,U.S. Army Corps of Engineers,03/02/1987,...,,EU3,Signed,CA,CA,USACE,0,,,1987
16781,83553,Metropolitan Denver Water Supply Project Two ...,19870010,Draft,01/09/1987,04/23/1987,01/16/1987,,U.S. Army Corps of Engineers,04/23/1987,...,,EU3,Signed,CO,CO,USACE,1,"[{""id"": 281687, ""name"": ""19870010.pdf"", ""title...","{""commentLetter"": [{""eisId"": 83553, ""type"": ""c...",1987
16782,83709,Elliott Bay Small Craft Harbor/Marina Developm...,19870001,Final,01/05/1987,01/01/1970,01/16/1987,,U.S. Army Corps of Engineers,02/17/1987,...,,ND,Signed,WA,WA,USACE,0,,,1987


In [13]:
# Show the first ten document/attachment records, limited to key columns
print("\n=== Document Records Sample ===")
display(
    doc_df.loc[
        :,
        ['eisId', 'ceqNumber', 'attachmentId', 'name', 'type', 'sizeKb', 'pages'],
    ].head(10)
)


=== Document Records Sample ===


Unnamed: 0,eisId,ceqNumber,attachmentId,name,type,sizeKb,pages
0,531723,20250186,544805,LoMo FRR Comprehensive Study Draft Report.pdf,EIS_Document,19092,383.0
1,531723,20250186,544810,LoMo System Plan - Basis of Estimate.pdf,EIS_Document,2184,48.0
2,531723,20250186,544815,Appendix A.1 LoMo FRM Past Performance Assessm...,EIS_Document,4410,124.0
3,531723,20250186,544820,Appendix A.2.1 LoMo RAS Calibration Omaha Dist...,EIS_Document,21459,162.0
4,531723,20250186,545125,Appendix A.2.2 LoMo RAS Calibration Kansas Cit...,EIS_Document,13265,126.0
5,531723,20250186,545130,Appendix A.3 Attachment Modeling Alternative a...,EIS_Document,40183,302.0
6,531723,20250186,545135,APPENDIX A.3 Modeling Alternatives and Results...,EIS_Document,14369,157.0
7,531723,20250186,545140,APPENDIX A.4 Resiliency.pdf,EIS_Document,2192,33.0
8,531723,20250186,545145,APPENDIX B Cost.pdf,EIS_Document,606,17.0
9,531723,20250186,545150,APPENDIX C Environmental.pdf,EIS_Document,4736,97.0


## Export to CSV (Optional)

For compatibility with R or other tools.

In [14]:
# Export to CSV
#eis_df.to_csv(METADATA_DIR / "eis_record_api.csv", index=False)
#doc_df.to_csv(METADATA_DIR / "eis_document_record_api.csv", index=False)
#print(f"Exported to CSV files in {METADATA_DIR}")

## Utility: Fetch Single Record by ID

Useful for checking specific records or debugging.

In [15]:
# Example: Fetch a single record by EIS ID (the `eisId` value, e.g. 531723 — not the 8-digit CEQ number)
# record = search_eis_by_id("531723")
# if record:
#     print(json.dumps(record, indent=2, default=str))