# Fetch EIS Records via EPA E-NEPA API

This notebook replaces the web scraping approach (`scrape_record_set_V2.R` and `scrape_record_details_V2.R`) with the official EPA E-NEPA API.

**API Documentation:** https://cdxapps.epa.gov/cdx-enepa-II/apidocs/index.html

## Outputs
- `eis_record_api.parquet` - All EIS records with full metadata
- `eis_document_record_api.parquet` - All document/attachment records

Both tables are also saved as `.pkl` (pickle) files for fast reloading in Python. An optional CSV export for R and other tools is provided near the end of this notebook.

In [None]:
# Install required packages if needed
# !pip install requests pandas pyarrow tqdm

In [None]:
import requests
import pandas as pd
import json
import time
import os
from datetime import datetime, timedelta
from pathlib import Path
from tqdm.notebook import tqdm
import logging

# Send INFO-and-above messages to the notebook output, prefixed with a
# timestamp and the level name, and create the logger used by the cells below.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

In [None]:
# --- API endpoints ---------------------------------------------------------
BASE_URL = "https://cdxapps.epa.gov/cdx-enepa-II/rest"
SEARCH_ENDPOINT = f"{BASE_URL}/public/v1/eis/search"

# --- Output locations ------------------------------------------------------
# REPO_ROOT is the parent of the current working directory, so the notebook
# is expected to be run from a folder directly below the repository root.
REPO_ROOT = Path("../").resolve()
METADATA_DIR = REPO_ROOT.joinpath("metadata")
METADATA_DIR.mkdir(exist_ok=True)  # no error if the directory is already there

# One parquet file and one pickle file per table
EIS_RECORD_FILE = METADATA_DIR.joinpath("eis_record_api.parquet")
EIS_RECORD_PKL = METADATA_DIR.joinpath("eis_record_api.pkl")
DOC_RECORD_FILE = METADATA_DIR.joinpath("eis_document_record_api.parquet")
DOC_RECORD_PKL = METADATA_DIR.joinpath("eis_document_record_api.pkl")

# --- Fetch window (inclusive) ----------------------------------------------
START_YEAR = 1987
END_YEAR = datetime.now().year

# --- Rate limiting ---------------------------------------------------------
REQUEST_DELAY = 0.5  # pause, in seconds, between consecutive API requests

print(
    f"Repository root: {REPO_ROOT}",
    f"Metadata directory: {METADATA_DIR}",
    f"Will fetch records from {START_YEAR} to {END_YEAR}",
    sep="\n",
)

## API Helper Functions

In [None]:
def _request_search_results(params: dict, max_retries: int, context: str = "") -> tuple:
    """
    GET the search endpoint, retrying with exponential backoff on failure.

    Args:
        params: Query-string parameters sent to the search endpoint
        max_retries: Maximum number of attempts before giving up
        context: Optional text appended to "failed" in the warning message
                 to identify the request (e.g. " for EIS 20240001")

    Returns:
        Tuple of (succeeded, payload). ``payload`` is the decoded JSON body
        when ``succeeded`` is True and None otherwise.
    """
    for attempt in range(max_retries):
        try:
            response = requests.get(SEARCH_ENDPOINT, params=params, timeout=60)
            response.raise_for_status()
            return True, response.json()
        # ValueError is included because requests releases before 2.27 raise a
        # plain ValueError (not a RequestException) when the body is not JSON.
        except (requests.exceptions.RequestException, ValueError) as e:
            logger.warning(f"Attempt {attempt + 1}/{max_retries} failed{context}: {e}")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # Exponential backoff: 1s, 2s, 4s, ...
    return False, None


def search_eis_by_date_range(start_date: str, end_date: str, max_retries: int = 3) -> list:
    """
    Search for EIS records within a date range.
    
    Args:
        start_date: Start date in MM/dd/yyyy format
        end_date: End date in MM/dd/yyyy format
        max_retries: Number of retry attempts on failure
    
    Returns:
        List of EIS record dictionaries as decoded from the API response.
        An empty list is returned (and an error is logged) when every
        attempt failed, so callers cannot distinguish "no records" from
        "request failed" by the return value alone.
    """
    params = {
        "startFRDate": start_date,
        "endFRDate": end_date
    }

    succeeded, payload = _request_search_results(params, max_retries)
    if not succeeded:
        logger.error(f"Failed to fetch records for {start_date} to {end_date}")
        return []
    return payload


def search_eis_by_id(eis_id: str, max_retries: int = 3) -> dict:
    """
    Search for a specific EIS record by ID.
    
    Args:
        eis_id: The EIS ID (8-digit number)
        max_retries: Number of retry attempts on failure
    
    Returns:
        The first record in the API response, or None when the response is
        empty or every attempt failed
    """
    succeeded, results = _request_search_results(
        {"eisId": eis_id}, max_retries, context=f" for EIS {eis_id}"
    )
    return results[0] if succeeded and results else None

In [None]:
def flatten_eis_record(record: dict) -> dict:
    """
    Flatten a nested EIS record into a flat dictionary for DataFrame storage.

    Scalar fields are copied through unchanged (missing keys become None).
    The nested ``states``, ``cooperatingAgency`` and ``attachments`` lists are
    summarised into strings/counts, and the raw ``attachments`` and
    ``zipLinkMetadata`` values are kept as JSON text for later processing.
    Nested lists that are missing *or* explicitly null are treated as empty.

    Args:
        record: Raw EIS record from API

    Returns:
        Flattened dictionary
    """
    # Copied verbatim, in this order (the order becomes the column order).
    # Key spellings such as "ammended..." are kept exactly as the record
    # uses them -- presumably matching the API payload; verify before renaming.
    scalar_fields = (
        "eisId",
        "title",
        "ceqNumber",
        "type",
        "filedDate",
        "commentLetterDate",
        "federalRegisterReportDate",
        "uniqueIdentificationNumber",
        "leadAgency",
        "dueDate",
        "ammendedNoticeDate",
        "ammendedNoticeText",
        "supplementalInformation",
        "noticeOfIntent",
        "rating",
        "status",
    )
    flat = {field: record.get(field) for field in scalar_fields}

    # `.get(key, [])` would still hand back None for a key that is present
    # with a null value, so normalise with `or []` before iterating.
    states = record.get("states") or []
    flat["states"] = ", ".join(s.get("name") or "" for s in states) if states else None
    # Name of the first state flagged as primary, if any
    flat["primaryState"] = next((s.get("name") for s in states if s.get("primary")), None)

    # Extract cooperating agencies
    coop_agencies = record.get("cooperatingAgency") or []
    flat["cooperatingAgencies"] = (
        ", ".join(a.get("name") or "" for a in coop_agencies) if coop_agencies else None
    )

    # Count attachments
    attachments = record.get("attachments") or []
    flat["attachmentCount"] = len(attachments)

    # Store raw JSON for attachments and zipLinkMetadata for later processing
    flat["_attachments_json"] = json.dumps(attachments) if attachments else None
    zip_link_metadata = record.get("zipLinkMetadata")
    flat["_zipLinkMetadata_json"] = json.dumps(zip_link_metadata) if zip_link_metadata else None

    return flat


def extract_attachments(record: dict) -> list:
    """
    Extract attachment/document records from an EIS record.

    Each attachment becomes one flat dictionary tagged with the parent
    record's ``eisId``. A missing or null ``attachments`` value yields an
    empty list.

    Args:
        record: Raw EIS record from API

    Returns:
        List of attachment dictionaries
    """
    eis_id = record.get("eisId")
    # `or []` also covers a key that is present with a null value, which
    # `.get("attachments", [])` would pass through as None.
    attachments = record.get("attachments") or []

    return [
        {
            "eisId": eis_id,
            "attachmentId": att.get("id"),
            "name": att.get("name"),
            "title": att.get("title"),
            "fileNameForDownload": att.get("fileNameForDownload"),
            "type": att.get("type"),
            "size": att.get("size"),
            "sizeKb": att.get("sizeKb"),
            "pages": att.get("pages"),
        }
        for att in attachments
    ]

## Fetch All Records

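The API is queried one calendar year at a time (most recent year first) to keep response sizes manageable. Results are accumulated across years; progress is saved periodically (see `SAVE_INTERVAL`) and once more after the last year has been processed.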

In [None]:
def load_existing_records():
    """
    Load previously saved records from their pickle files, if present.

    The EIS table and the document table are loaded independently: a file
    that is missing, or that cannot be read, yields None for that table
    only (a warning is logged for unreadable files).

    Returns:
        Tuple of (eis_records_df, doc_records_df); either element may be None
    """
    def _read_pickle_if_present(path, label):
        # Returns the unpickled object, or None if the file is absent/unreadable
        if not path.exists():
            return None
        frame = None
        try:
            frame = pd.read_pickle(path)
            logger.info(f"Loaded {len(frame)} existing {label} records")
        except Exception as e:
            # Best effort: an unreadable file is treated as "no existing data"
            logger.warning(f"Could not load existing {label} records: {e}")
        return frame

    # EIS table first, then documents
    return (
        _read_pickle_if_present(EIS_RECORD_PKL, "EIS"),
        _read_pickle_if_present(DOC_RECORD_PKL, "document"),
    )


def save_records(eis_df: pd.DataFrame, doc_df: pd.DataFrame):
    """
    Write both tables to disk, once as pickle and once as parquet.

    Args:
        eis_df: EIS record table
        doc_df: Document/attachment record table
    """
    targets = (
        (eis_df, EIS_RECORD_PKL, EIS_RECORD_FILE),
        (doc_df, DOC_RECORD_PKL, DOC_RECORD_FILE),
    )

    # Both pickles are written before any parquet file, so the pickles exist
    # even if the parquet step raises.
    for frame, pickle_path, _ in targets:
        frame.to_pickle(pickle_path)

    # Parquet copies, written without the DataFrame index
    for frame, _, parquet_path in targets:
        frame.to_parquet(parquet_path, index=False)

    logger.info(f"Saved {len(eis_df)} EIS records and {len(doc_df)} document records")

In [None]:
def fetch_all_records(start_year: int = START_YEAR, end_year: int = END_YEAR,
                      overwrite: bool = False, save_interval: int = 5):
    """
    Fetch all EIS records from the API, one calendar year per request.

    Years are processed from ``end_year`` down to ``start_year``. Records are
    de-duplicated by ``eisId`` (compared as strings): in incremental mode
    against the previously saved records, and in every mode against records
    already collected during this run.

    Args:
        start_year: First year to fetch
        end_year: Last year to fetch
        overwrite: If True, ignore previously saved data and rebuild both
                   tables from the API.
                   If False, only fetch records not already in the database.
        save_interval: Save progress every N years. A periodic save is skipped
                       when no new records were collected since the last one;
                       0 disables periodic saves. A final save always happens.

    Returns:
        Tuple of (eis_records_df, doc_records_df)
    """
    # Load existing records
    existing_eis_df, existing_doc_df = load_existing_records()

    if overwrite or existing_eis_df is None:
        all_eis_records = []
        all_doc_records = []
        seen_ids = set()
    else:
        all_eis_records = existing_eis_df.to_dict('records')
        if existing_doc_df is not None:
            all_doc_records = existing_doc_df.to_dict('records')
        else:
            all_doc_records = []
            # Existing EIS ids are skipped below, so their attachments are
            # never re-extracted; make that visible instead of failing silently.
            logger.warning(
                "Existing EIS records were loaded without document records; "
                "attachments for those records will not be fetched in incremental mode "
                "(use overwrite=True to rebuild both tables)"
            )
        seen_ids = set(existing_eis_df['eisId'].astype(str))
        logger.info(f"Starting with {len(seen_ids)} existing records")

    def _save_progress():
        # Build both tables from everything collected so far and persist them
        eis_df = pd.DataFrame(all_eis_records)
        doc_df = pd.DataFrame(all_doc_records)
        save_records(eis_df, doc_df)
        return eis_df, doc_df

    years = list(range(end_year, start_year - 1, -1))  # Most recent first
    unsaved_count = 0  # new records collected since the last save

    for i, year in enumerate(tqdm(years, desc="Fetching years")):
        start_date = f"01/01/{year}"
        end_date = f"12/31/{year}"

        logger.info(f"Fetching records for {year}...")
        records = search_eis_by_date_range(start_date, end_date)

        new_count = 0
        for record in records:
            eis_id = str(record.get("eisId"))

            # Skip ids that were already stored (from disk or earlier in this run)
            if eis_id in seen_ids:
                continue
            seen_ids.add(eis_id)

            # One flattened row per EIS record, plus one row per attachment
            all_eis_records.append(flatten_eis_record(record))
            all_doc_records.extend(extract_attachments(record))

            new_count += 1

        logger.info(f"Year {year}: Found {len(records)} records, {new_count} new")
        unsaved_count += new_count

        # Save progress periodically; `save_interval and ...` avoids a
        # ZeroDivisionError when periodic saves are disabled with 0.
        if save_interval and (i + 1) % save_interval == 0 and unsaved_count:
            _save_progress()
            unsaved_count = 0

        time.sleep(REQUEST_DELAY)  # rate limiting between API requests

    # Final save (always performed, even if nothing new was found)
    return _save_progress()

## Fetch Configuration

Configure the fetch settings before running.

In [None]:
# ============================================
# FETCH SETTINGS - MODIFY AS NEEDED
# ============================================

# True  -> re-fetch ALL records from scratch (existing data is replaced)
# False -> incremental update: only records not already stored are added
OVERWRITE = False

# Inclusive year range to fetch; narrow it to limit the run to specific years
FETCH_START_YEAR = 1987  # Earliest year in the database
FETCH_END_YEAR = datetime.now().year  # Current year

# Progress is saved every SAVE_INTERVAL years.
# Smaller values save more often: safer, but slightly slower.
SAVE_INTERVAL = 5

# Echo the active settings, followed by a blank line
print(
    "=== Fetch Configuration ===",
    f"  OVERWRITE: {OVERWRITE}",
    f"  Year range: {FETCH_START_YEAR} to {FETCH_END_YEAR}",
    f"  Save interval: every {SAVE_INTERVAL} years",
    "",
    sep="\n",
)
print(
    "WARNING: OVERWRITE=True will re-fetch ALL records and replace existing data!"
    if OVERWRITE
    else "Mode: Incremental update (will skip existing records)"
)

## Run the Fetch

Execute the cell below to start fetching records. Progress will be saved incrementally.

In [None]:
# Start the fetch with the settings chosen in the configuration cell above.
# Returns the EIS table and the document table as DataFrames.
eis_df, doc_df = fetch_all_records(
    FETCH_START_YEAR,
    FETCH_END_YEAR,
    overwrite=OVERWRITE,
    save_interval=SAVE_INTERVAL,
)

In [None]:
# Report overall record counts, then the number of EIS records per year
print("\n=== Summary ===")
print(f"Total EIS records: {len(eis_df)}")
print(f"Total document records: {len(doc_df)}")
print("\nEIS records by year:")

# The year is taken from the first four characters of the EIS ID
# (assumes IDs start with a four-digit year -- TODO confirm).
# NOTE: this adds a 'year' column to eis_df itself, so the column is present
# in anything written from eis_df afterwards (e.g. a later CSV export).
eis_df['year'] = eis_df['eisId'].astype(str).str.slice(0, 4)
print(eis_df['year'].value_counts().sort_index())

In [None]:
# Show the first ten EIS records, restricted to a subset of columns
print("\n=== EIS Records Sample ===")
display(
    eis_df.loc[
        :,
        ['eisId', 'title', 'leadAgency', 'states', 'federalRegisterReportDate', 'attachmentCount'],
    ].head(10)
)

In [None]:
# Show the first ten document records, restricted to a subset of columns
print("\n=== Document Records Sample ===")
display(
    doc_df.loc[
        :,
        ['eisId', 'attachmentId', 'name', 'type', 'sizeKb', 'pages'],
    ].head(10)
)

## Export to CSV (Optional)

For compatibility with R or other tools.

In [None]:
# Write CSV copies of both tables into the metadata directory (no index column)
eis_df.to_csv(METADATA_DIR.joinpath("eis_record_api.csv"), index=False)
doc_df.to_csv(METADATA_DIR.joinpath("eis_document_record_api.csv"), index=False)
print(f"Exported to CSV files in {METADATA_DIR}")

## Utility: Fetch Single Record by ID

Useful for checking specific records or debugging.

In [None]:
# Example: Fetch a single record by EIS ID
# record = search_eis_by_id("20240001")
# if record:
#     print(json.dumps(record, indent=2, default=str))