# Download EIS Documents via EPA E-NEPA API

This notebook downloads EIS documents using the official EPA E-NEPA API, replacing the web scraping approach in `download_eis_files_2024.R`.

**Key Features:**
- Downloads individual files using `attachmentId` (no need for ZIP bundles)
- `overwrite` toggle to skip existing files or re-download everything
- Interfaces with existing file structure (`documents/{YEAR}/`)
- Maintains naming convention: `{EIS_ID}_{filename}`
- Handles symbolic links to Box storage transparently

**API Documentation:** https://cdxapps.epa.gov/cdx-enepa-II/apidocs/index.html

In [32]:
# Install required packages if needed
# !pip install requests pandas tqdm pyarrow

In [60]:
import requests
import pandas as pd
import os
import re
import time
from pathlib import Path
from tqdm.notebook import tqdm
import logging
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [62]:
# Configuration
BASE_URL = "https://cdxapps.epa.gov/cdx-enepa-II/rest"
DOWNLOAD_ENDPOINT = f"{BASE_URL}/public/v1/eis/document/download"

# Paths - relative to repository root
REPO_ROOT = Path("../").resolve()
METADATA_DIR = REPO_ROOT / "metadata"
DOCUMENTS_DIR = REPO_ROOT / "documents"  # May be a symlink to Box

# Input file (from fetch_eis_records_api.ipynb)
DOC_RECORD_FILE = METADATA_DIR / "eis_document_record_api.pkl"
DOC_RECORD_PARQUET = METADATA_DIR / "eis_document_record_api.parquet"

# Download tracking file
DOWNLOAD_STATUS_FILE = METADATA_DIR / "download_status_api.pkl"

# Rate limiting
REQUEST_DELAY = 0.25  # seconds between requests
MAX_WORKERS = 4  # parallel downloads (be respectful to the API)

print(f"Repository root: {REPO_ROOT}")
print(f"Documents directory: {DOCUMENTS_DIR}")
print(f"Is symlink: {DOCUMENTS_DIR.is_symlink()}")
if DOCUMENTS_DIR.is_symlink():
    print(f"Symlink target: {DOCUMENTS_DIR.resolve()}")

Repository root: /Users/admin-tascott/Documents/GitHub/eis_documents/enepa_repository
Documents directory: /Users/admin-tascott/Documents/GitHub/eis_documents/enepa_repository/documents
Is symlink: True
Symlink target: /Users/admin-tascott/Library/CloudStorage/Box-Box/eis_documents/enepa_repository/documents


## Helper Functions

In [64]:
def sanitize_filename(filename: str) -> str:
    """
    Sanitize filename to match existing convention.
    - Remove special characters: ( ) & , ~
    - Replace spaces with underscores
    - Normalize PDF extension
    
    Args:
        filename: Original filename
    
    Returns:
        Sanitized filename
    """
    # Remove problematic characters
    clean = re.sub(r'[()&,~\/]', '', filename)
    # Replace multiple spaces/underscores with single underscore
    clean = re.sub(r'[\s_]+', '_', clean)
    # Normalize PDF extension
    clean = re.sub(r'\.PDF$', '.pdf', clean, flags=re.IGNORECASE)
    clean = re.sub(r'\.pdf\.pdf$', '.pdf', clean, flags=re.IGNORECASE)
    # Remove leading/trailing underscores
    clean = clean.strip('_')
    return clean


def get_year_from_eis_id(ceq_number) -> str:
    """
    Extract year from CEQ Number (first 4 digits).
    
    Args:
        ceq_number: CEQ Number (e.g., 20240001)
    
    Returns:
        Year string (e.g., "2024")
    """
    return str(ceq_number)[:4]


def build_local_filename(ceq_number, original_filename: str) -> str:
    """
    Build the local filename following existing convention.
    Format: {CEQ_NUMBER}_{sanitized_filename}
    
    Note: This prepends the CEQ NUMBER even if it's already in the filename,
    matching the existing behavior.
    
    Args:
        ceq_number: CEQ NUMBER
        original_filename: Original filename from API
    
    Returns:
        Local filename
    """
    sanitized = sanitize_filename(original_filename)
    return f"{ceq_number}_{sanitized}"


def get_existing_files(documents_dir: Path) -> set:
    """
    Get set of all existing files (by filename only, not full path).
    
    Args:
        documents_dir: Path to documents directory
    
    Returns:
        Set of existing filenames
    """
    existing = set()
    
    if not documents_dir.exists():
        return existing
    
    for year_dir in documents_dir.iterdir():
        if year_dir.is_dir() and year_dir.name.isdigit():
            for file in year_dir.iterdir():
                if file.is_file():
                    existing.add(file.name)
    
    return existing

In [66]:
def download_attachment(attachment_id: int, dest_path: Path, max_retries: int = 3) -> dict:
    """
    Download a single attachment by ID.
    
    Args:
        attachment_id: The attachment ID from the API
        dest_path: Full path where file should be saved
        max_retries: Number of retry attempts
    
    Returns:
        Dict with download status: {success, size, error}
    """
    params = {"attachmentId": attachment_id}
    
    for attempt in range(max_retries):
        try:
            response = requests.get(
                DOWNLOAD_ENDPOINT, 
                params=params, 
                timeout=120,
                stream=True
            )
            response.raise_for_status()
            
            # Ensure parent directory exists
            dest_path.parent.mkdir(parents=True, exist_ok=True)
            
            # Write file
            with open(dest_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
            
            file_size = dest_path.stat().st_size
            
            # Check for empty file
            if file_size == 0:
                dest_path.unlink()  # Remove empty file
                return {"success": False, "size": 0, "error": "Empty file received"}
            
            return {"success": True, "size": file_size, "error": None}
            
        except requests.exceptions.RequestException as e:
            logger.warning(f"Attempt {attempt + 1}/{max_retries} failed for attachment {attachment_id}: {e}")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # Exponential backoff
            else:
                return {"success": False, "size": 0, "error": str(e)}
    
    return {"success": False, "size": 0, "error": "Max retries exceeded"}

## Load Document Records

In [68]:
def load_document_records():
    """
    Load document records from the fetch notebook output.
    """
    if DOC_RECORD_FILE.exists():
        return pd.read_pickle(DOC_RECORD_FILE)
    elif DOC_RECORD_PARQUET.exists():
        return pd.read_parquet(DOC_RECORD_PARQUET)
    else:
        raise FileNotFoundError(
            f"Document records not found. Run fetch_eis_records_api.ipynb first.\n"
            f"Expected: {DOC_RECORD_FILE} or {DOC_RECORD_PARQUET}"
        )


def load_download_status():
    """
    Load existing download status tracking.
    """
    if DOWNLOAD_STATUS_FILE.exists():
        return pd.read_pickle(DOWNLOAD_STATUS_FILE)
    return pd.DataFrame(columns=['attachmentId', 'ceqNumber', 'filename', 'downloaded', 'size', 'error', 'timestamp'])


def save_download_status(status_df: pd.DataFrame):
    """
    Save download status tracking.
    """
    status_df.to_pickle(DOWNLOAD_STATUS_FILE)
    status_df.to_csv(METADATA_DIR / "download_status_api.csv", index=False)

In [70]:
# Load document records
doc_df = load_document_records()
print(f"Loaded {len(doc_df)} document records")
display(doc_df.head())

Loaded 45704 document records


Unnamed: 0,eisId,ceqNumber,attachmentId,name,title,fileNameForDownload,type,size,sizeKb,pages
0,531723,20250186,544805,LoMo FRR Comprehensive Study Draft Report.pdf,LoMo FRR Comprehensive Study Draft Report,LoMo FRR Comprehensive Study Draft Report.pdf,EIS_Document,19550084,19092,383.0
1,531723,20250186,544810,LoMo System Plan - Basis of Estimate.pdf,LoMo System Plan - Basis of Estimate,LoMo System Plan - Basis of Estimate.pdf,EIS_Document,2235648,2184,48.0
2,531723,20250186,544815,Appendix A.1 LoMo FRM Past Performance Assessm...,Appendix A.1 LoMo FRM Past Performance Assessment,Appendix A.1 LoMo FRM Past Performance Assessm...,EIS_Document,4515420,4410,124.0
3,531723,20250186,544820,Appendix A.2.1 LoMo RAS Calibration Omaha Dist...,Appendix A.2.1 LoMo RAS Calibration Omaha Dist...,Appendix A.2.1 LoMo RAS Calibration Omaha Dist...,EIS_Document,21973806,21459,162.0
4,531723,20250186,545125,Appendix A.2.2 LoMo RAS Calibration Kansas Cit...,Appendix A.2.2 LoMo RAS Calibration Kansas Cit...,Appendix A.2.2 LoMo RAS Calibration Kansas Cit...,EIS_Document,13582918,13265,126.0


In [72]:
# Summary by year - use eisId if ceqNumber not available
if 'ceqNumber' in doc_df.columns:
    doc_df['year'] = doc_df['ceqNumber'].astype(str).str[:4]
else:
    doc_df['year'] = doc_df['eisId'].astype(str).str[:4]

print("\nDocuments by year:")
print(doc_df['year'].value_counts().sort_index())


Documents by year:
year
1987       2
1988       1
1990       2
1991       4
1992       3
1993       3
1994      10
1995      20
1996      59
1997      82
1998      58
1999     202
2000     369
2001     481
2002     596
2003     562
2004     631
2005     755
2006     712
2007     912
2008     911
2009     894
2010     661
2011     406
2012     991
2013    3007
2014    2918
2015    4104
2016    3683
2017    2697
2018    3239
2019    2865
2020    3280
2021    2309
2022    2206
2023    1633
2024    2498
2025    1938
Name: count, dtype: int64


## Download Configuration

In [74]:
# ============================================
# DOWNLOAD SETTINGS - MODIFY AS NEEDED
# ============================================

# Set to True to re-download all files, False to skip existing files
OVERWRITE = False

# Filter by year (set to None to download all years)
# Example: YEAR_FILTER = [2023, 2024] to only download 2023-2024
YEAR_FILTER = None

# Filter by document type (set to None for all types)
# Common types: 'pdf', 'PDF', etc.
TYPE_FILTER = None  # e.g., ['pdf', 'PDF']

# Maximum number of files to download (set to None for all)
# Useful for testing
MAX_DOWNLOADS = None  # e.g., 100

print(f"Settings:")
print(f"  OVERWRITE: {OVERWRITE}")
print(f"  YEAR_FILTER: {YEAR_FILTER}")
print(f"  TYPE_FILTER: {TYPE_FILTER}")
print(f"  MAX_DOWNLOADS: {MAX_DOWNLOADS}")

Settings:
  OVERWRITE: False
  YEAR_FILTER: None
  TYPE_FILTER: None
  MAX_DOWNLOADS: None


## Prepare Download Queue

In [76]:
def prepare_download_queue(doc_df: pd.DataFrame, documents_dir: Path,
                           overwrite: bool = False, year_filter: list = None,
                           type_filter: list = None) -> pd.DataFrame:
    """
    Prepare the download queue by determining which files need to be downloaded.
    
    Args:
        doc_df: Document records DataFrame
        documents_dir: Path to documents directory
        overwrite: If True, include all files. If False, skip existing.
        year_filter: Optional list of years to include
        type_filter: Optional list of file types to include
    
    Returns:
        DataFrame with download queue
    """
    queue = doc_df.copy()
    
    # Add year column
    queue['year'] = queue['eisId'].astype(str).str[:4]
    
    # Apply year filter
    if year_filter:
        year_filter_str = [str(y) for y in year_filter]
        queue = queue[queue['year'].isin(year_filter_str)]
        logger.info(f"Filtered to years {year_filter}: {len(queue)} documents")
    
    # Apply type filter
    if type_filter:
        # Check file extension
        queue['extension'] = queue['name'].str.lower().str.split('.').str[-1]
        type_filter_lower = [t.lower() for t in type_filter]
        queue = queue[queue['extension'].isin(type_filter_lower)]
        logger.info(f"Filtered to types {type_filter}: {len(queue)} documents")
    
    # Build local filenames
    queue['localFilename'] = queue.apply(
        lambda row: build_local_filename(row['eisId'], row['name'] or row['fileNameForDownload'] or f"{row['attachmentId']}.pdf"),
        axis=1
    )
    
    # Build full paths
    queue['localPath'] = queue.apply(
        lambda row: documents_dir / row['year'] / row['localFilename'],
        axis=1
    )
    
    # Check for existing files
    if not overwrite:
        queue['exists'] = queue['localPath'].apply(lambda p: p.exists())
        existing_count = queue['exists'].sum()
        logger.info(f"Found {existing_count} existing files")
        queue = queue[~queue['exists']]
        logger.info(f"Queue after removing existing: {len(queue)} documents")
    
    # Remove any rows with missing attachment IDs
    queue = queue[queue['attachmentId'].notna()]
    
    return queue

In [78]:
# Prepare the download queue
download_queue = prepare_download_queue(
    doc_df, 
    DOCUMENTS_DIR, 
    overwrite=OVERWRITE,
    year_filter=YEAR_FILTER,
    type_filter=TYPE_FILTER
)

# Apply max downloads limit
if MAX_DOWNLOADS and len(download_queue) > MAX_DOWNLOADS:
    download_queue = download_queue.head(MAX_DOWNLOADS)
    logger.info(f"Limited to {MAX_DOWNLOADS} downloads")

print(f"\nDownload queue: {len(download_queue)} files")
print(f"\nBy year:")
print(download_queue['year'].value_counts().sort_index())

2026-01-27 21:36:09,584 - INFO - Found 0 existing files
2026-01-27 21:36:09,593 - INFO - Queue after removing existing: 45704 documents



Download queue: 45704 files

By year:
year
1377    194
1520      3
1521     10
1523     55
1525      6
       ... 
8929    148
8930     80
8931    341
8932    143
8933    145
Name: count, Length: 3020, dtype: int64


## Execute Downloads

In [None]:
def download_files(queue: pd.DataFrame, parallel: bool = False) -> pd.DataFrame:
    """
    Download all files in the queue.
    
    Args:
        queue: Download queue DataFrame
        parallel: If True, use parallel downloads (be careful with rate limits)
    
    Returns:
        DataFrame with download status
    """
    results = []
    
    if parallel and MAX_WORKERS > 1:
        # Parallel downloads
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = {}
            for _, row in queue.iterrows():
                future = executor.submit(
                    download_attachment,
                    int(row['attachmentId']),
                    row['localPath']
                )
                futures[future] = row
            
            for future in tqdm(as_completed(futures), total=len(futures), desc="Downloading"):
                row = futures[future]
                result = future.result()
                results.append({
                    'attachmentId': row['attachmentId'],
                    'eisId': row['eisId'],
                    'filename': row['localFilename'],
                    'downloaded': result['success'],
                    'size': result['size'],
                    'error': result['error'],
                    'timestamp': datetime.now().isoformat()
                })
    else:
        # Sequential downloads
        for _, row in tqdm(queue.iterrows(), total=len(queue), desc="Downloading"):
            result = download_attachment(
                int(row['attachmentId']),
                row['localPath']
            )
            results.append({
                'attachmentId': row['attachmentId'],
                'eisId': row['eisId'],
                'filename': row['localFilename'],
                'downloaded': result['success'],
                'size': result['size'],
                'error': result['error'],
                'timestamp': datetime.now().isoformat()
            })
            time.sleep(REQUEST_DELAY)
    
    return pd.DataFrame(results)

In [None]:
# Run the downloads
if len(download_queue) > 0:
    print(f"Starting download of {len(download_queue)} files...")
    print(f"Using {'parallel' if MAX_WORKERS > 1 else 'sequential'} downloads")
    
    download_results = download_files(download_queue, parallel=(MAX_WORKERS > 1))
    
    # Merge with existing status
    existing_status = load_download_status()
    combined_status = pd.concat([existing_status, download_results], ignore_index=True)
    
    # Remove duplicates (keep latest)
    combined_status = combined_status.drop_duplicates(subset=['attachmentId'], keep='last')
    
    # Save status
    save_download_status(combined_status)
    
    # Summary
    success_count = download_results['downloaded'].sum()
    fail_count = len(download_results) - success_count
    total_size = download_results['size'].sum()
    
    print(f"\n=== Download Summary ===")
    print(f"Successful: {success_count}")
    print(f"Failed: {fail_count}")
    print(f"Total size: {total_size / (1024**3):.2f} GB")
    
    if fail_count > 0:
        print(f"\nFailed downloads:")
        display(download_results[~download_results['downloaded']][['eisId', 'filename', 'error']])
else:
    print("No files to download. All files already exist or queue is empty.")

## Retry Failed Downloads

In [None]:
def retry_failed_downloads():
    """
    Retry any previously failed downloads.
    """
    status_df = load_download_status()
    failed = status_df[~status_df['downloaded']]
    
    if len(failed) == 0:
        print("No failed downloads to retry.")
        return
    
    print(f"Retrying {len(failed)} failed downloads...")
    
    # Rebuild queue from failed records
    doc_df_full = load_document_records()
    retry_queue = doc_df_full[doc_df_full['attachmentId'].isin(failed['attachmentId'])].copy()
    
    # Add required columns
    retry_queue['year'] = retry_queue['eisId'].astype(str).str[:4]
    retry_queue['localFilename'] = retry_queue.apply(
        lambda row: build_local_filename(row['eisId'], row['name'] or row['fileNameForDownload'] or f"{row['attachmentId']}.pdf"),
        axis=1
    )
    retry_queue['localPath'] = retry_queue.apply(
        lambda row: DOCUMENTS_DIR / row['year'] / row['localFilename'],
        axis=1
    )
    
    # Download
    retry_results = download_files(retry_queue, parallel=False)
    
    # Update status
    status_df = status_df[~status_df['attachmentId'].isin(retry_results['attachmentId'])]
    combined_status = pd.concat([status_df, retry_results], ignore_index=True)
    save_download_status(combined_status)
    
    success_count = retry_results['downloaded'].sum()
    print(f"Retry complete: {success_count}/{len(retry_results)} successful")

# Uncomment to retry failed downloads:
# retry_failed_downloads()

## Verify Downloads Against Existing Files

In [None]:
def verify_downloads():
    """
    Compare expected documents against existing files.
    """
    doc_df_full = load_document_records()
    
    # Build expected filenames
    doc_df_full['year'] = doc_df_full['eisId'].astype(str).str[:4]
    doc_df_full['expectedFilename'] = doc_df_full.apply(
        lambda row: build_local_filename(row['eisId'], row['name'] or row['fileNameForDownload'] or f"{row['attachmentId']}.pdf"),
        axis=1
    )
    doc_df_full['expectedPath'] = doc_df_full.apply(
        lambda row: DOCUMENTS_DIR / row['year'] / row['expectedFilename'],
        axis=1
    )
    
    # Check existence
    doc_df_full['exists'] = doc_df_full['expectedPath'].apply(lambda p: p.exists())
    
    total = len(doc_df_full)
    existing = doc_df_full['exists'].sum()
    missing = total - existing
    
    print(f"=== Download Verification ===")
    print(f"Total expected documents: {total}")
    print(f"Existing files: {existing} ({100*existing/total:.1f}%)")
    print(f"Missing files: {missing} ({100*missing/total:.1f}%)")
    
    print(f"\nBy year:")
    summary = doc_df_full.groupby('year').agg(
        total=('exists', 'count'),
        existing=('exists', 'sum')
    )
    summary['missing'] = summary['total'] - summary['existing']
    summary['pct_complete'] = (100 * summary['existing'] / summary['total']).round(1)
    display(summary)
    
    return doc_df_full

verification_df = verify_downloads()

## Utility: Download ZIP Bundles (Alternative Method)

If individual downloads are slow, you can use the ZIP bundle endpoints instead. This downloads all documents for a project at once.

In [None]:
def download_eis_zip(eis_id: int, set_number: int, dest_dir: Path) -> dict:
    """
    Download EIS documents as a ZIP file.
    
    Args:
        eis_id: EIS ID
        set_number: Set number (documents are partitioned)
        dest_dir: Directory to save ZIP
    
    Returns:
        Dict with download status
    """
    url = f"{BASE_URL}/public/v1/eis/document/download/zip/eis_document"
    params = {"eisId": eis_id, "set": set_number}
    
    try:
        response = requests.get(url, params=params, timeout=300, stream=True)
        response.raise_for_status()
        
        dest_path = dest_dir / f"{eis_id}_set{set_number}.zip"
        dest_dir.mkdir(parents=True, exist_ok=True)
        
        with open(dest_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        
        return {"success": True, "path": dest_path, "error": None}
    except Exception as e:
        return {"success": False, "path": None, "error": str(e)}


def download_comment_letters_zip(eis_id: str, dest_dir: Path) -> dict:
    """
    Download comment letters as a ZIP file.
    
    Args:
        eis_id: EIS ID (as string)
        dest_dir: Directory to save ZIP
    
    Returns:
        Dict with download status
    """
    url = f"{BASE_URL}/public/v1/eis/document/download/zip/comment_letter"
    params = {"eisId": eis_id}
    
    try:
        response = requests.get(url, params=params, timeout=300, stream=True)
        response.raise_for_status()
        
        dest_path = dest_dir / f"{eis_id}_comment_letters.zip"
        dest_dir.mkdir(parents=True, exist_ok=True)
        
        with open(dest_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        
        return {"success": True, "path": dest_path, "error": None}
    except Exception as e:
        return {"success": False, "path": None, "error": str(e)}

# Example usage (uncomment to use):
# result = download_eis_zip(20240001, 1, DOCUMENTS_DIR / "zips")
# print(result)