# Download EPA Comment Letters via API

This notebook downloads EPA comment letters using the E-NEPA API.

**Key Features:**
- Downloads individual comment letter files using `attachmentId`
- `OVERWRITE` toggle to skip existing files or re-download
- Stores all files in a single flat directory: `documents/comment_letters/`
- Maintains naming convention: `{CEQ_NUMBER}_{filename}`

**API Documentation:** https://cdxapps.epa.gov/cdx-enepa-II/apidocs/index.html

In [41]:
# Install required packages if needed (%pip installs into this kernel's environment)
# %pip install requests pandas tqdm pyarrow

In [42]:
import requests
import pandas as pd
import os
import re
import time
import shutil
from pathlib import Path
from tqdm.notebook import tqdm
import logging
from datetime import datetime

# Setup logging: configure the root logger at INFO with a timestamped format.
# `logger` is the module-level logger the helper functions in this notebook write to.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [43]:
# Configuration
# Base URL of the E-NEPA REST API and the endpoint used to fetch one attachment.
BASE_URL = "https://cdxapps.epa.gov/cdx-enepa-II/rest"
DOWNLOAD_ENDPOINT = f"{BASE_URL}/public/v1/eis/document/download"

# Paths - relative to repository root
# NOTE: "../" is resolved against the current working directory, so this assumes the
# kernel is started from a directory one level below the repository root.
REPO_ROOT = Path("../").resolve()
METADATA_DIR = REPO_ROOT / "metadata"
DOCUMENTS_DIR = REPO_ROOT / "documents"
COMMENT_LETTERS_DIR = DOCUMENTS_DIR / "comment_letters"  # Flat directory for all comment letters

# Input file (from fetch_eis_records_api.ipynb)
# The pickle is tried first; the parquet file is read only when the pickle is absent.
COMMENT_LETTER_PKL = METADATA_DIR / "comment_letter_record_api.pkl"
COMMENT_LETTER_PARQUET = METADATA_DIR / "comment_letter_record_api.parquet"

# Alternative: use main document records
# Used as a fallback when neither comment letter file exists (rows with type == 'Comment_Letter').
DOC_RECORD_PKL = METADATA_DIR / "eis_document_record_api.pkl"

# Download tracking file (a CSV copy is written next to it by save_download_status)
DOWNLOAD_STATUS_FILE = METADATA_DIR / "comment_letter_download_status.pkl"

# Rate limiting: seconds to sleep after each API download
REQUEST_DELAY = 0.25

# Echo the resolved locations so a wrong working directory is obvious before downloading.
print(f"Repository root: {REPO_ROOT}")
print(f"Comment letters directory: {COMMENT_LETTERS_DIR}")
print(f"Documents directory is symlink: {DOCUMENTS_DIR.is_symlink()}")
if DOCUMENTS_DIR.is_symlink():
    print(f"Symlink target: {DOCUMENTS_DIR.resolve()}")

Repository root: /Users/admin-tascott/Documents/GitHub/eis_documents/enepa_repository
Comment letters directory: /Users/admin-tascott/Documents/GitHub/eis_documents/enepa_repository/documents/comment_letters
Documents directory is symlink: True
Symlink target: /Users/admin-tascott/Library/CloudStorage/Box-Box/eis_documents/enepa_repository/documents


## Configuration

In [45]:
# ============================================
# DOWNLOAD SETTINGS - MODIFY AS NEEDED
# ============================================

# Set to True to re-download all files, False to skip files that already
# exist in the comment_letters directory.
OVERWRITE = False

# Copy optimization: If True, check documents/{YEAR}/ for existing files and copy
# instead of re-downloading. Set to False if files are on cloud storage (Box)
# and copying is slow due to on-demand download.
USE_COPY_OPTIMIZATION = False

# Filter by year (set to None to download all years)
# Example: YEAR_FILTER = [2023, 2024]
# Years are compared as strings against the first four characters of ceqNumber.
YEAR_FILTER = None

# Maximum number of files to download (set to None for all)
# Useful for testing; the limit keeps the first MAX_DOWNLOADS rows of the queue.
MAX_DOWNLOADS = None  # e.g., 100

# Echo the effective settings so each run's output records how it was configured.
print(f"=== Download Configuration ===")
print(f"  OVERWRITE: {OVERWRITE}")
print(f"  USE_COPY_OPTIMIZATION: {USE_COPY_OPTIMIZATION}")
print(f"  YEAR_FILTER: {YEAR_FILTER}")
print(f"  MAX_DOWNLOADS: {MAX_DOWNLOADS}")

=== Download Configuration ===
  OVERWRITE: False
  USE_COPY_OPTIMIZATION: False
  YEAR_FILTER: None
  MAX_DOWNLOADS: None


## Helper Functions

In [47]:
def sanitize_filename(filename: str) -> str:
    """
    Normalize a raw attachment name to the repository's on-disk naming convention.

    The rewrite rules run in order: drop the characters ``( ) & , ~ /``,
    collapse every run of whitespace/underscores into one underscore, lower-case
    a trailing ``.pdf`` extension (any case), and collapse a doubled
    ``.pdf.pdf`` ending. Leading and trailing underscores are stripped last.

    Args:
        filename: Original file name as reported by the record.

    Returns:
        The sanitized file name.
    """
    # Ordered rewrite rules: (pattern, replacement, regex flags).
    substitutions = (
        (r'[()&,~\/]', '', 0),
        (r'[\s_]+', '_', 0),
        (r'\.PDF$', '.pdf', re.IGNORECASE),
        (r'\.pdf\.pdf$', '.pdf', re.IGNORECASE),
    )

    result = filename
    for pattern, replacement, flags in substitutions:
        result = re.sub(pattern, replacement, result, flags=flags)

    return result.strip('_')


def build_local_filename(ceq_number, original_filename: str) -> str:
    """
    Compose the on-disk name for a comment letter.

    Args:
        ceq_number: CEQ number used as the filename prefix.
        original_filename: Raw file name; it is passed through sanitize_filename.

    Returns:
        ``{ceq_number}_{sanitized original_filename}``
    """
    return "{}_{}".format(ceq_number, sanitize_filename(original_filename))


def get_existing_files() -> set:
    """
    Collect the names of the regular files currently in COMMENT_LETTERS_DIR.

    Returns:
        Set of file names (not full paths); empty when the directory does not exist.
    """
    if not COMMENT_LETTERS_DIR.exists():
        return set()

    # Sub-directories and other non-file entries are ignored.
    return {entry.name for entry in COMMENT_LETTERS_DIR.iterdir() if entry.is_file()}


def find_in_documents_dir(ceq_number, filename: str) -> Path:
    """
    Look for a file under the per-year documents directory (documents/{YEAR}/).

    The year folder is taken from the first four characters of the CEQ number.

    Args:
        ceq_number: CEQ number; its leading four characters select the year folder
        filename: File name expected inside that folder

    Returns:
        Path to the file when it exists, otherwise None
    """
    candidate = DOCUMENTS_DIR / str(ceq_number)[:4] / filename
    return candidate if candidate.exists() else None

In [48]:
def download_attachment(attachment_id: int, dest_path: Path, max_retries: int = 3) -> dict:
    """
    Download a single attachment by ID and store it at ``dest_path``.

    The body is streamed into a sibling ``<name>.part`` file and only renamed to
    ``dest_path`` once the transfer has completed with a non-empty payload. An
    interrupted or empty download therefore never leaves a truncated file at
    ``dest_path`` (which later runs would mistake for a finished download) and
    never destroys a good file that was already there.

    Args:
        attachment_id: Value sent as the ``attachmentId`` query parameter.
        dest_path: Final location of the downloaded file.
        max_retries: Number of attempts for request-level failures. The wait
            between attempts grows as 1s, 2s, 4s, ...

    Returns:
        dict with keys ``success`` (bool), ``size`` (bytes written, 0 on failure)
        and ``error`` (None on success, otherwise a message).

    Raises:
        OSError: Local filesystem errors are not retried and propagate to the caller.
    """
    params = {"attachmentId": attachment_id}
    tmp_path = dest_path.with_name(dest_path.name + ".part")

    for attempt in range(max_retries):
        try:
            # Context manager releases the streamed connection even on failure.
            with requests.get(
                DOWNLOAD_ENDPOINT,
                params=params,
                timeout=120,
                stream=True
            ) as response:
                response.raise_for_status()

                # Ensure parent directory exists
                dest_path.parent.mkdir(parents=True, exist_ok=True)

                # Stream to the temporary file, not the final destination
                with open(tmp_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)

            file_size = tmp_path.stat().st_size

            # An empty body is reported immediately rather than retried.
            if file_size == 0:
                return {"success": False, "size": 0, "error": "Empty file received"}

            # Publish the completed download under its final name.
            tmp_path.replace(dest_path)
            return {"success": True, "size": file_size, "error": None}

        except requests.exceptions.RequestException as e:
            logger.warning(f"Attempt {attempt + 1}/{max_retries} failed for attachment {attachment_id}: {e}")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)
            else:
                return {"success": False, "size": 0, "error": str(e)}
        finally:
            # Whatever happened above, never leave a partial transfer behind.
            # After a successful rename the temp file no longer exists.
            if tmp_path.exists():
                tmp_path.unlink()

    return {"success": False, "size": 0, "error": "Max retries exceeded"}

## Load Comment Letter Records

In [50]:
def load_comment_letter_records():
    """
    Load the comment letter records as a DataFrame.

    Sources are tried in this order:
    1. the dedicated comment letter pickle,
    2. the dedicated comment letter parquet file,
    3. the main document records pickle, reduced to rows whose ``type`` is
       ``'Comment_Letter'``.

    Raises:
        FileNotFoundError: when none of the three files exists.
    """
    if COMMENT_LETTER_PKL.exists():
        return pd.read_pickle(COMMENT_LETTER_PKL)

    if COMMENT_LETTER_PARQUET.exists():
        return pd.read_parquet(COMMENT_LETTER_PARQUET)

    if not DOC_RECORD_PKL.exists():
        raise FileNotFoundError(
            f"Comment letter records not found.\n"
            f"Run fetch_comment_letters_api.ipynb or fetch_eis_records_api.ipynb first."
        )

    # Fallback: keep only the comment letters from the main document records
    all_documents = pd.read_pickle(DOC_RECORD_PKL)
    is_comment_letter = all_documents['type'] == 'Comment_Letter'
    return all_documents[is_comment_letter].copy()


def load_download_status():
    """
    Read the persisted download status table.

    Returns:
        The DataFrame stored in DOWNLOAD_STATUS_FILE, or an empty DataFrame
        with the status columns when no status file exists yet.
    """
    if not DOWNLOAD_STATUS_FILE.exists():
        empty_columns = ['attachmentId', 'eisId', 'filename', 'downloaded', 'size', 'error', 'timestamp']
        return pd.DataFrame(columns=empty_columns)

    return pd.read_pickle(DOWNLOAD_STATUS_FILE)


def save_download_status(status_df: pd.DataFrame):
    """
    Persist the download status table.

    Writes the pickle at DOWNLOAD_STATUS_FILE and a CSV copy (without the
    index) named comment_letter_download_status.csv in METADATA_DIR.

    Args:
        status_df: Status table to store.
    """
    csv_copy = METADATA_DIR / "comment_letter_download_status.csv"

    status_df.to_pickle(DOWNLOAD_STATUS_FILE)
    status_df.to_csv(csv_copy, index=False)

In [51]:
# Load comment letter records (dedicated file first, main document records as fallback)
comment_df = load_comment_letter_records()
print(f"Loaded {len(comment_df)} comment letter records")

# Add year column - use ceqNumber (format: YYYYNNNN) for year extraction.
# The year is kept as a 4-character string, not an integer.
comment_df['year'] = comment_df['ceqNumber'].astype(str).str[:4]

# Per-year record counts, in chronological order
print(f"\nComment letters by year:")
print(comment_df['year'].value_counts().sort_index())

Loaded 11700 comment letter records

Comment letters by year:
year
1987      2
1988      1
1990      2
1991      4
1992      3
1993      3
1994     10
1995     20
1996     59
1997     82
1998     58
1999    202
2000    369
2001    478
2002    596
2003    553
2004    631
2005    751
2006    712
2007    912
2008    911
2009    894
2010    661
2011    406
2012    352
2013    332
2014    331
2015    318
2016    293
2017    197
2018    284
2019    252
2020    221
2021    165
2022    163
2023    139
2024    198
2025    135
Name: count, dtype: int64


## Prepare Download Queue

In [53]:
def prepare_download_queue(df: pd.DataFrame, overwrite: bool = False, 
                           year_filter: list = None,
                           use_copy_optimization: bool = True) -> pd.DataFrame:
    """
    Prepare the download queue.

    Checks for files in three places:
    1. Already in comment_letters/ directory
    2. Available in documents/{YEAR}/ directory (can be copied) - if use_copy_optimization=True
    3. Need to download from API

    Args:
        df: Comment letter records. Must contain ``ceqNumber`` and
            ``attachmentId``; ``name`` / ``fileNameForDownload`` are used for
            the file name when present.
        overwrite: If False, rows whose target file already exists are dropped.
        year_filter: Optional list of years to keep (compared as strings with
            the first four characters of ``ceqNumber``).
        use_copy_optimization: If True, look for each file under
            documents/{YEAR}/ and flag it as copyable.

    Returns:
        A new DataFrame (the input is not modified) with the added columns
        ``localFilename``, ``localPath``, ``sourcePath``, ``canCopy`` and, when
        ``overwrite`` is False, ``exists``. Rows without an ``attachmentId``
        are removed.
    """
    def _source_name(row) -> str:
        """Return the first usable file name field, else '<attachmentId>.pdf'."""
        for field in ('name', 'fileNameForDownload'):
            value = row.get(field)
            # Missing values arrive as None/NaN. NaN is truthy, so an `or`
            # chain would hand it to sanitize_filename and crash in re.sub.
            if isinstance(value, str) and value:
                return value
        return f"{row['attachmentId']}.pdf"

    queue = df.copy()

    # Ensure year column exists - use ceqNumber for year extraction
    if 'year' not in queue.columns:
        queue['year'] = queue['ceqNumber'].astype(str).str[:4]

    # Apply year filter
    if year_filter:
        year_filter_str = [str(y) for y in year_filter]
        # .copy() so the column assignments below do not target a slice
        queue = queue[queue['year'].isin(year_filter_str)].copy()
        logger.info(f"Filtered to years {year_filter}: {len(queue)} letters")

    # Build local filenames using ceqNumber as prefix (matches existing convention).
    # Plain row loops are used instead of DataFrame.apply(axis=1): on an empty
    # frame apply returns a DataFrame, which cannot be assigned to one column.
    queue['localFilename'] = [
        build_local_filename(row['ceqNumber'], _source_name(row))
        for _, row in queue.iterrows()
    ]

    # Build full paths (flat directory for comment letters)
    queue['localPath'] = [COMMENT_LETTERS_DIR / name for name in queue['localFilename']]

    # Check for source in main documents directory (can be copied instead of downloaded)
    if use_copy_optimization:
        logger.info("Checking documents/ directory for existing files (this may take a moment)...")
        queue['sourcePath'] = [
            find_in_documents_dir(row['ceqNumber'], row['localFilename'])
            for _, row in queue.iterrows()
        ]
        queue['canCopy'] = queue['sourcePath'].notna()
    else:
        queue['sourcePath'] = None
        queue['canCopy'] = False

    # Check for existing files in comment_letters/
    if not overwrite:
        queue['exists'] = [path.exists() for path in queue['localPath']]
        # Explicit bool dtype keeps the mask valid even when the queue is empty
        exists_mask = queue['exists'].astype(bool)
        existing_count = exists_mask.sum()
        logger.info(f"Found {existing_count} existing files in comment_letters/")
        queue = queue[~exists_mask]
        logger.info(f"Queue after removing existing: {len(queue)} letters")

    # Remove rows with missing attachment IDs
    queue = queue[queue['attachmentId'].notna()]

    # Several records can map to one local filename; whichever is fetched last
    # wins on disk. Surface that instead of overwriting silently.
    duplicate_targets = int(queue['localFilename'].duplicated().sum())
    if duplicate_targets:
        logger.warning(
            f"{duplicate_targets} queued records share a local filename with another queued record"
        )

    # Log copy vs download stats
    if use_copy_optimization:
        copy_count = queue['canCopy'].sum()
        download_count = len(queue) - copy_count
        logger.info(f"Can copy from documents/: {copy_count}")
        logger.info(f"Need to download: {download_count}")

    return queue

In [54]:
# Prepare download queue from the loaded records using the settings chosen above
download_queue = prepare_download_queue(
    comment_df,
    overwrite=OVERWRITE,
    year_filter=YEAR_FILTER,
    use_copy_optimization=USE_COPY_OPTIMIZATION
)

# Apply max downloads limit (keeps the first MAX_DOWNLOADS rows; None or 0 disables the limit)
if MAX_DOWNLOADS and len(download_queue) > MAX_DOWNLOADS:
    download_queue = download_queue.head(MAX_DOWNLOADS)
    logger.info(f"Limited to {MAX_DOWNLOADS} downloads")

print(f"\n=== Download Queue ===")
print(f"Total files to process: {len(download_queue)}")

# Per-year breakdown of what is left to fetch
if len(download_queue) > 0:
    print(f"\nBy year:")
    print(download_queue['year'].value_counts().sort_index())

2026-01-30 14:26:19,303 - INFO - Found 11626 existing files in comment_letters/
2026-01-30 14:26:19,305 - INFO - Queue after removing existing: 74 letters



=== Download Queue ===
Total files to process: 74

By year:
year
2010     4
2011    66
2015     4
Name: count, dtype: int64


## Execute Downloads

In [56]:
def copy_file(source_path: Path, dest_path: Path) -> dict:
    """
    Copy ``source_path`` to ``dest_path``, creating the destination folder if needed.

    Any exception is captured and reported in the result instead of being raised.

    Returns:
        dict with ``success``, ``size`` (bytes at the destination, 0 on failure),
        ``error`` (None on success, otherwise the exception text) and
        ``method`` (always ``"copy"``).
    """
    outcome = {"success": False, "size": 0, "error": None, "method": "copy"}

    try:
        dest_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(source_path, dest_path)
        copied_bytes = dest_path.stat().st_size
    except Exception as exc:
        outcome["error"] = str(exc)
        return outcome

    outcome["success"] = True
    outcome["size"] = copied_bytes
    return outcome


def download_files(queue: pd.DataFrame, use_copy: bool = False) -> pd.DataFrame:
    """
    Download comment letter files.

    Rows flagged ``canCopy`` are copied from their ``sourcePath`` when
    ``use_copy`` is True; every other row, and every row whose copy failed,
    is fetched from the API.

    Args:
        queue: DataFrame with files to process. Needs ``attachmentId``,
            ``ceqNumber``, ``localFilename`` and ``localPath``; ``canCopy`` and
            ``sourcePath`` are required only for the copy path.
        use_copy: If True and files exist in documents/{YEAR}/, copy instead of download

    Returns:
        One row per processed file with the columns ``attachmentId``,
        ``ceqNumber``, ``filename``, ``downloaded``, ``size``, ``error``,
        ``method`` and ``timestamp``. The columns are present even when no
        file was processed.
    """
    # Fixed schema so callers can index result columns even for an empty run.
    result_columns = ['attachmentId', 'ceqNumber', 'filename', 'downloaded',
                      'size', 'error', 'method', 'timestamp']
    results = []

    def _record(row, downloaded, size, error, method) -> dict:
        """Build one status record for `row`, timestamped at call time."""
        return {
            'attachmentId': row['attachmentId'],
            'ceqNumber': row['ceqNumber'],
            'filename': row['localFilename'],
            'downloaded': downloaded,
            'size': size,
            'error': error,
            'method': method,
            'timestamp': datetime.now().isoformat()
        }

    # Ensure directory exists
    COMMENT_LETTERS_DIR.mkdir(parents=True, exist_ok=True)

    if use_copy and 'canCopy' in queue.columns:
        can_copy = queue['canCopy'] == True
        copy_queue = queue[can_copy].copy()
        # Anything not positively flagged as copyable (including a missing
        # flag) goes to the API instead of being dropped.
        api_queue = queue[~can_copy].copy()
    else:
        copy_queue = pd.DataFrame()
        api_queue = queue

    # Copy files from documents directory
    if len(copy_queue) > 0:
        logger.info(f"Copying {len(copy_queue)} files from documents/...")
        failed_copies = []

        for idx, row in tqdm(copy_queue.iterrows(), total=len(copy_queue), desc="Copying"):
            result = copy_file(row['sourcePath'], row['localPath'])

            if result['success']:
                results.append(_record(row, True, result['size'], None, 'copy'))
            else:
                failed_copies.append(row)

        # Add failed copies to download queue
        if failed_copies:
            logger.info(f"{len(failed_copies)} copies failed, will download instead")
            failed_df = pd.DataFrame(failed_copies)
            api_queue = pd.concat([api_queue, failed_df], ignore_index=True)

    # Download files from API
    if len(api_queue) > 0:
        logger.info(f"Downloading {len(api_queue)} files from API...")
        for idx, row in tqdm(api_queue.iterrows(), total=len(api_queue), desc="Downloading"):
            # int(NaN) raises; record the row as failed and keep the batch going.
            if pd.isna(row['attachmentId']):
                results.append(_record(row, False, 0, "Missing attachmentId", 'download'))
                continue

            result = download_attachment(
                int(row['attachmentId']),
                row['localPath']
            )
            results.append(
                _record(row, result['success'], result['size'], result['error'], 'download')
            )
            # Rate limiting between API requests
            time.sleep(REQUEST_DELAY)

    results_df = pd.DataFrame(results, columns=result_columns)
    # Keep a real boolean column so callers can negate it, even when empty.
    results_df['downloaded'] = results_df['downloaded'].astype(bool)
    return results_df

In [57]:
# Run downloads: process the queue, merge the outcome into the persisted
# status table, and print a summary.
if len(download_queue) > 0:
    print(f"Processing {len(download_queue)} comment letters...")
    print(f"Destination: {COMMENT_LETTERS_DIR}")
    
    if USE_COPY_OPTIMIZATION:
        copy_count = download_queue['canCopy'].sum()
        print(f"  - Can copy from documents/: {copy_count}")
        print(f"  - Need to download: {len(download_queue) - copy_count}")
    
    download_results = download_files(download_queue, use_copy=USE_COPY_OPTIMIZATION)
    
    # Merge with existing status
    existing_status = load_download_status()
    combined_status = pd.concat([existing_status, download_results], ignore_index=True)
    
    # Remove duplicates (keep latest): new results are appended last, so they
    # replace any earlier entry for the same filename.
    if 'filename' in combined_status.columns:
        combined_status = combined_status.drop_duplicates(subset=['filename'], keep='last')
    
    # Save status (pickle plus CSV copy)
    save_download_status(combined_status)
    
    # Summary of this run only (not of the whole status history)
    success_count = download_results['downloaded'].sum()
    fail_count = len(download_results) - success_count
    total_size = download_results['size'].sum()
    
    print(f"\n=== Summary ===")
    print(f"Successful: {success_count}")
    print(f"Failed: {fail_count}")
    print(f"Total size: {total_size / (1024**2):.2f} MB")
    
    if fail_count > 0:
        print(f"\nFailed downloads:")
        # display() is not imported in the cells shown here; it relies on the
        # global that IPython/Jupyter provides.
        display(download_results[~download_results['downloaded']][['ceqNumber', 'filename', 'error']])
else:
    print("No files to download. All comment letters already exist or queue is empty.")

2026-01-30 14:26:19,330 - INFO - Downloading 74 files from API...


Processing 74 comment letters...
Destination: /Users/admin-tascott/Documents/GitHub/eis_documents/enepa_repository/documents/comment_letters


Downloading:   0%|          | 0/74 [00:00<?, ?it/s]


=== Summary ===
Successful: 71
Failed: 3
Total size: 62.76 MB

Failed downloads:


Unnamed: 0,ceqNumber,filename,error
0,20150083,20150083_20150083.pdf,Empty file received
1,20150058,20150058_20150058.pdf,Empty file received
3,20150060,20150060_20150060.pdf,Empty file received


## Verify Downloads

In [59]:
def verify_downloads():
    """
    Verify downloaded comment letters against expected records.

    Reloads the comment letter records, derives the expected file name and
    path for every record, checks which paths exist in COMMENT_LETTERS_DIR,
    and prints overall and per-year completeness.

    Returns:
        The records DataFrame with the added columns ``expectedFilename``,
        ``expectedPath``, ``exists`` and ``year``.
    """
    def _source_name(row) -> str:
        """Return the first usable file name field, else '<attachmentId>.pdf'."""
        for field in ('name', 'fileNameForDownload'):
            value = row.get(field)
            # NaN is truthy, so an `or` chain would pass a missing name on
            # to sanitize_filename; accept only non-empty strings.
            if isinstance(value, str) and value:
                return value
        return f"{row['attachmentId']}.pdf"

    comment_df_full = load_comment_letter_records()

    # Build expected filenames using ceqNumber as prefix.
    # Row loops instead of DataFrame.apply(axis=1), which returns a DataFrame
    # (not a Series) when the frame is empty.
    comment_df_full['expectedFilename'] = [
        build_local_filename(row['ceqNumber'], _source_name(row))
        for _, row in comment_df_full.iterrows()
    ]
    comment_df_full['expectedPath'] = [
        COMMENT_LETTERS_DIR / name for name in comment_df_full['expectedFilename']
    ]

    # Check existence
    comment_df_full['exists'] = [path.exists() for path in comment_df_full['expectedPath']]
    comment_df_full['exists'] = comment_df_full['exists'].astype(bool)

    total = len(comment_df_full)
    existing = comment_df_full['exists'].sum()
    missing = total - existing

    def _pct(part) -> float:
        """Share of `total` in percent; 0.0 when there are no records."""
        return 100 * part / total if total else 0.0

    print(f"=== Comment Letter Verification ===")
    print(f"Total expected: {total}")
    print(f"Existing files: {existing} ({_pct(existing):.1f}%)")
    print(f"Missing files: {missing} ({_pct(missing):.1f}%)")

    # By year - use ceqNumber for year extraction
    comment_df_full['year'] = comment_df_full['ceqNumber'].astype(str).str[:4]
    print(f"\nBy year:")
    summary = comment_df_full.groupby('year').agg(
        total=('exists', 'count'),
        existing=('exists', 'sum')
    )
    summary['missing'] = summary['total'] - summary['existing']
    summary['pct_complete'] = (100 * summary['existing'] / summary['total']).round(1)
    display(summary)

    return comment_df_full

verification_df = verify_downloads()

=== Comment Letter Verification ===
Total expected: 11700
Existing files: 11696 (100.0%)
Missing files: 4 (0.0%)

By year:


Unnamed: 0_level_0,total,existing,missing,pct_complete
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1987,2,2,0,100.0
1988,1,1,0,100.0
1990,2,2,0,100.0
1991,4,4,0,100.0
1992,3,3,0,100.0
1993,3,3,0,100.0
1994,10,10,0,100.0
1995,20,20,0,100.0
1996,59,59,0,100.0
1997,82,82,0,100.0


## Retry Failed Downloads

In [61]:
def retry_failed_downloads():
    """
    Retry any previously failed downloads.

    Reads the persisted status table, selects the entries not marked as
    downloaded, looks up the matching comment letter records (by
    ``attachmentId``, or by ``eisId`` when no attachment IDs are recorded),
    downloads them again from the API, and replaces their status entries.

    Returns:
        None. Progress and the outcome are printed.
    """
    def _source_name(row) -> str:
        """Return the first usable file name field, else '<attachmentId>.pdf'."""
        for field in ('name', 'fileNameForDownload'):
            value = row.get(field)
            # NaN is truthy, so an `or` chain would pass a missing name on
            # to sanitize_filename; accept only non-empty strings.
            if isinstance(value, str) and value:
                return value
        return f"{row['attachmentId']}.pdf"

    status_df = load_download_status()

    # .eq(True) gives a real boolean mask even when the column is object
    # dtype or contains NaN, where `~column` would not.
    downloaded_ok = status_df['downloaded'].eq(True)
    failed = status_df[~downloaded_ok]

    if len(failed) == 0:
        print("No failed downloads to retry.")
        return

    print(f"Retrying {len(failed)} failed downloads...")

    # Rebuild queue
    comment_df_full = load_comment_letter_records()

    if 'attachmentId' in failed.columns and failed['attachmentId'].notna().any():
        # dropna: isin() would otherwise match NaN status IDs to NaN record IDs
        failed_ids = failed['attachmentId'].dropna()
        retry_queue = comment_df_full[comment_df_full['attachmentId'].isin(failed_ids)].copy()
    elif 'eisId' in failed.columns and 'eisId' in comment_df_full.columns:
        failed_eis_ids = failed['eisId'].dropna()
        retry_queue = comment_df_full[comment_df_full['eisId'].isin(failed_eis_ids)].copy()
    else:
        retry_queue = comment_df_full.iloc[0:0].copy()

    # Records without an attachment ID cannot be requested from the API
    retry_queue = retry_queue[retry_queue['attachmentId'].notna()].copy()

    if len(retry_queue) == 0:
        print("No matching comment letter records found for the failed downloads.")
        return

    # Build filenames
    retry_queue['localFilename'] = [
        build_local_filename(row['ceqNumber'], _source_name(row))
        for _, row in retry_queue.iterrows()
    ]
    retry_queue['localPath'] = [COMMENT_LETTERS_DIR / name for name in retry_queue['localFilename']]

    # Download (no copy optimization for retries)
    retry_results = download_files(retry_queue, use_copy=False)

    if len(retry_results) == 0:
        print("Retry produced no results.")
        return

    # Update status: drop the old entries for the retried files, append the new ones
    status_df = status_df[~status_df['filename'].isin(retry_results['filename'])]
    combined_status = pd.concat([status_df, retry_results], ignore_index=True)
    save_download_status(combined_status)

    success_count = retry_results['downloaded'].sum()
    print(f"Retry complete: {success_count}/{len(retry_results)} successful")

# Uncomment to retry failed downloads:
# retry_failed_downloads()

## List Downloaded Files

In [63]:
# List files in comment_letters directory
if COMMENT_LETTERS_DIR.exists():
    # Every directory entry is counted in the total; only entries whose
    # suffix is .pdf (case-insensitive) are counted and sized below.
    files = list(COMMENT_LETTERS_DIR.iterdir())
    pdf_files = [f for f in files if f.suffix.lower() == '.pdf']
    
    print(f"Files in {COMMENT_LETTERS_DIR}:")
    print(f"  Total files: {len(files)}")
    print(f"  PDF files: {len(pdf_files)}")
    
    if pdf_files:
        # Size covers the PDF entries only, not every entry in the directory
        total_size = sum(f.stat().st_size for f in pdf_files)
        print(f"  Total size: {total_size / (1024**2):.2f} MB")
        
        # First ten PDFs in sorted path order
        print(f"\nSample files:")
        for f in sorted(pdf_files)[:10]:
            print(f"  {f.name} ({f.stat().st_size / 1024:.1f} KB)")
else:
    print(f"Directory does not exist yet: {COMMENT_LETTERS_DIR}")

Files in /Users/admin-tascott/Documents/GitHub/eis_documents/enepa_repository/documents/comment_letters:
  Total files: 11641
  PDF files: 6428
  Total size: 3963.26 MB

Sample files:
  19870010_19870010.pdf (569.6 KB)
  19870393_19870393.pdf (2081.2 KB)
  19880342_19880342.pdf (800.7 KB)
  19900002_19900002.pdf (235.4 KB)
  19900271_19900271.pdf (1777.3 KB)
  19910005_19910005.pdf (290.1 KB)
  19910009_19910009.pdf (977.3 KB)
  19910014_19910014.pdf (309.5 KB)
  19910406_PETRIFIED_FOREST_NP_GMP.pdf (279.0 KB)
  19920401_19920401.pdf (38.8 KB)
