# Download EPA Comment Letters via API

This notebook downloads EPA comment letters using the E-NEPA API.

**Key Features:**
- Downloads individual comment letter files using `attachmentId`
- Also supports bulk ZIP download per project via `/public/v1/eis/document/download/zip/comment_letter`
- `OVERWRITE` toggle to skip existing files or re-download
- Stores all files in flat directory: `documents/comment_letters/`
- Names individually downloaded files `{ceqNumber}_{sanitized_filename}` (CEQ number prefix, matching the existing convention)

**API Documentation:** https://cdxapps.epa.gov/cdx-enepa-II/apidocs/index.html

In [None]:
# Install required packages if needed
# !pip install requests pandas tqdm pyarrow

In [None]:
import requests
import pandas as pd
import os
import re
import time
import zipfile
import io
from pathlib import Path
from tqdm.notebook import tqdm
import logging
from datetime import datetime

# Configure logging for the notebook: INFO level, timestamped records.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Logger shared by every cell below.
logger = logging.getLogger(__name__)

In [None]:
# Configuration
# E-NEPA REST endpoints: single-attachment download and per-project ZIP of comment letters.
BASE_URL = "https://cdxapps.epa.gov/cdx-enepa-II/rest"
DOWNLOAD_ENDPOINT = BASE_URL + "/public/v1/eis/document/download"
ZIP_COMMENT_ENDPOINT = DOWNLOAD_ENDPOINT + "/zip/comment_letter"

# Paths - relative to repository root (resolved from the current working directory)
REPO_ROOT = Path("../").resolve()
METADATA_DIR = REPO_ROOT.joinpath("metadata")
DOCUMENTS_DIR = REPO_ROOT.joinpath("documents")
# Flat directory for all comment letters
COMMENT_LETTERS_DIR = DOCUMENTS_DIR.joinpath("comment_letters")

# Input file (from fetch_comment_letters_api.ipynb)
COMMENT_LETTER_PKL = METADATA_DIR.joinpath("comment_letter_record_api.pkl")
COMMENT_LETTER_PARQUET = METADATA_DIR.joinpath("comment_letter_record_api.parquet")

# Alternative: use main document records
DOC_RECORD_PKL = METADATA_DIR.joinpath("eis_document_record_api.pkl")

# Download tracking file
DOWNLOAD_STATUS_FILE = METADATA_DIR.joinpath("comment_letter_download_status.pkl")

# Rate limiting: seconds to sleep between consecutive requests
REQUEST_DELAY = 0.25

# Echo the resolved locations so a wrong working directory is noticed early.
print(
    f"Repository root: {REPO_ROOT}",
    f"Comment letters directory: {COMMENT_LETTERS_DIR}",
    f"Documents directory is symlink: {DOCUMENTS_DIR.is_symlink()}",
    sep="\n",
)
if DOCUMENTS_DIR.is_symlink():
    print(f"Symlink target: {DOCUMENTS_DIR.resolve()}")

## Configuration

In [None]:
# ============================================
# DOWNLOAD SETTINGS - MODIFY AS NEEDED
# ============================================

# True: re-download every file. False: skip files that already exist locally.
OVERWRITE = False

# Download method:
# - "individual": one request per file (better for tracking, resumable)
# - "zip": one ZIP of all comment letters per project (faster for bulk)
DOWNLOAD_METHOD = "individual"

# Restrict downloads to these years; None downloads all years.
# Example: YEAR_FILTER = [2023, 2024]
YEAR_FILTER = None

# Cap on the number of files to download; None means no cap.
# Useful for testing, e.g. MAX_DOWNLOADS = 100
MAX_DOWNLOADS = None

# Echo the active settings in one block.
print(
    "=== Download Configuration ===",
    f"  OVERWRITE: {OVERWRITE}",
    f"  DOWNLOAD_METHOD: {DOWNLOAD_METHOD}",
    f"  YEAR_FILTER: {YEAR_FILTER}",
    f"  MAX_DOWNLOADS: {MAX_DOWNLOADS}",
    sep="\n",
)

## Helper Functions

In [None]:
def sanitize_filename(filename: str) -> str:
    """
    Normalise a filename to the local naming convention.

    The substitutions run in order:
      1. drop the characters ``( ) & , ~ /``;
      2. collapse each run of whitespace/underscores into one underscore;
      3. lower-case a trailing ``.pdf`` extension (any letter case);
      4. collapse a doubled trailing ``.pdf.pdf`` into a single ``.pdf``.
    Leading and trailing underscores are stripped at the end.
    """
    # (pattern, replacement, regex flags), applied sequentially.
    substitutions = (
        (r'[()&,~\/]', '', 0),
        (r'[\s_]+', '_', 0),
        (r'\.PDF$', '.pdf', re.IGNORECASE),
        (r'\.pdf\.pdf$', '.pdf', re.IGNORECASE),
    )
    cleaned = filename
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned.strip('_')


def build_local_filename(eis_id, original_filename: str) -> str:
    """
    Return the on-disk name ``{eis_id}_{sanitized original_filename}``.

    ``eis_id`` is used verbatim as the prefix; the original filename is
    passed through ``sanitize_filename`` first.
    """
    return f"{eis_id}_{sanitize_filename(original_filename)}"


def get_existing_files() -> set:
    """
    Return the names of the regular files currently in COMMENT_LETTERS_DIR.

    An empty set is returned when the directory does not exist.
    """
    if not COMMENT_LETTERS_DIR.exists():
        return set()
    return {
        entry.name
        for entry in COMMENT_LETTERS_DIR.iterdir()
        if entry.is_file()
    }

In [None]:
def download_attachment(attachment_id: int, dest_path: Path, max_retries: int = 3) -> dict:
    """
    Download a single attachment by ID and store it at ``dest_path``.

    The body is streamed into a sibling ``<name>.part`` file and only renamed
    to ``dest_path`` after a complete, non-empty transfer. An interrupted or
    failed download therefore never leaves a truncated file at ``dest_path``
    (which the skip-existing check would later mistake for a finished
    download), and a previously downloaded file is not destroyed by a failed
    re-download.

    Args:
        attachment_id: Value sent as the ``attachmentId`` query parameter.
        dest_path: Final location of the downloaded file.
        max_retries: Number of attempts; failed requests are retried with
            exponential back-off (1s, 2s, 4s, ...).

    Returns:
        Dict with keys ``success`` (bool), ``size`` (bytes written, 0 on
        failure) and ``error`` (message string, or None on success).
    """
    params = {"attachmentId": attachment_id}
    # Staging file next to the destination so the final rename stays on one filesystem.
    part_path = dest_path.with_name(dest_path.name + ".part")

    for attempt in range(max_retries):
        try:
            # The context manager releases the connection even when
            # raise_for_status() or the streaming loop raises.
            with requests.get(
                DOWNLOAD_ENDPOINT,
                params=params,
                timeout=120,
                stream=True
            ) as response:
                response.raise_for_status()

                # Ensure parent directory exists
                dest_path.parent.mkdir(parents=True, exist_ok=True)

                with open(part_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)

            file_size = part_path.stat().st_size

            if file_size == 0:
                # An empty body is not retried; the staging file is removed in `finally`.
                return {"success": False, "size": 0, "error": "Empty file received"}

            # Publish the complete file in one step.
            part_path.replace(dest_path)
            return {"success": True, "size": file_size, "error": None}

        except requests.exceptions.RequestException as e:
            logger.warning(f"Attempt {attempt + 1}/{max_retries} failed for attachment {attachment_id}: {e}")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)
            else:
                return {"success": False, "size": 0, "error": str(e)}
        finally:
            # Remove any leftover staging file (failed, empty or interrupted transfer).
            if part_path.exists():
                part_path.unlink()

    return {"success": False, "size": 0, "error": "Max retries exceeded"}


def download_comment_letters_zip(eis_id: str, dest_dir: Path, max_retries: int = 3) -> dict:
    """
    Download all comment letters for a project as ZIP and extract.

    The archive is buffered in memory, then every non-directory member is
    written flat into ``dest_dir`` as ``{eis_id}_{sanitized member basename}``
    (directory components inside the archive are discarded). Existing files
    with the same name are overwritten.

    Args:
        eis_id: EIS ID (as string); sent as the ``eisId`` query parameter and
            used as the filename prefix.
        dest_dir: Directory to extract files to (created if missing).
        max_retries: Number of attempts; failed requests are retried with
            exponential back-off. An invalid ZIP is not retried.

    Returns:
        Dict with keys ``success`` (bool), ``files`` (list of
        ``{'filename', 'size'}`` dicts for extracted members) and ``error``
        (message string, or None on success).
    """
    params = {"eisId": eis_id}
    copy_chunk_size = 1024 * 1024  # bytes copied per read when extracting a member

    for attempt in range(max_retries):
        try:
            # Read ZIP into memory. The context manager closes the response
            # even when raise_for_status() or the streaming loop raises.
            zip_content = io.BytesIO()
            with requests.get(
                ZIP_COMMENT_ENDPOINT,
                params=params,
                timeout=300,
                stream=True
            ) as response:
                response.raise_for_status()
                for chunk in response.iter_content(chunk_size=8192):
                    zip_content.write(chunk)
            zip_content.seek(0)

            # Extract files
            dest_dir.mkdir(parents=True, exist_ok=True)
            extracted_files = []

            with zipfile.ZipFile(zip_content) as zf:
                for member in zf.namelist():
                    if member.endswith('/'):  # Skip directories
                        continue

                    # Build output filename with EIS ID prefix
                    original_name = Path(member).name
                    local_name = build_local_filename(eis_id, original_name)
                    dest_path = dest_dir / local_name

                    # Copy in bounded chunks instead of loading the whole member at once.
                    with zf.open(member) as src, open(dest_path, 'wb') as dst:
                        while True:
                            block = src.read(copy_chunk_size)
                            if not block:
                                break
                            dst.write(block)

                    extracted_files.append({
                        'filename': local_name,
                        'size': dest_path.stat().st_size
                    })

            return {
                "success": True,
                "files": extracted_files,
                "error": None
            }

        except requests.exceptions.RequestException as e:
            logger.warning(f"Attempt {attempt + 1}/{max_retries} failed for EIS {eis_id}: {e}")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)
            else:
                return {"success": False, "files": [], "error": str(e)}
        except zipfile.BadZipFile as e:
            return {"success": False, "files": [], "error": f"Invalid ZIP: {e}"}

    return {"success": False, "files": [], "error": "Max retries exceeded"}

## Load Comment Letter Records

In [None]:
def load_comment_letter_records():
    """
    Load the comment letter records as a DataFrame.

    Sources are tried in order: the dedicated pickle, the dedicated parquet
    file, then the main document records filtered to rows whose ``type`` is
    ``'Comment_Letter'``.

    Raises:
        FileNotFoundError: If none of the three input files exists.
    """
    # Dedicated comment letter files take precedence.
    if COMMENT_LETTER_PKL.exists():
        return pd.read_pickle(COMMENT_LETTER_PKL)
    if COMMENT_LETTER_PARQUET.exists():
        return pd.read_parquet(COMMENT_LETTER_PARQUET)

    if not DOC_RECORD_PKL.exists():
        raise FileNotFoundError(
            "Comment letter records not found.\n"
            "Run fetch_comment_letters_api.ipynb or fetch_eis_records_api.ipynb first."
        )

    # Fall back to the main document records, keeping only comment letters.
    all_documents = pd.read_pickle(DOC_RECORD_PKL)
    is_comment_letter = all_documents['type'] == 'Comment_Letter'
    return all_documents[is_comment_letter].copy()


def load_download_status():
    """
    Return the saved download-status table.

    When no status file exists yet, an empty DataFrame with the status
    columns is returned instead.
    """
    if not DOWNLOAD_STATUS_FILE.exists():
        status_columns = ['attachmentId', 'eisId', 'filename', 'downloaded', 'size', 'error', 'timestamp']
        return pd.DataFrame(columns=status_columns)
    return pd.read_pickle(DOWNLOAD_STATUS_FILE)


def save_download_status(status_df: pd.DataFrame):
    """
    Persist the download-status table.

    Writes the pickle at DOWNLOAD_STATUS_FILE and a CSV copy (without the
    index) alongside it in METADATA_DIR.
    """
    csv_copy = METADATA_DIR / "comment_letter_download_status.csv"
    status_df.to_pickle(DOWNLOAD_STATUS_FILE)
    status_df.to_csv(csv_copy, index=False)

In [None]:
# Load comment letter records
comment_df = load_comment_letter_records()
print(f"Loaded {len(comment_df)} comment letter records")

# Derive the year from the first four characters of ceqNumber (format: YYYYNNNN)
comment_df['year'] = comment_df['ceqNumber'].astype(str).str.slice(stop=4)

# Per-year record counts, in chronological order
print("\nComment letters by year:")
print(comment_df['year'].value_counts().sort_index())

## Prepare Download Queue

In [None]:
def prepare_download_queue(df: pd.DataFrame, overwrite: bool = False, 
                           year_filter: list = None) -> pd.DataFrame:
    """
    Prepare the download queue.

    Works on a copy of ``df`` and adds ``localFilename`` / ``localPath``
    columns. Unless ``overwrite`` is set, an ``exists`` column is added too
    and rows whose local file already exists are dropped. Rows without an
    ``attachmentId`` are dropped last.

    Args:
        df: Comment letter records; must provide ``ceqNumber`` and
            ``attachmentId``, and may provide ``name`` / ``fileNameForDownload``.
        overwrite: When True, keep rows even if the local file already exists.
        year_filter: Years (int or str) to keep; None or empty keeps all.

    Returns:
        The filtered queue as a new DataFrame (possibly empty).
    """
    def _original_name(row) -> str:
        # First usable filename column, else "<attachmentId>.pdf". Only
        # non-empty strings count: a NaN cell is truthy and would otherwise
        # slip past an `or` chain and break the regex-based sanitiser.
        for column in ('name', 'fileNameForDownload'):
            candidate = row.get(column)
            if isinstance(candidate, str) and candidate:
                return candidate
        return f"{row['attachmentId']}.pdf"

    queue = df.copy()

    # Ensure year column exists - use ceqNumber for year extraction
    if 'year' not in queue.columns:
        queue['year'] = queue['ceqNumber'].astype(str).str[:4]

    # Apply year filter
    if year_filter:
        year_filter_str = [str(y) for y in year_filter]
        # .copy() so the column assignments below act on an independent frame.
        queue = queue[queue['year'].isin(year_filter_str)].copy()
        logger.info(f"Filtered to years {year_filter}: {len(queue)} letters")

    # Build local filenames using ceqNumber as prefix (matches existing convention).
    # A plain loop is used because DataFrame.apply(axis=1) on an empty frame can
    # return a DataFrame instead of a Series, which breaks the column assignment.
    queue['localFilename'] = [
        build_local_filename(row['ceqNumber'], _original_name(row))
        for _, row in queue.iterrows()
    ]

    # Build full paths (flat directory, not by year)
    queue['localPath'] = [COMMENT_LETTERS_DIR / name for name in queue['localFilename']]

    # Check for existing files
    if not overwrite:
        # Explicit bool dtype keeps the mask a valid boolean indexer even when empty.
        exists_mask = pd.Series(
            [path.exists() for path in queue['localPath']], dtype=bool
        ).to_numpy()
        queue['exists'] = exists_mask
        existing_count = int(exists_mask.sum())
        logger.info(f"Found {existing_count} existing files")
        queue = queue[~exists_mask]
        logger.info(f"Queue after removing existing: {len(queue)} letters")

    # Remove rows with missing attachment IDs
    queue = queue[queue['attachmentId'].notna()]

    return queue

In [None]:
# Prepare download queue from the loaded records and the settings above
download_queue = prepare_download_queue(comment_df, overwrite=OVERWRITE, year_filter=YEAR_FILTER)

# Apply max downloads limit (skipped when MAX_DOWNLOADS is None or 0)
if MAX_DOWNLOADS:
    if len(download_queue) > MAX_DOWNLOADS:
        download_queue = download_queue.head(MAX_DOWNLOADS)
        logger.info(f"Limited to {MAX_DOWNLOADS} downloads")

print(f"\nDownload queue: {len(download_queue)} files")
if len(download_queue) > 0:
    # Per-year breakdown of what is about to be downloaded
    print("\nBy year:")
    print(download_queue['year'].value_counts().sort_index())

## Execute Downloads

In [None]:
def download_individual(queue: pd.DataFrame) -> pd.DataFrame:
    """
    Download every queued file with one request per ``attachmentId``.

    Each row of ``queue`` must provide ``attachmentId``, ``eisId``,
    ``localFilename`` and ``localPath``. Returns one status record per row
    (attachmentId, eisId, filename, downloaded, size, error, timestamp).
    """
    # Make sure the target directory is there before the first download.
    COMMENT_LETTERS_DIR.mkdir(parents=True, exist_ok=True)

    records = []
    progress = tqdm(queue.iterrows(), total=len(queue), desc="Downloading")
    for _, letter in progress:
        attachment_id = letter['attachmentId']
        outcome = download_attachment(int(attachment_id), letter['localPath'])

        record = {
            'attachmentId': attachment_id,
            'eisId': letter['eisId'],
            'filename': letter['localFilename'],
            'downloaded': outcome['success'],
            'size': outcome['size'],
            'error': outcome['error'],
        }
        record['timestamp'] = datetime.now().isoformat()
        records.append(record)

        # Rate limiting between consecutive requests
        time.sleep(REQUEST_DELAY)

    return pd.DataFrame(records)


def download_via_zip(queue: pd.DataFrame) -> pd.DataFrame:
    """
    Download files using ZIP endpoint (one ZIP per project).

    The ZIP helper extracts files as ``{eisId}_{name}``, while the download
    queue, the skip-existing check and the verification step all use
    ``{ceqNumber}_{name}``. When the queue maps an EIS ID to exactly one
    ``ceqNumber``, the extracted files are therefore renamed to the
    ceqNumber prefix; otherwise the EIS-ID-prefixed names are kept and a
    warning is logged.
    NOTE(review): this assumes the ceqNumber prefix is the intended on-disk
    convention for ZIP downloads as well - confirm.

    Args:
        queue: Download queue with ``eisId``, ``attachmentId`` and
            ``localFilename`` columns (``ceqNumber`` is used when present).

    Returns:
        DataFrame of status records with columns attachmentId, eisId,
        filename, downloaded, size, error, timestamp. The columns are present
        even when no record was produced.
    """
    status_columns = ['attachmentId', 'eisId', 'filename', 'downloaded', 'size', 'error', 'timestamp']
    results = []

    # Get unique EIS IDs; missing IDs cannot be requested, so they are skipped.
    unique_eis_ids = queue['eisId'].dropna().unique()
    logger.info(f"Downloading ZIPs for {len(unique_eis_ids)} projects")

    for eis_id in tqdm(unique_eis_ids, desc="Downloading ZIPs"):
        eis_key = str(eis_id)
        eis_files = queue[queue['eisId'] == eis_id]
        result = download_comment_letters_zip(eis_key, COMMENT_LETTERS_DIR)

        if result['success']:
            # Determine the ceqNumber prefix for this project, if unambiguous.
            ceq_prefix = None
            if 'ceqNumber' in eis_files.columns:
                ceq_numbers = eis_files['ceqNumber'].dropna().unique()
                if len(ceq_numbers) == 1:
                    ceq_prefix = str(ceq_numbers[0])
            if ceq_prefix is None:
                logger.warning(
                    f"No unique ceqNumber for EIS {eis_id}; keeping EIS-ID-prefixed filenames"
                )

            eis_prefix = f"{eis_key}_"
            for file_info in result['files']:
                filename = file_info['filename']
                if ceq_prefix is not None and filename.startswith(eis_prefix):
                    renamed = f"{ceq_prefix}_{filename[len(eis_prefix):]}"
                    source = COMMENT_LETTERS_DIR / filename
                    # The source may already be gone when two archive members
                    # shared a basename and were extracted to the same path.
                    if source.exists():
                        source.replace(COMMENT_LETTERS_DIR / renamed)
                    filename = renamed

                results.append({
                    'attachmentId': None,  # Not tracked in ZIP method
                    'eisId': eis_id,
                    'filename': filename,
                    'downloaded': True,
                    'size': file_info['size'],
                    'error': None,
                    'timestamp': datetime.now().isoformat()
                })
        else:
            # Record failure for all expected files from this EIS
            for _, row in eis_files.iterrows():
                results.append({
                    'attachmentId': row['attachmentId'],
                    'eisId': eis_id,
                    'filename': row['localFilename'],
                    'downloaded': False,
                    'size': 0,
                    'error': result['error'],
                    'timestamp': datetime.now().isoformat()
                })

        time.sleep(REQUEST_DELAY)

    # Explicit columns so callers can index the result even when it is empty.
    return pd.DataFrame(results, columns=status_columns)

In [None]:
# Run downloads
if len(download_queue) == 0:
    print("No files to download. All comment letters already exist or queue is empty.")
else:
    print(f"Starting download of {len(download_queue)} comment letters...")
    print(f"Method: {DOWNLOAD_METHOD}")
    print(f"Destination: {COMMENT_LETTERS_DIR}")

    # "zip" selects the per-project ZIP path; anything else downloads file by file.
    run_download = download_via_zip if DOWNLOAD_METHOD == "zip" else download_individual
    download_results = run_download(download_queue)

    # Append this run's records to the saved status
    existing_status = load_download_status()
    combined_status = pd.concat([existing_status, download_results], ignore_index=True)

    # Keep only the latest record per filename
    if 'attachmentId' in combined_status.columns:
        combined_status = combined_status.drop_duplicates(subset=['filename'], keep='last')

    save_download_status(combined_status)

    # Summary of this run only
    success_count = download_results['downloaded'].sum()
    fail_count = len(download_results) - success_count
    total_size = download_results['size'].sum()

    print("\n=== Download Summary ===")
    print(f"Successful: {success_count}")
    print(f"Failed: {fail_count}")
    print(f"Total size: {total_size / (1024**2):.2f} MB")

    if fail_count > 0:
        print("\nFailed downloads:")
        failed_rows = download_results[~download_results['downloaded']]
        display(failed_rows[['eisId', 'filename', 'error']])

## Verify Downloads

In [None]:
def verify_downloads():
    """
    Verify downloaded comment letters against expected records.

    Reloads the comment letter records, derives the expected local filename
    (``{ceqNumber}_{sanitized name}``) and path for each, checks whether that
    file exists in COMMENT_LETTERS_DIR, and prints overall and per-year
    completeness.

    Returns:
        The records DataFrame with ``expectedFilename``, ``expectedPath``,
        ``exists`` and ``year`` columns added.
    """
    def _original_name(row) -> str:
        # First usable filename column, else "<attachmentId>.pdf". Only
        # non-empty strings count: a NaN cell is truthy and would otherwise
        # slip past an `or` chain and break the regex-based sanitiser.
        for column in ('name', 'fileNameForDownload'):
            candidate = row.get(column)
            if isinstance(candidate, str) and candidate:
                return candidate
        return f"{row['attachmentId']}.pdf"

    comment_df_full = load_comment_letter_records()

    # Build expected filenames using ceqNumber as prefix. A plain loop is used
    # because DataFrame.apply(axis=1) on an empty frame can return a DataFrame
    # instead of a Series, which breaks the column assignment.
    comment_df_full['expectedFilename'] = [
        build_local_filename(row['ceqNumber'], _original_name(row))
        for _, row in comment_df_full.iterrows()
    ]
    comment_df_full['expectedPath'] = [
        COMMENT_LETTERS_DIR / name for name in comment_df_full['expectedFilename']
    ]

    # Check existence (explicit bool dtype so the sums below work on an empty frame)
    comment_df_full['exists'] = pd.Series(
        [path.exists() for path in comment_df_full['expectedPath']], dtype=bool
    ).to_numpy()

    total = len(comment_df_full)
    existing = int(comment_df_full['exists'].sum())
    missing = total - existing

    # Guard the percentages against an empty record set.
    pct_existing = 100 * existing / total if total else 0.0
    pct_missing = 100 * missing / total if total else 0.0

    print("=== Comment Letter Verification ===")
    print(f"Total expected: {total}")
    print(f"Existing files: {existing} ({pct_existing:.1f}%)")
    print(f"Missing files: {missing} ({pct_missing:.1f}%)")

    # By year - use ceqNumber for year extraction
    comment_df_full['year'] = comment_df_full['ceqNumber'].astype(str).str[:4]
    print("\nBy year:")
    summary = comment_df_full.groupby('year').agg(
        total=('exists', 'count'),
        existing=('exists', 'sum')
    )
    summary['missing'] = summary['total'] - summary['existing']
    summary['pct_complete'] = (100 * summary['existing'] / summary['total']).round(1)
    display(summary)

    return comment_df_full

# Run the verification now and keep the annotated records frame for inspection.
verification_df = verify_downloads()

## Retry Failed Downloads

In [None]:
def retry_failed_downloads():
    """
    Retry any previously failed downloads.

    Reads the saved download status, selects the records not marked as
    downloaded, rebuilds a queue for them from the comment letter records
    (matched by ``attachmentId`` when any failed record has one, otherwise
    by ``eisId``), downloads them individually, and replaces the matching
    status rows with the new results.

    Returns:
        None. Progress and the outcome are printed.
    """
    def _original_name(row) -> str:
        # First usable filename column, else "<attachmentId>.pdf". Only
        # non-empty strings count: a NaN cell is truthy and would otherwise
        # slip past an `or` chain and break the regex-based sanitiser.
        for column in ('name', 'fileNameForDownload'):
            candidate = row.get(column)
            if isinstance(candidate, str) and candidate:
                return candidate
        return f"{row['attachmentId']}.pdf"

    status_df = load_download_status()
    # Compare explicitly instead of using `~`: on an object-dtype column `~`
    # inverts each Python bool as an integer and does not give a boolean mask.
    # Missing values count as "not downloaded".
    downloaded = status_df['downloaded'].eq(True)
    failed = status_df[~downloaded]

    if len(failed) == 0:
        print("No failed downloads to retry.")
        return

    print(f"Retrying {len(failed)} failed downloads...")

    # Rebuild queue
    comment_df_full = load_comment_letter_records()

    if 'attachmentId' in failed.columns and failed['attachmentId'].notna().any():
        failed_ids = failed['attachmentId'].dropna()
        retry_queue = comment_df_full[comment_df_full['attachmentId'].isin(failed_ids)].copy()
    else:
        # Match by eisId + filename pattern
        retry_queue = comment_df_full[comment_df_full['eisId'].isin(failed['eisId'])].copy()

    # Individual downloads need an attachment ID; drop records without one.
    retry_queue = retry_queue[retry_queue['attachmentId'].notna()].copy()

    if len(retry_queue) == 0:
        print("No matching comment letter records found for the failed downloads.")
        return

    # Build filenames using ceqNumber as prefix
    retry_queue['localFilename'] = [
        build_local_filename(row['ceqNumber'], _original_name(row))
        for _, row in retry_queue.iterrows()
    ]
    retry_queue['localPath'] = [COMMENT_LETTERS_DIR / name for name in retry_queue['localFilename']]

    # Download
    retry_results = download_individual(retry_queue)

    # Update status: drop the rows being replaced, then append the new results
    status_df = status_df[~status_df['filename'].isin(retry_results['filename'])]
    combined_status = pd.concat([status_df, retry_results], ignore_index=True)
    save_download_status(combined_status)

    success_count = retry_results['downloaded'].sum()
    print(f"Retry complete: {success_count}/{len(retry_results)} successful")

# Uncomment to retry failed downloads:
# retry_failed_downloads()

## List Downloaded Files

In [None]:
# List files in comment_letters directory
if not COMMENT_LETTERS_DIR.exists():
    print(f"Directory does not exist yet: {COMMENT_LETTERS_DIR}")
else:
    files = list(COMMENT_LETTERS_DIR.iterdir())
    # Case-insensitive match on the .pdf extension
    pdf_files = [entry for entry in files if entry.suffix.lower() == '.pdf']

    print(f"Files in {COMMENT_LETTERS_DIR}:")
    print(f"  Total files: {len(files)}")
    print(f"  PDF files: {len(pdf_files)}")

    if pdf_files:
        total_size = sum(pdf.stat().st_size for pdf in pdf_files)
        print(f"  Total size: {total_size / (1024**2):.2f} MB")

        # First ten PDFs in sorted order, with their size in KB
        print("\nSample files:")
        for f in sorted(pdf_files)[:10]:
            size_kb = f.stat().st_size / 1024
            print(f"  {f.name} ({size_kb:.1f} KB)")