# Extract EPA Comment Letter Ratings

EPA comment letters prior to October 2018 included a rating system:

## Rating System

### Environmental Impact (Letters)
- **LO** (Lack of Objections): No potential environmental impacts requiring changes
- **EC** (Environmental Concerns): Environmental impacts that should be avoided
- **EO** (Environmental Objections): Significant environmental impacts that must be avoided
- **EU** (Environmentally Unsatisfactory): Unsatisfactory due to potential for significant harm

### Adequacy of Information (Numbers)
- **1** (Adequate): EIS adequately sets forth environmental impacts
- **2** (Insufficient Information): Draft EIS lacks sufficient information
- **3** (Inadequate): Draft EIS does not meet NEPA/Section 309 requirements

### Combined Rating
Example: **EC-2** = Environmental Concerns + Insufficient Information

## Output
CSV file (`metadata/comment_letter_ratings.csv`) with columns: filename, eisId, combined_rating, letter_rating, number_rating

In [None]:
# Install required packages if needed
# !pip install pypdf2 pdfplumber pandas tqdm

In [None]:
import pandas as pd
import re
import os
from pathlib import Path
from datetime import datetime
from tqdm.notebook import tqdm
import logging

# Select a PDF backend: pdfplumber is preferred, PyPDF2 is the fallback.
# PDF_LIBRARY stays None when neither package can be imported.
PDF_LIBRARY = None

try:
    import pdfplumber
except ImportError:
    pass
else:
    PDF_LIBRARY = "pdfplumber"

if PDF_LIBRARY is None:
    try:
        from PyPDF2 import PdfReader
    except ImportError:
        print("WARNING: No PDF library found. Install pdfplumber or pypdf2:")
        print("  pip install pdfplumber")
        print("  pip install pypdf2")
    else:
        PDF_LIBRARY = "pypdf2"

print(f"Using PDF library: {PDF_LIBRARY}")

# Timestamped INFO-level logging for the helper functions in this notebook
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [None]:
# Configuration: every path is derived from the repository root, which is
# taken to be the parent of the current working directory.
REPO_ROOT = Path("../").resolve()
METADATA_DIR = REPO_ROOT.joinpath("metadata")
DOCUMENTS_DIR = REPO_ROOT.joinpath("documents")
COMMENT_LETTERS_DIR = DOCUMENTS_DIR.joinpath("comment_letters")

# Input files (pickled metadata tables)
COMMENT_LETTER_PKL = METADATA_DIR.joinpath("comment_letter_record_api.pkl")
DOC_RECORD_PKL = METADATA_DIR.joinpath("eis_document_record_api.pkl")
EIS_RECORD_PKL = METADATA_DIR.joinpath("eis_record_api.pkl")

# Legacy R data files (fallback)
EIS_RECORD_RDS = METADATA_DIR.joinpath("eis_record_detail.rds")
DOC_RECORD_RDS = METADATA_DIR.joinpath("eis_document_record.rds")

# Output file
OUTPUT_FILE = METADATA_DIR.joinpath("comment_letter_ratings.csv")

# Cutoff date for ratings (October 2018): letters dated before this are kept
RATING_CUTOFF_DATE = datetime(year=2018, month=10, day=1)

print(f"Repository root: {REPO_ROOT}")
print(f"Comment letters directory: {COMMENT_LETTERS_DIR}")
print(f"Output file: {OUTPUT_FILE}")
print(f"Rating cutoff: {RATING_CUTOFF_DATE.strftime('%B %Y')}")

## Rating Extraction Functions

In [None]:
# Valid rating components
LETTER_RATINGS = ['LO', 'EC', 'EO', 'EU']
NUMBER_RATINGS = ['1', '2', '3']

# Regex patterns for finding ratings (all are case-insensitive)

# Pattern 1: Combined rating like "EC-2", "LO-1", etc.
# A hyphen, en dash or em dash between the two parts is required.
# group(1) = letter code, group(2) = adequacy number.
COMBINED_PATTERN = re.compile(
    r'\b(LO|EC|EO|EU)\s*[-–—]\s*([123])\b',
    re.IGNORECASE
)

# Pattern 2: Rating in context like "Rating: EC-2" or "rated EC-2"
# group(1) = letter code, group(2) = adequacy number or None.
# The keyword must start on a word boundary and the letter code must not be
# followed by another letter; otherwise ordinary prose such as
# "separate economic" or "rating locations" is read as a rating. The number
# must not be followed by another digit, so "rated EC 2018" yields no number.
RATING_CONTEXT_PATTERN = re.compile(
    r'\b(?:rating|rated|rate)\s*[:.]?\s*(LO|EC|EO|EU)(?![A-Za-z])'
    r'(?:\s*[-–—]?\s*([123])(?!\d))?',
    re.IGNORECASE
)

# Pattern 3: Spelled out ratings, keyed by the letter code they stand for
SPELLED_OUT_PATTERNS = {
    'LO': re.compile(r'lack\s+of\s+objections?', re.IGNORECASE),
    'EC': re.compile(r'environmental\s+concerns?', re.IGNORECASE),
    'EO': re.compile(r'environmental\s+objections?', re.IGNORECASE),
    'EU': re.compile(r'environmentally\s+unsatisfactory', re.IGNORECASE),
}

# Pattern for number ratings in context: either "category N" / "adequacy N"
# or the spelled-out adequacy wording, keyed by the number they stand for
NUMBER_CONTEXT_PATTERNS = {
    '1': re.compile(r'\b(?:category|adequacy)\s*[-:]?\s*1\b|\badequate\b', re.IGNORECASE),
    '2': re.compile(r'\b(?:category|adequacy)\s*[-:]?\s*2\b|\binsufficient\s+information\b', re.IGNORECASE),
    '3': re.compile(r'\b(?:category|adequacy)\s*[-:]?\s*3\b|\binadequate\b', re.IGNORECASE),
}

In [None]:
def extract_text_from_pdf(pdf_path: Path, max_pages: int = 10) -> str:
    """
    Read the text of the leading pages of a PDF.

    The backend named by PDF_LIBRARY does the reading. Pages that yield no
    text are skipped; each remaining page's text is followed by a newline.

    Args:
        pdf_path: Path to PDF file
        max_pages: Maximum number of pages to read (ratings are usually on first few pages)

    Returns:
        The concatenated page text, or "" when no backend is available or
        the file cannot be read (a warning is logged in both cases).
    """
    if PDF_LIBRARY not in ("pdfplumber", "pypdf2"):
        logger.warning("No PDF library available")
        return ""

    chunks = []

    try:
        if PDF_LIBRARY == "pdfplumber":
            with pdfplumber.open(pdf_path) as document:
                for page in document.pages[:max_pages]:
                    content = page.extract_text()
                    if content:
                        chunks.append(content + "\n")
        else:
            # Only "pypdf2" can reach this branch because of the guard above
            with open(pdf_path, 'rb') as handle:
                for page in PdfReader(handle).pages[:max_pages]:
                    content = page.extract_text()
                    if content:
                        chunks.append(content + "\n")
    except Exception as e:
        # Best effort: an unreadable file is reported and treated as empty
        logger.warning(f"Error reading {pdf_path.name}: {e}")
        return ""

    return "".join(chunks)


def extract_rating(text: str) -> dict:
    """
    Extract EPA rating from text.

    Three strategies are tried in order; the first one that succeeds wins:
      1. a combined code such as "EC-2" (COMBINED_PATTERN);
      2. a letter code following "rating"/"rated"/"rate", optionally with a
         number (RATING_CONTEXT_PATTERN);
      3. a spelled-out category (SPELLED_OUT_PATTERNS) with "rating", "epa"
         or "category" within 100 characters of it, paired with the first
         matching entry of NUMBER_CONTEXT_PATTERNS, if any.

    Args:
        text: Extracted PDF text

    Returns:
        Dict with combined_rating, letter_rating, number_rating and
        extraction_method ('combined_pattern', 'context_pattern' or
        'spelled_out'). All four values are None when nothing was found.
        combined_rating is the bare letter when no number was found.
    """
    result = {
        'combined_rating': None,
        'letter_rating': None,
        'number_rating': None,
        'extraction_method': None
    }

    if not text:
        return result

    # Method 1: Look for combined rating pattern (most reliable)
    combined_match = COMBINED_PATTERN.search(text)
    if combined_match:
        letter = combined_match.group(1).upper()
        number = combined_match.group(2)
        result['letter_rating'] = letter
        result['number_rating'] = number
        result['combined_rating'] = f"{letter}-{number}"
        result['extraction_method'] = 'combined_pattern'
        return result

    # Method 2: Look for rating in context
    context_match = RATING_CONTEXT_PATTERN.search(text)
    if context_match:
        letter = context_match.group(1).upper()
        number = context_match.group(2) or None
        result['letter_rating'] = letter
        result['number_rating'] = number
        result['combined_rating'] = f"{letter}-{number}" if number else letter
        result['extraction_method'] = 'context_pattern'
        return result

    # Method 3: Look for spelled-out ratings
    context_keywords = ('rating', 'epa', 'category')
    found_letter = None

    for letter, pattern in SPELLED_OUT_PATTERNS.items():
        # Examine every occurrence, not just the first: a phrase may appear
        # in general prose before it appears next to the actual rating.
        for match in pattern.finditer(text):
            # Surrounding context: 100 chars before/after the phrase
            window = text[max(0, match.start() - 100):match.end() + 100].lower()
            if any(keyword in window for keyword in context_keywords):
                found_letter = letter
                break
        if found_letter:
            break

    if not found_letter:
        return result

    # The number is taken from anywhere in the text; first pattern to hit wins
    found_number = next(
        (number for number, pattern in NUMBER_CONTEXT_PATTERNS.items() if pattern.search(text)),
        None,
    )

    result['letter_rating'] = found_letter
    result['number_rating'] = found_number
    result['combined_rating'] = f"{found_letter}-{found_number}" if found_number else found_letter
    result['extraction_method'] = 'spelled_out'

    return result

## Load Comment Letter Metadata

In [None]:
def load_comment_letter_metadata():
    """
    Return comment letter metadata from the first pickle that exists.

    Sources are tried in this order: the comment letter table, the document
    table restricted to rows whose 'type' is 'Comment_Letter', and finally
    the EIS record table, which is returned unfiltered.

    Raises:
        FileNotFoundError: if none of the three pickle files exists.
    """
    if COMMENT_LETTER_PKL.exists():
        # Try API-generated files first
        df = pd.read_pickle(COMMENT_LETTER_PKL)
        logger.info(f"Loaded {len(df)} records from {COMMENT_LETTER_PKL.name}")
    elif DOC_RECORD_PKL.exists():
        df = pd.read_pickle(DOC_RECORD_PKL)
        df = df.loc[df['type'] == 'Comment_Letter'].copy()
        logger.info(f"Loaded {len(df)} comment letters from {DOC_RECORD_PKL.name}")
    elif EIS_RECORD_PKL.exists():
        # Last resort: EIS records (which may have commentLetterDate)
        df = pd.read_pickle(EIS_RECORD_PKL)
        logger.info(f"Loaded EIS records from {EIS_RECORD_PKL.name}")
    else:
        raise FileNotFoundError(
            "No metadata files found. Run fetch_eis_records_api.ipynb first."
        )

    return df


def get_eis_metadata():
    """
    Return the pickled EIS record table, or None when the file is absent.
    """
    if not EIS_RECORD_PKL.exists():
        return None
    return pd.read_pickle(EIS_RECORD_PKL)

In [None]:
# Load the comment letter table and the (optional) EIS record table
comment_df = load_comment_letter_metadata()
eis_df = get_eis_metadata()

# Report how many records were loaded and which columns they carry
print(
    f"Total comment letter records: {len(comment_df)}",
    f"\nColumns: {list(comment_df.columns)}",
    sep="\n",
)

In [None]:
def filter_pre_october_2018(df: pd.DataFrame, eis_df: pd.DataFrame = None) -> pd.DataFrame:
    """
    Filter to comment letters before October 2018.

    Each row is classified by the first source that yields a usable date:
      1. the row's own ``commentLetterDate``, when the column exists;
      2. the earliest ``commentLetterDate`` recorded for the row's ``eisId``
         in ``eis_df``;
      3. the year in the first four characters of ``eisId``. Rows with
         year <= 2018 are kept, because a 2018 ID cannot be placed on either
         side of the cutoff. Rows whose prefix is not a number are dropped.

    Args:
        df: Comment letter records; must contain an ``eisId`` column.
        eis_df: Optional EIS records used as a secondary date source. Only
            consulted when it has a ``commentLetterDate`` column.

    Returns:
        A copy of the qualifying rows of ``df``. Columns and index labels
        are those of ``df``; no helper columns are added or removed, and
        each input row appears at most once.
    """
    keep = pd.Series(False, index=df.index, dtype=bool)
    undecided = pd.Series(True, index=df.index, dtype=bool)

    # 1. The letter's own date
    if 'commentLetterDate' in df.columns:
        own_dates = pd.to_datetime(df['commentLetterDate'], errors='coerce')
        dated = own_dates.notna()
        keep = keep | (dated & (own_dates < RATING_CUTOFF_DATE))
        undecided = undecided & ~dated

    # 2. Date from the EIS record. A lookup keyed by eisId is used instead of
    #    a merge so that repeated eisIds in eis_df cannot duplicate letter
    #    rows. Taking the earliest date keeps a letter whenever any of its
    #    EIS dates falls before the cutoff.
    if undecided.any() and eis_df is not None and 'commentLetterDate' in eis_df.columns:
        eis_dates = pd.to_datetime(eis_df['commentLetterDate'], errors='coerce')
        earliest = eis_dates.groupby(eis_df['eisId']).min()
        # Re-parse so an all-missing lookup still has a datetime dtype
        looked_up = pd.to_datetime(df['eisId'].map(earliest), errors='coerce')
        dated = undecided & looked_up.notna()
        keep = keep | (dated & (looked_up < RATING_CUTOFF_DATE))
        undecided = undecided & ~dated

    # 3. Year prefix of the EIS ID. Unparseable prefixes become NaN, which
    #    compares False below, rather than raising on the whole column.
    if undecided.any():
        years = pd.to_numeric(df['eisId'].astype(str).str[:4], errors='coerce')
        keep = keep | (undecided & (years <= 2018))

    # Positional mask, so duplicate index labels cannot cause misalignment
    return df.loc[keep.to_numpy()].copy()

In [None]:
# Keep only the letters that fall before the rating cutoff
pre_2018_df = filter_pre_october_2018(comment_df, eis_df)
print(f"Comment letters pre-October 2018: {len(pre_2018_df)}")

# Year distribution, taken from the first four characters of the EIS ID
pre_2018_df['year'] = pre_2018_df['eisId'].astype(str).str[:4]
print(
    f"\nBy year:",
    pre_2018_df['year'].value_counts().sort_index(),
    sep="\n",
)

## Find Comment Letter Files

In [None]:
def sanitize_filename(filename: str) -> str:
    """
    Sanitize filename to match download convention.

    Applied in order: strip the characters ``( ) & , ~ /``, collapse runs of
    whitespace/underscores into one underscore, lower-case a trailing
    ``.pdf`` extension, collapse a doubled ``.pdf.pdf`` ending, and finally
    trim leading/trailing underscores.
    """
    substitutions = (
        (r'[()&,~\/]', '', 0),
        (r'[\s_]+', '_', 0),
        (r'\.PDF$', '.pdf', re.IGNORECASE),
        (r'\.pdf\.pdf$', '.pdf', re.IGNORECASE),
    )

    cleaned = filename
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)

    return cleaned.strip('_')


def find_comment_letter_files(df: pd.DataFrame) -> pd.DataFrame:
    """
    Find actual PDF files for comment letters.

    Checks both:
    - documents/comment_letters/ (flat structure)
    - documents/{YEAR}/ (by-year structure)

    For every row the expected filename is ``{eisId}_{sanitized name}``,
    where the name is the first non-empty string among ``name`` and
    ``fileNameForDownload``, falling back to ``{attachmentId}.pdf``. A row
    is matched, in order of preference, by: the exact expected name in
    comment_letters/, the exact expected name in a year directory, or the
    first known file whose name starts with ``{eisId}_`` and contains
    "comment" (case-insensitive).

    Args:
        df: Comment letter records; must contain an ``eisId`` column.

    Returns:
        A copy of ``df`` with three added columns: ``file_path`` (str or
        None), ``file_found`` (bool) and ``expected_filename`` (str).
    """
    result = df.copy()

    def build_expected_filename(row):
        # Missing metadata arrives as NaN, which is truthy, so an `or` chain
        # would pass NaN on to sanitize_filename. Accept only real,
        # non-empty strings.
        name = None
        for key in ('name', 'fileNameForDownload'):
            candidate = row.get(key)
            if isinstance(candidate, str) and candidate:
                name = candidate
                break
        if name is None:
            name = f"{row.get('attachmentId', 'unknown')}.pdf"
        return f"{row['eisId']}_{sanitize_filename(name)}"

    # Get existing files from comment_letters directory
    comment_letter_files = {}
    if COMMENT_LETTERS_DIR.exists():
        for f in COMMENT_LETTERS_DIR.iterdir():
            if f.is_file() and f.suffix.lower() == '.pdf':
                comment_letter_files[f.name] = f

    # Get existing files from year directories (all-digit directory names)
    year_dir_files = {}
    if DOCUMENTS_DIR.exists():
        for year_dir in DOCUMENTS_DIR.iterdir():
            if year_dir.is_dir() and year_dir.name.isdigit():
                for f in year_dir.iterdir():
                    if f.is_file() and f.suffix.lower() == '.pdf':
                        year_dir_files[f.name] = f

    logger.info(f"Found {len(comment_letter_files)} files in comment_letters/")
    logger.info(f"Found {len(year_dir_files)} files in year directories")

    # Candidates for the fuzzy fallback, built once instead of per row.
    # Merge order matters: on a name clash the year-directory path wins.
    fuzzy_candidates = [
        (fname, fpath)
        for fname, fpath in {**comment_letter_files, **year_dir_files}.items()
        if 'comment' in fname.lower()
    ]

    expected_names = []
    paths = []
    found = []

    # Match files
    for _, row in result.iterrows():
        expected = build_expected_filename(row)

        # Exact name: comment_letters directory first, then year directories
        match = comment_letter_files.get(expected)
        if match is None:
            match = year_dir_files.get(expected)

        # Fuzzy match (same EIS ID prefix)
        if match is None:
            prefix = f"{str(row['eisId'])}_"
            match = next(
                (fpath for fname, fpath in fuzzy_candidates if fname.startswith(prefix)),
                None,
            )

        expected_names.append(expected)
        paths.append(None if match is None else str(match))
        found.append(match is not None)

    # Explicit dtypes keep the columns well-typed even for an empty frame
    result['file_path'] = pd.Series(paths, index=result.index, dtype=object)
    result['file_found'] = pd.Series(found, index=result.index, dtype=bool)
    result['expected_filename'] = pd.Series(expected_names, index=result.index, dtype=object)

    return result

In [None]:
# Find files
files_df = find_comment_letter_files(pre_2018_df)

found_count = files_df['file_found'].sum()
total_count = len(files_df)
# With no candidate letters there is nothing to divide by; report 0.0%
# instead of failing or printing "nan%".
found_pct = 100 * found_count / total_count if total_count else 0.0

print(f"\n=== File Search Results ===")
print(f"Files found: {found_count} / {total_count} ({found_pct:.1f}%)")
print(f"Files missing: {total_count - found_count}")

## Extract Ratings

In [None]:
def process_comment_letters(df: pd.DataFrame) -> pd.DataFrame:
    """
    Extract a rating from every comment letter whose file was located.

    Rows with a falsy ``file_found`` are skipped. For each remaining row the
    PDF text is read and passed to the rating extractor.

    Returns:
        One row per processed file with the columns filename, eisId,
        combined_rating, letter_rating, number_rating, extraction_method
        and file_path.
    """
    located = df[df['file_found']].copy()
    rating_keys = ('combined_rating', 'letter_rating', 'number_rating', 'extraction_method')

    records = []
    progress = tqdm(located.iterrows(), total=len(located), desc="Extracting ratings")

    for _, row in progress:
        pdf_file = Path(row['file_path'])

        # Read the PDF text, then look for a rating in it
        rating = extract_rating(extract_text_from_pdf(pdf_file))

        # Insertion order fixes the column order of the resulting frame
        record = {'filename': pdf_file.name, 'eisId': row['eisId']}
        record.update((key, rating[key]) for key in rating_keys)
        record['file_path'] = str(pdf_file)
        records.append(record)

    return pd.DataFrame(records)

In [None]:
# Run the extraction only when a PDF backend was imported; otherwise
# ratings_df is left undefined and the later cells skip themselves.
if PDF_LIBRARY is not None:
    ratings_df = process_comment_letters(files_df)
else:
    print("ERROR: No PDF library available. Please install pdfplumber or pypdf2.")

In [None]:
# Summary statistics (skipped when extraction did not run or produced no rows)
if 'ratings_df' in dir() and len(ratings_df) > 0:
    # A rating counts as found when combined_rating is not null
    ratings_found = ratings_df['combined_rating'].notna().sum()

    print(
        f"\n=== Extraction Results ===",
        f"Total files processed: {len(ratings_df)}",
        f"Ratings found: {ratings_found} ({100*ratings_found/len(ratings_df):.1f}%)",
        f"Ratings not found: {len(ratings_df) - ratings_found}",
        sep="\n",
    )

    # Frequency tables; dropna=False keeps the "no value" bucket visible
    print(f"\n=== Rating Distribution ===")
    print(f"\nCombined ratings:", ratings_df['combined_rating'].value_counts(dropna=False), sep="\n")
    print(f"\nLetter ratings:", ratings_df['letter_rating'].value_counts(dropna=False), sep="\n")
    print(f"\nNumber ratings:", ratings_df['number_rating'].value_counts(dropna=False), sep="\n")
    print(f"\nExtraction methods used:", ratings_df['extraction_method'].value_counts(dropna=False), sep="\n")

In [None]:
# Preview the first 20 rows of the output columns
if 'ratings_df' in dir() and len(ratings_df) > 0:
    print("\n=== Sample Results ===")
    display(
        ratings_df.loc[
            :, ['filename', 'eisId', 'combined_rating', 'letter_rating', 'number_rating']
        ].iloc[:20]
    )

## Save Results

In [None]:
# Write two CSVs: the five output columns, and the full table that also
# carries extraction_method and file_path
if 'ratings_df' in dir() and len(ratings_df) > 0:
    # Output columns, in their final order
    output_df = ratings_df.loc[
        :, ['filename', 'eisId', 'combined_rating', 'letter_rating', 'number_rating']
    ].copy()

    output_df.to_csv(OUTPUT_FILE, index=False)
    print(f"Saved {len(output_df)} records to {OUTPUT_FILE}")

    # Full results with extraction metadata
    full_output = METADATA_DIR.joinpath("comment_letter_ratings_full.csv")
    ratings_df.to_csv(full_output, index=False)
    print(f"Saved full results to {full_output}")

## Review Files Without Ratings

These files may need manual review or have different formatting.

In [None]:
# List the processed files for which no rating could be extracted
if 'ratings_df' in dir() and len(ratings_df) > 0:
    no_rating = ratings_df.loc[ratings_df['combined_rating'].isna()]

    if len(no_rating) > 0:
        print(
            f"\n=== Files Without Extracted Ratings ({len(no_rating)}) ===",
            "These may need manual review:",
            sep="\n",
        )
        display(no_rating.loc[:, ['filename', 'eisId', 'file_path']].iloc[:20])

        # Persist the complete list (not just the preview) for manual review
        review_file = METADATA_DIR.joinpath("comment_letters_need_review.csv")
        no_rating.loc[:, ['filename', 'eisId', 'file_path']].to_csv(review_file, index=False)
        print(f"\nSaved list to {review_file}")

## Debug: Test Rating Extraction on Sample File

In [None]:
def test_extraction(file_path: str, show_text: bool = True):
    """
    Run the extraction pipeline on one PDF and print what it finds.

    Args:
        file_path: Location of the PDF to inspect.
        show_text: When True, also print the first 3000 characters of the
            text read from the file's first 5 pages.

    Returns:
        None. A missing file is reported with a message instead of raising.
    """
    path = Path(file_path)
    if not path.exists():
        print(f"File not found: {path}")
        return

    text = extract_text_from_pdf(path, max_pages=5)

    if show_text:
        print(
            "=== Extracted Text (first 3000 chars) ===",
            text[:3000],
            "\n" + "="*50,
            sep="\n",
        )

    rating = extract_rating(text)
    print(
        f"\n=== Extracted Rating ===",
        f"Combined: {rating['combined_rating']}",
        f"Letter: {rating['letter_rating']}",
        f"Number: {rating['number_rating']}",
        f"Method: {rating['extraction_method']}",
        sep="\n",
    )

# Uncomment to test on a specific file:
# test_extraction("/path/to/comment_letter.pdf")