# Predict EPA Comment Letter Ratings Using Transformer Embeddings

This notebook uses transformer models to predict EPA ratings for comment letters published from October 2018 onward (when EPA stopped providing ratings), based on the content of earlier letters that have ratings.

## Approach
1. Load pre-October 2018 letters with known ratings (training data)
2. Load post-October 2018 letters (prediction targets)
3. Preprocess text to extract letter body (excluding headers, footers, salutations, ratings)
4. Generate embeddings using transformer models
5. Train classifiers for letter rating (LO/EC/EO/EU) and number rating (1/2/3)
6. Predict ratings for post-2018 letters

## Model Options
- **sentence-transformers/all-mpnet-base-v2** (default): High-quality general embeddings, fast
- **nlpaueb/legal-bert-base-uncased**: Domain-specific legal text model
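- **joelniklaus/legal-english-longformer-base**: Better for longer documents (note: `get_embeddings` below truncates inputs to 512 tokens, so the longer context window is only used if that limit is raised)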

## References
- [Legal-BERT](https://huggingface.co/nlpaueb/legal-bert-base-uncased)
- [Sentence Transformers](https://www.sbert.net/)
- [Legal English Longformer](https://huggingface.co/joelniklaus/legal-english-longformer-base)

In [None]:
# Install required packages
# !pip install sentence-transformers transformers torch scikit-learn pandas numpy tqdm pdfplumber

In [None]:
import pandas as pd
import numpy as np
import re
import os
import pickle
from pathlib import Path
from datetime import datetime
from tqdm.notebook import tqdm
import logging
import warnings
warnings.filterwarnings('ignore')

# ML imports
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline

# Setup logging: timestamped INFO-level messages for the whole notebook.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Pick the device used for transformer inference: CUDA first, then Apple
# MPS, then CPU.
import torch

# `torch.backends.mps` does not exist on older PyTorch builds, so look it up
# defensively instead of letting an AttributeError abort the setup cell.
_mps_backend = getattr(torch.backends, 'mps', None)
if torch.cuda.is_available():
    DEVICE = 'cuda'
elif _mps_backend is not None and _mps_backend.is_available():
    DEVICE = 'mps'
else:
    DEVICE = 'cpu'
print(f"Using device: {DEVICE}")

In [None]:
# Configuration: repository layout, resolved relative to the notebook's
# working directory (the notebook is expected to run one level below the root).
REPO_ROOT = Path("../").resolve()
METADATA_DIR = REPO_ROOT / "metadata"
DOCUMENTS_DIR = REPO_ROOT / "documents"
COMMENT_LETTERS_DIR = DOCUMENTS_DIR / "comment_letters"

# Input files
RATINGS_FILE = METADATA_DIR / "comment_letter_ratings.csv"
COMMENT_LETTER_PKL = METADATA_DIR / "comment_letter_record_api.pkl"
DOC_RECORD_PKL = METADATA_DIR / "eis_document_record_api.pkl"
EIS_RECORD_PKL = METADATA_DIR / "eis_record_api.pkl"

# Output files
EMBEDDINGS_DIR = METADATA_DIR / "embeddings"
PREDICTED_RATINGS_FILE = METADATA_DIR / "comment_letter_ratings_predicted.csv"
MODEL_DIR = METADATA_DIR / "models"

# Create output directories. parents=True so that a checkout without an
# existing metadata/ directory does not fail with FileNotFoundError here.
EMBEDDINGS_DIR.mkdir(parents=True, exist_ok=True)
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Cutoff date: letters dated on or after this day are prediction targets.
RATING_CUTOFF_DATE = datetime(2018, 10, 1)

print(f"Repository root: {REPO_ROOT}")
print(f"Ratings file: {RATINGS_FILE}")
print(f"Output: {PREDICTED_RATINGS_FILE}")

## Configuration

In [None]:
# ============================================
# MODEL SETTINGS - MODIFY AS NEEDED
# ============================================

# Hugging Face identifier of the embedding model. Alternatives:
#   "sentence-transformers/all-mpnet-base-v2"     recommended, good quality, fast
#   "sentence-transformers/all-MiniLM-L6-v2"      faster, slightly lower quality
#   "nlpaueb/legal-bert-base-uncased"             legal domain-specific
#   "joelniklaus/legal-english-longformer-base"   better for long documents
EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"

# Reuse embeddings pickled by an earlier run when present
# (set to False to regenerate them).
USE_CACHED_EMBEDDINGS = True

# Character cap applied to each preprocessed letter; longer texts are cut
# (10000 characters is roughly 2000 tokens).
# Models have their own token limits: BERT ~512 tokens, Longformer ~4096 tokens.
# Note: get_embeddings() truncates plain-transformers inputs to 512 tokens,
# so text beyond that limit does not reach those models.
MAX_TEXT_LENGTH = 10000

# Classifier to train: "logistic", "svm", "rf", "mlp", "ensemble"
CLASSIFIER_TYPE = "ensemble"

# Number of cross-validation folds
CV_FOLDS = 5

# Fraction of the labelled data held out for the final evaluation
TEST_SIZE = 0.2

print("=== Configuration ===")
print(f"Embedding model: {EMBEDDING_MODEL}")
print(f"Use cached embeddings: {USE_CACHED_EMBEDDINGS}")
print(f"Max text length: {MAX_TEXT_LENGTH}")
print(f"Classifier: {CLASSIFIER_TYPE}")

## Text Preprocessing

Extract the main body of the letter, excluding:
- Headers (EPA letterhead, dates, addresses)
- Footers (page numbers, signatures)
- Salutations and closings ("Dear...", "Sincerely...")
- Rating statements

In [None]:
# PDF text extraction backend.
# Preference order: pdfplumber, then PyPDF2. PDF_LIBRARY records which one was
# imported ("pdfplumber" / "pypdf2"), or None when neither is installed.
PDF_LIBRARY = None
try:
    import pdfplumber
except ImportError:
    try:
        from PyPDF2 import PdfReader
    except ImportError:
        print("WARNING: Install pdfplumber or pypdf2 for PDF extraction")
    else:
        PDF_LIBRARY = "pypdf2"
else:
    PDF_LIBRARY = "pdfplumber"

print(f"PDF library: {PDF_LIBRARY}")

In [None]:
def extract_text_from_pdf(pdf_path: Path, max_pages: int = None) -> str:
    """
    Extract the text layer of a PDF file as one string.

    The backend is whichever library the detection cell recorded in
    ``PDF_LIBRARY`` ("pdfplumber" or "pypdf2"). When no backend is available
    nothing is read and an empty string is returned.

    Args:
        pdf_path: Location of the PDF (a ``Path`` or a path string).
        max_pages: Read at most this many leading pages. ``None`` (or 0)
            reads every page.

    Returns:
        The text of every page that produced any, each followed by a newline.
        Read errors are logged as warnings rather than raised; whatever text
        was collected before the error is still returned.
    """
    # Accept plain strings too, so the error handler below can always use `.name`.
    pdf_path = Path(pdf_path)
    chunks = []

    def _collect(pages):
        # Shared by both backends: gather the non-empty page texts.
        selected = pages[:max_pages] if max_pages else pages
        for page in selected:
            page_text = page.extract_text()
            if page_text:
                chunks.append(page_text + "\n")

    try:
        if PDF_LIBRARY == "pdfplumber":
            with pdfplumber.open(pdf_path) as pdf:
                _collect(pdf.pages)
        elif PDF_LIBRARY == "pypdf2":
            with open(pdf_path, 'rb') as f:
                _collect(PdfReader(f).pages)
    except Exception as e:
        # Best-effort: one unreadable PDF must not abort a batch extraction.
        logger.warning(f"Error reading {pdf_path.name}: {e}")
    return "".join(chunks)


def _compile_any(patterns):
    """Combine line-start patterns into one case-insensitive alternation."""
    return re.compile('|'.join(f'(?:{p})' for p in patterns), re.IGNORECASE)


# Lines matching any of these are dropped wherever they occur.
_HEADER_RE = _compile_any([
    r'^\s*UNITED STATES ENVIRONMENTAL PROTECTION AGENCY',
    r'^\s*U\.?S\.?\s*EPA',
    r'^\s*Environmental Protection Agency',
    r'^\s*Region\s+\d+',
    r'^\s*\d+\s+[A-Z][a-z]+\s+Street',  # Address lines
    r'^\s*[A-Z][a-z]+,\s*[A-Z]{2}\s+\d{5}',  # City, State ZIP
    r'^\s*\d{1,2}/\d{1,2}/\d{2,4}',  # Date formats
    r'^\s*[A-Z][a-z]+\s+\d{1,2},\s*\d{4}',  # Month Day, Year
    r'^\s*Page\s+\d+',
    r'^\s*-\s*\d+\s*-',
])

# A salutation / subject line is dropped and marks the start of the body.
_SALUTATION_RE = _compile_any([
    r'^\s*Dear\s+',
    r'^\s*To\s+Whom\s+It\s+May\s+Concern',
    r'^\s*RE:\s*',  # case-insensitive, so this also covers "Re:"
    r'^\s*Subject:\s*',
])

# A closing line ends the body; it and everything after it are discarded.
_CLOSING_RE = _compile_any([
    r'^\s*Sincerely',
    r'^\s*Respectfully',
    r'^\s*Best\s+regards',
    r'^\s*Thank\s+you',
    r'^\s*cc:\s*',
    r'^\s*Enclosure',
    r'^\s*Attachment',
])

# Rating phrases are blanked out of body lines so the label cannot leak into
# the features. They are applied one after another, in this order.
_RATING_RES = [
    re.compile(p, re.IGNORECASE) for p in (
        r'\b(LO|EC|EO|EU)\s*[-–—]\s*[123]\b',
        r'Rating\s*:\s*(LO|EC|EO|EU)',
        r'EPA\s+Rating',
        r'Lack\s+of\s+Objections',
        r'Environmental\s+Concerns',
        r'Environmental\s+Objections',
        r'Environmentally\s+Unsatisfactory',
    )
]


def preprocess_letter_text(text: str) -> str:
    """
    Reduce raw letter text to its main body.

    Working line by line, this removes:
    - EPA letterhead and header lines, addresses and date lines
    - Salutation / subject lines (which also mark the start of the body)
    - The closing line and everything after it
    - Rating statements inside body lines
    - Page numbers and excessive whitespace

    A closing phrase only ends the letter once some body text has been
    collected. Letters commonly open with "Thank you for the opportunity to
    review ...", and treating that first sentence as the closing would
    discard the whole letter.

    Args:
        text: Raw text of one letter (e.g. as extracted from a PDF).

    Returns:
        The body as a single space-separated string, or "" when ``text`` is
        empty or nothing survives the filtering.
    """
    if not text:
        return ""

    body_lines = []
    in_body = False

    for line in text.split('\n'):
        stripped = line.strip()

        # Blank lines before the body starts carry no information.
        if not stripped and not in_body:
            continue

        if _HEADER_RE.match(stripped):
            continue

        if _SALUTATION_RE.match(stripped):
            in_body = True
            continue

        # Closing reached: nothing after it belongs to the body.
        if body_lines and _CLOSING_RE.match(stripped):
            break

        for rating_re in _RATING_RES:
            stripped = rating_re.sub('', stripped)

        # A reasonably long line means we have reached real content even if
        # the letter had no recognisable salutation.
        if len(stripped) > 20:
            in_body = True

        if in_body and stripped:
            body_lines.append(stripped)

    # Join the lines and collapse whitespace runs left by the removals.
    return re.sub(r'\s+', ' ', ' '.join(body_lines)).strip()

## Load Data

In [None]:
def load_ratings_data():
    """
    Load the extracted ratings for pre-October 2018 letters.

    Reads ``RATINGS_FILE``, keeps only rows that have a ``combined_rating``,
    upper-cases ``letter_rating`` and converts ``number_rating`` to string
    labels.

    Returns:
        A DataFrame copy containing the rated letters.

    Raises:
        FileNotFoundError: If ``RATINGS_FILE`` does not exist.
    """
    if not RATINGS_FILE.exists():
        raise FileNotFoundError(
            f"Ratings file not found: {RATINGS_FILE}\n"
            "Run extract_comment_letter_ratings.ipynb first."
        )

    df = pd.read_csv(RATINGS_FILE)

    # Filter to records with valid ratings
    df = df[df['combined_rating'].notna()].copy()

    # Standardize ratings
    df['letter_rating'] = df['letter_rating'].str.upper()

    def _as_label(value):
        # pandas reads an integer column that contains blanks as float, which
        # would otherwise produce labels such as "2.0" instead of "2".
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        # Anything else (including NaN -> "nan") keeps its plain str() form.
        return str(value)

    df['number_rating'] = df['number_rating'].map(_as_label)

    return df


def load_comment_letter_metadata():
    """
    Load metadata for all comment letters.

    Prefers the dedicated comment-letter pickle. Otherwise falls back to the
    general document-record pickle and keeps only the rows whose ``type`` is
    ``'Comment_Letter'``.

    Raises:
        FileNotFoundError: If neither pickle file exists.
    """
    # NOTE: pickle files can execute code on load; only read trusted files.
    if COMMENT_LETTER_PKL.exists():
        return pd.read_pickle(COMMENT_LETTER_PKL)

    if not DOC_RECORD_PKL.exists():
        raise FileNotFoundError("Comment letter metadata not found.")

    documents = pd.read_pickle(DOC_RECORD_PKL)
    is_comment_letter = documents['type'] == 'Comment_Letter'
    return documents[is_comment_letter].copy()


def get_post_2018_letters(comment_df: pd.DataFrame, eis_df: pd.DataFrame = None,
                          cutoff_date: datetime = None):
    """
    Get comment letters from October 2018 onwards.

    A letter is selected when
    - its date (its own ``commentLetterDate``, else the one from the matching
      EIS record) is on or after the cutoff, or
    - it has no usable date and the first four characters of its ``eisId``
      form a year of 2018 or later. Undated 2018 letters may fall on either
      side of the cutoff and are deliberately included.

    Args:
        comment_df: Comment letter records; must contain ``eisId``.
        eis_df: Optional EIS records (``eisId``, ``commentLetterDate``) used
            to fill in missing letter dates.
        cutoff_date: First date counted as "post"; defaults to
            ``RATING_CUTOFF_DATE``.

    Returns:
        A copy of the selected rows with a fresh 0..n-1 index and without the
        temporary helper columns.
    """
    cutoff = pd.Timestamp(RATING_CUTOFF_DATE if cutoff_date is None else cutoff_date)
    df = comment_df.copy()

    # Try to use commentLetterDate
    if 'commentLetterDate' in df.columns:
        df['_date'] = pd.to_datetime(df['commentLetterDate'], errors='coerce')
    else:
        # Keep a real datetime column so the comparison below stays well-typed.
        df['_date'] = pd.Series(pd.NaT, index=df.index, dtype='datetime64[ns]')

    # Merge dates from EIS records if needed
    if eis_df is not None and 'commentLetterDate' in eis_df.columns:
        eis_dates = eis_df[['eisId', 'commentLetterDate']].copy()
        eis_dates['_eis_date'] = pd.to_datetime(eis_dates['commentLetterDate'], errors='coerce')
        # One dated row per EIS: repeated eisId rows would otherwise multiply
        # the comment letters in the left join.
        eis_dates = (
            eis_dates.dropna(subset=['_eis_date'])
            .drop_duplicates(subset='eisId')[['eisId', '_eis_date']]
        )
        df = df.merge(eis_dates, on='eisId', how='left')
        df['_date'] = df['_date'].fillna(df['_eis_date'])

    # Year prefix of the EIS id, used only for records without a date.
    # Malformed ids become NaN and therefore never match.
    id_year = pd.to_numeric(df['eisId'].astype(str).str[:4], errors='coerce')

    has_date = df['_date'].notna()
    on_or_after_cutoff = has_date & (df['_date'] >= cutoff)
    undated_recent = ~has_date & (id_year >= 2018)

    result = df[on_or_after_cutoff | undated_recent].copy()

    # Drop only the helper columns added above; underscore-prefixed columns
    # supplied by the caller are preserved.
    result = result.drop(columns=['_date', '_eis_date'], errors='ignore')

    return result.reset_index(drop=True)

In [None]:
# Training set: pre-October 2018 letters whose ratings were already extracted
ratings_df = load_ratings_data()
print(f"Training data: {len(ratings_df)} letters with ratings")

# How many letters carry each combined rating
rating_counts = ratings_df['combined_rating'].value_counts()
print("\nRating distribution:")
print(rating_counts)

In [None]:
# Load post-2018 letters for prediction.
# Best-effort: any failure below is reported and replaced by an empty
# DataFrame, which the prediction cells detect via len(post_2018_df).
try:
    comment_df = load_comment_letter_metadata()
    # The EIS record file is optional; None is passed when it does not exist.
    eis_df = pd.read_pickle(EIS_RECORD_PKL) if EIS_RECORD_PKL.exists() else None
    post_2018_df = get_post_2018_letters(comment_df, eis_df)
    print(f"\nPost-October 2018 letters to predict: {len(post_2018_df)}")
except Exception as e:
    # NOTE(review): this broad catch also hides programming errors (for
    # example a missing column); consider narrowing the exception types.
    print(f"Could not load post-2018 letters: {e}")
    post_2018_df = pd.DataFrame()

## Initialize Embedding Model

In [None]:
def load_embedding_model(model_name: str):
    """
    Load the embedding model named by ``model_name``.

    Names starting with ``sentence-transformers/`` are loaded through
    SentenceTransformer with that prefix removed. Other names containing
    "legal-bert" or "longformer" are loaded as a plain Hugging Face tokenizer
    and model moved to ``DEVICE``. Any other name is handed to
    SentenceTransformer unchanged.

    Returns:
        ``(model, model_type)`` where ``model_type`` is
        ``"sentence-transformers"`` (``model`` is the SentenceTransformer) or
        ``"transformers"`` (``model`` is a ``(tokenizer, model)`` pair).
    """
    lowered = model_name.lower()
    has_st_prefix = model_name.startswith("sentence-transformers/")
    is_raw_transformer = "legal-bert" in lowered or "longformer" in lowered

    if is_raw_transformer and not has_st_prefix:
        from transformers import AutoTokenizer, AutoModel
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        transformer = AutoModel.from_pretrained(model_name)
        transformer.to(DEVICE)
        return (tokenizer, transformer), "transformers"

    # Everything else goes through sentence-transformers
    from sentence_transformers import SentenceTransformer
    if has_st_prefix:
        st_name = model_name.replace("sentence-transformers/", "")
    else:
        st_name = model_name
    return SentenceTransformer(st_name), "sentence-transformers"


def get_embeddings(texts: list, model, model_type: str, batch_size: int = 32,
                   max_length: int = 512) -> np.ndarray:
    """
    Generate one embedding vector per text.

    Args:
        texts: Texts to embed.
        model: Object returned by ``load_embedding_model``: a
            SentenceTransformer, or a ``(tokenizer, model)`` pair.
        model_type: ``"sentence-transformers"`` or ``"transformers"``.
        batch_size: Number of texts encoded per forward pass.
        max_length: Token limit used when tokenizing for the
            ``"transformers"`` path; longer inputs are truncated. Raise it for
            long-context models such as Longformer. Not used for
            sentence-transformers models.

    Returns:
        Array with one row per text. For an empty ``texts`` list the
        ``"transformers"`` path returns an array with zero rows.

    Raises:
        ValueError: If ``model_type`` is not one of the two supported values.
    """
    if model_type == "sentence-transformers":
        return model.encode(
            texts,
            batch_size=batch_size,
            show_progress_bar=True,
            convert_to_numpy=True
        )

    if model_type != "transformers":
        raise ValueError(f"Unknown model type: {model_type}")

    tokenizer, transformer = model

    # np.vstack([]) raises, so answer the empty case explicitly.
    if len(texts) == 0:
        hidden_size = getattr(getattr(transformer, 'config', None), 'hidden_size', 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    batches = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Generating embeddings"):
        batch_texts = texts[start:start + batch_size]

        # Tokenize
        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        ).to(DEVICE)

        with torch.no_grad():
            token_embeddings = transformer(**inputs).last_hidden_state
            # Mean pooling over token embeddings, weighted by the attention
            # mask so that padding positions do not contribute.
            mask = inputs['attention_mask'].unsqueeze(-1).expand(token_embeddings.size()).float()
            summed = torch.sum(token_embeddings * mask, 1)
            counts = torch.clamp(mask.sum(1), min=1e-9)
            batches.append((summed / counts).cpu().numpy())

    return np.vstack(batches)

In [None]:
# Instantiate the configured embedding model once; later cells reuse it
print(f"Loading embedding model: {EMBEDDING_MODEL}")
embedding_model, model_type = load_embedding_model(model_name=EMBEDDING_MODEL)
print(f"Model type: {model_type}")

## Extract and Embed Training Texts

In [None]:
def find_pdf_file(filename: str, eis_id: str) -> "Path | None":
    """
    Locate a comment letter PDF on disk.

    First looks for an exact ``filename`` match in the comment-letter
    directory and in the per-year documents directory (the year is the first
    four characters of ``eis_id``). If that fails, falls back to a file whose
    name starts with ``"<eis_id>_"`` and contains "comment", ignoring case.

    Args:
        filename: Expected file name of the PDF.
        eis_id: EIS identifier of the letter.

    Returns:
        Path of the file found, or ``None`` when nothing matches. When several
        files match the fallback, the alphabetically first one is returned so
        the result does not depend on directory listing order.
    """
    year = str(eis_id)[:4]

    # Exact-name candidates, in priority order
    locations = [
        COMMENT_LETTERS_DIR / filename,
        DOCUMENTS_DIR / year / filename,
        DOCUMENTS_DIR / "comment_letters" / filename,
    ]
    for candidate in locations:
        if candidate.exists():
            return candidate

    # Fuzzy match on the EIS id prefix
    prefix = f"{eis_id}_"
    for directory in (COMMENT_LETTERS_DIR, DOCUMENTS_DIR / year):
        if not directory.is_dir():
            continue
        matches = sorted(
            entry for entry in directory.iterdir()
            if entry.is_file()
            and entry.name.startswith(prefix)
            and 'comment' in entry.name.lower()
        )
        if matches:
            return matches[0]

    return None


def extract_texts_for_dataframe(df: pd.DataFrame, desc: str = "Extracting") -> tuple:
    """
    Extract and preprocess the letter text for every record in ``df``.

    A record is skipped when it has no filename, when its PDF cannot be
    found, or when fewer than 100 characters remain after preprocessing.
    Texts longer than ``MAX_TEXT_LENGTH`` characters are truncated.

    Args:
        df: Records with an ``eisId`` column and, normally, a ``filename`` column.
        desc: Label shown on the progress bar.

    Returns:
        Tuple of (texts, valid_indices): the kept texts and, in the same
        order, the index labels of the rows they came from.
    """
    min_text_length = 100  # shorter results are treated as failed extractions

    texts = []
    valid_indices = []

    for idx, row in tqdm(df.iterrows(), total=len(df), desc=desc):
        filename = row.get('filename')

        # A missing filename may arrive as None or as NaN. NaN is truthy, so a
        # plain truthiness test would let it through to the path lookup.
        if pd.isna(filename) or not filename:
            continue

        pdf_path = find_pdf_file(filename, str(row['eisId']))
        if pdf_path is None:
            continue

        # Extract and preprocess text
        processed_text = preprocess_letter_text(extract_text_from_pdf(pdf_path))

        if len(processed_text) < min_text_length:
            continue

        # Slicing is a no-op for texts already within the limit
        texts.append(processed_text[:MAX_TEXT_LENGTH])
        valid_indices.append(idx)

    return texts, valid_indices

In [None]:
# Cache file for embeddings.
# NOTE: the cache is keyed by model name only. After changing MAX_TEXT_LENGTH
# or the preprocessing, set USE_CACHED_EMBEDDINGS = False to regenerate.
model_name_safe = EMBEDDING_MODEL.replace("/", "_")
train_embeddings_file = EMBEDDINGS_DIR / f"train_embeddings_{model_name_safe}.pkl"

if USE_CACHED_EMBEDDINGS and train_embeddings_file.exists():
    print(f"Loading cached training embeddings from {train_embeddings_file}")
    with open(train_embeddings_file, 'rb') as f:
        cache = pickle.load(f)
    train_embeddings = cache['embeddings']
    train_df = cache['dataframe']
    print(f"Loaded {len(train_embeddings)} embeddings")
else:
    print("Extracting texts from training PDFs...")
    train_texts, train_indices = extract_texts_for_dataframe(ratings_df, desc="Extracting training texts")

    # Stop before embedding and caching an empty result: an empty cache file
    # would be loaded silently by every later run.
    if not train_texts:
        raise RuntimeError(
            "No training texts could be extracted; check that the rated comment "
            f"letter PDFs are available under {DOCUMENTS_DIR}."
        )

    # Keep only the rows whose text was extracted, in the same order as the texts
    train_df = ratings_df.loc[train_indices].copy()
    print(f"Successfully extracted {len(train_texts)} texts")

    # Generate embeddings
    print("\nGenerating embeddings...")
    train_embeddings = get_embeddings(train_texts, embedding_model, model_type)
    print(f"Embedding shape: {train_embeddings.shape}")

    # Cache embeddings together with their rows so the two stay aligned
    with open(train_embeddings_file, 'wb') as f:
        pickle.dump({'embeddings': train_embeddings, 'dataframe': train_df}, f)
    print(f"Cached embeddings to {train_embeddings_file}")

## Train Classifiers

In [None]:
def get_classifier(classifier_type: str):
    """
    Build a fresh, unfitted classifier for the given type name.

    Supported names: "logistic", "svm", "rf", "mlp" and "ensemble" (a
    soft-voting combination of the logistic, random-forest and SVM models).

    Raises:
        ValueError: If ``classifier_type`` is not a supported name.
    """
    def _logistic():
        return LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')

    def _forest():
        return RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')

    def _svm():
        return SVC(kernel='rbf', probability=True, random_state=42, class_weight='balanced')

    def _mlp():
        return MLPClassifier(hidden_layer_sizes=(256, 128), max_iter=500, random_state=42)

    def _ensemble():
        # Soft voting averages the predicted probabilities of the members
        from sklearn.ensemble import VotingClassifier
        members = [('lr', _logistic()), ('rf', _forest()), ('svm', _svm())]
        return VotingClassifier(estimators=members, voting='soft')

    builders = {
        "logistic": _logistic,
        "svm": _svm,
        "rf": _forest,
        "mlp": _mlp,
        "ensemble": _ensemble,
    }
    build = builders.get(classifier_type)
    if build is None:
        raise ValueError(f"Unknown classifier type: {classifier_type}")
    return build()


def train_and_evaluate(X: np.ndarray, y: np.ndarray, label_name: str, classifier_type: str):
    """
    Train a classifier for one rating dimension and report its quality.

    Steps: encode the labels, hold out ``TEST_SIZE`` of the data, run
    ``CV_FOLDS``-fold cross-validation on the remainder, fit and evaluate on
    the held-out part, then refit a fresh classifier on all data.

    Args:
        X: Embedding matrix, one row per letter.
        y: Rating labels aligned with the rows of ``X``.
        label_name: Human-readable name used in the printed report.
        classifier_type: Name understood by ``get_classifier``.

    Returns:
        ``(final_clf, le)``: the classifier fitted on all data and the
        LabelEncoder that maps between labels and encoded class ids.
    """
    print(f"\n{'='*50}")
    print(f"Training classifier for: {label_name}")
    print(f"{'='*50}")

    # Encode labels
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)
    class_ids = np.arange(len(le.classes_))
    class_counts = np.bincount(y_encoded)

    print(f"Classes: {le.classes_}")
    print(f"Class distribution: {class_counts}")

    # A stratified split needs at least two samples of every class. Fall back
    # to a plain split rather than failing on a singleton class.
    can_stratify = class_counts.min() >= 2
    if not can_stratify:
        logger.warning(
            f"{label_name}: a class has fewer than 2 samples; using a non-stratified split"
        )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=TEST_SIZE, random_state=42,
        stratify=y_encoded if can_stratify else None
    )

    clf = get_classifier(classifier_type)

    # Cross-validation on training set
    cv_scores = cross_val_score(clf, X_train, y_train, cv=CV_FOLDS, scoring='accuracy')
    print(f"\nCross-validation accuracy: {cv_scores.mean():.3f} (+/- {cv_scores.std()*2:.3f})")

    # Train on full training set
    clf.fit(X_train, y_train)

    # Evaluate on test set
    y_pred = clf.predict(X_test)

    print(f"\nTest set accuracy: {accuracy_score(y_test, y_pred):.3f}")
    print(f"\nClassification Report:")
    # Pass the full label set explicitly: a rare class may be absent from the
    # test split, and target_names must line up with the labels reported.
    print(classification_report(
        y_test, y_pred,
        labels=class_ids,
        target_names=[str(c) for c in le.classes_],
        zero_division=0,
    ))

    print(f"\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred, labels=class_ids))

    # Retrain on all data for final model
    final_clf = get_classifier(classifier_type)
    final_clf.fit(X, y_encoded)

    return final_clf, le

In [None]:
# Label arrays, aligned row-for-row with train_embeddings
y_letter = train_df['letter_rating'].values
y_number = train_df['number_rating'].astype(str).values

print(f"Training samples: {len(train_embeddings)}")
for _heading, _labels in (("Letter", y_letter), ("Number", y_number)):
    print(f"\n{_heading} rating distribution:")
    print(pd.Series(_labels).value_counts())

In [None]:
# Fit and evaluate the classifier for the letter part of the rating
letter_clf, letter_encoder = train_and_evaluate(
    X=train_embeddings,
    y=y_letter,
    label_name="Letter Rating (LO/EC/EO/EU)",
    classifier_type=CLASSIFIER_TYPE,
)

In [None]:
# Fit and evaluate the classifier for the numeric part of the rating
number_clf, number_encoder = train_and_evaluate(
    X=train_embeddings,
    y=y_number,
    label_name="Number Rating (1/2/3)",
    classifier_type=CLASSIFIER_TYPE,
)

In [None]:
# Persist both classifiers with their label encoders, plus the settings that
# produced them, in a single pickle named after the embedding model
model_file = MODEL_DIR / f"rating_classifiers_{model_name_safe}.pkl"
model_bundle = {
    'letter_clf': letter_clf,
    'letter_encoder': letter_encoder,
    'number_clf': number_clf,
    'number_encoder': number_encoder,
    'embedding_model': EMBEDDING_MODEL,
    'classifier_type': CLASSIFIER_TYPE,
}
with model_file.open('wb') as f:
    pickle.dump(model_bundle, f)
print(f"Saved models to {model_file}")

## Predict Ratings for Post-2018 Letters

In [None]:
def sanitize_filename(filename: str) -> str:
    """
    Normalise a file name.

    Removes the characters ``( ) & , ~ /``, collapses runs of whitespace and
    underscores into a single underscore, lower-cases a trailing ``.PDF``
    extension and strips leading and trailing underscores.
    """
    # (pattern, replacement, flags), applied in this order
    substitutions = (
        (r'[()&,~\/]', '', 0),
        (r'[\s_]+', '_', 0),
        (r'\.PDF$', '.pdf', re.IGNORECASE),
    )
    for pattern, replacement, flags in substitutions:
        filename = re.sub(pattern, replacement, filename, flags=flags)
    return filename.strip('_')


def build_filename_for_row(row):
    """
    Build the expected PDF file name for a record: ``"<eisId>_<sanitized name>"``.

    The name is the first non-empty string among ``row['name']`` and
    ``row['fileNameForDownload']``. If neither is usable it falls back to
    ``"<attachmentId>.pdf"``, or ``"unknown.pdf"`` without an attachment id.
    """
    name = None
    for key in ('name', 'fileNameForDownload'):
        candidate = row.get(key)
        # Missing values may be NaN, which is truthy but not a usable name.
        if isinstance(candidate, str) and candidate:
            name = candidate
            break
    if name is None:
        name = f"{row.get('attachmentId', 'unknown')}.pdf"
    return f"{row['eisId']}_{sanitize_filename(name)}"

In [None]:
# Prepare post-2018 data for prediction.
# Every branch below leaves both `pred_embeddings` and `pred_df` defined so
# the prediction cell can simply test len(pred_embeddings).
if len(post_2018_df) > 0:
    # Add filename column if not present (derived from the record's name fields)
    if 'filename' not in post_2018_df.columns:
        post_2018_df['filename'] = post_2018_df.apply(build_filename_for_row, axis=1)
    
    # Check for cached embeddings (cache file is keyed by embedding model name only)
    pred_embeddings_file = EMBEDDINGS_DIR / f"pred_embeddings_{model_name_safe}.pkl"
    
    if USE_CACHED_EMBEDDINGS and pred_embeddings_file.exists():
        print(f"Loading cached prediction embeddings from {pred_embeddings_file}")
        with open(pred_embeddings_file, 'rb') as f:
            cache = pickle.load(f)
        pred_embeddings = cache['embeddings']
        pred_df = cache['dataframe']
        print(f"Loaded {len(pred_embeddings)} embeddings")
    else:
        print("Extracting texts from post-2018 PDFs...")
        pred_texts, pred_indices = extract_texts_for_dataframe(post_2018_df, desc="Extracting prediction texts")
        
        # Keep only the rows whose text was extracted, in the same order as pred_texts
        pred_df = post_2018_df.loc[pred_indices].copy()
        print(f"Successfully extracted {len(pred_texts)} texts")
        
        if len(pred_texts) > 0:
            print("\nGenerating embeddings...")
            pred_embeddings = get_embeddings(pred_texts, embedding_model, model_type)
            print(f"Embedding shape: {pred_embeddings.shape}")
            
            # Cache embeddings together with their rows so the two stay aligned
            with open(pred_embeddings_file, 'wb') as f:
                pickle.dump({'embeddings': pred_embeddings, 'dataframe': pred_df}, f)
        else:
            # Nothing to embed; an empty result is not written to the cache
            pred_embeddings = np.array([])
            print("No texts extracted for prediction.")
else:
    print("No post-2018 letters to predict.")
    pred_embeddings = np.array([])
    pred_df = pd.DataFrame()

In [None]:
# Make predictions
def _predict_with_confidence(clf, encoder, X):
    """Return (decoded predicted labels, confidence of each predicted label)."""
    encoded = clf.predict(X)
    if hasattr(clf, 'predict_proba'):
        proba = clf.predict_proba(X)
        # Report the probability of the label actually predicted. For some
        # models (e.g. SVC) the arg-max of predict_proba can differ from
        # predict(), so the row maximum is not always the right value.
        columns = np.searchsorted(clf.classes_, encoded)
        confidence = proba[np.arange(len(encoded)), columns]
    else:
        confidence = np.ones(len(encoded))
    return encoder.inverse_transform(encoded), confidence


if len(pred_embeddings) > 0:
    print(f"Making predictions for {len(pred_embeddings)} letters...")

    letter_pred, letter_confidence = _predict_with_confidence(
        letter_clf, letter_encoder, pred_embeddings
    )
    number_pred, number_confidence = _predict_with_confidence(
        number_clf, number_encoder, pred_embeddings
    )

    # Build combined rating.
    # NOTE(review): the letter and number parts are predicted independently,
    # so a combination that never occurs in the training data can appear.
    combined_pred = [f"{l}-{n}" for l, n in zip(letter_pred, number_pred)]

    # Create results dataframe
    results_df = pred_df[['eisId', 'filename']].copy()
    results_df['predicted_combined_rating'] = combined_pred
    results_df['predicted_letter_rating'] = letter_pred
    results_df['predicted_number_rating'] = number_pred
    results_df['letter_confidence'] = letter_confidence
    results_df['number_confidence'] = number_confidence
    results_df['avg_confidence'] = (letter_confidence + number_confidence) / 2

    print(f"\n=== Prediction Summary ===")
    print(f"Total predictions: {len(results_df)}")
    print(f"\nPredicted letter rating distribution:")
    print(results_df['predicted_letter_rating'].value_counts())
    print(f"\nPredicted number rating distribution:")
    print(results_df['predicted_number_rating'].value_counts())
    print(f"\nPredicted combined rating distribution:")
    print(results_df['predicted_combined_rating'].value_counts())
else:
    results_df = pd.DataFrame()
    print("No predictions to make.")

In [None]:
# Show the first 20 predictions with their per-dimension confidence
if len(results_df) > 0:
    print("\n=== Sample Predictions ===")
    preview_columns = [
        'eisId', 'filename', 'predicted_combined_rating',
        'letter_confidence', 'number_confidence',
    ]
    display(results_df[preview_columns].head(20))

## Save Predictions

In [None]:
# Write the predictions to CSV
if len(results_df) > 0:
    output_columns = [
        'filename', 'eisId',
        'predicted_combined_rating', 'predicted_letter_rating', 'predicted_number_rating',
        'letter_confidence', 'number_confidence', 'avg_confidence',
    ]

    # Main output: fixed column order, sorted by EIS ID
    output_df = results_df[output_columns].copy().sort_values('eisId')
    output_df.to_csv(PREDICTED_RATINGS_FILE, index=False)
    print(f"Saved {len(output_df)} predictions to {PREDICTED_RATINGS_FILE}")

    # Second copy of results_df as built by the prediction cell (unsorted,
    # original column order). It holds only the id, filename and prediction
    # columns, so it carries no metadata beyond the main file.
    full_output = METADATA_DIR / "comment_letter_ratings_predicted_full.csv"
    results_df.to_csv(full_output, index=False)
    print(f"Saved full results to {full_output}")

## Analysis: Compare Training vs Predicted Distributions

In [None]:
if len(results_df) > 0:
    import matplotlib.pyplot as plt

    fig, axes = plt.subplots(1, 3, figsize=(15, 4))
    width = 0.35

    # Panels 1 and 2: share of each rating in the training labels versus the
    # predictions, as side-by-side bars over the training label set.
    rating_panels = [
        (axes[0], y_letter, 'predicted_letter_rating', 'Letter Rating Distribution'),
        (axes[1], y_number, 'predicted_number_rating', 'Number Rating Distribution'),
    ]
    for ax, train_labels, pred_column, panel_title in rating_panels:
        train_share = pd.Series(train_labels).value_counts(normalize=True).sort_index()
        pred_share = results_df[pred_column].value_counts(normalize=True).sort_index()

        x = np.arange(len(train_share))
        ax.bar(x - width/2, train_share.values, width, label='Training (pre-2018)', alpha=0.7)
        # Ratings that were never predicted get a bar of height 0
        ax.bar(x + width/2, [pred_share.get(k, 0) for k in train_share.index], width, label='Predicted (post-2018)', alpha=0.7)
        ax.set_xticks(x)
        ax.set_xticklabels(train_share.index)
        ax.set_title(panel_title)
        ax.legend()
        ax.set_ylabel('Proportion')

    # Panel 3: histogram of the average confidence, with the median marked
    ax3 = axes[2]
    median_confidence = results_df['avg_confidence'].median()
    ax3.hist(results_df['avg_confidence'], bins=20, alpha=0.7, edgecolor='black')
    ax3.set_title('Prediction Confidence Distribution')
    ax3.set_xlabel('Average Confidence')
    ax3.set_ylabel('Count')
    ax3.axvline(median_confidence, color='red', linestyle='--', label=f'Median: {median_confidence:.2f}')
    ax3.legend()

    plot_path = METADATA_DIR / "rating_prediction_analysis.png"
    plt.tight_layout()
    plt.savefig(plot_path, dpi=150)
    plt.show()
    print(f"Saved analysis plot to {plot_path}")

## Low Confidence Predictions

These predictions may need manual review.

In [None]:
if len(results_df) > 0:
    # Predictions whose average confidence is below 0.5 are flagged for review
    needs_review = results_df['avg_confidence'] < 0.5
    low_confidence = results_df[needs_review].copy()
    low_share = 100*len(low_confidence)/len(results_df)

    print(f"Low confidence predictions (< 50%): {len(low_confidence)} ({low_share:.1f}%)")

    if len(low_confidence) > 0:
        review_columns = ['eisId', 'filename', 'predicted_combined_rating', 'avg_confidence']
        display(low_confidence[review_columns].head(20))

        # Write the flagged rows (all columns) to a CSV for manual review
        review_file = METADATA_DIR / "predictions_need_review.csv"
        low_confidence.to_csv(review_file, index=False)
        print(f"\nSaved low confidence predictions to {review_file}")