# Enhanced LLM-based PDF Extraction (Multiple Permits)
This notebook extracts **multiple permit JSON objects** from a PDF.

- Handles free-text and tabular formats.
- Uses an LLM to normalize field names and values into a canonical schema.
- Returns an array of JSON objects (one per permit).


In [None]:

import os, json, re
from PyPDF2 import PdfReader
import openai

# Canonical schema: the field names every extracted permit is normalized to.
# Kept as a list because its repr is interpolated verbatim into the LLM prompt.
SCHEMA = [
    "Permit Number",
    "Issue Date",
    "Expiry Date",
    "Owner Name",
    "Address",
]


In [None]:

def extract_raw_text(pdf_path):
    """Extract the plain text of every page of a PDF using PyPDF2.

    Args:
        pdf_path: Location of the PDF; passed straight to ``PdfReader``.

    Returns:
        The text of all pages in order, joined by newlines, with leading
        and trailing whitespace stripped. Empty string if no page yields
        any text.
    """
    reader = PdfReader(pdf_path)
    # Defensive ``or ""``: a page for which extract_text() gives back nothing
    # (None or "") must not abort extraction of the remaining pages.
    page_texts = [(page.extract_text() or "") for page in reader.pages]
    # Single join instead of repeated ``+=``; the final strip makes the
    # result identical to appending "\n" after every page.
    return "\n".join(page_texts).strip()

def split_blocks(text):
    """Split PDF text into candidate blocks (tables or paragraphs).

    Blocks are delimited by one or more blank lines. A line containing only
    whitespace (spaces, tabs, ``\\r``) counts as blank, since text extracted
    from PDFs frequently carries such stray characters.

    Args:
        text: Raw text to split. ``None`` or an empty string yields ``[]``.

    Returns:
        A list of non-empty blocks, each stripped of surrounding whitespace,
        in their original order.
    """
    if not text:
        return []
    # "\n\s*\n" matches a newline, any run of whitespace (including further
    # newlines), and a closing newline, i.e. one or more blank-ish lines.
    stripped_chunks = (chunk.strip() for chunk in re.split(r"\n\s*\n", text))
    return [chunk for chunk in stripped_chunks if chunk]


In [None]:

def _parse_permit_json(content):
    """Convert raw LLM reply text into a list of permit dicts ([] if unusable)."""
    text = (content or "").strip()
    # Chat models often wrap JSON in a Markdown code fence (```json ... ```);
    # unwrap it so an otherwise valid reply is not discarded.
    fenced = re.search(r"```(?:json)?\s*(.*?)```", text, re.DOTALL | re.IGNORECASE)
    if fenced:
        text = fenced.group(1).strip()
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        return []
    if isinstance(data, dict):
        return [data]
    if isinstance(data, list):
        # Keep only JSON objects so callers can safely treat items as dicts.
        return [item for item in data if isinstance(item, dict)]
    # A bare string/number/null is not a permit; returning it would make the
    # caller's ``extend`` splice in characters or raise.
    return []


def llm_extract_permits(block, model="gpt-4o-mini"):
    """Use an LLM to extract one or multiple permit JSON objects from a block.

    Args:
        block: Text block (paragraph or table) to extract permits from.
        model: Name of the chat model to query.

    Returns:
        A list of dicts, one per permit found. Returns ``[]`` when the reply
        is missing, is not valid JSON, or contains no JSON objects.

    Raises:
        Whatever the ``openai`` client raises for request failures; only
        reply-parsing problems are converted to ``[]``.
    """
    prompt = f"""
    You are an information extractor. Extract data into this canonical schema:

    {SCHEMA}

    Input block:
    {block}

    If multiple permits are present (e.g., table rows), 
    return a JSON array with one object per permit.
    If only one, return an array with one object.
    If none, return [].
    """
    messages = [
        {"role": "system", "content": "You are a precise data extraction assistant."},
        {"role": "user", "content": prompt},
    ]
    # openai>=1.0 removed ``openai.ChatCompletion``; its presence of the
    # ``OpenAI`` client class distinguishes the two API generations.
    uses_v1_api = hasattr(openai, "OpenAI")
    if uses_v1_api:
        resp = openai.chat.completions.create(
            model=model, messages=messages, temperature=0
        )
    else:
        resp = openai.ChatCompletion.create(
            model=model, messages=messages, temperature=0
        )
    try:
        if uses_v1_api:
            content = resp.choices[0].message.content
        else:
            content = resp["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError, AttributeError):
        # Malformed or empty response: stay best-effort, as before.
        return []
    return _parse_permit_json(content)


In [None]:

def extract_all_permits(pdf_path):
    """Run the whole pipeline on one PDF and collect every permit found.

    The PDF text is extracted, split into blocks, and each block is sent to
    the LLM extractor; the per-block results are concatenated in block order.

    Args:
        pdf_path: Location of the PDF to process.

    Returns:
        A flat list with the permits from all blocks.
    """
    collected = []
    for chunk in split_blocks(extract_raw_text(pdf_path)):
        found = llm_extract_permits(chunk)
        if not found:
            # Nothing extracted from this block; move on to the next one.
            continue
        collected += found
    return collected


In [None]:

# Run the extraction on a single PDF and display the result.
pdf_path = "/mnt/data/permit_table_example.pdf"  # Change to your PDF path
permits = extract_all_permits(pdf_path)
permits_json = json.dumps(permits, indent=2)
print(permits_json)

# Save the same JSON text to disk for downstream use
with open("/mnt/data/permits_extracted.json", "w") as f:
    f.write(permits_json)
print("Saved -> /mnt/data/permits_extracted.json")
