# Enhanced LLM-based PDF Extraction (Multiple Permits, Stdlib Only)
This notebook extracts **multiple permit JSON objects** from a PDF. PDF parsing needs no external installs; the LLM normalization step requires the `openai` package.

- Stdlib-only PDF parser (zlib + regex).
- Handles free-text and tabular formats.
- Uses an LLM to normalize field names and values into a canonical schema.
- Returns an array of JSON objects (one per permit).


In [None]:

import os, re, zlib, json
import openai

# Canonical permit field names. The list is interpolated verbatim into the
# prompt built by llm_extract_permits to tell the model which keys to emit.
SCHEMA = ["Permit Number", "Issue Date", "Expiry Date", "Owner Name", "Address"]


In [None]:

# Single-character escapes defined for PDF literal strings. Any other escaped
# character (including "(", ")" and "\") stands for itself.
_PDF_SIMPLE_ESCAPES = {"n": "\n", "r": "\r", "t": "\t", "b": "\b", "f": "\f"}
_PDF_OCTAL_DIGITS = "01234567"


def _pdf_literal_strings(s: str) -> list:
    """Return the decoded contents of every ``( ... )`` literal string in *s*.

    Balanced, unescaped parentheses nest and are kept in the result.
    Backslash escapes are decoded: ``\\n \\r \\t \\b \\f``, one to three octal
    digits (``\\ddd``), and backslash + end-of-line (line continuation, which
    yields nothing). A backslash before any other character is dropped and the
    character kept. An unterminated string yields whatever was read before the
    input ended.
    """
    found = []
    i, n = 0, len(s)
    while i < n:
        if s[i] != "(":
            i += 1
            continue
        i += 1
        buf = []
        depth = 1
        while i < n:
            c = s[i]
            if c == "\\":
                i += 1
                if i >= n:
                    break  # dangling backslash at end of input
                e = s[i]
                if e in _PDF_SIMPLE_ESCAPES:
                    buf.append(_PDF_SIMPLE_ESCAPES[e])
                elif e in _PDF_OCTAL_DIGITS:
                    j = i
                    while j < n and j - i < 3 and s[j] in _PDF_OCTAL_DIGITS:
                        j += 1
                    # High-order overflow is ignored, keeping one byte.
                    buf.append(chr(int(s[i:j], 8) & 0xFF))
                    i = j
                    continue
                elif e == "\r":
                    # Line continuation; swallow the LF of a CRLF pair too.
                    if i + 1 < n and s[i + 1] == "\n":
                        i += 1
                elif e == "\n":
                    pass  # line continuation contributes nothing
                else:
                    buf.append(e)
            elif c == "(":
                depth += 1
                buf.append(c)
            elif c == ")":
                depth -= 1
                if depth == 0:
                    i += 1
                    break
                buf.append(c)
            else:
                buf.append(c)
            i += 1
        found.append("".join(buf))
    return found


def _extract_strings_from_TJ_array(arr: str) -> str:
    """Concatenate the literal strings found inside a ``TJ`` array body.

    *arr* is the text between ``[`` and ``]``. Everything outside the
    parenthesised strings (the numeric adjustments) is ignored, and the
    string pieces are joined with no separator.
    """
    return "".join(_pdf_literal_strings(arr))


def _extract_strings_parens(s: str) -> str:
    """Join every literal string found in *s* with single spaces."""
    return " ".join(_pdf_literal_strings(s))

def _try_inflate(data: bytes) -> bytes:
    """Return *data* zlib-decompressed, or *data* unchanged if it is not zlib.

    A streaming decompressor is used instead of ``zlib.decompress`` so that a
    stream whose trailing bytes are missing still yields its decoded payload.
    This matters because the stream regex in extract_raw_text drops CR/LF
    bytes that directly precede ``endstream``, which can clip the final
    checksum bytes of a compressed stream.
    """
    inflater = zlib.decompressobj()
    try:
        inflated = inflater.decompress(data)
    except zlib.error:
        # Not a zlib stream (or corrupt): hand the bytes back untouched.
        return data
    if not inflated and not inflater.eof:
        # Nothing decoded and no end-of-stream marker seen: treat as raw data.
        return data
    return inflated

def extract_raw_text(pdf_path: str) -> str:
    """Extract the text shown by ``Tj`` / ``TJ`` operators in a PDF file.

    Every ``stream ... endstream`` section is inflated when possible, decoded
    as latin-1, and scanned for ``BT ... ET`` text objects. Within each text
    object the ``Tj`` and ``TJ`` operators are read in the order they appear,
    so mixed operators keep their reading order. If no stream yields any
    text, the raw file bytes are scanned for ``Tj`` operators as a fallback.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text with every whitespace run collapsed to one space
        and the ends stripped. The result therefore never contains newlines.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(pdf_path, "rb") as f:
        raw = f.read()

    # The two string alternatives are disjoint (escaped char vs. anything that
    # is neither a backslash nor ")"), which avoids exponential backtracking
    # on backslash-heavy input.
    tj_pattern = r"\((?:\\[\s\S]|[^\\)])*\)\s*Tj"
    stream_re = re.compile(br"stream[\r\n]+(.*?)[\r\n]+endstream", re.DOTALL)
    text_object_re = re.compile(r"BT(.*?)ET", re.DOTALL)
    tj_re = re.compile(tj_pattern)
    show_op_re = re.compile(
        r"(?P<tj>" + tj_pattern + r")|\[(?P<tja>[\s\S]*?)\]\s*TJ"
    )

    text_chunks = []
    for m in stream_re.finditer(raw):
        # latin-1 maps every byte to a code point, so decoding cannot fail.
        s = _try_inflate(m.group(1)).decode("latin-1", errors="ignore")
        for bt in text_object_re.finditer(s):
            for op in show_op_re.finditer(bt.group(1)):
                shown = op.group("tj")
                if shown is not None:
                    text_chunks.append(_extract_strings_parens(shown))
                else:
                    text_chunks.append(
                        _extract_strings_from_TJ_array(op.group("tja"))
                    )

    if not text_chunks:
        # Fallback: look for Tj operators anywhere in the undecoded file.
        s_all = raw.decode("latin-1", errors="ignore")
        for tj in tj_re.finditer(s_all):
            text_chunks.append(_extract_strings_parens(tj.group(0)))

    return re.sub(r"\s+", " ", " ".join(text_chunks)).strip()

def split_blocks(text):
    """Split *text* into paragraph blocks separated by blank lines.

    A blank line is any line that is empty or holds only whitespace, so
    ``"\\n\\n"``, ``"\\r\\n\\r\\n"`` and ``"\\n  \\n"`` all act as separators.

    Args:
        text: The text to split; ``None`` or an empty string gives ``[]``.

    Returns:
        A list of the non-empty blocks, each stripped of surrounding
        whitespace, in their original order.
    """
    if not text:
        return []
    pieces = (piece.strip() for piece in re.split(r"\n\s*\n", text))
    return [piece for piece in pieces if piece]


In [None]:

def _parse_permit_json(content):
    """Decode an LLM reply into a list of permit dicts; return [] on failure.

    A reply wrapped in a Markdown code fence is unwrapped first. A JSON
    object becomes a one-element list, a JSON array keeps only its object
    members, and any other value (or invalid JSON) gives an empty list.
    """
    if not isinstance(content, str):
        return []
    text = content.strip()
    fenced = re.fullmatch(r"```[\w-]*\s*(.*?)\s*```", text, flags=re.DOTALL)
    if fenced:
        text = fenced.group(1)
    try:
        data = json.loads(text)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return []
    if isinstance(data, dict):
        return [data]
    if isinstance(data, list):
        return [item for item in data if isinstance(item, dict)]
    # A bare string/number/null is not a permit; returning it would let the
    # caller's list.extend() add characters or raise.
    return []


def llm_extract_permits(block, model="gpt-4o-mini"):
    """Ask the chat model to extract permits from one block of text.

    Args:
        block: Text to extract permits from; embedded in the prompt as-is.
        model: Name of the chat model to call.

    Returns:
        A list of dicts, one per permit the model reported. The list is empty
        when the reply is missing, is not valid JSON, or holds no JSON objects.

    Exceptions raised by the API call itself are not caught here.
    """
    prompt = f"""
    You are an information extractor. Extract permits into this schema:

    {SCHEMA}

    Input block:
    {block}

    If multiple permits are present (e.g., table rows), 
    return a JSON array with one object per permit.
    If only one, return an array with one object.
    If none, return [].
    """
    # NOTE(review): openai.ChatCompletion looks like the pre-1.0 SDK
    # interface - confirm the installed openai version still provides it.
    resp = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a precise data extraction assistant."},
            {"role": "user", "content": prompt}
        ],
        temperature=0
    )
    try:
        content = resp["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError, AttributeError):
        # Reply did not have the expected choices/message/content shape.
        return []
    return _parse_permit_json(content)


In [None]:

def extract_all_permits(pdf_path):
    """Run the full pipeline on one PDF and return every extracted permit.

    The PDF text is extracted, split into blocks, and each block is sent to
    llm_extract_permits. Results are concatenated in block order; blocks
    that produce a falsy result contribute nothing.
    """
    raw_text = extract_raw_text(pdf_path)
    return [
        permit
        for chunk in split_blocks(raw_text)
        for permit in (llm_extract_permits(chunk) or ())
    ]


In [None]:

# Driver cell: extract permits from one PDF, print them, and save them as JSON.
pdf_path = "/mnt/data/permit_table_example.pdf"  # Replace with your PDF path
permits = extract_all_permits(pdf_path)

# Serialize once and reuse the same text for the console and the file.
permits_json = json.dumps(permits, indent=2)
print(permits_json)

with open("/mnt/data/permits_extracted.json", "w") as f:
    f.write(permits_json)
print("Saved -> /mnt/data/permits_extracted.json")
