In [55]:
import os
import fitz  # PyMuPDF
from openai import OpenAI
import os
import json
from tqdm import tqdm
from dotenv import load_dotenv

load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
MODEL = "gpt-4.1-nano"

In [56]:
from pydantic import BaseModel

class PageScore(BaseModel):
    page: int
    score: int  # must be 1–5
    has_place_names: bool
    has_distances: bool
    short_reason: str

In [57]:
def extract_pages(pdf_path):
    # Open the document
    doc = fitz.open(pdf_path)

    pages = [] 

    # Iterate through pages 
    for i, page in enumerate(doc):
        text = page.get_text()
        if text.strip():
            pages.append({"page": i + 1, "text":text})
    return pages 


In [58]:
def get_page_score_structured(page_num, text):
    prompt = f"""
    # Role
    You are an expert in geographic document analysis. Your task is to identify pages that contain rich spatial/logistical detail from a historical manuscript.

    # Instructions
    Follow these steps:
    1. Read the page excerpt.
    2. Judge whether it contains meaningful travel-related data.
    3. If yes, score it from 1 (low spatial detail) to 5 (very high spatial detail).
    4. Return structured JSON only.

Return structured JSON with:
- page: the page number
- score: 1 to 5 (spatial detail density)
- has_place_names: true/false
- has_distances: true/false
- short_reason: a brief explanation under 20 words

Page {page_num}:
{text[:2000]}
""".strip()

    try:
        response = client.responses.parse(
            model="gpt-4.1-nano",
            input=[
                {"role": "system", "content": "Return structured JSON matching the provided schema."},
                {"role": "user", "content": prompt}
            ],
            text_format=PageScore
        )
        return response.output_parsed
    except Exception as e:
        print(f"Error on page {page_num}: {e}")
        return json.dumps({
        "page": page_num,
        "score": 0,
        "has_place_names": False,
        "has_distances": False,
        "short_reason": "Error: " + str(e)
        })

In [72]:
# Extracting the pdfs 
pdf_path = "../../data/raw/herndon1854.pdf"
pages = extract_pages(pdf_path)
print(f"Extracted {len(pages)} non-empty pages.")


Extracted 446 non-empty pages.


In [73]:
def score_all_pages(pages):
    results = []

    for page in tqdm(pages, desc="Scoring pages"):
        try:
            result = get_page_score_structured(page['page'], page['text'])
            results.append(result.model_dump())  # <-- updated here
        except Exception as e:
            print(f"Error on page {page['page']}: {e}")
            results.append({
                "page": page["page"],
                "score": 0,
                "has_place_names": False,
                "has_distances": False,
                "short_reason": "Error"
            })

    return results

In [74]:
scored_pages = score_all_pages(pages)

Scoring pages: 100%|██████████| 446/446 [09:32<00:00,  1.28s/it]


In [75]:
import json

with open("herndon_scored.json", "w") as f:
    json.dump(scored_pages, f, indent=2)

print("🔥 Saved to disk — you're safe now.")


🔥 Saved to disk — you're safe now.


In [76]:
scored_pages

[{'page': 1,
  'score': 1,
  'has_place_names': False,
  'has_distances': False,
  'short_reason': 'Lacks clear geographic or travel-related details.'},
 {'page': 2,
  'score': 1,
  'has_place_names': False,
  'has_distances': False,
  'short_reason': 'Minimal text, no geographical or travel details present.'},
 {'page': 8,
  'score': 1,
  'has_place_names': True,
  'has_distances': True,
  'short_reason': 'Contains some place names and possible distance indications, but limited positional detail.'},
 {'page': 9,
  'score': 2,
  'has_place_names': True,
  'has_distances': False,
  'short_reason': 'Mentions exploration and some geographic names, but no specific distances.'},
 {'page': 11,
  'score': 2,
  'has_place_names': True,
  'has_distances': False,
  'short_reason': 'Mentions exploration and tributaries but lacks detailed spatial or distance data.'},
 {'page': 12,
  'score': 2,
  'has_place_names': True,
  'has_distances': False,
  'short_reason': 'Mentions exploration locations b

In [77]:
from itertools import groupby
from operator import itemgetter

In [78]:
def find_high_score_windows(scored_pages, min_score=4, min_length=3):
    high_pages = [p["page"] for p in scored_pages if p["score"] >= min_score]
    windows = []
    for k, g in groupby(enumerate(high_pages), lambda x: x[1] - x[0]):
        group = list(map(itemgetter(1), g))
        if len(group) >= min_length:
            windows.append((group[0], group[-1]))
    return windows

windows = find_high_score_windows(scored_pages)
print("📘 Windows to extract:", windows)


📘 Windows to extract: [(187, 189), (194, 196), (258, 260), (281, 283), (290, 292), (299, 304), (317, 320), (328, 333), (343, 345)]


In [79]:
import os
import subprocess
import pytesseract
from PIL import Image
import fitz
import re
from tqdm import tqdm

In [80]:
def rasterize_pdf_pages(pdf_path, start_page, end_page, outdir, prefix="page", dpi=300):
    os.makedirs(outdir, exist_ok=True)
    cmd = [
        "pdftoppm",
        "-r", str(dpi),
        "-f", str(start_page),
        "-l", str(end_page),
        pdf_path,
        os.path.join(outdir, prefix)
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError(f"Rasterization failed:\n{result.stderr}")


In [81]:
def rasterize_all_windows(pdf_path, windows, base_outdir):
    for i, (start, end) in enumerate(windows):
        outdir = os.path.join(base_outdir, f"window_{start}_{end}")
        rasterize_pdf_pages(pdf_path, start, end, outdir)


In [82]:
def ocr_all_images(base_outdir):
    all_text = ""
    for root, dirs, files in os.walk(base_outdir):
        ppm_files = sorted(f for f in files if f.endswith(".ppm"))
        for f in tqdm(ppm_files, desc=f"OCR in {root}"):
            path = os.path.join(root, f)
            text = pytesseract.image_to_string(Image.open(path))
            all_text += f"\n--- Page from {f} ---\n{text.strip()}\n"
    return all_text


In [83]:
def clean_extracted_text(text):
    cleaned_lines = []
    for line in text.splitlines():
        line = line.strip()
        if not line:
            continue
        if line.startswith("--- Page from"):
            continue
        if re.search(r"Exploration of the Valley|Herndon|Washington, D.C.|[Ll]ibrary of [Cc]ongress", line):
            continue
        cleaned_lines.append(line)
    return "\n".join(cleaned_lines)


In [84]:
def save_to_txt(text, filename):
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, "w", encoding="utf-8") as f:
        f.write(text)
    print(f"✅ Saved to {filename}")


In [85]:
  # Replace with your actual selected windows
output_image_dir = "../../data/raw/herndon_ppm"
output_txt_file = "../../data/clean/herndon_cleaned.txt"

# Step-by-step pipeline
rasterize_all_windows(pdf_path, windows, output_image_dir)
ocr_text = ocr_all_images(output_image_dir)
cleaned_text = clean_extracted_text(ocr_text)
save_to_txt(cleaned_text, output_txt_file)


OCR in ../../data/raw/herndon_ppm: 0it [00:00, ?it/s]
OCR in ../../data/raw/herndon_ppm/window_205_210: 100%|██████████| 6/6 [00:09<00:00,  1.56s/it]
OCR in ../../data/raw/herndon_ppm/window_281_283: 100%|██████████| 3/3 [00:04<00:00,  1.67s/it]
OCR in ../../data/raw/herndon_ppm/window_343_345: 100%|██████████| 3/3 [00:04<00:00,  1.57s/it]
OCR in ../../data/raw/herndon_ppm/window_258_260: 100%|██████████| 3/3 [00:05<00:00,  1.68s/it]
OCR in ../../data/raw/herndon_ppm/window_194_196: 100%|██████████| 3/3 [00:05<00:00,  1.68s/it]
OCR in ../../data/raw/herndon_ppm/window_187_189: 100%|██████████| 3/3 [00:04<00:00,  1.44s/it]
OCR in ../../data/raw/herndon_ppm/window_317_320: 100%|██████████| 4/4 [00:06<00:00,  1.68s/it]
OCR in ../../data/raw/herndon_ppm/window_328_333: 100%|██████████| 6/6 [00:09<00:00,  1.64s/it]
OCR in ../../data/raw/herndon_ppm/window_290_292: 100%|██████████| 3/3 [00:04<00:00,  1.59s/it]
OCR in ../../data/raw/herndon_ppm/window_220_226: 100%|██████████| 7/7 [00:11<00:0

✅ Saved to ../../data/clean/herndon_cleaned.txt



