In [1]:
from pathlib import Path
import json
from datetime import datetime
import re

import pdfplumber
import pandas as pd
from tqdm import tqdm

# Project configuration lives at a fixed location under the user's home directory.
config_path = Path.home().joinpath(
    "Documents", "jurimetria_aved_stf", "config", "project_config.json"
)
config = json.loads(config_path.read_text(encoding="utf-8"))

# Resolve the three working directories from the "paths" section (with ~ expanded).
raw_pdf_dir, pdf_txt_dir, logs_dir = (
    Path(config["paths"][path_key]).expanduser()
    for path_key in ("raw_pdf_dir", "pdf_txt_dir", "logs_dir")
)

# Output directories are created up front; the raw PDF directory is only read.
for _output_dir in (pdf_txt_dir, logs_dir):
    _output_dir.mkdir(parents=True, exist_ok=True)

# Optional OCR settings; a missing "ocr" section means an empty dict / disabled.
ocr_conf = config.get("ocr", {})
OCR_ENABLED = True if ocr_conf.get("enabled", False) else False

# Extracted text shorter than this many characters triggers the OCR fallback.
MIN_TEXT_CHARS = 450

# heurísticas contra duplicação de caracteres
def line_looks_doubled(line: str, ratio_threshold: float = 0.8) -> bool:
    """Return True when most letters of ``line`` appear as adjacent identical pairs.

    Only alphabetic characters are considered (everything else is dropped before
    scanning). Lines with fewer than 6 letters are never flagged. The letters are
    scanned left to right; each matched pair is consumed as a unit and contributes
    2 to the paired count. The line is flagged when ``paired / total`` is at least
    ``ratio_threshold``.
    """
    alpha_chars = [ch for ch in line if ch.isalpha()]
    total = len(alpha_chars)
    if total < 6:
        return False

    paired = 0
    pos = 0
    last_start = total - 1
    while pos < last_start:
        if alpha_chars[pos] != alpha_chars[pos + 1]:
            pos += 1
            continue
        # Matched pair: count both letters and jump past them.
        paired += 2
        pos += 2

    return paired / total >= ratio_threshold

def dedouble_line(line: str) -> str:
    """Collapse every adjacent pair of identical characters in ``line`` into one.

    Pairs are matched left to right without overlap, so a run of four identical
    characters becomes two and a run of three becomes two.
    """
    pair_pattern = re.compile(r"(.)\1")
    return pair_pattern.sub(r"\1", line)

def dedouble_text(text: str) -> str:
    """Apply ``dedouble_line`` to each line of ``text`` that looks doubled.

    The text is split with ``str.splitlines()``; lines flagged by
    ``line_looks_doubled`` are collapsed, the others pass through unchanged, and
    the result is re-joined with ``"\\n"``.
    """
    return "\n".join(
        dedouble_line(row) if line_looks_doubled(row) else row
        for row in text.splitlines()
    )

# OCR de fallback
def ocr_fallback(pdf_path: Path, ocr_conf: dict) -> tuple[str, int, str]:
    """Rasterize a PDF and OCR every page, returning the recognized text.

    Args:
        pdf_path: PDF file to process.
        ocr_conf: OCR settings. Recognized keys: ``lang`` (default ``"por+eng"``),
            ``dpi`` (default 300), ``poppler_path`` and ``tesseract_cmd``
            (both optional; empty values are ignored).

    Returns:
        ``(text, n_pages, error)`` where ``text`` is the stripped OCR output of all
        pages joined by newlines, ``n_pages`` is the number of rasterized pages and
        ``error`` is ``""`` on full success. ``error`` starts with
        ``ocr_import_error`` / ``ocr_pdf2image_error`` when nothing could be
        processed (``text`` is then ``""`` and ``n_pages`` is 0), or with
        ``ocr_page_errors`` listing the 1-based pages whose OCR raised.

    This function does not raise for OCR-related failures; they are reported
    through the ``error`` element instead.
    """
    # Imported lazily so the notebook still runs when the OCR stack is missing.
    try:
        from pdf2image import convert_from_path
        import pytesseract
    except ImportError as e:
        return "", 0, f"ocr_import_error: {e}"

    lang = ocr_conf.get("lang", "por+eng")
    dpi = int(ocr_conf.get("dpi", 300))
    poppler_path = ocr_conf.get("poppler_path") or None
    tesseract_cmd = ocr_conf.get("tesseract_cmd") or None
    if tesseract_cmd:
        # Module-level setting: affects every later pytesseract call in the kernel.
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    # Only forward poppler_path when configured so pdf2image uses its own default.
    convert_kwargs = {"dpi": dpi}
    if poppler_path:
        convert_kwargs["poppler_path"] = poppler_path
    try:
        images = convert_from_path(str(pdf_path), **convert_kwargs)
    except Exception as e:
        return "", 0, f"ocr_pdf2image_error: {e}"

    ocr_parts = []
    page_errors = []
    for page_no, img in enumerate(images, start=1):
        try:
            ocr_parts.append(pytesseract.image_to_string(img, lang=lang))
        except Exception as e:
            # Best effort: keep the page slot empty, but remember what failed
            # instead of silently discarding the exception.
            ocr_parts.append("")
            page_errors.append(f"p{page_no}: {e}")

    text = "\n".join(ocr_parts).strip()
    error = ""
    if page_errors:
        error = "ocr_page_errors: " + "; ".join(page_errors)
    return text, len(images), error

# loop principal
# Column order of the extraction log; also used so an empty run still yields a
# well-formed (zero-row) DataFrame instead of failing in sort_values.
LOG_COLUMNS = [
    "file_name",
    "source_path",
    "method",
    "n_pages",
    "text_len",
    "dedoubled",
    "timestamp",
    "error",
]

pdf_files = sorted(raw_pdf_dir.glob("*.pdf"))
records = []

for pdf_path in tqdm(pdf_files, desc="Extraindo PDFs (robusto)"):
    out_txt = pdf_txt_dir / f"{pdf_path.stem}.txt"

    # Resume support: a PDF whose .txt already exists is not re-extracted; only
    # its current text length is logged.
    if out_txt.exists():
        text_content = out_txt.read_text(encoding="utf-8", errors="ignore")
        records.append(
            {
                "file_name": pdf_path.name,
                "source_path": str(pdf_path),
                "method": "skipped_existing",
                "n_pages": None,
                "text_len": len(text_content),
                "dedoubled": False,
                "timestamp": datetime.utcnow().isoformat(),
                "error": "",
            }
        )
        continue

    extracted_text = ""
    used_method = ""
    n_pages = 0
    error_msg = ""
    dedoubled_flag = False

    # 1) Try the PDF text layer first with pdfplumber.
    try:
        with pdfplumber.open(pdf_path) as pdf:
            n_pages = len(pdf.pages)
            parts = [
                page.extract_text(x_tolerance=1, y_tolerance=1) or ""
                for page in pdf.pages
            ]
        extracted_text = "\n".join(parts).strip()
        used_method = "pdfplumber"
    except Exception as e:
        extracted_text = ""
        used_method = "pdfplumber_failed"
        error_msg = str(e)

    # 2) Text layer missing or too short: fall back to OCR.
    if len(extracted_text) < MIN_TEXT_CHARS:
        # NOTE: OCR runs here whether or not OCR_ENABLED is set; the original
        # code called ocr_fallback in both branches of the flag check.
        ocr_text, ocr_pages, ocr_err = ocr_fallback(pdf_path, ocr_conf)

        # Adopt the OCR result only when it produced more text than pdfplumber.
        if ocr_text and len(ocr_text) > len(extracted_text):
            extracted_text = ocr_text
            n_pages = ocr_pages
            used_method = "ocr_fallback"

        # Always log an OCR error, and never discard an earlier pdfplumber error.
        if ocr_err:
            error_msg = f"{error_msg} | {ocr_err}" if error_msg else ocr_err

    # 3) Collapse lines whose characters were emitted twice, then persist.
    if extracted_text:
        cleaned_text = dedouble_text(extracted_text)
        if cleaned_text != extracted_text:
            dedoubled_flag = True
        extracted_text = cleaned_text
        out_txt.write_text(extracted_text, encoding="utf-8")

    records.append(
        {
            "file_name": pdf_path.name,
            "source_path": str(pdf_path),
            "method": used_method,
            "n_pages": n_pages,
            "text_len": len(extracted_text),
            "dedoubled": dedoubled_flag,
            "timestamp": datetime.utcnow().isoformat(),
            "error": error_msg,
        }
    )

# Explicit columns keep sort_values from raising KeyError when no PDFs were found.
df = pd.DataFrame(records, columns=LOG_COLUMNS).sort_values("file_name")
df.to_csv(logs_dir / "extraction_log.csv", index=False)

df

Extraindo PDFs (robusto): 100%|███████████████| 123/123 [21:54<00:00, 10.69s/it]


Unnamed: 0,file_name,source_path,method,n_pages,text_len,dedoubled,timestamp,error
0,a_002.pdf,/Users/cibelealexandreu/Documents/jurimetria_a...,pdfplumber,109,237261,True,2025-11-01T01:57:53.894136,
1,a_003.pdf,/Users/cibelealexandreu/Documents/jurimetria_a...,pdfplumber,88,186037,True,2025-11-01T01:58:00.745634,
2,a_004.pdf,/Users/cibelealexandreu/Documents/jurimetria_a...,pdfplumber,112,244245,True,2025-11-01T01:58:09.271425,
3,a_005.pdf,/Users/cibelealexandreu/Documents/jurimetria_a...,pdfplumber,110,239829,True,2025-11-01T01:58:19.204566,
4,a_006.pdf,/Users/cibelealexandreu/Documents/jurimetria_a...,pdfplumber,134,293097,True,2025-11-01T01:58:31.468159,
...,...,...,...,...,...,...,...,...
118,a_125.pdf,/Users/cibelealexandreu/Documents/jurimetria_a...,pdfplumber,138,234604,True,2025-11-01T02:16:41.258534,
119,a_126.pdf,/Users/cibelealexandreu/Documents/jurimetria_a...,pdfplumber,210,384783,True,2025-11-01T02:16:51.818882,
120,a_127.pdf,/Users/cibelealexandreu/Documents/jurimetria_a...,pdfplumber,1991,3853414,True,2025-11-01T02:19:29.016798,
121,a_128.pdf,/Users/cibelealexandreu/Documents/jurimetria_a...,pdfplumber,144,253882,True,2025-11-01T02:19:34.537821,


In [2]:
# Environment check: display the Tesseract version that pytesseract reports,
# confirming the OCR engine used by the fallback path is reachable from this kernel.
import pytesseract
pytesseract.get_tesseract_version()

<Version('5.5.1')>

In [3]:
from pathlib import Path
# Peek at the first 400 characters of the project configuration file.
cfg = Path.home() / "Documents" / "jurimetria_aved_stf" / "config" / "project_config.json"
# Decode explicitly as UTF-8, matching how the extraction cell loads this file,
# instead of relying on the platform's default encoding.
print(cfg.read_text(encoding="utf-8")[:400])

{
  "project_name": "jurimetria_aved_stf",
  "created_at_utc": "2025-11-01T01:34:20.533018",
  "python_version": "3.11.11 | packaged by conda-forge | (main, Dec  5 2024, 08:47:03) [Clang 18.1.8 ]",
  "platform": "macOS-15.6.1-arm64-arm-64bit",
  "machine": "arm64",
  "paths": {
    "project_root": "/Users/cibelealexandreu/Documents/jurimetria_aved_stf",
    "config_dir": "/Users/cibelealexandreu/D
