In [9]:
import importlib
import importlib.metadata
import json
import platform
import shutil
import sys
from datetime import datetime, timezone
from pathlib import Path

# Project layout: everything lives under ~/Documents/jurimetria_aved_stf.
home = Path.home()
project_root = home.joinpath("Documents", "jurimetria_aved_stf")

# First-level folders of the project tree.
config_dir = project_root.joinpath("config")
data_dir = project_root.joinpath("data")
logs_dir = project_root.joinpath("logs")
outputs_dir = project_root.joinpath("outputs")
notebooks_dir = project_root.joinpath("notebooks")

# Output artefacts are split by kind.
figures_dir = outputs_dir.joinpath("figures")
tables_dir = outputs_dir.joinpath("tables")
reports_dir = outputs_dir.joinpath("reports")

# Data folders carry a numeric prefix (00 .. 05) so they sort in sequence.
data_pdf_raw = data_dir.joinpath("00_pdf_raw")
data_pdf_txt = data_dir.joinpath("01_pdf_txt")
data_corpus_clean = data_dir.joinpath("02_corpus_clean")
data_thematic = data_dir.joinpath("03_thematic")
data_ranked = data_dir.joinpath("04_ranked")
data_master = data_dir.joinpath("05_master")

# Create the whole tree. Missing parents are created on the way and folders
# that already exist are left untouched, so re-running this is harmless.
for d in (
    project_root,
    config_dir,
    data_dir,
    data_pdf_raw,
    data_pdf_txt,
    data_corpus_clean,
    data_thematic,
    data_ranked,
    data_master,
    logs_dir,
    outputs_dir,
    figures_dir,
    tables_dir,
    reports_dir,
    notebooks_dir,
):
    d.mkdir(parents=True, exist_ok=True)

# Third-party packages whose availability and version are recorded in the
# project configuration.
required_packages = [
    "pdfplumber",
    "pytesseract",
    "pdf2image",
    "pandas",
    "tqdm",
    "matplotlib",
]

# Probe every package: "available" says whether it imports, "version" holds
# the detected version (None when the import fails).
packages_info = {}
for pkg in required_packages:
    # Keep the try body down to the import so only a failed import is
    # reported as "not available".
    try:
        m = importlib.import_module(pkg)
    except ImportError:
        packages_info[pkg] = {"available": False, "version": None}
        continue

    v = getattr(m, "__version__", None)
    if v is None:
        # Not every module exposes __version__; ask the installed
        # distribution's metadata before settling for "unknown".
        try:
            v = importlib.metadata.version(pkg)
        except importlib.metadata.PackageNotFoundError:
            v = "unknown"
    packages_info[pkg] = {"available": True, "version": v}

# Look up the external executables on PATH. An empty string is stored for
# whichever one cannot be found.
poppler_bin = shutil.which("pdftoppm")
if poppler_bin:
    # Keep only the folder that contains pdftoppm.
    poppler_path = str(Path(poppler_bin).parent)
else:
    poppler_path = ""

tesseract_bin = shutil.which("tesseract")
tesseract_cmd = tesseract_bin or ""

# Snapshot of the project setup: interpreter and platform details, every
# project path (as strings so the dict is JSON-serializable), the OCR settings
# and the package probe results.
project_config = {
    "project_name": "jurimetria_aved_stf",
    # Timezone-aware UTC timestamp: the ISO string carries an explicit
    # "+00:00" offset, so it cannot be mistaken for local time.
    # datetime.utcnow() returned a naive value and is deprecated since 3.12.
    "created_at_utc": datetime.now(timezone.utc).isoformat(),
    "python_version": sys.version,
    "platform": platform.platform(),
    "machine": platform.machine(),
    "paths": {
        "project_root": str(project_root),
        "config_dir": str(config_dir),
        "data_dir": str(data_dir),
        "logs_dir": str(logs_dir),
        "notebooks_dir": str(notebooks_dir),
        "raw_pdf_dir": str(data_pdf_raw),
        "pdf_txt_dir": str(data_pdf_txt),
        "corpus_clean_dir": str(data_corpus_clean),
        "thematic_dir": str(data_thematic),
        "ranked_dir": str(data_ranked),
        "master_dir": str(data_master),
        "outputs_dir": str(outputs_dir),
        "figures_dir": str(figures_dir),
        "tables_dir": str(tables_dir),
        "reports_dir": str(reports_dir),
    },
    # OCR settings. The executable locations are empty strings when the
    # binaries were not found on PATH; OCR is stored as switched off.
    "ocr": {
        "lang": "por+eng",
        "dpi": 300,
        "poppler_path": poppler_path,
        "tesseract_cmd": tesseract_cmd,
        "enabled": False,
    },
    "packages": packages_info,
}

# Persist the configuration twice with identical content: once as the
# project's config file and once as an environment snapshot in the logs
# folder. The JSON text is built a single time and reused for both files.
_config_json = json.dumps(project_config, ensure_ascii=False, indent=2)

config_path = config_dir / "project_config.json"
config_path.write_text(_config_json, encoding="utf-8")

env_log_path = logs_dir / "env_info.json"
env_log_path.write_text(_config_json, encoding="utf-8")

# Bare expression: an interactive session echoes the dictionary as output.
project_config

{'project_name': 'jurimetria_aved_stf',
 'created_at_utc': '2025-11-01T01:34:20.533018',
 'python_version': '3.11.11 | packaged by conda-forge | (main, Dec  5 2024, 08:47:03) [Clang 18.1.8 ]',
 'platform': 'macOS-15.6.1-arm64-arm-64bit',
 'machine': 'arm64',
 'paths': {'project_root': '/Users/cibelealexandreu/Documents/jurimetria_aved_stf',
  'config_dir': '/Users/cibelealexandreu/Documents/jurimetria_aved_stf/config',
  'data_dir': '/Users/cibelealexandreu/Documents/jurimetria_aved_stf/data',
  'logs_dir': '/Users/cibelealexandreu/Documents/jurimetria_aved_stf/logs',
  'notebooks_dir': '/Users/cibelealexandreu/Documents/jurimetria_aved_stf/notebooks',
  'raw_pdf_dir': '/Users/cibelealexandreu/Documents/jurimetria_aved_stf/data/00_pdf_raw',
  'pdf_txt_dir': '/Users/cibelealexandreu/Documents/jurimetria_aved_stf/data/01_pdf_txt',
  'corpus_clean_dir': '/Users/cibelealexandreu/Documents/jurimetria_aved_stf/data/02_corpus_clean',
  'thematic_dir': '/Users/cibelealexandreu/Documents/jurime