## Lire le document : déterminer son type et s'il s'agit d'une image ou s'il contient du texte dans son code

In [39]:
from __future__ import annotations

import csv
import os
import re
import zipfile
import xml.etree.ElementTree as ET
from dataclasses import dataclass
from typing import Optional, Sequence, Union, List, Dict, Any


# Accepted input forms:
# INPUT_FILE = "a.pdf, b.docx, c.png"          (one comma-separated string)
# INPUT_FILE = ["a.pdf", "b.docx", "c.png"]    (a sequence of paths)
INPUT_FILE: Optional[Union[str, Sequence[str]]] = (
    # "epsteanpdf.pdf, epsteain22.pdf, testexcel.xlsx, testword.docx, image2tab.webp, contras-14page.pdf, signettab.png"
    # "contras-14page.pdf, testword.docx, testexcel.xlsx, signettab.png, image2tab.webp"
    "documents/testword.docx"
)

# Heuristics
MIN_CHARS_OFFICE = 1     # a single extracted character is enough to classify as "text"
MIN_CHARS_PDF = 30       # minimum number of extracted non-whitespace characters for a PDF
PDF_MAX_PAGES = 3        # only the first N pages of a PDF are inspected

# Directories searched when a path does not exist as given (useful in notebooks)
SEARCH_DIRS = [
    os.getcwd(),
    "/mnt/data",  # useful in the ChatGPT environment
]


@dataclass(frozen=True)
class FileType:
    """Immutable description of a detected file format."""

    ext: str    # extension string for the format (e.g. ".pdf")
    mime: str   # MIME type string (e.g. "application/pdf")
    label: str  # human-readable name of the format


# ----------------- input parsing -----------------

def normalize_input_files(x: Optional[Union[str, Sequence[str]]]) -> List[str]:
    """Always return a list of non-empty, stripped path strings.

    ``None`` yields an empty list. A string is parsed as one CSV record when
    it contains a comma (so quoted names may themselves contain commas);
    otherwise it is a single path. Any other input is treated as an iterable
    whose items are converted with ``str``.
    """
    if x is None:
        return []

    if not isinstance(x, str):
        cleaned = (str(entry).strip() for entry in x)
        return [name for name in cleaned if name]

    text = x.strip()
    if not text:
        return []

    if "," in text:
        fields = next(csv.reader([text], skipinitialspace=True))
    else:
        fields = [text]
    return [name for name in (field.strip() for field in fields) if name]


def resolve_path(p: str) -> Optional[str]:
    """Resolve *p* to an existing path, or return ``None`` when nothing matches.

    After stripping whitespace and expanding ``~`` and environment variables:
    - if the result exists, it is returned as is;
    - otherwise its basename is looked up in each SEARCH_DIRS entry, in order,
      and the first existing candidate is returned;
    - otherwise ``None`` is returned.
    """
    candidate = os.path.expanduser(p.strip())
    candidate = os.path.expandvars(candidate)
    if os.path.exists(candidate):
        return candidate

    filename = os.path.basename(candidate)
    fallbacks = (os.path.join(folder, filename) for folder in SEARCH_DIRS)
    return next((alt for alt in fallbacks if os.path.exists(alt)), None)


# ----------------- format detection -----------------

def _read_head(path: str, n: int = 16384) -> bytes:
    """Return at most the first *n* bytes of the file at *path* (binary mode)."""
    with open(path, mode="rb") as stream:
        prefix = stream.read(n)
    return prefix


def detect_path_type(path: str) -> FileType:
    """Identify a file's format from its leading bytes (and ZIP members).

    The file name is never consulted. ZIP-based files are opened to tell
    EPUB, Office OpenXML and OpenDocument apart; any error while inspecting
    the archive results in the generic ZIP type. Unrecognized content is
    reported as "Unknown / binary" with an empty extension.
    """
    head = _read_head(path)

    # Formats recognizable from a fixed prefix, tested in this order.
    simple_signatures = (
        ((b"%PDF-",), FileType(".pdf", "application/pdf", "PDF document")),
        ((b"II*\x00", b"MM\x00*"), FileType(".tif", "image/tiff", "TIFF image")),
        ((b"\x89PNG\r\n\x1a\n",), FileType(".png", "image/png", "PNG image")),
        ((b"\xff\xd8\xff",), FileType(".jpg", "image/jpeg", "JPEG image")),
    )
    for prefixes, ftype in simple_signatures:
        if head.startswith(prefixes):
            return ftype

    # WEBP: RIFF container whose form type at offset 8 is "WEBP".
    if head[:4] == b"RIFF" and head[8:12] == b"WEBP":
        return FileType(".webp", "image/webp", "WEBP image")

    # ZIP containers (DOCX/XLSX/PPTX/ODT/ODS/ODP/EPUB/ZIP)
    if head.startswith((b"PK\x03\x04", b"PK\x05\x06", b"PK\x07\x08")):
        try:
            with zipfile.ZipFile(path, "r") as archive:
                members = set(archive.namelist())
                has_mimetype = "mimetype" in members

                def declared_mimetype(limit: int) -> str:
                    # Content of the "mimetype" member, or "" if it cannot be read.
                    try:
                        raw = archive.read("mimetype")[:limit]
                        return raw.decode("ascii", errors="ignore").strip()
                    except Exception:
                        return ""

                # EPUB
                if has_mimetype and "META-INF/container.xml" in members:
                    if declared_mimetype(64) == "application/epub+zip":
                        return FileType(".epub", "application/epub+zip", "EPUB eBook")

                # Office OpenXML: identified by the main part of each document kind.
                ooxml_markers = (
                    (
                        "word/document.xml",
                        ".docx",
                        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                        "Word document (DOCX)",
                    ),
                    (
                        "xl/workbook.xml",
                        ".xlsx",
                        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                        "Excel workbook (XLSX)",
                    ),
                    (
                        "ppt/presentation.xml",
                        ".pptx",
                        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
                        "PowerPoint presentation (PPTX)",
                    ),
                )
                for marker, ext, mime, label in ooxml_markers:
                    if marker in members:
                        return FileType(ext, mime, label)

                # OpenDocument
                if "content.xml" in members and "META-INF/manifest.xml" in members:
                    odf_mime = declared_mimetype(128) if has_mimetype else ""
                    odf_kinds = {
                        "application/vnd.oasis.opendocument.text": (".odt", "OpenDocument Text (ODT)"),
                        "application/vnd.oasis.opendocument.spreadsheet": (".ods", "OpenDocument Spreadsheet (ODS)"),
                        "application/vnd.oasis.opendocument.presentation": (".odp", "OpenDocument Presentation (ODP)"),
                    }
                    if odf_mime in odf_kinds:
                        ext, label = odf_kinds[odf_mime]
                        return FileType(ext, odf_mime, label)
                    return FileType(".odf", "application/zip", "OpenDocument container")

        except Exception:
            # Unreadable archive: fall through to the generic ZIP answer.
            pass

        return FileType(".zip", "application/zip", "ZIP archive/container")

    # Legacy Office (OLE2)
    if head.startswith(b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"):
        return FileType(".ole", "application/x-ole-storage", "OLE2 container (old Office)")

    return FileType("", "application/octet-stream", "Unknown / binary")


# ----------------- text vs image_only -----------------

def _xml_text_len(xml_bytes: bytes) -> int:
    """Count the text characters contained in an XML document.

    Every text node is counted, including text that follows a child element
    (ElementTree's ``tail``), so mixed content such as
    ``<p>Hello <b>big</b> world</p>`` is fully accounted for. Each text node
    is stripped of surrounding whitespace before being measured.

    If the payload cannot be parsed as XML, a best-effort fallback removes
    anything that looks like a tag, collapses whitespace runs and returns the
    length in bytes of what remains.
    """
    try:
        root = ET.fromstring(xml_bytes)
    except Exception:
        # Deliberately broad: callers only need a rough count, never an error.
        without_tags = re.sub(rb"<[^>]+>", b" ", xml_bytes)
        return len(re.sub(rb"\s+", b" ", without_tags).strip())

    # itertext() yields both ``text`` and ``tail`` of every element in the tree.
    return sum(len(chunk.strip()) for chunk in root.itertext())


def _zip_has_text(path: str, ext: str) -> bool:
    """Tell whether a ZIP-based document contains text.

    Handles DOCX/XLSX/PPTX/ODT/ODS/ODP/EPUB. Returns True when at least
    MIN_CHARS_OFFICE characters are found in the parts inspected for the
    given *ext*. Any error (unreadable archive or member) and any other
    extension yield False.
    """

    def reaches_minimum(lengths, start: int = 0) -> bool:
        # Consume lengths lazily so nothing more is read once the threshold is met.
        running = start
        for amount in lengths:
            running += amount
            if running >= MIN_CHARS_OFFICE:
                return True
        return running >= MIN_CHARS_OFFICE

    try:
        with zipfile.ZipFile(path, "r") as archive:
            members = archive.namelist()

            def xml_lengths(prefix: str):
                # Text length of each ".xml" member whose name starts with prefix.
                for member in members:
                    if member.startswith(prefix) and member.endswith(".xml"):
                        yield _xml_text_len(archive.read(member))

            if ext == ".docx":
                chars = 0
                # Main body
                if "word/document.xml" in members:
                    chars = _xml_text_len(archive.read("word/document.xml"))
                # Headers and footers (often hold "isolated" text)
                for member in members:
                    if member.endswith(".xml") and member.startswith(("word/header", "word/footer")):
                        chars += _xml_text_len(archive.read(member))
                    if chars >= MIN_CHARS_OFFICE:
                        break
                return chars >= MIN_CHARS_OFFICE

            if ext == ".xlsx":
                shared = 0
                if "xl/sharedStrings.xml" in members:
                    shared = _xml_text_len(archive.read("xl/sharedStrings.xml"))
                if shared >= MIN_CHARS_OFFICE:
                    return True
                # Worksheets are only read when the shared strings were not enough.
                return reaches_minimum(xml_lengths("xl/worksheets/"), shared)

            if ext == ".pptx":
                return reaches_minimum(xml_lengths("ppt/slides/"))

            if ext in {".odt", ".ods", ".odp"}:
                if "content.xml" not in members:
                    return False
                return _xml_text_len(archive.read("content.xml")) >= MIN_CHARS_OFFICE

            if ext == ".epub":

                def html_lengths():
                    # Tag-stripped byte length of each (X)HTML member; unreadable ones are skipped.
                    for member in members:
                        if not member.lower().endswith((".xhtml", ".html", ".htm")):
                            continue
                        try:
                            raw = archive.read(member)
                        except Exception:
                            continue
                        without_tags = re.sub(rb"<[^>]+>", b" ", raw)
                        yield len(re.sub(rb"\s+", b" ", without_tags).strip())

                return reaches_minimum(html_lengths())

    except Exception:
        return False

    return False


def _get_pdf_reader():
    """Return the ``PdfReader`` class from pypdf, else from PyPDF2, else ``None``."""
    try:
        from pypdf import PdfReader as reader_cls  # type: ignore
    except ImportError:
        reader_cls = None

    if reader_cls is None:
        # pypdf is unavailable: try the older PyPDF2 distribution.
        try:
            from PyPDF2 import PdfReader as reader_cls  # type: ignore
        except ImportError:
            return None

    return reader_cls


def _pdf_has_text(path: str) -> bool:
    """Heuristically decide whether a PDF carries text.

    With a PDF library available, the first ``max(1, PDF_MAX_PAGES)`` pages
    are inspected: True as soon as the cumulated non-whitespace characters
    returned by ``extract_text()`` reach MIN_CHARS_PDF; otherwise True if any
    inspected page declares a ``/Font`` resource or its content stream holds
    text operators (``BT`` with ``Tj``/``TJ``). Any error while reading the
    PDF yields False.

    Without a PDF library, the first 2 MB of the file are scanned for
    ``/Font`` or the same operators (less reliable, but avoids answering
    False systematically).
    """

    def has_text_operators(blob: bytes) -> bool:
        return b"BT" in blob and (b"Tj" in blob or b"TJ" in blob)

    reader_cls = _get_pdf_reader()

    if reader_cls is None:
        # Raw byte scan fallback
        try:
            with open(path, "rb") as fh:
                blob = fh.read(2_000_000)  # 2MB max
        except Exception:
            return False
        return b"/Font" in blob or has_text_operators(blob)

    try:
        document = reader_cls(path)
        char_count = 0
        fonts_found = False
        text_ops_found = False

        for page in document.pages[: max(1, PDF_MAX_PAGES)]:
            # 1) text extraction
            extracted = page.extract_text() or ""
            char_count += len("".join(extracted.split()))
            if char_count >= MIN_CHARS_PDF:
                return True

            # 2) fonts declared in the page resources
            try:
                resources = page.get("/Resources") or {}
                if resources.get("/Font"):
                    fonts_found = True
            except Exception:
                pass

            # 3) text operators in the content stream
            try:
                stream = page.get_contents()
                if stream is not None:
                    if hasattr(stream, "get_data"):
                        raw = stream.get_data()
                    else:
                        raw = b"".join(part.get_data() for part in stream)  # type: ignore
                    if has_text_operators(raw):
                        text_ops_found = True
            except Exception:
                pass

        return fonts_found or text_ops_found

    except Exception:
        return False


def content_kind_two_states(path: str, ftype: FileType) -> str:
    """Classify a file as exactly one of ``'text'`` or ``'image_only'``.

    Only PDFs and the ZIP-based document formats can be reported as
    ``'text'``, and only when text is actually found in them. Everything
    else (images, unknown formats, other containers) is ``'image_only'``
    because only two states are wanted.
    """
    ext = ftype.ext.lower()

    if ext == ".pdf":
        has_text = _pdf_has_text(path)
    elif ext in {".docx", ".xlsx", ".pptx", ".odt", ".ods", ".odp", ".epub"}:
        # Compressed text formats (Office/ODF/EPUB)
        has_text = _zip_has_text(path, ext)
    else:
        # Image extensions and anything unrecognized are never inspected.
        has_text = False

    return "text" if has_text else "image_only"


def analyze_many_two_states(input_file: Optional[Union[str, Sequence[str]]]) -> List[Dict[str, Any]]:
    """Detect the type and content kind of every input file that can be found.

    Returns one dict per resolved file, in input order:
      {"path": ..., "ext": ..., "mime": ..., "label": ..., "content": "text|image_only"}
    Inputs that cannot be resolved to an existing path are silently skipped.
    """
    results: List[Dict[str, Any]] = []

    for candidate in normalize_input_files(input_file):
        resolved = resolve_path(candidate)
        if resolved is None:
            continue

        detected = detect_path_type(resolved)
        record: Dict[str, Any] = {
            "path": resolved,
            "ext": detected.ext,
            "mime": detected.mime,
            "label": detected.label,
        }
        record["content"] = content_kind_two_states(resolved, detected)
        results.append(record)

    return results


# Smoke test: classify the configured INPUT_FILE; as the last expression of the
# cell, the returned list is what the notebook displays.
analyze_many_two_states(INPUT_FILE)


[{'path': 'documents/testword.docx',
  'ext': '.docx',
  'mime': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
  'label': 'Word document (DOCX)',
  'content': 'text'}]

### Si le fichier est une image, lui appliquer un prétraitement pour l'améliorer ; sinon (document contenant du texte dans son code source), ne rien faire

In [40]:
from __future__ import annotations

import csv
import os
import re
import zipfile
import xml.etree.ElementTree as ET
from dataclasses import dataclass
from typing import Optional, Sequence, Union, List

import argparse
import sys
from pathlib import Path
from typing import Iterable, List, Optional, Tuple

import pytesseract
from PIL import Image, ImageEnhance, ImageFilter, ImageOps


# numpy is optional: when it is missing, np is None and the Otsu step of
# preprocess_image is skipped.
try:
    import numpy as np  # type: ignore
except ImportError:  # pragma: no cover
    np = None

# Base directory used to resolve relative input paths.
try:
    SCRIPT_DIR = Path(__file__).resolve().parent
except NameError:
    # In notebooks __file__ is undefined; fall back to current working directory.
    SCRIPT_DIR = Path.cwd()

# Defaults for the OCR options (used by EnhanceOptions and parse_args).
DEFAULT_LANG = "fra"
DEFAULT_CONTRAST = 1.5
DEFAULT_SHARPNESS = 1.2
DEFAULT_BRIGHTNESS = 1.0
DEFAULT_UPSCALE = 1.5
DEFAULT_DPI = 300

# Heuristics for the text / image_only classification
MIN_CHARS_OFFICE = 1
MIN_CHARS_PDF = 30
PDF_MAX_PAGES = 3
SEARCH_DIRS = [os.getcwd(), "/mnt/data"]  # useful in notebooks


@dataclass(frozen=True)
class FileType:
    """Immutable description of a detected file format."""

    ext: str    # extension string for the format (e.g. ".pdf")
    mime: str   # MIME type string (e.g. "application/pdf")
    label: str  # human-readable name of the format


def _read_head(path: str, n: int = 16384) -> bytes:
    """Return at most the first *n* bytes of the file at *path* (binary mode)."""
    with open(path, mode="rb") as stream:
        prefix = stream.read(n)
    return prefix


def normalize_input_files(x: Optional[Union[str, Sequence[str]]]) -> List[str]:
    """Always return a list of non-empty, stripped path strings.

    ``None`` yields an empty list. A string is parsed as one CSV record when
    it contains a comma (so quoted names may themselves contain commas);
    otherwise it is a single path. Any other input is treated as an iterable
    whose items are converted with ``str``.
    """
    if x is None:
        return []

    if not isinstance(x, str):
        cleaned = (str(entry).strip() for entry in x)
        return [name for name in cleaned if name]

    text = x.strip()
    if not text:
        return []

    if "," in text:
        fields = next(csv.reader([text], skipinitialspace=True))
    else:
        fields = [text]
    return [name for name in (field.strip() for field in fields) if name]


def resolve_path(p: str) -> Optional[str]:
    """Resolve *p* to the absolute path of an existing file, or ``None``.

    After stripping whitespace and expanding ``~`` and environment variables,
    the path is used if it exists; otherwise its basename is looked up in
    each SEARCH_DIRS entry, in order. The first hit is returned as an
    absolute path.
    """
    candidate = os.path.expanduser(p.strip())
    candidate = os.path.expandvars(candidate)
    if os.path.exists(candidate):
        return os.path.abspath(candidate)

    filename = os.path.basename(candidate)
    fallbacks = (os.path.join(folder, filename) for folder in SEARCH_DIRS)
    found = next((alt for alt in fallbacks if os.path.exists(alt)), None)
    return None if found is None else os.path.abspath(found)


def detect_path_type(path: str) -> FileType:
    """Identify a file's format from its leading bytes (and ZIP members).

    The file name is never consulted. ZIP-based files are opened to tell
    EPUB, Office OpenXML and OpenDocument apart; any error while inspecting
    the archive results in the generic ZIP type. Unrecognized content is
    reported as "Unknown / binary" with an empty extension.
    """
    head = _read_head(path)

    # Formats recognizable from a fixed prefix, tested in this order.
    simple_signatures = (
        ((b"%PDF-",), FileType(".pdf", "application/pdf", "PDF document")),
        ((b"II*\x00", b"MM\x00*"), FileType(".tif", "image/tiff", "TIFF image")),
        ((b"\x89PNG\r\n\x1a\n",), FileType(".png", "image/png", "PNG image")),
        ((b"\xff\xd8\xff",), FileType(".jpg", "image/jpeg", "JPEG image")),
    )
    for prefixes, ftype in simple_signatures:
        if head.startswith(prefixes):
            return ftype

    # WEBP: RIFF container whose form type at offset 8 is "WEBP".
    if head[:4] == b"RIFF" and head[8:12] == b"WEBP":
        return FileType(".webp", "image/webp", "WEBP image")

    # ZIP containers (DOCX/XLSX/PPTX/ODT/ODS/ODP/EPUB/ZIP)
    if head.startswith((b"PK\x03\x04", b"PK\x05\x06", b"PK\x07\x08")):
        try:
            with zipfile.ZipFile(path, "r") as archive:
                members = set(archive.namelist())
                has_mimetype = "mimetype" in members

                def declared_mimetype(limit: int) -> str:
                    # Content of the "mimetype" member, or "" if it cannot be read.
                    try:
                        raw = archive.read("mimetype")[:limit]
                        return raw.decode("ascii", errors="ignore").strip()
                    except Exception:
                        return ""

                # EPUB
                if has_mimetype and "META-INF/container.xml" in members:
                    if declared_mimetype(64) == "application/epub+zip":
                        return FileType(".epub", "application/epub+zip", "EPUB eBook")

                # Office OpenXML: identified by the main part of each document kind.
                ooxml_markers = (
                    (
                        "word/document.xml",
                        ".docx",
                        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                        "Word document (DOCX)",
                    ),
                    (
                        "xl/workbook.xml",
                        ".xlsx",
                        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                        "Excel workbook (XLSX)",
                    ),
                    (
                        "ppt/presentation.xml",
                        ".pptx",
                        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
                        "PowerPoint presentation (PPTX)",
                    ),
                )
                for marker, ext, mime, label in ooxml_markers:
                    if marker in members:
                        return FileType(ext, mime, label)

                # OpenDocument
                if "content.xml" in members and "META-INF/manifest.xml" in members:
                    odf_mime = declared_mimetype(128) if has_mimetype else ""
                    odf_kinds = {
                        "application/vnd.oasis.opendocument.text": (".odt", "OpenDocument Text (ODT)"),
                        "application/vnd.oasis.opendocument.spreadsheet": (".ods", "OpenDocument Spreadsheet (ODS)"),
                        "application/vnd.oasis.opendocument.presentation": (".odp", "OpenDocument Presentation (ODP)"),
                    }
                    if odf_mime in odf_kinds:
                        ext, label = odf_kinds[odf_mime]
                        return FileType(ext, odf_mime, label)
                    return FileType(".odf", "application/zip", "OpenDocument container")
        except Exception:
            # Unreadable archive: fall through to the generic ZIP answer.
            pass

        return FileType(".zip", "application/zip", "ZIP archive/container")

    # Legacy Office (OLE2)
    if head.startswith(b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"):
        return FileType(".ole", "application/x-ole-storage", "OLE2 container (old Office)")

    return FileType("", "application/octet-stream", "Unknown / binary")


def _xml_text_len(xml_bytes: bytes) -> int:
    """Count the text characters contained in an XML document.

    Every text node is counted, including text that follows a child element
    (ElementTree's ``tail``), so mixed content such as
    ``<p>Hello <b>big</b> world</p>`` is fully accounted for. Each text node
    is stripped of surrounding whitespace before being measured.

    If the payload cannot be parsed as XML, a best-effort fallback removes
    anything that looks like a tag, collapses whitespace runs and returns the
    length in bytes of what remains.
    """
    try:
        root = ET.fromstring(xml_bytes)
    except Exception:
        # Deliberately broad: callers only need a rough count, never an error.
        without_tags = re.sub(rb"<[^>]+>", b" ", xml_bytes)
        return len(re.sub(rb"\s+", b" ", without_tags).strip())

    # itertext() yields both ``text`` and ``tail`` of every element in the tree.
    return sum(len(chunk.strip()) for chunk in root.itertext())


def _zip_has_text(path: str, ext: str) -> bool:
    """Tell whether a ZIP-based document contains text.

    Handles DOCX/XLSX/PPTX/ODT/ODS/ODP/EPUB. Returns True when at least
    MIN_CHARS_OFFICE characters are found in the parts inspected for the
    given *ext*. Any error (unreadable archive or member) and any other
    extension yield False.
    """

    def reaches_minimum(lengths, start: int = 0) -> bool:
        # Consume lengths lazily so nothing more is read once the threshold is met.
        running = start
        for amount in lengths:
            running += amount
            if running >= MIN_CHARS_OFFICE:
                return True
        return running >= MIN_CHARS_OFFICE

    try:
        with zipfile.ZipFile(path, "r") as archive:
            members = archive.namelist()

            def xml_lengths(prefix: str):
                # Text length of each ".xml" member whose name starts with prefix.
                for member in members:
                    if member.startswith(prefix) and member.endswith(".xml"):
                        yield _xml_text_len(archive.read(member))

            if ext == ".docx":
                chars = 0
                # Main body first, then headers and footers.
                if "word/document.xml" in members:
                    chars = _xml_text_len(archive.read("word/document.xml"))
                for member in members:
                    if member.endswith(".xml") and member.startswith(("word/header", "word/footer")):
                        chars += _xml_text_len(archive.read(member))
                    if chars >= MIN_CHARS_OFFICE:
                        break
                return chars >= MIN_CHARS_OFFICE

            if ext == ".xlsx":
                shared = 0
                if "xl/sharedStrings.xml" in members:
                    shared = _xml_text_len(archive.read("xl/sharedStrings.xml"))
                if shared >= MIN_CHARS_OFFICE:
                    return True
                # Worksheets are only read when the shared strings were not enough.
                return reaches_minimum(xml_lengths("xl/worksheets/"), shared)

            if ext == ".pptx":
                return reaches_minimum(xml_lengths("ppt/slides/"))

            if ext in {".odt", ".ods", ".odp"}:
                if "content.xml" not in members:
                    return False
                return _xml_text_len(archive.read("content.xml")) >= MIN_CHARS_OFFICE

            if ext == ".epub":

                def html_lengths():
                    # Tag-stripped byte length of each (X)HTML member; unreadable ones are skipped.
                    for member in members:
                        if not member.lower().endswith((".xhtml", ".html", ".htm")):
                            continue
                        try:
                            raw = archive.read(member)
                        except Exception:
                            continue
                        without_tags = re.sub(rb"<[^>]+>", b" ", raw)
                        yield len(re.sub(rb"\s+", b" ", without_tags).strip())

                return reaches_minimum(html_lengths())

    except Exception:
        return False

    return False


def _get_pdf_reader():
    """Return the ``PdfReader`` class from pypdf, else from PyPDF2, else ``None``."""
    try:
        from pypdf import PdfReader as reader_cls  # type: ignore
    except ImportError:
        reader_cls = None

    if reader_cls is None:
        # pypdf is unavailable: try the older PyPDF2 distribution.
        try:
            from PyPDF2 import PdfReader as reader_cls  # type: ignore
        except ImportError:
            return None

    return reader_cls


def _pdf_has_text(path: str) -> bool:
    """Heuristically decide whether a PDF carries extractable text.

    With a PDF library available, True as soon as the cumulated
    non-whitespace characters returned by ``extract_text()`` over the first
    ``max(1, PDF_MAX_PAGES)`` pages reach MIN_CHARS_PDF; False otherwise or
    on any error.

    Without a PDF library, the first 2 MB of the file are scanned for
    ``/Font`` or for text operators (``BT`` with ``Tj``/``TJ``).
    """
    reader_cls = _get_pdf_reader()

    if reader_cls is None:
        # Raw byte scan fallback
        try:
            with open(path, "rb") as fh:
                blob = fh.read(2_000_000)
        except Exception:
            return False
        if b"/Font" in blob:
            return True
        return b"BT" in blob and (b"Tj" in blob or b"TJ" in blob)

    try:
        char_count = 0
        for page in reader_cls(path).pages[: max(1, PDF_MAX_PAGES)]:
            char_count += len("".join((page.extract_text() or "").split()))
            if char_count >= MIN_CHARS_PDF:
                return True
    except Exception:
        pass
    return False


def content_kind_two_states(path: str, ftype: FileType) -> str:
    """Classify a file as exactly one of ``'text'`` or ``'image_only'``.

    Only PDFs and the ZIP-based document formats can be reported as
    ``'text'``, and only when text is actually found in them. Everything
    else (images, unknown formats, other containers) is ``'image_only'``.
    """
    ext = ftype.ext.lower()

    if ext == ".pdf":
        has_text = _pdf_has_text(path)
    elif ext in {".docx", ".xlsx", ".pptx", ".odt", ".ods", ".odp", ".epub"}:
        has_text = _zip_has_text(path, ext)
    else:
        # Image extensions and anything unrecognized are never inspected.
        has_text = False

    return "text" if has_text else "image_only"


# --------- ROUTING ---------
# Split the requested files into three lists: files with no text (to be OCR'd),
# files that already contain text (skipped), and files that could not be found.
# INPUT_FILE is read from globals() so this cell does not fail with NameError
# when the variable has not been defined.
ORIGINAL_INPUT_FILE = globals().get("INPUT_FILE", None)
_raw_items = normalize_input_files(ORIGINAL_INPUT_FILE)

IMAGE_ONLY_FILES: List[str] = []
TEXT_FILES: List[str] = []
MISSING_FILES: List[str] = []

for item in _raw_items:
    p = resolve_path(item)
    if p is None:
        # Keep the name as typed by the user for the report below.
        MISSING_FILES.append(item)
        continue

    ft = detect_path_type(p)
    kind = content_kind_two_states(p, ft)

    if kind == "image_only":
        IMAGE_ONLY_FILES.append(p)
    else:
        TEXT_FILES.append(p)
        print(f"[skip] content='text' -> {p}")

# IMPORTANT: the OCR code below is left unchanged and reads INPUT_FILE, so
# INPUT_FILE is rebound here to the image-only files. The user's original
# value remains available as ORIGINAL_INPUT_FILE.
INPUT_FILE = IMAGE_ONLY_FILES

if MISSING_FILES:
    print("[missing] fichiers introuvables:")
    for m in MISSING_FILES:
        print(" -", m)








SHOW_PREPROCESSED = True   # NOTE(review): not read in the visible code; presumably toggles display of preprocessed images in a later cell — confirm


@dataclass
class EnhanceOptions:
    """Image preprocessing settings consumed by preprocess_image.

    A step is skipped when its option is None/False/0, or 1.0 for the
    multiplicative factors (contrast, sharpness, brightness, upscale).
    """

    contrast: float = DEFAULT_CONTRAST  # ImageEnhance.Contrast factor
    sharpness: float = DEFAULT_SHARPNESS  # ImageEnhance.Sharpness factor
    brightness: float = DEFAULT_BRIGHTNESS  # ImageEnhance.Brightness factor
    upscale: float = DEFAULT_UPSCALE  # multiplier applied to width and height
    gamma: Optional[float] = None  # gamma correction; LUT exponent is 1/gamma, so >1 brightens and <1 darkens
    pad: int = 0  # pixels to pad around the image
    median: Optional[int] = None  # kernel size for median filter; only applied when odd and > 1 (e.g., 3)
    unsharp_radius: Optional[float] = None  # e.g., 1.0
    unsharp_percent: int = 150  # strength passed to UnsharpMask; only used when unsharp_radius is set
    invert: bool = False  # invert the grayscale image
    autocontrast_cutoff: Optional[int] = None  # clamped to 0-100; percentage to clip for autocontrast
    equalize: bool = False  # histogram equalization
    auto_rotate: bool = False  # attempt orientation detection + rotate
    otsu: bool = False  # auto-threshold with Otsu (requires numpy); ignored when threshold is set
    threshold: Optional[int] = None  # clamped to 0-255; if set, applies a binary threshold


def build_config(
    oem: Optional[int],
    psm: Optional[int],
    base_flags: Iterable[str],
    dpi: Optional[int],
    tessdata_dir: Optional[Path],
    user_words: Optional[Path],
    user_patterns: Optional[Path],
) -> str:
    """Assemble the tesseract configuration string.

    Options whose value is None are omitted. Path-like options are wrapped
    in double quotes. *base_flags* are appended verbatim after the options,
    and everything is joined with single spaces.
    """
    plain_options = (("--oem", oem), ("--psm", psm), ("--dpi", dpi))
    quoted_options = (
        ("--tessdata-dir", tessdata_dir),
        ("--user-words", user_words),
        ("--user-patterns", user_patterns),
    )

    tokens: List[str] = [
        f"{flag} {value}" for flag, value in plain_options if value is not None
    ]
    tokens += [
        f'{flag} "{value}"' for flag, value in quoted_options if value is not None
    ]
    tokens.extend(base_flags)
    return " ".join(tokens)


def ensure_environment(lang: str) -> None:
    """Check that tesseract is usable and warn about missing languages.

    Exits the interpreter (``sys.exit``) when the tesseract binary cannot be
    found. When *lang* is non-empty, each "+"-separated language is compared
    with the installed ones and a warning listing the missing ones is
    printed to stderr. A tesseract error during that language check is
    ignored.
    """
    try:
        pytesseract.get_tesseract_version()
    except pytesseract.TesseractNotFoundError:
        sys.exit("Tesseract binary not found on PATH. Install it and its language data.")

    if not lang:
        return

    try:
        installed = set(pytesseract.get_languages(config=""))
        absent = set(lang.split("+")) - installed
        if absent:
            absent_names = ", ".join(sorted(absent))
            installed_names = ", ".join(sorted(installed))
            print(
                f"Warning: missing languages: {absent_names}. "
                f"Available: {installed_names}",
                file=sys.stderr,
            )
    except pytesseract.TesseractError:
        pass


def auto_rotate_if_needed(img: Image.Image, enhance: EnhanceOptions) -> Image.Image:
    """Rotate *img* according to tesseract's orientation detection, if enabled.

    Returns *img* unchanged when ``enhance.auto_rotate`` is false, when the
    OSD report has no usable "Rotate:" line, when the reported angle is a
    multiple of 360, or when anything fails along the way.
    """
    if not enhance.auto_rotate:
        return img

    try:
        report = pytesseract.image_to_osd(img)
        # Only the first "Rotate:" line of the report is considered.
        rotate_line = next(
            (ln for ln in report.splitlines() if ln.lower().startswith("rotate:")),
            None,
        )
        if rotate_line is None:
            return img
        try:
            degrees = int(rotate_line.split(":")[1].strip())
        except ValueError:
            return img
        if degrees % 360:
            return img.rotate(-degrees, expand=True)
    except Exception:
        # Orientation detection is best-effort: keep the image as is.
        pass
    return img


def preprocess_image(image: Image.Image, enhance: EnhanceOptions) -> Image.Image:
    """Return a grayscale copy of *image* prepared for OCR.

    Steps run in the fixed order below; each one is skipped unless its
    option in *enhance* enables it: auto-rotate, invert, pad, autocontrast,
    equalize, upscale, gamma, brightness, contrast, sharpness, unsharp mask,
    median filter, then binarization (fixed threshold, else Otsu).
    """
    # All later steps operate on an 8-bit grayscale ("L") image.
    img = image.convert("L")
    img = auto_rotate_if_needed(img, enhance)

    if enhance.invert:
        img = ImageOps.invert(img)

    if enhance.pad and enhance.pad > 0:
        # The border is filled with white (255).
        img = ImageOps.expand(img, border=enhance.pad, fill=255)

    if enhance.autocontrast_cutoff is not None:
        # Clamp the user-supplied percentage to [0, 100].
        cutoff = max(0, min(100, enhance.autocontrast_cutoff))
        img = ImageOps.autocontrast(img, cutoff=cutoff)

    if enhance.equalize:
        img = ImageOps.equalize(img)

    if enhance.upscale and enhance.upscale != 1.0:
        w, h = img.size
        img = img.resize((int(w * enhance.upscale), int(h * enhance.upscale)), Image.LANCZOS)

    if enhance.gamma and enhance.gamma > 0:
        # LUT maps x -> 255 * (x / 255) ** (1 / gamma):
        # gamma > 1 brightens the image, gamma < 1 darkens it.
        inv_gamma = 1.0 / enhance.gamma
        lut = [pow(x / 255.0, inv_gamma) * 255 for x in range(256)]
        img = img.point(lut)

    if enhance.brightness and enhance.brightness != 1.0:
        img = ImageEnhance.Brightness(img).enhance(enhance.brightness)

    if enhance.contrast and enhance.contrast != 1.0:
        img = ImageEnhance.Contrast(img).enhance(enhance.contrast)

    if enhance.sharpness and enhance.sharpness != 1.0:
        img = ImageEnhance.Sharpness(img).enhance(enhance.sharpness)

    if enhance.unsharp_radius:
        img = img.filter(
            ImageFilter.UnsharpMask(
                radius=enhance.unsharp_radius,
                percent=enhance.unsharp_percent,
                threshold=0,
            )
        )

    # The median filter is applied only for an odd kernel size greater than 1.
    if enhance.median and enhance.median > 1 and enhance.median % 2 == 1:
        img = img.filter(ImageFilter.MedianFilter(size=enhance.median))

    # Binarization: an explicit threshold takes precedence over Otsu.
    if enhance.threshold is not None:
        thr = max(0, min(255, enhance.threshold))
        # Pixels strictly above the threshold become white, the others black.
        img = img.point(lambda p, t=thr: 255 if p > t else 0, mode="1").convert("L")
    elif enhance.otsu and np is not None:
        # Otsu's method: choose the gray level that maximizes the
        # between-class variance of the two pixel classes it separates.
        arr = np.array(img, dtype=np.uint8)
        hist, _ = np.histogram(arr, bins=256, range=(0, 256))
        total = arr.size
        sum_total = np.dot(np.arange(256), hist)

        sum_b = 0.0  # weighted sum of gray levels in the background class
        w_b = 0.0    # number of pixels in the background class (levels <= i)
        max_var = 0.0
        threshold = 0

        for i in range(256):
            w_b += hist[i]
            if w_b == 0:
                # No pixel at or below this level yet.
                continue
            w_f = total - w_b
            if w_f == 0:
                # Every pixel is already in the background class.
                break
            sum_b += i * hist[i]
            m_b = sum_b / w_b
            m_f = (sum_total - sum_b) / w_f
            var_between = w_b * w_f * (m_b - m_f) ** 2
            if var_between > max_var:
                max_var = var_between
                threshold = i

        img = img.point(lambda p, t=threshold: 255 if p > t else 0, mode="1").convert("L")

    return img


def parse_args(argv: Optional[Iterable[str]] = None) -> argparse.Namespace:
    """Parse the OCR options.

    When *argv* is None an empty argument list is parsed, so every option
    takes its default value and the interpreter's own command line is never
    read (which is what a notebook needs).
    """
    # (flags, add_argument keyword arguments), in declaration order.
    option_specs = [
        # Tesseract options
        (("-l", "--lang"), {"default": DEFAULT_LANG}),
        (("--oem",), {"type": int, "choices": range(0, 4), "default": None}),
        (("--psm",), {"type": int, "choices": range(0, 14), "default": None}),
        (("--dpi",), {"type": int, "default": DEFAULT_DPI}),
        (("--tessdata-dir",), {"type": Path, "default": None}),
        (("--user-words",), {"type": Path, "default": None}),
        (("--user-patterns",), {"type": Path, "default": None}),
        (("--whitelist",), {"type": str, "default": None}),
        (("--blacklist",), {"type": str, "default": None}),
        # Image preprocessing options
        (("--contrast",), {"type": float, "default": DEFAULT_CONTRAST}),
        (("--sharpness",), {"type": float, "default": DEFAULT_SHARPNESS}),
        (("--brightness",), {"type": float, "default": DEFAULT_BRIGHTNESS}),
        (("--upscale",), {"type": float, "default": DEFAULT_UPSCALE}),
        (("--gamma",), {"type": float, "default": None}),
        (("--pad",), {"type": int, "default": 0}),
        (("--threshold",), {"type": int, "default": None}),
        (("--median",), {"type": int, "default": None}),
        (("--unsharp-radius",), {"type": float, "default": None}),
        (("--unsharp-percent",), {"type": int, "default": 150}),
        (("--invert",), {"action": "store_true"}),
        (("--autocontrast-cutoff",), {"type": int, "default": None}),
        (("--equalize",), {"action": "store_true"}),
        (("--auto-rotate",), {"action": "store_true"}),
        (("--otsu",), {"action": "store_true"}),
        # Extra flags forwarded to tesseract
        (
            ("--config",),
            {
                "nargs": "*",
                "default": [],
                "metavar": "CFG",
                "help": "Additional configuration flags passed verbatim to tesseract (e.g., -c foo=bar).",
            },
        ),
    ]

    parser = argparse.ArgumentParser()
    for flags, settings in option_specs:
        parser.add_argument(*flags, **settings)

    cli_tokens = [] if argv is None else list(argv)
    return parser.parse_args(cli_tokens)


# Run cell 1 (up to the display step)

# parse_args() is called without arguments, so an empty argument list is
# parsed and every option takes its default value.
args = parse_args()
ensure_environment(args.lang)

# Copy the preprocessing options from the parsed arguments.
enhance = EnhanceOptions(
    contrast=args.contrast,
    sharpness=args.sharpness,
    brightness=args.brightness,
    upscale=args.upscale,
    gamma=args.gamma,
    pad=args.pad,
    median=args.median,
    unsharp_radius=args.unsharp_radius,
    unsharp_percent=args.unsharp_percent,
    invert=args.invert,
    autocontrast_cutoff=args.autocontrast_cutoff,
    equalize=args.equalize,
    auto_rotate=args.auto_rotate,
    otsu=args.otsu,
    threshold=args.threshold,
)

# Extra tesseract flags: start from --config, then add the ones built here.
config_flags: List[str] = list(args.config)

# Always added: sets tesseract's preserve_interword_spaces variable to 1.
config_flags.append("-c preserve_interword_spaces=1")

if args.whitelist:
    config_flags.append(f"-c tessedit_char_whitelist={args.whitelist}")
if args.blacklist:
    config_flags.append(f"-c tessedit_char_blacklist={args.blacklist}")


def _normalize_input_files(val):
    if val is None:
        return []
    if isinstance(val, (list, tuple, set)):
        items = list(val)
    else:
        items = [val]

    out = []
    for item in items:
        if item is None:
            continue
        if isinstance(item, Path):
            out.append(str(item))
            continue
        s = str(item).strip()
        if not s:
            continue
        if "," in s:
            parts = [p.strip() for p in s.split(",") if p.strip()]
            out.extend(parts)
        else:
            out.append(s)
    return out

# Backwards-compatible alias: an older cell used the singular name.
_normalize_input_file = _normalize_input_files

# Safeguard if the INPUT_FILE cell was not executed yet: fall back to None so
# that the name exists (_normalize_input_files(None) returns an empty list).
INPUT_FILE = globals().get("INPUT_FILE", None)


def _load_images_from_path(path: Path, dpi: int):
    """Load *path* as a list of images, one per page or frame.

    PDFs are rasterised with ``pdf2image.convert_from_path`` at *dpi*. Any
    other file is opened with ``Image.open``; multi-frame images (e.g.
    multi-page TIFF) yield one independent copy per frame.

    Exits the process via ``sys.exit`` when pdf2image cannot be imported or
    the PDF conversion fails.
    """
    if path.suffix.lower() == ".pdf":
        try:
            from pdf2image import convert_from_path
        except Exception:
            sys.exit(
                "pdf2image is not available. Install it and Poppler to read PDF files."
            )
        try:
            return convert_from_path(str(path), dpi=dpi)
        except Exception as exc:
            sys.exit(f"PDF conversion failed for {path}: {exc}")
    # default: image file (supports multi-page TIFF)
    img = Image.open(path)
    n_frames = getattr(img, "n_frames", 1)
    if n_frames and n_frames > 1:
        images = []
        try:
            for i in range(n_frames):
                try:
                    img.seek(i)
                except Exception:
                    # Stop at the first frame that cannot be reached.
                    break
                images.append(img.copy())
        finally:
            # Every frame was copied, so the source image (and its open file
            # handle) is no longer needed; the original never released it.
            img.close()
        return images
    return [img]


# Load and preprocess every input exactly once.
# The previous version repeated the whole loop a second time after resetting
# DOCS, so each file was rasterised and preprocessed twice and the first
# result was discarded. A missing file already raised FileNotFoundError in
# the first pass, so the second pass' sys.exit was never reached for it.
input_items = _normalize_input_files(INPUT_FILE)
DOCS = []
if not input_items:
    print("[info] Aucun fichier à OCR (image_only). Tout ce que tu as donné est détecté comme 'text'.")
else:
    # The DPI does not depend on the file, so resolve it once.
    dpi_val = int(getattr(args, "dpi", DEFAULT_DPI) or DEFAULT_DPI)

    for item in input_items:
        path = Path(item)
        if not path.is_absolute():
            # Relative names are resolved against the script directory.
            path = (SCRIPT_DIR / path).resolve()

        if not path.exists():
            raise FileNotFoundError(f"INPUT_FILE not found: {path}")

        print(f"[info] Using INPUT_FILE={path}", file=sys.stderr)

        images = _load_images_from_path(path, dpi=dpi_val)

        if len(images) == 1:
            # Single image: no page_index / page_count keys (kept as before).
            original = images[0]
            prepped = preprocess_image(original, enhance)
            DOCS.append({"path": path, "original": original, "prepped": prepped})
        else:
            total = len(images)
            for idx, original in enumerate(images, start=1):
                prepped = preprocess_image(original, enhance)
                DOCS.append({
                    "path": path,
                    "original": original,
                    "prepped": prepped,
                    "page_index": idx,
                    "page_count": total
                })

from IPython.display import display

# Show each original page and, unless SHOW_PREPROCESSED is defined and falsy,
# its preprocessed version. Modes other than RGB / L are converted to RGB first.
for doc in DOCS:
    original = doc["original"]
    prepped = doc["prepped"]
    path = doc["path"]

    display(original.convert("RGB") if original.mode not in ("RGB","L") else original)

    if "SHOW_PREPROCESSED" not in globals() or SHOW_PREPROCESSED:
        display(prepped.convert("RGB") if prepped.mode not in ("RGB","L") else prepped)

# Keep globals aligned with the last document for backwards compatibility.
if DOCS:
    path = DOCS[-1]["path"]
    original = DOCS[-1]["original"]
    prepped = DOCS[-1]["prepped"]



[skip] content='text' -> c:\Users\moura\OneDrive\Bureau\DMS\test\documents\testword.docx
[info] Aucun fichier à OCR (image_only). Tout ce que tu as donné est détecté comme 'text'.


### Si image : passer par Tesseract ; si document : extraire son contenu à partir de son code ; à la fin, les deux => input txt

In [41]:
# NOTE:
# Cette cellule suppose que la cellule précédente a déjà exécuté:
# - la détection/routage (TEXT_FILES / IMAGE_ONLY_FILES / INPUT_FILE)
# - le preprocess + affichage (DOCS avec "prepped")
# Donc ici on fait:
# 1) OCR Tesseract UNIQUEMENT sur DOCS (images -> [info])
# 2) Extraction NATIVE (sans OCR) sur TEXT_FILES (-> [skip] content='text')
#
# Objectif print:
# - 1 seule fois, à la fin
# - affiche: fichier, nb pages, puis texte de chaque page

import uuid
import re
import zipfile
import xml.etree.ElementTree as ET
from pathlib import Path
from typing import List

import pytesseract
from pytesseract import Output

# ==================== PRINT setting ====================
# False => no output during OCR / native extraction
# True  => debug output during extraction (avoid if you want a single final print)
PRINT_DURING_EXTRACTION = False

# -------------------- MINIMAL ADDITION (tesseract flags for spaces/tables) --------------------
# Only when config_flags already exists; each flag is added at most once.
if "config_flags" in globals():
    if "-c preserve_interword_spaces=1" not in config_flags:
        config_flags.append("-c preserve_interword_spaces=1")
    if "-c textord_tabfind_find_tables=1" not in config_flags:
        config_flags.append("-c textord_tabfind_find_tables=1")

# -------------------- AJOUT MINIMAL (reconstruction layout via TSV) --------------------
def _median(values):
    values = sorted(values)
    n = len(values)
    if n == 0:
        return None
    mid = n // 2
    if n % 2 == 1:
        return values[mid]
    return (values[mid - 1] + values[mid]) / 2.0

def _estimate_char_metrics(data: dict):
    """Estimate ``(char_width, line_height)`` from word boxes in *data*.

    *data* is read through its ``text``, ``conf``, ``width`` and ``height``
    lists. Blank words and words with a negative confidence are ignored.
    The character width is the median of ``box width / word length`` and the
    line height is the median box height. Missing or degenerate (<= 1)
    estimates fall back to 10.0 and 20.0 respectively.
    """
    tokens = data.get("text", [])
    conf_col = data.get("conf", [])
    width_col = data.get("width", [])
    height_col = data.get("height", [])

    per_char_widths = []
    box_heights = []

    for pos, raw in enumerate(tokens):
        if raw is None:
            continue
        word = str(raw)
        if not word.strip():
            continue

        # An unreadable or missing confidence counts as 0 (i.e. kept).
        try:
            confidence = float(conf_col[pos])
        except Exception:
            confidence = 0.0
        if confidence < 0:
            continue

        box_w = int(width_col[pos]) if pos < len(width_col) else 0
        box_h = int(height_col[pos]) if pos < len(height_col) else 0

        if box_h > 0:
            box_heights.append(box_h)

        n_chars = len(word)
        if box_w > 0 and n_chars > 0:
            per_char_widths.append(box_w / float(n_chars))

    def _pick(samples, fallback):
        # Median with a fallback for "no samples", zero, or implausibly small.
        value = _median(samples) or fallback
        return fallback if value <= 1 else float(value)

    char_w = _pick(per_char_widths, 10.0)
    line_h = _pick(box_heights, 20.0)
    return char_w, line_h

def _render_layout_from_data(data: dict, img_w: int, img_h: int) -> str:
    """Rebuild a monospace text layout from word boxes in *data*.

    Words are grouped into lines by their ``top`` coordinate, lines are
    placed on a row grid derived from the estimated line height (blank lines
    fill vertical gaps), and each word is placed at the column derived from
    its ``left`` coordinate and the estimated character width. Words on a
    line are always separated by at least one space.

    *img_w* and *img_h* are accepted but not used.
    """
    char_w, line_h = _estimate_char_metrics(data)
    line_tol = max(6.0, line_h * 0.55)

    texts = data.get("text", [])
    confs = data.get("conf", [])
    lefts = data.get("left", [])
    tops = data.get("top", [])
    widths = data.get("width", [])
    heights = data.get("height", [])

    def _int_at(column, pos):
        # Columns shorter than the text list contribute 0.
        return int(column[pos]) if pos < len(column) else 0

    # 1) Collect the usable word boxes.
    boxes = []
    for pos, raw in enumerate(texts):
        if raw is None:
            continue
        word = str(raw)
        if not word.strip():
            continue
        try:
            confidence = float(confs[pos])
        except Exception:
            confidence = 0.0
        if confidence < 0:
            continue

        left = _int_at(lefts, pos)
        top = _int_at(tops, pos)
        width = _int_at(widths, pos)
        height = _int_at(heights, pos)
        boxes.append({"text": word, "left": left, "top": top, "right": left + width, "height": height})

    boxes.sort(key=lambda box: (box["top"], box["left"]))

    # 2) Group boxes into lines: attach to the most recent line whose top is
    #    within tolerance, otherwise open a new line.
    rows = []
    for box in boxes:
        for candidate in reversed(rows):
            if abs(box["top"] - candidate["top"]) <= line_tol:
                candidate["words"].append(box)
                candidate["top"] = min(candidate["top"], box["top"])
                break
        else:
            rows.append({"top": box["top"], "words": [box]})

    rows.sort(key=lambda entry: entry["top"])

    # 3) Render each line on the character grid.
    rendered = []
    previous_row = None
    for entry in rows:
        row_no = int(round(entry["top"] / line_h)) if line_h > 0 else 0
        if previous_row is not None:
            # Rows skipped between two text lines become empty lines.
            rendered.extend([""] * max(row_no - previous_row - 1, 0))
        previous_row = row_no

        pieces = []
        cursor = 0
        ordered_words = sorted(entry["words"], key=lambda box: box["left"])
        for n, word in enumerate(ordered_words):
            col = int(round(word["left"] / char_w)) if char_w > 0 else 0
            col = max(col, 0)

            if n == 0:
                # First word: indent to its column.
                pad = col
            else:
                # Later words: pad up to the column, at least one space.
                pad = max(col - cursor, 1)

            pieces.append(" " * pad)
            pieces.append(word["text"])
            cursor += pad + len(word["text"])

        rendered.append("".join(pieces))

    return "\n".join(rendered)

# -------------------- OCR --------------------
# Build the tesseract config string once from the parsed arguments and the
# accumulated extra flags; it is reused for every page in the OCR loop below.
config = build_config(
    args.oem,
    args.psm,
    config_flags,
    args.dpi,
    args.tessdata_dir,
    args.user_words,
    args.user_patterns,
)

# Allow this cell to run even if the preprocess cell never defined DOCS.
if "DOCS" not in globals():
    DOCS = []

def _basename(val):
    if val is None:
        return None
    try:
        return Path(val).name
    except Exception:
        s = str(val)
        return s.replace("\\", "/").split("/")[-1]

# If DOCS is a list of pages (legacy), group into document-level objects.
# The legacy shape is detected by the first entry being a dict without "pages".
if DOCS and isinstance(DOCS[0], dict) and "pages" not in DOCS[0]:
    groups = {}
    for i, page in enumerate(DOCS, start=1):
        raw = str(page.get("path") or "batch")
        # NOTE(review): the key embeds the page index (or the running position
        # when there is none), so pages of one file get different keys and each
        # page appears to end up as its own one-page document. Confirm this is
        # intended; keying on `raw` alone would group per file.
        key = f"{raw}::p{page.get('page_index') or i}"
        groups.setdefault(key, []).append(page)

    packed = []
    for key, pages in groups.items():
        pages_sorted = sorted(pages, key=lambda p: int(p.get("page_index") or 0)) if pages else []

        # Distinct source file names, in first-seen order.
        source_files = [_basename(p.get("path")) for p in pages_sorted if _basename(p.get("path"))]
        source_files = list(dict.fromkeys(source_files))

        # One source file => use its name; otherwise derive a name from the key.
        filename = source_files[0] if len(source_files) == 1 else (_basename(key) or "batch")

        doc = {"doc_id": str(uuid.uuid4()), "filename": filename, "source_files": source_files, "pages": []}
        page_index = 1
        for p in pages_sorted:
            # Keep the page's own index when present, else the position in the group.
            idx = int(p.get("page_index") or page_index)
            src_path = p.get("path")
            doc["pages"].append({
                "page_index": idx,
                "image": p.get("original"),
                "prepped": p.get("prepped"),
                "source_path": src_path,
                "source_file": _basename(src_path)
            })
            page_index += 1
        doc["page_count_total"] = len(doc["pages"])
        packed.append(doc)

    # Replace the flat page list with the document-level list.
    DOCS = packed

# Ensure doc-level metadata consistency (even if DOCS already has pages).
# Only missing / falsy fields are filled in; existing values are left alone,
# except page_count_total, which is always recomputed.
for doc in DOCS:
    pages = doc.get("pages", []) or []
    for i, page in enumerate(pages, start=1):
        if not page.get("page_index"):
            page["page_index"] = i
        if not page.get("source_file"):
            src_path = page.get("source_path") or page.get("path")
            page["source_file"] = _basename(src_path)

    doc["page_count_total"] = len(pages)

    if not doc.get("source_files"):
        # Distinct page source files, in first-seen order.
        source_files = [p.get("source_file") for p in pages if p.get("source_file")]
        doc["source_files"] = list(dict.fromkeys(source_files))

    if not doc.get("filename"):
        # With no source file at all, "filename" stays unset.
        if len(doc.get("source_files", [])) == 1:
            doc["filename"] = doc["source_files"][0]
        elif len(doc.get("source_files", [])) > 1:
            doc["filename"] = "batch"

# Run tesseract on every preprocessed page and store the layout-preserving
# text both per page ("ocr_text") and per document ("pages_text", "ocr_text").
for doc in DOCS:
    pages_text = []
    for page in doc.get("pages", []):
        prepped = page.get("prepped")
        if prepped is None:
            raise RuntimeError("prepped image missing. Run the input/preprocess cell first.")

        # Word-level boxes as a dict of lists, then rebuilt into a text grid.
        data = pytesseract.image_to_data(prepped, lang=args.lang, config=config, output_type=Output.DICT)
        w, h = prepped.size
        OCR_TEXT = _render_layout_from_data(data, w, h)

        page["ocr_text"] = OCR_TEXT
        pages_text.append(OCR_TEXT)

        if PRINT_DURING_EXTRACTION:
            src = page.get("source_file") or _basename(page.get("source_path")) or ""
            total = doc.get("page_count_total", 1)
            print(f"[ocr] {doc.get('filename')} | file={src} | page {page.get('page_index')}/{total}")
            print(OCR_TEXT)
            print("-" * 120)

    doc["pages_text"] = pages_text
    # Document text: pages separated by one blank line.
    doc["ocr_text"] = "\n\n".join(pages_text)

# Backwards compatibility: OCR_TEXT holds the text of the last document.
if DOCS:
    OCR_TEXT = DOCS[-1].get("ocr_text", "")

# -------------------- EXTRACTION NATIVE POUR [skip] (TEXT_FILES) --------------------
def _get_pdf_reader_with_name():
    try:
        from pypdf import PdfReader  # type: ignore
        return PdfReader, "pypdf"
    except ImportError:
        try:
            from PyPDF2 import PdfReader  # type: ignore
            return PdfReader, "PyPDF2"
        except ImportError:
            return None, "none"

# AJOUT MINIMAL: tenter un mode "layout" si dispo, sinon fallback normal
def _pdf_extract_text_preserve_layout(page) -> str:
    try:
        return page.extract_text(extraction_mode="layout") or ""
    except TypeError:
        return page.extract_text() or ""
    except Exception:
        try:
            return page.extract_text() or ""
        except Exception:
            return ""

def _docx_xml_to_text(xml_bytes: bytes) -> str:
    ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
    root = ET.fromstring(xml_bytes)

    out_lines = []
    for p in root.findall(".//w:p", ns):
        line_parts = []
        for node in p.iter():
            tag = node.tag
            if tag.endswith("}t"):
                line_parts.append(node.text if node.text is not None else "")
            elif tag.endswith("}tab"):
                line_parts.append("\t")
            elif tag.endswith("}br") or tag.endswith("}cr"):
                line_parts.append("\n")
        out_lines.append("".join(line_parts))
    return "\n".join(out_lines)

def _pptx_slide_xml_to_text(xml_bytes: bytes) -> str:
    ns = {
        "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
        "p": "http://schemas.openxmlformats.org/presentationml/2006/main",
    }
    root = ET.fromstring(xml_bytes)

    out_lines = []
    for para in root.findall(".//a:p", ns):
        parts = []
        for node in para.iter():
            tag = node.tag
            if tag.endswith("}t"):
                parts.append(node.text if node.text is not None else "")
            elif tag.endswith("}br"):
                parts.append("\n")
        out_lines.append("".join(parts))
    return "\n".join(out_lines)

def _xlsx_col_to_index(col_letters: str) -> int:
    n = 0
    for ch in col_letters:
        if "A" <= ch <= "Z":
            n = n * 26 + (ord(ch) - ord("A") + 1)
    return n

def _xlsx_shared_strings(xml_bytes: bytes) -> list:
    root = ET.fromstring(xml_bytes)
    ns = {"s": "http://schemas.openxmlformats.org/spreadsheetml/2006/main"}
    out = []
    for si in root.findall(".//s:si", ns):
        parts = []
        for t in si.findall(".//s:t", ns):
            parts.append(t.text if t.text is not None else "")
        out.append("".join(parts))
    return out

def _xlsx_sheet_to_text(sheet_xml: bytes, shared: list) -> str:
    """Render a worksheet part as text: one line per row, tab-separated cells.

    Args:
        sheet_xml: raw bytes of an ``xl/worksheets/*.xml`` part.
        shared: shared-string table, indexed by the value of ``t="s"`` cells.

    Returns:
        Rows joined with ``"\\n"``. Within a row, columns 1..max are joined
        with tabs and missing columns are empty strings; a row without any
        cell gives an empty line.

    Cell values: ``t="s"`` is looked up in *shared* (falling back to the raw
    value when the lookup fails), ``t="inlineStr"`` concatenates the inline
    text nodes, and anything else uses the raw ``<v>`` text.

    The ``r`` (cell reference) attribute is optional. A cell without a usable
    reference is placed in the column right after the previous cell of the
    row; before this fix all such cells were written to column 0 and dropped
    from the output.
    """
    ns = {"s": "http://schemas.openxmlformats.org/spreadsheetml/2006/main"}
    root = ET.fromstring(sheet_xml)

    lines = []
    for row in root.findall(".//s:row", ns):
        row_map = {}
        prev_col = 0

        for c in row.findall("./s:c", ns):
            ref = c.get("r") or ""
            col_letters = "".join(ch for ch in ref if ch.isalpha()).upper()
            col_idx = _xlsx_col_to_index(col_letters) if col_letters else 0
            if col_idx <= 0:
                # No (usable) reference: implicit position after the previous cell.
                col_idx = prev_col + 1
            prev_col = col_idx

            cell_type = c.get("t")
            v = c.find("./s:v", ns)
            is_node = c.find("./s:is", ns)

            val = ""
            if cell_type == "s" and v is not None and v.text is not None:
                try:
                    val = shared[int(v.text)]
                except Exception:
                    # Bad or out-of-range index: keep the raw value.
                    val = v.text
            elif cell_type == "inlineStr" and is_node is not None:
                val = "".join(t.text or "" for t in is_node.findall(".//s:t", ns))
            elif v is not None and v.text is not None:
                val = v.text

            row_map[col_idx] = val

        max_col = max(row_map, default=0)
        if max_col <= 0:
            lines.append("")
        else:
            lines.append("\t".join(row_map.get(i, "") for i in range(1, max_col + 1)))

    return "\n".join(lines)

def _odf_content_to_text(xml_bytes: bytes) -> str:
    ns_text = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
    root = ET.fromstring(xml_bytes)

    def walk(node):
        pieces = []
        if node.text is not None:
            pieces.append(node.text)

        for child in list(node):
            tag = child.tag
            if tag == f"{{{ns_text}}}s":
                c = child.get(f"{{{ns_text}}}c") or child.get("c") or "1"
                try:
                    pieces.append(" " * int(c))
                except Exception:
                    pieces.append(" ")
            else:
                pieces.append(walk(child))

            if child.tail is not None:
                pieces.append(child.tail)
        return "".join(pieces)

    out_lines = []
    for p in root.iter():
        if p.tag == f"{{{ns_text}}}p":
            out_lines.append(walk(p))
    return "\n".join(out_lines)

def _html_bytes_to_text_preserve(b: bytes) -> str:
    b = re.sub(rb"(?i)<br\s*/?>", b"\n", b)
    b = re.sub(rb"(?i)</p\s*>", b"\n", b)
    b = re.sub(rb"<[^>]+>", b" ", b)
    try:
        return b.decode("utf-8", errors="ignore")
    except Exception:
        return str(b)

def _natural_sort_key(name: str) -> list:
    """Sort key that orders embedded numbers numerically (slide2 < slide10)."""
    parts = re.split(r"([0-9]+)", name)
    # re.split with one capturing group alternates text / digits, so odd
    # positions are always digit runs and keys stay comparable position-wise.
    return [int(tok) if i % 2 else tok for i, tok in enumerate(parts)]


def _native_doc(filename, path, extraction, pages_text, *, page_count=None, error=None) -> dict:
    """Build the result dict shared by every native-extraction branch."""
    doc = {
        "doc_id": str(uuid.uuid4()),
        "filename": filename,
        "source_path": path,
        "content": "text",
        "extraction": extraction,
        "text": "\n\n".join(pages_text),
        "pages_text": pages_text,
        "page_count_total": len(pages_text) if page_count is None else page_count,
    }
    if error is not None:
        doc["error"] = error
    return doc


def _extract_pdf_native(path: str, filename: str) -> dict:
    """Extract PDF text with pypdf/PyPDF2, else pdfminer, else an empty result."""
    PdfReader, backend = _get_pdf_reader_with_name()
    if PdfReader is not None:
        reader = PdfReader(path)
        pages = reader.pages
        total = len(pages)
        pages_text = []

        for i, page in enumerate(pages, start=1):
            # "layout" extraction when available: keeps spaces / line breaks better.
            txt = _pdf_extract_text_preserve_layout(page)
            pages_text.append(txt)
            if PRINT_DURING_EXTRACTION:
                print(f"[native:{backend}] {filename} page {i}/{total}")
                print(txt)
                print("-" * 120)

        return _native_doc(filename, path, f"native:pdf:{backend}", pages_text, page_count=total)

    # Fallback: pdfminer. Any failure (including a missing package) gives an
    # empty single-page result.
    try:
        from pdfminer.high_level import extract_text  # type: ignore
        full = extract_text(path) or ""
        # pdfminer separates pages with form feeds; pages are kept raw.
        pages_text = full.split("\f")
        total = len(pages_text)

        if PRINT_DURING_EXTRACTION:
            for i, txt in enumerate(pages_text, start=1):
                print(f"[native:pdfminer] {filename} page {i}/{total}")
                print(txt)
                print("-" * 120)

        return _native_doc(filename, path, "native:pdf:pdfminer", pages_text)
    except Exception:
        return _native_doc(filename, path, "native:pdf:none", [""])


def _extract_zip_native(z, ext: str, path: str, filename: str) -> dict:
    """Extract text from an already opened Office / OpenDocument / EPUB archive."""
    names = z.namelist()

    if ext == ".docx":
        parts = []
        if "word/document.xml" in names:
            parts.append(_docx_xml_to_text(z.read("word/document.xml")))
        for nm in names:
            # Headers and footers are appended after the body, in archive order.
            if nm.endswith(".xml") and nm.startswith(("word/header", "word/footer")):
                parts.append(_docx_xml_to_text(z.read(nm)))
        text = "\n\n".join(parts)
        # docx has no reliable page boundaries => a single block.
        return _native_doc(filename, path, "native:docx:xml", [text])

    if ext == ".xlsx":
        shared = []
        if "xl/sharedStrings.xml" in names:
            try:
                shared = _xlsx_shared_strings(z.read("xl/sharedStrings.xml"))
            except Exception:
                shared = []

        sheet_files = [nm for nm in names if nm.startswith("xl/worksheets/") and nm.endswith(".xml")]
        # Natural order: sheet2.xml before sheet10.xml (plain sorted() did not).
        pages_text = [
            _xlsx_sheet_to_text(z.read(nm), shared)
            for nm in sorted(sheet_files, key=_natural_sort_key)
        ]
        # sheets = pages
        return _native_doc(filename, path, "native:xlsx:xml", pages_text,
                           page_count=max(1, len(pages_text)))

    if ext == ".pptx":
        slides = [nm for nm in names if nm.startswith("ppt/slides/") and nm.endswith(".xml")]
        # Natural order: slide2.xml before slide10.xml (plain sorted() did not).
        pages_text = [
            _pptx_slide_xml_to_text(z.read(nm))
            for nm in sorted(slides, key=_natural_sort_key)
        ]
        # slides = pages
        return _native_doc(filename, path, "native:pptx:xml", pages_text,
                           page_count=max(1, len(pages_text)))

    if ext in {".odt", ".ods", ".odp"}:
        text = ""
        if "content.xml" in names:
            text = _odf_content_to_text(z.read("content.xml"))
        return _native_doc(filename, path, f"native:{ext[1:]}:xml", [text])

    # .epub: every (x)html member is one "page", in sorted name order.
    htmls = [nm for nm in names if nm.lower().endswith((".xhtml", ".html", ".htm"))]
    pages_text = []
    for nm in sorted(htmls):
        try:
            b = z.read(nm)
        except Exception:
            # An unreadable member yields an empty page instead of failing the book.
            b = b""
        pages_text.append(_html_bytes_to_text_preserve(b))
    return _native_doc(filename, path, "native:epub:html", pages_text,
                       page_count=max(1, len(pages_text)))


def extract_text_native(path: str) -> dict:
    """Extract the embedded text of *path* without OCR.

    The file type comes from ``detect_path_type`` (defined in an earlier
    cell). Supported: PDF, docx, xlsx, pptx, odt/ods/odp and epub.

    Returns:
        A dict with ``doc_id``, ``filename``, ``source_path``, ``content``
        (always ``"text"``), ``extraction`` (backend tag), ``text`` (pages
        joined by a blank line), ``pages_text`` and ``page_count_total``.
        Archive failures add an ``error`` key and use
        ``extraction="native:zip:error"``; unknown extensions give
        ``extraction="native:unsupported"`` with empty text.

    Errors raised while opening or reading a PDF with pypdf / PyPDF2 are not
    caught here and propagate to the caller.
    """
    ft = detect_path_type(path)
    ext = ft.ext.lower()
    filename = Path(path).name

    if ext == ".pdf":
        return _extract_pdf_native(path, filename)

    # Office / OpenDocument / EPUB are all zip containers.
    if ext in {".docx", ".xlsx", ".pptx", ".odt", ".ods", ".odp", ".epub"}:
        try:
            with zipfile.ZipFile(path, "r") as z:
                return _extract_zip_native(z, ext, path, filename)
        except Exception as e:
            return _native_doc(filename, path, "native:zip:error", [""], error=str(e))

    return _native_doc(filename, path, "native:unsupported", [""])

# TEXT_FILES comes from the previous cell (the one that printed the [skip] lines).
TEXT_DOCS: List[dict] = []
if "TEXT_FILES" not in globals():
    TEXT_FILES = []

# Native extraction never aborts the batch: a failing file is recorded as an
# empty document carrying the error message.
for p in TEXT_FILES:
    try:
        TEXT_DOCS.append(extract_text_native(p))
    except Exception as e:
        TEXT_DOCS.append({
            "doc_id": str(uuid.uuid4()),
            "filename": Path(p).name,
            "source_path": p,
            "content": "text",
            "extraction": "native:error",
            "text": "",
            "pages_text": [""],
            "page_count_total": 1,
            "error": str(e),
        })

# -------------------- FINAL OUTPUT (OCR + NATIVE) --------------------
# Both sources are normalised to the same record shape, OCR documents first.
FINAL_DOCS: List[dict] = []

# OCR docs (images)
for d in DOCS:
    pages_text = d.get("pages_text") or []
    page_count_total = d.get("page_count_total") or len(pages_text) or 1
    FINAL_DOCS.append({
        "doc_id": d.get("doc_id"),
        "filename": d.get("filename"),
        "content": "image_only",
        "extraction": "ocr:tesseract",
        "text": d.get("ocr_text", ""),
        "pages_text": pages_text,
        "page_count_total": page_count_total,
    })

# Native docs (text)
for d in TEXT_DOCS:
    # Without per-page text, the whole text is treated as a single page.
    pages_text = d.get("pages_text") or [d.get("text") or ""]
    page_count_total = d.get("page_count_total") or len(pages_text) or 1
    FINAL_DOCS.append({
        "doc_id": d.get("doc_id"),
        "filename": d.get("filename"),
        "content": "text",
        "extraction": d.get("extraction"),
        "text": d.get("text", ""),
        "pages_text": pages_text,
        "page_count_total": page_count_total,
    })

# Single final report: one header line per document, then every page.
for d in FINAL_DOCS:
    filename = d.get("filename")
    content = d.get("content")
    extraction = d.get("extraction")
    pages_text = d.get("pages_text") or []
    total = int(d.get("page_count_total") or len(pages_text) or 1)

    print(f"[doc] {filename} | content={content} | extraction={extraction} | pages={total}")

    if not pages_text:
        # No page at all: print an empty body and a separator.
        print("")
        print("\n" + ("-" * 120) + "\n")
        continue

    # The page label uses the position in pages_text, not a stored page index.
    for i, txt in enumerate(pages_text, start=1):
        print(f"[page {i}/{total}]")
        print(txt if txt is not None else "")
        print("-" * 120)

    print()


[doc] testword.docx | content=text | extraction=native:docx:xml | pages=1
[page 1/1]
Équipée d'un moteur V10 de 620 chevaux, l'Audi R8 passe de 0 à 100 km/h en moins de 3,5 secondes, affirmant ainsi sa place de supercar emblématique de la marque aux anneaux. 
------------------------------------------------------------------------------------------------------------------------



### Tokenisation "layout" (sentences)

In [42]:
import re
import pickle
import math
from pathlib import Path
import nltk

# ==================== Settings ====================
TARGET = None

PRINT_SENTENCES = True
MAX_SENTENCES_PREVIEW = 80   # None => print everything
PRINT_REPR = False           # True => debug invisible whitespace via repr(chunk)

MIN_SENTENCE_NONSPACE = 12
PRINT_ONLY_SENTENCES = True
PRINT_PAGE_TEXT = False

# ==================== NLTK data ====================
def _ensure_nltk():
    """Make sure the NLTK ``punkt`` and ``punkt_tab`` resources are present.

    Each resource is looked up locally first and downloaded quietly only
    when missing. A failed download is reported with a warning and does not
    raise.
    """
    resources = (
        ("punkt", "tokenizers/punkt"),
        ("punkt_tab", "tokenizers/punkt_tab"),
    )
    for pkg, probe in resources:
        try:
            nltk.data.find(probe)
            continue  # already installed
        except LookupError:
            pass

        try:
            nltk.download(pkg, quiet=True)
        except Exception as e:
            print(f"[warn] NLTK download failed for {pkg}: {e}")

_ensure_nltk()

# ==================== Language detection (simple) ====================
_AR_RE = re.compile(r"[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]")
_WORD_RE = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]+", flags=re.UNICODE)

_FR_HINT = {"le","la","les","des","une","un","est","avec","pour","dans","sur","facture","date","total","tva","montant"}
_EN_HINT = {"the","and","to","of","in","is","for","with","invoice","date","total","vat","amount"}

def detect_lang(text: str) -> str:
    """Guess the language of *text*: ``"ar"``, ``"fr"`` or ``"en"``.

    Any Arabic-script character gives ``"ar"``. Otherwise the words of the
    first 8000 characters are counted against small French and English hint
    lists; a French accented letter anywhere in the text adds one point to
    French. Text without any Latin word gives ``"en"``; a tie gives ``"fr"``.
    """
    sample = text or ""
    if _AR_RE.search(sample):
        return "ar"

    tokens = [tok.lower() for tok in _WORD_RE.findall(sample[:8000])]
    if not tokens:
        return "en"

    french = sum(tok in _FR_HINT for tok in tokens)
    english = sum(tok in _EN_HINT for tok in tokens)
    if re.search(r"[éèêàùçôîï]", sample.lower()):
        french += 1

    # English only wins with a strictly higher score.
    return "en" if english > french else "fr"

# ==================== Sentence split "layout" (fallback) ====================
_AR_END_RE = re.compile(r"([.!?؟]+)(\s+|$)", flags=re.UNICODE)

def split_ar_layout(text: str):
    """Split *text* into sentence chunks without losing any character.

    A chunk ends after a run of ``. ! ? ؟`` followed by whitespace or the end
    of the text; the trailing whitespace stays attached to the chunk, so
    ``"".join(chunks) == text``. Text after the last terminator forms a final
    chunk. Empty input gives ``[]``.
    """
    if not text:
        return []

    cuts = [m.end() for m in _AR_END_RE.finditer(text)]
    starts = [0] + cuts
    chunks = [text[begin:stop] for begin, stop in zip(starts, cuts)]

    # Remainder with no sentence terminator.
    tail_start = starts[-1]
    if tail_start < len(text):
        chunks.append(text[tail_start:])
    return chunks

def _load_punkt_pickle(lang_pickle_name: str):
    """Load and return the pickled Punkt tokenizer named `lang_pickle_name`.

    The file is located through ``nltk.data.find`` under ``tokenizers/punkt/``
    and unpickled on every call (no caching).
    NOTE(review): ``pickle.load`` is only safe on trusted files — presumably
    these come from the official NLTK data packages; confirm.
    """
    resource = f"tokenizers/punkt/{lang_pickle_name}.pickle"
    location = nltk.data.find(resource)
    with open(location, "rb") as handle:
        tokenizer = pickle.load(handle)
    return tokenizer

def split_punkt_layout(text: str, lang_pickle_name: str):
    """Split `text` into sentence chunks with Punkt, preserving all characters.

    Each chunk runs from the start of one sentence to the start of the next,
    so inter-sentence whitespace stays attached to the preceding sentence; the
    first chunk starts at offset 0 and the last one ends at ``len(text)``.
    """
    if not text:
        return []

    tokenizer = _load_punkt_pickle(lang_pickle_name)
    sentence_spans = list(tokenizer.span_tokenize(text))
    if not sentence_spans:
        return [text]

    # Cut points: 0, the start of every sentence but the first, then the end.
    bounds = [0]
    bounds.extend(span[0] for span in sentence_spans[1:])
    bounds.append(len(text))
    return [text[lo:hi] for lo, hi in zip(bounds, bounds[1:])]

def sentence_chunks_layout(text: str, lang: str):
    """Split `text` into sentence chunks using the splitter matching `lang`.

    Language codes starting with "ar" use the punctuation-based splitter,
    "fr" uses the French Punkt model, and everything else (including "en"
    and empty/None) uses the English Punkt model.
    """
    code = (lang or "").lower()
    if code.startswith("ar"):
        return split_ar_layout(text)
    model = "french" if code.startswith("fr") else "english"
    return split_punkt_layout(text, model)

# ==================== Split sections/alinéas (layout-preserving) ====================
def _iter_line_spans(text: str):
    if not text:
        return
    start = 0
    for m in re.finditer(r"\n", text):
        end = m.end()
        yield start, end
        start = end
    if start < len(text):
        yield start, len(text)

def _collapse_ws(s: str) -> str:
    return re.sub(r"\s+", " ", (s or "").strip())

def _mask_digits(s: str) -> str:
    return re.sub(r"\d", "#", s)

_NUM_SIMPLE_RE = re.compile(r"(?i)^[ \t]*\(?\d{1,3}\)?[ \t]*[.)][ \t]*(?:\S|$)")
_ALPHA_RE      = re.compile(r"(?i)^[ \t]*\(?[a-z]\)?[ \t]*[.)][ \t]*(?:\S|$)")
_ROMAN_RE      = re.compile(r"(?i)^[ \t]*\(?[ivxlcdm]{1,8}\)?[ \t]*[.)][ \t]*(?:\S|$)")
_NUM_MULTI_RE  = re.compile(r"^[ \t]*\d{1,3}(?:\.\d{1,3})+[ \t]*(?:[.)])?[ \t]+(?=\S)")

_KEYWORD_STRONG_RE = re.compile(r"(?i)^[ \t]*(article|section|chapitre|chapter|part)\b")
_KEYWORD_WEAK_HEADING_RE = re.compile(
    r"""(?ix)^[ \t]*
    (schedule|exhibit|appendix|annexe|annex)
    [ \t]+
    ([A-Z0-9]{1,8}|[ivxlcdm]{1,8}|\d{1,3})
    [ \t]*
    (?:[:\-–—][ \t]*\S.*)?
    [ \t]*$
    """
)
_SEP_RE = re.compile(r"^[ \t]*[-_]{4,}[ \t]*$")

_LABEL_ONLY_RE = re.compile(
    r"(?is)^[ \t]*"
    r"(?:\(?\d{1,3}\)?|\(?[a-z]\)?|\(?[ivxlcdm]{1,8}\)?)"
    r"[ \t]*[.)][ \t]*$"
)

def _is_section_start_line(line: str) -> bool:
    s = (line or "").rstrip("\n")
    st = s.strip()
    if not st:
        return False
    if _SEP_RE.match(st):
        return False
    if _KEYWORD_STRONG_RE.match(s):
        return True
    if _KEYWORD_WEAK_HEADING_RE.match(s):
        return True
    if _NUM_SIMPLE_RE.match(s):
        return True
    if _NUM_MULTI_RE.match(s):
        label = _collapse_ws(s).split(" ", 1)[0]
        parts = label.split(".")
        if len(parts) >= 2 and parts[-1] in ("00", "000"):
            return False
        return True
    if _ALPHA_RE.match(s) or _ROMAN_RE.match(s):
        return True
    return False

def _merge_label_only(chunks):
    """Glue label-only chunks ("1.", "(a)") onto the chunk that follows them.

    A chunk matching `_LABEL_ONLY_RE` that has a successor is concatenated with
    that successor; the pair is consumed together. All other chunks are copied
    unchanged.
    """
    merged = []
    total = len(chunks)
    idx = 0
    while idx < total:
        current = chunks[idx]
        has_next = idx + 1 < total
        if has_next and _LABEL_ONLY_RE.match(current):
            merged.append(current + chunks[idx + 1])
            idx += 2
            continue
        merged.append(current)
        idx += 1
    return merged

def split_sections_layout(text: str, allow_alpha_roman: bool = True):
    """Cut `text` at every line that starts a section, keeping all characters.

    With ``allow_alpha_roman=False`` only keyword and numeric labels create a
    cut (letter/roman labels are ignored). Offset 0 is always a cut. When no
    other cut is found the whole text is returned as a single chunk. Label-only
    chunks are merged with their successor before returning.
    """
    if not text:
        return []

    strict_patterns = (
        _KEYWORD_STRONG_RE,
        _KEYWORD_WEAK_HEADING_RE,
        _NUM_SIMPLE_RE,
        _NUM_MULTI_RE,
    )

    cut_points = {0}
    for begin, stop in _iter_line_spans(text):
        raw = text[begin:stop]
        if not _is_section_start_line(raw):
            continue
        if allow_alpha_roman:
            cut_points.add(begin)
            continue
        bare = raw.rstrip("\n")
        if any(rx.match(bare) for rx in strict_patterns):
            cut_points.add(begin)

    ordered = sorted(cut_points)
    if len(ordered) == 1:
        return [text]

    # Close the last section at the end of the text, then slice pairwise.
    ordered.append(len(text))
    pieces = [text[a:b] for a, b in zip(ordered, ordered[1:])]
    return _merge_label_only(pieces)

# Two or more consecutive newlines, each optionally followed by spaces/tabs.
_PARA_BREAK_RE = re.compile(r"(?:\n[ \t]*){2,}")

def split_paragraphs_layout(text: str):
    """Split `text` into paragraphs at blank-line breaks, keeping all characters.

    Each paragraph keeps the break that follows it, so concatenating the
    returned chunks gives back `text`. A break located at the very end of the
    text does not open a new paragraph; previously it produced an empty
    trailing chunk.

    Returns an empty list for empty/None text and ``[text]`` when there is no
    break to cut on.
    """
    if not text:
        return []

    size = len(text)
    cuts = [0]
    for m in _PARA_BREAK_RE.finditer(text):
        # Matches are non-overlapping and ordered, so `cuts` stays sorted and
        # unique. Skip a break that ends exactly at the end of the text.
        if m.end() < size:
            cuts.append(m.end())

    if len(cuts) == 1:
        return [text]

    cuts.append(size)
    return [text[a:b] for a, b in zip(cuts, cuts[1:])]

def chunk_layout_universal(text: str, lang: str):
    """Chunk plain text with the coarsest strategy that actually splits it.

    Order of attempts:
      1. section split, only when the text looks structured (at least two
         keyword/numeric label lines, or at least three letter/roman label
         lines) and the split yields more than one chunk;
      2. paragraph split, when it yields more than one chunk;
      3. sentence split for `lang`.
    """
    if not text:
        return []

    strong_patterns = (
        _KEYWORD_STRONG_RE,
        _KEYWORD_WEAK_HEADING_RE,
        _NUM_SIMPLE_RE,
        _NUM_MULTI_RE,
    )
    weak_patterns = (_ALPHA_RE, _ROMAN_RE)

    num_kw_hits = 0
    alpha_roman_hits = 0
    for begin, stop in _iter_line_spans(text):
        row = text[begin:stop].rstrip("\n")
        if not row.strip():
            continue
        if any(rx.match(row) for rx in strong_patterns):
            num_kw_hits += 1
        elif any(rx.match(row) for rx in weak_patterns):
            alpha_roman_hits += 1

    if num_kw_hits >= 2 or alpha_roman_hits >= 3:
        sections = split_sections_layout(text, allow_alpha_roman=True)
        if len(sections) > 1:
            return sections

    paragraphs = split_paragraphs_layout(text)
    if len(paragraphs) > 1:
        return paragraphs

    return sentence_chunks_layout(text, lang)

# ======================================================================
#  MULTI-COLONNES (général, robuste) + TABLE (inchangé)
#  + micro-table: interpréter les headers multi-col comme un "table chunk"
# ======================================================================

# Minimum run of consecutive spaces treated as a column gap; layout_items
# picks the OCR value when the extraction label starts with "ocr:".
GAP_MIN_OCR = 10
GAP_MIN_NATIVE = 6

# Maximum distance (in character columns) between two adjacent column
# centers for them to be considered for merging (see _merge_close_columns).
MERGE_COL_DIST_OCR = 22
MERGE_COL_DIST_NATIVE = 16

# Thresholds used by _is_micro_table_like: max number of non-empty rows,
# minimum average cell density, minimum number of rows with >= 2 cells.
MICROTABLE_MAX_ROWS = 30
MICROTABLE_MIN_DENS = 0.25
MICROTABLE_MIN_MULTIROW = 2

# French/English column-header vocabulary; a line containing any of these
# words is treated as a table line by _is_table_line.
TABLE_HINT_RE = re.compile(
    r"""(?ix)
    \b(
        qt[ée]|
        désignation|designation|
        prix|
        montan?t|
        r[ée]f[ée]rence|reference|
        description|
        quantit[ée]|
        p\.?\s*unitaire|
        valeur|
        total\s*ht|total|
        tva|vat
    )\b
    """
)

# NUM_RE: integer or decimal token ("12", "3,5", "4.20"); DEC_RE: decimal only.
NUM_RE = re.compile(r"\d+(?:[.,]\d+)?")
DEC_RE = re.compile(r"\d+[.,]\d+")

def _space_runs_ge(s: str, n: int):
    return [(m.start(), m.end()) for m in re.finditer(r"[ ]{%d,}" % n, s or "")]

def _has_big_gap(s: str, gap_min: int, min_count: int = 1) -> bool:
    return len(_space_runs_ge(s, gap_min)) >= min_count

def _num_tokens(s: str) -> int:
    """Count the numeric tokens (integers or decimals, per NUM_RE) found in `s`."""
    return sum(1 for _ in NUM_RE.finditer(s or ""))

def _dec_tokens(s: str) -> int:
    """Count the decimal tokens (digits, "." or ",", digits — per DEC_RE) in `s`."""
    return sum(1 for _ in DEC_RE.finditer(s or ""))

def _is_table_line(line: str, gap_min: int) -> bool:
    """Heuristically decide whether `line` belongs to a table.

    A non-blank line qualifies when it has at least two tabs, or contains a
    table-header hint word, or has at least two wide space gaps together with
    either three numeric tokens or one decimal token.
    """
    row = (line or "").rstrip("\n")
    if not row.strip():
        return False
    if row.count("\t") >= 2 or TABLE_HINT_RE.search(row):
        return True
    if not _has_big_gap(row, gap_min, min_count=2):
        return False
    return _num_tokens(row) >= 3 or _dec_tokens(row) >= 1

def _cluster_centers(values, tol=2, min_hits=1):
    if not values:
        return []
    xs = sorted(values)
    clusters = []
    cur = [xs[0]]
    for v in xs[1:]:
        if abs(v - cur[-1]) <= tol:
            cur.append(v)
        else:
            clusters.append(cur)
            cur = [v]
    clusters.append(cur)

    centers = []
    for c in clusters:
        if len(c) >= min_hits:
            c2 = sorted(c)
            centers.append(c2[len(c2)//2])
    return sorted(set(centers))

def _upper_ratio(s: str) -> float:
    letters = re.findall(r"[A-Za-zÀ-ÖØ-öø-ÿ]", s or "")
    if not letters:
        return 0.0
    upp = sum(1 for ch in letters if ch.isupper())
    return upp / max(1, len(letters))

def _sep_spans(line: str, gap_min: int):
    s = line or ""
    spans = []
    for m in re.finditer(r"\t+", s):
        spans.append((m.start(), m.end()))
    for m in re.finditer(r"[ ]{%d,}" % gap_min, s):
        spans.append((m.start(), m.end()))
    if not spans:
        return []
    spans.sort()
    merged = [spans[0]]
    for a, b in spans[1:]:
        la, lb = merged[-1]
        if a <= lb:
            merged[-1] = (la, max(lb, b))
        else:
            merged.append((a, b))
    return merged

def _line_segments_by_gaps(line: str, gap_min: int):
    """Split `line` into text segments separated by tabs or wide space gaps.

    Returns a list of dicts with keys ``"x"`` and ``"a"`` (offset of the first
    non-space character), ``"b"`` (offset just past the segment once trailing
    spaces/tabs are removed) and ``"text"``. Offsets are relative to the line
    with its trailing newline removed. Blank lines give an empty list.
    """
    row = (line or "").rstrip("\n")
    if not row.strip():
        return []

    # A zero-width sentinel at the end flushes the final segment.
    boundaries = _sep_spans(row, gap_min) + [(len(row), len(row))]

    segments = []
    cursor = 0
    for sep_start, sep_end in boundaries:
        if sep_start < cursor:
            continue
        piece = row[cursor:sep_start]
        first = re.search(r"\S", piece)
        if first is not None:
            left = first.start()
            right = len(piece.rstrip(" \t"))
            segments.append({
                "x": cursor + left,
                "a": cursor + left,
                "b": cursor + right,
                "text": piece[left:right],
            })
        cursor = sep_end
    return segments

def _looks_like_title_line(line: str) -> bool:
    """Tell whether `line` looks like a short title.

    The stripped line must be non-empty, at most 50 characters and not a
    separator rule. It is a title when it starts a section, or when it is
    digit-free, contains a Latin letter and at least 85% of its letters are
    uppercase.
    """
    label = (line or "").rstrip("\n").strip()
    if not label or len(label) > 50:
        return False
    if _SEP_RE.match(label):
        return False
    if _is_section_start_line(line):
        return True

    mostly_upper = _upper_ratio(label) >= 0.85
    has_letter = re.search(r"[A-Za-zÀ-ÖØ-öø-ÿ]", label) is not None
    has_digit = re.search(r"\d", label) is not None
    return mostly_upper and has_letter and not has_digit

def _is_multicol_candidate_line(line: str, gap_min: int, is_ocr: bool) -> bool:
    """Tell whether `line` looks like a row of a multi-column layout.

    Blank lines, separator rules and table lines never qualify. Three or more
    gap-separated segments always qualify. Exactly two segments qualify for
    OCR text, or when the line has a wide space gap, or contains a digit or
    one of ``: # № ° / \\ -`` (or a dash variant), or is at least 70% uppercase.
    """
    row = (line or "").rstrip("\n")
    trimmed = row.strip()
    if not trimmed or _SEP_RE.match(trimmed):
        return False
    if _is_table_line(line, gap_min):
        return False

    seg_count = len(_line_segments_by_gaps(row, gap_min))
    if seg_count >= 3:
        return True
    if seg_count != 2:
        return False

    if is_ocr or _has_big_gap(row, gap_min, 1):
        return True
    if re.search(r"[:#№°/\\\-–—]", row) or re.search(r"\d", row):
        return True
    return _upper_ratio(row) >= 0.70

# "key<2+ whitespace>value" on a single line, key without ":" (max 80 chars).
_KV_GENERIC_RE = re.compile(r"^\s*(?P<k>[^:]{1,80}?)\s{2,}(?P<v>\S.+?)\s*$")

def _looks_like_header_pair(k: str, v: str) -> bool:
    """Tell whether `k` / `v` look like two column headers rather than key/value.

    Both must be non-empty, at most 25 characters, at least 85% uppercase, and
    neither may contain a digit.
    """
    left = (k or "").strip()
    right = (v or "").strip()
    if not (left and right):
        return False

    both_short = len(left) <= 25 and len(right) <= 25
    if not both_short:
        return False
    both_upper = _upper_ratio(left) >= 0.85 and _upper_ratio(right) >= 0.85
    if not both_upper:
        return False
    return re.search(r"\d", left + right) is None

def _looks_like_addressish(line: str) -> bool:
    s = (line or "").strip()
    if not s:
        return False
    if re.search(r"(rue|route|avenue|bd|boulevard|street|st\.|road|zip|code\s*postal|bp)", s, flags=re.I):
        return True
    if len(s) >= 10 and not s.endswith(":") and re.search(r"[A-Za-zÀ-ÖØ-öø-ÿ]", s):
        return True
    return False

def _normalize_kv_generic(text: str) -> str:
    """Rewrite "key    value" lines of `text` as "key: value".

    Lines are stripped. A line is left otherwise unchanged when it is blank,
    already contains ":", looks address-like, does not match the key/value
    pattern, looks like a pair of column headers, or has a key without any
    Latin letter. A trailing newline is kept only if `text` ended with one.
    """
    src = text or ""
    rendered = []
    for line in src.splitlines():
        trimmed = line.strip()
        if not trimmed:
            rendered.append("")
            continue

        keep_verbatim = ":" in line or _looks_like_addressish(line)
        match = None if keep_verbatim else _KV_GENERIC_RE.match(line)
        if match is None:
            rendered.append(trimmed)
            continue

        key = _collapse_ws(match.group("k"))
        val = _collapse_ws(match.group("v"))
        if _looks_like_header_pair(key, val) or not re.search(r"[A-Za-zÀ-ÖØ-öø-ÿ]", key):
            rendered.append(trimmed)
            continue

        rendered.append(f"{key}: {val}" if val else key)

    tail = "\n" if src.endswith("\n") else ""
    return "\n".join(rendered) + tail

def _strip_sep_lines(block_text: str) -> str:
    """Remove separator-rule lines from `block_text` and right-trim the rest.

    Trailing whitespace is removed from each kept line and from the end of the
    result; a single final newline is restored when the input ended with one.
    """
    if not block_text:
        return ""

    kept = [
        ln.rstrip()
        for ln in block_text.splitlines()
        if not _SEP_RE.match(ln.strip())
    ]
    body = "\n".join(kept).rstrip()
    if block_text.endswith("\n"):
        body += "\n"
    return body

def _assign_to_centers(x: int, centers, tol: int):
    if not centers:
        return 0
    best_i = 0
    best_d = abs(x - centers[0])
    for i in range(1, len(centers)):
        d = abs(x - centers[i])
        if d < best_d:
            best_d = d
            best_i = i
    return best_i

def _merge_close_columns(centers, row_cells, merge_dist: int):
    """Merge adjacent columns whose centers are at most `merge_dist` apart.

    `centers` is a list of column positions and `row_cells` a list of dicts
    mapping column index -> ``(text, spans)``. Both are mutated in place and
    also returned as ``(centers, row_cells)``.

    Column ``i+1`` is folded into column ``i`` only when at least one row fills
    both columns and few rows fill only the right-hand one (at most
    ``max(1, 20% of both + alone_next)``).
    """
    i = 0
    while i < len(centers) - 1:
        if (centers[i+1] - centers[i]) <= merge_dist:
            # Count rows using both columns vs. rows using only the right one.
            both = 0
            alone_next = 0
            for r in row_cells:
                hi = i in r
                hj = (i+1) in r
                if hj and hi:
                    both += 1
                elif hj and not hi:
                    alone_next += 1
            if both >= 1 and alone_next <= max(1, int(0.2 * (both + alone_next))):
                # Fold cell i+1 into cell i (texts joined by two spaces, span lists concatenated).
                for r in row_cells:
                    if (i+1) in r:
                        t2, sp2 = r.pop(i+1)
                        if i in r:
                            t1, sp1 = r[i]
                            r[i] = ((t1 + "  " + t2).strip(), sp1 + sp2)
                        else:
                            r[i] = (t2, sp2)

                centers.pop(i+1)

                # Shift the remaining column indices left; ascending order
                # guarantees k-1 is free before it is written.
                for r in row_cells:
                    ks = sorted([k for k in r.keys() if k > i+1])
                    for k in ks:
                        r[k-1] = r.pop(k)
                # Stay on the same i: the new right-hand neighbour may also be close.
                continue
        i += 1
    return centers, row_cells

def _is_grid_like(row_cells, col_count: int):
    if col_count < 2:
        return False
    rows = [r for r in row_cells if any((t.strip() for t, _ in r.values()))]
    if not rows:
        return False
    n_rows = len(rows)
    if n_rows > 5:
        return False
    dens = sum((len(r) / max(1, col_count)) for r in rows) / n_rows
    return dens >= 0.70

def _is_micro_table_like(row_cells, col_count: int) -> bool:
    """Tell whether the rows look like a small table ("micro-table").

    Requires at least two columns, between 2 and MICROTABLE_MAX_ROWS rows
    holding some non-blank text, at least MICROTABLE_MIN_MULTIROW rows with two
    or more cells, and an average fill ratio of at least MICROTABLE_MIN_DENS.
    """
    if col_count < 2:
        return False

    filled = [
        row for row in row_cells
        if any(cell_text.strip() for cell_text, _ in row.values())
    ]
    row_total = len(filled)
    if row_total < 2 or row_total > MICROTABLE_MAX_ROWS:
        return False

    multi_cell_rows = sum(1 for row in filled if len(row) >= 2)
    if multi_cell_rows < MICROTABLE_MIN_MULTIROW:
        return False

    density = sum(len(row) / max(1, col_count) for row in filled) / max(1, row_total)
    return density >= MICROTABLE_MIN_DENS

def _transpose_or_group_multicol(block_text: str, abs_start: int, gap_min: int, is_ocr: bool):
    """Turn a multi-column text block into one or more layout items.

    `block_text` is the raw block and `abs_start` its offset in the page text;
    every span stored in the returned items is expressed in page offsets.

    Returns a list of dicts. Depending on what the block looks like:
      - no usable segment             -> one item, ``layout_kind="plain"``;
      - micro-table (see _is_micro_table_like) -> one item,
        ``layout_kind="header"``, carrying ``table_rows`` / ``table_cells``;
      - small dense grid (see _is_grid_like)   -> one item,
        ``layout_kind="multicol_grid"``;
      - otherwise one ``layout_kind="multicol_col"`` item per non-empty column
        (text read top-to-bottom within the column), or a single "plain" item
        when every column turned out empty.
    """
    lines = []
    segs_by_line = []

    # Pass 1: split every line into gap-separated segments (separator rules get none).
    for ls, le in _iter_line_spans(block_text):
        line_full = block_text[ls:le]
        s = line_full[:-1] if line_full.endswith("\n") else line_full

        lines.append((ls, le, line_full, s))

        if _SEP_RE.match(s.strip()):
            segs_by_line.append([])
            continue

        segs = _line_segments_by_gaps(s, gap_min)
        segs = [g for g in segs if g.get("text", "").strip()]
        segs_by_line.append(segs)

    # Collect the x offsets of all segments, ignoring lone punctuation marks.
    xs = []
    for segs in segs_by_line:
        for g in segs:
            txt = g["text"].strip()
            if len(txt) == 1 and txt in (":", "|", "-", "_"):
                continue
            xs.append(int(g["x"]))

    if not xs:
        txt = _strip_sep_lines(block_text)
        return [{
            "text": txt,
            "spans": [(abs_start, abs_start + len(block_text))],
            "start": abs_start,
            "end": abs_start + len(block_text),
            "layout_kind": "plain",
            "col_index": None,
            "block_start": abs_start,
            "block_end": abs_start + len(block_text),
        }]

    # Column centers = clustered x offsets; the left-most offset is always a
    # center, and at most 8 centers are kept.
    tol_cluster = 3 if is_ocr else 2
    centers = _cluster_centers(xs, tol=tol_cluster, min_hits=1)
    min_x = min(xs)
    if min_x not in centers:
        centers = sorted([min_x] + centers)
    centers = centers[:8]

    # Pass 2: assign each segment to its nearest column; segments landing in
    # the same column of a row are joined with a space.
    tol_assign = 6 if is_ocr else 4
    row_cells = []
    for (ls, le, line_full, s), segs in zip(lines, segs_by_line):
        r = {}
        for g in segs:
            ci = _assign_to_centers(int(g["x"]), centers, tol_assign)
            a = abs_start + ls + int(g["a"])
            b = abs_start + ls + int(g["b"])
            txt = g["text"].strip()

            if ci in r:
                t0, sp0 = r[ci]
                r[ci] = ((t0 + " " + txt).strip(), sp0 + [(a, b)])
            else:
                r[ci] = (txt, [(a, b)])
        row_cells.append(r)

    merge_dist = MERGE_COL_DIST_OCR if is_ocr else MERGE_COL_DIST_NATIVE
    centers, row_cells = _merge_close_columns(centers, row_cells, merge_dist=merge_dist)
    col_count = len(centers)

    # Case 1: micro-table -> a single "header" item with row/cell detail.
    if _is_micro_table_like(row_cells, col_count):
        table_rows = []
        for (ls, le, line_full, s) in lines:
            table_rows.append({"text": line_full, "spans": [(abs_start + ls, abs_start + le)]})

        table_cells = []
        for r in row_cells:
            row = []
            for ci in range(col_count):
                if ci in r:
                    t, sp = r[ci]
                    row.append({"col": ci, "text": t, "spans": [(int(a), int(b)) for a, b in sp if b > a]})
                else:
                    row.append({"col": ci, "text": "", "spans": []})
            table_cells.append(row)

        txt = _strip_sep_lines(block_text)
        return [{
            "text": txt,
            "spans": [(abs_start, abs_start + len(block_text))],
            "start": abs_start,
            "end": abs_start + len(block_text),
            "layout_kind": "header",
            "col_index": None,
            "block_start": abs_start,
            "block_end": abs_start + len(block_text),
            "table_rows": table_rows,
            "table_cells": table_cells,
            "header_source": "micro_multicol",
            "column_centers": centers,
        }]

    # Case 2: small dense grid -> keep the block as one item.
    if _is_grid_like(row_cells, col_count):
        return [{
            "text": _strip_sep_lines(block_text),
            "spans": [(abs_start, abs_start + len(block_text))],
            "start": abs_start,
            "end": abs_start + len(block_text),
            "layout_kind": "multicol_grid",
            "col_index": None,
            "block_start": abs_start,
            "block_end": abs_start + len(block_text),
        }]

    # Case 3: true columns -> emit one item per column, read top to bottom.
    col_items = []
    for ci in range(col_count):
        out_lines = []
        spans = []
        for r in row_cells:
            if ci in r:
                t, sp = r[ci]
                out_lines.append(t)
                spans.extend(sp)
            else:
                out_lines.append("")

        # Trim leading/trailing blank rows of the column.
        while out_lines and not out_lines[0].strip():
            out_lines.pop(0)
        while out_lines and not out_lines[-1].strip():
            out_lines.pop()

        # Collapse runs of blank rows to a single blank row.
        compact = []
        blank = 0
        for ln in out_lines:
            if not ln.strip():
                blank += 1
                if blank <= 1:
                    compact.append("")
            else:
                blank = 0
                compact.append(ln)

        txt = "\n".join(compact).rstrip() + ("\n" if block_text.endswith("\n") else "")
        txt = _normalize_kv_generic(txt)

        if not _collapse_ws(txt).strip():
            continue

        if spans:
            st = min(a for a, _ in spans)
            en = max(b for _, b in spans)
        else:
            st = abs_start
            en = abs_start + len(block_text)

        col_items.append({
            "text": txt,
            "spans": [(int(a), int(b)) for (a, b) in spans if b > a],
            "start": st,
            "end": en,
            "layout_kind": "multicol_col",
            "col_index": ci,
            "block_start": abs_start,
            "block_end": abs_start + len(block_text),
        })

    # Every column was empty: fall back to a single plain item.
    if not col_items:
        return [{
            "text": _strip_sep_lines(block_text),
            "spans": [(abs_start, abs_start + len(block_text))],
            "start": abs_start,
            "end": abs_start + len(block_text),
            "layout_kind": "plain",
            "col_index": None,
            "block_start": abs_start,
            "block_end": abs_start + len(block_text),
        }]

    return col_items

def _looks_like_paragraphish(line_full: str) -> bool:
    """Tell whether a line reads like running prose rather than a layout row.

    True for a stripped line of at least 120 characters that holds at least
    10 Latin words and no run of 6 or more spaces.
    """
    s = (line_full or "").strip()
    if len(s) < 120:
        return False
    words = re.findall(r"[A-Za-zÀ-ÖØ-öø-ÿ]+", s)
    return len(words) >= 10 and not _has_big_gap(s, 6, 1)

def _is_address_continuation_line(line_full: str, gap_min: int, is_ocr: bool) -> bool:
    """Tell whether a non-column line may stay inside a multi-column block.

    Blank lines and separator rules are accepted. Table lines, lines holding a
    table hint word, section starts, lines with a decimal token or with too
    many numeric tokens (more than 4 for OCR, 6 otherwise) are rejected. What
    remains is accepted when it contains a Latin or Arabic letter, is a bare
    4-6 digit number, or is a short (<= 120 chars) line containing one of
    ``@ + / , -``.
    """
    row = (line_full or "").rstrip("\n")
    trimmed = row.strip()
    if not trimmed or _SEP_RE.match(trimmed):
        return True

    if (
        _is_table_line(line_full, gap_min)
        or TABLE_HINT_RE.search(trimmed)
        or _is_section_start_line(line_full)
    ):
        return False
    if _dec_tokens(trimmed) > 0:
        return False
    numeric_cap = 4 if is_ocr else 6
    if _num_tokens(trimmed) > numeric_cap:
        return False

    if re.search(r"[A-Za-zÀ-ÖØ-öø-ÿ]", trimmed) or _AR_RE.search(trimmed):
        return True
    if re.match(r"^\d{4,6}$", trimmed):
        return True
    has_contact_char = re.search(r"[@+/,-]", trimmed) is not None
    return has_contact_char and len(trimmed) <= 120

def _collect_table_block(lines, start_i, gap_min):
    """Collect the consecutive lines forming a table, starting at `start_i`.

    `lines` is a list of ``(line_text, start_offset, end_offset)`` tuples.
    Blank lines and table lines are always absorbed; an indented "wrap" line
    is absorbed when it follows a table line or another wrap line and at least
    one data-like row was seen. Any other line ends the block.

    Returns ``(collected, i)``: the absorbed line tuples with trailing blank
    lines removed, and the index of the first line not consumed (trailing
    blank lines are skipped, i.e. `i` points past them).
    """
    n = len(lines)
    i = start_i
    blank_run = 0
    seen_data = 0
    collected = []

    # FIX: recognize a "wrap line" independently of the previous line (lets wrap lines chain: wrap+wrap)
    def _looks_like_wrap_line(s_raw: str) -> bool:
        if not s_raw:
            return False
        # must be indented by at least two spaces/tabs
        if not re.match(r"^[ \t]{2,}\S", s_raw):
            return False
        s_l = s_raw.lstrip(" \t")
        # not a structural (section) line and no table structure
        if _is_section_start_line(s_raw):
            return False
        if s_l.count("\t") >= 2:
            return False
        if _has_big_gap(s_l, gap_min, min_count=1):
            return False
        # very few numeric signals (avoids absorbing TOTAL/TVA/etc. lines)
        if _dec_tokens(s_l) != 0:
            return False
        if _num_tokens(s_l) > 1:
            return False
        return True

    while i < n:
        line_full, ls, le = lines[i]
        s = line_full.rstrip("\n")

        # Blank lines are absorbed unconditionally (trailing ones are dropped below).
        if not s.strip():
            blank_run += 1
            collected.append((line_full, ls, le))
            i += 1
            continue

        is_tbl = _is_table_line(line_full, gap_min)

        if is_tbl:
            blank_run = 0
            # Count rows that carry data or header signals.
            if _dec_tokens(s) >= 1 or _num_tokens(s) >= 3 or TABLE_HINT_RE.search(s):
                seen_data += 1
            collected.append((line_full, ls, le))
            i += 1
            continue

        # FIX: accept a wrap line, including a wrap that follows a wrap (not only a table line)
        if seen_data >= 1 and _looks_like_wrap_line(s):
            prev_nonblank = None
            for plf, _, _ in reversed(collected):
                if plf.strip():
                    prev_nonblank = plf.rstrip("\n")
                    break
            # keep going if the previous non-blank line is a table line or already a wrap line
            if prev_nonblank and (_is_table_line(prev_nonblank, gap_min) or _looks_like_wrap_line(prev_nonblank)):
                blank_run = 0
                collected.append((line_full, ls, le))
                i += 1
                continue

        # NOTE(review): all three paths below break, so the blank_run/seen_data
        # thresholds have no effect — a blank run never ends the block by
        # itself. Confirm whether that is intended.
        if seen_data >= 2 and blank_run >= 2:
            break
        if seen_data >= 1 and blank_run >= 1:
            break
        break

    # Drop trailing blank lines from the block.
    while collected and not collected[-1][0].strip():
        collected.pop()

    return collected, i

def _make_span_item(page_text, spans, text_override, kind, meta=None):
    spans2 = [(int(a), int(b)) for (a, b) in (spans or []) if b > a]
    if spans2:
        st = min(a for a, _ in spans2)
        en = max(b for _, b in spans2)
    else:
        st = 0
        en = 0
    it = {"text": text_override, "spans": spans2, "start": st, "end": en, "layout_kind": kind}
    if meta:
        it.update(meta)
    return it

def layout_items(page_text: str, lang: str, extraction: str = ""):
    """Segment a page into layout items (tables, multi-column blocks, plain chunks).

    The page is scanned line by line. At each position, in priority order:
      1. a table line starts a table block (``layout_kind="table"``);
      2. a multi-column candidate line starts a multi-column block, delegated
         to _transpose_or_group_multicol;
      3. otherwise a run of plain lines is collected up to the next table or
         multi-column line and chunked with chunk_layout_universal.

    `extraction` starting with "ocr:" selects the OCR gap thresholds.
    Returns the list of item dicts sorted by position (columns of the same
    multi-column block are ordered by column index).
    """
    if not page_text:
        return []

    is_ocr = str(extraction or "").startswith("ocr:")
    gap_min = GAP_MIN_OCR if is_ocr else GAP_MIN_NATIVE

    # (line_text, start_offset, end_offset) for every line of the page.
    lines = []
    for ls, le in _iter_line_spans(page_text):
        lines.append((page_text[ls:le], ls, le))

    items = []
    i = 0
    n = len(lines)

    def _starts_table(i0):
        return _is_table_line(lines[i0][0], gap_min)

    def _starts_multicol(i0):
        return _is_multicol_candidate_line(lines[i0][0], gap_min=gap_min, is_ocr=is_ocr)

    while i < n:
        # ---- 1) table block ----
        if _starts_table(i):
            collected, j = _collect_table_block(lines, i, gap_min=gap_min)
            if collected:
                a0 = collected[0][1]
                b0 = collected[-1][2]
                block_text = page_text[a0:b0]
                table_rows = [{"text": lf, "spans": [(lls, lle)]} for (lf, lls, lle) in collected]
                items.append(_make_span_item(
                    page_text,
                    spans=[(a0, b0)],
                    text_override=block_text,
                    kind="table",
                    meta={"table_rows": table_rows}
                ))
                i = j
                continue

        # ---- 2) multi-column block ----
        if _starts_multicol(i):
            start = i

            # Pull a preceding title-like line into the block.
            if start - 1 >= 0:
                prev_line = lines[start - 1][0]
                if _looks_like_title_line(prev_line) and not _starts_table(start - 1):
                    start -= 1

            j = i
            saw_any = False
            blank_run = 0
            noncol_inside = 0

            # Limits on how far a block may extend.
            MAX_INBLOCK_BLANK = 6
            MAX_INBLOCK_LINES = 140
            MAX_NONCOL_INSIDE = 25
            # Relaxed gap threshold used for continuation lines inside the block.
            weak_gap = max(3, gap_min - (3 if is_ocr else 2))

            while j < n and (j - start) < MAX_INBLOCK_LINES:
                if _starts_table(j):
                    break

                lf, lls, lle = lines[j]
                ss = lf.rstrip("\n")

                # Blank lines and separator rules are tolerated up to MAX_INBLOCK_BLANK in a row.
                if not ss.strip() or _SEP_RE.match(ss.strip()):
                    blank_run += 1
                    j += 1
                    if saw_any and blank_run >= MAX_INBLOCK_BLANK:
                        break
                    continue

                blank_run = 0

                if _starts_multicol(j):
                    saw_any = True
                    noncol_inside = 0
                    j += 1
                    continue

                # Non-column lines may stay in the block when they look like
                # address continuations or still show a (weaker) gap, as long
                # as they are not prose-like.
                if saw_any and noncol_inside < MAX_NONCOL_INSIDE:
                    if _is_address_continuation_line(lf, gap_min=gap_min, is_ocr=is_ocr) and not _looks_like_paragraphish(lf):
                        noncol_inside += 1
                        j += 1
                        continue
                    if _has_big_gap(ss, weak_gap, min_count=1) and not _looks_like_paragraphish(lf):
                        noncol_inside += 1
                        j += 1
                        continue

                break

            # Always consume at least one line to guarantee progress.
            end = j if j > i else i + 1

            a0 = lines[start][1]
            b0 = lines[end-1][2] if end-1 >= start else lines[start][2]
            block_text = page_text[a0:b0]

            items.extend(_transpose_or_group_multicol(block_text, abs_start=a0, gap_min=gap_min, is_ocr=is_ocr))

            i = end
            continue

        # ---- 3) plain run ----
        start = i
        j = i
        while j < n:
            if _starts_table(j) or _starts_multicol(j):
                break
            j += 1

        a0 = lines[start][1]
        b0 = lines[j-1][2] if j-1 >= start else lines[start][2]
        plain_text = page_text[a0:b0]

        # Chunk offsets are accumulated from chunk lengths; this assumes the
        # chunks concatenate back to plain_text.
        chunks = chunk_layout_universal(plain_text, lang)
        pos = 0
        for ch in chunks:
            ca = a0 + pos
            cb = ca + len(ch)
            pos += len(ch)
            items.append(_make_span_item(page_text, spans=[(ca, cb)], text_override=ch, kind="plain"))

        i = j if j > start else i + 1

    # Sort key: multi-column items by (block start, column index; -1 when no
    # column index), everything else by its own start offset.
    def _k(it):
        if it.get("layout_kind") in ("multicol_col", "multicol_grid"):
            return (it.get("block_start", it.get("start", 0)), it.get("col_index", 0) if it.get("col_index") is not None else -1)
        return (it.get("start", 0), 0)

    items.sort(key=_k)
    return items

# ==================== Noise detection (audit) ====================
# Lines that are noise on their own: watermark words, "page X of Y", "X/Y".
_NOISE_LINE_RE = re.compile(
    r"(?i)^\s*(sample|confidential|draft)\s*$|"
    r"^\s*page\s+\d+\s*(?:of|/)\s*\d+\s*$|"
    r"^\s*\d+\s*(?:of|/)\s*\d+\s*$"
)

def build_noise_keys_for_doc(pages_text):
    """Collect the normalized lines that repeat across pages (headers, footers).

    Each line is normalized (whitespace collapsed, lowercased). A key is noise
    when its count reaches ``max(3, ceil(30% of the page count))``. Ordinary
    lines are counted once per page, both as-is and with digits masked by "#";
    separator rules and built-in noise lines are counted on every occurrence.
    Documents with fewer than three pages yield an empty set.
    """
    if not pages_text:
        return set()
    page_count = len(pages_text)
    if page_count < 3:
        return set()
    threshold = max(3, int(math.ceil(page_count * 0.30)))

    exact_hits = {}
    masked_hits = {}

    for page in pages_text:
        page_exact = set()
        page_masked = set()
        for begin, stop in _iter_line_spans(page or ""):
            raw = page[begin:stop].rstrip("\n")
            key = _collapse_ws(raw).lower()
            if not key:
                continue

            if _SEP_RE.match(key) or _NOISE_LINE_RE.match(raw):
                # Not de-duplicated per page.
                exact_hits[key] = exact_hits.get(key, 0) + 1
                continue

            masked = _mask_digits(key)

            if key not in page_exact:
                page_exact.add(key)
                exact_hits[key] = exact_hits.get(key, 0) + 1
            if masked not in page_masked:
                page_masked.add(masked)
                masked_hits[masked] = masked_hits.get(masked, 0) + 1

    noise_keys = {key for key, hits in exact_hits.items() if hits >= threshold}
    noise_keys.update(key for key, hits in masked_hits.items() if hits >= threshold)
    return noise_keys

def chunk_is_noise(chunk_text: str, noise_keys: set) -> bool:
    """Tell whether every meaningful line of `chunk_text` is noise.

    Blank lines and separator rules are ignored. A line is noise when it
    matches the built-in noise pattern, or when its normalized form (or its
    digit-masked form) is in `noise_keys`. The first non-noise line makes the
    chunk non-noise. An empty chunk, or one made only of blank/separator
    lines, counts as noise.
    """
    if not chunk_text:
        return True

    for begin, stop in _iter_line_spans(chunk_text):
        raw = chunk_text[begin:stop].rstrip("\n")
        trimmed = raw.strip()
        if not trimmed or _SEP_RE.match(trimmed):
            continue
        if _NOISE_LINE_RE.match(raw):
            continue

        key = _collapse_ws(raw).lower()
        if key in noise_keys or _mask_digits(key) in noise_keys:
            continue

        return False

    return True

# ==================== Helpers emplacement (page) ====================
_WS_RE = re.compile(r"\s+", flags=re.UNICODE)

def _nonspace_len(s: str) -> int:
    return len(_WS_RE.sub("", s or ""))

def _line_col_from_offset(text: str, off: int):
    if off < 0:
        off = 0
    if off > len(text):
        off = len(text)
    line = text.count("\n", 0, off) + 1
    last_nl = text.rfind("\n", 0, off)
    col = off if last_nl < 0 else (off - last_nl - 1)
    return line, col

# ==================== Metadonnées depuis DOCS / TEXT_DOCS ====================
def _safe_str(x):
    try:
        return str(x)
    except Exception:
        return ""

def _unique_keep_order(seq):
    seen = set()
    out = []
    for x in seq:
        if x not in seen:
            seen.add(x)
            out.append(x)
    return out

def _pdf_extract_pages_text(path: str):
    """Return the extracted text of every page of the PDF at `path`.

    Returns a list with one string per page ("" when a page yields no text),
    or None when no PDF reader is available or reading fails (best-effort).
    """
    reader_cls = _get_pdf_reader()  # defined in the extraction cell
    if reader_cls is None:
        return None
    try:
        return [(page.extract_text() or "") for page in reader_cls(path).pages]
    except Exception:
        return None

def _pdf_page_count(path: str):
    """Return the number of pages of the PDF at ``path``.

    Returns None when no PDF reader is available or the file cannot be read.
    """
    reader_cls = _get_pdf_reader()
    if reader_cls is None:
        return None
    try:
        document = reader_cls(path)
        return len(document.pages)
    except Exception:
        # Best effort: any failure is reported as an unknown page count.
        return None

# ==================== Vérifier FINAL_DOCS ====================
# Guard: this cell requires the FINAL_DOCS list created by the previous cell.
if not isinstance(globals().get("FINAL_DOCS"), list):
    raise RuntimeError("FINAL_DOCS not found. Exécute d'abord la cellule précédente (celle qui imprime FINAL PRINT).")

# ==================== Construire une structure DOC -> PAGES ====================
# DOC_PACK: one entry per document, each holding its list of pages with text.
DOC_PACK = []

# 1) OCR: DOCS (if available)
if "DOCS" in globals() and isinstance(DOCS, list):
    for d in DOCS:
        doc_id = d.get("doc_id")
        filename = d.get("filename") or "unknown"
        pages = d.get("pages", []) or []
        # Fall back to the number of pages present when no (truthy) total is given.
        page_count_total = d.get("page_count_total") if d.get("page_count_total") else len(pages)

        # Collect the distinct source paths of the pages, in first-seen order.
        paths = []
        for p in pages:
            sp = p.get("source_path") or p.get("path")
            if sp:
                paths.append(_safe_str(sp))
        paths = _unique_keep_order(paths)

        pages_out = []
        for p in pages:
            pages_out.append({
                # A missing or falsy page_index (including 0) becomes 1.
                "page_index": int(p.get("page_index") or 1),
                "text": p.get("ocr_text") or "",
                "source_path": _safe_str(p.get("source_path") or p.get("path") or ""),
            })
        pages_out.sort(key=lambda x: x["page_index"])

        # Every entry coming from DOCS is labelled as image-only content read by OCR.
        DOC_PACK.append({
            "doc_id": doc_id,
            "filename": filename,
            "content": "image_only",
            "extraction": "ocr:tesseract",
            "paths": paths,
            "page_count_total": page_count_total,
            "pages": pages_out,
        })

# 2) NATIVE: TEXT_DOCS (si dispo)
# Native-text documents: build per-page entries from TEXT_DOCS when it exists.
if "TEXT_DOCS" in globals() and isinstance(TEXT_DOCS, list):
    for d in TEXT_DOCS:
        doc_id = d.get("doc_id")
        filename = d.get("filename") or "unknown"
        extraction = d.get("extraction") or "native:unknown"
        sp = d.get("source_path") or ""
        paths = _unique_keep_order([_safe_str(sp)]) if sp else []
        full_text = d.get("text") or ""

        pages_out = []
        page_count_total = d.get("page_count_total", None)
        pages_text = d.get("pages_text", None)

        if pages_text is not None and isinstance(pages_text, list) and len(pages_text) > 0:
            # Per-page text already provided: one page entry per item, numbered from 1.
            page_count_total = page_count_total or len(pages_text)
            for i2, txt in enumerate(pages_text, start=1):
                pages_out.append({
                    "page_index": i2,
                    "text": txt or "",
                    "source_path": _safe_str(sp),
                })
        else:
            if sp and str(sp).lower().endswith(".pdf") and Path(sp).exists():
                # No per-page text but the PDF is on disk: re-extract it page by page.
                pages_text2 = _pdf_extract_pages_text(sp)
                if pages_text2:
                    page_count_total = page_count_total or len(pages_text2)
                    for i2, txt in enumerate(pages_text2, start=1):
                        pages_out.append({
                            "page_index": i2,
                            "text": txt or "",
                            "source_path": _safe_str(sp),
                        })
                else:
                    # Extraction failed or returned nothing: whole text as a single page.
                    pages_out.append({
                        "page_index": 1,
                        "text": full_text,
                        "source_path": _safe_str(sp),
                    })
                    page_count_total = page_count_total or 1
            else:
                # Not an existing PDF: the whole text is treated as a single page.
                pages_out.append({
                    "page_index": 1,
                    "text": full_text,
                    "source_path": _safe_str(sp),
                })
                page_count_total = page_count_total or 1

        # NOTE(review): every branch above leaves page_count_total truthy, so this
        # lookup appears unreachable with the current code — confirm before removing.
        if page_count_total is None and sp and str(sp).lower().endswith(".pdf") and Path(sp).exists():
            pc = _pdf_page_count(sp)
            if pc is not None:
                page_count_total = pc

        DOC_PACK.append({
            "doc_id": doc_id,
            "filename": filename,
            "content": "text",
            "extraction": extraction,
            "paths": paths,
            "page_count_total": page_count_total,
            "pages": pages_out,
        })

# 3) Fallback à FINAL_DOCS
# Neither DOCS nor TEXT_DOCS produced an entry: wrap each FINAL_DOCS item
# as a one-page document with no known path.
if not DOC_PACK:
    for d in FINAL_DOCS:
        DOC_PACK.append({
            "doc_id": d.get("doc_id"),
            "filename": d.get("filename") or "unknown",
            "content": d.get("content"),
            "extraction": d.get("extraction"),
            "paths": [],
            "page_count_total": 1,
            "pages": [{"page_index": 1, "text": d.get("text") or "", "source_path": ""}],
        })

# ==================== Tokeniser: construire TOK_DOCS ====================
# TOK_DOCS: one entry per DOC_PACK document, with its pages split into layout chunks.
TOK_DOCS = []

for doc in DOC_PACK:
    doc_id = doc.get("doc_id")
    filename = doc.get("filename") or "unknown"
    extraction = doc.get("extraction") or ""
    content_type = doc.get("content")
    paths = doc.get("paths") or []
    page_count_total = doc.get("page_count_total")

    # Noise keys are computed once per document from the text of all its pages.
    pages_text_for_noise = [(p.get("text") or "") for p in (doc.get("pages") or [])]
    noise_keys = build_noise_keys_for_doc(pages_text_for_noise)

    pages_tok = []
    doc_chars_total = 0
    recompose_ok_doc = True

    for pg in (doc.get("pages") or []):
        page_index = int(pg.get("page_index") or 1)
        page_text = pg.get("text") or ""
        doc_chars_total += len(page_text)

        lang = detect_lang(page_text)

        items = layout_items(page_text, lang, extraction=extraction)
        # A page containing any multi-column, table or header item is flagged recompose_ok=False,
        # and one such page flags the whole document.
        recompose_ok = False if any(it.get("layout_kind") in ("multicol_col", "multicol_grid", "table", "header") for it in items) else True
        if not recompose_ok:
            recompose_ok_doc = False

        sent_items = []
        for it in items:
            chunk = it["text"]
            start = int(it.get("start", 0))
            end = int(it.get("end", start + len(chunk)))

            # Line/column of the chunk start inside the page text.
            line, col = _line_col_from_offset(page_text, start)
            nonspace = _nonspace_len(chunk)

            is_noise = chunk_is_noise(chunk, noise_keys)

            # Structured layout items only need one non-space character to count as a sentence;
            # other items must reach MIN_SENTENCE_NONSPACE.
            if it.get("layout_kind") in ("multicol_col", "multicol_grid", "table", "header"):
                is_sentence = (not is_noise) and (nonspace >= 1)
            else:
                is_sentence = (nonspace >= MIN_SENTENCE_NONSPACE) and (not is_noise)

            sent_items.append({
                "text": chunk,
                "start": start,
                "end": end,
                "line": line,
                "col": col,
                "chars": len(chunk),
                "nonspace": nonspace,
                "is_noise": is_noise,
                "is_sentence": is_sentence,
                "spans": it.get("spans", []),
                "layout_kind": it.get("layout_kind", "plain"),
                "col_index": it.get("col_index", None),
                "table_rows": it.get("table_rows", None),
                # NOTE(review): header_rows / header_cells are read from the "table_rows" /
                # "table_cells" keys — confirm against layout_items that this is intended
                # and not a copy-paste of the line above.
                "header_rows": it.get("table_rows", None),
                "header_cells": it.get("table_cells", None),
                "header_source": it.get("header_source", None),
            })

        pages_tok.append({
            "page_index": page_index,
            "source_path": pg.get("source_path") or "",
            "lang": lang,
            "chars": len(page_text),
            "recompose_ok": recompose_ok,
            "sentences_layout": sent_items,
            "page_text": page_text,
        })

    pages_tok.sort(key=lambda x: x["page_index"])

    TOK_DOCS.append({
        "doc_id": doc_id,
        "filename": filename,
        "paths": paths,
        "page_count_total": page_count_total,
        "content": content_type,
        "extraction": extraction,
        "pages": pages_tok,
        "chars_total": doc_chars_total,
        "recompose_ok": recompose_ok_doc,
    })

def _sort_key(x):
    p = (x.get("paths") or [""])[0]
    return (x.get("filename") or "", str(p))

# Order documents by (filename, first path).
TOK_DOCS.sort(key=_sort_key)

# Lookup by doc_id (documents without a truthy doc_id are left out).
TOK_BY_ID = {d["doc_id"]: d for d in TOK_DOCS if d.get("doc_id")}
# Lookup by filename; several documents may share the same filename.
TOK_BY_FILENAME = {}
for d in TOK_DOCS:
    TOK_BY_FILENAME.setdefault(d["filename"], []).append(d)

def _select_doc(target):
    if target is None:
        return TOK_DOCS
    if isinstance(target, int):
        if 0 <= target < len(TOK_DOCS):
            return [TOK_DOCS[target]]
        raise IndexError(f"TARGET index out of range: {target} (0..{len(TOK_DOCS)-1})")
    if isinstance(target, str):
        t = target.strip()
        if t in TOK_BY_ID:
            return [TOK_BY_ID[t]]
        if t in TOK_BY_FILENAME:
            return TOK_BY_FILENAME[t]
        hits = []
        for d in TOK_DOCS:
            if t.lower() in (d.get("filename","").lower()):
                hits.append(d)
                continue
            for p in d.get("paths") or []:
                if t.lower() in str(p).lower():
                    hits.append(d)
                    break
        if hits:
            return hits
        raise ValueError(f"No document matches TARGET='{target}' (by doc_id/filename/path).")
    raise TypeError("TARGET must be None, int, or str")

def print_one_doc(doc):
    """Print a summary of one tokenized document, then optionally its pages and chunks.

    The amount of output depends on the notebook-level settings PRINT_SENTENCES,
    PRINT_PAGE_TEXT, PRINT_ONLY_SENTENCES, PRINT_REPR and MAX_SENTENCES_PREVIEW.
    """
    wide_rule = "-" * 120

    # ---- document header ----
    print("=" * 120)
    print(f"[doc] {doc['filename']}")
    print(f"  doc_id       : {doc.get('doc_id')}")
    print(f"  content      : {doc.get('content')}")
    print(f"  extraction   : {doc.get('extraction')}")
    print(f"  pages_total  : {doc.get('page_count_total')}")
    print(f"  chars_total  : {doc.get('chars_total')}")
    print(f"  recompose_ok : {doc.get('recompose_ok')}")
    print("  paths:")
    if not doc.get("paths"):
        print("    - (unknown)")
    else:
        for p in doc["paths"]:
            print(f"    - {p}")
    print(wide_rule)

    if not PRINT_SENTENCES:
        return

    # ---- one section per page ----
    for pg in (doc.get("pages") or []):
        print(f"[page {pg['page_index']}/{doc.get('page_count_total') or '?'}] source_path={pg.get('source_path')}")
        print(f"  lang         : {pg.get('lang')}")
        print(f"  chars        : {pg.get('chars')}")
        print(wide_rule)

        if PRINT_PAGE_TEXT:
            print(pg.get("page_text") or "")
            print(wide_rule)

        sent_items = pg.get("sentences_layout") or []
        total_all = len(sent_items)
        total_noise = len([s for s in sent_items if s.get("is_noise")])
        total_sentence = len([s for s in sent_items if s.get("is_sentence")])

        # Default view: every chunk. With PRINT_ONLY_SENTENCES the view is narrowed to
        # sentences, unless that would hide all chunks of a non-empty page (fallback).
        view = list(sent_items)
        fallback_used = False
        if PRINT_ONLY_SENTENCES:
            only_sentences = [s for s in sent_items if s.get("is_sentence")]
            if only_sentences or total_all == 0:
                view = only_sentences
            else:
                fallback_used = True

        total_view = len(view)
        if MAX_SENTENCES_PREVIEW is None:
            show = total_view
        else:
            show = min(total_view, MAX_SENTENCES_PREVIEW)

        print(
            f"  sentences_layout: {total_all} chunks total | "
            f"sentences={total_sentence} | noise={total_noise} | "
            f"showing {show}/{total_view} "
            f"(filter_is_sentence={PRINT_ONLY_SENTENCES}, fallback={fallback_used}, min_nonspace={MIN_SENTENCE_NONSPACE})"
        )
        print(wide_rule)

        for i2, s in zip(range(show), view):
            chunk = s["text"]
            print(
                f"[sent {i2+1}/{total_view}] page={pg['page_index']} start={s['start']} end={s['end']} "
                f"line={s['line']} col={s['col']} chars={s['chars']} nonspace={s['nonspace']} "
                f"is_noise={s.get('is_noise')} is_sentence={s['is_sentence']} layout={s.get('layout_kind')}"
            )
            # Avoid a doubled line break when the chunk already ends with one.
            print(chunk, end="" if chunk.endswith("\n") else "\n")
            if PRINT_REPR:
                print("repr:", repr(chunk))
            print("-" * 80)

        if MAX_SENTENCES_PREVIEW is not None and total_view > show:
            print(f"... {total_view - show} chunks restants non affichés (MAX_SENTENCES_PREVIEW={MAX_SENTENCES_PREVIEW})")

        print()

# ==================== Exécution ====================
# Resolve TARGET into the list of documents to display; `selected` stays available
# as a global for the following cells.
selected = _select_doc(TARGET)

if not selected:
    print("[info] Aucun document à traiter.")
else:
    for doc in selected:
        print_one_doc(doc)


[doc] testword.docx
  doc_id       : 4bdf95b6-9ade-4d32-8dea-b58bd8c864bb
  content      : text
  extraction   : native:docx:xml
  pages_total  : 1
  chars_total  : 174
  recompose_ok : True
  paths:
    - c:\Users\moura\OneDrive\Bureau\DMS\test\documents\testword.docx
------------------------------------------------------------------------------------------------------------------------
[page 1/1] source_path=c:\Users\moura\OneDrive\Bureau\DMS\test\documents\testword.docx
  lang         : fr
  chars        : 174
------------------------------------------------------------------------------------------------------------------------
  sentences_layout: 1 chunks total | sentences=1 | noise=0 | showing 1/1 (filter_is_sentence=True, fallback=False, min_nonspace=12)
------------------------------------------------------------------------------------------------------------------------
[sent 1/1] page=1 start=0 end=174 line=1 col=0 chars=174 nonspace=142 is_noise=False is_sentence=True layou

### Attribuer une catégorie grammaticale à chaque mot (étiquetage POS) — jeu d'étiquettes : NN, NNS, VB, VBD, JJ, ...

In [43]:
# =========================
# 1) Chemin vers tes .py
# =========================
import sys, types, re, importlib

BASE_DIR = r"C:\Users\moura\OneDrive\Bureau\DMS\test"  # folder containing engcode.py / frcode.py / arabcode.py
# Put BASE_DIR first on the import path so those scripts can be imported.
if BASE_DIR not in sys.path:
    sys.path.insert(0, BASE_DIR)

# =========================
# 2) Small in-memory "nb_utils" module (no need to create nb_utils.py)
#    -> used by run_from_previous_cell() in the scripts
# =========================
nb_utils = types.ModuleType("nb_utils")

# Any character from the Arabic Unicode ranges listed in the pattern.
_AR_RE = re.compile(r"[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]")
# Runs of Latin letters, accented letters included.
_WORD_RE = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]+", flags=re.UNICODE)
# Hint words counted in favour of French / English.
_FR_HINT = {"le","la","les","des","une","un","est","avec","pour","dans","sur","facture","date","total","tva","montant"}
_EN_HINT = {"the","and","to","of","in","is","for","with","invoice","date","total","vat","amount"}

def detect_lang(text: str) -> str:
    """Guess the language of ``text``: "ar", "fr" or "en".

    - Any Arabic character anywhere in the text gives "ar".
    - Otherwise the Latin words of the first 8000 characters are compared with
      the French and English hint sets; a French accented letter anywhere in
      the text adds one point to French.
    - No Latin word at all gives "en"; a tie between the scores gives "fr".
    """
    sample = text or ""
    if _AR_RE.search(sample) is not None:
        return "ar"

    tokens = [tok.lower() for tok in _WORD_RE.findall(sample[:8000])]
    if not tokens:
        return "en"

    fr_hits = len([tok for tok in tokens if tok in _FR_HINT])
    en_hits = len([tok for tok in tokens if tok in _EN_HINT])
    if re.search(r"[éèêàùçôîï]", sample.lower()) is not None:
        fr_hits += 1

    if en_hits > fr_hits:
        return "en"
    return "fr"

def get_previous_cell_input():
    """Return the first non-None global among selected, TOK_DOCS, FINAL_DOCS, DOCS, TEXT_DOCS and ``_``.

    Returns None when none of these names holds a value.
    """
    scope = globals()
    candidate_names = ("selected", "TOK_DOCS", "FINAL_DOCS", "DOCS", "TEXT_DOCS", "_")
    values = (scope.get(name) for name in candidate_names)
    return next((value for value in values if value is not None), None)

def iter_sentences_from_input(data):
    """
    Yield ``(doc_name, page_idx, sent_idx, sent_text)`` for each sentence found in ``data``.

    Supported inputs:
    - list of documents with a "pages" key (TOK_DOCS / selected): one tuple per
      item of each page's "sentences_layout" (or "sentences" / "chunks");
      dict items flagged ``is_sentence=False`` are skipped, non-dict items are
      converted with ``str``;
    - list of dicts with a "text" key (FINAL_DOCS): one tuple per document,
      with ``page_idx`` and ``sent_idx`` set to None;
    - a single dict with a "text" key;
    - a plain string.

    ``None`` and an empty list yield nothing, so an empty selection is handled
    the same way as missing data instead of being rejected as an unknown format.

    Raises:
        TypeError: when the input format is not supported (raised on first
            iteration, since this is a generator).
    """
    if data is None:
        return
    # An empty list has no first element to inspect: there is nothing to yield.
    if isinstance(data, list) and not data:
        return

    # Lists are classified by looking at their first element only.
    first = data[0] if isinstance(data, list) else None

    # Case 1: list of docs with pages (TOK_DOCS / selected)
    if isinstance(first, dict) and "pages" in first:
        for d_i, doc in enumerate(data):
            doc_name = doc.get("filename") or doc.get("doc_id") or f"doc#{d_i}"
            for p_i, pg in enumerate(doc.get("pages") or []):
                page_idx = pg.get("page_index", pg.get("page", p_i+1))
                sent_items = pg.get("sentences_layout") or pg.get("sentences") or pg.get("chunks") or []
                for s_i, s in enumerate(sent_items):
                    if isinstance(s, dict):
                        # Only an explicit False excludes the chunk; a missing flag keeps it.
                        if s.get("is_sentence") is False:
                            continue
                        sent = s.get("text") or ""
                    else:
                        sent = str(s)
                    yield doc_name, page_idx, s_i, sent
        return

    # Case 2: FINAL_DOCS: list[{text, filename?}]
    if isinstance(first, dict) and "text" in first:
        for i, d in enumerate(data):
            doc_name = d.get("filename") or d.get("doc_id") or f"doc#{i}"
            yield doc_name, None, None, d.get("text") or ""
        return

    # Case 3: dict {text: ...}
    if isinstance(data, dict) and "text" in data:
        doc_name = data.get("filename") or data.get("doc_id") or "doc"
        yield doc_name, None, None, data.get("text") or ""
        return

    # Case 4: plain string
    if isinstance(data, str):
        yield "text", None, None, data
        return

    raise TypeError(f"Format d'entrée non supporté: {type(data)}")

# Expose the helpers on the in-memory module, then register it in sys.modules
# so that "import nb_utils" resolves to it.
nb_utils.detect_lang = detect_lang
nb_utils.get_previous_cell_input = get_previous_cell_input
nb_utils.iter_sentences_from_input = iter_sentences_from_input
sys.modules["nb_utils"] = nb_utils  # makes "import nb_utils" possible

# =========================
# 3) Import + reload tes 3 modules
# =========================
import engcode
import frcode

# arabcode may fail to import if camel_tools is not installed => skip it cleanly
try:
    import arabcode
    HAVE_AR = True
except Exception as e:
    HAVE_AR = False
    print("[warn] arabcode.py non chargé (dépendances manquantes ?). Détail:", e)

# Reload so that edits made to the .py files are picked up without restarting the kernel.
importlib.reload(engcode)
importlib.reload(frcode)
if HAVE_AR:
    importlib.reload(arabcode)

# =========================
# 4) Exécution: chaque script filtre sa langue et print son output
# =========================
# Input comes from the first available global among selected / TOK_DOCS / FINAL_DOCS / ...
data = get_previous_cell_input()
if data is None:
    raise RuntimeError("Je ne trouve pas de données d'entrée. Assure-toi que la cellule précédente crée 'selected' (ou FINAL_DOCS / TOK_DOCS).")

MAX_SENTENCES_PER_LANG = None  # e.g. 30 for debugging, or None for everything

# The same data is passed to each language script in turn.
print("\n" + "="*120)
print("RUN EN (engcode.py)")
print("="*120)
engcode.run_from_previous_cell(data=data, max_sentences=MAX_SENTENCES_PER_LANG)

print("\n" + "="*120)
print("RUN FR (frcode.py)")
print("="*120)
frcode.run_from_previous_cell(data=data, max_sentences=MAX_SENTENCES_PER_LANG)

# The Arabic run only happens when arabcode was imported successfully.
if HAVE_AR:
    print("\n" + "="*120)
    print("RUN AR (arabcode.py)")
    print("="*120)
    arabcode.run_from_previous_cell(data=data, max_sentences=MAX_SENTENCES_PER_LANG)


Python kernel: C:\Users\moura\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe
Local deps dir: (absent)
Python kernel: C:\Users\moura\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe
Local deps dir: (absent)
Python: C:\Users\moura\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe
Local deps: (absent)
Type an Arabic sentence (empty line to stop). You can paste ONE sentence or MANY quoted sentences.

RUN EN (engcode.py)

RUN FR (frcode.py)

##########################################################################################
DOC=testword.docx | page=1 | sent=0 | lang=fr
##########################################################################################
INPUT: Équipée d'un moteur V10 de 620 chevaux, l'Audi R8 passe de 0 à 100 km/h en moins de 3,5 secondes, affirmant ainsi sa place de supercar emblématique de la marque aux anneaux.
  

### topic extraction 

## Classification

In [20]:
# Affiche :
# [classification] <filename> -> best=<DOC_TYPE> | status=<OK/REVIEW> | scores: {...}

import sys, json, re, unicodedata, uuid, ast
from pathlib import Path
from typing import Dict, Any, List, Optional

# ========= CONFIG =========
BASE_DIR = r"C:\Users\moura\OneDrive\Bureau\DMS\test"  # where the files and the "classification" folder live
# Use BASE_DIR/classification when it exists, otherwise a relative "classification" folder.
CLASSIFICATION_DIR = (Path(BASE_DIR) / "classification") if (Path(BASE_DIR) / "classification").exists() else Path("classification")
COMMON_PATH = CLASSIFICATION_DIR / "common.json"

if BASE_DIR not in sys.path:
    sys.path.insert(0, BASE_DIR)

# ========= HELPERS =========
def _load_json(path: Path):
    try:
        return json.loads(path.read_text(encoding="utf-8"))
    except Exception:
        return None

def _strip_accents(s: str) -> str:
    return "".join(
        c for c in unicodedata.normalize("NFD", s or "")
        if unicodedata.category(c) != "Mn"
    )

def _norm_text(s: str) -> str:
    """Normalise text for matching: accents removed, upper-cased, whitespace runs collapsed to one space."""
    unaccented = _strip_accents(s)
    words = (unaccented or "").upper().split()
    return " ".join(words)

def _ensure_kw_dict(d: Dict[str, Any]) -> Dict[str, List[str]]:
    kw = d.get("keywords")
    if isinstance(kw, dict):
        out = {"strong": [], "medium": [], "weak": [], "negative": [], "strong_negative": []}
        for k, v in kw.items():
            if isinstance(v, list):
                kk = str(k).lower()
                if kk in out:
                    out[kk] = [str(x).upper() for x in v]
        return out
    if isinstance(kw, list):
        return {"strong": [], "medium": [], "weak": [str(x).upper() for x in kw], "negative": [], "strong_negative": []}
    if isinstance(d, dict) and all(isinstance(v, list) for v in d.values()):
        flat = []
        for v in d.values():
            flat.extend(v)
        return {"strong": [], "medium": [], "weak": [str(x).upper() for x in flat], "negative": [], "strong_negative": []}
    return {"strong": [], "medium": [], "weak": [], "negative": [], "strong_negative": []}

def load_classification_configs():
    """Load the shared settings and the per-class keyword configs.

    Returns:
        ``(common, configs)``: ``common`` holds the built-in defaults updated
        with the content of COMMON_PATH when that file contains a JSON object;
        ``configs`` maps each upper-cased doc type to its
        ``{"doc_type", "keywords", "priority"}`` entry, built from every
        ``*.json`` file of CLASSIFICATION_DIR (files that are not JSON objects
        and the "COMMON" type are skipped).
    """
    common = {
        "weights": {"strong": 5, "medium": 2, "weak": 1},
        "global_penalties": {"negative": -2, "strong_negative": -5},
        "threshold": 6,
        "margin": 3,
        "tie_breaker": "priority",
    }
    overrides = _load_json(COMMON_PATH) if COMMON_PATH.exists() else None
    if isinstance(overrides, dict):
        common.update(overrides)

    configs = {}
    json_files = sorted(CLASSIFICATION_DIR.glob("*.json")) if CLASSIFICATION_DIR.exists() else []
    for json_file in json_files:
        raw_cfg = _load_json(json_file)
        if not isinstance(raw_cfg, dict):
            continue
        # The doc type defaults to the file name without extension.
        doc_type = str(raw_cfg.get("doc_type") or json_file.stem).upper()
        if doc_type == "COMMON":
            continue
        configs[doc_type] = {
            "doc_type": doc_type,
            "keywords": _ensure_kw_dict(raw_cfg),
            "priority": int(raw_cfg.get("priority", 0) or 0),
        }
    return common, configs

def _get_previous_cell_input():
    g = globals()
    for k in ("selected", "TOK_DOCS", "FINAL_DOCS", "DOCS", "TEXT_DOCS", "_"):
        if k in g and g[k] is not None:
            return g[k]
    return None

def _build_DOCS_from_input(data) -> List[Dict[str, Any]]:
    # Cas: list docs avec pages (TOK_DOCS/selected)
    if isinstance(data, list) and data and isinstance(data[0], dict) and "pages" in data[0]:
        out = []
        for i, doc in enumerate(data):
            name = doc.get("filename") or doc.get("doc_id") or f"doc#{i}"
            pages_out = []
            for p_i, pg in enumerate(doc.get("pages") or []):
                page_index = pg.get("page_index", pg.get("page", p_i + 1))
                txt = pg.get("ocr_text")
                if not txt:
                    sent_items = pg.get("sentences_layout") or pg.get("sentences") or pg.get("chunks") or []
                    parts = []
                    for s in sent_items:
                        if isinstance(s, dict):
                            if s.get("is_sentence") is False:
                                continue
                            parts.append(s.get("text") or "")
                        else:
                            parts.append(str(s))
                    txt = "\n".join([x for x in parts if x])
                pages_out.append({"page_index": page_index, "ocr_text": txt or ""})
            out.append({"filename": name, "pages": pages_out})
        return out

    # Cas: FINAL_DOCS list[{text,...}]
    if isinstance(data, list) and data and isinstance(data[0], dict) and "text" in data[0]:
        out = []
        for i, d in enumerate(data):
            name = d.get("filename") or d.get("doc_id") or f"doc#{i}"
            out.append({"filename": name, "pages": [{"page_index": 1, "ocr_text": d.get("text") or ""}]})
        return out

    # Cas: dict {text:...}
    if isinstance(data, dict) and "text" in data:
        name = data.get("filename") or data.get("doc_id") or "doc"
        return [{"filename": name, "pages": [{"page_index": 1, "ocr_text": data.get("text") or ""}]}]

    # Cas: string
    if isinstance(data, str):
        return [{"filename": "text", "pages": [{"page_index": 1, "ocr_text": data}]}]

    raise TypeError(f"Format d'entrée non supporté: {type(data)}")

def classify_scores(DOCS: List[Dict[str, Any]], common: Dict[str, Any], configs: Dict[str, Any]) -> None:
    """Compute, for every document, one keyword score per document type.

    For each page, the normalised page text (``_norm_text``) is searched for
    each keyword of each type; every keyword found adds the weight of its level
    ("strong" / "medium" / "weak" from ``common["weights"]``, "negative" /
    "strong_negative" from ``common["global_penalties"]``). Page scores are
    summed per document and stored in ``doc["scores"]`` (the documents are
    modified in place; nothing is returned).

    Keywords are accent-folded and upper-cased before matching, because the
    page text has its accents removed: without that, a keyword containing an
    accented letter could never be found.

    Args:
        DOCS: documents shaped like ``{"pages": [{"ocr_text": str}, ...]}``.
        common: shared settings (weights and penalties, with built-in defaults).
        configs: mapping doc type -> config holding a "keywords" dict per level.
    """
    weights = common.get("weights", {"strong": 5, "medium": 2, "weak": 1})
    penalties = common.get("global_penalties", {"negative": -2, "strong_negative": -5})

    levels = (
        ("strong", int(weights.get("strong", 5))),
        ("medium", int(weights.get("medium", 2))),
        ("weak", int(weights.get("weak", 1))),
        ("negative", int(penalties.get("negative", -2))),
        ("strong_negative", int(penalties.get("strong_negative", -5))),
    )

    def _norm_kw(k) -> str:
        # Same accent folding and case as the page text; spaces are kept as written.
        return _strip_accents(str(k)).upper()

    # Normalise every keyword once, instead of once per page.
    prepared = {}
    for dt, cfg in configs.items():
        kw = cfg["keywords"]
        prepared[dt] = [
            ([nk for nk in (_norm_kw(k) for k in (kw.get(level) or [])) if nk], delta)
            for level, delta in levels
        ]

    for doc in DOCS:
        scores_doc = dict.fromkeys(configs, 0)
        for page in doc.get("pages", []):
            text = _norm_text(page.get("ocr_text", ""))
            for dt, weighted in prepared.items():
                scores_doc[dt] += sum(
                    delta
                    for keywords, delta in weighted
                    for k in keywords
                    if k in text
                )
        doc["scores"] = scores_doc

# ========= DECISION =========
def decide(scores: Dict[str, int], configs: Dict[str, Any], common: Dict[str, Any]) -> Dict[str, Any]:
    """Pick the best document type from ``scores`` and say whether it can be trusted.

    Types are ranked by score (descending), then priority (descending), then
    name (ascending). The top type is accepted (status "OK") only when its
    score is positive, reaches ``common["threshold"]`` and leads the runner-up
    by at least ``common["margin"]``; otherwise the result is "UNCLASSIFIED"
    with status "REVIEW".

    Returns:
        dict with keys best, status, top_score, second_score, diff and
        scores_stable (integer score of every type listed in ``configs``).
    """
    min_score = int(common.get("threshold", 6))
    min_gap = int(common.get("margin", 3))

    priority_of = {dt: int((configs.get(dt, {}) or {}).get("priority", 0) or 0) for dt in configs}
    scores_stable = {dt: int(scores.get(dt, 0)) for dt in configs}

    def _rank(dt):
        # score desc, priority desc, name asc
        return (-scores_stable[dt], -priority_of[dt], dt)

    ranked = sorted(scores_stable, key=_rank)
    if ranked:
        top_type = ranked[0]
        top_score = scores_stable[top_type]
    else:
        top_type, top_score = "UNCLASSIFIED", 0
    second_score = scores_stable[ranked[1]] if len(ranked) > 1 else 0
    diff = top_score - second_score

    confident = top_score > 0 and top_score >= min_score and diff >= min_gap
    if confident:
        best, status = top_type, "OK"
    else:
        best, status = "UNCLASSIFIED", "REVIEW"

    return {
        "best": best,
        "status": status,
        "top_score": top_score,
        "second_score": second_score,
        "diff": diff,
        "scores_stable": scores_stable
    }

# ========= RUN =========
# Input comes from the first available global among selected / TOK_DOCS / FINAL_DOCS / ...
data = _get_previous_cell_input()
if data is None:
    raise RuntimeError("Je ne trouve pas de données d'entrée (selected / TOK_DOCS / FINAL_DOCS / DOCS / TEXT_DOCS).")

common, configs = load_classification_configs()
if not configs:
    raise RuntimeError(f"Aucune classe trouvée dans: {CLASSIFICATION_DIR}")

# NOTE(review): this rebinds the global DOCS. The tokenizer cell tests
# `"DOCS" in globals()` and reads DOCS as OCR documents, so re-running it after
# this cell would pick up these classification documents instead — confirm
# whether a distinct name should be used here.
DOCS = _build_DOCS_from_input(data)
classify_scores(DOCS, common, configs)

# Display order of the scores: preferred types first (when configured), then the others.
preferred = ["ARTICLE", "BON_DE_COMMANDE", "CONTRAT", "FACTURE", "FORMULAIRE"]
order = [c for c in preferred if c in configs] + [c for c in configs.keys() if c not in preferred]

RESULTS = []
for doc in DOCS:
    scores = doc.get("scores", {}) or {}
    ordered_scores = {k: int(scores.get(k, 0)) for k in order}

    d = decide(scores, configs, common)
    best, status = d["best"], d["status"]

    # ---- readable, compact output (the chosen class is visible at last)
    print(f"[classification] {doc.get('filename')} -> best={best} | status={status} | scores: {ordered_scores}")

    # (optional) keep the detailed result without printing it
    doc["result"] = {
        # A fresh uuid4 is generated when the doc dict carries no doc_id.
        "doc_id": doc.get("doc_id") or str(uuid.uuid4()),
        "filename": doc.get("filename"),
        "doc_type": best,
        "status": status,
        "scores": d["scores_stable"],
        "threshold": int(common.get("threshold", 6)),
        "margin": int(common.get("margin", 3)),
        "decision_debug": {
            "top_score": d["top_score"],
            "second_score": d["second_score"],
            "diff": d["diff"],
        }
    }
    RESULTS.append(doc["result"])


[classification] arab.docx -> best=UNCLASSIFIED | status=REVIEW | scores: {'ARTICLE': 0, 'BON_DE_COMMANDE': 0, 'CONTRAT': 0, 'FACTURE': 0, 'FORMULAIRE': 2}
[classification] englais.docx -> best=UNCLASSIFIED | status=REVIEW | scores: {'ARTICLE': 0, 'BON_DE_COMMANDE': 0, 'CONTRAT': 0, 'FACTURE': 0, 'FORMULAIRE': 0}
[classification] francais.docx -> best=UNCLASSIFIED | status=REVIEW | scores: {'ARTICLE': 2, 'BON_DE_COMMANDE': 0, 'CONTRAT': 1, 'FACTURE': 0, 'FORMULAIRE': 2}


## Produire une sortie JSON 