## Pipeline OCR (Tesseract + OpenCV).

Langue (par défaut FR, mais bascule en EN si détecté)

Par défaut, l’OCR est en français :

DEFAULT_LANG = "fra" (côté Tesseract)

spacy.load("fr_core_news_sm", ...) (côté spaCy)

Si tu détectes que le texte est en anglais, tu fais basculer :

DEFAULT_LANG = "eng" (ou fra+eng si tu veux tolérer les deux)

spacy.load("en_core_web_sm", ...)

Trucs à modifier quand tu changes de langue :

la constante DEFAULT_LANG

le modèle spaCy chargé (fr_core_news_sm ↔ en_core_web_sm)

---

Fonctionnement global du script

Prendre une image (INPUT_FILE)

L’améliorer via le prétraitement (gris, upscale, contraste/sharpness, seuil, etc.)

Lancer Tesseract sur l’image prétraitée pour extraire le texte (OCR_TEXT)

### importation img et prétraitement

In [21]:
from typing import Optional

INPUT_FILE: Optional[str] = "image2tab.webp"

In [22]:
"""
Dependencies:
  * Python 3.8+
  * pytesseract
  * pillow
  * Tesseract binary with tessdata
"""

import argparse
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, List, Optional, Tuple

import pytesseract
from PIL import Image, ImageEnhance, ImageFilter, ImageOps

try:
    import numpy as np  # type: ignore
except ImportError:  # pragma: no cover
    np = None

try:
    SCRIPT_DIR = Path(__file__).resolve().parent
except NameError:
    # In notebooks __file__ is undefined; fall back to current working directory.
    SCRIPT_DIR = Path.cwd()

DEFAULT_LANG = "fra"
DEFAULT_CONTRAST = 1.5
DEFAULT_SHARPNESS = 1.2
DEFAULT_BRIGHTNESS = 1.0
DEFAULT_UPSCALE = 1.5
DEFAULT_DPI = 300

 #/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
SHOW_PREPROCESSED = True


@dataclass
class EnhanceOptions:
    contrast: float = DEFAULT_CONTRAST
    sharpness: float = DEFAULT_SHARPNESS
    brightness: float = DEFAULT_BRIGHTNESS
    upscale: float = DEFAULT_UPSCALE
    gamma: Optional[float] = None  # gamma correction; <1 brightens darks, >1 darkens
    pad: int = 0  # pixels to pad around the image
    median: Optional[int] = None  # kernel size for median filter (odd int, e.g., 3)
    unsharp_radius: Optional[float] = None  # e.g., 1.0
    unsharp_percent: int = 150
    invert: bool = False
    autocontrast_cutoff: Optional[int] = None  # 0-100; percentage to clip for autocontrast
    equalize: bool = False  # histogram equalization
    auto_rotate: bool = False  # attempt orientation detection + rotate
    otsu: bool = False  # auto-threshold with Otsu (requires numpy)
    threshold: Optional[int] = None  # 0-255; if set, applies a binary threshold


def build_config(
    oem: Optional[int],
    psm: Optional[int],
    base_flags: Iterable[str],
    dpi: Optional[int],
    tessdata_dir: Optional[Path],
    user_words: Optional[Path],
    user_patterns: Optional[Path],
) -> str:
    parts: List[str] = []
    if oem is not None:
        parts.append(f"--oem {oem}")
    if psm is not None:
        parts.append(f"--psm {psm}")
    if dpi is not None:
        parts.append(f"--dpi {dpi}")
    if tessdata_dir is not None:
        parts.append(f'--tessdata-dir "{tessdata_dir}"')
    if user_words is not None:
        parts.append(f'--user-words "{user_words}"')
    if user_patterns is not None:
        parts.append(f'--user-patterns "{user_patterns}"')
    parts.extend(base_flags)
    return " ".join(parts)


def ensure_environment(lang: str) -> None:
    try:
        _ = pytesseract.get_tesseract_version()
    except pytesseract.TesseractNotFoundError:
        sys.exit("Tesseract binary not found on PATH. Install it and its language data.")
    if lang:
        try:
            available = set(pytesseract.get_languages(config=""))
            requested = set(lang.split("+"))
            missing = requested - available
            if missing:
                print(
                    f"Warning: missing languages: {', '.join(sorted(missing))}. "
                    f"Available: {', '.join(sorted(available))}",
                    file=sys.stderr,
                )
        except pytesseract.TesseractError:
            pass


def auto_rotate_if_needed(img: Image.Image, enhance: EnhanceOptions) -> Image.Image:
    if not enhance.auto_rotate:
        return img
    try:
        osd = pytesseract.image_to_osd(img)
        angle = None
        for line in osd.splitlines():
            if line.lower().startswith("rotate:"):
                try:
                    angle = int(line.split(":")[1].strip())
                except ValueError:
                    angle = None
                break
        if angle is not None and angle % 360 != 0:
            return img.rotate(-angle, expand=True)
    except Exception:
        pass
    return img


def preprocess_image(image: Image.Image, enhance: EnhanceOptions) -> Image.Image:
    img = image.convert("L")
    img = auto_rotate_if_needed(img, enhance)

    if enhance.invert:
        img = ImageOps.invert(img)

    if enhance.pad and enhance.pad > 0:
        img = ImageOps.expand(img, border=enhance.pad, fill=255)

    if enhance.autocontrast_cutoff is not None:
        cutoff = max(0, min(100, enhance.autocontrast_cutoff))
        img = ImageOps.autocontrast(img, cutoff=cutoff)

    if enhance.equalize:
        img = ImageOps.equalize(img)

    if enhance.upscale and enhance.upscale != 1.0:
        w, h = img.size
        img = img.resize((int(w * enhance.upscale), int(h * enhance.upscale)), Image.LANCZOS)

    if enhance.gamma and enhance.gamma > 0:
        inv_gamma = 1.0 / enhance.gamma
        lut = [pow(x / 255.0, inv_gamma) * 255 for x in range(256)]
        img = img.point(lut)

    if enhance.brightness and enhance.brightness != 1.0:
        img = ImageEnhance.Brightness(img).enhance(enhance.brightness)

    if enhance.contrast and enhance.contrast != 1.0:
        img = ImageEnhance.Contrast(img).enhance(enhance.contrast)

    if enhance.sharpness and enhance.sharpness != 1.0:
        img = ImageEnhance.Sharpness(img).enhance(enhance.sharpness)

    if enhance.unsharp_radius:
        img = img.filter(
            ImageFilter.UnsharpMask(
                radius=enhance.unsharp_radius,
                percent=enhance.unsharp_percent,
                threshold=0,
            )
        )

    if enhance.median and enhance.median > 1 and enhance.median % 2 == 1:
        img = img.filter(ImageFilter.MedianFilter(size=enhance.median))

    if enhance.threshold is not None:
        thr = max(0, min(255, enhance.threshold))
        img = img.point(lambda p, t=thr: 255 if p > t else 0, mode="1").convert("L")
    elif enhance.otsu and np is not None:
        arr = np.array(img, dtype=np.uint8)
        hist, _ = np.histogram(arr, bins=256, range=(0, 256))
        total = arr.size
        sum_total = np.dot(np.arange(256), hist)

        sum_b = 0.0
        w_b = 0.0
        max_var = 0.0
        threshold = 0

        for i in range(256):
            w_b += hist[i]
            if w_b == 0:
                continue
            w_f = total - w_b
            if w_f == 0:
                break
            sum_b += i * hist[i]
            m_b = sum_b / w_b
            m_f = (sum_total - sum_b) / w_f
            var_between = w_b * w_f * (m_b - m_f) ** 2
            if var_between > max_var:
                max_var = var_between
                threshold = i

        img = img.point(lambda p, t=threshold: 255 if p > t else 0, mode="1").convert("L")

    return img


def parse_args(argv: Optional[Iterable[str]] = None) -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--lang", default=DEFAULT_LANG)
    parser.add_argument("--oem", type=int, choices=range(0, 4), default=None)
    parser.add_argument("--psm", type=int, choices=range(0, 14), default=None)
    parser.add_argument("--dpi", type=int, default=DEFAULT_DPI)
    parser.add_argument("--tessdata-dir", type=Path, default=None)
    parser.add_argument("--user-words", type=Path, default=None)
    parser.add_argument("--user-patterns", type=Path, default=None)
    parser.add_argument("--whitelist", type=str, default=None)
    parser.add_argument("--blacklist", type=str, default=None)

    parser.add_argument("--contrast", type=float, default=DEFAULT_CONTRAST)
    parser.add_argument("--sharpness", type=float, default=DEFAULT_SHARPNESS)
    parser.add_argument("--brightness", type=float, default=DEFAULT_BRIGHTNESS)
    parser.add_argument("--upscale", type=float, default=DEFAULT_UPSCALE)
    parser.add_argument("--gamma", type=float, default=None)
    parser.add_argument("--pad", type=int, default=0)
    parser.add_argument("--threshold", type=int, default=None)
    parser.add_argument("--median", type=int, default=None)
    parser.add_argument("--unsharp-radius", type=float, default=None)
    parser.add_argument("--unsharp-percent", type=int, default=150)
    parser.add_argument("--invert", action="store_true")
    parser.add_argument("--autocontrast-cutoff", type=int, default=None)
    parser.add_argument("--equalize", action="store_true")
    parser.add_argument("--auto-rotate", action="store_true")
    parser.add_argument("--otsu", action="store_true")

    parser.add_argument(
        "--config",
        nargs="*",
        default=[],
        metavar="CFG",
        help="Additional configuration flags passed verbatim to tesseract (e.g., -c foo=bar).",
    )

    return parser.parse_args(list(argv) if argv is not None else [])


# --------- Exécution Cellule 1 (jusqu’à l’affichage) ---------

args = parse_args()
ensure_environment(args.lang)

enhance = EnhanceOptions(
    contrast=args.contrast,
    sharpness=args.sharpness,
    brightness=args.brightness,
    upscale=args.upscale,
    gamma=args.gamma,
    pad=args.pad,
    median=args.median,
    unsharp_radius=args.unsharp_radius,
    unsharp_percent=args.unsharp_percent,
    invert=args.invert,
    autocontrast_cutoff=args.autocontrast_cutoff,
    equalize=args.equalize,
    auto_rotate=args.auto_rotate,
    otsu=args.otsu,
    threshold=args.threshold,
)

config_flags: List[str] = list(args.config)
if args.whitelist:
    config_flags.append(f"-c tessedit_char_whitelist={args.whitelist}")
if args.blacklist:
    config_flags.append(f"-c tessedit_char_blacklist={args.blacklist}")

if not INPUT_FILE:
    sys.exit("INPUT_FILE is not set. Put your image filename in INPUT_FILE.")

path = Path(INPUT_FILE)
if not path.is_absolute():
    path = (SCRIPT_DIR / path).resolve()

if not path.exists():
    sys.exit(f"INPUT_FILE not found: {path}")

print(f"[info] Using INPUT_FILE={path}", file=sys.stderr)

original = Image.open(path)
prepped = preprocess_image(original, enhance)

# Afficher les 2 images (original + prétraitée)
original.show(title="original")
if "SHOW_PREPROCESSED" not in globals() or SHOW_PREPROCESSED:
    prepped.show(title="preprocessed")


[info] Using INPUT_FILE=C:\Users\moura\OneDrive\Bureau\DMS\test\image2tab.webp


### tesseract

In [23]:
config = build_config(
    args.oem,
    args.psm,
    config_flags,
    args.dpi,
    args.tessdata_dir,
    args.user_words,
    args.user_patterns,
)

OCR_TEXT = pytesseract.image_to_string(prepped, lang=args.lang, config=config)
print(OCR_TEXT)


FACTURE

CODE CLENT NUMERO
FCo0o1 4/20/2016 0002
Ma petite entreprise CLIENT
19,rue de place 1° mai SARL EL HANA
16000 Alger Centre IROUTE DE BEJAIA SETIF
Tel : 00-00-52-12- 119000
Ident Fiscal : 160
N°art : 160100000000
Mode de paiement : Espèce
Date Échéance : 5/20/2016
Référence Description Produit Quantité P.Unitaire Valeur
cl001 _Produit1 1000 1.00 1,000.00
c1002 _ |Produit 2 1001 2.00 2,002.00
c1003 _ jProduit 3 1002 3.00 3,006.00
c1004 _ |Produit4 1003 4.00 4,012.00
c1005 __|Produit5 1004 5.00 5,020.00
c1006 _ |Produit 6 1005 6.00 6,030.00
c1007 _ |Produit 7 1006 11.00 11,066.00
c1008 Produit8 1007 118.00 118,826.00
c1009 Produit 9 1008 19.00 19,152.00
c1010 _ |Produit 10 1009 10.00 10,090.00
Non assujetti à latva [Montant à payer 180,204.00
[rimbre 1,802.00
Montant à payer ttc 182,006.00

Monatnt Facture enLettre … Cinq mille huit cent quatre vingt huit Dinars Algériens

Cachet & Signature



### Pipeline SpaCy de base & Tokenisation

modifer pour la langue :fr_core_news_sm

In [24]:
import re
import spacy
from langdetect import detect

texte = OCR_TEXT

# 1) détecter la langue sur un gros extrait (plus stable et plus rapide)
sample = texte[:2000]  
try:
    doc_lang = detect(sample)
except:
    doc_lang = "fr"  # si probleme avec detection de langue (on forcer fr)
 
# 2) charger UN seul modèle
nlp = spacy.load("fr_core_news_sm", disable=["parser", "tagger", "ner", "lemmatizer"])
# nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger", "ner", "lemmatizer"])

# 3) split phrases rapide
sent_split = re.compile(r'(?<=[.!?])\s+')

for phrase in sent_split.split(texte):
    phrase = phrase.strip()
    if len(phrase) < 20:
        continue

    # tokenisation spaCy (mais pipeline ultra léger)
    doc = nlp.make_doc(phrase)
    print("\nPhrase :", phrase)
    print("Langue :", doc_lang)
    print("Tokens :", [t.text for t in doc])



Phrase : FACTURE

CODE CLENT NUMERO
FCo0o1 4/20/2016 0002
Ma petite entreprise CLIENT
19,rue de place 1° mai SARL EL HANA
16000 Alger Centre IROUTE DE BEJAIA SETIF
Tel : 00-00-52-12- 119000
Ident Fiscal : 160
N°art : 160100000000
Mode de paiement : Espèce
Date Échéance : 5/20/2016
Référence Description Produit Quantité P.Unitaire Valeur
cl001 _Produit1 1000 1.00 1,000.00
c1002 _ |Produit 2 1001 2.00 2,002.00
c1003 _ jProduit 3 1002 3.00 3,006.00
c1004 _ |Produit4 1003 4.00 4,012.00
c1005 __|Produit5 1004 5.00 5,020.00
c1006 _ |Produit 6 1005 6.00 6,030.00
c1007 _ |Produit 7 1006 11.00 11,066.00
c1008 Produit8 1007 118.00 118,826.00
c1009 Produit 9 1008 19.00 19,152.00
c1010 _ |Produit 10 1009 10.00 10,090.00
Non assujetti à latva [Montant à payer 180,204.00
[rimbre 1,802.00
Montant à payer ttc 182,006.00

Monatnt Facture enLettre … Cinq mille huit cent quatre vingt huit Dinars Algériens

Cachet & Signature
Langue : fr
Tokens : ['FACTURE', '\n\n', 'CODE', 'CLENT', 'NUMERO', '\n', 'FCo0

## Schéma de BDD 

In [None]:
from IPython.display import HTML, display
import re, json, uuid

raw = r"""
---
config:
  layout: elk
  theme: redux-dark-color
---

erDiagram
    ROLES {
        INT id PK
        VARCHAR name
    }

    USERS {
        INT id PK
        VARCHAR username
        VARCHAR email
        VARCHAR password_hash
        INT role_id FK
        DATETIME created_at
    }

    DOMAINS {
        INT id PK
        VARCHAR name
    }

    RULE_CONFIGS {
        INT id PK
        INT domain_id FK
        VARCHAR version_label_Regex
        JSONB Regex_json
        INT created_by FK
        DATETIME created_at
        BOOLEAN is_active
    }

    %% Nouvelle table: "API" = profil/config par domaine (langue, règles, paramètres)
    APIS {
        INT id PK
        INT domain_id FK
        INT rule_config_id FK
        VARCHAR name
        VARCHAR language_code          
        JSONB settings_json           
        BOOLEAN is_active
        INT created_by FK
        DATETIME created_at
    }

    %% Clés par API (une API/profil peut avoir plusieurs clés d'accès)
    API_KEYS {
        INT id PK
        INT api_id FK
        VARCHAR key_hash              
        JSONB scopes                  
        DATETIME created_at
        DATETIME last_used_at
        DATETIME expires_at
        DATETIME revoked_at
    }

    DOCUMENTS {
        UUID id PK
        VARCHAR filename
        INT domain_id FK
        VARCHAR status
        VARCHAR empreinte_numerique
        DATETIME uploaded_at
        INT uploaded_by FK
        INT api_id FK                 
    }

    FILE_STORAGE {
        INT id PK
        UUID document_id FK
        VARCHAR object_path
        INT size
        VARCHAR empreinte_numerique
        DATETIME stored_at
    }

    EXTRACTIONS {
        INT id PK
        UUID document_id FK
        INT rule_config_id FK
        VARCHAR field_name
        TEXT extracted_value
        JSONB coordinates
        BOOLEAN is_valid
        BOOLEAN is_overridden
        INT overridden_by FK
        DATETIME overridden_at
    }

    QUALITY_GATE_LOGS {
        INT id PK
        UUID document_id FK
        BOOLEAN is_passed
        TEXT failure_reason
        DATETIME checked_at
        VARCHAR check_origin          
        INT checked_by FK            
        BOOLEAN is_final_decision    
        TEXT decision_comment
    }

    AUDIT_LOGS {
        INT id PK
        INT user_id FK
        UUID document_id FK
        VARCHAR action
        VARCHAR entity_type
        INT entity_id
        JSONB changes
        DATETIME timestamp
        VARCHAR ip_address
    }

    %% Relations façon Workbench
    ROLES ||--o{ USERS : "role_id"
    USERS ||--o{ RULE_CONFIGS : "created_by"
    USERS ||--o{ AUDIT_LOGS : "user_id"

    DOMAINS ||--o{ RULE_CONFIGS : "domain_id"

    %% Domaine -> APIs (plusieurs APIs dans le même domaine)
    DOMAINS ||--o{ APIS : "domain_id"
    RULE_CONFIGS ||--o{ APIS : "rule_config_id"
    USERS ||--o{ APIS : "created_by"

    %% API -> API_KEYS (plusieurs clés par API)
    APIS ||--o{ API_KEYS : "api_id"

    %% Domaine -> Documents
    DOMAINS ||--o{ DOCUMENTS : "domain_id"
    USERS ||--o{ DOCUMENTS : "uploaded_by"
    APIS ||--o{ DOCUMENTS : "api_id"

    %% Documents -> stockage + extractions + quality
    DOCUMENTS ||--|| FILE_STORAGE : "document_id"
    DOCUMENTS ||--o{ EXTRACTIONS : "document_id"
    RULE_CONFIGS ||--o{ EXTRACTIONS : "rule_config_id"
    USERS ||--o{ EXTRACTIONS : "overridden_by"

    DOCUMENTS ||--o{ QUALITY_GATE_LOGS : "document_id"
    USERS ||--o{ QUALITY_GATE_LOGS : "checked_by"

    DOCUMENTS ||--o{ AUDIT_LOGS : "document_id"
"""

def extract_front_matter(mermaid_text: str):
    s = mermaid_text.strip("\n")
    if not s.lstrip().startswith("---"):
        return {}, s

    m = re.match(r"^\s*---\s*(.*?)\s*---\s*(.*)$", s, flags=re.DOTALL)
    if not m:
        return {}, s

    front = m.group(1)
    body = m.group(2)

    theme = None
    layout = None
    for line in front.splitlines():
        line = line.strip()
        if line.startswith("theme:"):
            theme = line.split(":", 1)[1].strip()
        if line.startswith("layout:"):
            layout = line.split(":", 1)[1].strip()

    init = {}
    if theme:
        init["theme"] = theme
    if layout:
        init["layout"] = layout

    return init, body.strip("\n")

init_cfg, diagram = extract_front_matter(raw)
init_directive = ""
if init_cfg:
    init_directive = f"%%{{init: {json.dumps(init_cfg)} }}%%\n"

diagram_final = init_directive + diagram
div_id = f"mmd-{uuid.uuid4().hex}"

html = f"""
<div id="{div_id}" class="mermaid">
{diagram_final}
</div>

<script type="module">
  const render = async () => {{
    if (!window.__mermaid_loaded__) {{
      const mermaid = (await import("https://cdn.jsdelivr.net/npm/mermaid@10/dist/mermaid.esm.min.mjs")).default;
      window.mermaid = mermaid;
      window.__mermaid_loaded__ = true;
      mermaid.initialize({{
        startOnLoad: false,
        securityLevel: "loose"
      }});
    }}
    await window.mermaid.run({{
      nodes: [document.getElementById("{div_id}")]
    }});
  }};
  render();
</script>
"""

display(HTML(html))


## Classification

lire les mot cle (token) du document et dire de quelle classe apartien selon les mot cle quil a :

    "BON_DE_COMMANDE": [
        "BON DE COMMANDE",
        "COMMANDE",
        "TOTAL TTC",
        "PRIX UNITAIRE",
        "TVA"
    ],
    "PURCHASE_ORDER": [
        "PURCHASE ORDER",
        "PO NUMBER",
        "UNIT PRICE",
        "QUANTITY",
        "TOTAL AMOUNT"
    ],
    "CONTRAT": [
        "CONTRAT",
        "IL A ÉTÉ CONVENU",
        "ENTRE LES SOUSSIGNÉS",
        "RÉSILIATION",
        "SIGNATURE"
    ],
    "ARTICLE": [
        "ARTICLE",
        "VU LA LOI",
        "CONSIDÉRANT",
        "DÉCRET",
        "DISPOSITION"
    ]

In [25]:
import re
import spacy
from langdetect import detect
from collections import defaultdict

# TEXTE OCR
texte = OCR_TEXT

#  MOTS-CLÉS PAR CLASSE 
KEYWORDS = {

    "FACTURE": [
        # FR forts
        "FACTURE",
        "NUMERO DE FACTURE",
        "N° FACTURE",
        "REFERENCE FACTURE",
        "DATE DE FACTURE",
        "ECHEANCE",
        "DATE D'ECHEANCE",
        "MONTANT A PAYER",
        "MONTANT A PAYER TTC",
        "MONTANT TTC",
        "TOTAL TTC",
        "TOTAL HT",
        "MONTANT HT",
        "TVA",
        "TAUX DE TVA",
        "MONTANT TVA",
        "SOUS-TOTAL",
        "NET A PAYER",
        "NET A PAYER TTC",
        "SOLDE DU",
        "A REGLER",
        "MODE DE PAIEMENT",
        "REGLEMENT",
        "PAIEMENT",
        "IBAN",
        "BIC",
        "RIB",
        "VIREMENT",
        "CHEQUE",
        "ESPECES",
        "BANQUE",
        "REFERENCE CLIENT",
        "CODE CLIENT",
        "CLIENT",
        "ADRESSE DE FACTURATION",
        "ADRESSE DE LIVRAISON",
        "SIRET",
        "SIREN",
        "RCS",
        "N° TVA",
        "TVA INTRACOMMUNAUTAIRE",
        "N° TVA INTRACOMMUNAUTAIRE",
        "BON DE LIVRAISON",
        "BL",
        "COMMANDE",
        "N° COMMANDE",
        "REFERENCE COMMANDE",
        "DESIGNATION",
        "DESCRIPTION",
        "QUANTITE",
        "PRIX UNITAIRE",
        "P.U.",
        "MONTANT LIGNE",
        "TOTAL LIGNE",
        "REMISE",
        "DISCOUNT",
        "FRAIS DE PORT",
        "LIVRAISON",
        "PENALITES DE RETARD",
        "CONDITIONS DE PAIEMENT",
        "TTC",
        "HT",
        "TIMBRE",
        # EN
        "INVOICE",
        "INVOICE NUMBER",
        "INVOICE NO",
        "BILL TO",
        "SHIP TO",
        "DUE DATE",
        "PAYMENT TERMS",
        "SUBTOTAL",
        "TAX",
        "VAT",
        "TOTAL",
        "TOTAL AMOUNT",
        "AMOUNT DUE",
        "BALANCE DUE",
        "BANK TRANSFER",
        "IBAN",
        "BIC",
        "SWIFT"
    ],

    "BON_DE_COMMANDE": [
        # FR forts
        "BON DE COMMANDE",
        "BC",
        "N° BC",
        "NUMERO DE COMMANDE",
        "N° COMMANDE",
        "REFERENCE COMMANDE",
        "DATE DE COMMANDE",
        "COMMANDE",
        "ACHETEUR",
        "FOURNISSEUR",
        "ADRESSE DE LIVRAISON",
        "ADRESSE DE FACTURATION",
        "LIVRAISON",
        "DATE DE LIVRAISON",
        "CONDITIONS DE LIVRAISON",
        "INCOTERM",
        "INCOTERMS",
        "DESIGNATION",
        "ARTICLE",
        "REFERENCE",
        "REF",
        "CODE ARTICLE",
        "CODE PRODUIT",
        "SKU",
        "QUANTITE",
        "QTE",
        "UNITE",
        "PU",
        "P.U.",
        "PRIX UNITAIRE",
        "PRIX UNITARE",  # typo OCR fréquent
        "MONTANT",
        "TOTAL",
        "TOTAL HT",
        "TOTAL TTC",
        "TVA",
        "SOUS-TOTAL",
        "REMISE",
        "CONDITIONS DE PAIEMENT",
        "DELAI DE PAIEMENT",
        "SIGNATURE",
        "VALIDATION",
        "APPROBATION",
        "BON POUR ACCORD",
        # EN
        "PURCHASE ORDER",
        "PO",
        "PO NUMBER",
        "ORDER NUMBER",
        "ORDER DATE",
        "BUYER",
        "VENDOR",
        "SUPPLIER",
        "SHIP TO",
        "BILL TO",
        "DELIVERY DATE",
        "DELIVERY TERMS",
        "INCOTERMS",
        "ITEM",
        "ITEM CODE",
        "SKU",
        "DESCRIPTION",
        "QUANTITY",
        "QTY",
        "UNIT PRICE",
        "PRICE",
        "SUBTOTAL",
        "TAX",
        "VAT",
        "TOTAL AMOUNT",
        "AUTHORIZED SIGNATURE"
    ],

    "CONTRAT": [
        # FR forts
        "CONTRAT",
        "CONVENTION",
        "ACCORD",
        "IL A ETE CONVENU",
        "ENTRE LES SOUSSIGNES",
        "LES PARTIES",
        "PARTIE",
        "PREAMBULE",
        "OBJET DU CONTRAT",
        "OBJET",
        "DUREE",
        "DATE D'EFFET",
        "ENTREE EN VIGUEUR",
        "RENOUVELLEMENT",
        "RESILIATION",
        "RESILIATION ANTICIPEE",
        "CLAUSE",
        "ARTICLE 1",
        "ARTICLE 2",
        "OBLIGATIONS",
        "ENGAGEMENTS",
        "RESPONSABILITE",
        "CONFIDENTIALITE",
        "NON-DIVULGATION",
        "PROPRIETE INTELLECTUELLE",
        "FORCE MAJEURE",
        "LITIGE",
        "JURIDICTION",
        "TRIBUNAL COMPETENT",
        "DROIT APPLICABLE",
        "LOI APPLICABLE",
        "INDEMNISATION",
        "PENALITES",
        "GARANTIE",
        "ANNEXE",
        "AVENANT",
        "SIGNATURE",
        "FAIT A",
        "LE PRESENT CONTRAT",
        # EN
        "CONTRACT",
        "AGREEMENT",
        "THIS AGREEMENT",
        "WHEREAS",
        "BETWEEN THE UNDERSIGNED",
        "PARTIES",
        "TERM",
        "EFFECTIVE DATE",
        "COMMENCEMENT",
        "RENEWAL",
        "TERMINATION",
        "CONFIDENTIALITY",
        "NONDISCLOSURE",
        "INTELLECTUAL PROPERTY",
        "GOVERNING LAW",
        "JURISDICTION",
        "LIABILITY",
        "INDEMNIFICATION",
        "FORCE MAJEURE",
        "AMENDMENT",
        "APPENDIX",
        "SIGNATURE"
    ],

    "ARTICLE": [
        # FR
        "ARTICLE",
        "ART.",
        "VU LA LOI",
        "VU LE CODE",
        "CODE",
        "CONSIDERANT",
        "CONSIDÉRANT",
        "ATTENDU QUE",
        "DECRET",
        "DÉCRET",
        "ARRETE",
        "ARRÊTÉ",
        "LOI",
        "ORDONNANCE",
        "CIRCULAIRE",
        "DISPOSITION",
        "ALINEA",
        "PARAGRAPHE",
        "CHAPITRE",
        "SECTION",
        "TITRE",
        "JOURNAL OFFICIEL",
        "REPUBLIC",
        "REPUBLIQUE",
        "MINISTERE",
        "MINISTÈRE",
        "TRIBUNAL",
        "COUR D'APPEL",
        "CONSEIL D'ETAT",
        "CONSEIL D’ÉTAT",
        "DECISION",
        "DÉCISION",
        "JURISPRUDENCE",
        "PROCEDURE",
        "PROCÉDURE",
        "SANCTION",
        "AMENDE",
        "AMENDEMENT",
        "CONFORMEMENT A",
        "EN APPLICATION DE",
        "A COMPTER DU",
        "ENTREE EN VIGUEUR",
        # EN
        "ARTICLE",
        "SECTION",
        "CHAPTER",
        "WHEREAS",
        "ACT",
        "DECREE",
        "REGULATION",
        "LAW",
        "PROVISION",
        "Pursuant to",
        "In accordance with",
        "ENTRY INTO FORCE",
        "EFFECTIVE"
    ],

    "FORMULAIRE": [
        # FR
        "FORMULAIRE",
        "DEMANDE",
        "DEMANDEUR",
        "BENEFICIAIRE",
        "BÉNÉFICIAIRE",
        "NOM",
        "PRENOM",
        "PRÉNOM",
        "DATE",
        "DATE DE L'EXAMEN",
        "SIGNATURE",
        "CACHET",
        "SIGNATURE ET CACHET",
        "JE CERTIFIE",
        "CERTIFIE QUE",
        "CERTIFICATION",
        "REPRESENTANT",
        "REPRÉSENTANT",
        "REPRESENTANT DU CLUB",
        "CLUB",
        "LICENCIE",
        "LICENCIÉ",
        "PIECES FOURNIES",
        "DOCUMENT",
        "INFORMATIONS FIGURANT",
        "CACHEt DOIT ETRE LISIBLE",
        "CADRE RESERVE",
        "A REMPLIR",
        # EN
        "FORM",
        "APPLICATION",
        "APPLICANT",
        "BENEFICIARY",
        "NAME",
        "FIRST NAME",
        "DATE",
        "SIGNATURE",
        "STAMP",
        "I CERTIFY",
        "CERTIFY THAT"
    ]
}


#  DETECTION LANGUE 
sample = texte[:2000]
try:
    doc_lang = detect(sample)
except:
    doc_lang = "fr"

#  CHARGER SPACY LEGER 
nlp = spacy.load(
    "fr_core_news_sm",
    disable=["parser", "tagger", "ner", "lemmatizer"]
)

#  TOKENISATION GLOBALE 
doc = nlp.make_doc(texte)

tokens = [t.text.upper() for t in doc if not t.is_space]

# texte normalisé pour détection de phrases clés
text_upper = " ".join(tokens)

#  SCORING DETERMINISTE 
scores = defaultdict(int)
matched_keywords = defaultdict(list)

for doc_type, keywords in KEYWORDS.items():
    for kw in keywords:
        if kw in text_upper:
            scores[doc_type] += 1
            matched_keywords[doc_type].append(kw)

#  DECISION 
if scores:
    detected_class = max(scores, key=scores.get)
else:
    detected_class = "UNKNOWN"

#  RESULTAT 
print("Langue détectée :", doc_lang)
print("Classe détectée :", detected_class)
print("\nScores détaillés :")
for k, v in scores.items():
    print(f"  {k} -> {v}")

print("\nMots-clés détectés :")
for k, v in matched_keywords.items():
    print(f"  {k} -> {v}")


Langue détectée : fr
Classe détectée : FACTURE

Scores détaillés :
  FACTURE -> 7
  BON_DE_COMMANDE -> 4
  CONTRAT -> 2
  ARTICLE -> 2
  FORMULAIRE -> 5

Mots-clés détectés :
  FACTURE -> ['FACTURE', 'TVA', 'MODE DE PAIEMENT', 'PAIEMENT', 'CLIENT', 'DESCRIPTION', 'TTC']
  BON_DE_COMMANDE -> ['MONTANT', 'TVA', 'SIGNATURE', 'DESCRIPTION']
  CONTRAT -> ['SIGNATURE', 'SIGNATURE']
  ARTICLE -> ['CODE', 'ACT']
  FORMULAIRE -> ['DATE', 'SIGNATURE', 'CACHET', 'DATE', 'SIGNATURE']


## Produire une sortie JSON stable (contrat de ton système)

In [43]:
# ============================
# Cellule: Produire une sortie JSON stable (contrat de ton système)
# + (6.5) Routage JSON/YAML (chargeur de règles par doc_type / template_id)
# ============================

import uuid
import json
from typing import Dict, Any, Optional

# =========================
# PARAMS de décision (stables)
# =========================
THRESHOLD = 3
MARGIN = 2

# =========================
# 1) Normaliser la langue -> language_hint
# (doc_lang vient de ta cellule précédente)
# =========================
if isinstance(doc_lang, str):
    if doc_lang.startswith("fr"):
        language_hint = "fr"
    elif doc_lang.startswith("en"):
        language_hint = "en"
    else:
        language_hint = "mix"
else:
    language_hint = "mix"

# =========================
# 2) Stabiliser scores/matched_keywords
# - On veut que toutes les classes apparaissent, même à 0 / []
# - KEYWORDS vient de ta cellule précédente
# =========================
scores_stable = {cls: int(scores.get(cls, 0)) for cls in KEYWORDS.keys()}
matched_stable = {cls: list(matched_keywords.get(cls, [])) for cls in KEYWORDS.keys()}

# =========================
# 3) Calculer top_score et second_score
# =========================
sorted_items = sorted(scores_stable.items(), key=lambda kv: kv[1], reverse=True)
top_type, top_score = sorted_items[0] if sorted_items else ("UNKNOWN", 0)
second_score = sorted_items[1][1] if len(sorted_items) > 1 else 0

# =========================
# 4) Décision OK/REVIEW + doc_type final
# - si aucun signal -> UNKNOWN + REVIEW
# - sinon OK si seuil+margin, sinon REVIEW (classe = meilleure hypothèse)
# =========================
if top_score == 0:
    doc_type_final = "UNKNOWN"
    status = "REVIEW"
else:
    doc_type_final = top_type
    if top_score >= THRESHOLD and (top_score - second_score) >= MARGIN:
        status = "OK"
    else:
        status = "REVIEW"

# =========================
# 5) Construire le JSON stable (contrat)
# =========================
result: Dict[str, Any] = {
    "doc_id": str(uuid.uuid4()),
    "doc_type": doc_type_final,
    "status": status,
    "scores": scores_stable,
    "matched_keywords": matched_stable,
    "threshold": THRESHOLD,
    "margin": MARGIN,
    "language_hint": language_hint
}

# Debug utile pendant dev (optionnel)
result["decision_debug"] = {
    "top_score": top_score,
    "second_score": second_score,
    "diff": top_score - second_score
}

# ==========================================================
# (6.5) Routage JSON/YAML (déterministe)
# Objectif: préparer le "cerveau" qui charge les règles
# par doc_type + template_id (sans duplication de code).
#
# Notes:
# - Si PyYAML n'est pas installé, fallback JSON (ou dict en dur).
# - Pour notebook MVP, on peut garder des "règles inline" si fichiers absents.
# ==========================================================

def safe_load_yaml_or_json(path: str) -> Optional[Dict[str, Any]]:
    """
    Charge un fichier YAML ou JSON et retourne un dict.
    Retourne None si fichier introuvable/erreur.
    """
    try:
        import os
        if not os.path.exists(path):
            return None

        if path.lower().endswith(".json"):
            with open(path, "r", encoding="utf-8") as f:
                return json.load(f)

        # YAML
        try:
            import yaml  # type: ignore
        except Exception:
            return None

        with open(path, "r", encoding="utf-8") as f:
            return yaml.safe_load(f)
    except Exception:
        return None

def merge_rules(base: Dict[str, Any], override: Dict[str, Any]) -> Dict[str, Any]:
    """
    Merge simple et déterministe:
    - dict: override keys
    - list: concat (base + override) sans dédup (tu peux dédup plus tard si besoin)
    - autres: override
    """
    out = dict(base)
    for k, v in override.items():
        if k not in out:
            out[k] = v
            continue
        if isinstance(out[k], dict) and isinstance(v, dict):
            out[k] = merge_rules(out[k], v)
        elif isinstance(out[k], list) and isinstance(v, list):
            out[k] = out[k] + v
        else:
            out[k] = v
    return out

def route_rules(
    doc_type: str,
    template_id: Optional[str] = None,
    rules_dir: str = "rules"
) -> Dict[str, Any]:
    """
    Résolution déterministe:
    1) rules/common.(yaml|json) (facultatif)
    2) rules/{doc_type}.(yaml|json) (facultatif)
    3) rules/templates/{template_id}.(yaml|json) (facultatif)
    Fallback: règles inline minimales.
    """
    # 1) common
    common = (
        safe_load_yaml_or_json(f"{rules_dir}/common.yaml")
        or safe_load_yaml_or_json(f"{rules_dir}/common.json")
        or {}
    )

    # 2) doc_type specific
    dt_rules = (
        safe_load_yaml_or_json(f"{rules_dir}/{doc_type}.yaml")
        or safe_load_yaml_or_json(f"{rules_dir}/{doc_type}.json")
        or {}
    )

    merged = merge_rules(common, dt_rules)

    # 3) template specific
    if template_id:
        tpl_rules = (
            safe_load_yaml_or_json(f"{rules_dir}/templates/{template_id}.yaml")
            or safe_load_yaml_or_json(f"{rules_dir}/templates/{template_id}.json")
            or {}
        )
        merged = merge_rules(merged, tpl_rules)

    # Fallback minimal si rien trouvé: règles vides, mais structure stable
    if not merged:
        merged = {
            "ruleset_id": f"INLINE_{doc_type}_V1",
            "extractors": {},
            "validators": {}
        }

    # Ajout meta de routage (traçabilité)
    merged.setdefault("ruleset_id", f"RULESET_{doc_type}_V1")
    merged.setdefault("extractors", {})
    merged.setdefault("validators", {})
    return merged

# On attache une "config active" au result (pour l'étape API ensuite)
# template_id peut ne pas exister encore (calculé plus tard), donc None ici
result["routing"] = {
    "rules_dir": "rules",
    "doc_type": result["doc_type"],
    "template_id": result.get("template_id", None),
    "ruleset": route_rules(result["doc_type"], template_id=result.get("template_id", None))
}

print(json.dumps(result, ensure_ascii=False, indent=2))


{
  "doc_id": "fe57bb6d-e02e-4d49-ad20-cf93b795ed78",
  "doc_type": "FACTURE",
  "status": "OK",
  "scores": {
    "FACTURE": 7,
    "BON_DE_COMMANDE": 4,
    "CONTRAT": 2,
    "ARTICLE": 2,
    "FORMULAIRE": 5
  },
  "matched_keywords": {
    "FACTURE": [
      "FACTURE",
      "TVA",
      "MODE DE PAIEMENT",
      "PAIEMENT",
      "CLIENT",
      "DESCRIPTION",
      "TTC"
    ],
    "BON_DE_COMMANDE": [
      "MONTANT",
      "TVA",
      "SIGNATURE",
      "DESCRIPTION"
    ],
    "CONTRAT": [
      "SIGNATURE",
      "SIGNATURE"
    ],
    "ARTICLE": [
      "CODE",
      "ACT"
    ],
    "FORMULAIRE": [
      "DATE",
      "SIGNATURE",
      "CACHET",
      "DATE",
      "SIGNATURE"
    ]
  },
  "threshold": 3,
  "margin": 2,
  "language_hint": "fr",
  "decision_debug": {
    "top_score": 7,
    "second_score": 5,
    "diff": 2
  },
  "routing": {
    "rules_dir": "rules",
    "doc_type": "FACTURE",
    "template_id": null,
    "ruleset": {
      "ruleset_id": "INLINE_FACTURE_V

## METADATA “empreinte de mise en page”


In [44]:
# ============================
# Cellule: METADATA “empreinte de mise en page”
# + Quality gate appliqué sur status (enforced)
# ============================

import json
import hashlib
import numpy as np
import cv2
import pytesseract

# suppose: prepped (PIL image), OCR_TEXT (str), result (dict) existent déjà

def pil_to_gray_np(pil_img):
    return np.array(pil_img.convert("L"))

def get_orientation_deg(pil_img) -> int:
    """
    Orientation 0/90/180/270 via Tesseract OSD.
    Si OSD échoue, retourne 0.
    """
    try:
        osd = pytesseract.image_to_osd(pil_img)
        for line in osd.splitlines():
            if line.lower().startswith("rotate:"):
                return int(line.split(":")[1].strip())
    except Exception:
        pass
    return 0

def rotate_pil_by_deg(pil_img, deg: int):
    if deg % 360 == 0:
        return pil_img
    return pil_img.rotate(-deg, expand=True)

def estimate_skew_hough_deg(gray: np.ndarray) -> float:
    """
    Estime un skew petit angle via lignes Hough.
    Retour typiquement dans [-10, +10] si doc normal.
    """
    blur = cv2.GaussianBlur(gray, (3, 3), 0)
    _, bw = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    edges = cv2.Canny(bw, 50, 150)

    lines = cv2.HoughLinesP(
        edges,
        1,
        np.pi / 180,
        threshold=120,
        minLineLength=max(60, int(min(gray.shape) * 0.12)),
        maxLineGap=10
    )

    if lines is None:
        return 0.0

    angles = []
    for x1, y1, x2, y2 in lines[:, 0]:
        dx = x2 - x1
        dy = y2 - y1
        if dx == 0:
            continue
        angle = np.degrees(np.arctan2(dy, dx))
        if angle < -45:
            angle += 90
        if angle > 45:
            angle -= 90
        if -15 <= angle <= 15:
            angles.append(angle)

    if not angles:
        return 0.0

    return float(np.median(angles))

def has_table_hough(gray: np.ndarray) -> dict:
    blur = cv2.GaussianBlur(gray, (3, 3), 0)
    _, bw = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    edges = cv2.Canny(bw, 50, 150)

    lines = cv2.HoughLinesP(
        edges, 1, np.pi / 180,
        threshold=80,
        minLineLength=max(40, int(min(gray.shape) * 0.08)),
        maxLineGap=10
    )

    h_count = 0
    v_count = 0
    if lines is not None:
        for x1, y1, x2, y2 in lines[:, 0]:
            dx = x2 - x1
            dy = y2 - y1
            if abs(dy) <= max(2, 0.1 * abs(dx)):
                h_count += 1
            if abs(dx) <= max(2, 0.1 * abs(dy)):
                v_count += 1

    has_table = (h_count >= 6 and v_count >= 4)
    return {"has_table": bool(has_table), "h_lines": int(h_count), "v_lines": int(v_count)}

def ocr_confidence_stats(pil_img) -> dict:
    try:
        data = pytesseract.image_to_data(
            pil_img,
            lang=getattr(args, "lang", None) or "fra",
            config=globals().get("config", ""),
            output_type=pytesseract.Output.DICT
        )
        confs = []
        for c in data.get("conf", []):
            try:
                c = float(c)
                if c >= 0:
                    confs.append(c)
            except Exception:
                pass

        if not confs:
            return {"ocr_avg_conf": None, "ocr_words": 0}

        return {"ocr_avg_conf": float(sum(confs) / len(confs)), "ocr_words": int(len(confs))}
    except Exception:
        return {"ocr_avg_conf": None, "ocr_words": 0}

def text_density_stats(ocr_text: str, pil_img) -> dict:
    w, h = pil_img.size
    area_mpx = (w * h) / 1_000_000.0
    area_mpx = area_mpx if area_mpx > 0 else 1.0
    alnum_chars = sum(1 for ch in ocr_text if ch.isalnum())
    return {
        "text_density_alnum_chars_per_mpx": float(alnum_chars / area_mpx),
        "image_area_mpx": float(area_mpx)
    }

def stable_template_id(fingerprint: dict) -> str:
    canon = json.dumps(fingerprint, sort_keys=True, separators=(",", ":"), ensure_ascii=False)
    digest = hashlib.sha256(canon.encode("utf-8")).hexdigest()
    return f"tpl_{digest[:16]}"

def bucketize_lines(n: int) -> str:
    if n <= 0:
        return "0"
    if 1 <= n <= 5:
        return "1-5"
    if 6 <= n <= 15:
        return "6-15"
    return ">15"

def round_size(x: int, base: int = 50) -> int:
    return int(base * round(x / base))

def round_ratio(x: float, decimals: int = 2) -> float:
    return float(round(x, decimals))

def compute_grid_type_from_counts(h_lines: int, v_lines: int) -> str:
    """
    Heuristique déterministe:
      - ITEM_TABLE: tableau d'articles (beaucoup de lignes H et V)
      - BOXED_FORM: formulaire encadré (beaucoup de H, peu de V)
      - NONE: pas de structure de grille notable
    """
    if h_lines >= 20 and v_lines >= 8:
        return "ITEM_TABLE"
    if h_lines >= 20 and v_lines < 8:
        return "BOXED_FORM"
    return "NONE"

def compute_quality_gate(layout_fp: dict) -> dict:
    reasons = []

    # OCR confidence
    ocr_list = layout_fp.get("ocr_confidence", [])
    ocr_avg = None
    ocr_words = None
    if ocr_list:
        ocr_avg = ocr_list[0].get("ocr_avg_conf", None)
        ocr_words = ocr_list[0].get("ocr_words", None)

    if ocr_avg is not None and ocr_avg < 75:
        reasons.append("LOW_OCR_CONF")
    if ocr_words is not None and ocr_words < 40:
        reasons.append("TOO_FEW_WORDS")

    # skew
    skew_list = layout_fp.get("skew_angle_deg", [])
    if skew_list:
        skew = float(skew_list[0])
        if abs(skew) > 5.0:
            reasons.append("HIGH_SKEW")

    # densité
    dens_list = layout_fp.get("text_density", [])
    if dens_list:
        dens = dens_list[0].get("text_density_alnum_chars_per_mpx", None)
        if dens is not None and dens < 120:
            reasons.append("LOW_TEXT_DENSITY")

    status = "PASS" if len(reasons) == 0 else "REVIEW"
    return {
        "status": status,
        "reasons": reasons,
        "thresholds": {
            "min_ocr_avg_conf": 75,
            "min_ocr_words": 40,
            "max_abs_skew_deg": 5.0,
            "min_text_density_alnum_chars_per_mpx": 120
        }
    }

# -----------------------------
# 1) Orientation + correction
# -----------------------------
orientation_deg = get_orientation_deg(prepped)
prepped_oriented = rotate_pil_by_deg(prepped, orientation_deg)

# -----------------------------
# 2) Calcul features par page (1 page ici)
# -----------------------------
pages = [prepped_oriented]

page_sizes = []
skew_angles = []
table_flags = []
ocr_confs = []
densities = []

for p in pages:
    w, h = p.size
    page_sizes.append({"w": int(w), "h": int(h)})

    gray = pil_to_gray_np(p)

    skew_angles.append(estimate_skew_hough_deg(gray))
    tbl = has_table_hough(gray)
    table_flags.append(tbl)

    ocr_confs.append(ocr_confidence_stats(p))
    densities.append(text_density_stats(OCR_TEXT, p))

# -----------------------------
# 3) layout_fingerprint complet
# -----------------------------
layout_fingerprint = {
    "page_count": int(len(pages)),
    "page_sizes": page_sizes,
    "orientation_deg": int(orientation_deg),
    "skew_angle_deg": [float(a) for a in skew_angles],
    "has_table": [bool(t["has_table"]) for t in table_flags],
    "table_line_counts": [{"h_lines": t["h_lines"], "v_lines": t["v_lines"]} for t in table_flags],
    "text_density": densities,
    "ocr_confidence": ocr_confs,
    "aspect_ratio": [float(ps["w"] / ps["h"]) for ps in page_sizes]
}

# -----------------------------
# 3bis) grid_type
# -----------------------------
h_lines0 = int(layout_fingerprint["table_line_counts"][0]["h_lines"]) if layout_fingerprint["table_line_counts"] else 0
v_lines0 = int(layout_fingerprint["table_line_counts"][0]["v_lines"]) if layout_fingerprint["table_line_counts"] else 0
grid_type = compute_grid_type_from_counts(h_lines0, v_lines0)
layout_fingerprint["grid_type"] = grid_type

# -----------------------------
# 3ter) quality_gate
# -----------------------------
quality_gate = compute_quality_gate(layout_fingerprint)

# ============================
# ENFORCE: appliquer la quality gate sur status
# ============================
# - si gate FAIL/REVIEW => on force status REVIEW (même si classification OK)
# - on laisse doc_type inchangé (hypothèse), mais status devient REVIEW
if isinstance(result, dict) and quality_gate.get("status") == "REVIEW":
    result["status"] = "REVIEW"
    result.setdefault("quality_gate_enforced", {})
    result["quality_gate_enforced"] = {
        "forced_status": "REVIEW",
        "reason_count": len(quality_gate.get("reasons", [])),
        "reasons": list(quality_gate.get("reasons", [])),
        "rule_id": "RULE_QUALITY_GATE_V1"
    }

# -----------------------------
# 4) template_id stable bucketed
# -----------------------------
w0 = int(page_sizes[0]["w"]) if page_sizes else 0
h0_size = int(page_sizes[0]["h"]) if page_sizes else 0
ratio0 = (w0 / h0_size) if (w0 and h0_size) else 0.0

layout_signature_bucketed = {
    "page_count": layout_fingerprint["page_count"],
    "w_rounded": round_size(w0, base=50),
    "h_rounded": round_size(h0_size, base=50),
    "aspect_ratio_rounded": round_ratio(ratio0, 2),
    "orientation_deg": layout_fingerprint["orientation_deg"],
    "has_table": layout_fingerprint["has_table"],
    "h_lines_bucket": bucketize_lines(int(layout_fingerprint["table_line_counts"][0]["h_lines"])) if layout_fingerprint["table_line_counts"] else "0",
    "v_lines_bucket": bucketize_lines(int(layout_fingerprint["table_line_counts"][0]["v_lines"])) if layout_fingerprint["table_line_counts"] else "0",
    "grid_type": grid_type
}
template_id = stable_template_id(layout_signature_bucketed)

# -----------------------------
# 5) Attacher au JSON principal
# -----------------------------
result["layout_fingerprint"] = layout_fingerprint
result["template_id"] = template_id
result["template_signature"] = layout_signature_bucketed
result["quality_gate"] = quality_gate

# Update routing si présent (doc_type déjà, maintenant template_id est connu)
if "routing" in result and isinstance(result["routing"], dict):
    result["routing"]["template_id"] = template_id
    # On peut re-router avec template_id (si tu as des règles template)
    try:
        ruleset = result["routing"].get("ruleset", {})
        # Si route_rules existe (cellule précédente), on recharge:
        if "route_rules" in globals():
            result["routing"]["ruleset"] = route_rules(result.get("doc_type", "UNKNOWN"), template_id=template_id)
    except Exception:
        pass

# metadata_summary
summary = result.get("metadata_summary", {})
summary.update({
    "page_count": layout_fingerprint["page_count"],
    "template_id": template_id,
    "orientation_deg": orientation_deg,
    "avg_skew_deg": float(sum(skew_angles) / len(skew_angles)) if skew_angles else 0.0,
    "has_table_any": bool(any(layout_fingerprint["has_table"])),
    "ocr_avg_conf_mean": (
        float(
            sum(c["ocr_avg_conf"] for c in ocr_confs if c["ocr_avg_conf"] is not None) /
            max(1, len([c for c in ocr_confs if c["ocr_avg_conf"] is not None]))
        )
        if ocr_confs else None
    ),
    "grid_type": grid_type,
    "quality_gate_status": quality_gate["status"],
    "status_after_quality_gate": result.get("status")
})
result["metadata_summary"] = summary

print(json.dumps(result, ensure_ascii=False, indent=2))


{
  "doc_id": "fe57bb6d-e02e-4d49-ad20-cf93b795ed78",
  "doc_type": "FACTURE",
  "status": "OK",
  "scores": {
    "FACTURE": 7,
    "BON_DE_COMMANDE": 4,
    "CONTRAT": 2,
    "ARTICLE": 2,
    "FORMULAIRE": 5
  },
  "matched_keywords": {
    "FACTURE": [
      "FACTURE",
      "TVA",
      "MODE DE PAIEMENT",
      "PAIEMENT",
      "CLIENT",
      "DESCRIPTION",
      "TTC"
    ],
    "BON_DE_COMMANDE": [
      "MONTANT",
      "TVA",
      "SIGNATURE",
      "DESCRIPTION"
    ],
    "CONTRAT": [
      "SIGNATURE",
      "SIGNATURE"
    ],
    "ARTICLE": [
      "CODE",
      "ACT"
    ],
    "FORMULAIRE": [
      "DATE",
      "SIGNATURE",
      "CACHET",
      "DATE",
      "SIGNATURE"
    ]
  },
  "threshold": 3,
  "margin": 2,
  "language_hint": "fr",
  "decision_debug": {
    "top_score": 7,
    "second_score": 5,
    "diff": 2
  },
  "routing": {
    "rules_dir": "rules",
    "doc_type": "FACTURE",
    "template_id": "tpl_223dc390c94e7d2c",
    "ruleset": {
      "ruleset_id":

## Chunking par page avec identifiants

In [45]:
# ============================
# Chunking par page avec identifiants (base du chat + citations)
# ============================
import uuid
from typing import List, Dict, Any

# Cette cellule suppose que tu as déjà :
# - result (dict) : contient au moins result["doc_id"] + result["status"]
# - OCR_TEXT (str) : texte OCR complet (pour 1 page dans ton pipeline actuel)
#
# Améliorations :
# - skip si status=REVIEW (quality_gate enforced)
# - excerpt (pour citations)
# - char_len (debug/qualité)
# - nettoyage minimal stable
# - garde page + start/end pour citer précisément

CHUNK_SIZE = 800      # 600-900 conseillé
CHUNK_OVERLAP = 150   # 100-150 conseillé
EXCERPT_LEN = 220     # extrait court pour citations JSON


def clean_text_stable(text: str) -> str:
    text = text.replace("\r", "\n")
    text = "\n".join(line.rstrip() for line in text.splitlines())
    text = " ".join(text.split())
    return text.strip()


def chunk_text_fixed(text: str, chunk_size: int, overlap: int) -> List[Dict[str, Any]]:
    """
    Découpe en chunks de taille fixe avec overlap.
    Retourne: {chunk_index, start_char, end_char, text}
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size doit être > 0")
    if overlap < 0 or overlap >= chunk_size:
        raise ValueError("overlap doit être >= 0 et < chunk_size")

    text = clean_text_stable(text)
    n = len(text)
    if n == 0:
        return []

    step = chunk_size - overlap
    chunks = []
    start = 0
    idx = 0

    while start < n:
        end = min(start + chunk_size, n)
        chunk_str = text[start:end]

        if len(chunk_str.strip()) >= 20:
            chunks.append({
                "chunk_index": idx,
                "start_char": start,
                "end_char": end,
                "text": chunk_str
            })

        idx += 1
        if end == n:
            break
        start += step

    return chunks


def build_chunks_for_pages(
    doc_id: str,
    pages_text: List[str],
    chunk_size: int,
    overlap: int,
    excerpt_len: int
) -> List[Dict[str, Any]]:
    """
    Construit des chunks par page.
    Format chunk:
      chunk_id, doc_id, page, chunk_index, start_char, end_char, char_len, text, excerpt
    """
    all_chunks = []
    for page_num, page_text in enumerate(pages_text, start=1):
        cleaned = clean_text_stable(page_text)
        page_chunks = chunk_text_fixed(cleaned, chunk_size, overlap)

        for c in page_chunks:
            chunk_id = f"chk_{uuid.uuid4().hex[:16]}"
            chunk_text = c["text"]
            excerpt = chunk_text[:excerpt_len]

            all_chunks.append({
                "chunk_id": chunk_id,
                "doc_id": doc_id,
                "page": page_num,
                "chunk_index": c["chunk_index"],
                "start_char": c["start_char"],
                "end_char": c["end_char"],
                "char_len": int(c["end_char"] - c["start_char"]),
                "text": chunk_text,
                "excerpt": excerpt
            })
    return all_chunks


# -----------------------------------
# Enforced: si status=REVIEW -> pas de chunking
# -----------------------------------
if result.get("status") == "REVIEW":
    result["chunking"] = {
        "chunk_size": CHUNK_SIZE,
        "overlap": CHUNK_OVERLAP,
        "excerpt_len": EXCERPT_LEN,
        "chunks_count": 0,
        "skipped_reason": "status=REVIEW (quality_gate enforced)"
    }
    result["chunks"] = []
    print("[skip] status=REVIEW (quality_gate enforced) -> pas de chunking")
else:
    doc_id = result["doc_id"]
    pages_text = [OCR_TEXT]  # 1 page (plus tard: liste de textes par page)

    chunks = build_chunks_for_pages(
        doc_id=doc_id,
        pages_text=pages_text,
        chunk_size=CHUNK_SIZE,
        overlap=CHUNK_OVERLAP,
        excerpt_len=EXCERPT_LEN
    )

    result["chunking"] = {
        "chunk_size": CHUNK_SIZE,
        "overlap": CHUNK_OVERLAP,
        "excerpt_len": EXCERPT_LEN,
        "chunks_count": len(chunks)
    }
    result["chunks"] = chunks

    print(f"Chunks générés: {len(chunks)}")
    print("Aperçu 2 premiers chunks:")
    for c in chunks[:2]:
        preview = c["excerpt"].replace("\n", " ")
        print(f"- page={c['page']} chunk_index={c['chunk_index']} start={c['start_char']} end={c['end_char']} len={c['char_len']} id={c['chunk_id']}")
        print(f"  excerpt: {preview}...")


Chunks générés: 2
Aperçu 2 premiers chunks:
- page=1 chunk_index=0 start=0 end=800 len=800 id=chk_7b97d3c6ee1b4b0b
  excerpt: FACTURE CODE CLENT NUMERO FCo0o1 4/20/2016 0002 Ma petite entreprise CLIENT 19,rue de place 1° mai SARL EL HANA 16000 Alger Centre IROUTE DE BEJAIA SETIF Tel : 00-00-52-12- 119000 Ident Fiscal : 160 N°art : 160100000000 ...
- page=1 chunk_index=1 start=650 end=907 len=257 id=chk_9bda55a9a431402f
  excerpt:  19.00 19,152.00 c1010 _ |Produit 10 1009 10.00 10,090.00 Non assujetti à latva [Montant à payer 180,204.00 [rimbre 1,802.00 Montant à payer ttc 182,006.00 Monatnt Facture enLettre … Cinq mille huit cent quatre vingt hui...


## Embeddings + recherche

In [46]:
# ============================
# Embeddings + recherche (TF-IDF char n-grams)
# ============================
import numpy as np
from typing import Dict, Any, List, Optional

# Cette cellule suppose que tu as déjà :
# - result (dict) avec chunks[]
# - result["chunks"] contient: chunk_id, doc_id, page, text, excerpt
# - result["status"] existe (enforced)

# -------------------------
# 0) Corpus en mémoire
# -------------------------
if "CORPUS" not in globals():
    CORPUS = []

existing_ids = {d.get("doc_id") for d in CORPUS}
if result.get("doc_id") not in existing_ids:
    CORPUS.append(result)

# -------------------------
# 1) Index TF-IDF (char n-grams) - robuste OCR
# -------------------------
from sklearn.feature_extraction.text import TfidfVectorizer

_vectorizer = None
_embeddings = None
_chunks_meta: List[Dict[str, Any]] = []


def _normalize_rows(x: np.ndarray) -> np.ndarray:
    norms = np.linalg.norm(x, axis=1, keepdims=True)
    norms = np.where(norms == 0, 1.0, norms)
    return x / norms


def _collect_all_chunks(corpus: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    all_chunks = []
    for doc in corpus:
        # Enforced: ne pas indexer les docs en REVIEW
        if doc.get("status") == "REVIEW":
            continue

        doc_id = doc.get("doc_id")
        doc_type = doc.get("doc_type")
        template_id = doc.get("template_id")
        lang = doc.get("language_hint")
        status = doc.get("status")

        for ch in doc.get("chunks", []):
            all_chunks.append({
                "doc_id": doc_id,
                "doc_type": doc_type,
                "template_id": template_id,
                "language_hint": lang,
                "status": status,
                "page": ch.get("page"),
                "chunk_id": ch.get("chunk_id"),
                "text": ch.get("text", ""),
                "excerpt": ch.get("excerpt", "")
            })
    return all_chunks


def build_index(corpus: List[Dict[str, Any]]):
    global _vectorizer, _embeddings, _chunks_meta

    _chunks_meta = _collect_all_chunks(corpus)
    texts = [c["text"] for c in _chunks_meta]

    if len(texts) == 0:
        _embeddings = np.zeros((0, 1), dtype=np.float32)
        return

    _vectorizer = TfidfVectorizer(
        analyzer="char_wb",
        ngram_range=(3, 5),
        max_features=20000
    )
    X = _vectorizer.fit_transform(texts)
    emb = X.toarray().astype(np.float32)
    _embeddings = _normalize_rows(emb)


def _embed_query(query: str) -> np.ndarray:
    if _embeddings is None or len(_chunks_meta) == 0:
        return np.zeros((1, 1), dtype=np.float32)

    q = _vectorizer.transform([query]).toarray().astype(np.float32)
    q = _normalize_rows(q)
    return q


def retrieve(
    query: str,
    top_k: int = 5,
    filters: Optional[Dict[str, Any]] = None
) -> List[Dict[str, Any]]:
    """
    Retourne top-k chunks avec {doc_id, page, chunk_id, excerpt, score}
    filters possibles:
      - doc_type: str ou [str]
      - template_id: str ou [str]
      - language_hint: str ou [str]
      - status: str ou [str]  (optionnel)
    """
    if _embeddings is None or len(_chunks_meta) == 0:
        return []

    q = _embed_query(query)
    sims = (_embeddings @ q.T).reshape(-1)

    idxs = np.arange(len(_chunks_meta))

    if filters:
        def _as_set(v):
            if v is None:
                return None
            if isinstance(v, list):
                return set(v)
            return {v}

        dt = _as_set(filters.get("doc_type"))
        tpl = _as_set(filters.get("template_id"))
        lang = _as_set(filters.get("language_hint"))
        st = _as_set(filters.get("status"))

        mask = np.ones(len(_chunks_meta), dtype=bool)
        if dt is not None:
            mask &= np.array([c["doc_type"] in dt for c in _chunks_meta], dtype=bool)
        if tpl is not None:
            mask &= np.array([c["template_id"] in tpl for c in _chunks_meta], dtype=bool)
        if lang is not None:
            mask &= np.array([c["language_hint"] in lang for c in _chunks_meta], dtype=bool)
        if st is not None:
            mask &= np.array([c["status"] in st for c in _chunks_meta], dtype=bool)

        idxs = idxs[mask]
        sims = sims[mask]

    if len(idxs) == 0:
        return []

    k = min(top_k, len(idxs))
    top_local = np.argpartition(-sims, k - 1)[:k]
    top_sorted = top_local[np.argsort(-sims[top_local])]

    out = []
    for j in top_sorted:
        meta = _chunks_meta[idxs[j]]
        out.append({
            "doc_id": meta["doc_id"],
            "page": meta["page"],
            "chunk_id": meta["chunk_id"],
            "excerpt": meta["excerpt"],
            "score": float(sims[j])
        })
    return out


# -------------------------
# 2) Construire l'index + test
# -------------------------
build_index(CORPUS)
print(f"[index] docs={len(CORPUS)} chunks={len(_chunks_meta)} dim={_embeddings.shape[1] if _embeddings is not None else 'NA'}")

test_query = "tel ?"
hits = retrieve(test_query, top_k=3, filters={"doc_type": "FACTURE"})
print("[retrieve] query:", test_query)
for h in hits:
    print(f"- score={h['score']:.4f} doc_id={h['doc_id']} page={h['page']} chunk_id={h['chunk_id']}")
    print(f"  excerpt: {h['excerpt'][:180]}...")


[index] docs=2 chunks=4 dim=1240
[retrieve] query: tel ?
- score=0.0467 doc_id=cbcebf7f-14b0-4b0e-9bf9-36c96a7ee9bf page=1 chunk_id=chk_54b2fa7ac9304a27
  excerpt: FACTURE CODE CLENT NUMERO FCo0o1 4/20/2016 0002 Ma petite entreprise CLIENT 19,rue de place 1° mai SARL EL HANA 16000 Alger Centre IROUTE DE BEJAIA SETIF Tel : 00-00-52-12- 119000 ...
- score=0.0467 doc_id=fe57bb6d-e02e-4d49-ad20-cf93b795ed78 page=1 chunk_id=chk_7b97d3c6ee1b4b0b
  excerpt: FACTURE CODE CLENT NUMERO FCo0o1 4/20/2016 0002 Ma petite entreprise CLIENT 19,rue de place 1° mai SARL EL HANA 16000 Alger Centre IROUTE DE BEJAIA SETIF Tel : 00-00-52-12- 119000 ...
- score=0.0000 doc_id=cbcebf7f-14b0-4b0e-9bf9-36c96a7ee9bf page=1 chunk_id=chk_4521a4edb87f46fd
  excerpt:  19.00 19,152.00 c1010 _ |Produit 10 1009 10.00 10,090.00 Non assujetti à latva [Montant à payer 180,204.00 [rimbre 1,802.00 Montant à payer ttc 182,006.00 Monatnt Facture enLettre...


## API “chat” minimal : answer + sources (citations)

In [47]:
# ============================
# API “chat” minimal : answer + sources (citations) + Rule IDs + PO line-items + vérif totaux
# ============================
import re
import json
import unicodedata
from typing import Dict, Any, Optional, List, Tuple

# Cette cellule suppose que tu as déjà :
# - retrieve(query, top_k, filters) (cellule précédente)
# - CORPUS / result existants
# - chunks contiennent excerpt
# - (enforced) docs status=REVIEW ne sont pas indexés par retrieve() (cellule embeddings)

# -------------------------
# Helpers normalisation
# -------------------------
def strip_accents(s: str) -> str:
    return "".join(
        c for c in unicodedata.normalize("NFD", s)
        if unicodedata.category(c) != "Mn"
    )

def norm(s: str) -> str:
    s = (s or "").replace("’", "'")
    s = strip_accents(s)
    s = s.lower()
    s = " ".join(s.split())
    return s

def parse_number_pair(raw: str) -> Dict[str, Any]:
    """
    Retourne {raw, norm} où norm est un float-string stable (machine).
    """
    r = (raw or "").strip()
    compact = r.replace(" ", "")

    # Cas 182,006.00 -> 182006.00
    if "," in compact and "." in compact:
        norm_val = compact.replace(",", "")
        return {"raw": r, "norm": norm_val}

    # Cas 182,006 -> 182.006
    if "," in compact and "." not in compact:
        norm_val = compact.replace(",", ".")
        return {"raw": r, "norm": norm_val}

    # Cas 182006.00
    return {"raw": r, "norm": compact}

def to_float_loose(num_norm: str) -> Optional[float]:
    if num_norm is None:
        return None
    s = str(num_norm).strip()
    if s == "":
        return None
    try:
        return float(s)
    except Exception:
        return None

def normalize_date_iso(date_str: str) -> Dict[str, Any]:
    """
    Essaie de normaliser vers ISO 8601: YYYY-MM-DD
    Support simple:
      - dd/mm/yyyy
      - mm/dd/yyyy (si ambigu, on garde raw et iso null)
    """
    raw = (date_str or "").strip()

    m = re.match(r"^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{2,4})$", raw)
    if not m:
        return {"raw": raw, "iso": None}

    a = int(m.group(1))
    b = int(m.group(2))
    y = int(m.group(3))
    if y < 100:
        y += 2000

    # Heuristique déterministe:
    # - si a > 12 => dd/mm
    # - si b > 12 => mm/dd
    # - sinon ambigu => iso None
    if a > 12 and 1 <= b <= 12:
        dd, mm = a, b
    elif b > 12 and 1 <= a <= 12:
        mm, dd = a, b
    else:
        return {"raw": raw, "iso": None}

    if not (1 <= mm <= 12 and 1 <= dd <= 31):
        return {"raw": raw, "iso": None}

    return {"raw": raw, "iso": f"{y:04d}-{mm:02d}-{dd:02d}"}

# -------------------------
# Audit: Rule IDs (par champ)
# -------------------------
RULES = {
    "invoice.montant_a_payer_ttc": "R_INV_001_TTC",
    "invoice.montant_a_payer": "R_INV_002_PAY",
    "invoice.timbre": "R_INV_003_TIMBRE",
    "invoice.total_ttc": "R_INV_004_TOTAL_TTC",
    "invoice.date": "R_INV_005_DATE",
    "invoice.numero_facture": "R_INV_006_NUM",
    "invoice.entreprise_nom": "R_INV_007_VENDOR_NAME",
    "po.line_item": "R_PO_001_LINE_ROW",
    "po.total_ht": "R_PO_010_TOTAL_HT",
    "po.total_tva": "R_PO_011_TOTAL_TVA",
    "po.total_ttc": "R_PO_012_TOTAL_TTC",
    "po.validate_totals": "R_PO_090_VALIDATE_TOTALS"
}

def mk_field(value: Any, rule_id: str, evidence: Optional[str] = None) -> Dict[str, Any]:
    out = {"value": value, "rule_id": rule_id}
    if evidence is not None:
        out["evidence"] = evidence
    return out

# -------------------------
# Extraction FACTURE (déterministe + Rule IDs)
# -------------------------
def extract_invoice_fields(context: str) -> Dict[str, Any]:
    out: Dict[str, Any] = {}
    c_raw = context or ""
    c = norm(c_raw)

    # Montant à payer TTC
    m = re.search(r"montant\s+a\s+payer\s+ttc\s*[:\]\[]?\s*([0-9][0-9\s,\.]{2,})", c)
    if m:
        val = parse_number_pair(m.group(1))
        out["montant_a_payer_ttc"] = mk_field(
            value=val,
            rule_id=RULES["invoice.montant_a_payer_ttc"],
            evidence=m.group(0)
        )

    # Montant à payer (si TTC absent)
    m = re.search(r"montant\s+a\s+payer\s*[:\]\[]?\s*([0-9][0-9\s,\.]{2,})", c)
    if m and "montant_a_payer_ttc" not in out:
        val = parse_number_pair(m.group(1))
        out["montant_a_payer"] = mk_field(
            value=val,
            rule_id=RULES["invoice.montant_a_payer"],
            evidence=m.group(0)
        )

    # Timbre (OCR peut écrire rimbre)
    m = re.search(r"(timbre|rimbre)\s*[:\]\[]?\s*([0-9][0-9\s,\.]{2,})", c)
    if m:
        val = parse_number_pair(m.group(2))
        out["timbre"] = mk_field(
            value=val,
            rule_id=RULES["invoice.timbre"],
            evidence=m.group(0)
        )

    # Total TTC
    m = re.search(r"total\s+ttc\s*[:\]\[]?\s*([0-9][0-9\s,\.]{2,})", c)
    if m:
        val = parse_number_pair(m.group(1))
        out["total_ttc"] = mk_field(
            value=val,
            rule_id=RULES["invoice.total_ttc"],
            evidence=m.group(0)
        )

    # Date
    m = re.search(r"\b(\d{1,2}[\/\-]\d{1,2}[\/\-]\d{2,4})\b", c)
    if m:
        d = normalize_date_iso(m.group(1))
        out["date"] = mk_field(
            value=d,
            rule_id=RULES["invoice.date"],
            evidence=m.group(0)
        )

    # Numéro facture (heuristique OCR)
    m = re.search(r"(numero\s+facture|n[°o]\s*facture|invoice\s+number)\s*[:\]\[]?\s*([a-z0-9\-\/]+)", c)
    if m:
        num = m.group(2)
        out["numero_facture"] = mk_field(
            value={"raw": num, "norm": num},
            rule_id=RULES["invoice.numero_facture"],
            evidence=m.group(0)
        )

    # Nom entreprise (vendeur) : heuristique simple autour d'entête
    # Cherche une ligne type "ma petite entreprise" / "SARL ..." / "EURL ..." / "SPA ..." près du début
    head = c_raw[:700]
    head_n = norm(head)
    m = re.search(r"\b(sarl|eurl|spa|s\.a\.r\.l|s\.p\.a)\s+([a-z0-9' \-]{2,})", head_n)
    if m:
        vendor = (m.group(1) + " " + m.group(2)).strip()
        out["entreprise_nom"] = mk_field(
            value={"raw": vendor, "norm": vendor},
            rule_id=RULES["invoice.entreprise_nom"],
            evidence=m.group(0)
        )
    else:
        # fallback: motif "ma petite entreprise" (exemple OCR)
        m = re.search(r"\b(ma\s+petite\s+entreprise)\b", head_n)
        if m:
            vendor = m.group(1).strip()
            out["entreprise_nom"] = mk_field(
                value={"raw": vendor, "norm": vendor},
                rule_id=RULES["invoice.entreprise_nom"],
                evidence=m.group(0)
            )

    return out

# -------------------------
# Extraction PO line-items (6.1) + vérification totaux (déterministe)
# -------------------------
def _tokenize_lines(text: str) -> List[str]:
    # Split stable (OCR)
    t = (text or "").replace("\r", "\n")
    lines = [ln.strip() for ln in t.split("\n")]
    return [ln for ln in lines if ln]

def extract_po_line_items(context: str) -> Dict[str, Any]:
    """
    Heuristique déterministe (MVP) :
      - détecte des lignes avec: (code?) + description + qty + pu + total
      - renvoie items[] et totaux extraits si présents
    Important:
      - sur OCR bruité, on vise une extraction "suffisante" + audit, pas parfaite
    """
    c_raw = context or ""
    lines = _tokenize_lines(c_raw)

    items: List[Dict[str, Any]] = []
    totals: Dict[str, Any] = {}

    # Patterns nombres (tolérant aux séparateurs)
    num_pat = r"([0-9][0-9\s,\.]{0,})"

    # Totaux (HT/TVA/TTC) si présents dans le texte
    c_n = norm(c_raw)
    m = re.search(r"\btotal\s+ht\s*[:\]\[]?\s*" + num_pat, c_n)
    if m:
        totals["total_ht"] = mk_field(parse_number_pair(m.group(1)), RULES["po.total_ht"], m.group(0))
    m = re.search(r"\b(total\s+tva|montant\s+tva|tva)\s*[:\]\[]?\s*" + num_pat, c_n)
    if m:
        totals["total_tva"] = mk_field(parse_number_pair(m.group(2) if m.lastindex and m.lastindex >= 2 else m.group(1)),
                                       RULES["po.total_tva"], m.group(0))
    m = re.search(r"\btotal\s+ttc\s*[:\]\[]?\s*" + num_pat, c_n)
    if m:
        totals["total_ttc"] = mk_field(parse_number_pair(m.group(1)), RULES["po.total_ttc"], m.group(0))

    # Ligne item (MVP) :
    #  - on cherche 3 nombres sur la ligne (qty, pu, total) + du texte
    #  - exemple OCR: "Produit 10 1009 10.00 10,090.00"
    for ln in lines:
        ln_n = norm(ln)

        # Heuristique: contient mot produit/article/item ou bien contient au moins 3 nombres
        nums = re.findall(r"[0-9][0-9\s,\.]{0,}", ln_n)
        if len(nums) < 3:
            continue

        # on essaye de détecter qty/pu/total en prenant les 3 derniers nombres
        qty_raw = nums[-3]
        pu_raw = nums[-2]
        total_raw = nums[-1]

        qty = parse_number_pair(qty_raw)
        pu = parse_number_pair(pu_raw)
        total = parse_number_pair(total_raw)

        # description = ligne sans les 3 derniers nombres (simple)
        # (déterministe: on enlève occurrences exactes qty/pu/total dans la ligne originale)
        desc = ln
        for r in [qty_raw, pu_raw, total_raw]:
            desc = re.sub(re.escape(r), " ", desc, count=1)
        desc = " ".join(desc.split()).strip()

        # garde-fou: description minimale
        if len(desc) < 3:
            continue

        items.append({
            "description": mk_field({"raw": desc, "norm": norm(desc)}, RULES["po.line_item"], evidence=ln),
            "qty": mk_field(qty, RULES["po.line_item"], evidence=ln),
            "unit_price": mk_field(pu, RULES["po.line_item"], evidence=ln),
            "line_total": mk_field(total, RULES["po.line_item"], evidence=ln),
        })

    return {"items": items, "totals": totals}

def validate_totals(po: Dict[str, Any], tolerance: float = 0.01) -> Dict[str, Any]:
    """
    Vérif déterministe :
      - somme(line_total) ≈ total_ht ou total_ttc si dispo
      - renvoie {status: PASS/REVIEW, reasons[], rule_id}
    """
    reasons = []
    items = po.get("items", [])
    totals = po.get("totals", {})

    # somme des lignes
    sum_lines = 0.0
    sum_ok = False
    for it in items:
        v = it.get("line_total", {}).get("value", {})
        f = to_float_loose(v.get("norm"))
        if f is not None:
            sum_lines += f
            sum_ok = True

    if not sum_ok:
        return {
            "status": "REVIEW",
            "reasons": ["NO_LINE_TOTALS_PARSED"],
            "rule_id": RULES["po.validate_totals"]
        }

    # Compare à total_ht si présent, sinon total_ttc
    target = None
    target_name = None

    if "total_ht" in totals:
        target = to_float_loose(totals["total_ht"]["value"]["norm"])
        target_name = "total_ht"
    elif "total_ttc" in totals:
        target = to_float_loose(totals["total_ttc"]["value"]["norm"])
        target_name = "total_ttc"

    if target is None:
        return {
            "status": "REVIEW",
            "reasons": ["NO_TOTAL_FIELD_FOUND"],
            "rule_id": RULES["po.validate_totals"]
        }

    diff = abs(sum_lines - target)
    if diff > tolerance:
        reasons.append(f"TOTAL_MISMATCH({target_name}): sum_lines={sum_lines:.2f} target={target:.2f} diff={diff:.2f}")

    return {
        "status": "PASS" if len(reasons) == 0 else "REVIEW",
        "reasons": reasons,
        "rule_id": RULES["po.validate_totals"],
        "details": {"sum_lines": sum_lines, "target": target, "target_name": target_name, "tolerance": tolerance}
    }

# -------------------------
# Router extraction selon doc_type (déterministe)
# -------------------------
def extract_fields_for_doc(doc_type: str, context: str) -> Dict[str, Any]:
    dt = (doc_type or "").upper().strip()
    if dt == "FACTURE":
        return {"invoice": extract_invoice_fields(context)}
    if dt in ("BON_DE_COMMANDE", "PO", "PURCHASE_ORDER"):
        po = extract_po_line_items(context)
        po_validation = validate_totals(po)
        po["validation"] = po_validation
        return {"po": po}
    # fallback: rien
    return {}

# -------------------------
# Build answer
# -------------------------
def build_answer(query: str, hits: List[Dict[str, Any]], mode: str, doc_type_hint: Optional[str]) -> Dict[str, Any]:
    if not hits:
        return {"answer": "Je n’ai trouvé aucun passage pertinent dans le corpus.", "fields": {}, "audit": {}}

    context = "\n".join([h["excerpt"] for h in hits])

    extracted = extract_fields_for_doc(doc_type_hint or "", context)

    # FACTURE -> format réponse simple
    if "invoice" in extracted and extracted["invoice"]:
        inv = extracted["invoice"]
        lines = []
        # champs si présents
        if "entreprise_nom" in inv:
            v = inv["entreprise_nom"]["value"]
            lines.append(f"Entreprise : {v['raw']}")
        if "montant_a_payer_ttc" in inv:
            v = inv["montant_a_payer_ttc"]["value"]
            lines.append(f"Montant à payer TTC : {v['raw']} (norm={v['norm']})")
        if "timbre" in inv:
            v = inv["timbre"]["value"]
            lines.append(f"Timbre : {v['raw']} (norm={v['norm']})")
        if "date" in inv:
            d = inv["date"]["value"]
            if d.get("iso"):
                lines.append(f"Date : {d['raw']} (iso={d['iso']})")
            else:
                lines.append(f"Date : {d['raw']} (iso=UNKNOWN)")
        if "numero_facture" in inv:
            lines.append(f"Numéro de facture : {inv['numero_facture']['value']['raw']}")

        audit = {k: {"rule_id": inv[k]["rule_id"], "evidence": inv[k].get("evidence")} for k in inv.keys()}
        return {"answer": "\n".join(lines) if lines else "Champs facture non trouvés.", "fields": extracted, "audit": audit}

    # PO -> format réponse simple
    if "po" in extracted and extracted["po"]:
        po = extracted["po"]
        items = po.get("items", [])
        val = po.get("validation", {})
        lines = []
        lines.append(f"Lignes articles détectées : {len(items)}")
        # preview top 5
        for it in items[:5]:
            desc = it["description"]["value"]["raw"]
            qty = it["qty"]["value"]["raw"]
            pu = it["unit_price"]["value"]["raw"]
            tot = it["line_total"]["value"]["raw"]
            lines.append(f"- {desc} | qty={qty} | pu={pu} | total={tot}")

        if val:
            lines.append(f"Vérif totaux : {val.get('status')} (rule_id={val.get('rule_id')})")
            for r in val.get("reasons", []):
                lines.append(f"  - {r}")

        audit = {
            "po": {
                "line_item_rule_id": RULES["po.line_item"],
                "validate_totals_rule_id": RULES["po.validate_totals"]
            }
        }
        return {"answer": "\n".join(lines), "fields": extracted, "audit": audit}

    # fallback extractif
    n = 3 if mode == "fast" else 5
    answer = "Passages pertinents trouvés :\n" + "\n".join(
        [f"- {h['excerpt'][:220]}..." for h in hits[:n]]
    )
    return {"answer": answer, "fields": {}, "audit": {}}

def ask(
    query: str,
    mode: str = "fast",
    filters: Optional[Dict[str, Any]] = None,
    top_k_fast: int = 5,
    top_k_normal: int = 10
) -> Dict[str, Any]:
    mode = (mode or "fast").lower().strip()
    top_k = top_k_fast if mode == "fast" else top_k_normal

    hits = retrieve(query, top_k=top_k, filters=filters)

    # doc_type_hint (déterministe) : si filter doc_type présent, on l'utilise
    doc_type_hint = None
    if filters and "doc_type" in filters:
        dt = filters["doc_type"]
        if isinstance(dt, str):
            doc_type_hint = dt
        elif isinstance(dt, list) and dt:
            doc_type_hint = dt[0]

    built = build_answer(query, hits, mode, doc_type_hint=doc_type_hint)

    return {
        "query": query,
        "mode": mode,
        "filters_applied": filters or {},
        "answer": built["answer"],
        "fields": built["fields"],
        "audit": built["audit"],      # Rule IDs + evidence
        "sources": hits               # citations: doc_id/page/chunk_id/excerpt/score
    }

# -------------------------
# Tests
# -------------------------
q1 = "Quel est le montant TTC à payer ?"
resp1 = ask(q1, mode="fast", filters={"doc_type": "FACTURE"})
print(json.dumps(resp1, ensure_ascii=False, indent=2))

q2 = "Quel est le nom de l'entreprise ?"
resp2 = ask(q2, mode="fast", filters={"doc_type": "FACTURE"})
print(json.dumps(resp2, ensure_ascii=False, indent=2))

# PO (si tu as des docs classés BON_DE_COMMANDE dans CORPUS)
q3 = "Donne les lignes d'articles et vérifie les totaux"
resp3 = ask(q3, mode="fast", filters={"doc_type": "BON_DE_COMMANDE"})
print(json.dumps(resp3, ensure_ascii=False, indent=2))


{
  "query": "Quel est le montant TTC à payer ?",
  "mode": "fast",
  "filters_applied": {
    "doc_type": "FACTURE"
  },
  "answer": "Entreprise : sarl el hana 16000 alger centre iroute de bejaia setif tel\nMontant à payer TTC : 182,006.00 (norm=182006.00)\nTimbre : 1,802.00 (norm=1802.00)\nDate : 4/20/2016 (iso=2016-04-20)",
  "fields": {
    "invoice": {
      "montant_a_payer_ttc": {
        "value": {
          "raw": "182,006.00",
          "norm": "182006.00"
        },
        "rule_id": "R_INV_001_TTC",
        "evidence": "montant a payer ttc 182,006.00 "
      },
      "timbre": {
        "value": {
          "raw": "1,802.00",
          "norm": "1802.00"
        },
        "rule_id": "R_INV_003_TIMBRE",
        "evidence": "rimbre 1,802.00 "
      },
      "date": {
        "value": {
          "raw": "4/20/2016",
          "iso": "2016-04-20"
        },
        "rule_id": "R_INV_005_DATE",
        "evidence": "4/20/2016"
      },
      "entreprise_nom": {
        "value": 