### Pipeline OCR (Tesseract + OpenCV)

In [1]:
#!/usr/bin/env python3
"""
Auto OCR helper wrapping Tesseract with accuracy-focused options.

Behaviors:
- Auto mode (no args): scans this script's directory for images/PDFs and OCRs them (no user input).
- Manual mode: pass a single image/PDF plus an output base; retains Tesseract-like switches.

Accuracy aids:
- Preprocessing: grayscale, optional upscaling, brightness/contrast/sharpness tuning, optional binarization.
- Tesseract tuning: DPI hint, tessdata dir override, user words/patterns, char white/blacklists, OEM/PSM, extra -c flags.

Outputs:
- Per-input text files saved to ./ocr_output with source name + method noted.
- Progress/warnings logged to stderr.

Dependencies:
  * Python 3.8+
  * pytesseract
  * pillow
  * Tesseract binary with tessdata
Optional for PDFs:
  * pdf2image + poppler (for scanned/image PDFs)
  * PyPDF2 (for digital-text PDFs)
"""

import argparse
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, List, Optional, Tuple

import pytesseract
from pytesseract import Output
from PIL import Image, ImageEnhance, ImageFilter, ImageOps

# Text recognised by the most recent INPUT_FILE run; main() assigns it so that
# later notebook cells can reuse the OCR result.
OCR_TEXT = None

# Optional dependencies: each name is bound to None when the package is
# missing so callers can test for availability instead of failing at import.
try:
    import numpy as np  # type: ignore
except ImportError:  # pragma: no cover
    np = None

try:
    import pdf2image  # type: ignore
except ImportError:  # pragma: no cover
    pdf2image = None

try:
    import PyPDF2  # type: ignore
except ImportError:  # pragma: no cover
    PyPDF2 = None


try:
    SCRIPT_DIR = Path(__file__).resolve().parent
except NameError:
    # In notebooks __file__ is undefined; fall back to current working directory.
    SCRIPT_DIR = Path.cwd()
# Auto mode writes its per-file text reports here.
OUTPUT_DIR = SCRIPT_DIR / "ocr_output"
# Lower-cased file suffixes used to route inputs to the image or PDF handler.
IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".webp"}
PDF_EXTS = {".pdf"}
# Defaults shared by the CLI parser and EnhanceOptions.
DEFAULT_FORMATS = ["text"]
DEFAULT_LANG = "eng"
DEFAULT_CONTRAST = 1.5
DEFAULT_SHARPNESS = 1.2
DEFAULT_BRIGHTNESS = 1.0
DEFAULT_UPSCALE = 1.5
DEFAULT_DPI = 300
# Notebook-friendly: set this to a filename to OCR and print results to stdout (no file writes).
# Example: "image2tab.webp". Leave as None to keep current behavior (CLI/manual or auto-scan).
INPUT_FILE: Optional[str] = "image2tab.webp"


@dataclass
class EnhanceOptions:
    """Preprocessing switches consumed by preprocess_image().

    Multiplier fields left at 1.0 (or 0/None) disable the matching step; a fixed
    ``threshold`` takes precedence over ``otsu`` when both are set.
    """

    contrast: float = DEFAULT_CONTRAST  # ImageEnhance.Contrast factor
    sharpness: float = DEFAULT_SHARPNESS  # ImageEnhance.Sharpness factor
    brightness: float = DEFAULT_BRIGHTNESS  # ImageEnhance.Brightness factor
    upscale: float = DEFAULT_UPSCALE  # resize factor applied to width and height
    gamma: Optional[float] = None  # gamma correction; intended: <1 brightens darks, >1 darkens
    pad: int = 0  # pixels to pad around the image
    median: Optional[int] = None  # kernel size for median filter (odd int, e.g., 3); even or <=1 is ignored
    unsharp_radius: Optional[float] = None  # e.g., 1.0
    unsharp_percent: int = 150  # strength of the unsharp mask; only used when unsharp_radius is set
    invert: bool = False  # invert grey levels before any other adjustment
    autocontrast_cutoff: Optional[int] = None  # 0-100; percentage to clip for autocontrast
    equalize: bool = False  # histogram equalization
    auto_rotate: bool = False  # attempt orientation detection + rotate
    otsu: bool = False  # auto-threshold with Otsu (requires numpy)
    threshold: Optional[int] = None  # 0-255; if set, applies a binary threshold


def build_config(
    oem: Optional[int],
    psm: Optional[int],
    base_flags: Iterable[str],
    dpi: Optional[int],
    tessdata_dir: Optional[Path],
    user_words: Optional[Path],
    user_patterns: Optional[Path],
) -> str:
    """Assemble the Tesseract ``config`` string handed to pytesseract.

    Options whose value is None are omitted. Numeric options are emitted bare,
    path options are wrapped in double quotes, and ``base_flags`` are appended
    verbatim at the end. Returns the space-joined result ("" when nothing is set).
    """
    bare_options = (("--oem", oem), ("--psm", psm), ("--dpi", dpi))
    quoted_options = (
        ("--tessdata-dir", tessdata_dir),
        ("--user-words", user_words),
        ("--user-patterns", user_patterns),
    )
    tokens: List[str] = [
        f"{flag} {value}" for flag, value in bare_options if value is not None
    ]
    # Paths are quoted so that directories containing spaces survive splitting.
    tokens += [
        f'{flag} "{value}"' for flag, value in quoted_options if value is not None
    ]
    tokens += base_flags
    return " ".join(tokens)


def ensure_environment(lang: str) -> None:
    """Check that Tesseract is callable and warn about missing language packs.

    Exits the process when the Tesseract binary cannot be found. When ``lang``
    is non-empty, each "+"-separated language is compared with the installed
    ones and a warning listing the missing entries is written to stderr.
    """
    try:
        # Only the side effect matters here: the call raises if the binary is absent.
        pytesseract.get_tesseract_version()
    except pytesseract.TesseractNotFoundError:
        sys.exit("Tesseract binary not found on PATH. Install it and its language data.")
    if not lang:
        return
    try:
        available = set(pytesseract.get_languages(config=""))
    except pytesseract.TesseractError:
        # Listing languages can fail if tessdata is not configured; continue anyway.
        return
    missing = set(lang.split("+")) - available
    if missing:
        print(
            f"Warning: missing languages: {', '.join(sorted(missing))}. "
            f"Available: {', '.join(sorted(available))}",
            file=sys.stderr,
        )


def save_bytes(content: bytes, path: Path) -> None:
    """Write ``content`` to ``path`` as binary data, creating missing parent folders."""
    folder = path.parent
    folder.mkdir(parents=True, exist_ok=True)
    with path.open("wb") as handle:
        handle.write(content)


def save_text(content: str, path: Path) -> None:
    """Write ``content`` to ``path`` as UTF-8 text, creating missing parent folders."""
    folder = path.parent
    folder.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(content)


def auto_rotate_if_needed(img: Image.Image, enhance: EnhanceOptions) -> Image.Image:
    """Rotate ``img`` according to Tesseract's orientation detection, if enabled.

    Returns ``img`` unchanged when ``enhance.auto_rotate`` is off, when OSD fails,
    when no parsable "Rotate:" line is reported, or when the reported angle is a
    multiple of 360. Otherwise returns a rotated copy (canvas expanded to fit).
    """
    if not enhance.auto_rotate:
        return img
    try:
        osd = pytesseract.image_to_osd(img)
        angle = None
        for line in osd.splitlines():
            if line.lower().startswith("rotate:"):
                try:
                    angle = int(line.split(":")[1].strip())
                except ValueError:
                    angle = None
                break
        if angle is not None and angle % 360 != 0:
            # PIL rotates counter-clockwise for positive angles, so the negated
            # value turns the image clockwise by the reported amount.
            return img.rotate(-angle, expand=True)
    except Exception as exc:
        # Orientation detection is best-effort (it commonly fails on images with
        # little text); report it instead of hiding it and carry on unrotated.
        print(
            f"[warn] Orientation detection failed; image left unrotated: {exc}",
            file=sys.stderr,
        )
    return img


def _otsu_threshold(hist: List[int]) -> int:
    """Return the Otsu threshold for a 256-bin grey-level histogram.

    Scans every candidate level and keeps the one that maximises the
    between-class variance of "pixels <= level" versus "pixels > level".
    """
    total = sum(hist)
    sum_total = sum(level * count for level, count in enumerate(hist))
    sum_b = 0.0
    w_b = 0.0
    max_var = 0.0
    threshold = 0
    for level, count in enumerate(hist):
        w_b += count
        if w_b == 0:
            continue  # no background pixels yet
        w_f = total - w_b
        if w_f == 0:
            break  # every pixel is already in the background class
        sum_b += level * count
        m_b = sum_b / w_b
        m_f = (sum_total - sum_b) / w_f
        var_between = w_b * w_f * (m_b - m_f) ** 2
        if var_between > max_var:
            max_var = var_between
            threshold = level
    return threshold


def preprocess_image(image: Image.Image, enhance: EnhanceOptions) -> Image.Image:
    """Return a greyscale copy of ``image`` prepared for OCR.

    Steps run in a fixed order, each only when its option is set: auto-rotate,
    invert, pad, autocontrast, equalize, upscale, gamma, brightness, contrast,
    sharpness, unsharp mask, median filter, then binarisation (a fixed
    ``threshold`` wins over ``otsu``). The input image is not modified.
    """
    img = image.convert("L")
    img = auto_rotate_if_needed(img, enhance)
    if enhance.invert:
        img = ImageOps.invert(img)
    if enhance.pad and enhance.pad > 0:
        img = ImageOps.expand(img, border=enhance.pad, fill=255)
    if enhance.autocontrast_cutoff is not None:
        cutoff = max(0, min(100, enhance.autocontrast_cutoff))
        img = ImageOps.autocontrast(img, cutoff=cutoff)
    if enhance.equalize:
        img = ImageOps.equalize(img)
    # A zero or negative factor has no meaning; skip instead of failing in resize().
    if enhance.upscale and enhance.upscale > 0 and enhance.upscale != 1.0:
        w, h = img.size
        new_size = (
            max(1, int(w * enhance.upscale)),
            max(1, int(h * enhance.upscale)),
        )
        img = img.resize(new_size, Image.LANCZOS)
    if enhance.gamma and enhance.gamma > 0 and enhance.gamma != 1.0:
        # out = 255 * (in / 255) ** gamma, so gamma < 1 lifts dark tones and
        # gamma > 1 darkens, matching the documented option semantics. The
        # previous 1/gamma exponent produced the opposite effect.
        lut = [round(255 * (level / 255.0) ** enhance.gamma) for level in range(256)]
        img = img.point(lut)
    if enhance.brightness and enhance.brightness != 1.0:
        img = ImageEnhance.Brightness(img).enhance(enhance.brightness)
    if enhance.contrast and enhance.contrast != 1.0:
        img = ImageEnhance.Contrast(img).enhance(enhance.contrast)
    if enhance.sharpness and enhance.sharpness != 1.0:
        img = ImageEnhance.Sharpness(img).enhance(enhance.sharpness)
    if enhance.unsharp_radius:
        img = img.filter(
            ImageFilter.UnsharpMask(
                radius=enhance.unsharp_radius,
                percent=enhance.unsharp_percent,
                threshold=0,
            )
        )
    # The median filter is applied only for odd kernel sizes greater than 1.
    if enhance.median and enhance.median > 1 and enhance.median % 2 == 1:
        img = img.filter(ImageFilter.MedianFilter(size=enhance.median))
    if enhance.threshold is not None:
        thr = max(0, min(255, enhance.threshold))
        img = img.point(lambda p, t=thr: 255 if p > t else 0, mode="1").convert("L")
    elif enhance.otsu:
        if np is None:
            # Previously the option was dropped without any notice.
            print(
                "[warn] Otsu thresholding requested but numpy is not installed; skipping.",
                file=sys.stderr,
            )
        else:
            arr = np.array(img, dtype=np.uint8)
            hist, _ = np.histogram(arr, bins=256, range=(0, 256))
            cut = _otsu_threshold(hist.tolist())
            img = img.point(lambda p, t=cut: 255 if p > t else 0, mode="1").convert("L")
    return img


def load_preprocessed_image(path: Path, enhance: EnhanceOptions) -> Image.Image:
    """Open the image at ``path`` and return its preprocessed greyscale copy.

    The file is opened in a ``with`` block so its handle is released as soon as
    preprocessing is done; preprocess_image() starts with ``convert("L")``, so
    the returned image does not depend on the open file.
    """
    with Image.open(path) as image:
        return preprocess_image(image, enhance)


def ocr_image_file(path: Path, lang: str, config: str, enhance: EnhanceOptions) -> str:
    """Preprocess the image at ``path`` and return the text Tesseract reads from it."""
    return pytesseract.image_to_string(
        load_preprocessed_image(path, enhance),
        lang=lang,
        config=config,
    )


def ocr_pdf_file(path: Path, lang: str, config: str, enhance: EnhanceOptions) -> Tuple[str, str]:
    """Extract text from a PDF and report which method produced it.

    Rasterisation plus OCR (pdf2image + pytesseract) is tried first; if it is
    unavailable, fails, or yields no pages, embedded text is read with PyPDF2.
    Failures of either handler are logged to stderr as warnings.

    Returns:
        ``(text, method)`` where ``text`` holds one "--- Page N (...) ---"
        section per page and ``method`` names the handler that succeeded.

    Raises:
        RuntimeError: when neither handler produced any page.
    """
    page_text: List[str] = []
    method_used = ""
    if pdf2image is not None:
        # Collect into a scratch list so a failure half-way through the document
        # cannot leave partial pages behind that would suppress the fallback and
        # be returned with an empty method label.
        ocr_pages: List[str] = []
        try:
            images = pdf2image.convert_from_path(str(path))
            for idx, img in enumerate(images, start=1):
                prepped = preprocess_image(img, enhance)
                text = pytesseract.image_to_string(prepped, lang=lang, config=config)
                ocr_pages.append(f"--- Page {idx} (image OCR) ---\n{text}")
        except Exception as exc:  # pragma: no cover
            print(f"[warn] PDF rasterization failed for {path.name}: {exc}", file=sys.stderr)
        else:
            if ocr_pages:
                page_text = ocr_pages
                method_used = "pdf2image + pytesseract"
    if not page_text and PyPDF2 is not None:
        text_pages: List[str] = []
        try:
            with path.open("rb") as fh:
                reader = PyPDF2.PdfReader(fh)
                for idx, page in enumerate(reader.pages, start=1):
                    text = page.extract_text() or ""
                    text_pages.append(f"--- Page {idx} (PyPDF2 text) ---\n{text}")
        except Exception as exc:  # pragma: no cover
            print(f"[warn] PyPDF2 text extraction failed for {path.name}: {exc}", file=sys.stderr)
        else:
            if text_pages:
                page_text = text_pages
                method_used = "PyPDF2"
    if not page_text:
        raise RuntimeError(
            "No PDF handler succeeded; install pdf2image (with poppler) for scanned PDFs "
            "or PyPDF2 for text PDFs."
        )
    return "\n".join(page_text), method_used


def process_file(
    path: Path,
    lang: str,
    oem: Optional[int],
    psm: Optional[int],
    config_flags: Iterable[str],
    enhance: EnhanceOptions,
    dpi: Optional[int],
    tessdata_dir: Optional[Path],
    user_words: Optional[Path],
    user_patterns: Optional[Path],
) -> Path:
    """OCR one image or PDF and write a text report into OUTPUT_DIR.

    The report is named after the input's stem and starts with the source file
    name and the extraction method. Returns the path of the written report.

    Raises:
        ValueError: when the file suffix is neither a known image nor PDF type.
    """
    extension = path.suffix.lower()
    tess_config = build_config(
        oem, psm, config_flags, dpi, tessdata_dir, user_words, user_patterns
    )
    if extension in PDF_EXTS:
        text, method = ocr_pdf_file(path, lang, tess_config, enhance)
    elif extension in IMAGE_EXTS:
        text, method = ocr_image_file(path, lang, tess_config, enhance), "pytesseract"
    else:
        raise ValueError(f"Unsupported file type: {path.name}")

    # NOTE(review): inputs that share a stem (e.g. scan.png and scan.pdf) map to
    # the same report file, so the later one overwrites the earlier one.
    output_path = OUTPUT_DIR / f"{path.stem}.txt"
    report = f"Source file: {path.name}\nMethod: {method}\n\n{text}"
    save_text(report, output_path)
    print(f"[ok] {path.name} -> {output_path}", file=sys.stderr)
    return output_path


def ocr_to_text(
    path: Path,
    lang: str,
    oem: Optional[int],
    psm: Optional[int],
    config_flags: Iterable[str],
    enhance: EnhanceOptions,
    dpi: Optional[int],
    tessdata_dir: Optional[Path],
    user_words: Optional[Path],
    user_patterns: Optional[Path],
) -> Tuple[str, str]:
    """OCR ``path`` in memory and return ``(text, method)`` without writing files.

    PDFs go through ocr_pdf_file(); every other suffix is handled as an image.
    """
    tess_config = build_config(
        oem, psm, config_flags, dpi, tessdata_dir, user_words, user_patterns
    )
    is_pdf = path.suffix.lower() in PDF_EXTS
    if not is_pdf:
        return ocr_image_file(path, lang, tess_config, enhance), "pytesseract"
    return ocr_pdf_file(path, lang, tess_config, enhance)


def find_candidate_files(directory: Path) -> List[Path]:
    """Return the sorted image and PDF files located directly inside ``directory``.

    Sub-directories are not descended into; suffix matching is case-insensitive.
    """
    supported = IMAGE_EXTS | PDF_EXTS
    return sorted(
        entry
        for entry in directory.iterdir()
        if entry.is_file() and entry.suffix.lower() in supported
    )


def auto_process(
    directory: Path,
    lang: str,
    oem: Optional[int],
    psm: Optional[int],
    config_flags: Iterable[str],
    enhance: EnhanceOptions,
    dpi: Optional[int],
    tessdata_dir: Optional[Path],
    user_words: Optional[Path],
    user_patterns: Optional[Path],
) -> None:
    """OCR every image/PDF found directly in ``directory`` into OUTPUT_DIR.

    Progress is logged to stderr. A file that fails is reported as skipped and
    processing continues with the next one.
    """
    directory = directory.resolve()
    candidates = find_candidate_files(directory)
    if not candidates:
        print(f"[info] No images or PDFs found in {directory}", file=sys.stderr)
        return
    # The flags are reused for every file; materialise them once so a one-shot
    # iterable (e.g. a generator) is not exhausted after the first file.
    flags = list(config_flags)
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    print(f"[info] Processing {len(candidates)} file(s) in {directory}", file=sys.stderr)
    for path in candidates:
        try:
            process_file(
                path,
                lang,
                oem,
                psm,
                flags,
                enhance,
                dpi,
                tessdata_dir,
                user_words,
                user_patterns,
            )
        except Exception as exc:  # pragma: no cover
            print(f"[warn] Skipped {path.name}: {exc}", file=sys.stderr)


def run_ocr(
    input_path: Path,
    output_base: str,
    lang: str,
    oem: Optional[int],
    psm: Optional[int],
    formats: Iterable[str],
    config_flags: Iterable[str],
    enhance: EnhanceOptions,
    dpi: Optional[int],
    tessdata_dir: Optional[Path],
    user_words: Optional[Path],
    user_patterns: Optional[Path],
) -> None:
    """Manual mode: OCR one input and write ``<output_base>.<ext>`` per format.

    PDFs always produce a single ``.txt`` report (other requested formats are
    ignored with a warning). Images produce one file per requested format:
    text (.txt), tsv (.tsv), hocr (.hocr) or pdf (.pdf).

    Raises:
        SystemExit: when an image is given together with an unsupported format.
    """
    suffix = input_path.suffix.lower()
    config = build_config(oem, psm, config_flags, dpi, tessdata_dir, user_words, user_patterns)
    requested = list(formats)
    if suffix in PDF_EXTS:
        ignored = [fmt for fmt in requested if fmt.lower() != "text"]
        if ignored:
            print(
                f"[warn] PDF input only produces text output; ignoring format(s): {', '.join(ignored)}",
                file=sys.stderr,
            )
        text, method = ocr_pdf_file(input_path, lang, config, enhance)
        save_text(f"Source file: {input_path.name}\nMethod: {method}\n\n{text}", Path(f"{output_base}.txt"))
        return

    # Reject bad formats before any OCR runs, so a typo cannot leave a partial
    # set of output files behind.
    supported = {"text", "tsv", "hocr", "pdf"}
    for fmt in requested:
        if fmt.lower() not in supported:
            raise SystemExit(
                "Unsupported format '{fmt}'. Choose from text, hocr, pdf, tsv.".format(
                    fmt=fmt
                )
            )

    image = load_preprocessed_image(input_path, enhance)

    for fmt in requested:
        fmt_lower = fmt.lower()
        if fmt_lower == "text":
            result = pytesseract.image_to_string(image, lang=lang, config=config)
            save_text(result, Path(f"{output_base}.txt"))
        elif fmt_lower == "tsv":
            result = pytesseract.image_to_data(
                image, lang=lang, config=config, output_type=Output.STRING
            )
            save_text(result, Path(f"{output_base}.tsv"))
        else:
            # "hocr" and "pdf" share one pytesseract call and both return bytes;
            # the format name doubles as the file extension.
            result = pytesseract.image_to_pdf_or_hocr(
                image, lang=lang, config=config, extension=fmt_lower
            )
            save_bytes(result, Path(f"{output_base}.{fmt_lower}"))


def parse_args(argv: Optional[Iterable[str]] = None) -> argparse.Namespace:
    """Parse the OCR helper's command-line options.

    Args:
        argv: explicit argument list. When None, the process arguments
            (``sys.argv[1:]``) are used, except under a Jupyter kernel where an
            empty list is used because ``sys.argv`` then holds kernel arguments.

    Returns:
        The populated argparse namespace.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Python OCR helper: manual mode (pass image + output base) or auto-scan the script directory when no inputs are given."
        )
    )
    parser.add_argument("image", nargs="?", help="Optional: path to a single image or PDF to process.")
    parser.add_argument(
        "output_base",
        nargs="?",
        help="Optional: base path (without extension) for manual mode outputs.",
    )
    parser.add_argument(
        "-l",
        "--lang",
        default=DEFAULT_LANG,
        help="Languages to use (e.g., eng+fra). Defaults to eng.",
    )
    parser.add_argument(
        "--oem",
        type=int,
        choices=range(0, 4),
        # Mode numbering as listed by `tesseract --help-oem`.
        help=(
            "OCR Engine Mode; matches tesseract --oem (0=Legacy, 1=LSTM, "
            "2=Legacy+LSTM, 3=Default)."
        ),
    )
    parser.add_argument(
        "--psm",
        type=int,
        choices=range(0, 14),
        help="Page segmentation mode; matches tesseract --psm options.",
    )
    parser.add_argument(
        "-f",
        "--format",
        nargs="+",
        default=DEFAULT_FORMATS,
        choices=["text", "hocr", "pdf", "tsv"],
        help="Output format(s) for manual mode. Auto mode always writes text.",
    )
    parser.add_argument(
        "--config",
        nargs="*",
        default=[],
        metavar="CFG",
        # An unquoted "-c" would be read by argparse as an unknown option.
        help=(
            "Additional configuration flags passed verbatim to tesseract; quote each "
            "flag as one argument (e.g., \"-c foo=bar\")."
        ),
    )
    parser.add_argument(
        "--scan-dir",
        type=Path,
        default=SCRIPT_DIR,
        help="Directory to auto-scan when no manual inputs are provided.",
    )
    parser.add_argument(
        "--contrast",
        type=float,
        default=DEFAULT_CONTRAST,
        help=f"Contrast multiplier (default {DEFAULT_CONTRAST}).",
    )
    parser.add_argument(
        "--sharpness",
        type=float,
        default=DEFAULT_SHARPNESS,
        help=f"Sharpness multiplier (default {DEFAULT_SHARPNESS}).",
    )
    parser.add_argument(
        "--brightness",
        type=float,
        default=DEFAULT_BRIGHTNESS,
        help=f"Brightness multiplier (default {DEFAULT_BRIGHTNESS}).",
    )
    parser.add_argument(
        "--upscale",
        type=float,
        default=DEFAULT_UPSCALE,
        help=f"Upscale factor before OCR (default {DEFAULT_UPSCALE}).",
    )
    parser.add_argument(
        "--gamma",
        type=float,
        default=None,
        help="Gamma correction (e.g., 0.8 brightens dark input; 1.2 darkens).",
    )
    parser.add_argument(
        "--pad",
        type=int,
        default=0,
        help="Pad image with white border (pixels) before OCR.",
    )
    parser.add_argument(
        "--threshold",
        type=int,
        default=None,
        help="Optional binarization threshold 0-255 (e.g., 150).",
    )
    parser.add_argument(
        "--median",
        type=int,
        default=None,
        help="Median filter size (odd int, e.g., 3) to denoise before OCR.",
    )
    parser.add_argument(
        "--unsharp-radius",
        type=float,
        default=None,
        help="Radius for unsharp mask (e.g., 1.0) to crisp edges.",
    )
    parser.add_argument(
        "--unsharp-percent",
        type=int,
        default=150,
        help="Percent for unsharp mask (default 150).",
    )
    parser.add_argument(
        "--invert",
        action="store_true",
        help="Invert image (useful if text is white on black).",
    )
    parser.add_argument(
        "--autocontrast-cutoff",
        type=int,
        default=None,
        help="Autocontrast cutoff percent (0-100) to clip extremes.",
    )
    parser.add_argument(
        "--equalize",
        action="store_true",
        help="Apply histogram equalization.",
    )
    parser.add_argument(
        "--auto-rotate",
        action="store_true",
        help="Detect orientation with tesseract OSD and rotate automatically.",
    )
    parser.add_argument(
        "--otsu",
        action="store_true",
        help="Apply Otsu automatic thresholding (requires numpy).",
    )
    parser.add_argument(
        "--dpi",
        type=int,
        default=DEFAULT_DPI,
        help=f"DPI hint passed to tesseract (default {DEFAULT_DPI}).",
    )
    parser.add_argument(
        "--tessdata-dir",
        type=Path,
        default=None,
        help="Override tessdata directory (e.g., ./tessdata).",
    )
    parser.add_argument(
        "--user-words",
        type=Path,
        default=None,
        help="Path to user words list (see tesseract --help-extra).",
    )
    parser.add_argument(
        "--user-patterns",
        type=Path,
        default=None,
        help="Path to user patterns list (see tesseract --help-extra).",
    )
    parser.add_argument(
        "--whitelist",
        type=str,
        default=None,
        help="Limit recognition to these characters (tessedit_char_whitelist).",
    )
    parser.add_argument(
        "--blacklist",
        type=str,
        default=None,
        help="Exclude these characters (tessedit_char_blacklist).",
    )
    if argv is None:
        # In notebooks sys.argv contains kernel arguments, so parse nothing there;
        # as a plain script, read the real command line (previously it was
        # always ignored, which made manual mode unreachable from a shell).
        argv = [] if "ipykernel" in sys.modules else sys.argv[1:]
    return parser.parse_args(list(argv))


def main(argv: Optional[Iterable[str]] = None) -> None:
    """Entry point: choose between manual, INPUT_FILE and auto-scan modes.

    * Manual mode (image and output_base given): writes the requested formats.
    * INPUT_FILE mode (module constant set): OCRs that file, stores the text in
      the module-level OCR_TEXT and prints it to stdout; no files are written.
    * Auto mode (otherwise): OCRs every image/PDF found in ``--scan-dir``.

    Exits the process when an image is given without an output base, or when
    INPUT_FILE does not exist.
    """
    global OCR_TEXT

    args = parse_args(argv)
    ensure_environment(args.lang)
    enhance = EnhanceOptions(
        contrast=args.contrast,
        sharpness=args.sharpness,
        brightness=args.brightness,
        upscale=args.upscale,
        gamma=args.gamma,
        pad=args.pad,
        median=args.median,
        unsharp_radius=args.unsharp_radius,
        unsharp_percent=args.unsharp_percent,
        invert=args.invert,
        autocontrast_cutoff=args.autocontrast_cutoff,
        equalize=args.equalize,
        auto_rotate=args.auto_rotate,
        otsu=args.otsu,
        threshold=args.threshold,
    )

    # Build a rich config list leveraging Tesseract options from the repo/manpage.
    config_flags: List[str] = list(args.config)
    if args.whitelist:
        config_flags.append(f"-c tessedit_char_whitelist={args.whitelist}")
    if args.blacklist:
        config_flags.append(f"-c tessedit_char_blacklist={args.blacklist}")

    if args.image and not args.output_base:
        # Previously the given image was silently dropped and another mode ran.
        sys.exit("Manual mode needs both an input path and an output base; only the input path was given.")

    if args.image and args.output_base:
        run_ocr(
            Path(args.image),
            args.output_base,
            args.lang,
            args.oem,
            args.psm,
            args.format,
            config_flags,
            enhance,
            args.dpi,
            args.tessdata_dir,
            args.user_words,
            args.user_patterns,
        )
    elif INPUT_FILE:
        path = Path(INPUT_FILE)
        if not path.exists():
            sys.exit(f"INPUT_FILE not found: {path}")
        print(f"[info] Using INPUT_FILE={path}", file=sys.stderr)
        OCR_TEXT, method = ocr_to_text(
            path,
            args.lang,
            args.oem,
            args.psm,
            config_flags,
            enhance,
            args.dpi,
            args.tessdata_dir,
            args.user_words,
            args.user_patterns,
        )
        print(f"[info] Extraction method: {method}", file=sys.stderr)
        print(OCR_TEXT)
    else:
        print(
            f"[info] Auto-scanning {args.scan_dir.resolve()} for images/PDFs (no user input needed)",
            file=sys.stderr,
        )
        auto_process(
            args.scan_dir,
            args.lang,
            args.oem,
            args.psm,
            config_flags,
            enhance,
            args.dpi,
            args.tessdata_dir,
            args.user_words,
            args.user_patterns,
        )
# Run the helper when this code is executed directly; main() is called without
# an explicit argument list.
if __name__ == "__main__":
    main()


[info] Using INPUT_FILE=image2tab.webp


FACTURE

CODE CLENT NUMERO
Fcoo1 4/20/2016 0002
Ma petite entreprise CLIENT
19, rue de place 1° mai ISARL EL HANA
16000 Alger Centre IROUTE DE BEJAIA SETIF
Tel : 00-00-52-12- 119000
Ident Fiscal : 160
N°Art : 16010000000
Mode de paiement : Espéce
Date Echéance : 5/20/2016
Référence Description Produit Quantité P.Unitaire Valeur
1001 _—‘[Produit 1 1000 1.00 1,000.00
c1002 [Produit 2 1001 2.00 2,002.00
c1003 [Produit 3 1002 3.00 3,006.00
1004 _—s [Produit 4 1003 4.00 4,012.00
c1005__—[ProduitS 1004 5.00 5,020.00
c1006 [Produit 6 1005 6.00 6,030.00
1007 _—‘[Produit 7 1006 11.00 11,066.00
1008 = (Produit 8 1007 118.00 118,826.00
1009 Produit 9 1008 19.00 19,152.00
c1010__—‘[Produit 10 1009 10.00 10,090.00
Non assujetti dlatva [Montant a payer 180,204.00
\Timbre 1,802.00
IMontant a payer ttc 182,006.00

Monatnt Facture en Lettre = Cinq mille huit cent quatre vingt huit Dinars Algériens

Cachet & Signature



### Pipeline SpaCy de base & Tokenisation

In [3]:
import re

import spacy
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

# langdetect is randomised by default; a fixed seed makes the detected language
# (and therefore this cell's output) reproducible between runs.
DetectorFactory.seed = 0

# Load the models only once.
nlp_fr = spacy.load("fr_core_news_sm")
nlp_en = spacy.load("en_core_web_sm")

# OCR_TEXT stays None when the OCR cell did not run in INPUT_FILE mode; fall
# back to an empty string so the split below does not raise.
texte = OCR_TEXT or ""

# Sentence splitter: break on whitespace that follows '.', '!' or '?'.
sent_split = re.compile(r'(?<=[.!?])\s+')

for phrase in sent_split.split(texte):
    phrase = phrase.strip()

    # Skip empty or very short fragments.
    if not phrase or len(phrase) < 20:
        continue

    # Language detection can fail on noisy text (e.g. no usable letters); skip
    # those fragments, but let unrelated errors surface.
    try:
        lang = detect(phrase)
    except LangDetectException:
        continue

    if lang == "fr":
        doc = nlp_fr(phrase)
    elif lang == "en":
        doc = nlp_en(phrase)
    else:
        continue

    print("\nPhrase :", phrase)
    print("Langue :", lang)
    print("Tokens :", [t.text for t in doc])



Phrase : FACTURE

CODE CLENT NUMERO
Fcoo1 4/20/2016 0002
Ma petite entreprise CLIENT
19, rue de place 1° mai ISARL EL HANA
16000 Alger Centre IROUTE DE BEJAIA SETIF
Tel : 00-00-52-12- 119000
Ident Fiscal : 160
N°Art : 16010000000
Mode de paiement : Espéce
Date Echéance : 5/20/2016
Référence Description Produit Quantité P.Unitaire Valeur
1001 _—‘[Produit 1 1000 1.00 1,000.00
c1002 [Produit 2 1001 2.00 2,002.00
c1003 [Produit 3 1002 3.00 3,006.00
1004 _—s [Produit 4 1003 4.00 4,012.00
c1005__—[ProduitS 1004 5.00 5,020.00
c1006 [Produit 6 1005 6.00 6,030.00
1007 _—‘[Produit 7 1006 11.00 11,066.00
1008 = (Produit 8 1007 118.00 118,826.00
1009 Produit 9 1008 19.00 19,152.00
c1010__—‘[Produit 10 1009 10.00 10,090.00
Non assujetti dlatva [Montant a payer 180,204.00
\Timbre 1,802.00
IMontant a payer ttc 182,006.00

Monatnt Facture en Lettre = Cinq mille huit cent quatre vingt huit Dinars Algériens

Cachet & Signature
Langue : fr
Tokens : ['FACTURE', '\n\n', 'CODE', 'CLENT', 'NUMERO', '\n', 'F