# STEP 01 : Chargement des documents

Accepter les fichiers aux formats image (JPG, PNG, TIFF).

Convertir les pages PDF en images si nécessaire.

In [None]:
import os
import sys
from collections import Counter
from pathlib import Path

try:
    from pdf2image import convert_from_path
except ImportError:
    print("Please install pdf2image (pip install pdf2image) and poppler-utils in your system.")
    sys.exit(1)

# File extensions (lower-case, with the leading dot) grouped by document kind.
# NOTE(review): neither set is referenced by the functions visible in this
# cell; the PDF check further down uses the literal '.pdf' instead.
SUPPORTED_IMAGES = {'.jpg', '.jpeg', '.png', '.tiff', '.tif'}
SUPPORTED_PDF = {'.pdf'}


def count_file_types(folder_path):
    """
    Tally the regular files directly inside ``folder_path`` by extension.

    Extensions are lower-cased and keep their leading dot (``'.jpg'``).
    Sub-directories are not descended into, and entries without any
    extension are left out of the tally.

    Returns:
        A ``Counter`` mapping each extension to the number of files seen.
    """
    return Counter(
        entry.suffix.lower()
        for entry in Path(folder_path).iterdir()
        if entry.is_file() and entry.suffix
    )


def convert_pdfs_to_images(folder_path, output_folder=None, dpi=200):
    """
    Convert every page of every PDF directly inside ``folder_path`` to PNG.

    Each PDF gets its own sub-folder (named after the PDF's stem) under the
    output base, containing ``<stem>_page_<n>.png`` files numbered from 1.

    Args:
        folder_path: Directory holding the PDF files (not searched
            recursively).
        output_folder: Base directory for the images. When falsy, a
            ``pdf_images`` sub-folder of ``folder_path`` is used.
        dpi: Rendering resolution forwarded to ``convert_from_path``.
    """
    folder = Path(folder_path)
    out_base = Path(output_folder) if output_folder else folder / 'pdf_images'
    # parents=True so an output path whose parent folders do not exist yet
    # is created instead of raising FileNotFoundError.
    out_base.mkdir(parents=True, exist_ok=True)

    # Compare the lower-cased suffix rather than globbing '*.pdf': the glob is
    # case-sensitive on POSIX and would skip 'SCAN.PDF'. Sorting keeps the
    # processing order reproducible.
    pdf_files = sorted(
        entry for entry in folder.iterdir()
        if entry.is_file() and entry.suffix.lower() == '.pdf'
    )

    for pdf_file in pdf_files:
        doc_name = pdf_file.stem
        pages = convert_from_path(str(pdf_file), dpi=dpi)
        doc_out = out_base / doc_name
        doc_out.mkdir(exist_ok=True)
        for i, page in enumerate(pages, start=1):
            out_path = doc_out / f"{doc_name}_page_{i}.png"
            page.save(out_path, 'PNG')
        print(f"Converted {pdf_file.name} ({len(pages)} pages) to images in {doc_out}")



# Input and output locations for this run; edit them to match your machine.
folder_path = r"C:\Users\User\Desktop\university\S2\EXTRACTION\batch_1"
output_folder = r"C:\Users\User\Desktop\university\S2\EXTRACTION\output"  # None falls back to <folder_path>/pdf_images

# Report how many files of each extension the input folder contains.
counts = count_file_types(folder_path)
print("File type counts:")
for ext, num in counts.items():
    print(f"  {ext}: {num}")

# Run the PDF conversion only when at least one '.pdf' file was counted.
pdf_count = counts.get('.pdf', 0)
if not pdf_count:
    print("\nNo PDF files to convert.")
else:
    print(f"\nConverting {pdf_count} PDF file(s) to images...")
    convert_pdfs_to_images(folder_path, output_folder)


File type counts:
  .jpg: 1489

No PDF files to convert.


# STEP 02 : Prétraitement d’image

Appliquer des filtres (binarisation, nettoyage du bruit).

Améliorer la lisibilité pour optimiser l’OCR.

In [17]:
import cv2
import numpy as np
from pathlib import Path

# Image extensions accepted for preprocessing (lower-case, with leading dot);
# preprocess_folder compares each file's lower-cased suffix against this set.
SUPPORTED_IMAGES = {'.jpg', '.jpeg', '.png', '.tiff', '.tif'}

def preprocess_image(in_path, out_path):
    """
    Binarise and denoise one image, then write the result to ``out_path``.

    The image is loaded as grayscale, thresholded adaptively, cleaned with a
    morphological opening and saved. No rotation or deskewing is applied.

    Args:
        in_path: Path of the image to read.
        out_path: Path where the processed image is written; its parent
            directory must already exist.

    Returns:
        True when the processed image was written, False when the input
        could not be read or the output could not be written (a message is
        printed in both failure cases).
    """
    # 1. Load as grayscale; imread returns None instead of raising on failure.
    img = cv2.imread(str(in_path), cv2.IMREAD_GRAYSCALE)
    if img is None:
        print(f"Erreur de lecture: {in_path}")
        return False

    # 2. Adaptive (Gaussian-weighted) binarisation over 35x35 neighbourhoods.
    bin_img = cv2.adaptiveThreshold(
        img, 255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        blockSize=35,
        C=10
    )

    # 3. Noise clean-up: one morphological opening with a 2x2 rectangle.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    cleaned = cv2.morphologyEx(bin_img, cv2.MORPH_OPEN, kernel, iterations=1)

    # 4. Save. imwrite reports failure through its boolean return value, so
    #    check it rather than announcing a success that did not happen.
    if not cv2.imwrite(str(out_path), cleaned):
        print(f"Erreur d'écriture: {out_path}")
        return False
    print(f"Prétraitée et sauvegardée: {out_path}")
    return True

def preprocess_folder(input_folder, output_folder):
    """
    Preprocess every supported image found under ``input_folder``.

    The tree is walked recursively; each entry whose lower-cased suffix is
    in ``SUPPORTED_IMAGES`` is handed to ``preprocess_image`` and written
    under ``output_folder`` at the same relative path, so the directory
    layout is mirrored.
    """
    src_root = Path(input_folder)
    dst_root = Path(output_folder)
    for candidate in src_root.rglob('*'):
        if candidate.suffix.lower() not in SUPPORTED_IMAGES:
            continue
        destination = dst_root / candidate.relative_to(src_root)
        # Create the mirrored sub-directory before the image is written.
        destination.parent.mkdir(parents=True, exist_ok=True)
        preprocess_image(candidate, destination)

if __name__ == "__main__":
    # Example directories; adapt them to your machine.
    # folder_in is read recursively, folder_out receives the processed copies.
    folder_in  = r"C:\Users\User\Desktop\university\S2\EXTRACTION\batch_1"
    folder_out = r"C:\Users\User\Desktop\university\S2\EXTRACTION\output2"

    preprocess_folder(folder_in, folder_out)


Prétraitée et sauvegardée: C:\Users\User\Desktop\university\S2\EXTRACTION\output2\invoice_0_color_B_248.pdf().jpg
Prétraitée et sauvegardée: C:\Users\User\Desktop\university\S2\EXTRACTION\output2\invoice_1.pdf0.jpg
Prétraitée et sauvegardée: C:\Users\User\Desktop\university\S2\EXTRACTION\output2\invoice_10.pdf0.jpg
Prétraitée et sauvegardée: C:\Users\User\Desktop\university\S2\EXTRACTION\output2\invoice_100.pdf0.jpg
Prétraitée et sauvegardée: C:\Users\User\Desktop\university\S2\EXTRACTION\output2\invoice_100_color_B_242.pdf0.jpg
Prétraitée et sauvegardée: C:\Users\User\Desktop\university\S2\EXTRACTION\output2\invoice_101.pdf0.jpg
Prétraitée et sauvegardée: C:\Users\User\Desktop\university\S2\EXTRACTION\output2\invoice_101_color_B_245.pdf0.jpg
Prétraitée et sauvegardée: C:\Users\User\Desktop\university\S2\EXTRACTION\output2\invoice_102.pdf0.jpg
Prétraitée et sauvegardée: C:\Users\User\Desktop\university\S2\EXTRACTION\output2\invoice_102_color_B_240.pdf0.jpg
Prétraitée et sauvegardée: C:

# STEP 03 : Extraction du texte avec EASYOCR

Utiliser un OCR pour l’extraction de caractéristiques visuelles de l’image et obtenir le texte brut présent dans le document.

In [21]:
import cv2
import easyocr
from pathlib import Path

# 1. Configure EasyOCR reader (adjust languages as needed)
#    Make sure you have installed easyocr, torch and opencv-python:
#      pip install --user easyocr torch torchvision opencv-python
#    The reader is created once at module level and reused by
#    ocr_image_easyocr for every image; gpu=False runs it on the CPU.
reader = easyocr.Reader(['en', 'fr'], gpu=False)  # change ['en','fr'] to your languages

# 2. Supported image extensions (lower-case, with leading dot); ocr_folder
#    compares each file's lower-cased suffix against this set.
SUPPORTED_IMAGES = {'.jpg', '.jpeg', '.png', '.tiff', '.tif'}

def ocr_image_easyocr(image_path):
    """
    Extract the text of one image with the module-level EasyOCR ``reader``.

    Returns the recognised strings joined by newlines. If the image cannot
    be loaded, a warning is printed and an empty string is returned.
    """
    image = cv2.imread(str(image_path))
    if image is not None:
        # detail=0 makes readtext return plain strings.
        fragments = reader.readtext(image, detail=0)
        return "\n".join(fragments)

    print(f"[!] Could not read image: {image_path}")
    return ""

def ocr_folder(input_folder, output_folder):
    """
    OCR every supported image under ``input_folder`` into text files.

    The input tree is walked recursively. For each entry whose lower-cased
    suffix is in ``SUPPORTED_IMAGES``, the text returned by
    ``ocr_image_easyocr`` is written (UTF-8) to ``output_folder`` at the same
    relative path with the extension replaced by ``.txt``.
    ``output_folder`` is created even when no image is found.
    """
    src_root = Path(input_folder)
    dst_root = Path(output_folder)
    dst_root.mkdir(parents=True, exist_ok=True)

    for candidate in src_root.rglob('*'):
        if candidate.suffix.lower() not in SUPPORTED_IMAGES:
            continue

        # Mirror the relative location, swapping the image suffix for .txt.
        destination = dst_root / candidate.relative_to(src_root).with_suffix('.txt')
        destination.parent.mkdir(parents=True, exist_ok=True)

        print(f"OCR → {candidate} → {destination}")
        destination.write_text(ocr_image_easyocr(candidate), encoding='utf-8')

if __name__ == "__main__":
    # --- Edit these paths to match your setup ---
    # preprocessed_folder is walked recursively for images to OCR;
    # ocr_output_folder receives one .txt file per image.
    preprocessed_folder = r"C:\Users\User\Desktop\university\S2\EXTRACTION\output2"
    ocr_output_folder  = r"C:\Users\User\Desktop\university\S2\EXTRACTION\output3_easyocr"

    ocr_folder(preprocessed_folder, ocr_output_folder)


Using CPU. Note: This module is much faster with a GPU.
Downloading detection model, please wait. This may take several minutes depending upon your network connection.


Progress: |██████████████████████████████████████████████████| 100.0% Complete

Downloading recognition model, please wait. This may take several minutes depending upon your network connection.


Progress: |██████████████████████████████████████████████████| 100.0% CompleteOCR → C:\Users\User\Desktop\university\S2\EXTRACTION\output2\invoice_0_color_B_248.pdf().jpg → C:\Users\User\Desktop\university\S2\EXTRACTION\output3_easyocr\invoice_0_color_B_248.pdf().txt
OCR → C:\Users\User\Desktop\university\S2\EXTRACTION\output2\invoice_1.pdf0.jpg → C:\Users\User\Desktop\university\S2\EXTRACTION\output3_easyocr\invoice_1.pdf0.txt
OCR → C:\Users\User\Desktop\university\S2\EXTRACTION\output2\invoice_10.pdf0.jpg → C:\Users\User\Desktop\university\S2\EXTRACTION\output3_easyocr\invoice_10.pdf0.txt
OCR → C:\Users\User\Desktop\university\S2\EXTRACTION\output2\invoice_100.pdf0.jpg → C:\Users\User\Desktop\university\S2\EXTRACTION\output3_easyocr\invoice_100.pdf0.txt
OCR → C:\Users\User\Desktop\university\S2\EXTRACTION\output2\invoice_100_color_B_242.pdf0.jpg → C:\Users\User\Desktop\university\S2\EXTRACTION\output3_easyocr\invoice_100_color_B_242.pdf0.txt
OCR → C:\Users\User\Desktop\university\S2\