Bibliothèques à installer 

In [None]:
import sys
!{sys.executable} -m pip install git+https://github.com/kermitt2/grobid_client_python.git

In [None]:
!pip install grobid-client-python
!pip install requests
!pip install ocrmypdf
!pip install PyPDF2
!pip install grobid-client
!pip install beautifulsoup4
!pip install PyMuPDF


Pour convertir les PDF scannés en PDF interrogeables (ajout d'une couche de texte par OCR)

In [None]:
import os
import ocrmypdf
from PyPDF2 import PdfReader

def pdf_has_text(pdf_path, threshold=10):
    """Report whether a PDF already yields extractable text.

    Pages are visited in order and the lengths of their extracted text are
    summed. The function answers ``True`` as soon as the running total
    exceeds ``threshold``, so the remaining pages are not examined.

    Args:
        pdf_path: Path of the PDF file to inspect.
        threshold: Character count that must be exceeded for the file to
            count as containing text.

    Returns:
        ``True`` when more than ``threshold`` characters were extracted,
        ``False`` otherwise. Any error while reading the file is printed
        and also reported as ``False``.
    """
    try:
        extracted_chars = 0
        for page in PdfReader(pdf_path).pages:
            # A falsy extraction result (e.g. None) counts as empty text.
            extracted_chars += len(page.extract_text() or "")
            if extracted_chars > threshold:
                return True
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return False
    return False

def process_pdfs_in_folder(folder_path, output_folder):
    """Write a searchable version of every PDF in ``folder_path`` to ``output_folder``.

    Files whose name ends in ``.pdf`` (case-insensitive) are processed one by
    one. A file that ``pdf_has_text`` reports as already containing text is
    copied unchanged; any other file is run through ``ocrmypdf.ocr``. A
    failure on one file is printed and the loop moves on to the next file.

    Args:
        folder_path: Directory containing the input PDFs.
        output_folder: Directory receiving the results; created if missing.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    # Imported once per call instead of once per copied file.
    import shutil

    def _canonical(path):
        # Resolve relative segments / trailing separators / case (on
        # case-insensitive platforms) so that two spellings of the same
        # location compare equal.
        return os.path.normcase(os.path.abspath(path))

    os.makedirs(output_folder, exist_ok=True)
    for filename in os.listdir(folder_path):
        if not filename.lower().endswith('.pdf'):
            continue

        input_pdf = os.path.join(folder_path, filename)
        output_pdf = os.path.join(output_folder, filename)

        print(f"Processing {filename}...")
        if not pdf_has_text(input_pdf):
            print(" -> No text detected, running OCR...")
            try:
                ocrmypdf.ocr(input_pdf, output_pdf, use_threads=True)
                print(f" --> OCR done: {output_pdf}")
            except Exception as e:
                print(f"Error during OCR for {filename}: {e}")
            continue

        print(" -> Already searchable, skipping OCR.")
        if _canonical(input_pdf) == _canonical(output_pdf):
            # Source and destination are the same file: nothing to copy.
            continue
        try:
            shutil.copy2(input_pdf, output_pdf)
        except OSError as e:
            # Covers shutil.SameFileError (e.g. via symlinks) and I/O
            # failures; handled like OCR errors so the batch keeps going.
            print(f"Error copying {filename}: {e}")

if __name__ == "__main__":
    # Paths are assembled with os.path.join rather than written with a
    # literal backslash: "\c" is not a valid string escape, and a hard-coded
    # backslash is only a path separator on Windows. On Windows the joined
    # result is the same string as before.
    folder = os.path.join("PDFs_Extraction", "contrats")
    output = os.path.join("PDFs_Extraction", "contrats_scannées")
    process_pdfs_in_folder(folder, output)


Pour extraire les informations des PDF ainsi traités, en combinant GROBID, PyMuPDF, des expressions régulières et des heuristiques par mots-clés

In [None]:
import os
import json
import requests
import fitz  # PyMuPDF
import re
from bs4 import BeautifulSoup

# Input locations, GROBID full-text endpoint and output file
PDF_FOLDER = "PDFs_Extraction/contrats_scannées"
COUNTRY_LIST_FILE = "PDFs_Extraction/countries_fr.txt"
GROBID_URL = "http://localhost:8070/api/processFulltextDocument"
OUTPUT_JSON = "extracted_contract_info.json"

# Words and legal-form abbreviations that mark the end of a company name.
# Order matters: extract_company_name tries them from first to last.
company_keywords = [
    # English terms
    "Ltd",
    "Inc",
    "Corp",
    "LLC",
    "Company",
    "Corporation",
    "Enterprises",
    "Group",
    "Holdings",
    "Services",
    "Solutions",
    "Industries",
    "Systems",
    "Technologies",
    "Partners",
    "Consulting",
    "Management",
    "Trading",
    "Operations",
    # French terms
    "SARL",
    "SA",
    "SAS",
    "SNC",
    "Entreprise",
    "Compagnie",
    "Société",
    "Groupe",
    "Conseil",
    "Commerce",
    "Gestion",
    "Opérations",
    "Sprl",
    "Office",
]

# Encoding Fix
def fix_encoding(text: str) -> str:
    """Repair text whose UTF-8 bytes were wrongly decoded as Latin-1.

    The string is re-encoded as Latin-1 and decoded as UTF-8, which turns
    sequences such as ``"Ã©"`` back into ``"é"``. If that round trip is not
    possible the input is returned unchanged.

    Args:
        text: The possibly mis-decoded string.

    Returns:
        The repaired string, or ``text`` itself when it cannot be
        re-interpreted (or is not a string at all).
    """
    if not isinstance(text, str):
        # Keep the best-effort contract: non-string values pass through.
        return text
    try:
        return text.encode("latin1").decode("utf-8")
    except UnicodeError:
        # Raised when the text has characters outside Latin-1 or when the
        # resulting bytes are not valid UTF-8, i.e. the text was not
        # mis-decoded in this particular way.
        return text

# Load Country List
def load_country_list(file_path: str) -> list:
    """Read country names from a UTF-8 text file, one name per line.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The names in file order, stripped of surrounding whitespace and
        lower-cased. Lines that are blank after stripping are left out.
    """
    countries = []
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            name = raw_line.strip()
            if name:
                countries.append(name.lower())
    return countries

# Improved Country Extraction Using Regex Word Boundaries
def extract_countries(text: str, country_list: list) -> list:
    """Return the entries of ``country_list`` that occur in ``text`` as whole words.

    ``text`` is lower-cased before searching, so the entries are expected to
    be lower-case already.

    Args:
        text: Text to search.
        country_list: Candidate country names.

    Returns:
        The matching names, without duplicates, in the order in which they
        appear in ``country_list`` (so repeated runs give the same output).
    """
    haystack = text.lower()
    found = []
    # dict.fromkeys removes duplicate entries while keeping their order.
    for country in dict.fromkeys(country_list):
        if not country.strip():
            # An empty pattern would match any text.
            continue
        # Look-arounds instead of \b: \b needs a word character on one side,
        # so it never matches next to a name that starts or ends with
        # punctuation such as "congo (rdc)".
        pattern = r'(?<!\w)' + re.escape(country) + r'(?!\w)'
        if re.search(pattern, haystack):
            found.append(country)
    return found

# Extract First Page Text with PyMuPDF
def extract_first_page_text_with_pymupdf(pdf_path: str) -> str:
    """Return the stripped text of the first page of a PDF, read with PyMuPDF.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        The first page's text with surrounding whitespace removed, or an
        empty string when the document has no pages or cannot be read (the
        error is printed rather than raised).
    """
    try:
        # The context manager closes the document on every exit path; the
        # previous version left it open.
        with fitz.open(pdf_path) as doc:
            if len(doc) == 0:
                return ""
            return doc.load_page(0).get_text().strip()
    except Exception as e:
        print(f"Error extracting text with PyMuPDF from {pdf_path}: {e}")
        return ""

# Extract Title Based on Keywords
def extract_title_from_text(text: str) -> str:
    """Pick the line of ``text`` that most likely holds the contract title.

    The first line containing one of the title markers below (compared in
    lower case, as a plain substring) is returned. When no line matches,
    the first line of the text is used instead.

    Args:
        text: Page text, with lines separated by ``"\\n"``.

    Returns:
        The selected line with surrounding whitespace removed.
    """
    title_markers = (
        "contrat", "convention", "accord", "protocole", "entente",
        "type d’amodiation", "amodiation", "licence", "autorisation",
        "deal", "memorandum", "type d'accord", "type de contrat",
    )
    candidate_lines = text.split("\n")
    for candidate in candidate_lines:
        lowered = candidate.lower()
        for marker in title_markers:
            if marker in lowered:
                return candidate.strip()
    # str.split always yields at least one element, so index 0 exists.
    return candidate_lines[0].strip()

# Extract Company Name
def extract_company_name(filename: str, text: str) -> str:
    """Guess a company name from the file name, falling back to the text.

    For each entry of ``company_keywords``, in list order, a case-insensitive
    regular expression looks for a run of name characters ending in that
    keyword. The file name is scanned with every keyword before the text is
    considered, and the first match found is the result.

    Args:
        filename: Base name of the PDF file.
        text: Text in which to look when the file name gives nothing.

    Returns:
        The matched name with surrounding whitespace removed, or an empty
        string when neither source matches any keyword.
    """
    for source in (filename, text):
        for keyword in company_keywords:
            pattern = r'([A-Z][\w&., -]+?\b' + re.escape(keyword) + r'\b)'
            hit = re.search(pattern, source, re.IGNORECASE)
            if hit:
                return hit.group(0).strip()
    return ""

# Extract Info from PDF (uses both GROBID and PyMuPDF)
def extract_contract_info_from_pdf(pdf_path: str, country_list: list, timeout: float = 300):
    """Extract title, countries and company name from one contract PDF.

    The PDF is posted to the GROBID service at ``GROBID_URL``; the text of
    the returned XML is searched for country names. The title and company
    name are taken from the first page as read by PyMuPDF.

    Args:
        pdf_path: Path of the PDF file.
        country_list: Country names to look for in the GROBID text.
        timeout: Seconds to wait for GROBID before giving up (passed to
            ``requests.post``). Without it a stalled server would block the
            whole batch indefinitely.

    Returns:
        A dict with the keys ``file_name``, ``titre_contrat``, ``pays_cible``
        and ``societe_exploitation``, or ``None`` when GROBID does not answer
        with status 200 or any step raises (the error is printed).
    """
    try:
        with open(pdf_path, "rb") as pdf_file:
            response = requests.post(
                GROBID_URL, files={"input": pdf_file}, timeout=timeout
            )

        if response.status_code != 200:
            print(f"Failed to process {pdf_path}")
            return None

        # Strip the XML markup, then undo a possible Latin-1 mis-decoding.
        raw_xml = response.text
        raw_text = fix_encoding(BeautifulSoup(raw_xml, "lxml").get_text())

        # Title and company name come from the first page only.
        first_page_text = extract_first_page_text_with_pymupdf(pdf_path)
        file_name = os.path.basename(pdf_path)

        title = extract_title_from_text(first_page_text)
        countries = extract_countries(raw_text, country_list)
        company_name = extract_company_name(file_name, first_page_text)

        return {
            "file_name": file_name,
            "titre_contrat": title,
            "pays_cible": countries,
            "societe_exploitation": company_name
        }
    except Exception as e:
        # Per-file boundary: report the problem and let the caller continue
        # with the next PDF.
        print(f"Error processing {pdf_path}: {e}")
        return None

# Main Routine
def main():
    """Process every PDF in ``PDF_FOLDER`` and save the results as JSON.

    The country list is loaded from ``COUNTRY_LIST_FILE``, each file whose
    name ends in ``.pdf`` (case-insensitive) is passed to
    ``extract_contract_info_from_pdf``, and the non-empty results are
    written to ``OUTPUT_JSON``. Nothing is written when no file produced a
    result.
    """
    country_list = load_country_list(COUNTRY_LIST_FILE)
    results = []

    pdf_names = (
        name for name in os.listdir(PDF_FOLDER) if name.lower().endswith(".pdf")
    )
    for filename in pdf_names:
        print(f"Processing {filename}...")
        info = extract_contract_info_from_pdf(
            os.path.join(PDF_FOLDER, filename), country_list
        )
        if info:
            results.append(info)

    if not results:
        print("\nNo contracts extracted.")
        return

    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"\nExtraction complete. Data saved to: {OUTPUT_JSON}")


if __name__ == "__main__":
    main()
