**Bibliothèques à télécharger**

In [None]:
# Install the dependencies used by the cells below (shell commands run through IPython's `!`).
# None of the packages is version-pinned.
# PyMuPDF (imported as `fitz` in Code 1).
!pip install pymupdf
# NOTE(review): pytesseract is not imported by any cell shown here.
!pip install pytesseract
!pip install pdf2image
!pip install easyocr
# System package; presumably the Poppler backend that pdf2image relies on - confirm.
!apt-get install -y poppler-utils
!pip install pillow
# NOTE(review): spacy and tqdm are not imported by any cell shown here.
!pip install spacy
!pip install tqdm
# transformers is installed from the GitHub repository rather than from a release.
!pip install git+https://github.com/huggingface/transformers.git
# Presumably needed for device_map="auto" in Code 2 - confirm.
!pip install accelerate
# transformers and spacy are requested a second time here (already installed above).
!pip install transformers torch spacy
# French spaCy model; NOTE(review): not loaded by any cell shown here.
!python -m spacy download fr_core_news_md

**Code 1 : Extraction des textes à partir des pdfs**

In [None]:
import os
import fitz  # PyMuPDF
from pdf2image import convert_from_path
import easyocr
import numpy as np
from PIL import Image

# Folder scanned for input PDF files, and folder that receives the extracted .txt files.
input_folder = "contracts_pdf"
output_folder = "extracted_text"
# Create the output folder if it does not exist yet.
os.makedirs(output_folder, exist_ok=True)

# EasyOCR reader for French with gpu=False; built once and reused by extract_text_from_pdf.
reader = easyocr.Reader(['fr'], gpu=False)

def is_scanned_page(text: str, min_chars: int = 30) -> bool:
    """Guess whether a page is a scan from how little text it carries.

    Simple heuristic: a page whose text, once stripped of surrounding
    whitespace, is shorter than ``min_chars`` is considered scanned.

    Args:
        text: Text extracted from the page. ``None`` is treated as empty.
        min_chars: Minimum stripped length for a page to count as textual.
            Defaults to 30, the previously hard-coded threshold.

    Returns:
        True when the page should be treated as scanned, False otherwise.
    """
    return len((text or "").strip()) < min_chars

def remove_bottom(image: Image.Image, percentage=20) -> Image.Image:
    """Return the image without its bottom strip (e.g. to skip signatures/stamps).

    The full width is kept; the kept height, measured from the top, is
    ``height * (100 - percentage) // 100`` pixels.
    """
    full_width, full_height = image.size
    kept_height = full_height * (100 - percentage) // 100
    return image.crop((0, 0, full_width, kept_height))

def clean_text(lines, min_length=5):
    """Strip every line and drop the ones that are empty or very short.

    NOTE: the Code 3 cell defines another ``clean_text(text)`` with a different
    signature; once that cell has run, this cell must be re-run before this
    version is used again.

    Args:
        lines: Iterable of text lines.
        min_length: Minimum length, after stripping, for a line to be kept.
            Defaults to 5, the previously hard-coded threshold (which also
            discards short but meaningful lines such as 4-letter names).

    Returns:
        List of the stripped lines that are long enough, in their original order.
    """
    stripped_lines = (line.strip() for line in lines)
    return [line for line in stripped_lines if len(line) >= min_length]

def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the text of a PDF, using OCR for pages that look scanned.

    Each page's text is read first. When ``is_scanned_page`` flags it, the page
    is rendered with ``convert_from_path``, its bottom 20 % is cropped away and
    the rest is read with the EasyOCR ``reader``. Otherwise the text is used
    as is. In both cases the lines go through ``clean_text``.

    Best effort: any exception is printed and whatever was extracted before the
    failure is returned (possibly an empty string).

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        The cleaned lines of all pages joined with newlines, stripped.
    """
    page_texts = []
    try:
        # The context manager closes the document even when a page fails;
        # the original never closed it.
        with fitz.open(pdf_path) as doc:
            for i in range(len(doc)):
                page = doc[i]
                text = page.get_text("text")

                if is_scanned_page(text):
                    # Page is probably scanned: render this page only and OCR it.
                    print(f"  OCR sur page {i + 1}")
                    images = convert_from_path(pdf_path, first_page=i + 1, last_page=i + 1)
                    if not images:
                        continue
                    image = remove_bottom(images[0], percentage=20)  # drop the page footer
                    ocr_lines = reader.readtext(np.array(image), detail=0)
                    lines = clean_text(ocr_lines)
                else:
                    # Page has a usable text layer.
                    lines = clean_text(text.splitlines())

                page_texts.append("\n".join(lines))
    except Exception as e:
        print(f"[ERREUR] {pdf_path}: {e}")
    return "\n".join(page_texts).strip()

# Process every PDF of the input folder and write its text to a same-named .txt file.
# sorted() makes the processing order reproducible across runs.
for file_name in sorted(os.listdir(input_folder)):
    if file_name.lower().endswith(".pdf"):
        print(f"Traitement de {file_name}...")
        pdf_path = os.path.join(input_folder, file_name)
        text = extract_text_from_pdf(pdf_path)

        if text:
            # splitext swaps only the final extension, whatever its case.
            # The previous replace(".pdf", ".txt") left "FOO.PDF" unchanged
            # and also rewrote any ".pdf" inside the name.
            txt_name = os.path.splitext(file_name)[0] + ".txt"
            output_path = os.path.join(output_folder, txt_name)
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(text)
            print(f" Texte extrait vers {output_path}")
        else:
            print(f" Aucun texte trouvé pour {file_name}")


**Code 2 : Extraction des infos à partir des textes en utilisant un modèle LLM : TinyLlama**

In [None]:
import os
import re
import json
from transformers import pipeline
import torch

# Load the model: TinyLlama chat checkpoint wrapped in a text-generation pipeline.
# Weights are requested in bfloat16 and device placement is left to device_map="auto".
pipe = pipeline(
    "text-generation",
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

# Legal-form / business words passed to extract_company_name_from_text, where
# one of them must be the last word of a keyword-based company match.
# "Services", "Solutions" and "Technologies" are listed in both groups below.
company_keywords = [
    # English
    "Ltd", "Inc", "Corp", "LLC", "Company", "Corporation", "Enterprises",
    "Group", "Holdings", "Services", "Solutions", "Industries",
    "Systems", "Technologies", "Partners", "Consulting", "Management", "Trading", "Operations",

    # French
    "SARL", "SA", "SAS", "SNC", "Entreprise", "Compagnie", "Société",
    "Groupe", "Services", "Solutions", "Technologies", "Conseil",
    "Commerce", "Gestion", "Opérations", "Sprl"
]

def split_text(text, max_words=1000):
    """Split text into consecutive chunks of at most ``max_words`` words.

    Words are the whitespace-separated tokens of ``text``; inside a chunk they
    are re-joined with single spaces.
    """
    tokens = text.split()
    chunks = []
    for start in range(0, len(tokens), max_words):
        chunks.append(' '.join(tokens[start:start + max_words]))
    return chunks

def load_country_list(file_path):
    """Read a UTF-8 text file and return its non-blank lines, stripped, in file order."""
    countries = []
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            name = raw_line.strip()
            if name:
                countries.append(name)
    return countries

def extract_company_name_from_text(response_text, keywords):
    """Collect candidate company names from a text with two regex heuristics.

    1. Keyword heuristic: one or more capitalised segments followed by one of
       ``keywords`` as a whole word (e.g. ``"Kibali Gold SARL"``).
    2. All-caps heuristic: runs of one to five whitespace-separated words made
       of at least two ASCII capitals each (e.g. ``"ENI CONGO"``); matches of
       two characters or fewer are dropped.

    Args:
        response_text: Text to scan.
        keywords: Words (legal forms, etc.) that may end a company name.

    Returns:
        Sorted list of unique candidates. Sorting keeps the output identical
        from one run to the next; a plain ``list(set(...))`` does not.
    """
    # NOTE(review): the nested quantifiers below can backtrack heavily on long
    # runs of capitalised words that do not end with a keyword - worth profiling.
    keyword_pattern = r'\b(?:[A-Z][a-zA-Z&, \'.-]+\s)+(?:' + '|'.join(re.escape(k) for k in keywords) + r')\b'
    keyword_matches = re.findall(keyword_pattern, response_text)

    caps_pattern = r'\b(?:[A-Z]{2,}(?:\s+[A-Z]{2,}){0,4})\b'
    # A match starts and ends with a capital letter, so it never needs
    # stripping and is never a digit string; only the length filter matters.
    caps_matches = [c for c in re.findall(caps_pattern, response_text) if len(c) > 2]

    return sorted(set(keyword_matches) | set(caps_matches))


# Input / output folders
text_folder = "extracted_text"
output_folder = "extracted_data"
os.makedirs(output_folder, exist_ok=True)

# French country names, one per line
country_list = load_country_list("countries_fr.txt")

# One whole-word, case-insensitive pattern per country, compiled once here
# instead of being rebuilt for every chunk of every file.
country_patterns = [
    (country, re.compile(r'\b' + re.escape(country) + r'\b', re.IGNORECASE))
    for country in country_list
]

# Generation settings shared by the two prompts (sampling is enabled, so the
# generated text can differ between runs).
generation_kwargs = dict(max_new_tokens=150, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)

# Process every text file of the folder; sorted() keeps the order reproducible.
for file_name in sorted(os.listdir(text_folder)):
    if not file_name.endswith(".txt"):
        continue

    file_path = os.path.join(text_folder, file_name)
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    chunks = split_text(content, max_words=1000)
    print(f"\nAnalyse du fichier : {file_name} ({len(chunks)} morceaux)")

    chunk_results = []

    for i, chunk in enumerate(chunks):
        # NOTE(review): with the pipeline defaults, 'generated_text' appears to
        # contain the prompt (and so the chunk) followed by the completion -
        # confirm against the transformers docs. If so, the matches below also
        # come from the source text, not only from the model's answer.
        country_prompt = f"Quels sont les pays mentionnés dans ce texte : {chunk}"
        country_result = pipe(country_prompt, **generation_kwargs)
        country_response = country_result[0]['generated_text']

        # Whole-word matches only, to avoid false positives. dict.fromkeys
        # removes duplicates while keeping the order of the country list;
        # the original list(set(...)) order changed from run to run.
        countries_found = list(dict.fromkeys(
            country for country, pattern in country_patterns
            if pattern.search(country_response)
        ))

        company_prompt = f"Peux-tu extraire les noms des sociétés d'exploitation mentionné : {chunk}"
        company_result = pipe(company_prompt, **generation_kwargs)
        company_response = company_result[0]['generated_text']

        # Sorted so the JSON does not depend on set iteration order.
        company_names = sorted(extract_company_name_from_text(company_response, company_keywords))

        chunk_results.append({
            "chunk": i + 1,
            "countries": countries_found,
            "companies": company_names
        })

    # NOTE(review): this yields "<name>.txt.json", whereas the Code 3 cell drops
    # the ".txt" part; kept as is in case something relies on the current names.
    output_file = os.path.join(output_folder, f"{file_name}.json")
    with open(output_file, "w", encoding="utf-8") as json_file:
        json.dump(chunk_results, json_file, ensure_ascii=False, indent=4)

    print(f"Résultats enregistrés dans : {output_file}")


**Code 3 : Extraction des infos à partir des textes en utilisant CamemBERT (modèle NER)**

In [None]:
import os
import re
import json
from transformers import pipeline

# Load the CamemBERT NER model.
# aggregation_strategy="simple" groups word pieces into whole entities (each
# result carries 'entity_group' and 'word'). It replaces grouped_entities=True,
# which transformers deprecates; the notebook installs transformers from the
# GitHub repository, so the old flag may no longer be accepted.
ner_pipeline = pipeline(
    "ner",
    model="Jean-Baptiste/camembert-ner",
    tokenizer="Jean-Baptiste/camembert-ner",
    aggregation_strategy="simple",
)

# Words that make is_company() classify an ORG entity as an operating company
# (whole-word, case-insensitive match).
# This rebinds the `company_keywords` name already defined in the Code 2 cell,
# with a different list.
company_keywords = [
    "Ltd", "Inc", "Corp", "LLC", "Company", "Corporation", "Enterprises", "Group",
    "Holdings", "SARL", "SA", "SAS", "SNC", "Entreprise", "Compagnie", "Société",
    "Groupe", "Office", "Gestion", "Opérations"
]

# extract_title() only accepts a line whose cleaned, lower-cased text starts
# with one of these words.
title_keywords = [
    "contrat", "convention", "accord", "protocole", "entente",
    "contract", "agreement", "memorandum"
]

# extract_title() rejects a candidate line containing any of these substrings.
exclusion_keywords = ["république", "ministère", "province", "direction"]

def clean_text(text):
    """Remove punctuation and normalise whitespace.

    Keeps word characters, whitespace, hyphens and apostrophes (straight and
    typographic), then collapses each whitespace run to a single space and
    trims both ends.
    """
    without_punctuation = re.sub(r"[^\w\s\-’']", '', text)
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    return single_spaced.strip()

def extract_title(text, max_lines=100):
    """Return the first line that looks like a contract title, or "" if none.

    Only the first ``max_lines`` lines are examined. A line qualifies when it is
    upper-case, has at least two words, starts (once cleaned and lower-cased)
    with one of ``title_keywords`` and contains none of ``exclusion_keywords``.
    The stripped original line is returned.
    """
    for position, raw_line in enumerate(text.splitlines()):
        if position >= max_lines:
            break

        candidate = raw_line.strip()

        # Titles are written in capitals and span several words.
        if candidate.isupper() and len(candidate.split()) >= 2:
            normalized = clean_text(candidate).lower()

            # Must open with a title keyword and mention no excluded word.
            opens_with_keyword = any(normalized.startswith(kw) for kw in title_keywords)
            if opens_with_keyword and all(excl not in normalized for excl in exclusion_keywords):
                return candidate

    return ""



def split_text(text, max_words=500):
    """Cut ``text`` into consecutive pieces of at most ``max_words`` words each.

    Words are whitespace-separated tokens; each piece joins them with single spaces.
    """
    tokens = text.split()
    piece_starts = range(0, len(tokens), max_words)
    return [' '.join(tokens[begin:begin + max_words]) for begin in piece_starts]

def is_company(name):
    """Tell whether ``name`` contains one of ``company_keywords`` as a whole word.

    The comparison is case-insensitive.
    """
    for keyword in company_keywords:
        whole_word = r'\b' + re.escape(keyword) + r'\b'
        if re.search(whole_word, name, re.IGNORECASE):
            return True
    return False

def load_country_list(path):
    """Return the stripped, non-blank lines of the UTF-8 text file at ``path``, in file order."""
    with open(path, 'r', encoding='utf-8') as source:
        stripped_lines = map(str.strip, source)
        # filter(None, ...) discards the lines that are empty once stripped.
        return list(filter(None, stripped_lines))

# Input / output folders for the NER pass.
text_folder = "extracted_text"
output_folder = "bert_output"
os.makedirs(output_folder, exist_ok=True)

country_list = load_country_list("countries_fr.txt")

# One case-insensitive pattern per country, compiled once here instead of
# being rebuilt for every entity of every chunk. Used with fullmatch below.
country_fullmatchers = [
    (country, re.compile(re.escape(country), re.IGNORECASE))
    for country in country_list
]

# sorted() keeps the processing order reproducible across runs.
for file_name in sorted(os.listdir(text_folder)):
    if not file_name.endswith(".txt"):
        continue

    with open(os.path.join(text_folder, file_name), 'r', encoding='utf-8') as f:
        content = f.read()

    title = extract_title(content)
    chunks = split_text(content)

    ner_results = []

    for i, chunk in enumerate(chunks):
        # NOTE(review): a 500-word chunk may exceed the model's maximum input
        # length - confirm how the pipeline handles over-long inputs.
        ner_out = ner_pipeline(chunk)
        countries = set()
        operating_companies = set()
        others = set()

        for ent in ner_out:
            # Defensive strip: surrounding whitespace on an aggregated word
            # would make the full-match test below fail.
            word = ent['word'].strip()
            label = ent['entity_group']
            if label in ("LOC", "MISC"):
                # Store the name as spelled in the country list, so that
                # "MALI" and "Mali" collapse into one entry (same convention
                # as the Code 2 cell).
                canonical = next(
                    (country for country, pattern in country_fullmatchers if pattern.fullmatch(word)),
                    None,
                )
                if canonical is not None:
                    countries.add(canonical)
            elif label == "ORG":
                if is_company(word):
                    operating_companies.add(word)
                else:
                    others.add(word)

        ner_results.append({
            "chunk": i + 1,
            "Pays Cible": sorted(countries),
            "Société d'exploitation": sorted(operating_companies),
            "Autres sociétés": sorted(others)
        })

    result = {
        "Titre Contrat": title,
        "ner_chunks": ner_results
    }

    output_filename = os.path.splitext(file_name)[0] + ".json"
    with open(os.path.join(output_folder, output_filename), "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=4)

    print(f"BERT extracted data saved for {file_name}")
