# MFEGSN — Pipeline Colab (Marker + LangExtract)

Notebook optimisé pour une exécution **pas à pas** sur Google Colab.

**🚀 Optimisation:** Les modèles (~4GB) sont sauvegardés sur Google Drive après le premier téléchargement.
Les sessions suivantes chargeront les modèles depuis Drive en quelques secondes.

**Ordre recommandé:** Étapes 1 → 10

## Étape 1 — Monter Google Drive
**IMPORTANT:** Exécutez cette cellule EN PREMIER pour permettre le cache des modèles.

In [1]:
from google.colab import drive
drive.mount("/content/drive")

# Cr√©er le dossier de cache sur Drive
import os
from pathlib import Path

# === CACHE CONFIGURATION ===
# Root of the persistent model cache on the mounted Google Drive.
DRIVE_CACHE_DIR = Path("/content/drive/MyDrive/.mfegsn_cache")
DRIVE_CACHE_DIR.mkdir(parents=True, exist_ok=True)

# One sub-directory per kind of cache.
HF_CACHE_DRIVE, TORCH_CACHE_DRIVE, DATALAB_CACHE_DRIVE = (
    DRIVE_CACHE_DIR / sub_name for sub_name in ("huggingface", "torch", "datalab")
)

for cache_dir in (HF_CACHE_DRIVE, TORCH_CACHE_DRIVE, DATALAB_CACHE_DRIVE):
    cache_dir.mkdir(parents=True, exist_ok=True)

# Point the caching environment variables at the Drive-backed directories.
os.environ.update(
    {
        "HF_HOME": str(HF_CACHE_DRIVE),
        "TRANSFORMERS_CACHE": str(HF_CACHE_DRIVE),
        "HUGGINGFACE_HUB_CACHE": str(HF_CACHE_DRIVE),
        "TORCH_HOME": str(TORCH_CACHE_DRIVE),
        "XDG_CACHE_HOME": str(DRIVE_CACHE_DIR),
    }
)

# V√©rifier la taille du cache existant
def get_dir_size(path):
    """Return the combined size of every regular file under *path*, in GB.

    Args:
        path: A ``pathlib.Path`` pointing at the directory to measure.

    Returns:
        The total size in gigabytes (bytes / 1e9) as a float; ``0.0`` when
        *path* does not exist.
    """
    if not path.exists():
        return 0.0

    total = 0
    for entry in path.rglob("*"):
        try:
            if entry.is_file():
                total += entry.stat().st_size
        except OSError:
            # A file may disappear or be unreadable while the tree is being
            # walked; skip it rather than aborting the whole measurement.
            continue
    return total / 1e9

cache_size = get_dir_size(DRIVE_CACHE_DIR)

# Status banner: mount confirmation, cache location and current cache size.
print("="*60, "‚úÖ Google Drive mont√©! ", "="*60, sep="\n")
print(f"üìÇ Cache Drive:  {DRIVE_CACHE_DIR}")
print(f"üíæ Taille du cache: {cache_size:.2f} GB")
# More than 3 GB on Drive is taken to mean the models are already cached.
print(
    "üöÄ Mod√®les d√©j√† en cache!  Le chargement sera rapide."
    if cache_size > 3
    else "üì• Premier lancement:  les mod√®les seront t√©l√©charg√©s et mis en cache."
)
print("="*60)

Mounted at /content/drive
‚úÖ Google Drive mont√©! 
üìÇ Cache Drive:  /content/drive/MyDrive/.mfegsn_cache
üíæ Taille du cache: 3.53 GB
üöÄ Mod√®les d√©j√† en cache!  Le chargement sera rapide.


## Étape 2 — Installer les dépendances
Installation des packages Python nécessaires.

In [2]:
# System dependencies (zstd is installed through apt).
!apt-get update -qq
! apt-get install -y zstd -qq

# Python dependencies
!python -m pip install -q --upgrade pip
!python -m pip install -q marker-pdf[full] langextract google-generativeai pillow

# Install hf_transfer for faster downloads; --no-deps keeps pip from touching
# the dependency versions installed above.
!pip install -q hf_transfer --no-deps

# Enable hf_transfer
import os
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

# Report the installed versions of the key packages.
print("\nüì¶ Versions install√©es:")
!pip show huggingface_hub 2>/dev/null | grep Version
! pip show transformers 2>/dev/null | grep Version
!pip show marker-pdf 2>/dev/null | grep Version

print("\n‚úÖ D√©pendances install√©es.")

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Selecting previously unselected package zstd.
(Reading database ... 121689 files and directories currently installed.)
Preparing to unpack .../zstd_1.4.8+dfsg-3build1_amd64.deb ...
Unpacking zstd (1.4.8+dfsg-3build1) ...
Setting up zstd (1.4.8+dfsg-3build1) ...
Processing triggers for man-db (2.10.2-1) ...

üì¶ Versions install√©es:
Version: 0.36.0
Version: 4.57.6
Version: 1.10.1

‚úÖ D√©pendances install√©es.


## Étape 3 — Configurer les dossiers de travail
Modifiez les chemins selon votre Drive.

In [3]:
from pathlib import Path

# === EDIT THESE PATHS ===
INPUT_DIR = Path("/content/drive/MyDrive/G√©opolitique et Souverainet√© Num√©riques/ALL/ALLPDF")
OUTPUT_DIR = Path("/content/drive/MyDrive/G√©opolitique et Souverainet√© Num√©riques/ALL/ALLMD")

# Validate with an explicit exception: an `assert` is stripped when Python
# runs with -O, which would silently skip this check.
if not INPUT_DIR.exists():
    raise FileNotFoundError(f"‚ùå Dossier introuvable: {INPUT_DIR}")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Global folder for the pipeline logs
GLOBAL_LOGS_DIR = OUTPUT_DIR / "_PIPELINE_LOGS"
GLOBAL_LOGS_DIR.mkdir(parents=True, exist_ok=True)

def get_doc_folders(doc_name):
    """Create the output tree of one document and return its directories.

    The document root is ``OUTPUT_DIR / doc_name``; four sub-directories are
    created beneath it.

    Args:
        doc_name: Name of the document, used as the root folder name.

    Returns:
        A dict mapping ``"root"``, ``"figures"``, ``"references"``,
        ``"analyses"`` and ``"logs"`` to their ``Path`` objects.
    """
    root = OUTPUT_DIR / doc_name
    root.mkdir(parents=True, exist_ok=True)

    layout = {"root": root}
    for key, dirname in (
        ("figures", "_FIGURES"),
        ("references", "_REFERENCES"),
        ("analyses", "_ANALYSES"),
        ("logs", "_LOGS"),
    ):
        layout[key] = root / dirname

    for directory in layout.values():
        directory.mkdir(parents=True, exist_ok=True)

    return layout

# Every PDF directly inside INPUT_DIR (case-insensitive extension), sorted.
pdf_files = sorted(p for p in INPUT_DIR.iterdir() if p.suffix.lower() == ".pdf")

# Configuration banner: input/output locations, PDF count and folder layout.
print("="*60, "üìÅ CONFIGURATION", "="*60, sep="\n")
print(f"üìÇ Entr√©e: {INPUT_DIR}")
print(f"üìÇ Sortie: {OUTPUT_DIR}")
print(f"üìÑ PDFs: {len(pdf_files)}")
print(
    "üìÇ Structure par fichier:  <nom_fichier>/",
    "   ‚îú‚îÄ‚îÄ _ANALYSES/",
    "   ‚îú‚îÄ‚îÄ _FIGURES/",
    "   ‚îú‚îÄ‚îÄ _LOGS/",
    "   ‚îî‚îÄ‚îÄ _REFERENCES/",
    sep="\n",
)
print("="*60)

üìÅ CONFIGURATION
üìÇ Entr√©e: /content/drive/MyDrive/G√©opolitique et Souverainet√© Num√©riques/ALL/ALLPDF
üìÇ Sortie: /content/drive/MyDrive/G√©opolitique et Souverainet√© Num√©riques/ALL/ALLMD
üìÑ PDFs: 54
üìÇ Structure par fichier:  <nom_fichier>/
   ‚îú‚îÄ‚îÄ _ANALYSES/
   ‚îú‚îÄ‚îÄ _FIGURES/
   ‚îú‚îÄ‚îÄ _LOGS/
   ‚îî‚îÄ‚îÄ _REFERENCES/


## Étape 4 — Ollama + Gemma 3 4B
Installation et démarrage de Gemma 3 4B via Ollama.

In [4]:
USE_OLLAMA = True  # Activ√© par d√©faut pour Gemma local

ollama_process = None

if USE_OLLAMA:
    import subprocess
    import time
    import os

    # Installer requests si n√©cessaire
    try:
        import requests
    except ImportError:
        !pip install -q requests
        import requests

    # Installer Ollama
    print("üì¶ Installation d'Ollama...")
    !curl -fsSL https://ollama.com/install.sh 2>/dev/null | sh 2>&1 | tail -n 3

    # D√©marrer Ollama en arri√®re-plan
    ollama_process = subprocess.Popen(
        ["ollama", "serve"],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL
    )
    print("‚è≥ D√©marrage du serveur Ollama...")

    # Attendre que le serveur soit pr√™t (avec timeout de 30 secondes)
    server_ready = False
    for i in range(30):
        try:
            response = requests.get("http://localhost:11434/api/tags", timeout=2)
            if response.status_code == 200:
                print("‚úÖ Serveur Ollama pr√™t!")
                server_ready = True
                break
        except:
            pass
        time.sleep(1)

    if not server_ready:
        print("‚ö†Ô∏è Le serveur Ollama n'a pas d√©marr√© √† temps")
        print("   Essayez de r√©ex√©cuter cette cellule.")

    # T√©l√©charger Gemma 3 4B (SANS espace dans le nom!)
    print("\nüì• T√©l√©chargement de Gemma 3 4B (‚âà3GB)...")
    print("   Cela peut prendre plusieurs minutes...")
    !ollama pull gemma3:4b

    # V√©rification
    result = subprocess.run(["ollama", "list"], capture_output=True, text=True)
    if "gemma3:4b" in result.stdout or "gemma3" in result.stdout:
        print("\n‚úÖ Gemma 3 4B t√©l√©charg√©!")

        # Pr√©chauffer le mod√®le avec une requ√™te test
        print("üî• Pr√©chauffage du mod√®le...")
        test_result = subprocess.run(
            ["ollama", "run", "gemma3:4b", "R√©ponds uniquement: OK"],
            capture_output=True,
            text=True,
            timeout=30
        )

        if test_result.returncode == 0:
            print("‚úÖ Gemma 3 4B pr√™t et op√©rationnel!")
            print("\n" + "="*60)
            print("ü§ñ OLLAMA CONFIGUR√â")
            print("="*60)
            print("Mod√®le: gemma3:4b")
            print("Serveur: http://localhost:11434")
            print("="*60)
        else:
            print("‚ö†Ô∏è Avertissement: Le test du mod√®le a √©chou√©")
            print("   Le mod√®le devrait quand m√™me fonctionner.")
    else:
        print("‚ùå Erreur: Gemma 3 4B non trouv√©")
        print("Mod√®les disponibles:")
        print(result.stdout if result.stdout else "Aucun mod√®le")
else:
    print("‚ÑπÔ∏è Ollama d√©sactiv√©. Mettez USE_OLLAMA = True pour l'activer.")

üì¶ Installation d'Ollama...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.
‚è≥ D√©marrage du serveur Ollama...
‚úÖ Serveur Ollama pr√™t!

üì• T√©l√©chargement de Gemma 3 4B (‚âà3GB)...
   Cela peut prendre plusieurs minutes...
[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?

## Étape 5 — Configurer Marker
Chargement des modèles Marker (~4GB) depuis le cache Drive.

**⚡ Premier lancement:** 10-20 minutes (téléchargement + sauvegarde sur Drive)

**🚀 Lancements suivants:** 1-2 minutes (chargement depuis Drive)

In [None]:
import json
import re
import shutil
import base64
import logging
import warnings
import os
import sys
import time
from datetime import datetime
from pathlib import Path

# === OPTION: IMAGE EXTRACTION ===
EXTRACT_IMAGES = True  # Enabled so that images/figures are extracted

# === LOG CONFIGURATION ===
# Silence the non-critical warning categories.
for _warning_category in (UserWarning, FutureWarning):
    warnings.filterwarnings('ignore', category=_warning_category)
del _warning_category

# Root logger: warnings and above only.
logging.getLogger().setLevel(logging.WARNING)

# Third-party libraries: errors only.
for logger_name in ['transformers', 'torch', 'PIL', 'urllib3', 'filelock',
                     'huggingface_hub', 'marker', 'datasets']:
    logging.getLogger(logger_name).setLevel(logging.ERROR)

os.environ.update(
    {
        # Verbosity-related switches ('0' keeps the download progress bars).
        'TRANSFORMERS_VERBOSITY': 'error',
        'TOKENIZERS_PARALLELISM': 'false',
        'TF_CPP_MIN_LOG_LEVEL': '3',
        'HF_HUB_DISABLE_PROGRESS_BARS': '0',
        # Re-assert the Drive-backed cache locations.
        "HF_HOME": str(HF_CACHE_DRIVE),
        "TRANSFORMERS_CACHE": str(HF_CACHE_DRIVE),
        "HUGGINGFACE_HUB_CACHE": str(HF_CACHE_DRIVE),
        "TORCH_HOME": str(TORCH_CACHE_DRIVE),
        # Force HF Transfer
        "HF_HUB_ENABLE_HF_TRANSFER": "1",
        # === Surya model identifiers and download backend ===
        "SURYA_LAYOUT_MODEL": "vikp/surya_layout",
        "SURYA_REC_MODEL": "vikp/surya_rec",
        "SURYA_DET_MODEL": "vikp/surya_det",
        "SURYA_ORDER_MODEL": "vikp/surya_order",
        "SURYA_TABLE_REC_MODEL": "vikp/surya_tablerec",
        "SURYA_DOWNLOAD_BACKEND": "huggingface",
    }
)

import torch
from PIL import Image

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("="*60, "üñ•Ô∏è CONFIGURATION MAT√âRIELLE", "="*60, sep="\n")
print(f"Device: {device}")
if device == "cuda":
    # Name and total memory of GPU 0.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# V√©rifier le cache existant
def get_dir_size(path):
    """Return the combined size of every regular file under *path*, in GB.

    Args:
        path: A ``pathlib.Path`` pointing at the directory to measure.

    Returns:
        The total size in gigabytes (bytes / 1e9) as a float; ``0.0`` when
        *path* does not exist.
    """
    if not path.exists():
        return 0.0

    total = 0
    for entry in path.rglob("*"):
        try:
            if entry.is_file():
                total += entry.stat().st_size
        except OSError:
            # Only filesystem errors are ignored (file vanished, unreadable);
            # anything else, including KeyboardInterrupt, still propagates.
            continue
    return total / 1e9

initial_cache_size = get_dir_size(DRIVE_CACHE_DIR)
print(f"\nüíæ Cache Drive: {initial_cache_size:.2f} GB")

# 3 GB or less on Drive is treated as "models not cached yet".
if initial_cache_size <= 3:
    print(
        "üì• Premier t√©l√©chargement des mod√®les Marker (~4GB)",
        "   ‚è±Ô∏è Dur√©e estim√©e: 10-20 minutes",
        "   üíæ Les mod√®les seront sauvegard√©s sur Drive",
        sep="\n",
    )
else:
    print("üöÄ Mod√®les trouv√©s en cache! Chargement rapide...")

print("="*60)

# Stopwatch for the model-loading phase.
start_time = time.time()

# Announce the load, stating whether it comes from cache or from the network.
print("\nüì¶ Chargement des mod√®les Marker...")
print(
    "   ‚ö° Chargement depuis le cache Drive..."
    if initial_cache_size > 3
    else "   üì• T√©l√©chargement en cours..."
)

from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.config.parser import ConfigParser

# Try to import the LLM service class Marker should use.
LLM_SERVICE_CLS = None
# Honour the toggle chosen in the Ollama step instead of unconditionally
# re-enabling it here; default to True when that step was not run.
USE_OLLAMA = bool(globals().get("USE_OLLAMA", True))

# NOTE(review): the recorded run (marker-pdf 1.10.1) shows both imports below
# failing, which disables the LLM — confirm the class path for the installed
# Marker version.
if USE_OLLAMA:
    try:
        from marker.llm.openai import OpenAILLMService
        LLM_SERVICE_CLS = OpenAILLMService
        print("‚úÖ Service LLM: OpenAILLMService (compatible Ollama)")
    except ImportError:
        try:
            from marker.services.openai import OpenAILLMService
            LLM_SERVICE_CLS = OpenAILLMService
            print("‚úÖ Service LLM: OpenAILLMService (via path alternatif)")
        except ImportError:
            print("‚ö†Ô∏è Impossible d'importer OpenAILLMService. Le LLM sera d√©sactiv√© pour Marker.")
            USE_OLLAMA = False

# Point the OpenAI-style environment variables at the local Ollama server.
if USE_OLLAMA:
    os.environ["MARKER_LLM_PROVIDER"] = "openai"
    os.environ["OPENAI_BASE_URL"] = "http://localhost:11434/v1"
    os.environ["OPENAI_API_KEY"] = "ollama"
    os.environ["OPENAI_MODEL"] = "gemma3:4b"

# Marker settings for a complete extraction. The two LLM entries are only
# filled in when Ollama is in use; images are returned as base64.
marker_config = dict(
    workers=2,
    extract_images=EXTRACT_IMAGES,
    images_as_base64=True,
    use_llm=USE_OLLAMA,
    llm_provider="openai" if USE_OLLAMA else None,
    llm_model="gemma3:4b" if USE_OLLAMA else None,
    force_ocr=False,
    languages=["fr", "en"],
    paginate_output=True,
    batch_size=4 if device == "cuda" else 2,
    output_format="markdown",
)

print("\n‚öôÔ∏è Cr√©ation du dictionnaire de mod√®les...")
model_dict = create_model_dict()

print("‚úì Configuration du convertisseur...")
config_parser = ConfigParser(marker_config)

# Build the converter from the parsed configuration and the loaded models.
converter = PdfConverter(
    config=config_parser.generate_config_dict(),
    artifact_dict=model_dict,
)

# Summary: elapsed time and how much the Drive cache grew during this cell.
elapsed = time.time() - start_time
final_cache_size = get_dir_size(DRIVE_CACHE_DIR)
downloaded = final_cache_size - initial_cache_size

print("\n" + "="*60, "‚úÖ MARKER CONFIGUR√â AVEC SUCC√àS!", "="*60, sep="\n")
print(f"‚è±Ô∏è Dur√©e: {elapsed/60:.1f} minutes")
# A shrinking cache is reported as 0 rather than as a negative download.
print(f"üì¶ T√©l√©charg√© cette session: {max(0, downloaded):.2f} GB")
print(f"üíæ Cache total sur Drive: {final_cache_size:.2f} GB")
print(f"üñºÔ∏è Extraction d'images: {'Activ√©e' if EXTRACT_IMAGES else 'D√©sactiv√©e'}")
print(
    "ü§ñ LLM: Gemma 3 4B via Ollama (interface OpenAI)"
    if USE_OLLAMA
    else "ü§ñ LLM: D√©sactiv√©"
)
print("="*60)
if downloaded > 0.5:
    print("\nüí° Les mod√®les sont maintenant en cache sur votre Drive.")

üñ•Ô∏è CONFIGURATION MAT√âRIELLE
Device: cuda
GPU: Tesla T4
VRAM: 15.8 GB

üíæ Cache Drive: 3.53 GB
üöÄ Mod√®les trouv√©s en cache! Chargement rapide...

üì¶ Chargement des mod√®les Marker...
   ‚ö° Chargement depuis le cache Drive...
‚ö†Ô∏è Impossible d'importer OpenAILLMService. Le LLM sera d√©sactiv√© pour Marker.

‚öôÔ∏è Cr√©ation du dictionnaire de mod√®les...
‚úì Configuration du convertisseur...

‚úÖ MARKER CONFIGUR√â AVEC SUCC√àS!
‚è±Ô∏è Dur√©e: 0.4 minutes
üì¶ T√©l√©charg√© cette session: 0.00 GB
üíæ Cache total sur Drive: 3.53 GB
üñºÔ∏è Extraction d'images: D√©sactiv√©e
ü§ñ LLM: D√©sactiv√©


## Étape 6 — Fonctions utilitaires
Extraction des références, figures et conversion PDF → Markdown.

In [None]:
def extract_references_from_markdown(markdown_text):
    """Extract the references/bibliography of a Markdown document.

    The extraction runs in five steps:

    1. locate a references heading and take the text up to the next section;
    2. if no usable section is found, look for reference-like blocks in the
       last 30% of the document;
    3. split the section into individual references;
    4. if fewer than three references were found, add full citations found
       anywhere in the text;
    5. strip numbering/bullet prefixes and de-duplicate.

    Args:
        markdown_text: The full Markdown text of the document.

    Returns:
        A dict with ``"references_text"`` (the raw section, at most 5000
        characters), ``"references_list"`` (cleaned unique references, at most
        200) and ``"reference_count"`` (number of unique references before the
        200-item cap).
    """
    min_ref_length = 40  # shorter fragments are not treated as references

    references = {
        "references_text": "",
        "references_list": [],
        "reference_count": 0,
    }

    # Collapse runs of three or more newlines into a single blank line.
    text = re.sub(r'\n{3,}', '\n\n', markdown_text)

    # === STEP 1: find the references section ===
    ref_section = ""

    # Headings that open a references section, tried in this order.
    section_start_patterns = [
        r"(?i)(?:^|\n)#{1,4}\s*(references?|r√©f√©rences?|bibliography|bibliographie|works?\s*cited|cited\s*works?|literature|sources?|notes?\s*(?:and\s*)?references?)\s*\n",
        r"(?i)(?:^|\n)\*\*(references?|r√©f√©rences?|bibliography|bibliographie)\*\*\s*\n",
        r"(?i)(?:^|\n)(REFERENCES?|R√âF√âRENCES?|BIBLIOGRAPHY|BIBLIOGRAPHIE|WORKS\s*CITED)\s*\n",
        r"(?i)(?:^|\n)_{2,}?\s*(references?|bibliography)\s*_{2,}?\s*\n",
    ]

    section_start_pos = -1
    for pattern in section_start_patterns:
        match = re.search(pattern, text)
        if match:
            section_start_pos = match.end()
            break

    if section_start_pos > 0:
        remaining = text[section_start_pos:]

        # The section ends at the earliest following heading, bold title or
        # typical trailing section, or at the end of the document.
        end_patterns = [
            r"\n#{1,3}\s+[A-Z]",  # next header
            r"\n\*\*[A-Z][a-z]+\*\*\s*\n",  # next bold title
            r"\n(?:APPENDIX|ANNEXE|ACKNOWLEDGMENT)",  # typical following sections
        ]

        end_pos = len(remaining)
        for pattern in end_patterns:
            match = re.search(pattern, remaining, re.IGNORECASE)
            if match and match.start() < end_pos:
                end_pos = match.start()

        ref_section = remaining[:end_pos].strip()

    # === STEP 2: no usable section -> scan the last 30% of the document ===
    if not ref_section or len(ref_section) < 100:
        last_portion = text[int(len(text) * 0.7):]

        # Blocks starting with [n], n. or "Author, X" plus continuation lines.
        ref_block_pattern = r"(?:^|\n)((?:\[\d+\]|\d+\.|[A-Z][a-z]+,?\s+[A-Z])[^\n]+(?:\n(?!\[\d+\]|\d+\.|[A-Z][a-z]+,\s+[A-Z])[^\n]+)*)"
        blocks = re.findall(ref_block_pattern, last_portion)

        if len(blocks) >= 5:  # require at least 5 reference-like blocks
            ref_section = "\n".join(blocks)

    # === STEP 3: split the section into individual references ===
    ref_lines = []

    if ref_section:
        references["references_text"] = ref_section[:5000]  # cap the size

        # A line matching any of these starts a new reference.
        new_ref_patterns = (
            r"^\[\d+\]",  # [1] ...
            r"^\d+\.\s+[A-Z]",  # 1. Author ...
            r"^[A-Z][a-z]+(?:[-'][A-Z][a-z]+)?,?\s+[A-Z]\..*\(\d{4}\)",  # Author, A. ... (2020)
            r"^[A-Z][a-z]+,?\s+[A-Z]\.\s*(?:&|and|,)\s*[A-Z]",  # Author, A. & B...
            r"^[-‚Ä¢‚óè‚óã]\s+[A-Z]",  # bullet points
            r"^[A-Z][a-z]+\s+et\s+al\.",  # Author et al.
        )

        current_ref = ""
        for line in ref_section.split('\n'):
            line = line.strip()
            if not line:
                # A blank line closes the reference being accumulated.
                if current_ref and len(current_ref) > min_ref_length:
                    ref_lines.append(current_ref)
                current_ref = ""
                continue

            if any(re.match(pattern, line) for pattern in new_ref_patterns):
                if current_ref and len(current_ref) > min_ref_length:
                    ref_lines.append(current_ref)
                current_ref = line
            elif current_ref:
                # Continuation line of the current reference.
                current_ref += " " + line
            elif len(line) > 50 and re.search(r'\d{4}', line):
                # Long line containing a year: possible unmarked reference.
                current_ref = line

        # Flush the last reference.
        if current_ref and len(current_ref) > min_ref_length:
            ref_lines.append(current_ref)

    # === STEP 4: fallback - complete citations found anywhere in the text ===
    if len(ref_lines) < 3:
        inline_patterns = [
            # Numbered format with details
            r"\[\d+\]\s+[A-Z][a-z]+(?:,?\s+[A-Z]\.)+[^.]+\.\s+[^.]+\.\s+\d{4}",
            # Complete author-year format
            r"[A-Z][a-z]+,?\s+[A-Z]\.(?:\s*[A-Z]\.)*\s*\(\d{4}\)\.\s+[^.]+\.[^.]+\.",
        ]

        for pattern in inline_patterns:
            for m in re.findall(pattern, text):
                if m not in ref_lines and len(m) > 50:
                    ref_lines.append(m.strip())

    # === STEP 5: cleaning and de-duplication ===
    # The prefix must be an alternation of whole markers ("[12]", "12." or a
    # bullet). The previous character class removed a single character only,
    # turning "[1] Smith" into "1] Smith" and defeating de-duplication.
    prefix_pattern = r'^(?:\[\d+\]|\d+\.(?=\s)|[-‚Ä¢‚óè‚óã])\s*'

    seen = set()
    unique_refs = []
    for ref in ref_lines:
        ref = re.sub(r'\s+', ' ', ref).strip()
        ref = re.sub(prefix_pattern, '', ref).strip()

        # De-duplicate on the first 80 characters, case-insensitively.
        key = ref[:80].lower()
        if key not in seen and len(ref) > min_ref_length:
            seen.add(key)
            unique_refs.append(ref)

    references["references_list"] = unique_refs[:200]  # at most 200 references
    references["reference_count"] = len(unique_refs)

    return references


def extract_figures_info(markdown_text, images_dict):
    """Collect the figures, tables and images mentioned in a document.

    Three sources are scanned in order: caption-like lines in the Markdown,
    Markdown image tags with a non-empty alt text, and the keys of
    *images_dict*. Entries are de-duplicated on their lower-cased label.

    Args:
        markdown_text: The Markdown text of the document.
        images_dict: Mapping whose keys name the extracted images; may be
            empty or ``None``.

    Returns:
        A list of dicts. Caption entries carry ``label``, ``number``,
        ``title`` and ``type`` ("figure"); image entries carry ``label``,
        ``title``, ``path`` and ``type`` ("embedded_image" or
        "extracted_image", the latter also with a 1-based ``index``).
    """
    caption_patterns = (
        # Figure X: Title or Fig. X: Title
        r"(?i)(?:^|\n)\s*((?:figure|fig\.?)\s*(\d+(?:\.\d+)?)[:\s.]*([^\n]{0,200}))",
        # Figure X - Title (dash separated)
        r"(?i)(?:^|\n)\s*((?:figure|fig\.?)\s*(\d+(?:\.\d+)?)\s*[‚Äî‚Äì-]\s*([^\n]{0,200}))",
        # Tableau X / Table X
        r"(?i)(?:^|\n)\s*((?:table(?:au)?)\s*(\d+(?:\.\d+)?)[:\s.]*([^\n]{0,200}))",
        # Chart/Graph/Diagram
        r"(?i)(?:^|\n)\s*((?:chart|graph|diagram|graphique|sch√©ma)\s*(\d+(?:\.\d+)?)?[:\s.]*([^\n]{0,150}))",
        # Italic captions placed under a Markdown image
        r"(?:!\[[^\]]*\]\([^)]+\))\s*\n\s*\*([^*\n]+)\*",
        r"(?:!\[[^\]]*\]\([^)]+\))\s*\n\s*_([^_\n]+)_",
    )

    found = []
    known = set()  # lower-cased labels already recorded

    # --- caption-like lines ---
    for caption_re in caption_patterns:
        for hit in re.finditer(caption_re, markdown_text, re.MULTILINE):
            # Pad so single-group patterns unpack like three-group ones.
            whole, number, caption = (hit.groups() + (None, None))[:3]

            label = (whole or hit.group(0)).strip()[:100]
            key = label.lower()
            if key in known:
                continue
            known.add(key)

            title = caption.strip() if caption else ""
            title = re.sub(r'^[:\s.‚Äî‚Äì-]+', '', title).strip()
            found.append({
                "label": label,
                "number": number or "",
                "title": title,
                "type": "figure",
            })

    # --- Markdown image tags (those without alt text are skipped) ---
    for hit in re.finditer(r"!\[([^\]]*)\]\(([^)]+)\)", markdown_text):
        alt_text = hit.group(1).strip()
        key = alt_text.lower()
        if not alt_text or key in known:
            continue
        known.add(key)
        found.append({
            "label": alt_text,
            "title": alt_text,
            "path": hit.group(2).strip(),
            "type": "embedded_image",
        })

    # --- images listed in images_dict ---
    if images_dict:
        for position, image_key in enumerate(images_dict.keys(), 1):
            name = str(image_key)
            key = name.lower()
            if key in known:
                continue
            known.add(key)
            found.append({
                "label": name,
                "title": "",
                "path": name,
                "type": "extracted_image",
                "index": position,
            })

    return found


def _decode_base64_image(text):
    """Strictly decode base64 *text*; raise ValueError when it is not base64."""
    compact = "".join(text.split())  # tolerate line-wrapped payloads
    return base64.b64decode(compact, validate=True)


def _is_existing_file(candidate):
    """Return True when *candidate* names an existing file, never raising."""
    try:
        return Path(candidate).is_file()
    except (OSError, ValueError):
        # e.g. "File name too long" when the string is really a base64 blob.
        return False


def save_figures(images_dict, doc_folders):
    """Save the extracted figures into the document's ``figures`` folder.

    Each image is written as ``<sanitised name>.png``. Supported payloads are
    raw ``bytes``/``bytearray``, strings (``data:image`` URI, path of an
    existing file, or raw base64) and PIL images. Images with transparency or
    a palette are flattened onto a white background before saving.

    Base64 payloads are decoded *before* the output file is opened, so an
    undecodable payload never leaves an empty or garbage file behind.

    Args:
        images_dict: Mapping of image name to image payload; may be empty or
            ``None``.
        doc_folders: Mapping containing a ``"figures"`` entry with the
            destination directory (a ``Path``).

    Returns:
        The list of saved file paths (as strings), in input order. Images
        that cannot be saved are skipped.
    """
    if not images_dict:
        return []

    figures_folder = doc_folders["figures"]
    saved_paths = []
    used_names = set()

    for idx, (img_name, img_data) in enumerate(images_dict.items(), 1):
        # Build a filesystem-safe, unique file stem from the image name.
        safe_name = re.sub(r"[^a-zA-Z0-9_-]+", "_", str(img_name))
        safe_name = re.sub(r"_+", "_", safe_name).strip("_")
        safe_name = safe_name[:100]
        if not safe_name or len(safe_name) < 3:
            safe_name = f"figure_{idx:03d}"

        base_name = safe_name
        counter = 1
        while safe_name in used_names:
            safe_name = f"{base_name}_{counter}"
            counter += 1
        used_names.add(safe_name)

        img_path = figures_folder / f"{safe_name}.png"

        try:
            if isinstance(img_data, (bytes, bytearray)):
                img_path.write_bytes(img_data)
            elif isinstance(img_data, str):
                if img_data.startswith("data:image"):
                    b64_data = img_data.split(",", 1)[1] if "," in img_data else img_data
                    # A decoding failure is reported by the handler below.
                    img_path.write_bytes(_decode_base64_image(b64_data))
                elif _is_existing_file(img_data):
                    shutil.copy(img_data, img_path)
                else:
                    # Last resort: treat the string as raw base64; silently
                    # skip it when it is empty or not valid base64.
                    try:
                        payload = _decode_base64_image(img_data)
                    except ValueError:
                        continue
                    if not payload:
                        continue
                    img_path.write_bytes(payload)
            elif isinstance(img_data, Image.Image):
                if img_data.mode in ('RGBA', 'LA', 'P'):
                    # Flatten onto white, using the last band as paste mask.
                    rgb_image = Image.new('RGB', img_data.size, (255, 255, 255))
                    if img_data.mode == 'P':
                        img_data = img_data.convert('RGBA')
                    rgb_image.paste(img_data, mask=img_data.split()[-1])
                    img_data = rgb_image
                img_data.save(img_path, "PNG", optimize=True)
            else:
                continue

            saved_paths.append(str(img_path))
        except Exception as e:
            # Best effort: report the failure and carry on with the next image.
            print(f"   ‚ö†Ô∏è Impossible de sauvegarder {safe_name}: {str(e)[:80]}")
            continue

    return saved_paths


def extract_metadata_from_markdown(markdown_text):
    """Extract basic metadata from the beginning of a Markdown document.

    The title is searched in the first 2000 characters, the abstract and the
    keywords in the first 5000, and the date (first four-digit year starting
    with 19 or 20) in the first 3000. ``authors`` is always returned empty.

    Args:
        markdown_text: The Markdown text of the document.

    Returns:
        A dict with the keys ``title``, ``authors``, ``abstract``,
        ``keywords`` and ``date``; fields that are not found keep their empty
        default.
    """
    head_title = markdown_text[:2000]
    head_body = markdown_text[:5000]
    head_date = markdown_text[:3000]

    # Title: first pattern that matches wins.
    title = ""
    for title_re in (
        r"^#\s+([^\n]+)",
        r"^\*\*([^\*\n]{10,150})\*\*",
        r"^([A-Z][^\n]{20,150})\n[=]+",
    ):
        hit = re.search(title_re, head_title, re.MULTILINE)
        if hit is not None:
            title = hit.group(1).strip()
            break

    # Abstract: 50-1500 characters following the heading word, up to the next
    # blank line, header or bold title.
    abstract = ""
    hit = re.search(
        r"(?i)(?:abstract|r√©sum√©|summary)[:\s]*\n?([\s\S]{50,1500}?)(?=\n\n|\n#{1,3}|\n\*\*[A-Z])",
        head_body,
    )
    if hit is not None:
        abstract = hit.group(1).strip()

    # Keywords: rest of the line, split on separators, entries of 3+ chars.
    keywords = []
    hit = re.search(r"(?i)(?:keywords?|mots[- ]?cl√©s?)[:\s]*([^\n]+)", head_body)
    if hit is not None:
        pieces = (piece.strip() for piece in re.split(r'[,;‚Ä¢¬∑]', hit.group(1)))
        keywords = [piece for piece in pieces if len(piece) > 2]

    # Date: first plausible year.
    years = re.findall(r"\b((?:19|20)\d{2})\b", head_date)
    date = years[0] if years else ""

    return {
        "title": title,
        "authors": [],
        "abstract": abstract,
        "keywords": keywords,
        "date": date,
    }


def convert_pdf_complete(pdf_path, doc_name):
    """Convert one PDF to Markdown and extract figures, references and metadata.

    The module-level ``converter`` is tried first. If it raises an error whose
    message contains "extension", "image" or "unknown", the conversion is
    retried once with a temporary converter that has image extraction
    disabled; any other error is handled as a failure.

    Args:
        pdf_path: Path object of the PDF (``.name`` is read for the log line).
        doc_name: Document name used for the output folders and file names.

    Returns:
        dict with the keys ``doc_name``, ``doc_folder``, ``markdown_path``,
        ``figures``, ``figures_paths``, ``references``, ``metadata`` and
        ``error``. ``error`` is ``None`` on success; otherwise it holds the
        exception message and the other fields keep whatever was filled in
        before the failure. Errors raised after the folders were resolved are
        reported this way instead of being raised.
    """
    doc_folders = get_doc_folders(doc_name)

    result_data = {
        "doc_name": doc_name,
        "doc_folder": str(doc_folders["root"]),
        "markdown_path": "",
        "figures": [],
        "figures_paths": [],
        "references": {},
        "metadata": {},
        "error": None,
    }

    try:
        print(f"   üìÑ Conversion de {pdf_path.name}...")

        try:
            result = converter(str(pdf_path))
        except Exception as conv_error:
            error_msg = str(conv_error).lower()
            # Only errors that look image-related get a second attempt.
            if not any(hint in error_msg for hint in ("extension", "image", "unknown")):
                raise
            print("   ‚ö†Ô∏è Erreur d'extraction d'images, nouvelle tentative sans images...")

            from marker.config.parser import ConfigParser
            temp_config = {
                "workers": 2,
                "extract_images": False,
                "images_as_base64": False,
                "use_llm": False,
                "force_ocr": False,
                "languages": ["fr", "en"],
                "paginate_output": True,
                "batch_size": 4 if device == "cuda" else 2,
            }
            config_parser = ConfigParser(temp_config)
            from marker.converters.pdf import PdfConverter
            temp_converter = PdfConverter(
                config=config_parser.generate_config_dict(),
                artifact_dict=model_dict,
                llm_service=None,
            )
            result = temp_converter(str(pdf_path))

        markdown_text = getattr(result, "markdown", "") or ""
        images_dict = getattr(result, "images", {}) or {}

        print(f"   üìä Markdown: {len(markdown_text)} caract√®res")
        print(f"   üñºÔ∏è Images brutes de Marker: {len(images_dict)}")

        # Drop images with an empty key, a None payload, or a key containing
        # one of the characters listed below.
        forbidden_chars = ("?", "*", "<", ">", "|", "\x00")
        images_dict = {
            key: value
            for key, value in images_dict.items()
            if key
            and value is not None
            and not any(char in str(key) for char in forbidden_chars)
        }

        md_path = doc_folders["root"] / f"{doc_name}.md"
        with open(md_path, "w", encoding="utf-8") as f:
            f.write(markdown_text)

        result_data["markdown_path"] = str(md_path)

        # === EXTRACTIONS ===
        result_data["figures_paths"] = save_figures(images_dict, doc_folders)
        result_data["figures"] = extract_figures_info(markdown_text, images_dict)
        result_data["references"] = extract_references_from_markdown(markdown_text)
        result_data["metadata"] = extract_metadata_from_markdown(markdown_text)

        # Save the references only when at least one was extracted.
        if result_data["references"].get("reference_count", 0) > 0:
            ref_path = doc_folders["references"] / f"{doc_name}_references.json"
            with open(ref_path, "w", encoding="utf-8") as f:
                json.dump(result_data["references"], f, ensure_ascii=False, indent=2)

        # Save the metadata only when a title was found.
        if result_data["metadata"].get("title"):
            meta_path = doc_folders["analyses"] / f"{doc_name}_metadata.json"
            with open(meta_path, "w", encoding="utf-8") as f:
                json.dump(result_data["metadata"], f, ensure_ascii=False, indent=2)

        return result_data

    except Exception as e:
        import traceback
        result_data["error"] = str(e)
        # Build the report first, while the handled exception is still current.
        report = (
            f"Erreur: {str(e)}\nTraceback:\n{traceback.format_exc()}\n"
            f"Date: {datetime.now().isoformat()}"
        )
        # Writing the log is best effort: a failure here must not replace the
        # original error nor escape to the caller.
        try:
            error_log = doc_folders["logs"] / "error.txt"
            with open(error_log, "w", encoding="utf-8") as f:
                f.write(report)
        except (OSError, KeyError) as log_error:
            print(f"   Journal d'erreur non ecrit: {log_error}")
        return result_data


# Confirmation banner shown once the helper functions above are defined.
print(
    "\n".join(
        (
            "‚úÖ Fonctions utilitaires optimis√©es d√©finies.",
            "   ‚Ä¢ Extraction de r√©f√©rences: patterns acad√©miques tr√®s am√©lior√©s",
            "   ‚Ä¢ D√©tection de figures: texte + images Marker",
            "   ‚Ä¢ M√©tadonn√©es: titre, abstract, keywords",
        )
    )
)

‚úÖ Fonctions utilitaires d√©finies. 


## Étape 7 — LangExtract
Extraction structurée avec LangExtract (activé par défaut avec Ollama).

In [None]:
USE_LANGEXTRACT = True

# Probe the local Ollama server (independently of the Marker configuration).
OLLAMA_AVAILABLE = False
try:
    import requests
    response = requests.get("http://localhost:11434/api/tags", timeout=2)
    if response.status_code == 200:
        OLLAMA_AVAILABLE = True
        print("‚úÖ Ollama d√©tect√© - Utilisation de Gemma 3 4B")
except Exception:
    # Missing `requests`, refused connection, timeout...: Ollama is unusable.
    # `Exception` (not a bare except) lets KeyboardInterrupt through.
    pass

# Also reported when the server answered with a non-200 status.
if not OLLAMA_AVAILABLE:
    print("‚ö†Ô∏è Ollama non disponible - LangExtract utilisera l'API directe")

LANGEXTRACT_CONFIG = {
    "provider": "ollama" if OLLAMA_AVAILABLE else "openai",
    "model": "gemma3:4b" if OLLAMA_AVAILABLE else "gpt-3.5-turbo",
    "base_url": "http://localhost:11434" if OLLAMA_AVAILABLE else None,
}

# Prompt for structured extraction (sent to Ollama directly when LangExtract fails)
EXTRACTION_PROMPT = """Analysez ce document acad√©mique et retournez un JSON structur√© avec:
{
  "contexte": {"theme": "", "zone_geographique": "", "periode": ""},
  "acteurs": {"institutions": [], "pays": [], "organisations": []},
  "concepts_cles": [],
  "donnees_chiffrees": [],
  "references_principales": []
}
R√©pondez UNIQUEMENT avec le JSON, sans texte avant ou apr√®s."""

def _safe_json(obj):
    """Reduce *obj* to plain JSON types through a dumps/loads round trip.

    When the round trip raises, ``{"raw": str(obj)}`` is returned instead.
    """
    try:
        serialized = json.dumps(obj)
        return json.loads(serialized)
    except Exception:
        return {"raw": str(obj)}


def extract_with_langextract(markdown_text, doc_name, doc_folders, references_data=None, figures_data=None):
    """Run structured extraction on a document and save the result as JSON.

    Strategies, in order: the ``langextract`` package (several call shapes
    are tried), then a direct request to the local Ollama HTTP API, then a
    minimal fallback record without LLM output.

    Args:
        markdown_text: Markdown text of the document.
        doc_name: Document name, used in the output file name.
        doc_folders: Mapping whose ``"analyses"`` entry supports ``/``; the
            result is written to ``<analyses>/<doc_name>_langextract.json``.
        references_data: Optional dict; when its ``reference_count`` is
            positive, a note with that count is appended to the model input.
        figures_data: Optional sized collection; when non-empty, a note with
            its length is appended to the model input.

    Returns:
        The extraction result (also written to disk), or
        ``{"status": "skipped", ...}`` when ``USE_LANGEXTRACT`` is false, in
        which case nothing is written.
    """
    if not USE_LANGEXTRACT:
        return {"status": "skipped", "reason": "USE_LANGEXTRACT=False"}

    max_chars = 15000  # cap on the document text, to avoid LLM timeouts
    ollama_max_chars = 10000  # tighter cap used for the direct Ollama prompt

    # Notes appended after the truncated document text.
    notes = ""
    if references_data and references_data.get("reference_count", 0) > 0:
        notes += f"\n\n[R√âF√âRENCES EXTRAITES: {references_data['reference_count']}]"
    if figures_data:
        notes += f"\n\n[FIGURES D√âTECT√âES: {len(figures_data)}]"

    enriched_text = markdown_text[:max_chars] + notes

    result = None

    # === ATTEMPT 1: LangExtract ===
    # NOTE(review): published langextract releases appear to require
    # `prompt_description` and `examples` for `lx.extract`; confirm that
    # these bare calls can succeed with the installed version.
    try:
        import langextract as lx

        if hasattr(lx, "extract"):
            try:
                result = _safe_json(lx.extract(enriched_text))
            except TypeError:
                # Different signature: retry with an explicit model.
                try:
                    result = _safe_json(
                        lx.extract(enriched_text, model=LANGEXTRACT_CONFIG["model"])
                    )
                except Exception:
                    pass  # fall through to Ollama
        elif hasattr(lx, "LangExtract"):
            try:
                result = _safe_json(lx.LangExtract().extract(enriched_text))
            except Exception:
                pass  # fall through to Ollama
    except Exception:
        pass  # package missing or call failed: continue with Ollama

    # === ATTEMPT 2: Ollama directly ===
    if result is None and OLLAMA_AVAILABLE:
        try:
            import requests

            # Truncate the document text only: slicing the enriched text would
            # cut off the notes for any document longer than the cap.
            ollama_text = markdown_text[:ollama_max_chars] + notes
            ollama_payload = {
                "model": "gemma3:4b",
                "prompt": f"{EXTRACTION_PROMPT}\n\nDOCUMENT:\n{ollama_text}",
                "stream": False,
                "options": {"temperature": 0.1},
            }

            resp = requests.post(
                "http://localhost:11434/api/generate",
                json=ollama_payload,
                timeout=120,
            )

            if resp.status_code == 200:
                response_text = resp.json().get("response", "")

                # Pull the JSON object out of the answer (first '{' to last '}').
                json_match = re.search(r'\{[\s\S]*\}', response_text)
                if json_match:
                    try:
                        result = json.loads(json_match.group())
                    except json.JSONDecodeError:
                        result = {"raw_response": response_text[:500]}
                else:
                    result = {"raw_response": response_text[:500]}
        except Exception as e:
            result = {"status": "error", "error": f"Ollama: {str(e)}"}

    # === FALLBACK: basic record without LLM ===
    if result is None:
        result = {
            "status": "fallback",
            "note": "Extraction LLM indisponible, donn√©es de base uniquement",
            "references_count": references_data.get("reference_count", 0) if references_data else 0,
            "figures_count": len(figures_data) if figures_data else 0,
        }

    # Persist the result next to the other analyses of the document.
    extraction_path = doc_folders["analyses"] / f"{doc_name}_langextract.json"
    with open(extraction_path, "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    return result


# Recap of the active LangExtract settings.
print(
    "\n".join(
        (
            "‚úÖ LangExtract configur√©",
            f"   Provider: {LANGEXTRACT_CONFIG['provider']}",
            f"   Model: {LANGEXTRACT_CONFIG['model']}",
            f"   Fallback Ollama: {'Activ√©' if OLLAMA_AVAILABLE else 'D√©sactiv√©'}",
        )
    )
)

‚úÖ Ollama d√©tect√© - Utilisation de Gemma 3 4B
‚úÖ LangExtract configur√© (Provider: ollama, Model: gemma3:4b)


## Étape 8 — Test sur un PDF (optionnel)
Permet de valider la configuration avant le batch.

In [None]:
# Smoke test: convert the first PDF found in INPUT_DIR and print a summary of
# what was extracted, plus diagnostics when figures or references are missing.
pdf_files = sorted([p for p in INPUT_DIR.iterdir() if p.suffix.lower() == ".pdf"])

if pdf_files:
    sample_path = pdf_files[0]
    sample_name = sample_path.stem
    print(f"üß™ Test sur: {sample_path.name}")
    print("‚è≥ Conversion en cours...\n")

    test_start = time.time()
    sample_result = convert_pdf_complete(sample_path, sample_name)
    test_elapsed = time.time() - test_start

    if sample_result.get("error"):
        print(f"\n‚ùå Erreur: {sample_result['error']}")
    else:
        print(f"\n" + "="*60)
        print("‚úÖ CONVERSION R√âUSSIE!")
        print("="*60)
        print(f"‚è±Ô∏è Dur√©e: {test_elapsed:.1f}s")
        print(f"\nüìÇ Dossier de sortie:")
        print(f"   {sample_result['doc_folder']}")

        print(f"\nüìÑ MARKDOWN:")
        print(f"   Fichier: {sample_result['markdown_path']}")
        # Show size statistics of the generated Markdown (best effort).
        try:
            with open(sample_result['markdown_path'], 'r', encoding='utf-8') as f:
                content = f.read()
            print(f"   Taille: {len(content):,} caract√®res")
            print(f"   Lignes: {content.count(chr(10)):,}")
        except (OSError, UnicodeError) as read_error:
            # Report the problem instead of hiding it; a bare `except` would
            # also have swallowed KeyboardInterrupt.
            print(f"   Lecture impossible: {read_error}")

        print(f"\nüñºÔ∏è FIGURES:")
        figures = sample_result.get('figures', [])
        print(f"   D√©tect√©es dans le texte: {len(figures)}")
        print(f"   Images sauvegard√©es: {len(sample_result.get('figures_paths', []))}")
        if figures:
            print("   Exemples:")
            for fig in figures[:5]:
                fig_type = fig.get('type', 'unknown')
                label = fig.get('label', '')[:60]
                print(f"      [{fig_type}] {label}")
            if len(figures) > 5:
                print(f"      ... et {len(figures)-5} autres")

        print(f"\nüìö R√âF√âRENCES:")
        refs = sample_result.get('references', {})
        ref_count = refs.get('reference_count', 0)
        print(f"   Nombre: {ref_count}")
        if ref_count > 0:
            print("   Exemples:")
            for ref in refs.get('references_list', [])[:3]:
                print(f"      ‚Ä¢ {ref[:80]}...")
            if ref_count > 3:
                print(f"      ... et {ref_count-3} autres")
        elif refs.get('references_text'):
            # A references section was found but no entry could be parsed.
            print(f"   ‚ö†Ô∏è Section trouv√©e mais parsing √©chou√©")
            print(f"   Texte brut ({len(refs['references_text'])} chars):")
            print(f"      {refs['references_text'][:200]}...")

        print(f"\nüìã M√âTADONN√âES:")
        meta = sample_result.get('metadata', {})
        if meta.get('title'):
            print(f"   Titre: {meta['title'][:80]}")
        if meta.get('abstract'):
            print(f"   Abstract: {meta['abstract'][:100]}...")
        if meta.get('keywords'):
            print(f"   Keywords: {', '.join(meta['keywords'][:5])}")
        if meta.get('date'):
            print(f"   Date: {meta['date']}")

        print("\n" + "="*60)

        # === ADVANCED DIAGNOSTIC when no figures or no references ===
        if len(figures) == 0 or ref_count == 0:
            print("\nüîç DIAGNOSTIC (extraction insuffisante):")
            try:
                with open(sample_result['markdown_path'], 'r', encoding='utf-8') as f:
                    md_content = f.read()

                # Look for figure/table mentions followed by a digit
                fig_matches = re.findall(r"(?i)(figure|fig\.?|table|tableau)\s*\d", md_content)
                print(f"   Mentions 'figure/table' trouv√©es: {len(fig_matches)}")
                if fig_matches:
                    print(f"      Exemples: {fig_matches[:5]}")

                # Look for Markdown image links
                img_matches = re.findall(r"!\[[^\]]*\]\([^)]+\)", md_content)
                print(f"   Images markdown (![]()): {len(img_matches)}")

                # Look for mentions of a references section
                ref_sections = re.findall(r"(?i)(references?|bibliography|bibliographie)", md_content)
                print(f"   Mentions 'references/bibliography': {len(ref_sections)}")

                # Look for numbered citations such as [12]
                citations = re.findall(r"\[\d+\]", md_content)
                print(f"   Citations num√©riques [n]: {len(citations)}")

                # Show the end of the document (often where references are)
                print(f"\n   üìú Fin du document (2000 derniers chars):")
                print("   " + "-"*50)
                print(md_content[-2000:].replace('\n', '\n   '))
            except Exception as e:
                print(f"   Erreur diagnostic: {e}")

else:
    print("‚ùå Aucun PDF trouv√© dans le dossier d'entr√©e.")
    print(f"   V√©rifiez le chemin: {INPUT_DIR}")

üß™ Test sur:  Against Sovereignty in Cyberspace.pdf
‚è≥ Conversion en cours...

   DEBUG: Tentative de conversion initiale pour Against Sovereignty in Cyberspace.pdf...


Recognizing Layout: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 23/23 [00:12<00:00,  1.82it/s]
Running OCR Error Detection: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:00<00:00, 22.53it/s]
Detecting bboxes: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  1.59it/s]
Recognizing Text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 41/41 [01:19<00:00,  1.95s/it]
Recognizing tables: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  3.93it/s]
Detecting bboxes: 0it [00:00, ?it/s]


   DEBUG: Premi√®re conversion r√©ussie.
‚úÖ Conversion r√©ussie en 104.1s! 
   üìÇ Dossier: /content/drive/MyDrive/G√©opolitique et Souverainet√© Num√©riques/ALL/ALLMD/Against Sovereignty in Cyberspace
   üìÑ Markdown: /content/drive/MyDrive/G√©opolitique et Souverainet√© Num√©riques/ALL/ALLMD/Against Sovereignty in Cyberspace/Against Sovereignty in Cyberspace.md
   üñºÔ∏è Figures: 0
   üìö R√©f√©rences: 0


## Étape 9 — Pipeline complet avec reprise
Traitement batch + logs + reprise automatique.

In [None]:
import time as time_module

def process_all_documents():
    """Process every PDF of ``INPUT_DIR`` with automatic resume.

    File names listed in ``progress.json`` (inside ``GLOBAL_LOGS_DIR``) are
    skipped. Every file handled in this run is added to that list, including
    files whose conversion failed, so failed files are not retried on resume.

    For each remaining PDF: convert it, optionally run LangExtract, then
    write ``<doc_name>_analysis.json`` and a per-document ``processing.log``.

    Returns:
        tuple ``(results, errors)``: ``results`` is the list of analysis
        dicts of the documents converted in this run, ``errors`` a list of
        ``{"file", "error"}`` dicts. Both are empty when there is no PDF or
        nothing left to process.
    """
    all_pdfs = sorted([p for p in INPUT_DIR.iterdir() if p.suffix.lower() == ".pdf"])

    if not all_pdfs:
        print("‚ùå Aucun PDF trouv√© dans le dossier d'entr√©e! ")
        print(f"   Chemin v√©rifi√©: {INPUT_DIR}")
        return [], []

    log_file = GLOBAL_LOGS_DIR / f"processing_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
    progress_file = GLOBAL_LOGS_DIR / "progress.json"

    def log_message(message):
        """Print *message* and append it to this run's log file."""
        print(message)
        with open(log_file, "a", encoding="utf-8") as f:
            f.write(message + "\n")

    # Load the names already handled by a previous run, if any.
    processed_files = set()
    if progress_file.exists():
        try:
            with open(progress_file, "r", encoding="utf-8") as f:
                progress_data = json.load(f)
                processed_files = set(progress_data.get("processed", []))
            log_message(f"üìÇ Reprise:  {len(processed_files)} fichiers d√©j√† trait√©s")
        except Exception:
            # Unreadable progress file: start over from scratch.
            processed_files = set()

    remaining_pdfs = [p for p in all_pdfs if p.name not in processed_files]

    log_message("="*60)
    log_message(f"üìÑ Total PDFs: {len(all_pdfs)}")
    log_message(f"‚úÖ D√©j√† trait√©s: {len(processed_files)}")
    log_message(f"‚è≥ Restants: {len(remaining_pdfs)}")
    log_message("="*60)

    if not remaining_pdfs:
        log_message("‚úÖ Tous les fichiers ont d√©j√† √©t√© trait√©s!")
        return [], []

    results = []
    errors = []
    start_time = time_module.time()

    def save_progress():
        """Rewrite the progress file with the current set of handled names."""
        with open(progress_file, "w", encoding="utf-8") as f:
            json.dump({
                "processed": list(processed_files),
                "last_update": datetime.now().isoformat()
            }, f, ensure_ascii=False, indent=2)

    for idx, pdf_path in enumerate(remaining_pdfs, 1):
        doc_name = pdf_path.stem
        doc_start_time = time_module.time()

        log_message(f"\n[{idx}/{len(remaining_pdfs)}] üöÄ Traitement:  {pdf_path.name}")

        conversion_result = convert_pdf_complete(pdf_path, doc_name)

        if conversion_result.get("error"):
            errors.append({"file": pdf_path.name, "error": conversion_result["error"]})
            log_message(f"   ‚ùå Erreur conversion: {conversion_result['error']}")
            # Failed files are recorded as handled too (see docstring).
            processed_files.add(pdf_path.name)
            save_progress()
            continue

        doc_folders = get_doc_folders(doc_name)

        log_message(
            f"   üñºÔ∏è Figures: {len(conversion_result['figures'])} trouv√©es, "
            f"{len(conversion_result['figures_paths'])} sauvegard√©es"
        )
        log_message(
            f"   üìö R√©f√©rences: {conversion_result['references'].get('reference_count', 0)} extraites"
        )

        extraction = None
        if USE_LANGEXTRACT:
            log_message("   üîç Extraction LangExtract... ")
            with open(conversion_result['markdown_path'], 'r', encoding='utf-8') as f:
                md_content = f.read()
            extraction = extract_with_langextract(
                md_content,
                doc_name,
                doc_folders,
                conversion_result['references'],
                conversion_result['figures'],
            )
            # The extraction may be any JSON value; only a dict carries a
            # status, and calling .get on anything else would abort the batch.
            if isinstance(extraction, dict) and extraction.get("status") == "error":
                log_message(f"   ‚ö†Ô∏è LangExtract: {extraction.get('error', 'Erreur inconnue')}")
            else:
                log_message("   ‚úÖ LangExtract termin√©")

        analysis = {
            "doc_name": doc_name,
            "source_pdf": str(pdf_path),
            "doc_folder": str(doc_folders["root"]),
            "processed_at": datetime.now().isoformat(),
            "processing_time_seconds": round(time_module.time() - doc_start_time, 2),
            "conversion": {
                "markdown_path": conversion_result['markdown_path'],
                "figures_count": len(conversion_result['figures']),
                "figures_saved": len(conversion_result['figures_paths']),
                "figures_paths": conversion_result['figures_paths'],
                "references_count": conversion_result['references'].get('reference_count', 0),
            },
            "references": conversion_result['references'],
            "figures": conversion_result['figures'],
            "langextract": extraction,
        }

        analysis_path = doc_folders["analyses"] / f"{doc_name}_analysis.json"
        with open(analysis_path, "w", encoding="utf-8") as f:
            json.dump(analysis, f, ensure_ascii=False, indent=2)

        # Fixed file name: it previously contained a stray space
        # ("processing. log").
        doc_log_path = doc_folders["logs"] / "processing.log"
        with open(doc_log_path, "w", encoding="utf-8") as f:
            f.write(f"Trait√© le: {datetime.now().isoformat()}\n")
            f.write(f"Dur√©e: {analysis['processing_time_seconds']} secondes\n")
            f.write(f"Figures:  {len(conversion_result['figures_paths'])}\n")
            f.write(f"R√©f√©rences: {conversion_result['references'].get('reference_count', 0)}\n")

        results.append(analysis)
        processed_files.add(pdf_path.name)
        save_progress()

        elapsed = time_module.time() - doc_start_time
        log_message(f"   ‚è±Ô∏è Dur√©e: {elapsed:.1f}s")

    total_time = time_module.time() - start_time
    log_message("\n" + "="*60)
    log_message(f"‚úÖ Traitement termin√© en {total_time:.1f} secondes")
    log_message(f"   üìÑ R√©ussis: {len(results)}")
    log_message(f"   ‚ùå Erreurs: {len(errors)}")
    log_message("="*60)

    return results, errors


def export_all_results(results):
    """Write the global exports for a batch of analysis results.

    Files written into ``GLOBAL_LOGS_DIR``: ``_SUMMARY.json`` (all results),
    ``_BIBLIOGRAPHIE_COMPLETE.json`` (only when at least one reference
    exists), ``_INDEX_FIGURES.json`` (only when at least one figure path
    exists) and the Markdown report ``_REPORT.md``.

    Args:
        results: List of analysis dicts. Keys read here: ``doc_name``,
            ``doc_folder``, ``processing_time_seconds``, ``references``
            (its ``references_list``) and ``conversion`` (its
            ``figures_paths``, ``figures_count``, ``references_count``).

    Returns:
        None. When ``results`` is empty, a notice is printed and nothing is
        written.
    """
    if not results:
        print("‚ÑπÔ∏è Aucun r√©sultat √† exporter. ")
        return

    summary_path = GLOBAL_LOGS_DIR / "_SUMMARY.json"
    # Fixed file name: it previously contained a stray space ("_REPORT. md").
    report_md_path = GLOBAL_LOGS_DIR / "_REPORT.md"

    with open(summary_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    # One entry per (document, reference) pair; `or []` tolerates a missing
    # or empty references_list.
    all_references = [
        {"document": data.get("doc_name", ""), "reference": ref}
        for data in results
        for ref in (data.get("references", {}).get("references_list") or [])
    ]

    if all_references:
        biblio_path = GLOBAL_LOGS_DIR / "_BIBLIOGRAPHIE_COMPLETE.json"
        with open(biblio_path, "w", encoding="utf-8") as f:
            json.dump(all_references, f, ensure_ascii=False, indent=2)
        print(f"üìö Bibliographie:  {biblio_path} ({len(all_references)} r√©f√©rences)")

    # One entry per saved figure file.
    all_figures = [
        {"document": data.get("doc_name", ""), "path": fig_path}
        for data in results
        for fig_path in data.get("conversion", {}).get("figures_paths", [])
    ]

    if all_figures:
        figures_index_path = GLOBAL_LOGS_DIR / "_INDEX_FIGURES.json"
        with open(figures_index_path, "w", encoding="utf-8") as f:
            json.dump(all_figures, f, ensure_ascii=False, indent=2)
        print(f"üñºÔ∏è Index figures: {figures_index_path} ({len(all_figures)} figures)")

    # Markdown report: global totals, then one section per document.
    md_lines = [
        "# Rapport de traitement MFEGSN\n\n",
        f"**Date:** {datetime.now().strftime('%Y-%m-%d %H:%M')}\n\n",
        f"- **Total documents trait√©s:** {len(results)}\n",
        f"- **Total figures extraites:** {len(all_figures)}\n",
        f"- **Total r√©f√©rences:** {len(all_references)}\n\n",
        "## Documents trait√©s\n\n",
    ]

    for data in results:
        conv = data.get("conversion", {})
        md_lines.extend([
            f"### {data.get('doc_name', '')}\n\n",
            f"- **Dossier:** `{data.get('doc_folder', '')}`\n",
            f"- **Figures:** {conv.get('figures_count', 0)}\n",
            f"- **R√©f√©rences:** {conv.get('references_count', 0)}\n",
            f"- **Dur√©e:** {data.get('processing_time_seconds', 0)}s\n\n",
        ])

    with open(report_md_path, "w", encoding="utf-8") as f:
        f.write("".join(md_lines))

    print(f"‚úÖ R√©sultats export√©s: ")
    print(f"   üìä Summary: {summary_path}")
    print(f"   üìù Report: {report_md_path}")

# Confirmation that the pipeline functions of this cell are now defined.
print("‚úÖ Fonctions de pipeline d√©finies. ")

## Étape 10 — Lancer le traitement
Exécutez cette cellule pour lancer le traitement complet.

In [None]:
# Re-read INPUT_DIR so the preview below reflects the folder's current state.
print("üìÇ V√©rification du dossier d'entr√©e...")
print(f"   Chemin: {INPUT_DIR}")

# Fresh listing of the PDF files, sorted by path.
pdf_files = sorted(
    entry for entry in INPUT_DIR.iterdir() if entry.suffix.lower() == ".pdf"
)
print(f"   PDFs trouv√©s: {len(pdf_files)}")

if pdf_files:
    # Preview at most the first five file names.
    for i, pdf in enumerate(pdf_files[:5], 1):
        print(f"   {i}. {pdf.name}")
    if len(pdf_files) > 5:
        print(f"   ... et {len(pdf_files) - 5} autres")
else:
    print("\n‚ö†Ô∏è ATTENTION: Aucun PDF trouv√©!")
    print("   V√©rifiez que:")
    print("   1. Le chemin INPUT_DIR est correct")
    print("   2. Des fichiers PDF sont pr√©sents dans ce dossier")
    print("   3. Google Drive est bien mont√©")

print("\n" + 60 * "=")
print("üöÄ D√©marrage du pipeline...")
print(60 * "=")

# The pipeline is started in every case; it reports by itself when the
# folder holds no PDF.
results, errors = process_all_documents()

if results:
    export_all_results(results)

print("\n" + 60 * "=")
print("‚úÖ PIPELINE TERMIN√â")
print(60 * "=")

if errors:
    # Show at most five errors, each message cut to 50 characters.
    print(f"\n‚ö†Ô∏è {len(errors)} erreur(s) rencontr√©e(s):")
    for err in errors[:5]:
        print(f"   - {err['file']}: {err['error'][:50]}...")
    if len(errors) > 5:
        print(f"   ... et {len(errors) - 5} autres erreurs")

## Utilitaires — Gestion du cache
Cellules optionnelles pour gérer le cache des modèles sur Drive.

In [None]:
# === SHOW THE CACHE SIZE ===
def show_cache_info():
    """Print the size in GB of each Drive cache folder and of the whole cache."""
    banner = "=" * 60
    print(banner)
    print("üíæ INFORMATIONS CACHE")
    print(banner)

    cache_locations = {
        "HuggingFace": HF_CACHE_DRIVE,
        "Torch": TORCH_CACHE_DRIVE,
        "Datalab": DATALAB_CACHE_DRIVE,
    }
    for label, location in cache_locations.items():
        print(f"üìÇ {label}: {get_dir_size(location):.2f} GB")

    print(f"\nüì¶ Total: {get_dir_size(DRIVE_CACHE_DIR):.2f} GB")
    print(f"üìç Emplacement: {DRIVE_CACHE_DIR}")
    print(banner)

# Run the report immediately when the cell is executed.
show_cache_info()

In [None]:
# === DELETE THE CACHE (if needed) ===
# WARNING: this removes every cached model!
# Uncomment the lines below to run the deletion.

# import shutil
#
# if DRIVE_CACHE_DIR. exists():
#     print(f"‚ö†Ô∏è Suppression du cache:  {DRIVE_CACHE_DIR}")
#     shutil.rmtree(DRIVE_CACHE_DIR)
#     print("‚úÖ Cache supprim√©.  Les mod√®les seront ret√©l√©charg√©s au prochain lancement.")
# else:
#     print("‚ÑπÔ∏è Aucun cache √† supprimer.")

# As shipped, this cell only prints a reminder; nothing is deleted.
print("‚ÑπÔ∏è D√©commentez le code ci-dessus pour supprimer le cache.")