# MFEGSN — Pipeline Colab (Marker + LangExtract)

Notebook optimisé pour une exécution **pas à pas** sur Google Colab.

**🚀 Optimisation:** Les modèles (~4GB) sont sauvegardés sur Google Drive après le premier téléchargement.
Les sessions suivantes chargeront les modèles depuis Drive en quelques secondes.

**Ordre recommandé:** Étapes 1 → 10

## Étape 1 — Monter Google Drive
**IMPORTANT:** Exécutez cette cellule EN PREMIER pour permettre le cache des modèles.

In [None]:
import os
from pathlib import Path

from google.colab import drive

# Mount Google Drive first: every cache path below lives under /content/drive.
drive.mount("/content/drive")

# === CACHE LAYOUT ===
# Root cache folder on Drive plus one sub-folder per cache type.
DRIVE_CACHE_DIR = Path("/content/drive/MyDrive/.mfegsn_cache")
HF_CACHE_DRIVE = DRIVE_CACHE_DIR / "huggingface"
TORCH_CACHE_DRIVE = DRIVE_CACHE_DIR / "torch"
DATALAB_CACHE_DRIVE = DRIVE_CACHE_DIR / "datalab"

# Create the root and its sub-folders (no-op when they already exist).
for cache_dir in (DRIVE_CACHE_DIR, HF_CACHE_DRIVE, TORCH_CACHE_DRIVE, DATALAB_CACHE_DRIVE):
    cache_dir.mkdir(parents=True, exist_ok=True)

# Point the cache-related environment variables at the Drive folders.
os.environ.update({
    "HF_HOME": str(HF_CACHE_DRIVE),
    "TRANSFORMERS_CACHE": str(HF_CACHE_DRIVE),
    "HUGGINGFACE_HUB_CACHE": str(HF_CACHE_DRIVE),
    "TORCH_HOME": str(TORCH_CACHE_DRIVE),
    "XDG_CACHE_HOME": str(DRIVE_CACHE_DIR),
})
# V√©rifier la taille du cache existant
def get_dir_size(path):
    """Return the total size of the regular files under *path*, in units of 1e9 bytes.

    The tree is walked recursively with ``Path.rglob``.

    Args:
        path: ``pathlib.Path`` of the directory to measure.

    Returns:
        The summed ``st_size`` of every regular file divided by 1e9, or 0.0
        when *path* does not exist.
    """
    if not path.exists():
        return 0.0

    total = 0
    for entry in path.rglob("*"):
        # An entry may disappear or be unreadable between listing and stat;
        # skip it instead of aborting the whole measurement.
        try:
            if entry.is_file():
                total += entry.stat().st_size
        except OSError:
            continue
    return total / 1e9

# Measure the Drive cache once, then print a status banner.
cache_size = get_dir_size(DRIVE_CACHE_DIR)

print(
    "=" * 60,
    "‚úÖ Google Drive mont√©! ",
    "=" * 60,
    f"üìÇ Cache Drive:  {DRIVE_CACHE_DIR}",
    f"üíæ Taille du cache: {cache_size:.2f} GB",
    # More than 3 (GB) on Drive is treated as "models already cached".
    (
        "üöÄ Mod√®les d√©j√† en cache!  Le chargement sera rapide."
        if cache_size > 3
        else "üì• Premier lancement:  les mod√®les seront t√©l√©charg√©s et mis en cache."
    ),
    "=" * 60,
    sep="\n",
)

## Étape 2 — Installer les dépendances
Installation des packages Python nécessaires.

In [None]:
# System dependencies (IPython "!" shell magics: this cell only runs in a notebook)
!apt-get update -qq
! apt-get install -y zstd -qq

# Python dependencies
!python -m pip install -q --upgrade pip
!python -m pip install -q marker-pdf[full] langextract google-generativeai pillow

# Install hf_transfer for faster downloads (--no-deps so the packages installed above are left untouched)
!pip install -q hf_transfer --no-deps

# Enable hf_transfer
import os
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

# Show the installed versions
print("\nüì¶ Versions install√©es:")
!pip show huggingface_hub 2>/dev/null | grep Version
! pip show transformers 2>/dev/null | grep Version
!pip show marker-pdf 2>/dev/null | grep Version

print("\n‚úÖ D√©pendances install√©es.")

## Étape 3 — Configurer les dossiers de travail
Modifiez les chemins selon votre Drive.

In [None]:
from pathlib import Path

# === EDIT THESE PATHS ===
INPUT_DIR = Path("/content/drive/MyDrive/G√©opolitique et Souverainet√© Num√©riques/ALL/ALLPDF")
OUTPUT_DIR = Path("/content/drive/MyDrive/G√©opolitique et Souverainet√© Num√©riques/ALL/ALLMD")

# Fail fast with an explicit exception: unlike an assert, this check is not
# stripped under -O, and it also rejects a path that exists but is not a
# directory (INPUT_DIR.iterdir() is called on it later).
if not INPUT_DIR.is_dir():
    raise FileNotFoundError(f"‚ùå Dossier introuvable: {INPUT_DIR}")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Global folder for the pipeline logs
GLOBAL_LOGS_DIR = OUTPUT_DIR / "_PIPELINE_LOGS"
GLOBAL_LOGS_DIR.mkdir(parents=True, exist_ok=True)

def get_doc_folders(doc_name):
    """Create the output folder tree of one document and return its paths.

    Args:
        doc_name: Name of the document; used as the folder name under OUTPUT_DIR.

    Returns:
        A dict mapping ``root``, ``figures``, ``references``, ``analyses`` and
        ``logs`` to their ``Path``; every folder exists when the call returns.
    """
    root = OUTPUT_DIR / doc_name
    subfolder_names = (
        ("figures", "_FIGURES"),
        ("references", "_REFERENCES"),
        ("analyses", "_ANALYSES"),
        ("logs", "_LOGS"),
    )

    layout = {"root": root}
    layout.update((key, root / folder) for key, folder in subfolder_names)

    # "root" comes first in the dict, so the parent is created before its children.
    for directory in layout.values():
        directory.mkdir(parents=True, exist_ok=True)

    return layout

# Collect the PDFs of the input folder (case-insensitive on the extension).
pdf_files = sorted(
    entry for entry in INPUT_DIR.iterdir() if entry.suffix.lower() == ".pdf"
)

# Configuration summary, including the per-document output layout.
print(
    "=" * 60,
    "üìÅ CONFIGURATION",
    "=" * 60,
    f"üìÇ Entr√©e: {INPUT_DIR}",
    f"üìÇ Sortie: {OUTPUT_DIR}",
    f"üìÑ PDFs: {len(pdf_files)}",
    "üìÇ Structure par fichier:  <nom_fichier>/",
    "   ‚îú‚îÄ‚îÄ _ANALYSES/",
    "   ‚îú‚îÄ‚îÄ _FIGURES/",
    "   ‚îú‚îÄ‚îÄ _LOGS/",
    "   ‚îî‚îÄ‚îÄ _REFERENCES/",
    "=" * 60,
    sep="\n",
)

## Étape 4 — Ollama + Gemma 3 4B
Installation et démarrage de Gemma 3 4B via Ollama.

In [None]:
USE_OLLAMA = True  # Activ√© par d√©faut pour Gemma local

ollama_process = None

if USE_OLLAMA:
    import subprocess
    import time
    import os
    
    # Installer requests si n√©cessaire
    try:
        import requests
    except ImportError:
        !pip install -q requests
        import requests

    # Installer Ollama
    print("üì¶ Installation d'Ollama...")
    !curl -fsSL https://ollama.com/install.sh 2>/dev/null | sh 2>&1 | tail -n 3

    # D√©marrer Ollama en arri√®re-plan
    ollama_process = subprocess.Popen(
        ["ollama", "serve"],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL
    )
    print("‚è≥ D√©marrage du serveur Ollama...")
    
    # Attendre que le serveur soit pr√™t (avec timeout de 30 secondes)
    server_ready = False
    for i in range(30):
        try:
            response = requests.get("http://localhost:11434/api/tags", timeout=2)
            if response.status_code == 200:
                print("‚úÖ Serveur Ollama pr√™t!")
                server_ready = True
                break
        except:
            pass
        time.sleep(1)
    
    if not server_ready:
        print("‚ö†Ô∏è Le serveur Ollama n'a pas d√©marr√© √† temps")
        print("   Essayez de r√©ex√©cuter cette cellule.")

    # T√©l√©charger Gemma 3 4B (SANS espace dans le nom!)
    print("\nüì• T√©l√©chargement de Gemma 3 4B (‚âà3GB)...")
    print("   Cela peut prendre plusieurs minutes...")
    !ollama pull gemma3:4b
    
    # V√©rification
    result = subprocess.run(["ollama", "list"], capture_output=True, text=True)
    if "gemma3:4b" in result.stdout or "gemma3" in result.stdout:
        print("\n‚úÖ Gemma 3 4B t√©l√©charg√©!")
        
        # Pr√©chauffer le mod√®le avec une requ√™te test
        print("üî• Pr√©chauffage du mod√®le...")
        test_result = subprocess.run(
            ["ollama", "run", "gemma3:4b", "R√©ponds uniquement: OK"],
            capture_output=True,
            text=True,
            timeout=30
        )
        
        if test_result.returncode == 0:
            print("‚úÖ Gemma 3 4B pr√™t et op√©rationnel!")
            print("\n" + "="*60)
            print("ü§ñ OLLAMA CONFIGUR√â")
            print("="*60)
            print("Mod√®le: gemma3:4b")
            print("Serveur: http://localhost:11434")
            print("="*60)
        else:
            print("‚ö†Ô∏è Avertissement: Le test du mod√®le a √©chou√©")
            print("   Le mod√®le devrait quand m√™me fonctionner.")
    else:
        print("‚ùå Erreur: Gemma 3 4B non trouv√©")
        print("Mod√®les disponibles:")
        print(result.stdout if result.stdout else "Aucun mod√®le")
else:
    print("‚ÑπÔ∏è Ollama d√©sactiv√©. Mettez USE_OLLAMA = True pour l'activer.")

## Étape 5 — Configurer Marker
Chargement des modèles Marker (~4GB) depuis le cache Drive.

**⚡ Premier lancement:** 10-20 minutes (téléchargement + sauvegarde sur Drive)

**🚀 Lancements suivants:** 1-2 minutes (chargement depuis Drive)

In [None]:
import json
import re
import shutil
import base64
import logging
import warnings
import os
import sys
import time
from datetime import datetime
from pathlib import Path

# === LOGGING SETUP ===
# Silence non-critical warning categories
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

# Root logger: warnings and above only
logging.getLogger().setLevel(logging.WARNING)

# Third-party libraries: errors only
for logger_name in (
    'transformers', 'torch', 'PIL', 'urllib3', 'filelock',
    'huggingface_hub', 'marker', 'datasets',
):
    logging.getLogger(logger_name).setLevel(logging.ERROR)

os.environ.update({
    # Verbosity-related variables
    'TRANSFORMERS_VERBOSITY': 'error',
    'TOKENIZERS_PARALLELISM': 'false',
    'TF_CPP_MIN_LOG_LEVEL': '3',
    'HF_HUB_DISABLE_PROGRESS_BARS': '0',  # keep the download progress bars
    # Re-assert the Drive cache locations in case this cell runs in a fresh state
    "HF_HOME": str(HF_CACHE_DRIVE),
    "TRANSFORMERS_CACHE": str(HF_CACHE_DRIVE),
    "HUGGINGFACE_HUB_CACHE": str(HF_CACHE_DRIVE),
    "TORCH_HOME": str(TORCH_CACHE_DRIVE),
    "HF_HUB_ENABLE_HF_TRANSFER": "1",
    # === SURYA MODELS FROM HUGGING FACE ===
    # Intended to make Surya fetch its models from the Hugging Face hub rather
    # than from the Datalab servers.
    # NOTE(review): whether the installed surya/marker versions actually read
    # these SURYA_* variables, and whether these repo ids are current, is not
    # visible in this notebook - confirm.
    "SURYA_LAYOUT_MODEL": "vikp/surya_layout",
    "SURYA_REC_MODEL": "vikp/surya_rec",
    "SURYA_DET_MODEL": "vikp/surya_det",
    "SURYA_ORDER_MODEL": "vikp/surya_order",
    "SURYA_TABLE_REC_MODEL": "vikp/surya_tablerec",
    "SURYA_DOWNLOAD_BACKEND": "huggingface",
})

import torch
from PIL import Image

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(
    "=" * 60,
    "üñ•Ô∏è CONFIGURATION MAT√âRIELLE",
    "=" * 60,
    f"Device: {device}",
    sep="\n",
)
if device == "cuda":
    # Name and total memory (in units of 1e9 bytes) of GPU 0
    print(
        f"GPU: {torch.cuda.get_device_name(0)}",
        f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB",
        sep="\n",
    )

# V√©rifier le cache existant
def get_dir_size(path):
    """Return the total size of the regular files under *path*, in units of 1e9 bytes.

    The tree is walked recursively with ``Path.rglob``.

    Args:
        path: ``pathlib.Path`` of the directory to measure.

    Returns:
        The summed ``st_size`` of every regular file divided by 1e9, or 0.0
        when *path* does not exist.
    """
    if not path.exists():
        return 0.0

    total = 0
    for entry in path.rglob("*"):
        # Only filesystem errors are skipped (entry vanished, unreadable, ...);
        # a bare "except" would also hide KeyboardInterrupt and real bugs.
        try:
            if entry.is_file():
                total += entry.stat().st_size
        except OSError:
            continue
    return total / 1e9

# Size of the Drive cache before loading; compared with the final size later on.
initial_cache_size = get_dir_size(DRIVE_CACHE_DIR)
print(f"\nüíæ Cache Drive: {initial_cache_size:.2f} GB")

# More than 3 (GB) on Drive is treated as "models already cached".
if not initial_cache_size > 3:
    print(
        "üì• Premier t√©l√©chargement des mod√®les Marker (~4GB)",
        "   ‚è±Ô∏è Dur√©e estim√©e: 10-20 minutes",
        "   üíæ Les mod√®les seront sauvegard√©s sur Drive",
        sep="\n",
    )
else:
    print("üöÄ Mod√®les trouv√©s en cache! Chargement rapide...")

print("=" * 60)

# Start the stopwatch for the model loading
start_time = time.time()

# Announce the loading, with a hint on where the models come from
print("\nüì¶ Chargement des mod√®les Marker...")
print(
    "   ‚ö° Chargement depuis le cache Drive..."
    if initial_cache_size > 3
    else "   üì• T√©l√©chargement en cours (barres de progression ci-dessous)..."
)

from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.config.parser import ConfigParser

# Marker configuration (local Gemma through Ollama when USE_OLLAMA is set).
# "workers" and "batch_size" are read back by the summary printed after loading.
marker_config = {
    "workers": 2,
    "extract_images": True,
    "images_as_base64": False,
    "use_llm": USE_OLLAMA,
    "llm_provider": "ollama" if USE_OLLAMA else None,
    "llm_model": "gemma3:4b" if USE_OLLAMA else None,
    "force_ocr": False,
    "languages": ["fr", "en"],
    "paginate_output": True,
    "batch_size": 4 if device == "cuda" else 2,
}

if USE_OLLAMA:
    # Marker selects its LLM backend through "llm_service" (a dotted class
    # path) and the service's own options; without these, "use_llm" falls back
    # to Marker's default service instead of the local Ollama server.
    # NOTE(review): option names follow the Marker README - confirm against
    # the installed marker-pdf version.
    marker_config.update({
        "llm_service": "marker.services.ollama.OllamaService",
        "ollama_base_url": "http://localhost:11434",
        "ollama_model": "gemma3:4b",
    })

print("\n‚öôÔ∏è Cr√©ation du dictionnaire de mod√®les...")
model_dict = create_model_dict()

print("‚úì Configuration du convertisseur...")
config_parser = ConfigParser(marker_config)
converter = PdfConverter(
    config=config_parser.generate_config_dict(),
    artifact_dict=model_dict,
    # None when use_llm is off, otherwise the configured service class path
    llm_service=config_parser.get_llm_service(),
)

# R√©sum√©
# Loading time and cache growth since the measurement taken before loading
elapsed = time.time() - start_time
final_cache_size = get_dir_size(DRIVE_CACHE_DIR)
downloaded = final_cache_size - initial_cache_size

print(
    "\n" + "=" * 60,
    "‚úÖ MARKER CONFIGUR√â AVEC SUCC√àS!",
    "=" * 60,
    f"‚è±Ô∏è Dur√©e: {elapsed/60:.1f} minutes",
    # Clamp at 0: the cache can shrink between the two measurements
    f"üì¶ T√©l√©charg√© cette session: {max(0, downloaded):.2f} GB",
    f"üíæ Cache total sur Drive: {final_cache_size:.2f} GB",
    f"‚öôÔ∏è Workers: {marker_config['workers']}, Batch: {marker_config['batch_size']}",
    sep="\n",
)
if USE_OLLAMA:
    print("ü§ñ LLM: Gemma 3 4B via Ollama")
print("=" * 60)

# Only mention the new cache when a meaningful amount (> 0.5 GB) was added
if downloaded > 0.5:
    print(
        "\nüí° Les mod√®les sont maintenant en cache sur votre Drive.",
        "   Les prochaines sessions chargeront en 1-2 minutes!",
        sep="\n",
    )

## Étape 6 — Fonctions utilitaires
Extraction des références, figures et conversion PDF → Markdown.

In [None]:
def _split_reference_entries(section_text):
    """Split the text of a references section into one string per entry.

    A new entry starts on a line beginning with ``[n]``, ``n.``, a ``-`` or
    bullet character, or ``(`` followed by a capital letter. Other non-empty
    lines continue the current entry; a blank line closes it.
    """
    entry_start = re.compile(r"^(\[\d+\]|\d+\.|[-•]|\([A-Z])")
    entries = []
    current = ""

    for raw_line in section_text.split("\n"):
        line = raw_line.strip()
        if not line:
            if current:
                entries.append(current.strip())
                current = ""
            continue

        if entry_start.match(line):
            if current:
                entries.append(current.strip())
            current = line
        else:
            current += " " + line

    if current:
        entries.append(current.strip())
    return entries


def extract_references_from_markdown(markdown_text):
    """Extract the references/bibliography section from a Markdown document.

    The section is located by its title, either a level 1-3 ``#`` heading or a
    bold line (``**References**``), in English or French and case-insensitively.
    It runs until the next heading (or bold marker for the second form) or the
    end of the text.

    Args:
        markdown_text: Full Markdown text of the document.

    Returns:
        A dict with ``references_text`` (the stripped section text),
        ``references_list`` (entries longer than 20 characters) and
        ``reference_count`` (length of that list). The values are empty / 0
        when no section is found.
    """
    references = {
        "references_text": "",
        "references_list": [],
        "reference_count": 0,
    }
    if not markdown_text:
        return references

    # Inline groups must be written without inner spaces: "(? i)", "(? :" and
    # "(? =" are rejected by the re module and made every call raise re.error.
    section_patterns = (
        r"(?i)(?:^|\n)#{1,3}\s*(references|références|bibliography|bibliographie|works\s*cited|sources?)\s*[\s:]*\n([\s\S]*?)(?=\n#{1,3}\s|\Z)",
        r"(?i)(?:^|\n)\*\*(references|références|bibliography|bibliographie)\*\*\s*[\s:]*\n([\s\S]*?)(?=\n\*\*|\n#{1,3}|\Z)",
    )

    for pattern in section_patterns:
        match = re.search(pattern, markdown_text, re.MULTILINE)
        if match is None:
            continue

        section = match.group(2).strip()
        references["references_text"] = section
        # Very short fragments (20 characters or fewer) are treated as noise.
        references["references_list"] = [
            entry for entry in _split_reference_entries(section) if len(entry) > 20
        ]
        references["reference_count"] = len(references["references_list"])
        break  # the first pattern that matches wins

    return references


def extract_figures_info(markdown_text, images_dict):
    """List the figures of a document: captions found in the text, then images.

    Captions are matched case-insensitively as ``Figure``/``Fig.`` followed by
    an optional number, an optional colon and up to 120 characters of title on
    the same line.

    Args:
        markdown_text: Markdown text to scan for captions.
        images_dict: Mapping whose keys are image names (may be empty or None).

    Returns:
        A list of dicts. Caption entries have ``label`` (the whole match) and
        ``title``; image entries have ``label``, an empty ``title`` and
        ``path``, the last two being the image name as a string.
    """
    # No stray spaces inside the pattern: "(. {0,120})" captured a single
    # character followed by spaces instead of the title. The leading \b keeps
    # words that merely end in "figure" (e.g. "configure") from matching.
    caption_pattern = r"(?i)\b(figure|fig\.)\s*(\d+)?\s*:?\s*(.{0,120})"

    figures = [
        {
            "label": match.group(0).strip(),
            "title": (match.group(3) or "").strip(),
        }
        for match in re.finditer(caption_pattern, markdown_text or "")
    ]

    # One entry per distinct image name (names are compared as strings).
    seen_paths = set()
    for img_name in images_dict or {}:
        path = str(img_name)
        if path in seen_paths:
            continue
        seen_paths.add(path)
        figures.append({"label": path, "title": "", "path": path})

    return figures


def _write_string_image(img_data, img_path):
    """Write an image given as a data URI, an existing file path or raw base64."""
    if img_data.startswith("data:image"):
        # Data URI: the payload is the base64 text after the first comma.
        img_path.write_bytes(base64.b64decode(img_data.split(",", 1)[1]))
        return

    # A long base64 string is not a usable path; probing it can raise
    # (e.g. "File name too long"), which must not prevent the base64 fallback.
    try:
        is_existing_path = Path(img_data).exists()
    except (OSError, ValueError):
        is_existing_path = False

    if is_existing_path:
        shutil.copy(img_data, img_path)
    else:
        img_path.write_bytes(base64.b64decode(img_data))


def save_figures(images_dict, doc_folders):
    """Save the extracted figures into the document's figures folder.

    Each image is written as ``<sanitized name>.png``. Supported values are raw
    bytes, strings (data URI, path of an existing file, or raw base64) and PIL
    images. Saving is best effort: an image that cannot be written is logged
    as a warning and skipped, and unsupported value types are ignored.

    Args:
        images_dict: Mapping of image name to image data (may be empty or None).
        doc_folders: Dict of document folders; only ``"figures"`` is used.

    Returns:
        The list of written file paths, as strings.
    """
    if not images_dict:
        return []

    import logging

    figures_folder = doc_folders["figures"]
    saved_paths = []

    for img_name, img_data in images_dict.items():
        # Keep only characters that are safe in a file name.
        safe_name = re.sub(r"[^a-zA-Z0-9_-]+", "_", str(img_name))
        img_path = figures_folder / f"{safe_name}.png"

        try:
            if isinstance(img_data, (bytes, bytearray)):
                img_path.write_bytes(img_data)
            elif isinstance(img_data, str):
                _write_string_image(img_data, img_path)
            elif isinstance(img_data, Image.Image):
                img_data.save(img_path)
            else:
                continue
        except Exception as exc:
            # Deliberate best effort: one bad image must not stop the others.
            logging.getLogger(__name__).warning(
                "Figure %r could not be saved: %s", img_name, exc
            )
            continue

        saved_paths.append(str(img_path))

    return saved_paths


def convert_pdf_complete(pdf_path, doc_name):
    """Convert one PDF to Markdown and extract its figures and references.

    The Markdown is written to ``<doc folder>/<doc_name>.md``, the figures to
    the document's figures folder and, when at least one reference is found,
    the references to ``<doc_name>_references.json``. Any failure is recorded
    in the returned dict and in ``error.txt`` in the document's logs folder
    instead of being raised.

    Args:
        pdf_path: Path of the PDF to convert.
        doc_name: Name of the document, used for its folder and file names.

    Returns:
        A dict with ``doc_name``, ``doc_folder``, ``markdown_path``,
        ``figures``, ``figures_paths``, ``references`` and ``error`` (``None``
        on success, otherwise the error message).
    """
    doc_folders = get_doc_folders(doc_name)

    result_data = {
        "doc_name": doc_name,
        "doc_folder": str(doc_folders["root"]),
        "markdown_path": "",
        "figures": [],
        "figures_paths": [],
        "references": {},
        "error": None,
    }

    try:
        result = converter(str(pdf_path))
        # Missing or None attributes fall back to empty values.
        markdown_text = getattr(result, "markdown", "") or ""
        images_dict = getattr(result, "images", {}) or {}

        md_path = doc_folders["root"] / f"{doc_name}.md"
        with open(md_path, "w", encoding="utf-8") as f:
            f.write(markdown_text)

        result_data["markdown_path"] = str(md_path)
        result_data["figures_paths"] = save_figures(images_dict, doc_folders)
        result_data["figures"] = extract_figures_info(markdown_text, images_dict)

        result_data["references"] = extract_references_from_markdown(markdown_text)
        if result_data["references"].get("reference_count", 0) > 0:
            ref_path = doc_folders["references"] / f"{doc_name}_references.json"
            with open(ref_path, "w", encoding="utf-8") as f:
                json.dump(result_data["references"], f, ensure_ascii=False, indent=2)

        return result_data
    except Exception as e:
        # Boundary of the per-document work: report the failure to the caller
        # rather than aborting a whole batch.
        result_data["error"] = str(e)
        error_log = doc_folders["logs"] / "error.txt"
        try:
            with open(error_log, "w", encoding="utf-8") as f:
                f.write(f"Erreur:  {str(e)}\nDate: {datetime.now().isoformat()}")
        except OSError:
            # The error is already carried by result_data; a failed log write
            # must not turn into a crash.
            pass
        return result_data

print("‚úÖ Fonctions utilitaires d√©finies. ")

## Étape 7 — LangExtract
Extraction structurée avec LangExtract (activé par défaut avec Ollama).

In [None]:
# Master switch for the LangExtract step
USE_LANGEXTRACT = True

# Backend settings: the local Ollama server when enabled, otherwise OpenAI.
if USE_OLLAMA:
    LANGEXTRACT_CONFIG = {
        "provider": "ollama",
        "model": "gemma3:4b",
        "base_url": "http://localhost:11434",
    }
else:
    LANGEXTRACT_CONFIG = {
        "provider": "openai",
        "model": "gpt-3.5-turbo",
        "base_url": None,
    }

# Instruction prompt sent with every document (runtime text, kept verbatim).
PROMPT_TEMPLATE = """Vous √™tes un assistant d'analyse pour des documents en sciences sociales.
Retournez un JSON structur√© avec les sections suivantes:

1.  CONTEXTE
- Th√®me principal
- Zone g√©ographique
- P√©riode

2. ACTEURS
- Institutions
- Pays
- Organisations

3. CONCEPTS CL√âS
- Mots-cl√©s
- Concepts

4. DONN√âES
- Chiffres cl√©s (si disponibles)

5. R√âF√âRENCES
- Principales r√©f√©rences cit√©es

6. FIGURES ET TABLEAUX
- Liste des figures mentionn√©es

R√©pondez uniquement avec un JSON valide."""

def _safe_json(obj):
    """Return *obj* after a JSON round-trip, or ``{"raw": str(obj)}`` if that fails.

    The round-trip normalizes the value to plain JSON types (for example
    tuples come back as lists).
    """
    try:
        encoded = json.dumps(obj)
        normalized = json.loads(encoded)
    except Exception:
        normalized = {"raw": str(obj)}
    return normalized


def extract_with_langextract(markdown_text, doc_name, doc_folders, references_data=None, figures_data=None):
    """Run the LangExtract structured extraction on one document.

    The Markdown is first enriched with up to 20 extracted references and up
    to 10 identified figures, then sent to LangExtract together with
    PROMPT_TEMPLATE. On success the result is written to
    ``<doc_name>_langextract.json`` in the document's analyses folder.

    Args:
        markdown_text: Markdown text of the document.
        doc_name: Name of the document, used for the output file name.
        doc_folders: Dict of document folders; only ``"analyses"`` is used.
        references_data: Optional dict with ``reference_count`` and
            ``references_list``.
        figures_data: Optional list of dicts with ``label`` and ``title``.

    Returns:
        The JSON-normalized extraction, ``{"status": "skipped", ...}`` when
        the step is disabled, or ``{"status": "error", "error": ...}`` on
        failure.
    """
    if not USE_LANGEXTRACT:
        return {"status": "skipped", "reason": "USE_LANGEXTRACT=False"}

    # Build the enriched text from parts and join once at the end.
    parts = [markdown_text]

    if references_data and references_data.get("reference_count", 0) > 0:
        parts.append("\n\n## R√âF√âRENCES\n")
        parts.append(f"Nombre de r√©f√©rences: {references_data['reference_count']}\n")
        first_refs = references_data.get("references_list", [])[:20]
        parts.extend(f"[{rank}] {entry}\n" for rank, entry in enumerate(first_refs, 1))

    if figures_data:
        parts.append("\n\n## FIGURES IDENTIFI√âES\n")
        parts.append(f"Nombre de figures:  {len(figures_data)}\n")
        parts.extend(
            f"- {figure.get('label', '')} {figure.get('title', '')}\n"
            for figure in figures_data[:10]
        )

    enriched_text = "".join(parts)

    try:
        import langextract as lx

        # NOTE(review): the keyword names below (prompt / provider / model /
        # base_url) are passed through as-is; confirm they match the signature
        # of the installed langextract version - a mismatch only surfaces as
        # the "error" status returned by the except clause.
        extractor_kwargs = {"prompt": PROMPT_TEMPLATE}
        if USE_OLLAMA and LANGEXTRACT_CONFIG["base_url"]:
            extractor_kwargs.update(
                provider="ollama",
                model=LANGEXTRACT_CONFIG["model"],
                base_url=LANGEXTRACT_CONFIG["base_url"],
            )

        # Support both a module-level function and a class-based entry point.
        if hasattr(lx, "extract"):
            extraction = lx.extract(enriched_text, **extractor_kwargs)
        elif hasattr(lx, "LangExtract"):
            extraction = lx.LangExtract(**extractor_kwargs).extract(enriched_text)
        else:
            return {"status": "error", "error": "API LangExtract introuvable"}

        result = _safe_json(extraction)

        extraction_path = doc_folders["analyses"] / f"{doc_name}_langextract.json"
        with open(extraction_path, "w", encoding="utf-8") as out_file:
            json.dump(result, out_file, ensure_ascii=False, indent=2)

        return result
    except Exception as exc:
        return {"status":  "error", "error":  str(exc)}

print(f"‚úÖ LangExtract configur√© (Provider: {LANGEXTRACT_CONFIG['provider']}, Model: {LANGEXTRACT_CONFIG['model']})")

## Étape 8 — Test sur un PDF (optionnel)
Permet de valider la configuration avant le batch.

In [None]:
# Refresh the PDF list, then try the conversion on the first file only.
pdf_files = sorted(
    entry for entry in INPUT_DIR.iterdir() if entry.suffix.lower() == ".pdf"
)

if not pdf_files:
    print("‚ùå Aucun PDF trouv√© dans le dossier d'entr√©e.")
else:
    sample_path = pdf_files[0]
    sample_name = sample_path.stem
    print(f"üß™ Test sur:  {sample_path.name}")
    print("‚è≥ Conversion en cours...\n")

    # Time the conversion of the sample document
    test_start = time.time()
    sample_result = convert_pdf_complete(sample_path, sample_name)
    test_elapsed = time.time() - test_start

    if not sample_result.get("error"):
        print(
            f"‚úÖ Conversion r√©ussie en {test_elapsed:.1f}s! ",
            f"   üìÇ Dossier: {sample_result['doc_folder']}",
            f"   üìÑ Markdown: {sample_result['markdown_path']}",
            f"   üñºÔ∏è Figures: {len(sample_result['figures_paths'])}",
            f"   üìö R√©f√©rences: {sample_result['references'].get('reference_count', 0)}",
            sep="\n",
        )
    else:
        print(f"‚ùå Erreur:  {sample_result['error']}")

## Étape 9 — Pipeline complet avec reprise
Traitement batch + logs + reprise automatique.

In [None]:
import time as time_module

def process_all_documents():
    """Convert and analyse every PDF of INPUT_DIR, resuming from earlier runs.

    File names already listed in ``progress.json`` (in GLOBAL_LOGS_DIR) are
    skipped. Each remaining PDF is converted, optionally passed to LangExtract,
    and summarized in ``<doc_name>_analysis.json`` plus a per-document
    ``processing.log``. Progress is saved after every document. Messages are
    printed and appended to a timestamped log file.

    Returns:
        A ``(results, errors)`` tuple: the analysis dicts of the documents
        processed in this run, and ``{"file", "error"}`` dicts for failed
        conversions. Both are empty when there is nothing to do.
    """
    all_pdfs = sorted([p for p in INPUT_DIR.iterdir() if p.suffix.lower() == ".pdf"])

    if not all_pdfs:
        print("‚ùå Aucun PDF trouv√© dans le dossier d'entr√©e! ")
        print(f"   Chemin v√©rifi√©: {INPUT_DIR}")
        return [], []

    log_file = GLOBAL_LOGS_DIR / f"processing_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
    progress_file = GLOBAL_LOGS_DIR / "progress.json"

    def log_message(message):
        """Print *message* and append it to the run's log file."""
        print(message)
        with open(log_file, "a", encoding="utf-8") as f:
            f.write(message + "\n")

    # Load the resume state. Only the read/parse step is guarded, so a failure
    # while logging can no longer wipe a successfully loaded state.
    processed_files = set()
    if progress_file.exists():
        try:
            with open(progress_file, "r", encoding="utf-8") as f:
                progress_data = json.load(f)
            processed_files = set(progress_data.get("processed", []))
        except (OSError, ValueError, AttributeError, TypeError):
            # Unreadable or malformed progress file: start from scratch.
            processed_files = set()
        else:
            log_message(f"üìÇ Reprise:  {len(processed_files)} fichiers d√©j√† trait√©s")

    remaining_pdfs = [p for p in all_pdfs if p.name not in processed_files]

    log_message("="*60)
    log_message(f"üìÑ Total PDFs: {len(all_pdfs)}")
    log_message(f"‚úÖ D√©j√† trait√©s: {len(processed_files)}")
    log_message(f"‚è≥ Restants: {len(remaining_pdfs)}")
    log_message("="*60)

    if not remaining_pdfs:
        log_message("‚úÖ Tous les fichiers ont d√©j√† √©t√© trait√©s!")
        return [], []

    results = []
    errors = []
    start_time = time_module.time()

    def save_progress():
        """Persist the set of handled file names to progress.json."""
        with open(progress_file, "w", encoding="utf-8") as f:
            json.dump({
                "processed": list(processed_files),
                "last_update": datetime.now().isoformat()
            }, f, ensure_ascii=False, indent=2)

    for idx, pdf_path in enumerate(remaining_pdfs, 1):
        doc_name = pdf_path.stem
        doc_start_time = time_module.time()

        log_message(f"\n[{idx}/{len(remaining_pdfs)}] üöÄ Traitement:  {pdf_path.name}")

        conversion_result = convert_pdf_complete(pdf_path, doc_name)

        if conversion_result.get("error"):
            errors.append({"file": pdf_path.name, "error": conversion_result["error"]})
            log_message(f"   ‚ùå Erreur conversion: {conversion_result['error']}")
            # NOTE: failed files are also marked as processed, so they are not
            # retried on resume; remove them from progress.json to retry.
            processed_files.add(pdf_path.name)
            save_progress()
            continue

        doc_folders = get_doc_folders(doc_name)
        references = conversion_result['references']
        reference_count = references.get('reference_count', 0)

        log_message(
            f"   üñºÔ∏è Figures: {len(conversion_result['figures'])} trouv√©es, "
            f"{len(conversion_result['figures_paths'])} sauvegard√©es"
        )
        log_message(f"   üìö R√©f√©rences: {reference_count} extraites")

        extraction = None
        if USE_LANGEXTRACT:
            log_message("   üîç Extraction LangExtract... ")
            with open(conversion_result['markdown_path'], 'r', encoding='utf-8') as f:
                md_content = f.read()
            extraction = extract_with_langextract(
                md_content,
                doc_name,
                doc_folders,
                references,
                conversion_result['figures'],
            )
            # The extraction is JSON-normalized and may be a list or a scalar,
            # so only look for a status on dicts.
            if isinstance(extraction, dict) and extraction.get("status") == "error":
                log_message(f"   ‚ö†Ô∏è LangExtract: {extraction.get('error', 'Erreur inconnue')}")
            else:
                log_message("   ‚úÖ LangExtract termin√©")

        analysis = {
            "doc_name": doc_name,
            "source_pdf": str(pdf_path),
            "doc_folder": str(doc_folders["root"]),
            "processed_at": datetime.now().isoformat(),
            "processing_time_seconds": round(time_module.time() - doc_start_time, 2),
            "conversion": {
                "markdown_path": conversion_result['markdown_path'],
                "figures_count": len(conversion_result['figures']),
                "figures_saved": len(conversion_result['figures_paths']),
                "figures_paths": conversion_result['figures_paths'],
                "references_count": reference_count,
            },
            "references": references,
            "figures": conversion_result['figures'],
            "langextract": extraction,
        }

        analysis_path = doc_folders["analyses"] / f"{doc_name}_analysis.json"
        with open(analysis_path, "w", encoding="utf-8") as f:
            json.dump(analysis, f, ensure_ascii=False, indent=2)

        # Per-document log (file name without the stray space of "processing. log")
        doc_log_path = doc_folders["logs"] / "processing.log"
        with open(doc_log_path, "w", encoding="utf-8") as f:
            f.write(f"Trait√© le: {datetime.now().isoformat()}\n")
            f.write(f"Dur√©e: {analysis['processing_time_seconds']} secondes\n")
            f.write(f"Figures:  {len(conversion_result['figures_paths'])}\n")
            f.write(f"R√©f√©rences: {reference_count}\n")

        results.append(analysis)
        processed_files.add(pdf_path.name)
        save_progress()

        elapsed = time_module.time() - doc_start_time
        log_message(f"   ‚è±Ô∏è Dur√©e: {elapsed:.1f}s")

    total_time = time_module.time() - start_time
    log_message("\n" + "="*60)
    log_message(f"‚úÖ Traitement termin√© en {total_time:.1f} secondes")
    log_message(f"   üìÑ R√©ussis: {len(results)}")
    log_message(f"   ‚ùå Erreurs: {len(errors)}")
    log_message("="*60)

    return results, errors


def export_all_results(results):
    """Write the global exports of a run into GLOBAL_LOGS_DIR.

    Produces ``_SUMMARY.json`` (all analyses), ``_REPORT.md`` (readable
    report) and, when non-empty, ``_BIBLIOGRAPHIE_COMPLETE.json`` (every
    reference with its document) and ``_INDEX_FIGURES.json`` (every saved
    figure path with its document).

    Args:
        results: List of analysis dicts as returned by process_all_documents.
            Nothing is written when it is empty.
    """
    if not results:
        print("‚ÑπÔ∏è Aucun r√©sultat √† exporter. ")
        return

    summary_path = GLOBAL_LOGS_DIR / "_SUMMARY.json"
    # File name without the stray space of "_REPORT. md"
    report_md_path = GLOBAL_LOGS_DIR / "_REPORT.md"

    with open(summary_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    # Flatten the references of every document into one bibliography.
    all_references = [
        {"document": data.get("doc_name", ""), "reference": ref}
        for data in results
        for ref in ((data.get("references") or {}).get("references_list") or [])
    ]

    if all_references:
        biblio_path = GLOBAL_LOGS_DIR / "_BIBLIOGRAPHIE_COMPLETE.json"
        with open(biblio_path, "w", encoding="utf-8") as f:
            json.dump(all_references, f, ensure_ascii=False, indent=2)
        print(f"üìö Bibliographie:  {biblio_path} ({len(all_references)} r√©f√©rences)")

    # Index of every saved figure file.
    all_figures = [
        {"document": data.get("doc_name", ""), "path": fig_path}
        for data in results
        for fig_path in data.get("conversion", {}).get("figures_paths", [])
    ]

    if all_figures:
        figures_index_path = GLOBAL_LOGS_DIR / "_INDEX_FIGURES.json"
        with open(figures_index_path, "w", encoding="utf-8") as f:
            json.dump(all_figures, f, ensure_ascii=False, indent=2)
        print(f"üñºÔ∏è Index figures: {figures_index_path} ({len(all_figures)} figures)")

    # Markdown report: global totals, then one section per document.
    md_lines = [
        "# Rapport de traitement MFEGSN\n\n",
        f"**Date:** {datetime.now().strftime('%Y-%m-%d %H:%M')}\n\n",
        f"- **Total documents trait√©s:** {len(results)}\n",
        f"- **Total figures extraites:** {len(all_figures)}\n",
        f"- **Total r√©f√©rences:** {len(all_references)}\n\n",
        "## Documents trait√©s\n\n",
    ]

    for data in results:
        conv = data.get("conversion", {})
        md_lines.extend([
            f"### {data.get('doc_name', '')}\n\n",
            f"- **Dossier:** `{data.get('doc_folder', '')}`\n",
            f"- **Figures:** {conv.get('figures_count', 0)}\n",
            f"- **R√©f√©rences:** {conv.get('references_count', 0)}\n",
            f"- **Dur√©e:** {data.get('processing_time_seconds', 0)}s\n\n",
        ])

    with open(report_md_path, "w", encoding="utf-8") as f:
        f.write("".join(md_lines))

    print("‚úÖ R√©sultats export√©s: ")
    print(f"   üìä Summary: {summary_path}")
    print(f"   üìù Report: {report_md_path}")

print("‚úÖ Fonctions de pipeline d√©finies. ")

## Étape 10 — Lancer le traitement
Exécutez cette cellule pour lancer le traitement complet.

In [None]:
# Re-read the PDF list from INPUT_DIR so that it is up to date
print("üìÇ V√©rification du dossier d'entr√©e...")
print(f"   Chemin: {INPUT_DIR}")

pdf_files = sorted(
    entry for entry in INPUT_DIR.iterdir() if entry.suffix.lower() == ".pdf"
)
print(f"   PDFs trouv√©s: {len(pdf_files)}")

if pdf_files:
    # Preview the first five file names only
    for i, pdf in enumerate(pdf_files[:5], 1):
        print(f"   {i}. {pdf.name}")
    if len(pdf_files) > 5:
        print(f"   ... et {len(pdf_files) - 5} autres")
else:
    print(
        "\n‚ö†Ô∏è ATTENTION: Aucun PDF trouv√©!",
        "   V√©rifiez que:",
        "   1. Le chemin INPUT_DIR est correct",
        "   2. Des fichiers PDF sont pr√©sents dans ce dossier",
        "   3. Google Drive est bien mont√©",
        sep="\n",
    )

print("\n" + "=" * 60, "üöÄ D√©marrage du pipeline...", "=" * 60, sep="\n")

# The pipeline is started even when the list is empty; process_all_documents
# handles that case itself.
results, errors = process_all_documents()

if results:
    export_all_results(results)

print("\n" + "=" * 60, "‚úÖ PIPELINE TERMIN√â", "=" * 60, sep="\n")

if errors:
    # Show at most five errors, each message truncated to 50 characters
    print(f"\n‚ö†Ô∏è {len(errors)} erreur(s) rencontr√©e(s):")
    for err in errors[:5]:
        print(f"   - {err['file']}: {err['error'][:50]}...")
    if len(errors) > 5:
        print(f"   ... et {len(errors) - 5} autres erreurs")

## Utilitaires — Gestion du cache
Cellules optionnelles pour gérer le cache des modèles sur Drive.

In [None]:
# === AFFICHER LA TAILLE DU CACHE ===
def show_cache_info():
    """Print the size of each Drive cache folder, the total size and the location."""
    banner = "=" * 60
    cache_folders = {
        "HuggingFace": HF_CACHE_DRIVE,
        "Torch": TORCH_CACHE_DRIVE,
        "Datalab": DATALAB_CACHE_DRIVE,
    }

    print(banner)
    print("üíæ INFORMATIONS CACHE")
    print(banner)

    for label, folder in cache_folders.items():
        print(f"üìÇ {label}: {get_dir_size(folder):.2f} GB")

    # Total of the whole cache root, not only of the three folders above
    print(f"\nüì¶ Total: {get_dir_size(DRIVE_CACHE_DIR):.2f} GB")
    print(f"üìç Emplacement: {DRIVE_CACHE_DIR}")
    print(banner)

# Display the cache information right away
show_cache_info()

In [None]:
# === DELETE THE CACHE (if needed) ===
# WARNING: this removes every cached model!
# Uncomment the lines below to run the deletion.

# import shutil
# 
# if DRIVE_CACHE_DIR. exists():
#     print(f"‚ö†Ô∏è Suppression du cache:  {DRIVE_CACHE_DIR}")
#     shutil.rmtree(DRIVE_CACHE_DIR)
#     print("‚úÖ Cache supprim√©.  Les mod√®les seront ret√©l√©charg√©s au prochain lancement.")
# else:
#     print("‚ÑπÔ∏è Aucun cache √† supprimer.")

# While the code above stays commented out, this cell only prints a reminder.
print("‚ÑπÔ∏è D√©commentez le code ci-dessus pour supprimer le cache.")