# MFEGSN — Pipeline Colab (Marker + LangExtract)

Notebook optimisé pour une exécution **pas à pas** sur Google Colab.
Il sépare clairement les **blocs texte** (explications) et les **blocs code** (exécution).

**Ordre recommandé :** Étapes 1 → 10, avec tests optionnels si besoin.

## Étape 1 — Installer les dépendances
- Système : zstd (requis pour Ollama).
- Python : marker-pdf, langextract, google-generativeai, pillow.

In [None]:
# System dependencies (zstd is listed in the step description as required for Ollama)
!apt-get update -qq
!apt-get install -y zstd -qq

# Python dependencies: upgrade pip first, then install the packages used by the notebook
!python -m pip install -q --upgrade pip
!python -m pip install -q marker-pdf[full] langextract google-generativeai pillow

print("‚úÖ D√©pendances install√©es.")

## Étape 2 — Monter Google Drive
Exécutez cette cellule si vos PDF sont sur Drive.

In [None]:
# Mount Google Drive at /content/drive; the folder paths configured in the
# next step are located under this mount point.
from google.colab import drive
drive.mount("/content/drive")

## Étape 3 — Configurer les dossiers
Modifiez les chemins ci-dessous selon votre Drive.

In [None]:
from pathlib import Path

# === EDIT THESE PATHS ===
INPUT_DIR = Path("/content/drive/MyDrive/G√©opolitique et Souverainet√© Num√©riques/ALL/ALLPDF")
OUTPUT_DIR = Path("/content/drive/MyDrive/G√©opolitique et Souverainet√© Num√©riques/ALL/ALLMD")

# Validate the input folder with an explicit exception: an `assert` is removed
# when Python runs with -O, and `is_dir()` also rejects a path that exists but
# is a regular file (which would otherwise only fail later in `iterdir()`).
if not INPUT_DIR.is_dir():
    raise FileNotFoundError(f"‚ùå Dossier introuvable : {INPUT_DIR}")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Sub-folders of the output directory, one per kind of artefact.
FIGURES_DIR = OUTPUT_DIR / "_FIGURES"
REFERENCES_DIR = OUTPUT_DIR / "_REFERENCES"
ANALYSES_DIR = OUTPUT_DIR / "_ANALYSES"
LOGS_DIR = OUTPUT_DIR / "_LOGS"

for subdir in (FIGURES_DIR, REFERENCES_DIR, ANALYSES_DIR, LOGS_DIR):
    subdir.mkdir(parents=True, exist_ok=True)

# Direct children of INPUT_DIR whose extension is .pdf (case-insensitive), sorted by path.
pdf_files = sorted(p for p in INPUT_DIR.iterdir() if p.suffix.lower() == ".pdf")

# Configuration summary.
print("=" * 60)
print("üìÅ CONFIGURATION")
print("=" * 60)
print(f"üìÇ Entr√©e  : {INPUT_DIR}")
print(f"üìÇ Sortie  : {OUTPUT_DIR}")
print(f"üìÑ PDFs    : {len(pdf_files)}")
print(f"üñºÔ∏è  Figures : {FIGURES_DIR}")
print(f"üìö R√©f√©rences : {REFERENCES_DIR}")
print("=" * 60)

## Étape 4 — (Optionnel) Ollama + Gemma 3 4B
Activez uniquement si vous utilisez LangExtract avec un modèle local.

In [None]:
# Optional local-model setup: installs Ollama, starts its server in the
# background and pulls the gemma3:4b model. Skipped unless USE_OLLAMA is True.
# NOTE(review): nothing else in the visible notebook reads USE_OLLAMA, so the
# LangExtract step does not appear to be pointed at this server — confirm.
USE_OLLAMA = False  # Set to True to use Gemma through Ollama

if USE_OLLAMA:
    import subprocess
    import time

    # Install Ollama (if needed)
    !curl -fsSL https://ollama.com/install.sh | sh

    # Start the Ollama server in the background; its stdout/stderr are discarded
    ollama_process = subprocess.Popen(
        ["ollama", "serve"],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL
    )
    print("‚è≥ D√©marrage d'Ollama...")
    # Fixed 10-second pause before the pull below; there is no readiness check,
    # so a slower start-up is not detected here.
    time.sleep(10)

    # Download Gemma 3 4B
    print("üì• T√©l√©chargement de Gemma 3 4B (‚âà3GB)...")
    !ollama pull gemma3:4b
    !ollama list
    print("‚úÖ Gemma 3 4B pr√™t !")

## Étape 5 — Configurer Marker
Réglages optimisés (2 workers + extraction figures + références).

In [None]:
import json
import re
import shutil
import base64
from datetime import datetime

import torch
from PIL import Image
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.config.parser import ConfigParser

# Choose the compute device: CUDA when torch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"üñ•Ô∏è  Device : {device}")
if device == "cuda":
    # Report the name and total memory (bytes converted to GB) of GPU 0.
    print(f"   GPU :  {torch.cuda.get_device_name(0)}")
    print(f"   VRAM : {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# Options handed to Marker's ConfigParser below.
# NOTE(review): which of these keys the installed Marker version actually
# honours cannot be told from this notebook — verify against its documentation.
marker_config = dict(
    workers=2,
    extract_images=True,
    images_as_base64=False,
    use_llm=False,
    force_ocr=False,
    languages=["fr", "en"],
    paginate_output=True,
    batch_size=4 if device == "cuda" else 2,  # larger batches only on GPU
)

print("üì• Chargement des mod√®les Marker (premi√®re fois = t√©l√©chargement)...")
print(f"‚öôÔ∏è  Configuration : {marker_config['workers']} workers, batch_size={marker_config['batch_size']}")

# Load the models, parse the options, then build the converter that the
# conversion function of this notebook calls for every PDF.
model_dict = create_model_dict()
config_parser = ConfigParser(marker_config)
converter = PdfConverter(
    artifact_dict=model_dict,
    config=config_parser.generate_config_dict(),
)

print("‚úÖ Marker configur√© avec 2 workers !")

## Étape 6 — Fonctions utilitaires
Extraction des références, figures et conversion PDF → Markdown.

In [None]:
def extract_references_from_markdown(markdown_text):
    """Extract the references/bibliography section from a Markdown document.

    The section is located by a heading (``#`` to ``###``) or a bold line
    (``**...**``) whose title is one of: references, références,
    bibliography, bibliographie (headings additionally accept "works cited"
    and "source(s)"). Matching is case-insensitive and only the first
    matching section is used.

    Inside the section, a new entry starts on a line beginning with ``[n]``,
    ``n.``, ``-``, ``•`` or ``(`` followed by a capital letter; other
    non-blank lines are appended to the current entry, and a blank line
    closes it.

    Args:
        markdown_text: Markdown content to scan. ``None`` or an empty string
            yields the empty result.

    Returns:
        A dict with three keys: ``references_text`` (the stripped section
        body), ``references_list`` (entries longer than 20 characters) and
        ``reference_count`` (length of ``references_list``).
    """
    references = {
        "references_text": "",
        "references_list": [],
        "reference_count": 0,
    }

    # Nothing to search in None or an empty string.
    if not markdown_text:
        return references

    # `r[eé]f[eé]rences` covers the English and French spellings (with or
    # without accents); the previous mis-encoded alternative could never
    # match a real "Références" heading.
    ref_patterns = [
        r"(?i)(?:^|\n)#{1,3}\s*(r[eé]f[eé]rences|bibliography|bibliographie|works\s*cited|sources?)\s*[\s:]*\n([\s\S]*?)(?=\n#{1,3}\s|\Z)",
        r"(?i)(?:^|\n)\*\*(r[eé]f[eé]rences|bibliography|bibliographie)\*\*\s*[\s:]*\n([\s\S]*?)(?=\n\*\*|\n#{1,3}|\Z)",
    ]
    # Marker that opens a new bibliography entry (compiled once, used per line).
    entry_start = re.compile(r"^(\[\d+\]|\d+\.|[-•]|\([A-Z])")

    for pattern in ref_patterns:
        match = re.search(pattern, markdown_text, re.MULTILINE)
        if not match:
            continue

        ref_section = match.group(2).strip()
        references["references_text"] = ref_section

        entries = []
        current_ref = ""
        for raw_line in ref_section.split("\n"):
            line = raw_line.strip()
            if not line:
                # A blank line terminates the entry being accumulated.
                if current_ref:
                    entries.append(current_ref.strip())
                    current_ref = ""
                continue

            if entry_start.match(line):
                if current_ref:
                    entries.append(current_ref.strip())
                current_ref = line
            else:
                # Continuation line: glue it to the current entry.
                current_ref += " " + line

        if current_ref:
            entries.append(current_ref.strip())

        # Drop very short fragments (20 characters or fewer).
        references["references_list"] = [r for r in entries if len(r) > 20]
        references["reference_count"] = len(references["references_list"])
        break  # only the first matching section is used

    return references


def extract_figures_info(markdown_text, images_dict):
    """Collect figure mentions from the text plus one entry per extracted image.

    Args:
        markdown_text: Markdown content to scan for "figure" / "fig."
            mentions (case-insensitive). ``None`` is treated as empty text.
        images_dict: Mapping whose keys name the extracted images; only the
            keys are used. May be ``None`` or empty.

    Returns:
        A list of dicts. Text mentions have ``label`` (the full matched text)
        and ``title`` (up to 120 characters following the optional number and
        colon). Image entries have ``label``, an empty ``title`` and ``path``,
        the latter being ``str(key)``.
    """
    figures = []
    # `\b` keeps words that merely contain "figure" (e.g. "configure") from
    # being counted as figure mentions.
    fig_pattern = r"(?i)\b(figure|fig\.)\s*(\d+)?\s*[:]?\s*(.{0,120})"

    for match in re.finditer(fig_pattern, markdown_text or ""):
        figures.append({
            "label": match.group(0).strip(),
            "title": (match.group(3) or "").strip(),
        })

    # Text mentions carry no "path", so duplicates can only come from image
    # keys that stringify identically; track the paths already emitted.
    seen_paths = set()
    for img_name in images_dict or {}:
        path = str(img_name)
        if path in seen_paths:
            continue
        seen_paths.add(path)
        figures.append({
            "label": path,
            "title": "",
            "path": path,
        })

    return figures


def _is_existing_path(candidate):
    """Return True when *candidate* names an existing filesystem entry.

    A long base64 payload is not a usable path: probing it can raise
    ``OSError`` (e.g. name too long) or ``ValueError`` (embedded NUL), which
    is reported here as "not a path".
    """
    try:
        return Path(candidate).exists()
    except (OSError, ValueError):
        return False


def save_figures(images_dict, doc_name, figures_base_folder):
    """Save extracted figures under ``figures_base_folder / doc_name``.

    Each image is written to ``<sanitized key>.png`` where every run of
    characters outside ``[a-zA-Z0-9_-]`` in the key is replaced by ``_``.
    Supported values are raw ``bytes``/``bytearray``, strings (a
    ``data:image`` URI, a path to an existing file, or a bare base64
    payload) and PIL images. Values of any other type are skipped.

    Saving is best-effort: a failure on one image is reported with ``print``
    and the remaining images are still processed.

    Args:
        images_dict: Mapping of image name to image data; may be empty/None.
        doc_name: Name of the per-document sub-folder.
        figures_base_folder: ``Path`` of the folder that holds all figures.

    Returns:
        List of the saved file paths, as strings.
    """
    if not images_dict:
        return []

    doc_figures_folder = figures_base_folder / doc_name
    doc_figures_folder.mkdir(parents=True, exist_ok=True)
    saved_paths = []

    for img_name, img_data in images_dict.items():
        safe_name = re.sub(r"[^a-zA-Z0-9_-]+", "_", str(img_name))
        img_path = doc_figures_folder / f"{safe_name}.png"

        try:
            if isinstance(img_data, (bytes, bytearray)):
                img_path.write_bytes(img_data)
            elif isinstance(img_data, str):
                if img_data.startswith("data:image"):
                    # Decode before opening the file so that a bad payload
                    # does not leave an empty file behind.
                    payload = base64.b64decode(img_data.split(",", 1)[1])
                    img_path.write_bytes(payload)
                elif _is_existing_path(img_data):
                    shutil.copy(img_data, img_path)
                else:
                    img_path.write_bytes(base64.b64decode(img_data))
            elif isinstance(img_data, Image.Image):
                img_data.save(img_path)
            else:
                continue

            saved_paths.append(str(img_path))
        except Exception as exc:
            # Best-effort: report the failure instead of hiding it, then go on.
            print(f"Avertissement : figure {img_name!r} non sauvegardée ({exc})")
            continue

    return saved_paths


def convert_pdf_complete(pdf_path, doc_name):
    """Convert one PDF to Markdown and collect its figures and references.

    Runs the module-level ``converter`` on the PDF, writes the Markdown to
    ``OUTPUT_DIR / "<doc_name>.md"``, saves the extracted images, and writes
    the parsed references to ``REFERENCES_DIR`` when at least one was found.

    Args:
        pdf_path: Path of the PDF to convert (passed to the converter as str).
        doc_name: Base name used for every output file of this document.

    Returns:
        A dict with keys ``doc_name``, ``markdown_path``, ``figures``,
        ``figures_paths``, ``references`` and ``error``. Any exception is
        caught and its message stored in ``error``; fields filled before the
        failure keep their values.
    """
    outcome = dict(
        doc_name=doc_name,
        markdown_path="",
        figures=[],
        figures_paths=[],
        references={},
        error=None,
    )

    try:
        rendered = converter(str(pdf_path))
        # Missing or None attributes fall back to empty values.
        markdown_text = getattr(rendered, "markdown", "") or ""
        images = getattr(rendered, "images", {}) or {}

        markdown_file = OUTPUT_DIR / f"{doc_name}.md"
        markdown_file.write_text(markdown_text, encoding="utf-8")
        outcome["markdown_path"] = str(markdown_file)

        outcome["figures_paths"] = save_figures(images, doc_name, FIGURES_DIR)
        outcome["figures"] = extract_figures_info(markdown_text, images)

        parsed_refs = extract_references_from_markdown(markdown_text)
        outcome["references"] = parsed_refs
        # Only write a references file when something was actually extracted.
        if parsed_refs.get("reference_count", 0) > 0:
            refs_file = REFERENCES_DIR / f"{doc_name}_references.json"
            with open(refs_file, "w", encoding="utf-8") as handle:
                json.dump(parsed_refs, handle, ensure_ascii=False, indent=2)
    except Exception as exc:
        outcome["error"] = str(exc)

    return outcome

## Étape 7 — LangExtract (optionnel)
Activez uniquement si vous souhaitez l'extraction structurée.

In [None]:
# Feature switch read by extract_with_langextract() and by the batch pipeline:
# while it is False, no LangExtract call is made.
USE_LANGEXTRACT = False  # Set to True to enable LangExtract

# French instruction text passed as the prompt to LangExtract. It asks for a
# JSON answer organised in six sections: context, actors, key concepts, data,
# references, figures and tables.
PROMPT_TEMPLATE = """
Vous √™tes un assistant d'analyse pour des documents en sciences sociales.
Retournez un JSON structur√© avec les sections suivantes :

1. CONTEXTE
- Th√®me principal
- Zone g√©ographique
- P√©riode

2. ACTEURS
- Institutions
- Pays
- Organisations

3. CONCEPTS CL√âS
- Mots-cl√©s
- Concepts

4. DONN√âES
- Chiffres cl√©s (si disponibles)

5. R√âF√âRENCES
- Principales r√©f√©rences cit√©es

6. FIGURES ET TABLEAUX
- Liste des figures mentionn√©es

R√©pondez uniquement avec un JSON valide.
"""

def _safe_json(obj):
    try:
        return json.loads(json.dumps(obj))
    except Exception:
        return {"raw": str(obj)}


def extract_with_langextract(markdown_text, doc_name, references_data=None, figures_data=None):
    """Run the optional LangExtract structured extraction on a document.

    The Markdown is extended with a summary of the parsed references (at most
    the first 20) and of the detected figures (at most the first 10) before
    being sent to LangExtract together with ``PROMPT_TEMPLATE``.

    Args:
        markdown_text: Markdown content of the document.
        doc_name: Document name (not used by the current implementation).
        references_data: Optional dict with ``reference_count`` and
            ``references_list``.
        figures_data: Optional list of dicts with ``label`` / ``title``.

    Returns:
        ``{"status": "skipped", ...}`` when ``USE_LANGEXTRACT`` is false,
        ``{"status": "error", "error": ...}`` when the call fails or no known
        entry point is found, otherwise the extraction passed through
        ``_safe_json``.
    """
    if not USE_LANGEXTRACT:
        return {"status": "skipped", "reason": "USE_LANGEXTRACT=False"}

    # Collect the extra sections first, then append them in one go.
    extra_parts = []

    if references_data and references_data.get("reference_count", 0) > 0:
        extra_parts.append("\n\n## R√âF√âRENCES\n")
        extra_parts.append(f"Nombre de r√©f√©rences : {references_data['reference_count']}\n")
        first_refs = references_data.get("references_list", [])[:20]
        extra_parts.extend(f"[{i}] {ref}\n" for i, ref in enumerate(first_refs, 1))

    if figures_data:
        extra_parts.append("\n\n## FIGURES IDENTIFI√âES\n")
        extra_parts.append(f"Nombre de figures : {len(figures_data)}\n")
        extra_parts.extend(
            f"- {fig.get('label', '')} {fig.get('title', '')}\n" for fig in figures_data[:10]
        )

    if extra_parts:
        enriched_text = markdown_text + "".join(extra_parts)
    else:
        enriched_text = markdown_text

    try:
        import langextract as lx

        # NOTE(review): LangExtract's documented `extract` takes
        # `prompt_description` and `examples`; confirm that the installed
        # version accepts `prompt=`. If it does not, the resulting exception
        # is reported as a status="error" result by the handler below.
        if hasattr(lx, "extract"):
            extraction = lx.extract(enriched_text, prompt=PROMPT_TEMPLATE)
        elif hasattr(lx, "LangExtract"):
            extraction = lx.LangExtract().extract(enriched_text, prompt=PROMPT_TEMPLATE)
        else:
            return {"status": "error", "error": "API LangExtract introuvable"}
    except Exception as exc:
        return {"status": "error", "error": str(exc)}

    return _safe_json(extraction)

## Étape 8 — Test sur un PDF (optionnel)
Permet de valider la configuration avant le batch.

In [None]:
# Smoke test: convert the first PDF of the input folder and show the result.
if pdf_files:
    sample_path = pdf_files[0]
    sample_name = sample_path.stem
    sample_result = convert_pdf_complete(sample_path, sample_name)
    # A bare expression inside an `if` block is not echoed by the notebook
    # (only the last top-level expression of a cell is), so print explicitly.
    # `default=str` keeps the display from failing on a non-JSON value.
    print(json.dumps(sample_result, ensure_ascii=False, indent=2, default=str))
else:
    print("Aucun PDF trouv√© dans le dossier d'entr√©e.")

## Étape 9 — Pipeline complet avec reprise
Traitement batch + logs + reprise automatique.

In [None]:
def process_all_documents():
    """Convert every pending PDF of ``INPUT_DIR`` and write one analysis per file.

    Progress is stored in ``LOGS_DIR / "progress.json"`` (names of the PDFs
    already processed) so that a later call skips them. Every message is
    printed and appended to a timestamped log file in ``LOGS_DIR``. A PDF
    whose conversion fails is recorded in the error list and is not marked as
    processed, so it is retried on the next call.

    Returns:
        A ``(results, errors)`` tuple: ``results`` is the list of analysis
        dicts produced during this call, ``errors`` a list of
        ``{"file": ..., "error": ...}`` dicts.
    """
    log_file = LOGS_DIR / f"processing_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
    progress_file = LOGS_DIR / "progress.json"

    def log_message(message):
        """Print *message* and append it to the log file of this run."""
        print(message)
        with open(log_file, "a", encoding="utf-8") as f:
            f.write(message + "\n")

    processed_files = set()
    if progress_file.exists():
        try:
            with open(progress_file, "r", encoding="utf-8") as f:
                progress_data = json.load(f)
                processed_files = set(progress_data.get("processed", []))
            log_message(f"üìÇ Reprise : {len(processed_files)} fichiers d√©j√† trait√©s")
        except Exception as exc:
            # Best-effort resume: an unreadable progress file restarts from
            # scratch, but the reason is now logged instead of being hidden.
            processed_files = set()
            log_message(f"Fichier de progression illisible, reprise ignorée : {exc}")

    all_pdfs = sorted([p for p in INPUT_DIR.iterdir() if p.suffix.lower() == ".pdf"])
    remaining_pdfs = [p for p in all_pdfs if p.name not in processed_files]

    log_message(f"üìÑ Total : {len(all_pdfs)} | Restants : {len(remaining_pdfs)}")

    # NOTE(review): `results` only holds the documents processed by this call;
    # on a resumed run, anything exported from it will not include documents
    # handled by earlier runs — confirm that this is intended.
    results = []
    errors = []

    def save_progress():
        """Persist the names of the processed PDFs (sorted, for stable output)."""
        with open(progress_file, "w", encoding="utf-8") as f:
            json.dump({"processed": sorted(processed_files)}, f, ensure_ascii=False, indent=2)

    for pdf_path in remaining_pdfs:
        doc_name = pdf_path.stem
        log_message(f"\nüöÄ Traitement : {pdf_path.name}")

        conversion_result = convert_pdf_complete(pdf_path, doc_name)
        if conversion_result.get("error"):
            errors.append({"file": pdf_path.name, "error": conversion_result["error"]})
            log_message(f"‚ùå Erreur conversion : {conversion_result['error']}")
            continue

        log_message(
            f"  üñºÔ∏è  Figures : {len(conversion_result['figures'])} trouv√©es, "
            f"{len(conversion_result['figures_paths'])} sauvegard√©es"
        )
        log_message(
            f"  üìö R√©f√©rences : {conversion_result['references'].get('reference_count', 0)} extraites"
        )

        extraction = None
        if USE_LANGEXTRACT:
            log_message("  üîç Extraction LangExtract...")
            # Read the Markdown back through a context manager so the file
            # handle is closed (the previous bare open().read() leaked it).
            with open(conversion_result['markdown_path'], 'r', encoding='utf-8') as md_file:
                markdown_text = md_file.read()
            extraction = extract_with_langextract(
                markdown_text,
                doc_name,
                conversion_result['references'],
                conversion_result['figures'],
            )

        analysis = {
            "doc_name": doc_name,
            "source_pdf": str(pdf_path),
            "processed_at": datetime.now().isoformat(),
            "conversion": {
                "markdown_path": conversion_result['markdown_path'],
                "figures_count": len(conversion_result['figures']),
                "figures_saved": len(conversion_result['figures_paths']),
                "figures_paths": conversion_result['figures_paths'],
                "references_count": conversion_result['references'].get('reference_count', 0),
            },
            "references": conversion_result['references'],
            "figures": conversion_result['figures'],
            "langextract": extraction,
        }

        analysis_path = ANALYSES_DIR / f"{doc_name}_analysis.json"
        with open(analysis_path, "w", encoding="utf-8") as f:
            json.dump(analysis, f, ensure_ascii=False, indent=2)

        results.append(analysis)
        # Mark as processed only after the analysis file has been written.
        processed_files.add(pdf_path.name)
        save_progress()

    return results, errors


def export_all_results(results):
    """Write the global outputs derived from the per-document analyses.

    Produces, in this order: ``_SUMMARY.json`` (all analyses) in
    ``ANALYSES_DIR``; ``_BIBLIOGRAPHIE_COMPLETE.json`` in ``REFERENCES_DIR``
    when at least one reference exists; ``_INDEX_FIGURES.json`` in
    ``FIGURES_DIR`` when at least one figure was saved; ``_REPORT.md`` in
    ``ANALYSES_DIR``.

    Args:
        results: List of analysis dicts as built by the batch pipeline.
    """
    def _dump_json(payload, target):
        # Shared JSON writer: UTF-8, non-ASCII kept as-is, 2-space indent.
        with open(target, "w", encoding="utf-8") as handle:
            json.dump(payload, handle, ensure_ascii=False, indent=2)

    summary_path = ANALYSES_DIR / "_SUMMARY.json"
    report_md_path = ANALYSES_DIR / "_REPORT.md"

    _dump_json(results, summary_path)

    # Full bibliography: one record per reference, tagged with its document.
    all_references = [
        {"document": data.get("doc_name", ""), "reference": ref}
        for data in results
        for ref in (data.get("references", {}).get("references_list") or [])
    ]
    if all_references:
        biblio_path = REFERENCES_DIR / "_BIBLIOGRAPHIE_COMPLETE.json"
        _dump_json(all_references, biblio_path)
        print(f"üìö Bibliographie : {biblio_path} ({len(all_references)} r√©f√©rences)")

    # Figure index: one record per saved figure file.
    all_figures = [
        {"document": data.get("doc_name", ""), "path": fig_path}
        for data in results
        for fig_path in data.get("conversion", {}).get("figures_paths", [])
    ]
    if all_figures:
        figures_index_path = FIGURES_DIR / "_INDEX_FIGURES.json"
        _dump_json(all_figures, figures_index_path)
        print(f"üñºÔ∏è  Index figures : {figures_index_path} ({len(all_figures)} figures)")

    # Markdown report: global totals followed by one section per document.
    report_parts = [
        "# Rapport de traitement\n",
        f"- Total documents : {len(results)}\n",
        f"- Total figures extraites : {len(all_figures)}\n",
        f"- Total r√©f√©rences : {len(all_references)}\n\n",
    ]
    for data in results:
        conv = data.get("conversion", {})
        report_parts += [
            f"## {data.get('doc_name', '')}\n",
            f"- Markdown : {conv.get('markdown_path', '')}\n",
            f"- Figures : {conv.get('figures_count', 0)}\n",
            f"- R√©f√©rences : {conv.get('references_count', 0)}\n\n",
        ]

    with open(report_md_path, "w", encoding="utf-8") as handle:
        handle.write("".join(report_parts))

    print(f"‚úÖ R√©sultats export√©s : {summary_path} | {report_md_path}")

## Étape 10 — Lancer le traitement
Décommentez si nécessaire, puis exécutez.

In [None]:
# Run the batch over the pending PDFs, then write the global exports
# (summary, bibliography, figure index, Markdown report).
results, errors = process_all_documents()
export_all_results(results)

print("‚úÖ Termin√©.")
# Only mention errors when at least one conversion failed.
if len(errors) > 0:
    print(f"‚ö†Ô∏è  Erreurs : {len(errors)}")