In [1]:
# Génération de sommaires pour articles scientifiques (cas AFNOR)

import os
import requests
from bs4 import BeautifulSoup

arxiv_url = "https://arxiv.org/list/cs.AI/recent"

## 📥 Étape 1 : Téléchargement des articles depuis arXiv
output_dir = "articles"
os.makedirs(output_dir, exist_ok=True)
print("Imports chargés ✅")

max_article_count = 10

response = requests.get(arxiv_url)
soup = BeautifulSoup(response.content, "html.parser")

# Extraire les liens de téléchargement PDF
pdf_links = []
for link in soup.find_all("a"):
    href = link.get("href", "")
    if href.startswith("/pdf/") and "format" not in href:
        full_link = "https://arxiv.org" + href
        pdf_links.append(full_link)

# Enlever les doublons et limiter le nombre pour test
pdf_links = list(set(pdf_links))[:max_article_count]  # Télécharge les X premiers articles
print("Liens PDF récupérés :", pdf_links)


for i, pdf_url in enumerate(pdf_links, start=1):
    print(f"Téléchargement {i}: {pdf_url}")
    try:
        response = requests.get(pdf_url)
        pdf_name = pdf_url.split("/")[-1] + ".pdf"
        with open(os.path.join(output_dir, pdf_name), "wb") as f:
            f.write(response.content)
    except Exception as e:
        print(f"Erreur pendant le téléchargement de {pdf_url} : {e}")

print("Téléchargement terminé ✅")


Imports chargés ✅
Liens PDF récupérés : ['https://arxiv.org/pdf/2507.04370', 'https://arxiv.org/pdf/2507.04299', 'https://arxiv.org/pdf/2507.05246', 'https://arxiv.org/pdf/2507.04431', 'https://arxiv.org/pdf/2507.04381', 'https://arxiv.org/pdf/2507.04600', 'https://arxiv.org/pdf/2507.03916', 'https://arxiv.org/pdf/2507.04770', 'https://arxiv.org/pdf/2507.04283', 'https://arxiv.org/pdf/2507.05241']
Téléchargement 1: https://arxiv.org/pdf/2507.04370
Téléchargement 2: https://arxiv.org/pdf/2507.04299
Téléchargement 3: https://arxiv.org/pdf/2507.05246
Téléchargement 4: https://arxiv.org/pdf/2507.04431
Téléchargement 5: https://arxiv.org/pdf/2507.04381
Téléchargement 6: https://arxiv.org/pdf/2507.04600
Téléchargement 7: https://arxiv.org/pdf/2507.03916
Téléchargement 8: https://arxiv.org/pdf/2507.04770
Téléchargement 9: https://arxiv.org/pdf/2507.04283
Téléchargement 10: https://arxiv.org/pdf/2507.05241
Téléchargement terminé ✅


In [3]:
## Approche 1 : Extraction simple de texte + Expressions regex
# !pip install PyMuPDF
import fitz  # PyMuPDF
import os
import re

# === PARAMÈTRES ===
INPUT_DIR = "articles"
OUTPUT_DIR = "output_v1"
MAX_TITLE_WORDS = 12

# === REGEX PATTERNS ===
section_pattern = re.compile(r"^\s*(\d+(?:\.\d+)*)(?:\.|\))?\s+([A-Z][\w\s\-,:;\(\)]*)$")
roman_pattern = re.compile(r"^\s*(?=[IVXLCDM]+\.)\s*([IVXLCDM]+)\.\s+([A-Z][\w\s\-,:;\(\)]*)$", re.IGNORECASE)
appendix_pattern = re.compile(r"^\s*([A-Z])(\.|[\s])\s+([A-Z\s\-]{5,})$")
uppercase_pattern = re.compile(r"^[A-Z\s\-]{5,}$")

# === UTILITAIRES ===

def is_valid_title(line: str) -> bool:
    return bool(line) and len(line.split()) <= MAX_TITLE_WORDS and line[-1] not in ".:;!?"

def clean_title_line(line: str) -> str:
    return line.strip()

def extract_toc_from_text(text: str):
    toc = []
    for line in text.splitlines():
        raw_line = line.strip()
        if not is_valid_title(raw_line):
            continue

        for pattern in [section_pattern, roman_pattern, appendix_pattern]:
            match = pattern.match(raw_line)
            if match:
                parts = match.groups()
                number, title = parts[0], parts[-1]
                toc.append((number, clean_title_line(title)))
                break
        else:
            if uppercase_pattern.match(raw_line):
                toc.append(("", clean_title_line(raw_line)))
    return toc

def save_toc_to_md(toc, output_path):
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("# Table des matières détectée\n\n")
        for number, title in toc:
            indent = "  " * number.count(".") if "." in number else ""
            line = f"{indent}- {number + ' ' if number else ''}{title}"
            f.write(line + "\n")

# === TRAITEMENT EN LOT ===

os.makedirs(OUTPUT_DIR, exist_ok=True)
pdf_files = [f for f in os.listdir(INPUT_DIR) if f.endswith(".pdf")]

for pdf_file in pdf_files:
    pdf_path = os.path.join(INPUT_DIR, pdf_file)
    output_path = os.path.join(OUTPUT_DIR, os.path.splitext(pdf_file)[0] + ".md")
    
    try:
        doc = fitz.open(pdf_path)
        full_text = "\n".join(page.get_text() for page in doc)
        toc = extract_toc_from_text(full_text)
        save_toc_to_md(toc, output_path)
        print(f"✅ TOC extrait : {pdf_file} -> {output_path}")
    except Exception as e:
        print(f"❌ Erreur avec {pdf_file} : {e}")


✅ TOC extrait : 2507.05246.pdf -> output_v1/2507.05246.md
✅ TOC extrait : 2507.04370.pdf -> output_v1/2507.04370.md
✅ TOC extrait : 2507.04600.pdf -> output_v1/2507.04600.md
✅ TOC extrait : 2507.04770.pdf -> output_v1/2507.04770.md
✅ TOC extrait : 2507.05241.pdf -> output_v1/2507.05241.md
✅ TOC extrait : 2507.04299.pdf -> output_v1/2507.04299.md
✅ TOC extrait : 2507.03916.pdf -> output_v1/2507.03916.md
✅ TOC extrait : 2507.04283.pdf -> output_v1/2507.04283.md
✅ TOC extrait : 2507.04381.pdf -> output_v1/2507.04381.md
✅ TOC extrait : 2507.04431.pdf -> output_v1/2507.04431.md


In [19]:
## Approche 2 : Extraction à l'aide de modèles LLM

import fitz  # PyMuPDF
from openai import OpenAI
from dotenv import load_dotenv
import json
import time
import re
import os

# === CONFIGURATION ===
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY1").strip()
client = OpenAI(api_key=api_key)
model = "gpt-4.1"
articles_dir = "articles"
output_dir = "output_v2"
# max_pages = 100
delay_between_calls = 1.2  # seconds

# === UTILS ===

def get_page_text(path, page_num):
    doc = fitz.open(path)
    if page_num >= len(doc):
        return None
    page = doc.load_page(page_num)
    return page.get_text()

def build_prompt(text, page_number):
    return f"""
You are analyzing a scientific research paper.

This is the content of page {page_number}. Your task is to extract any section or subsection titles.

If titles like "1 Introduction" or "2.1 Related Work" are present, return them in this JSON format:

[
  {{"section": "1 Introduction", "page": {page_number}}},
  ...
]

If no headers are found, return an empty list: []

Only return valid JSON. No commentary.

--- PAGE {page_number} ---
{text}
--- END PAGE ---
"""

def call_gpt(prompt):
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.2,
        max_tokens=500
    )
    return response.choices[0].message.content

def clean_json_string(raw_text):
    cleaned = raw_text.strip().strip("```").replace("json", "").strip()
    match = re.search(r"\[.*\]", cleaned, re.DOTALL)
    return match.group(0).strip() if match else cleaned

def save_json(obj, path):
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, indent=2)

# === TRAITEMENT PRINCIPAL ===

def process_pdf(pdf_path, output_subdir):
    toc = []
    filename = os.path.basename(pdf_path)
    article_name = os.path.splitext(filename)[0]

    # print(f"\n📘 Processing {article_name}...")
    with fitz.open(pdf_path) as doc:
        num_pages = len(doc)
    
    for i in range(num_pages):
        page_number = i + 1
        # print(f"  📄 Page {page_number}...")

        text = get_page_text(pdf_path, i)
        if not text or len(text.strip()) < 20:
            # print("   ⚠️ Skipped (empty or short text)")
            continue

        prompt = build_prompt(text, page_number)

        try:
            raw = call_gpt(prompt)
            # print(f"   📝 Raw GPT output:\n{raw}\n")

            cleaned = clean_json_string(raw)
            entries = json.loads(cleaned)
            if isinstance(entries, list):
                toc.extend(entries)
            else:
                print("   ❌ Unexpected JSON structure.")
        except json.JSONDecodeError:
            print("   ❌ JSON parse error — saving raw output.")
            with open(os.path.join(output_subdir, f"debug_page_{page_number}.txt"), "w", encoding="utf-8") as f:
                f.write(raw)
        except Exception as e:
            print(f"   🚨 Error: {e}")

        # Sauvegarde intermédiaire
        intermediate_path = os.path.join(output_subdir, "toc_intermediate.json")
        save_json(toc, intermediate_path)
        # print(f"   💾 Intermediate saved: {len(toc)} entries")

        time.sleep(delay_between_calls)

    return toc

# === SCRIPT GLOBAL ===

def main():
    os.makedirs(output_dir, exist_ok=True)
    pdf_files = [f for f in os.listdir(articles_dir) if f.endswith(".pdf")]

    for pdf_file in pdf_files:
        pdf_path = os.path.join(articles_dir, pdf_file)
        article_name = os.path.splitext(pdf_file)[0]
        output_subdir = os.path.join(output_dir, article_name)
        os.makedirs(output_subdir, exist_ok=True)

        toc = process_pdf(pdf_path, output_subdir)

        final_path = os.path.join(output_subdir, "toc_final.json")
        save_json(toc, final_path)

        print(f"✅ Done: {pdf_file} → {final_path} ({len(toc)} entries)")

if __name__ == "__main__":
    main()


✅ Done: 2507.05246.pdf → output_v2/2507.05246/toc_final.json (88 entries)
✅ Done: 2507.04370.pdf → output_v2/2507.04370/toc_final.json (29 entries)
✅ Done: 2507.04600.pdf → output_v2/2507.04600/toc_final.json (30 entries)
✅ Done: 2507.04770.pdf → output_v2/2507.04770/toc_final.json (25 entries)
✅ Done: 2507.05241.pdf → output_v2/2507.05241/toc_final.json (18 entries)
✅ Done: 2507.04299.pdf → output_v2/2507.04299/toc_final.json (11 entries)
✅ Done: 2507.03916.pdf → output_v2/2507.03916/toc_final.json (26 entries)
✅ Done: 2507.04283.pdf → output_v2/2507.04283/toc_final.json (20 entries)
✅ Done: 2507.04381.pdf → output_v2/2507.04381/toc_final.json (18 entries)
✅ Done: 2507.04431.pdf → output_v2/2507.04431/toc_final.json (9 entries)
