# ✂️ 2) Chunking intelligent (pré-processing pour le RAG)

Un pipeline de découpage avancé, qui :

✔ Évite les coupures de mots

✔ Évite les coupures de phrases

✔ Génère des segments cohérents (H2/H3, blocs logiques)

✔ Ajoute les métadonnées :

- rubric (admissions, formations…)
- titre complet
- URL parent + URL child
- source file

Le chunking crée un dossier :
chunks_esilv/ → contenant tous les éléments prêts pour ingestion RAG.

👉 Objectif atteint : `préparation optimale du texte pour la vectorisation.`

Certaines phrases se répètent, donc on les déduplique :

In [None]:
def dedupe_sentences(text):
    """
    Remove repeated sentences from a block of text.

    The text is whitespace-normalized, split after sentence-ending
    punctuation (., ! or ?), and only the first occurrence of each sentence
    is kept (exact, case-sensitive comparison). The surviving sentences are
    returned joined by single spaces; falsy input yields "".
    """
    if not text:
        return ""

    # Normalize every fragment produced by the sentence split.
    fragments = (
        normalize(piece)
        for piece in re.split(r"(?<=[.!?])\s+", normalize(text))
    )

    # dict keys preserve insertion order, giving an ordered "first seen" set.
    unique = dict.fromkeys(fragment for fragment in fragments if fragment)

    return " ".join(unique)

In [None]:
import os
import json
import re

# Directory listed by process_all() for the scraped *.json files.
INPUT_DIR = "/content/scraping_esilv"
# Directory where process_all() writes one chunks_<name>.json per input file.
OUTPUT_DIR = "/content/chunks_esilv"
# Create the output directory up front; no error if it already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Official parent URLs, keyed by the input file name without its ".json"
# suffix (process_all() looks the key up to get a file's parent URL).
START_URLS = {
    "lecole": "https://www.esilv.fr/lecole/",
    "admissions": "https://www.esilv.fr/admissions/",
    "formations": "https://www.esilv.fr/formations/",
    "entreprises-debouches": "https://www.esilv.fr/entreprises-debouches/",
    "recherche": "https://www.esilv.fr/recherche/",
    "international": "https://www.esilv.fr/international/",
}

# Size threshold, in characters, used by chunk_text() to decide whether and
# where a text is split.
MAX_CHARS = 1000


# ----------------------------------------------------
# Normalisation sécurisée
# ----------------------------------------------------
def normalize(t):
    """Collapse whitespace runs in *t* to single spaces and trim both ends.

    Falsy values (None, "", 0, ...) give ""; any other value is converted
    with str() before being cleaned.
    """
    if t:
        collapsed = re.sub(r"\s+", " ", str(t))
        return collapsed.strip()
    return ""


# ----------------------------------------------------
# Transformer objet dict → texte clair
# ----------------------------------------------------
def dict_to_text(obj):
    """Flatten a content dict into a single line of text.

    The truthy values of the "title", "content", "role", "phone" and "email"
    keys are normalized and kept in that order; a truthy "url" is appended as
    a "(Plus d'informations : ...)" suffix. Pieces that end up empty are
    dropped and the rest are joined with the module's dash separator.
    """
    field_names = ("title", "content", "role", "phone", "email")
    pieces = [normalize(obj[name]) for name in field_names if obj.get(name)]

    link = obj.get("url")
    if link:
        pieces.append(f"(Plus d'informations : {link})")

    # A value such as "   " is truthy but normalizes to "", so filter again.
    return " ‚Äì ".join(filter(None, pieces))


# ----------------------------------------------------
# Chunking intelligent
# ----------------------------------------------------
def chunk_text(title, text, rubric=None, url=None):
    """
    Split *text* into chunk dicts without cutting sentences.

    The text is whitespace-normalized first. If it fits within MAX_CHARS it
    becomes a single chunk; otherwise it is split after sentence-ending
    punctuation (., ! or ?) and whole sentences are packed greedily into
    chunks kept under MAX_CHARS. A single sentence longer than the limit is
    never cut: it is emitted as its own oversized chunk.

    Args:
        title: Title stored on every chunk.
        text: Raw text to split; falsy or whitespace-only input yields [].
        rubric: Rubric label copied onto every chunk.
        url: Source URL copied onto every chunk.

    Returns:
        A list of dicts with the keys "title", "content", "rubric", "url".
    """
    text = normalize(text)
    if not text:
        return []

    def make_chunk(content):
        return {
            "title": title,
            "content": content,
            "rubric": rubric,
            "url": url,
        }

    # Short text: a single chunk, no sentence splitting needed.
    if len(text) <= MAX_CHARS:
        return [make_chunk(text)]

    chunks = []
    pending = []       # sentences accumulated for the current chunk
    pending_len = 0    # their total length, counting one separator each

    for sent in re.split(r"(?<=[.!?])\s+", text):
        sent = normalize(sent)
        if not sent:
            continue

        if pending_len + len(sent) + 1 < MAX_CHARS:
            pending.append(sent)
            pending_len += len(sent) + 1
            continue

        # Flush only when something was accumulated: if the very first
        # sentence is already over the limit there is nothing to emit yet,
        # and flushing would produce a chunk with empty content.
        if pending:
            chunks.append(make_chunk(" ".join(pending)))
        pending = [sent]
        pending_len = len(sent) + 1

    if pending:
        chunks.append(make_chunk(" ".join(pending)))

    return chunks


# ----------------------------------------------------
# Extraction chunks d'un fichier JSON scraping
# ----------------------------------------------------
def extract_chunks(data, parent_url):
    """
    Build the list of chunks for one scraped JSON document.

    Keys read from *data*:
      - "rubric" (defaults to "UNKNOWN"), copied onto every chunk;
      - "intro", chunked as-is with *parent_url*;
      - "sections": each entry's "h2"/"h3" form the title and its "content"
        items (strings, or dicts rendered through dict_to_text) form the
        text, which is sentence-deduplicated before chunking;
      - "child_pages": each entry's "title", "url", "intro", "blocks"
        ("h3"/"h4"/"title", "text", "items", "lists") and "full_text".

    Child-page chunks carry the child's own URL; the others carry
    *parent_url*. Intro and full-text fields go to chunk_text() without
    sentence de-duplication.

    Returns:
        The chunk dicts produced by chunk_text(), in document order.
    """
    rubric = data.get("rubric", "UNKNOWN")
    collected = []

    # --- Page introduction ---
    page_intro = data.get("intro")
    if page_intro:
        collected.extend(
            chunk_text(
                title=f"{rubric} ‚Äì Introduction",
                text=page_intro,
                rubric=rubric,
                url=parent_url,
            )
        )

    # --- Main sections ---
    for section in data.get("sections", []):
        heading = normalize(section.get("h2", ""))
        subheading = normalize(section.get("h3"))
        section_title = f"{heading} ‚Äì {subheading}" if subheading else heading

        fragments = []
        for entry in section.get("content", []):
            if isinstance(entry, str):
                fragments.append(normalize(entry))
            elif isinstance(entry, dict):
                fragments.append(dict_to_text(entry))

        # Every fragment is followed by one space, as in a running "+=".
        section_text = "".join(f"{fragment} " for fragment in fragments)
        collected.extend(
            chunk_text(
                section_title,
                dedupe_sentences(section_text),
                rubric,
                parent_url,
            )
        )

    # --- Child pages ---
    for child in data.get("child_pages", []):
        child_title = normalize(child.get("title", "Page enfant"))
        child_url = child.get("url")

        child_intro = child.get("intro")
        if child_intro:
            collected.extend(
                chunk_text(
                    title=f"{child_title} ‚Äì Introduction",
                    text=child_intro,
                    rubric=rubric,
                    url=child_url,
                )
            )

        for block in child.get("blocks", []):
            # First non-empty heading wins; fall back to the page title.
            block_title = normalize(
                block.get("h3")
                or block.get("h4")
                or block.get("title")
                or child_title
            )

            # Paragraphs first, then titled-list items.
            pieces = [normalize(par) for par in block.get("text", []) if par]
            pieces.extend(
                normalize(entry) for entry in block.get("items", []) if entry
            )
            # UL/LI lists: only entries that are real lists are walked.
            for group in block.get("lists", []):
                if isinstance(group, list):
                    pieces.extend(normalize(li) for li in group if li)

            block_text = "".join(f"{piece} " for piece in pieces)
            collected.extend(
                chunk_text(
                    title=f"{child_title} ‚Äì {block_title}",
                    text=dedupe_sentences(block_text),
                    rubric=rubric,
                    url=child_url,
                )
            )

        # Full text of the child page, when present.
        full_text = child.get("full_text")
        if full_text:
            collected.extend(
                chunk_text(
                    title=f"{child_title} ‚Äì Full text",
                    text=full_text,
                    rubric=rubric,
                    url=child_url,
                )
            )

    return collected


# ----------------------------------------------------
# PROCESS ALL
# ----------------------------------------------------
def process_all():
    """
    Chunk every scraped JSON file found in INPUT_DIR.

    For each "*.json" file: load it, look up its parent URL in START_URLS
    from the file name without ".json", build the chunks with
    extract_chunks() and write them to OUTPUT_DIR as "chunks_<file name>"
    (UTF-8, indented, non-ASCII kept as-is). Progress is printed as it goes.
    """
    json_names = [
        name for name in os.listdir(INPUT_DIR) if name.endswith(".json")
    ]
    print(f"‚û° {len(json_names)} fichiers d√©tect√©s.")

    for fn in json_names:
        print(f"üîç Traitement : {fn}")

        # The file's base name selects the official parent URL (None if absent).
        parent_url = START_URLS.get(fn.replace(".json", ""))

        source_path = os.path.join(INPUT_DIR, fn)
        with open(source_path, "r", encoding="utf-8") as src:
            data = json.load(src)

        chunks = extract_chunks(data, parent_url)

        out_path = os.path.join(OUTPUT_DIR, f"chunks_{fn}")
        with open(out_path, "w", encoding="utf-8") as dst:
            json.dump(chunks, dst, indent=2, ensure_ascii=False)

        print(f"‚úî G√©n√©r√© : {out_path}")

    print("\nüéâ Tous les chunks sont g√©n√©r√©s dans /content/chunks_esilv/")


# RUN
# Execute the chunking pipeline on every *.json file found in INPUT_DIR.
process_all()

‚û° 6 fichiers d√©tect√©s.
üîç Traitement : formations.json
‚úî G√©n√©r√© : /content/chunks_esilv/chunks_formations.json
üîç Traitement : entreprises-debouches.json
‚úî G√©n√©r√© : /content/chunks_esilv/chunks_entreprises-debouches.json
üîç Traitement : recherche.json
‚úî G√©n√©r√© : /content/chunks_esilv/chunks_recherche.json
üîç Traitement : lecole.json
‚úî G√©n√©r√© : /content/chunks_esilv/chunks_lecole.json
üîç Traitement : international.json
‚úî G√©n√©r√© : /content/chunks_esilv/chunks_international.json
üîç Traitement : admissions.json
‚úî G√©n√©r√© : /content/chunks_esilv/chunks_admissions.json

üéâ Tous les chunks sont g√©n√©r√©s dans /content/chunks_esilv/


plus aucune phrase répétée

chunks ultra propres

embeddings beaucoup plus efficaces

meilleur RAG

meilleure cohérence des réponses

In [None]:
import os
import json
import re

# Directory listed by process_all() for the chunk files to clean.
CHUNKS_DIR = "/content/chunks_esilv"
# Directory where the cleaned files are written, under the same file names.
CLEAN_DIR = "/content/chunks_esilv_clean"
# Create the output directory up front; no error if it already exists.
os.makedirs(CLEAN_DIR, exist_ok=True)


# -------------------------------
# Nettoyage d’un chunk individuel
# -------------------------------
def clean_chunk_content(text):
    """
    Normalize a chunk's text and drop sentences repeated inside it.

    Whitespace runs are collapsed to single spaces, the text is split after
    sentence-ending punctuation (., ! or ?), and only the first occurrence of
    each sentence is kept (case-insensitive comparison; the original casing
    of the kept sentence is preserved).

    Args:
        text: Chunk content; falsy input yields "".

    Returns:
        The cleaned text, with sentences separated by single spaces.
    """
    if not text:
        return ""

    # Collapse whitespace.
    text = re.sub(r"\s+", " ", text).strip()

    # The split consumes only the whitespace that follows ., ! or ?, so each
    # sentence keeps its own terminal punctuation.
    sentences = re.split(r"(?<=[.!?])\s+", text)

    # Drop duplicates within the chunk, keeping the first occurrence.
    seen = set()
    unique_sentences = []
    for sentence in sentences:
        key = sentence.lower().strip()
        if key and key not in seen:
            seen.add(key)
            unique_sentences.append(sentence)

    # Join with a plain space: re-inserting ". " here would double the
    # punctuation ("A.. B.", "Quoi?. Oui!").
    cleaned = " ".join(unique_sentences).strip()

    # Collapse a stray ". ." that was already present in the input.
    return cleaned.replace(". .", ".")


# -------------------------------
# Nettoyer un fichier de chunks
# -------------------------------
def clean_chunks_file(path):
    """
    Load a JSON chunk file and return its cleaned chunks.

    Each chunk's "content" goes through clean_chunk_content(). Chunks whose
    cleaned text is shorter than 20 characters are dropped, as are chunks
    whose text (compared case-insensitively) already appeared earlier in the
    same file. Kept chunks are rebuilt with the keys "title", "content",
    "rubric" and "url".
    """
    with open(path, "r", encoding="utf-8") as handle:
        raw_chunks = json.load(handle)

    kept = []
    seen_texts = set()  # fingerprints of the contents already kept

    for entry in raw_chunks:
        content = clean_chunk_content(entry.get("content", ""))
        fingerprint = content.lower().strip()

        # Skip empty / very short chunks and file-wide duplicates.
        if len(content) < 20 or fingerprint in seen_texts:
            continue

        seen_texts.add(fingerprint)
        kept.append({
            "title": entry.get("title", ""),
            "content": content,
            "rubric": entry.get("rubric", ""),
            "url": entry.get("url"),
        })

    return kept


# -------------------------------
# PROCESS ALL
# -------------------------------
def process_all():
    """
    Clean every chunk file found in CHUNKS_DIR.

    Each "*.json" file is passed to clean_chunks_file() and the result is
    written to CLEAN_DIR under the same file name (UTF-8, indented,
    non-ASCII kept as-is). Progress is printed as it goes.
    """
    chunk_files = [
        name for name in os.listdir(CHUNKS_DIR) if name.endswith(".json")
    ]
    print(f"üßπ {len(chunk_files)} fichiers d√©tect√©s √† nettoyer")

    for fn in chunk_files:
        print(f"üîç Nettoyage : {fn}")

        cleaned_chunks = clean_chunks_file(os.path.join(CHUNKS_DIR, fn))

        output_path = os.path.join(CLEAN_DIR, fn)
        with open(output_path, "w", encoding="utf-8") as dst:
            json.dump(cleaned_chunks, dst, indent=2, ensure_ascii=False)

        print(f"‚úî Fichier nettoy√© ‚Üí {output_path}")

    print("\nüéâ Nettoyage COMPLET termin√© !")


# Run the cleaning pass over every *.json chunk file found in CHUNKS_DIR.
process_all()

üßπ 6 fichiers d√©tect√©s √† nettoyer
üîç Nettoyage : chunks_lecole.json
‚úî Fichier nettoy√© ‚Üí /content/chunks_esilv_clean/chunks_lecole.json
üîç Nettoyage : chunks_recherche.json
‚úî Fichier nettoy√© ‚Üí /content/chunks_esilv_clean/chunks_recherche.json
üîç Nettoyage : chunks_formations.json
‚úî Fichier nettoy√© ‚Üí /content/chunks_esilv_clean/chunks_formations.json
üîç Nettoyage : chunks_international.json
‚úî Fichier nettoy√© ‚Üí /content/chunks_esilv_clean/chunks_international.json
üîç Nettoyage : chunks_entreprises-debouches.json
‚úî Fichier nettoy√© ‚Üí /content/chunks_esilv_clean/chunks_entreprises-debouches.json
üîç Nettoyage : chunks_admissions.json
‚úî Fichier nettoy√© ‚Üí /content/chunks_esilv_clean/chunks_admissions.json

üéâ Nettoyage COMPLET termin√© !


- Nettoie proprement
- Transforme les objets en bon texte
- Découpe sans couper les mots
- Respecte H2 / H3 / blocs / child pages
- Est compatible avec FAISS, ChromaDB, Ollama embeddings
- Gère 1 fichier par fichier → rapide pour ta machine