In [1]:
import os, json, re
import numpy as np
import faiss
from ollama import Client

In [2]:
# -------------------------
# Config
# -------------------------
PDF_JSON_PATH = "../../data/scraping_esilv/full_pdfs_improved.json"
OUT_DIR = "vector_store_v3"
INDEX_PATH = os.path.join(OUT_DIR, "faiss_index.bin")
MAP_PATH = os.path.join(OUT_DIR, "mapping.json")

EMBED_MODEL = "mxbai-embed-large"
EMBED_DIM = 1024

# Safe chunking
MAX_CHARS = 1100
MIN_CHARS = 200

os.makedirs(OUT_DIR, exist_ok=True)

client = Client(host="http://localhost:11434")

In [3]:
def normalize_vec(v: np.ndarray) -> np.ndarray:
    v = v.astype("float32")
    v /= (np.linalg.norm(v) + 1e-12)
    return v

def embed_one(text: str) -> np.ndarray:
    # Ollama python client recent versions: client.embed or client.embeddings
    # We'll use embed if available, else embeddings.
    if hasattr(client, "embed"):
        resp = client.embed(model=EMBED_MODEL, input=text)
        # resp.embeddings is usually list[list[float]]
        v = np.array(resp.embeddings[0], dtype="float32")
    else:
        resp = client.embeddings(model=EMBED_MODEL, prompt=text)
        # resp may be dict-like or pydantic
        if isinstance(resp, dict) and "embedding" in resp:
            v = np.array(resp["embedding"], dtype="float32")
        else:
            v = np.array(resp.embedding, dtype="float32")  # fallback (rare)
    return normalize_vec(v)

def smart_split(text: str, max_chars: int = MAX_CHARS):
    text = re.sub(r"\s+", " ", text).strip()
    if len(text) <= max_chars:
        return [text]

    parts = []
    start = 0
    while start < len(text):
        end = min(start + max_chars, len(text))
        # try split on sentence boundary
        cut = text.rfind(". ", start, end)
        if cut == -1 or cut < start + MIN_CHARS:
            cut = text.rfind(" ", start, end)
        if cut == -1 or cut < start + MIN_CHARS:
            cut = end
        part = text[start:cut].strip()
        if part:
            parts.append(part)
        start = cut + 1
    return parts

In [4]:
# -------------------------
# Load pdf json
# -------------------------
with open(PDF_JSON_PATH, "r", encoding="utf-8") as f:
    pdf_root = json.load(f)

# pdf_root expected schema:
# { "rubric": "...", "documents": [ { "pdf_name":..., "id_pdf":..., "pages":[{page, text_norm_wo_footer...}, ...] } ] }
documents = pdf_root.get("documents", [])
rubric = pdf_root.get("rubric", "pdf")

In [5]:
# -------------------------
# Create / load FAISS v3
# -------------------------
if os.path.exists(INDEX_PATH):
    index = faiss.read_index(INDEX_PATH)
else:
    index = faiss.IndexFlatIP(EMBED_DIM)

if os.path.exists(MAP_PATH):
    with open(MAP_PATH, "r", encoding="utf-8") as f:
        mapping = json.load(f)
else:
    mapping = {}

def add_vector(vec: np.ndarray, meta: dict):
    vec = vec.reshape(1, -1).astype("float32")
    idx_id = index.ntotal
    index.add(vec)
    mapping[str(idx_id)] = meta

In [6]:
# -------------------------
# Pipeline
# -------------------------
added = 0

for doc in documents:
    pdf_name = doc.get("pdf_name", "")
    pdf_id = doc.get("id_pdf", doc.get("pdf_id", ""))
    pages = doc.get("pages", [])

    for p in pages:
        page_num = p.get("page")
        text = p.get("text_norm_wo_footer") or p.get("text_norm") or p.get("text_raw") or ""
        if not text.strip():
            continue

        header = f"SOURCE: pdf\nPDF_NAME: {pdf_name}\nPDF_ID: {pdf_id}\nPAGE: {page_num}\nRUBRIC: {rubric}\nTEXT:\n"
        embed_input = header + text

        parts = smart_split(embed_input, MAX_CHARS)

        for part_i, part in enumerate(parts):
            v = embed_one(part)

            meta = {
                "title": f"{pdf_name} - page {page_num} - part {part_i+1}/{len(parts)}",
                "content": part,  # keep what was embedded (important for debugging)
                "rubric": rubric,
                "url": f"pdf://{pdf_id}#page={page_num}",
                "source_file": pdf_id,
                "pdf_name": pdf_name,
                "page": page_num,
                "part_index": part_i,
                "parts_count": len(parts),
                "embedding_input_chars": len(part),
            }
            add_vector(v, meta)
            added += 1

print("Added vectors:", added)
print("Index ntotal:", index.ntotal)

faiss.write_index(index, INDEX_PATH)
with open(MAP_PATH, "w", encoding="utf-8") as f:
    json.dump(mapping, f, ensure_ascii=False, indent=2)

print("Saved:", INDEX_PATH, MAP_PATH)

Added vectors: 410
Index ntotal: 410
Saved: vector_store_v3\faiss_index.bin vector_store_v3\mapping.json


In [7]:
import faiss, json

index = faiss.read_index("vector_store_v3/faiss_index.bin")
with open("vector_store_v3/mapping.json", "r", encoding="utf-8") as f:
    mapping = json.load(f)

print("ntotal:", index.ntotal)

# afficher 3 entrées au hasard
for k in list(mapping.keys())[:3]:
    print(k, mapping[k]["title"], mapping[k]["url"])


ntotal: 410
0 diplome ingénieur esilv - page 1 - part 1/1 pdf://download_10873.pdf#page=1
1 diplome ingénieur esilv - page 2 - part 1/3 pdf://download_10873.pdf#page=2
2 diplome ingénieur esilv - page 2 - part 2/3 pdf://download_10873.pdf#page=2
