In [3]:
pip install numpy

Collecting numpy
  Using cached numpy-2.4.0-cp313-cp313-win_amd64.whl.metadata (6.6 kB)
Using cached numpy-2.4.0-cp313-cp313-win_amd64.whl (12.3 MB)
Installing collected packages: numpy
Successfully installed numpy-2.4.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
pip install faiss-cpu

Collecting faiss-cpu
  Using cached faiss_cpu-1.13.2-cp313-cp313-win_amd64.whl.metadata (7.6 kB)
Using cached faiss_cpu-1.13.2-cp313-cp313-win_amd64.whl (18.9 MB)
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [22]:
pip install --upgrade ollama

Collecting ollama
  Using cached ollama-0.6.1-py3-none-any.whl.metadata (4.3 kB)
Collecting pydantic>=2.9 (from ollama)
  Using cached pydantic-2.12.5-py3-none-any.whl.metadata (90 kB)
Collecting annotated-types>=0.6.0 (from pydantic>=2.9->ollama)
  Using cached annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Using cached ollama-0.6.1-py3-none-any.whl (14 kB)
Using cached pydantic-2.12.5-py3-none-any.whl (463 kB)
Using cached annotated_types-0.7.0-py3-none-any.whl (13 kB)
Installing collected packages: annotated-types, pydantic, ollama

   ------------- -------------------------- 1/3 [pydantic]
   ------------- -------------------------- 1/3 [pydantic]
   ------------- -------------------------- 1/3 [pydantic]
   ------------- -------------------------- 1/3 [pydantic]
   ------------- -------------------------- 1/3 [pydantic]
   ------------- -------------------------- 1/3 [pydantic]
   -------------------------- ------------- 2/3 [ollama]
   -----------------------------------


[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os, json
import numpy as np
import faiss
import ollama
from ollama import Client

In [3]:
# Embedding model requested from Ollama, and the vector size the FAISS index is created with.
MODEL = "mxbai-embed-large"
EMBED_DIM = 1024  # presumably the output dimension of MODEL — confirm if the model changes

# Input: directory containing the chunk JSON files.
# Relative paths are resolved against the kernel's current working directory.
CHUNKS_DIR = os.path.abspath("../../data/chunks_esilv")

# Output: directory of the v2 vector store, plus the two files written into it.
BASE_V2_DIR = os.path.abspath("../embeddings/vector_store_v2")
INDEX_PATH = os.path.join(BASE_V2_DIR, "faiss_index.bin")
MAPPING_PATH = os.path.join(BASE_V2_DIR, "mapping.json")
os.makedirs(BASE_V2_DIR, exist_ok=True)  # no error if the directory already exists

# Ollama client built with its default settings (no arguments passed).
client = Client()

In [4]:
# Preview the input files through the configured CHUNKS_DIR (instead of a second
# copy of the path literal), sorted the same way the build loop iterates them.
sorted(os.listdir(CHUNKS_DIR))

['chunks_admissions.json',
 'chunks_entreprises-debouches.json',
 'chunks_formations.json',
 'chunks_international.json',
 'chunks_lecole.json',
 'chunks_recherche.json']

In [5]:
# -----------------------------
# Ollama client: `client` is created in the configuration cell
# -----------------------------

# -----------------------------
# Helpers: call Ollama + extract vector
# -----------------------------
def get_embedding_vector(resp) -> np.ndarray:
    """
    Extract a single embedding vector from an Ollama embedding response.

    Supported response shapes:
    - an object with an ``.embeddings`` attribute (e.g. the result of
      ``client.embed(...)``), holding either ``[[...]]`` or a flat ``[...]``;
    - a plain dict with an ``"embedding"`` or ``"embeddings"`` key;
    - an object with only an ``.embedding`` attribute (e.g. a typed response
      from the legacy ``client.embeddings(...)`` call), which is neither a dict
      nor exposes ``.embeddings``.

    Args:
        resp: The response returned by the Ollama client.

    Returns:
        A 1-D ``float32`` array; for batched responses, the first vector.

    Raises:
        ValueError: If no embedding can be found in ``resp``.
    """
    # Case 1: object exposing `.embeddings` (list or tuple, possibly nested).
    batch = getattr(resp, "embeddings", None)
    if isinstance(batch, (list, tuple)) and len(batch) > 0:
        # `batch` may be [[...]] (one vector per input) or a flat [...].
        if isinstance(batch[0], (list, tuple)):
            return np.array(batch[0], dtype="float32")
        return np.array(batch, dtype="float32")

    # Case 2: plain dict response.
    if isinstance(resp, dict):
        if "embedding" in resp:
            return np.array(resp["embedding"], dtype="float32")
        if "embeddings" in resp and resp["embeddings"]:
            return np.array(resp["embeddings"][0], dtype="float32")

    # Case 3: non-dict object exposing a single `.embedding` vector.
    single = getattr(resp, "embedding", None)
    if isinstance(single, (list, tuple)) and len(single) > 0:
        return np.array(single, dtype="float32")

    raise ValueError(f"Unknown embedding response format: {type(resp)}")

def embed_once(text: str) -> np.ndarray:
    """
    Embed one piece of text with the configured Ollama model.

    Uses ``client.embed`` when the client exposes it, otherwise falls back to
    ``client.embeddings``. The vector extracted by ``get_embedding_vector`` is
    scaled in place to (approximately) unit L2 norm before being returned.

    Args:
        text: The text to embed.

    Returns:
        The normalised embedding vector.
    """
    # Prefer the newer API when the installed client provides it.
    has_modern_api = hasattr(client, "embed")
    response = (
        client.embed(model=MODEL, input=text)
        if has_modern_api
        else client.embeddings(model=MODEL, prompt=text)
    )

    vector = get_embedding_vector(response)

    # Unit-normalise for cosine-like retrieval with IndexFlatIP;
    # the epsilon avoids dividing by zero on an all-zero vector.
    vector /= np.linalg.norm(vector) + 1e-12
    return vector

In [6]:
# -----------------------------
# Chunking safe for context length
# -----------------------------
MAX_CHARS_PER_PART = 1_200  # kept small to avoid overflowing the model's token limit

def split_text(text: str, max_chars: int) -> list[str]:
    """
    Cut ``text`` into consecutive pieces of at most ``max_chars`` characters.

    Leading and trailing whitespace is stripped first. The cut is purely
    positional, so a piece may end in the middle of a word.

    Args:
        text: Text to split. ``None`` is treated like an empty string.
        max_chars: Maximum length of each piece; must be positive.

    Returns:
        The pieces in order, or an empty list when nothing remains after
        stripping.

    Raises:
        ValueError: If ``max_chars`` is not positive. Without this check a
            negative value would silently return no pieces for non-empty text.
    """
    if max_chars <= 0:
        raise ValueError(f"max_chars must be a positive integer, got {max_chars!r}")

    text = (text or "").strip()
    if not text:
        return []
    return [text[i:i + max_chars] for i in range(0, len(text), max_chars)]

def embed_text_full(text: str) -> np.ndarray:
    """
    Embed the whole text without truncation.

    The text is split into parts of at most ``MAX_CHARS_PER_PART`` characters,
    each part is embedded with ``embed_once``, and the part vectors are
    averaged and re-normalised.

    NOTE(review): a later cell defines ``embed_text_full`` again; whichever
    cell runs last wins. Consider keeping only one definition.

    Args:
        text: The text to embed.

    Returns:
        The averaged embedding as a ``float32`` vector scaled to
        (approximately) unit L2 norm.

    Raises:
        ValueError: If the text is empty after stripping.
    """
    parts = split_text(text, MAX_CHARS_PER_PART)
    if not parts:
        raise ValueError("Empty text after stripping")

    # Delegate to embed_once: it calls the Ollama client with MODEL and extracts
    # the vector from whichever response format comes back. (The previous body
    # called an undefined `ollama_embed` helper.)
    vecs = [embed_once(p) for p in parts]

    v_mean = np.mean(np.stack(vecs, axis=0), axis=0).astype("float32")

    # Normalize for cosine-like retrieval with IndexFlatIP
    v_mean /= (np.linalg.norm(v_mean) + 1e-12)

    # Previously missing: without this the function returned None.
    return v_mean


In [7]:
# -----------------------------
# Split to avoid context overflow
# -----------------------------
MAX_CHARS_PER_PART = 1_200  # maximum number of characters sent to the embedder per part

def split_text(text: str, max_chars: int) -> list[str]:
    """
    Split ``text`` into consecutive slices of at most ``max_chars`` characters.

    Surrounding whitespace is stripped before slicing. Slices are taken by
    position only, so a slice boundary can fall inside a word.

    Args:
        text: Text to split. ``None`` is treated like an empty string.
        max_chars: Maximum length of each slice; must be positive.

    Returns:
        The slices in order, or an empty list when the stripped text is empty.

    Raises:
        ValueError: If ``max_chars`` is not positive. Without this check a
            negative value would silently return no slices for non-empty text.
    """
    if max_chars <= 0:
        raise ValueError(f"max_chars must be a positive integer, got {max_chars!r}")

    text = (text or "").strip()
    if not text:
        return []
    return [text[i:i + max_chars] for i in range(0, len(text), max_chars)]

def embed_text_full(text: str) -> np.ndarray:
    """
    Embed an arbitrarily long text as one vector.

    The text is split into parts of at most ``MAX_CHARS_PER_PART`` characters,
    every part is embedded with ``embed_once``, and the part vectors are
    averaged and rescaled to (approximately) unit L2 norm.

    Args:
        text: The text to embed.

    Returns:
        The averaged, normalised ``float32`` embedding.

    Raises:
        ValueError: If the text yields no parts (empty after stripping).
    """
    pieces = split_text(text, MAX_CHARS_PER_PART)
    if len(pieces) == 0:
        raise ValueError("Empty text")

    # One row per part, then collapse the rows into their mean vector.
    stacked = np.stack([embed_once(piece) for piece in pieces], axis=0)
    centroid = stacked.mean(axis=0).astype("float32")

    # Rescale the mean; the epsilon guards against a zero-norm vector.
    centroid /= np.linalg.norm(centroid) + 1e-12
    return centroid

In [8]:
# -----------------------------
# Build FAISS v2
# -----------------------------
# Inner-product index; embed_text_full returns normalised vectors, so the
# inner product gives cosine-like scores.
index = faiss.IndexFlatIP(EMBED_DIM)
mapping = {}  # str(FAISS row id) -> metadata of the chunk stored at that row
added = 0

# Only the .json files, in sorted order so row ids are assigned deterministically.
json_files = [name for name in sorted(os.listdir(CHUNKS_DIR)) if name.endswith(".json")]

for source_name in json_files:
    source_path = os.path.join(CHUNKS_DIR, source_name)
    print("Traitement:", source_name)

    with open(source_path, "r", encoding="utf-8") as handle:
        entries = json.load(handle)

    for entry in entries:
        body = (entry.get("content") or "").strip()
        if not body:
            # Nothing to embed for this chunk.
            continue

        heading = (entry.get("title") or "").strip()
        section = (entry.get("rubric") or "").strip()
        link = entry.get("url")

        # Title and rubric are embedded together with the content.
        text_to_embed = f"TITLE: {heading}\nRUBRIC: {section}\nCONTENT:\n{body}"

        # Reshaped to a single row (1, dim) before being added to the index.
        row = embed_text_full(text_to_embed).reshape(1, -1)

        # The id of the new vector is the index size before insertion.
        row_id = index.ntotal
        index.add(row)

        n_parts = int(np.ceil(len(text_to_embed) / MAX_CHARS_PER_PART))
        mapping[str(row_id)] = {
            "title": heading,
            "content": body,  # the full content is kept
            "rubric": section,
            "url": link,
            "source_file": source_name,
            "embedding_input_chars": len(text_to_embed),
            "parts_count": n_parts,
            "max_chars_per_part": MAX_CHARS_PER_PART,
        }

        added += 1

Traitement: chunks_admissions.json
Traitement: chunks_entreprises-debouches.json
Traitement: chunks_formations.json
Traitement: chunks_international.json
Traitement: chunks_lecole.json
Traitement: chunks_recherche.json


In [9]:
print("Vectors added:", added)
print("Index total:", index.ntotal)

# Every FAISS row id must have a mapping entry. An interrupted build can leave
# a vector in the index without its metadata, so refuse to persist in that case.
assert index.ntotal == len(mapping), (
    f"Index/mapping out of sync: {index.ntotal} vectors vs {len(mapping)} mapping entries"
)

# Persist the index and its id -> metadata mapping next to each other.
faiss.write_index(index, INDEX_PATH)
with open(MAPPING_PATH, "w", encoding="utf-8") as f:
    # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
    json.dump(mapping, f, ensure_ascii=False, indent=2)

print("Saved index:", INDEX_PATH)
print("Saved mapping:", MAPPING_PATH)

Vectors added: 533
Index total: 533
Saved index: C:\Users\nacca\Documents\ESILV\LLM_and_GENAI\DEPOT\ESILV-Smart-Assistant\code\embeddings\vector_store_v2\faiss_index.bin
Saved mapping: C:\Users\nacca\Documents\ESILV\LLM_and_GENAI\DEPOT\ESILV-Smart-Assistant\code\embeddings\vector_store_v2\mapping.json
