In [1]:
import json
import numpy as np
from sentence_transformers import SentenceTransformer
from openai import OpenAI
from pathlib import Path
import os

def embed_documents(provider='sentence-transformers'):
    """Embed every chunk in ``data/chunked_docs.json`` and save the vectors.

    The input file is read as JSON and each element's ``'content'`` value is
    embedded. The resulting array is written to ``data/embeddings.npy``.
    The ``data`` directory is resolved relative to the current working
    directory (or its parent when the working directory is named ``scripts``).

    Args:
        provider: Which embedding backend to use. Either
            ``'sentence-transformers'`` (local ``all-MiniLM-L6-v2`` model) or
            ``'openai'`` (``text-embedding-ada-002``; the API key is read from
            the ``OPENAI_API_KEY`` environment variable).

    Raises:
        ValueError: If ``provider`` is not one of the supported backends.
    """
    # Validate first: previously an unknown provider fell through both
    # branches and crashed at np.save with an UnboundLocalError, after the
    # input had already been loaded.
    supported_providers = ('sentence-transformers', 'openai')
    if provider not in supported_providers:
        raise ValueError(
            f"Unsupported provider {provider!r}; expected one of {supported_providers}"
        )

    # If we are inside the /scripts directory, step up to the project root.
    cwd = Path.cwd()
    base_dir = cwd.parent if cwd.name == "scripts" else cwd
    data_dir = base_dir / "data"

    input_file = data_dir / "chunked_docs.json"
    output_file = data_dir / "embeddings.npy"

    with open(input_file, 'r', encoding='utf-8') as f:
        chunks = json.load(f)

    texts = [chunk['content'] for chunk in chunks]

    print(f"Tổng số chunk cần embed: {len(texts)}")

    if provider == 'sentence-transformers':
        print("Đang dùng mô hình SentenceTransformer: all-MiniLM-L6-v2")
        model = SentenceTransformer('all-MiniLM-L6-v2')
        embeddings = model.encode(texts, show_progress_bar=True)
    else:  # provider == 'openai' (guaranteed by the validation above)
        print("Đang dùng mô hình OpenAI: text-embedding-ada-002")
        client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
        # One request per chunk; each response carries a single embedding.
        embeddings = np.array([
            client.embeddings.create(
                input=text, model='text-embedding-ada-002'
            ).data[0].embedding
            for text in texts
        ])

    np.save(output_file, embeddings)
    print(f"Đã lưu embeddings tại: {output_file}")


# Run the embedding step with the local SentenceTransformer backend.
embed_documents('sentence-transformers')


  from .autonotebook import tqdm as notebook_tqdm


Tổng số chunk cần embed: 7604
Đang dùng mô hình SentenceTransformer: all-MiniLM-L6-v2


Batches: 100%|██████████| 238/238 [02:53<00:00,  1.37it/s]

Đã lưu embeddings tại: d:\RAG\data\embeddings.npy





### Embeddings pdf chunks chuẩn

In [1]:
import json, os, numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from tqdm import tqdm

# --- Configuration -----------------------------------------------------------
IN = "../data/pdf_chunks_cleaned_for_embed_v3.json"
EMB_OUT = "../data/embeddings.npy"
META_OUT = "../data/embeddings_meta.json"
# NOTE: despite the ".ivf" suffix, the index built below is a flat
# inner-product index, not an IVF index.
INDEX_OUT = "../data/faiss_index.ivf"

BATCH = 64  # number of texts encoded per forward pass
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
os.makedirs(os.path.dirname(EMB_OUT), exist_ok=True)

# 1) Load chunks. Each chunk must provide "text_for_embed"; the metadata
#    fields are optional and default to None when absent.
with open(IN, "r", encoding="utf-8") as f:
    chunks = json.load(f)

if not chunks:
    # Fail early with a clear message instead of an opaque error later on
    # (stacking / indexing an empty embedding set).
    raise ValueError(f"No chunks found in {IN}; nothing to embed")

texts = [c["text_for_embed"] for c in chunks]
meta = [
    {
        "chunk_id": c.get("id"),
        "source_chunk_id": c.get("source_chunk_id"),
        "page": c.get("page"),
    }
    for c in chunks
]

# 2) Embed. The library batches internally, so no hand-rolled slicing loop
#    is needed; BATCH controls the per-pass batch size.
model = SentenceTransformer(MODEL_NAME)
embs = model.encode(
    texts,
    batch_size=BATCH,
    show_progress_bar=True,
    convert_to_numpy=True,
)
# FAISS is fed a contiguous float32 matrix.
embs = np.ascontiguousarray(embs, dtype="float32")
if embs.shape[0] != len(meta):
    raise RuntimeError(
        f"Embedding count {embs.shape[0]} does not match metadata count {len(meta)}"
    )

# The raw (un-normalised) vectors are what get persisted to EMB_OUT; the
# normalisation below happens after this save.
np.save(EMB_OUT, embs)
with open(META_OUT, "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False)

# 3) Build the FAISS index. A flat inner-product index is used; on
#    L2-normalised vectors the inner product behaves like cosine similarity.
d = embs.shape[1]
index = faiss.IndexFlatIP(d)
faiss.normalize_L2(embs)  # modifies `embs` in place (result is not reassigned)
index.add(embs)
faiss.write_index(index, INDEX_OUT)
print("Saved embeddings:", EMB_OUT,
      "index:", INDEX_OUT,
      "meta:", META_OUT)


  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 85/85 [00:21<00:00,  3.89it/s]

Saved embeddings: ../data/embeddings.npy index: ../data/faiss_index.ivf meta: ../data/embeddings_meta.json



