In [1]:
pdf_path = 'data\\DVSTUDY_PAPER.pdf'

In [2]:
from langchain_community.document_loaders import PyPDFLoader
from pathlib import Path

In [3]:

loader = PyPDFLoader(pdf_path)
pages = []
async for page in loader.alazy_load():
    pages.append(page)

In [4]:
async def load_all_pdfs(folder_path: str) -> list:
    pages = []
    pdf_files = Path(folder_path).rglob("*.pdf")
    
    for pdf_file in pdf_files:
        loader = PyPDFLoader(str(pdf_file))
        async for page in loader.alazy_load():
            pages.append(page)
    return pages

In [5]:
pages = await load_all_pdfs('data')

In [6]:
print(type(pages[0]))


<class 'langchain_core.documents.base.Document'>


In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def text_splitter(pages:list[str], c_size: int, c_overlap: int) -> list:
    chunks = []
    if pages:
        try:
            splitter = RecursiveCharacterTextSplitter(chunk_size = c_size,chunk_overlap=c_overlap)
            chunks = splitter.split_documents(pages)
        except Exception as e:
            raise e
    return chunks


In [8]:
chunks = text_splitter(pages, 1000, 50)

In [9]:
print(type(chunks))

<class 'list'>


In [10]:
from langchain_huggingface import HuggingFaceEmbeddings
model_name = "intfloat/e5-base-v2"

from langchain.vectorstores import FAISS
db = FAISS.from_documents(chunks, embedding=HuggingFaceEmbeddings(model_name=model_name))

In [11]:
model_name = "sentence-transformers/all-MiniLM-L6-v2"

from langchain.vectorstores import FAISS
db2 = FAISS.from_documents(chunks, embedding=HuggingFaceEmbeddings(model_name=model_name))

In [12]:
model_name = "thenlper/gte-base"

from langchain.vectorstores import FAISS
db3 = FAISS.from_documents(chunks, embedding=HuggingFaceEmbeddings(model_name=model_name))

In [17]:
model_name = "BAAI/bge-base-en-v1.5"

from langchain.vectorstores import FAISS
db4 = FAISS.from_documents(chunks, embedding=HuggingFaceEmbeddings(model_name=model_name))

model.safetensors:   5%|4         | 21.0M/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
import os
import hashlib
import pickle
from pathlib import Path
from typing import List
from langchain_core.documents import Document
from langchain_chroma import Chroma
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from langchain.embeddings.base import Embeddings

# --- Embedding wrapper using thenlper/gte-base ---
class GTEEmbeddings(Embeddings):
    def __init__(self, model_name: str = "thenlper/gte-base"):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return self.model.encode(texts, show_progress_bar=True, convert_to_numpy=True).tolist()

    def embed_query(self, text: str) -> List[float]:
        return self.model.encode(text, convert_to_numpy=True).tolist()


# --- Utility: Hash documents for caching ---
def compute_documents_hash(documents: List[Document]) -> str:
    hasher = hashlib.sha256()
    for doc in documents:
        hasher.update(doc.page_content.encode("utf-8"))
    return hasher.hexdigest()


# --- Main function: Embed and cache ---
def embed_and_store_once(
    documents: List[Document],
    persist_dir: str = "embeddings",
    model_name: str = "thenlper/gte-base"
) -> Chroma:

    os.makedirs(persist_dir, exist_ok=True)
    hash_path = Path(persist_dir) / "hash.pkl"
    current_hash = compute_documents_hash(documents)

    # Check for previously stored hash
    if hash_path.exists():
        with open(hash_path, "rb") as f:
            saved_hash = pickle.load(f)
        if saved_hash == current_hash:
            print("🟢 Reusing existing ChromaDB vector store from 'embeddings/'")
            return Chroma(
                persist_directory=persist_dir,
                embedding_function=GTEEmbeddings(model_name),
                client_settings=Settings(persist_directory=persist_dir, anonymized_telemetry=False)
            )

    # Embed and store if hash differs
    print("🔵 Generating new embeddings and storing in 'embeddings/'...")
    vectorstore = Chroma.from_documents(
        documents=documents,
        embedding=GTEEmbeddings(model_name),
        persist_directory=persist_dir,
        client_settings=Settings(persist_directory=persist_dir, anonymized_telemetry=False)
    )

    # Save hash for reuse
    with open(hash_path, "wb") as f:
        pickle.dump(current_hash, f)

    return vectorstore



In [29]:
vectorstore = embed_and_store_once(chunks)

🟢 Reusing existing ChromaDB vector store from 'embddings/'
