In [None]:
# On-disk directory handed to Chroma as its `persist_directory`.
persist_directory = r"C:\Users\suram\OneDrive\Desktop\rag project\chroma_db"

In [None]:
import json
import multiprocessing
import uuid
from concurrent.futures import ThreadPoolExecutor

from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_ollama import OllamaEmbeddings


# JSON file expected to contain a list of text chunks.
json_file = r"C:\Users\suram\OneDrive\Desktop\rag project\split_chunks.json"

with open(json_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# Raise instead of calling exit(): exit() is a helper for the interactive
# shell, ends a script with status 0 even though loading failed, and shuts
# down the kernel when run inside a notebook.
if not isinstance(data, list):
    raise TypeError("Error: JSON is not a list of chunks!")

# Wrap every chunk in a Document; the chunk becomes its page_content.
documents = [Document(page_content=chunk) for chunk in data]
print(f"Loaded {len(documents)} chunks from {json_file}")


# Embedding client configured for the "nomic-embed-text" model; shared by
# process_batch() and the Chroma vector store.
embedding_function = OllamaEmbeddings(model="nomic-embed-text")


def process_batch(batch):
    """Embed one batch of documents.

    Gathers the ``page_content`` of every item in ``batch`` and sends the
    resulting list to the module-level ``embedding_function`` in a single
    ``embed_documents`` call, returning whatever that call returns.
    """
    texts = [item.page_content for item in batch]
    return embedding_function.embed_documents(texts)


# Embed the chunks concurrently: the documents are cut into fixed-size
# batches and each batch is embedded on a thread pool capped at 8 workers.
num_cores = min(8, multiprocessing.cpu_count())
batch_size = 80
batches = [documents[i:i + batch_size] for i in range(0, len(documents), batch_size)]

# executor.map yields results in submission order, so the flattened list
# lines up index-for-index with `documents`.
with ThreadPoolExecutor(max_workers=num_cores) as executor:
    embeddings = [
        vector
        for batch_vectors in executor.map(process_batch, batches)
        for vector in batch_vectors
    ]

# Guard against a backend returning fewer/more vectors than texts; a silent
# mismatch would attach vectors to the wrong chunks below.
if len(embeddings) != len(documents):
    raise RuntimeError(
        f"Expected {len(documents)} embeddings but received {len(embeddings)}"
    )

persist_directory = r"C:\Users\suram\OneDrive\Desktop\rag project\chroma_db"

# Chroma.from_documents() is only given the documents and the embedding
# function, so it would embed every chunk a second time and the vectors
# computed above would go unused. The LangChain wrapper exposes no public
# method that accepts precomputed vectors, so they are written through the
# underlying chromadb collection instead (private attribute: re-check on
# library upgrades).
vectorstore = Chroma(
    embedding_function=embedding_function,
    persist_directory=persist_directory,
)
for start in range(0, len(documents), batch_size):
    stop = start + batch_size
    chunk_docs = documents[start:stop]
    vectorstore._collection.upsert(
        # Random UUID ids, matching what from_documents() generates by default.
        ids=[str(uuid.uuid4()) for _ in chunk_docs],
        embeddings=embeddings[start:stop],
        documents=[doc.page_content for doc in chunk_docs],
    )

print(f"Embeddings stored successfully in ChromaDB at {persist_directory}!")


Loaded 15148 chunks from C:\Users\suram\OneDrive\Desktop\rag project\split_chunks.json
Embeddings stored successfully in ChromaDB at C:\Users\suram\OneDrive\Desktop\rag project\chroma_db!
