In [1]:
from langchain_community.document_loaders import UnstructuredExcelLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain_openai import OpenAIEmbeddings

from dotenv import load_dotenv, find_dotenv
# Locate the nearest .env file and load it into the process environment.
# override=True asks python-dotenv to let .env values replace variables that
# are already set. load_dotenv's return value is kept so it can be reported.
founddotenv = load_dotenv(find_dotenv(), override=True)
# print() does not apply %-style formatting to its arguments (unlike the
# logging functions), so the value is interpolated explicitly.
print(f"Found .env: {founddotenv}")

# Read the pull-request summary workbook through the Unstructured Excel
# loader, requesting its "elements" mode.
path = "./pull_requests_summary.xlsx"
loader = UnstructuredExcelLoader(
    mode="elements",
    file_path=path,
)
data = loader.load()

# Cut the loaded documents into pieces of up to 200 characters, with a
# 50-character overlap between neighbouring pieces.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=200,
)
chunks = text_splitter.split_documents(data)
print(f"Document split into {len(chunks)} chunks")

# Embed the chunks with OpenAI and persist them in a local Chroma collection.
#
# vector_db is bound up front so that, if anything below fails, later code
# sees None instead of raising NameError (or silently reusing a store left
# over from an earlier run of this cell).
# NOTE(review): re-running this with the same directory and collection name
# presumably adds the documents again rather than replacing them - confirm.
persist_directory_xlsx = "./chroma_db_xlsx"
vector_db = None

try:
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small", dimensions=1536)
    print("Embeddings initiated")

    # filter_complex_metadata is applied before storing; per its name it is
    # expected to drop metadata values the vector store cannot hold.
    docs = filter_complex_metadata(chunks)
    print(f"length after filter {len(docs)}")

    vector_db = Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        persist_directory=persist_directory_xlsx,
        collection_name="myRAG-xlsx",
    )
    print("Vector DB created")
except Exception as e:
    # Best-effort boundary for the notebook cell: report and carry on.
    # The exception type is included because str(e) alone can be empty.
    print("FAILED to create Vector DB")
    print(f"The error is: {type(e).__name__}: {e}")


Document split into 2449 chunks
Embeddings initiated
lenghth after filter 2449


: 