In [1]:
import os
from dotenv import load_dotenv

# Populate the process environment via python-dotenv before reading the key.
load_dotenv()

# Fail fast: stop here when the key is either unset or set to an empty string.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if openai_api_key is None or openai_api_key == "":
    raise ValueError("OpenAI API Key is missing")

In [3]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

In [6]:
def ingest_pdf(
    pdf_files,
    persist_directory="Chroma_DB",
    chunk_size=500,
    chunk_overlap=100,
    embedding_model="sentence-transformers/all-MiniLM-L6-v2",
):
    """
    Parse PDFs, chunk their content, generate embeddings, and store them in ChromaDB.

    Args:
        pdf_files (list | str): Paths to the PDF files to ingest. A single path
            given as a string is treated as a one-element list.
        persist_directory (str): Directory to store the ChromaDB.
        chunk_size (int): Maximum chunk size handed to the text splitter.
        chunk_overlap (int): Overlap between consecutive chunks.
        embedding_model (str): Model name handed to HuggingFaceEmbeddings.

    Returns:
        None. Progress is reported through print().

    Raises:
        ValueError: If `pdf_files` is empty/None, or if the loaded PDFs yield
            no chunks at all.
    """
    # Validate before any expensive work (model load, PDF parsing) starts.
    if not pdf_files:
        raise ValueError("pdf_files must contain at least one PDF path")
    if isinstance(pdf_files, str):
        # Iterating a bare string would otherwise treat every character as a path.
        pdf_files = [pdf_files]

    documents = []

    for pdf_file in pdf_files:
        print(f'Processing: {pdf_file}...')
        loader = PyPDFLoader(pdf_file)
        documents.extend(loader.load())

    print('Chunking documents...')
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_documents(documents)
    if not chunks:
        # Nothing to embed; report it clearly rather than handing Chroma an empty batch.
        raise ValueError("No text chunks were produced from the given PDF files")

    print('Generating embeddings and storing in vector DB...')
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
    # NOTE(review): no ids are passed, so re-running against an existing
    # persist_directory presumably appends duplicate chunks -- confirm.
    vectorstore = Chroma.from_documents(
        chunks,
        embedding=embeddings,
        persist_directory=persist_directory
    )

    # Explicit persist() is deprecated in newer Chroma integrations and may be
    # missing entirely, so call it only when the store actually provides it.
    persist = getattr(vectorstore, "persist", None)
    if callable(persist):
        persist()
    print(f'Chroma DB stored at: {persist_directory}')

In [7]:
# Source PDFs to ingest; the paths are relative, so they resolve against the
# notebook's working directory.
pdf_files = [
    'data/1706.03762v7.pdf',
    'data/1802.05365v2.pdf',
    'data/1810.04805v2.pdf',
]

# Directory passed to ingest_pdf as the Chroma persistence location.
chroma_dir = 'Chroma-DB'

In [8]:
# Run the ingestion for the configured PDFs and persistence directory.
ingest_pdf(pdf_files, persist_directory=chroma_dir)

Processing: data/1706.03762v7.pdf...
Processing: data/1802.05365v2.pdf...
Processing: data/1810.04805v2.pdf...
Chunking documents...
Generating embeddings and storing in vector DB...




Chroma DB stored at: Chroma-DB
