In [5]:

from langchain_community.document_loaders import PyMuPDFLoader, UnstructuredFileLoader
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter

from backend.common.config import BaseObject, Config


In [ ]:
class PDFRetriever(BaseObject):
    """Index a PDF into a Chroma vector store and query a persisted store.

    ``process_pdf`` loads a document, splits it into overlapping chunks and
    embeds the chunks into a Chroma collection. ``retrieve`` opens a Chroma
    collection stored on disk and returns the chunks most similar to a query.
    """

    def __init__(
            self,
            config: Config = None,
            model=None,
            embedder=None,
    ):
        """Store the collaborators used by the other methods.

        Args:
            config: Project configuration; a fresh ``Config()`` is created
                when omitted.
            model: Base model object. Only its presence is checked in this
                class (see ``retrieve``).
            embedder: Embedding object handed to Chroma when building or
                opening a vector store.
        """
        super().__init__()
        self.config = config if config is not None else Config()
        self._base_model = model
        self._embeddings = embedder

    def process_pdf(
            self,
            pdf_path: str,
            chunk_size: int = 1000,
            chunk_overlap: int = 200,
            unstructured_data: bool = False,
            persist_directory: str = None,
    ) -> Chroma:
        """Load a PDF, split it into chunks and embed them into Chroma.

        Args:
            pdf_path: Path of the document to index.
            chunk_size: Maximum chunk size given to the text splitter.
            chunk_overlap: Overlap between consecutive chunks.
            unstructured_data: When True the file is read with
                ``UnstructuredFileLoader``; otherwise with ``PyMuPDFLoader``.
            persist_directory: Optional directory in which Chroma stores the
                collection. Pass the same directory to ``retrieve`` as
                ``upload_dir`` to query it later. When omitted the store is
                not tied to a directory, as before.

        Returns:
            The Chroma vector store built from the document chunks.
        """
        if unstructured_data:
            loader = UnstructuredFileLoader(file_path=pdf_path)
        else:
            loader = PyMuPDFLoader(pdf_path)

        pages = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        texts = text_splitter.split_documents(pages)

        return Chroma.from_documents(
            texts,
            self._embeddings,
            persist_directory=persist_directory,
        )

    def retrieve(self, upload_dir: str, query: str, k: int = 5) -> list:
        """Return the ``k`` stored chunks most similar to ``query``.

        Args:
            upload_dir: Directory holding a persisted Chroma collection
                (intended to be the ``persist_directory`` used with
                ``process_pdf``).
            query: Text to search for.
            k: Number of documents to return.

        Returns:
            The list produced by Chroma's similarity search.

        Raises:
            ValueError: If no model was supplied, if ``upload_dir`` is None,
                or if the similarity search itself fails.
        """
        # NOTE(review): the model is not used for retrieval; this guard is
        # kept only so existing callers see the same error as before.
        if self._base_model is None:
            raise ValueError("Model not set")
        if upload_dir is None:
            raise ValueError("Upload directory not set")

        # Chroma has no ``load`` classmethod; a persisted collection is opened
        # by constructing the store with its directory and the embedder that
        # will be used to embed the query.
        vector_store = Chroma(
            persist_directory=upload_dir,
            embedding_function=self._embeddings,
        )
        try:
            # ``similarity_search`` is the vector-store query method; the
            # store has no ``retrieve`` method.
            output = vector_store.similarity_search(query, k=k)
        except Exception as e:
            # Chain the cause so the underlying traceback is not lost.
            raise ValueError(f"Error retrieving from vector store: {e}") from e
        return output

In [6]:
import os

# Print the kernel's working directory; the relative PDF path used further
# down is resolved against it.
current_dir = os.getcwd()
print(current_dir)

D:\Workspace\AIProject\chatbot-agent-langchain\backend\examples


In [32]:
from dotenv import load_dotenv

# Load environment variables from a .env file before any client is created
# (presumably this is where the OpenAI API key comes from; verify locally).
# The result is left as the cell's last expression so it is still displayed.
env_loaded = load_dotenv()
env_loaded

True

In [39]:
from langchain_openai import OpenAIEmbeddings

# Build a retriever backed by OpenAI embeddings and index one sample PDF.
# The recorded run of this cell ended in a 401 "invalid_api_key" error, so a
# valid OpenAI key must be configured before re-running it.
pdf_path = "./../data/pdf/OmniPred_ Language Models as Universal Regressors.pdf"
embedder = OpenAIEmbeddings()
retriever = PDFRetriever(embedder=embedder)
process = retriever.process_pdf(pdf_path=pdf_path)
print(process)


AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-proj-********************************************************************************************************************************************************2RwA. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}