In [2]:
import os
import chromadb
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.chains.question_answering import load_qa_chain

In [None]:
# Model name passed as `model_name` when the instructor embedding model is built.
EMBEDDING_MODEL_NAME = "hkunlp/instructor-large"

In [None]:
# Notebook-wide embedding model: documents and queries are embedded with
# different instruction prompts, both configured here.
embedding_model = HuggingFaceInstructEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    embed_instruction="Represent the document for retrieval:",
    query_instruction="Represent the question for retrieving supporting documents:",
)



In [None]:
def load_chunk_persist_pdf(pdf_folder_path: str = ".", persist_directory: str = "DB") -> Chroma:
    """Load every PDF in a folder, split it into chunks, and store them in Chroma.

    Args:
        pdf_folder_path: Folder scanned (non-recursively) for files whose name
            ends with ``.pdf``. Defaults to the current working directory; the
            previously hard-coded empty string is rejected by ``os.listdir``.
        persist_directory: Directory handed to Chroma for on-disk storage.

    Returns:
        The Chroma vector store built from the chunked documents, embedded
        with the module-level ``embedding_model``.

    Raises:
        OSError: If ``pdf_folder_path`` cannot be listed.
    """
    documents = []
    # sorted() keeps the load order independent of the filesystem listing order.
    for file in sorted(os.listdir(pdf_folder_path)):
        if file.endswith('.pdf'):
            pdf_path = os.path.join(pdf_folder_path, file)
            loader = PyPDFLoader(pdf_path)
            documents.extend(loader.load())

    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
    chunked_documents = text_splitter.split_documents(documents)

    client = chromadb.Client()
    # Depending on the chromadb release, list_collections() yields collection
    # objects or plain names; normalise to names before testing membership.
    existing_names = {
        getattr(entry, "name", entry) for entry in client.list_collections()
    }
    # The original test was inverted: it created the collection only when some
    # collection already existed and reported "already exists" when none did.
    if "consent_collection" in existing_names:
        print("Collection already exists")
    else:
        client.create_collection("consent_collection")
    # NOTE(review): `client` is not passed to Chroma.from_documents below, so
    # this collection does not appear to be the one the chunks are written to —
    # confirm whether it is still needed.

    vectordb = Chroma.from_documents(
        documents=chunked_documents,
        embedding=embedding_model,
        persist_directory=persist_directory,
    )
    vectordb.persist()
    return vectordb

In [None]:

def create_agent_chain(model_name="gpt2", pipeline_kwargs=None):
    """Build a "stuff" question-answering chain on a local Hugging Face model.

    Args:
        model_name: Model id loaded through ``HuggingFacePipeline.from_model_id``
            for the ``text-generation`` task. Defaults to ``"gpt2"``.
        pipeline_kwargs: Optional dict forwarded to the pipeline, for example
            ``{"max_new_tokens": 10}``. ``None`` leaves the pipeline defaults
            untouched.

    Returns:
        The chain produced by ``load_qa_chain(llm, chain_type="stuff")``.
    """
    llm = HuggingFacePipeline.from_model_id(
        model_id=model_name,
        task="text-generation",
        pipeline_kwargs=pipeline_kwargs,
    )
    return load_qa_chain(llm, chain_type="stuff")

In [None]:
def get_llm_response(query, vectordb=None, chain=None):
    """Answer ``query`` from the documents most similar to it.

    Args:
        query: Question text used both for the similarity search and as the
            chain's ``question`` input.
        vectordb: Optional prebuilt vector store exposing
            ``similarity_search(query)``. When omitted, the store is rebuilt
            with ``load_chunk_persist_pdf()``, which re-reads and re-embeds
            every PDF on each call.
        chain: Optional prebuilt QA chain exposing
            ``run(input_documents=..., question=...)``. When omitted, a new
            chain is created with ``create_agent_chain()``, which reloads the
            model on each call.

    Returns:
        Whatever ``chain.run`` returns for the matched documents and query.
    """
    # Build the expensive pieces only when the caller did not supply them, so
    # repeated questions can share one index and one loaded model.
    if vectordb is None:
        vectordb = load_chunk_persist_pdf()
    if chain is None:
        chain = create_agent_chain()

    matching_docs = vectordb.similarity_search(query)
    return chain.run(input_documents=matching_docs, question=query)


In [None]:
# Ask a sample question; the bare call on the last line shows the answer as
# the cell output.
query = "What is the annual revenue?"
get_llm_response(query)