In [7]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

In [20]:
from langchain_community.document_loaders import DirectoryLoader

loader =DirectoryLoader('data/', glob="**/*.pdf", show_progress=True, loader_cls=PyPDFLoader)
documents = loader.load()

100%|██████████| 2/2 [00:03<00:00,  1.54s/it]


In [21]:
# split text into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
texts = text_splitter.split_documents(documents)

In [22]:
embeddings = HuggingFaceEmbeddings(
    model_name = "BAAI/bge-large-en",
    model_kwargs = {'device': 'cpu'},
    encode_kwargs ={'normalize_embeddings': False}

)


In [26]:
# ingesting into chromadb 

vector_store =Chroma.from_documents(texts, embeddings, collection_metadata ={"hnsw:space": "cosine"}, persist_directory="stores/pet_cosine")
print('Vector Store Created....')

Vector Store Created....


In [32]:
from langchain.llms import CTransformers
import os 
model ="neural-chat-7b-v3-1.Q4_K_M.gguf"

config ={
    'max_new_tokens': 1024,
    'repetition_penalty': 1.1,
    'temperature': 0.1,
    'top_k': 50,
    'top_p': 0.9,
    'stream': True,
    'threads': int(os.cpu_count() / 2)
}

llm=CTransformers(
    model = model, 
    model_type ='llama',
    max_new_tokens=1024,
    temperature=0.1,
    top_p=0.95,
    top_k=50,
    repetition_penality=1.1
 
)

print("LLM Initialized....")

LLM Initialized....


In [33]:
prompt_template = """Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [34]:
load_vector_store = Chroma(persist_directory="stores/pet_cosine", embedding_function=embeddings)

  warn_deprecated(


In [35]:
retriever = load_vector_store.as_retriever(search_kwargs={"k":1})

In [47]:

def generate_response1(query):
    qa = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type ='stuff',
    retriever =retriever,
    return_source_documents=True,
    verbose=True 
)
    response = qa(query)
    print(response)
    answer = response['result']
    source_document = response['source_documents'][0].page_content
    doc = response['source_documents'][0].metadata['source']
    response_data = {"answer": answer, "source_document": source_document, "doc": doc}

    return json.dumps(response_data, indent=2)

In [48]:
query ='How is GAAP accounting different from tax accounting?'

generate_response1(query)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
{'query': 'How is GAAP accounting different from tax accounting?', 'result': ' GAAP is accrual-based but tax is cash-based. GAAP uses straight-line depreciation or a few other methods whereas tax accounting has accelerated depreciation. GAAP is more complex and accurately tracks assets/liabilities, while tax accounting focuses on revenue/expenses in the current period and income taxes due.', 'source_documents': [Document(metadata={'page': 18, 'source': 'data/400 Questions & Technicals.pdf'}, page_content='http://breakingintowallstreet.com http://www.mergersandinquisitions.com  \n \n19 \n \n \n Accounting  Questions & Answers – Advanced  \n \nThese more advanced questions cover topics like deferred tax assets and liabilities and \nhow to actually project a company’s finan cial statements in an operating model.  \n \nYou may get some of these in investment banking interviews, but they’re more common \nif you’ve had 

'{\n  "answer": " GAAP is accrual-based but tax is cash-based. GAAP uses straight-line depreciation or a few other methods whereas tax accounting has accelerated depreciation. GAAP is more complex and accurately tracks assets/liabilities, while tax accounting focuses on revenue/expenses in the current period and income taxes due.",\n  "source_document": "http://breakingintowallstreet.com http://www.mergersandinquisitions.com  \\n \\n19 \\n \\n \\n Accounting  Questions & Answers \\u2013 Advanced  \\n \\nThese more advanced questions cover topics like deferred tax assets and liabilities and \\nhow to actually project a company\\u2019s finan cial statements in an operating model.  \\n \\nYou may get some of these in investment banking interviews, but they\\u2019re more common \\nif you\\u2019ve had significant finance experience or you\\u2019re interviewing for private equity, or \\nwith a more technical group.  \\n \\n1. How is G AAP accounting different from tax accounting?  \\n \\n1. 