In [1]:
# ! pip install 'qdrant-client[fastembed]'

In [1]:
from warnings import filterwarnings
filterwarnings("ignore")
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import FastEmbedEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
import glob

import gc

### 1.  Load the model

In [2]:
# Callback manager wrapping a single stdout streaming handler; it is handed
# to the model below and reused by later cells.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Constructor arguments for the local GGUF model, gathered in one place.
llama_cpp_kwargs = dict(
    model_path="./models/llama-2-13b-chat.Q4_K_S.gguf",
    n_ctx=4000,
    max_tokens=4000,
    # MUST stay True, otherwise you will run into problems after a couple of calls
    f16_kv=True,
    callback_manager=callback_manager,
    verbose=True,
)

llm = LlamaCpp(**llama_cpp_kwargs)

llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from ./models/llama-2-13b-chat.Q4_K_S.gguf (version GGUF V2)
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q5_K     [ 13824,  5120,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q4_K     [  5120,  5120,     1,     1 

# Quick sanity prompt sent straight to the loaded model.
smoke_prompt = "capital of karnataka ?"
llm(smoke_prompt)

### 2. Load text document

In [3]:
# Build the PDF loader first, then read the HR policy manual through it.
pdf_loader = PyPDFLoader(file_path="./documents/HR_Policy_Manual.pdf")
documents = pdf_loader.load()

### 3. Load Our Embeddings

In [4]:
# FastEmbed embedding object; cache_dir points at ./embedding_model/.
embeddings = FastEmbedEmbeddings(
    cache_dir="./embedding_model/",
    model_name="BAAI/bge-small-en-v1.5",
)

### 4. Process of Embedding the documents

In [5]:
# Splitter configuration: chunk_size 512 with a chunk_overlap of 50.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=50, chunk_size=512)

# Split the loaded documents into chunks.
text = splitter.split_documents(documents)

# To preview one of the chunks:
# print(text[180].page_content)

# Look for an existing Chroma sqlite file in ./vectordb/. If one is found,
# open the store from that directory; otherwise build it from the chunks
# with the same persist_directory.
existing_db_files = glob.glob("./vectordb/*.sqlite3")

if existing_db_files:
    db = Chroma(embedding_function=embeddings, persist_directory="./vectordb/")
else:
    db = Chroma.from_documents(
        documents=text,
        embedding=embeddings,
        persist_directory="./vectordb/",
    )

### 5. Create a Retriever (here we will be using an Ensemble Technique)

In [6]:
# from langchain.retrievers import SelfQueryRetriever
# from langchain.chains.query_constructor.base import AttributeInfo
# from langchain.retrievers import ContextualCompressionRetriever
# from langchain.retrievers.document_compressors import LLMChainFilter

In [7]:
# # Helper function for printing docs

# def pretty_print_docs(docs):
#     print(f"\n{'-' * 100}\n".join([f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]))

In [8]:
# # Define our metadata
# compression_retriever = ContextualCompressionRetriever(base_compressor= LLMChainFilter.from_llm(llm), 
#                                                        base_retriever= db.as_retriever() )

# # Example output
# compressed_docs = compression_retriever.get_relevant_documents("what is the travel policy?")
# pretty_print_docs(compressed_docs)

### 6. Infer data using Chatbot/ Agent/ Chain interface

In [9]:
# custom agent with tool retrieval: https://python.langchain.com/docs/modules/agents/how_to/custom_agent_with_tool_retrieval

In [10]:
# from langchain.chains import ConversationalRetrievalChain, StuffDocumentsChain, LLMChain
# from langchain.memory import ConversationBufferMemory
# from langchain_core.prompts import PromptTemplate


# # This controls how each document will be formatted. Specifically,
# # it will be passed to `format_document` - see that function for more
# # details.
# document_prompt = PromptTemplate(
#     input_variables=["page_content"],
#     template="{page_content}"
# )
# document_variable_name = "context"
# # The prompt here should take as an input variable the
# # `document_variable_name`
# stuff_prompt = PromptTemplate.from_template(
#     "Summarize this content: {context}"
# )

# llm_chain = LLMChain(llm=llm, prompt=stuff_prompt)

# combine_docs_chain = StuffDocumentsChain(llm_chain=llm_chain,
#                                          document_prompt=document_prompt,
#                                          document_variable_name=document_variable_name)


# # This controls how the standalone question is generated.
# # Should take `chat_history` and `question` as input variables.
# template = (
#     "Combine the chat history and follow up question into "
#     "a standalone question. Chat History: {chat_history}"
#     "Follow up question: {question}"
#     "Its important to make sure the answer is as short as possible and to the point"
#     "If the information is not present in the document, say you dont know please reach out to HR admin at hr@abcorg.in"
#     "Make sure to answer this in Less than 100 words"
# )

# prompt = PromptTemplate.from_template(template)

# question_generator_chain = LLMChain(llm=llm, prompt=prompt)


# xx = ConversationalRetrievalChain(
#     retriever = db.as_retriever(),
#     question_generator = question_generator_chain,
#     combine_docs_chain = combine_docs_chain,
#     callback_manager = callback_manager,
#     max_tokens_limit = 4000
# )

# chat_history = []

# xx.run({'question' : "tell me about tvs jupyter", 
#        'chat_history': chat_history})

In [11]:
from langchain.chains import RetrievalQA
from langchain.memory import ConversationBufferMemory
from langchain import hub

In [12]:
# Pull the prompt published as "rlm/rag-prompt-llama" from the LangChain hub.
rag_prompt_llama = hub.pull("rlm/rag-prompt-llama")

# Retrieval-QA chain: the vector store's default retriever supplies documents
# and the pulled prompt is forwarded through chain_type_kwargs.
vector_retriever = db.as_retriever()
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vector_retriever,
    chain_type_kwargs=dict(prompt=rag_prompt_llama),
)

# Attach the shared callback manager and a fresh conversation buffer after
# construction, as plain attribute assignments.
qa_chain.callback_manager = callback_manager
qa_chain.memory = ConversationBufferMemory()

In [None]:
# Ask the chain a sample HR-policy question.
hr_question = " how many paid leaves do i get and how are they distributed ?"
qa_chain.run(hr_question)