In [1]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser

In [2]:
import glob 
import os 

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000, 
    chunk_overlap = 50
)
for file in glob.glob(os.path.join("Govt_dummy", "*.pdf")):
    print(f'Processing: {file}')
    documents = PyPDFLoader(file).load()
    all_splits = text_splitter.split_documents(documents)
    # Printing 2 chunks 
    for i, chunk in enumerate(all_splits[:2]): 
        print(f'chunk[{i}]\n{chunk.page_content}\n' + '-'*50)
    print('\n\n')

Processing: Govt_dummy/DoT_Finance_Compendium.pdf
chunk[0]
DISCLAIMER 
 
  
   This compendium has been published and all possible 
necessary care has been taken to make the material error-free. 
While every effort has been made to avoid any mistake or 
omission, neither IFD nor printer would be liable in any manner 
for any mistake/omission in this publication or for any action 
proposed/ taken or omitted to be proposed/ taken or advice 
rendered or accepted on the basis of this work. This 
compendium is prepared for use as a ready reckoner only and 
the reader is advised to exercise discretion and further consult 
the original OMs/instructions/guidelines. We look forward to 
your valuable feedback/ suggestions/corrections in this 
compilation.  
 
 
First Edition     : 2016 
Second Edition: 2017 
Third Edition   : 2019 
Fourth Edition  : 2023
--------------------------------------------------
chunk[1]
DOT FINANCE COMPENDIUM-2023 
  Updated upto February, 2023
(IMPORTANT ORDERS/INSTRU

In [3]:
model_embed_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}
hf = HuggingFaceEmbeddings(model_name = model_embed_name, \
                           model_kwargs = model_kwargs, \
                           encode_kwargs = encode_kwargs)
vector_db_dir = "./DoT_Gov"
if os.path.exists(vector_db_dir):
    vectorstore = Chroma(embedding_function = hf, persist_directory = vector_db_dir)
else:
    print('Creating vector-store from scratch.')
    vectorstore = Chroma.from_documents(documents = all_splits, \
                                        embedding = hf, 
                                        persist_directory = vector_db_dir)
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [19]:
def doc_retriver(input_dict):
    rdocs = retriever.invoke(input_dict["query"])
    info = []
    for docs in rdocs:
        info.append(docs.page_content)
    input_dict["context"] = "\n".join(info)
    print(input_dict)
    return input_dict

In [4]:
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", """You are an AI assistant specializing in summarizing information from government documents.""" +
                """Always base your answers strictly on the provided context: {context}.""" +
                """If the answer is not found in the context or in your knowledge base, simply say, "I don't know.""" + 
                """Do not make up or assume any information."""),
        ("user", "{query}")
    ]
)

prompt.invoke({"context": "K. Rajaraman said, 'I am happy to learn that the fourth edition of the Finance Compendium is being published by IFD.'.",
               "query": "What message has K. Rajaraman provided?"})

ChatPromptValue(messages=[SystemMessage(content='You are an AI assistant specializing in summarizing information from government documents.Always base your answers strictly on the provided context: K. Rajaraman said, \'I am happy to learn that the fourth edition of the Finance Compendium is being published by IFD.\'..If the answer is not found in the context or in your knowledge base, simply say, "I don\'t know.Do not make up or assume any information.', additional_kwargs={}, response_metadata={}), HumanMessage(content='What message has K. Rajaraman provided?', additional_kwargs={}, response_metadata={})])

In [None]:
openai_api_key = "NA"
openai_api_base = "http://localhost:8000/v1"
model_name = "qwen2.5-7b-instruct-q4_0.gguf"
model = ChatOpenAI(
    api_key = openai_api_key,
    base_url = openai_api_base,
    model_name = model_name
)


In [20]:
chain = doc_retriver | prompt | model | StrOutputParser()
chain.invoke({"query": "What message has K. Rajaraman provided?"})

{'query': 'What message has K. Rajaraman provided?', 'context': 'n DOP&T has initially been made one of the\nResponden\nts, is sent to this Departme\nnt by 29\nth \nFebruary, 2016.\nYours sincerely, \n(V. Vidyavath\ni) \nTo, \nSecretarie\ns/Heads of all Ministries/ Departmen\nts (as per standard list). \n182\nTo \nDeputy Secretary to the Government of India \nFinancial Advisors of All advisors of All Central Government Ministries Departments.\n     (Kotluru Narayana Reddy) \n133\n"Hav ing\n scrutinized in    char act\ner roll  and pers ona\nl file  of   Shri /Sm\nt. \nKum and having taken into account all other relevant available \n178'}


"The provided context does not contain any message from K. Rajaraman. The context includes signatures and addresses of individuals like V. Vidyavathi and Kotluru Narayana Reddy, but there is no mention or message from K. Rajaraman. Therefore, based on the given information:\n\nI don't know."