In [66]:
from langchain_community.document_loaders import UnstructuredPDFLoader
from IPython.display import display as Markdown
from tqdm.autonotebook import tqdm as notebook_tqdm

In [67]:
# local_path = "org_gazette.pdf"
local_path = "org_gazette.pdf"

# Local PDF file uploads
if local_path:
  loader = UnstructuredPDFLoader(file_path=local_path)
  data = loader.load()
else:
  print("Upload a PDF file")

In [68]:
# Preview first page
Markdown(data[0].page_content)

'GetOriginal\n\nCheckPointThreatExtractionsecuredthisdocument\n\nsl iudcjd§ ckrcfha .eiÜ m;%h S ,xld m%cd;dka;% Y% w;s úfYI The Gazette of the Democratic Socialist Republic of Sri Lanka\n\nEXTRAORDINARY\n\nwxl 2403$53 - 2024 iema;eïn¾ ui 27 jeks isl=rdod - 2024.09.27 No. 2403/53 - frIDAy, sePteMber 27, 2024\n\n(Published by Authority)\n\nPART I : SECTION (I) — GENERAL\n\nGovernment Notifications\n\nThE CONSTITuTION Of ThE DEmOCRATIC SOCIALIST REPubLIC Of SRI LANkA\n\nNotifications\n\nwhereAs I have in the exercise of powers vested in me in terms of Paragraph (1) of Article 44 of the Constitution of the Democratic socialist republic of sri Lanka, determined the number of Ministers of the Cabinet of Ministers and the Ministries and the assignment of subjects and functions and Departments, statutory Institutions and Public Corporations to the said Ministers ;\n\nIt is now hereby notified that the subjects and functions and Departments, Statutory Institutions and Public Corporations in the

In [69]:
# 3. Set the environment variable
import os
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

In [70]:
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

In [81]:
from langchain.schema import Document

# Split and chunk 

#text_splitter = RecursiveCharacterTextSplitter(chunk_size=7500, chunk_overlap=100)
#chunks = text_splitter.split_documents(data)
# chunks = [page.page_content for page in data]
# Wrap the chunks into Document objects
# documents = [Document(page_content=chunk) for chunk in chunks]
text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=500)

# Split the document into chunks
chunks = text_splitter.split_documents(data)

In [72]:
print(chunks[:5])

[Document(metadata={'source': 'org_gazette.pdf'}, page_content='GetOriginal\n\nCheckPointThreatExtractionsecuredthisdocument\n\nsl iudcjd§ ckrcfha .eiÜ m;%h S ,xld m%cd;dka;% Y% w;s úfYI The Gazette of the Democratic Socialist Republic of Sri Lanka\n\nEXTRAORDINARY\n\nwxl 2403$53 - 2024 iema;eïn¾ ui 27 jeks isl=rdod - 2024.09.27 No. 2403/53 - frIDAy, sePteMber 27, 2024\n\n(Published by Authority)\n\nPART I : SECTION (I) — GENERAL\n\nGovernment Notifications\n\nThE CONSTITuTION Of ThE DEmOCRATIC SOCIALIST REPubLIC Of SRI LANkA\n\nNotifications\n\nwhereAs I have in the exercise of powers vested in me in terms of Paragraph (1) of Article 44 of the Constitution of the Democratic socialist republic of sri Lanka, determined the number of Ministers of the Cabinet of Ministers and the Ministries and the assignment of subjects and functions and Departments, statutory Institutions and Public Corporations to the said Ministers ;\n\nIt is now hereby notified that the subjects and functions and Dep

In [73]:
# 5. Try creating the vector database
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=OllamaEmbeddings(model="nomic-embed-text"),
    collection_name="local-rag"
)

# vector_db = Chroma.from_documents(
#     documents=documents,
#     embedding=OllamaEmbeddings(model="nomic-embed-text"),
#     collection_name="local-rag"
# )

In [74]:
## Retrieval

from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_ollama.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

In [75]:
# LLM from Ollama
local_model = "llama3.2"
llm = ChatOllama(model=local_model)

In [76]:
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
)

In [77]:
retriever = MultiQueryRetriever.from_llm(
    vector_db.as_retriever(), 
    llm,
    prompt=QUERY_PROMPT
)

# RAG prompt
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

In [78]:
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [79]:
chain.invoke("List all the ministries and their departments in the gazette.")

'The question is not directly answered in the provided text, as it only lists the functions and laws to be implemented by the Ministry under the respective Ministers.\n\nHowever, based on the provided text, we can infer that the following departments, statutory institutions, and public corporations are likely to be under the Minister of Rural and Urban Development, Housing and Construction:\n\n1. Department of National Planning\n2. Department of External Resources\n\nNote that this is an inference, as the text does not explicitly list all the departments, statutory institutions, and public corporations under the Minister of Rural and Urban Development, Housing and Construction.\n\nIf you need a more comprehensive answer, I recommend checking the official government website or other reliable sources for the most up-to-date information on the departments, statutory institutions, and public corporations under this Ministry.'

In [80]:
# Delete all collections in the db
# vector_db.delete_collection()