# Load and Embed PDF artciles
* [Following](https://www.linkedin.com/pulse/build-lightning-fast-rag-chatbot-powered-groqs-lpu-ollama-multani-ssloc)
* [Dockerized chromadb](https://medium.com/@pierrelouislet/getting-started-with-chroma-db-a-beginners-tutorial-6efa32300902)

In [1]:
'''
    WARNING CONTROL to display or ignore all warnings
'''
import warnings; warnings.simplefilter('ignore')     #switch betweeb 'default' and 'ignore'
import traceback

''' Set debug flag to view extended error messages; else set it to False to turn off debugging mode '''
debug = True


In [2]:
import os
import sys
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

proj_dir = os.path.abspath(os.pardir)
sys.path.insert(1,proj_dir.split('wrangler/')[0])
from rezaware.modules.etl.loader import vectorDB as vec
from wrangler.modules.parliment.scrape import vectorize_pdf as vpdf

''' restart initiate classes '''
if debug:
    import importlib
    vec = importlib.reload(vec)
    vpdf= importlib.reload(vpdf)

__desc__ = "read pdf files in folder and store as vectors"
clsPDF = vpdf.dataWorkLoads(desc=__desc__)
__db_type__ = "chromadb"
__chromadb_dir__ = "/home/nuwan/workspace/penman/wrangler/data/parliment/scrape/"
__db_name__ = "ar"
clsVDB = vec.dataWorkLoads(
    desc=__desc__, 
    db_type=__db_type__, 
    db_root=__chromadb_dir__, 
    db_name=__db_name__
)

print("\n%s class initialization and load complete!" % __desc__)

All functional __PROPATTR__-libraries in LOADER-package of ETL-module imported successfully!
All functional VECTORDB-libraries in LOADER-package of ETL-module imported successfully!
All functional VECTORIZE_PDF-libraries in SCRAPE-package of PARLIMENT-module imported successfully!
All functional VECTORDB-libraries in LOADER-package of ETL-module imported successfully!
All functional VECTORIZE_PDF-libraries in SCRAPE-package of PARLIMENT-module imported successfully!
All functional APP-libraries in REZAWARE-package of REZAWARE-module imported successfully!
__propAttr__ Class initialization complete

read pdf files in folder and store as vectors class initialization and load complete!


## Loading documents

In [3]:
data_dir = "/home/nuwan/workspace/penman/wrangler/data/parliment/scrape/ar"
the_text=clsPDF.load_pdf_files(
    folder_path=data_dir
)
print("Loaded %d text pages" % len(the_text))

Loaded 56 text pages


## Splitting Text into Chunks

In [6]:
chunks = clsPDF.text_to_chunks(
    text = the_text
)
print("Split %d docs into %d chunks" % (len(the_text),len(chunks)))

Split 56 docs into 117 chunks


## Setting up the ChromaDB collection

In [None]:
from langchain_community.embeddings import OllamaEmbeddings

# __db_type__ = "chromadb"
# __db_name__ = "ar"
# __chromadb_dir__ = "/home/nuwan/workspace/penman/wrangler/data/article/stock/"
__collection__= "annualreports"
_embedding_fn = OllamaEmbeddings(model='nomic-embed-text')

collections_lst_ = []
try:
    collections_lst_ = clsVDB.get_collections(db_name = __db_name__)
except Exception as err:
    print(err)
if len(collections_lst_)<=0 or __collection__ not in [x.name for x in collections_lst_]:
    print("creating document collection %s" % __collection__.upper())
    vectorstore=clsVDB.store_vectors(
        documents=chunks,
        # db_name = __db_name__,
        collection = __collection__,
        embedding_fn=_embedding_fn,
    )
else:
    print("%s collection exists; reading documents" % __collection__.upper())
    vectorstore=clsVDB.read_vectors(
        db_name = __db_name__,
        collection = __collection__,
        embedding_fn=_embedding_fn,        
    )

print("Loaded vectorstore %s collection with %d embeddings"
      % (vectorstore._collection.name.upper(), vectorstore._collection.count()))

In [9]:
docs = vectorstore.similarity_search("list all the annual report years?")
docs

[Document(metadata={'page': 0, 'source': '/home/nuwan/workspace/penman/wrangler/data/parliment/scrape/ar/1725007046002945_navy.pdf'}, page_content='SRI LANKA NAVY \nANNUAL PERFORMANCE REPORT - 2023'),
 Document(metadata={'page': 21, 'source': '/home/nuwan/workspace/penman/wrangler/data/parliment/scrape/ar/1725007046002945_navy.pdf'}, page_content='27\n CHAPTER 03  \nOVERALL FINANCIAL PERFORMANCE FOR THE YEAR  \nENDED ON 31ST DECEMBER 2023 \n  \n3.1  Statement of Financial Performance \nACA -F \nStatement of Financial Performance \nfor the period ended on 31st December 2023 \n  \nBudget 2023   Note Actual \nCurrent Year \n(2023) (Rs.) Previous Year \n(2022) (Rs.)  Revenue Receipts  900,000,000.00 247,000,000.00 - Income Tax 1 - - - Taxes on Domestic Goods \n& Services  \n2 -  - - Taxes on International \nTrade  \n3 - - 850,000,000.00 Non Tax Revenue & \nothers  \n4 900,000,000.00 247,000,000.00 850,000,000.00 Total Revenue Receipts \n(A)  900,000,000.00 247,000,000.00 - Non Revenue Rece

## Setting Up Groq's LPU for Inference

In [35]:
from langchain_groq import ChatGroq

llm = ChatGroq(
            groq_api_key=os.getenv("GROQ_API_KEY"),
            model_name='mixtral-8x7b-32768'
    )

llm

ChatGroq(client=<groq.resources.chat.completions.Completions object at 0x7f99b28c8c70>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x7f99b2a125f0>, groq_api_key=SecretStr('**********'))

## Building the RAG Chain

In [36]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

retriever = vectorstore.as_retriever()

rag_template = """Answer this question using the provided context only.
{question}

Context:
{context}"""
rag_prompt = ChatPromptTemplate.from_template(rag_template)
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | rag_prompt
    | llm
    | StrOutputParser()
)

## Testing the RAG Architecture

In [44]:
import textwrap
response = rag_chain.invoke("Build a formatted covering page for the Navy 2023 annual report")
# response = rag_chain.invoke("build a formatted covering page for the 2023 navy annual report")
print(textwrap.fill(response, width=80))

Here is a formatted covering page for the Navy 2023 annual report:  ---  SRI
LANKA NAVY  ANNUAL PERFORMANCE REPORT - 2023  ---  This annual report provides
an overview of the performance of the Sri Lanka Navy for the year 2023. It
includes a detailed analysis of various aspects of the navy's operations,
including institutional profile, progress and future outlook, financial
performance, performance indicators, and human resources profile.  The report
also includes a chapter on the navy's efforts to contribute to the achievement
of Sustainable Development Goals (SDGs), specifically in the area of human
resource development. The navy has provided internal courses for naval personnel
to improve their skills and attitudes, with a total of 437 trained officers and
5457 trained sailors. Additionally, the navy has trained a significant number of
personnel through external programs, with 584 trained officers and 1576 trained
sailors.  The navy has also conducted a total of 31 exams and tests, 

## Launching the Gradio Interface

In [41]:
def process_question(user_question):
    # Processing and response time measurement here
    iface = gr.Interface(
        fn=process_question,
        inputs=gr.Textbox(lines=2,
                          placeholder="Type your question here..."),
        outputs=gr.Textbox(),
        title="GROQ CHAT",
        description="Ask any question about your document, and get an answer along with the response time.")
    iface.launch(share=False)

In [42]:
user_question = "what is cap?"
process_question(user_question)

* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.


Traceback (most recent call last):
  File "/home/nuwan/.cache/pypoetry/virtualenvs/penman-UgVLv5KY-py3.10/lib/python3.10/site-packages/gradio/queueing.py", line 622, in process_events
    response = await route_utils.call_process_api(
  File "/home/nuwan/.cache/pypoetry/virtualenvs/penman-UgVLv5KY-py3.10/lib/python3.10/site-packages/gradio/route_utils.py", line 323, in call_process_api
    output = await app.get_blocks().process_api(
  File "/home/nuwan/.cache/pypoetry/virtualenvs/penman-UgVLv5KY-py3.10/lib/python3.10/site-packages/gradio/blocks.py", line 2014, in process_api
    result = await self.call_function(
  File "/home/nuwan/.cache/pypoetry/virtualenvs/penman-UgVLv5KY-py3.10/lib/python3.10/site-packages/gradio/blocks.py", line 1567, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
  File "/home/nuwan/.cache/pypoetry/virtualenvs/penman-UgVLv5KY-py3.10/lib/python3.10/site-packages/anyio/to_thread.py", line 56, in run_sync
    return await get_asy

In [2]:
# from langchain_groq import ChatGroq
# from langchain_community import embeddings
# from langchain_core.prompts import ChatPromptTemplate
# from langchain.chains import create_retrieval_chain
# from langchain_core.runnables import RunnablePassthrough
# from langchain_core.output_parsers import StrOutputParser
# # from google.colab import userdata
# # import os
# import time
# import textwrap
# import gradio as gr
# ''' fix the problem with sqllite warning '''
# __import__('pysqlite3')
# import sys
# sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
# # 
# __groq_api_key__ = os.environ.get("GROQ_API_KEY")

# print("All libs loaded!")

NameError: name 'os' is not defined