In [1]:
import os
import fitz

In [233]:
# Path to the dataset folder
DATASET_PATH = r"data/downloaded_pdfs/Dataset"

def extract_text_from_pdfs(folder_path, skip_empty=False):
    """Extract per-page text from every ``.pdf`` file found under ``folder_path``.

    The folder is walked recursively. Each page yields one entry in ``texts``
    (whitespace-stripped) and one entry at the same index in ``metadata``.

    :param folder_path: Root directory to search for PDF files.
    :param skip_empty: When True, pages whose stripped text is empty (for
        example pages with no extractable text layer) are left out of both
        lists. Defaults to False, which keeps one entry per page.
    :return: ``(texts, metadata)`` -- two parallel lists. Each metadata dict has
        the keys ``source`` (full path), ``folder`` (name of the containing
        directory), ``title`` (file name) and ``page`` (1-based page number).
    """
    texts = []
    metadata = []

    for root, _, files in os.walk(folder_path):
        folder_name = os.path.basename(root)
        for file in files:
            if not file.endswith(".pdf"):
                continue

            pdf_path = os.path.join(root, file)
            print(f"Extracting text from {pdf_path}...")

            # Open as a context manager so the file handle is released after
            # each PDF instead of staying open for the life of the kernel.
            with fitz.open(pdf_path) as doc:
                for page_num, page in enumerate(doc, start=1):
                    text = page.get_text().strip()
                    if skip_empty and not text:
                        continue
                    texts.append(text)
                    metadata.append({
                        "source": pdf_path,
                        "folder": folder_name,
                        "title": file,
                        "page": page_num
                    })
    return texts, metadata

In [234]:
documents, metadatas = extract_text_from_pdfs(DATASET_PATH)

Extracting text from data/downloaded_pdfs/Dataset/Services/MAIRDOES/3-CC2024-MAIRDOE-ENROLLMENT.pdf...
Extracting text from data/downloaded_pdfs/Dataset/Services/MAIRDOES/1-CC2024-MAIRDOE-NEW.pdf...
Extracting text from data/downloaded_pdfs/Dataset/Services/MAIRDOES/2-CC2024-MAIRDOE-RENEWAL.pdf...
Extracting text from data/downloaded_pdfs/Dataset/Services/MAIRDOES/4-CC2024-MAIRDOE-STOCK-REPORT.pdf...
Extracting text from data/downloaded_pdfs/Dataset/Services/Driver’s License/12-CC2024-DL-CERTIFICATION.pdf...
Extracting text from data/downloaded_pdfs/Dataset/Services/Driver’s License/15-CC2024-MSC-TRX.pdf...
Extracting text from data/downloaded_pdfs/Dataset/Services/Driver’s License/14-CC2024-DL-CODES.pdf...
Extracting text from data/downloaded_pdfs/Dataset/Services/Driver’s License/1-CC2024-SP.pdf...
Extracting text from data/downloaded_pdfs/Dataset/Services/Driver’s License/FDM-Vol.-1-2nd-Edition.pdf...
Extracting text from data/downloaded_pdfs/Dataset/Services/Driver’s License/13-CC2

In [235]:
docs = documents

In [236]:
len(docs)

3506

In [237]:
from llama_index.core import Document
from llama_index.core.node_parser import TokenTextSplitter

# Wrap each extracted page text in a Document, pairing it with the metadata
# dict stored at the same index.
documents = [
    Document(text=page_text, metadata=metadatas[idx])
    for idx, page_text in enumerate(docs)
]

# Chunk the pages with the token splitter (chunk_size=512, overlap=20, split on spaces).
splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=20, separator=" ")
nodes = splitter.get_nodes_from_documents(documents)

In [247]:
oc = fitz.open('data/downloaded_pdfs/Dataset/Issuances/Republic Act/RA-4136.pdf')
oc[0].get_text()

''

In [243]:
for node in nodes:
    if node.metadata['source']== 'data/downloaded_pdfs/Dataset/Issuances/Republic Act/RA-4136.pdf':
        print(node)

Node ID: 9d79f4dd-c36c-49a2-9ddc-814d96403017
Text:
Node ID: 6c314cf7-be2d-4d5c-8d02-74ebd6d7571a
Text:
Node ID: 0e12206a-52d1-4aa6-8f88-39e19332dd4b
Text:
Node ID: e609eb95-9656-40c9-bf21-88f55951f3f7
Text:
Node ID: 9c907fca-0bdc-470d-9b94-6855faa5489f
Text:
Node ID: 4ca7c75c-7832-4bb2-8fe2-ac6c59c24033
Text:
Node ID: cbc9d961-f5d3-470d-af35-071baf8f6bc4
Text:
Node ID: 2c6bdac7-813a-4864-b0b3-b7d2aa2a7e2a
Text:
Node ID: 6afac422-190a-42a4-b44f-25a09ff3b919
Text:
Node ID: 2b393d80-a5fa-4434-9af6-f7a62337c920
Text:
Node ID: 8b7b5d07-1d13-4411-b36d-6a44157ec811
Text:
Node ID: 68edb3d2-7533-4235-9e50-a2486e4e9f7f
Text:
Node ID: e85a6ec2-6b68-46d7-98f4-fe79d1f37149
Text:
Node ID: da3ccf1c-fcb3-4fab-8317-0b7120877068
Text:
Node ID: 9241b2e2-b846-45bb-a53a-50482e01f6e8
Text:
Node ID: 11112f42-2d33-4672-b68b-9c2c355a7ba1
Text:
Node ID: af170151-4f33-412c-8868-ba414d607859
Text:
Node ID: 385b1dcf-6837-4b53-87c3-179f28ad0f19
Text:


In [8]:
def generate_embeddings(nodes, client, model):
    """Attach an embedding to every node, in place, and return the same list.

    :param nodes: Iterable of objects exposing ``.text``; each one gets an
        ``.embedding`` attribute assigned.
    :param client: Object with an ``embeddings(prompt=..., model=...)`` method
        whose result is indexable by ``"embedding"`` (the Ollama client here).
    :param model: Name of the embedding model to request.
    :return: The ``nodes`` object that was passed in.
    """
    for node in nodes:
        node.embedding = client.embeddings(prompt=node.text, model=model)["embedding"]
    return nodes

In [9]:
from ollama import Client
import numpy as np

client = Client(
  host='http://localhost:11434',
)

# Dense Embeddings

In [10]:
nodes_embed = generate_embeddings(nodes, client, "mxbai-embed-large")

In [11]:
nodes_embed[0].embedding

[0.4905115067958832,
 -0.3361624479293823,
 0.35179951786994934,
 0.27008944749832153,
 0.017550677061080933,
 -0.5324192047119141,
 0.04573739320039749,
 -0.11445765197277069,
 0.1529209315776825,
 0.8597106337547302,
 0.5755346417427063,
 0.45315030217170715,
 -0.45152753591537476,
 0.343028724193573,
 -0.4754539728164673,
 0.5961640477180481,
 0.2048482447862625,
 0.8806662559509277,
 0.15800607204437256,
 -0.18487393856048584,
 -0.5244441032409668,
 0.33951929211616516,
 -0.9454126358032227,
 0.2534672021865845,
 -0.573695182800293,
 0.5495792031288147,
 0.07499080896377563,
 0.17193442583084106,
 1.1673059463500977,
 0.47238942980766296,
 -1.2556650638580322,
 0.07641199231147766,
 -0.126946821808815,
 -0.7730284333229065,
 -0.28147053718566895,
 0.38375645875930786,
 0.6447494029998779,
 -0.7354620695114136,
 0.10551166534423828,
 -0.4301678538322449,
 -0.5080622434616089,
 0.0801476389169693,
 0.2961499094963074,
 -0.12003739178180695,
 -1.2700153589248657,
 -0.5164370536804199,

In [12]:
import numpy as np
import faiss
from llama_index.core.retrievers import BaseRetriever
from llama_index.core.schema import TextNode, NodeWithScore

class FAISSVectorStoreRetriever(BaseRetriever):
    """Retriever that looks up nodes through a prebuilt FAISS index.

    Row ``i`` of the FAISS index must correspond to ``documents[i]``.
    """

    def __init__(self, faiss_index, documents, embeddings):
        """
        Initialize the FAISS retriever.
        :param faiss_index: The FAISS index containing precomputed embeddings.
        :param documents: List of document chunks.
        :param embeddings: Precomputed embeddings corresponding to the document chunks.
        """
        # NOTE(review): BaseRetriever's own initializer was previously skipped;
        # it is called here so inherited attributes exist -- confirm against
        # the installed llama_index version.
        super().__init__()
        self.faiss_index = faiss_index
        self.documents = documents
        self.embeddings = embeddings

    def _retrieve(self, query_embedding, top_k=5):
        """
        Retrieve the top-k nearest neighbors using the FAISS index.
        :param query_embedding: The embedding of the query, shaped as a batch
            of one (only the first row of the search result is read).
        :param top_k: Number of top results to retrieve.
        :return: List of NodeWithScore where a higher score means a closer match.
        """
        distances, indices = self.faiss_index.search(query_embedding, int(top_k))

        # For inner-product indexes the value FAISS returns is already a
        # similarity (larger = closer), so it is used as the score directly.
        # For distance metrics (smaller = closer) it is flipped with 1 - d.
        is_similarity = (
            getattr(self.faiss_index, "metric_type", None) == faiss.METRIC_INNER_PRODUCT
        )

        retrieved_docs = []
        for idx, dist in zip(indices[0], distances[0]):
            if idx == -1:  # FAISS pads with -1 when fewer than top_k hits exist
                continue
            score = float(dist) if is_similarity else 1.0 - float(dist)
            retrieved_docs.append(
                NodeWithScore(node=self.documents[int(idx)], score=score)
            )
        return retrieved_docs

In [13]:
# Stack the per-node embeddings into one 2-D array (one row per node).
embeddings = np.array([np.array(node.embedding) for node in nodes])

# Divide every row by its own norm (unit-length rows).
row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
normalized_embeddings = embeddings / row_norms

In [14]:
normalized_embeddings

array([[ 0.02990641, -0.02049577,  0.02144916, ...,  0.00137737,
        -0.02079806,  0.01479261],
       [ 0.02842975,  0.00793104, -0.00212581, ..., -0.04173804,
        -0.02449265,  0.02197738],
       [-0.00497755,  0.03766523, -0.01356215, ..., -0.02030472,
        -0.02340788, -0.00385419],
       ...,
       [ 0.00414736,  0.01393432,  0.01429416, ...,  0.01305378,
         0.00785751, -0.01734604],
       [-0.02822733,  0.03010949,  0.05656858, ...,  0.04565517,
        -0.00194257,  0.00101682],
       [ 0.00743732, -0.02455021,  0.03684381, ...,  0.02517157,
         0.01407912, -0.01194485]])

In [15]:
# Width of one embedding row = dimensionality of the index.
embedding_dim = normalized_embeddings.shape[1]

# Flat inner-product index, filled with the normalised rows.
faiss_index = faiss.IndexFlatIP(embedding_dim)
faiss_index.add(normalized_embeddings)

In [16]:
retriever = FAISSVectorStoreRetriever(faiss_index, nodes, embeddings)

In [17]:
query = "What do you call a public utility vehicle that is operating with suspended or cancelled CPC?"

In [18]:
# Embed the query with the same model used for the nodes, as a batch of one row.
response = client.embeddings(prompt=query, model="mxbai-embed-large")
query_embedding = np.array([response["embedding"]])

# Scale the row to unit length, matching how the indexed embeddings were normalised.
query_norm = np.linalg.norm(query_embedding, axis=1, keepdims=True)
query_embedding = query_embedding / query_norm

In [19]:
query_embedding

array([[-0.01295556, -0.01045785,  0.00355195, ...,  0.02388491,
         0.05192703, -0.02185194]])

In [20]:
top_k_docs = retriever._retrieve(query_embedding, top_k=15)

In [21]:
# Output retrieved documents
for doc in top_k_docs:
    print(f"Retrieved Document: {doc.node.text}, Score: {doc.score}\n")

Retrieved Document: I' 
cancelled 
CPC 
and 
the 
· Decision/Order. of 
suspension 
.or 
cancellation 
·is 
executory; and 
e.. A 
PUV 
with 
expired CPC and 
without a pending 
application 
for 
extension 
, of ' 
validity . timely ' 
filed before ·:the 
Board. 
, I 
! . 
i i 
! : 
! 
Revocation of ALL CPCs (ehtire fleet) of the 
operator; 
·. Disqualification ofibe -Operator, and, in case of a 
:corporation, alL · its stockholders and directors, to 
operate any kind of public land transportation; 
Blackl!sting of ALL authorized units (entire fleet) of 
the operator from being used as public utility vehicle; 
and 
Revocation of the registration of ALL authorized 
units (e~tire fle~t) of the operator. 
I 
. 
. 
, 
. 
. , 
In 'determining the >frequency of offenses, the L TFRB and its 
R~~Bs will'courit offenses against operators and not cigainst a 
P<3f1icLilar 
motor vehiCle 
or 
CPC. 
Hence, the 
second 
apprehen_sioh CDf a vehicle. belonging to the same operator, 
regafdless ~ of Wh

In [187]:
def summarize_each_chunk(nodes, client, model, query, if_docs=False):
    """Ask the LLM for a query-focused summary of every chunk.

    :param nodes: Items to summarise. With ``if_docs=True`` each item's text is
        read from ``item.text``; otherwise from ``item.node.text``.
    :param client: Object with a ``generate(model=..., prompt=...)`` method
        whose result is indexable by ``'response'``.
    :param model: Name of the generation model.
    :param query: User query the summaries should stay relevant to.
    :param if_docs: Selects where the text is read from (see ``nodes``).
    :return: List of stripped summary strings, one per item, in input order.
    """
    # Collect all texts up front, before any generation request is made.
    texts = [item.text if if_docs else item.node.text for item in nodes]

    def build_prompt(chunk):
        return f"""
        Based on the query: "{query}", summarize the following text in at most one paragraph. 
        Preserve key points that are relevant to the query and remove redundant or unrelated information.
        And lastly do not answer the query itself, just focus on summarization.
        Do not add new information, focus only on the text provided.
        
        Text:
        {chunk}
        
        Summary (relevant to the query):
        """

    summaries = []
    for chunk in texts:
        reply = client.generate(model=model, prompt=build_prompt(chunk))
        summaries.append(reply['response'].strip())
    return summaries

In [23]:
summaries = summarize_each_chunk(top_k_docs, client, model='llama3.2:1b', query=query)

In [24]:
summaries

['The text refers to an action taken by the LTFRB that results in the suspension or cancellation of public utility vehicle (PUV) CPCs for operators who have expired CPCs, without pending applications for extension. The resulting CPC revocation affects not only the operating vehicles but also their registration, and any subsequent vehicles registered under a suspended CPC are considered "second offenses" and subject to further penalties.',
 'The summary can be: A public utility vehicle operating with suspended or cancelled CPC is subject to private/green plate; blacklist, revocation of registration for other units.',
 'The public utility vehicle refers to a group or number of road vehicles operated by private entities, such as operators established for providing transport and logistics services, with or without special permits or authorization.\n\nHere is the summary:\n\nPublic Utility Vehicles are those that operate under suspended or cancelled Public Convenience Permits (PSPs).',
 'He

In [25]:
def generate_response_with_notice(summaries, query, client, model="llama3.2"):
    """Answer ``query`` from the supplied context only, or report insufficiency.

    :param summaries: Iterable of strings; joined with newlines to form the
        context section of the prompt.
    :param query: Question to answer.
    :param client: Object with a ``generate(model=..., prompt=...)`` method
        whose result is indexable by ``'response'``.
    :param model: Name of the generation model.
    :return: The model's reply with surrounding whitespace removed.
    """
    context = "\n".join(summaries)

    # The prompt instructs the model to fall back to a fixed "insufficient
    # information" sentence instead of using outside knowledge.
    prompt = f"""
    Use the following summarized information to answer the query accurately and concisely. 
    DO NOT USE BACKGROUND KNOWLEDGE OUTSIDE THE CONTEXT PROVIDED.
    If the information is not sufficient to fully address the query, respond ONLY with:
    "The available information is insufficient to provide a complete answer to this query."

    Summarized Context:
    {context}
    
    Query:
    {query}
    
    Response:
    """

    reply = client.generate(model=model, prompt=prompt)
    return reply['response'].strip()

In [26]:
generate_response_with_notice(summaries, query, client)
# print(generate_response_with_notice([docs.node.text for docs in top_k_docs], query, client))

'A public utility vehicle operating with suspended or cancelled Community Protection Clause (CPC) is referred to as "colorum".'

# Sparse Retrieval (BM25)

In [27]:
from llama_index.retrievers.bm25 import BM25Retriever
import Stemmer

In [28]:
# Build a BM25 retriever over the same chunk nodes that back the FAISS retriever.
# similarity_top_k is fixed at 5 here, so bm25_retriever.retrieve() presumably
# returns at most 5 nodes regardless of any top_k chosen elsewhere -- TODO confirm.
bm25_retriever = BM25Retriever.from_defaults(
    nodes=nodes,
    similarity_top_k=5,
    stemmer=Stemmer.Stemmer("english"),
    language="english",
)

In [29]:
retrieved_nodes = bm25_retriever.retrieve(query)

In [30]:
# Output retrieved documents
for doc in retrieved_nodes:
    print(f"Retrieved Document: {doc.node.text}, Score: {doc.score}\n")

Retrieved Document: 84 
 
 
TYPE OF VIOLATION 
PENALTIES FOR 
2ND OFFENSE 
 
 
d. A 
PUV 
with 
suspended 
or 
cancelled CPC and 
Decision/Order 
of 
the suspension or 
cancellation 
is 
executory; and 
 
 
e. A PUV with expired 
CPC and without a 
pending application 
for 
extension 
of 
validity timely filed 
before the Board. 
 
 
1. Revocation of ALL CPCs (entire fleet) of 
the operator; 
 
2. Disqualification of the operator, and, in 
case of a corporation, all its stockholders 
and directors, to operate any kind of public 
land transportation; 
 
3. Blacklisting of ALL authorized units (entire 
fleet) of the operator from being used as 
public utility vehicle; and 
 
4. Revocation of the registration of ALL 
authorized units (entire fleet) of the 
operator. 
 
In determining the frequency of offenses, the 
LTFRB and its RFRBs will count offenses 
against operators and not against a particular 
motor vehicle or CPC. Hence, the second 
apprehension of a vehicle belonging to the 
sa

In [31]:
summaries_2 = summarize_each_chunk(retrieved_nodes, client, model='llama3.2:1b', query=query)

In [32]:
# generate_response_with_notice(summaries_2, query, client)
generate_response_with_notice([docs.node.text for docs in retrieved_nodes], query, client)

'A Public Utility Vehicle (PUV) that is operating with a suspended or cancelled Certificate of Public Convenience (CPC) is considered to be "Operating Without Franchise" and also may be subject to penalties as listed under violation 7, including fines ranging from P5,000.00 to P15,000.00, depending on the number of offenses.'

# Hybrid

In [33]:
results = {'faiss': top_k_docs, 'bm25':retrieved_nodes}

In [34]:
from llama_index.core.retrievers import QueryFusionRetriever
# Fuse the FAISS and BM25 result lists with llama_index's reciprocal-rank-fusion
# helper. No QueryFusionRetriever instance is built: the class object itself is
# passed in the ``self`` position of the private method.
# NOTE(review): relies on a private API that presumably never reads ``self`` --
# confirm when upgrading llama_index.
x = QueryFusionRetriever
ranked_results = QueryFusionRetriever._reciprocal_rerank_fusion(x, results)

In [35]:
print(ranked_results[0].text)

84 
 
 
TYPE OF VIOLATION 
PENALTIES FOR 
2ND OFFENSE 
 
 
d. A 
PUV 
with 
suspended 
or 
cancelled CPC and 
Decision/Order 
of 
the suspension or 
cancellation 
is 
executory; and 
 
 
e. A PUV with expired 
CPC and without a 
pending application 
for 
extension 
of 
validity timely filed 
before the Board. 
 
 
1. Revocation of ALL CPCs (entire fleet) of 
the operator; 
 
2. Disqualification of the operator, and, in 
case of a corporation, all its stockholders 
and directors, to operate any kind of public 
land transportation; 
 
3. Blacklisting of ALL authorized units (entire 
fleet) of the operator from being used as 
public utility vehicle; and 
 
4. Revocation of the registration of ALL 
authorized units (entire fleet) of the 
operator. 
 
In determining the frequency of offenses, the 
LTFRB and its RFRBs will count offenses 
against operators and not against a particular 
motor vehicle or CPC. Hence, the second 
apprehension of a vehicle belonging to the 
same operator, regardl

In [36]:
# Output retrieved documents
for doc in ranked_results[:5]:
    print(f"Retrieved Document: {doc.node.text}, Score: {doc.score}\n")

Retrieved Document: 84 
 
 
TYPE OF VIOLATION 
PENALTIES FOR 
2ND OFFENSE 
 
 
d. A 
PUV 
with 
suspended 
or 
cancelled CPC and 
Decision/Order 
of 
the suspension or 
cancellation 
is 
executory; and 
 
 
e. A PUV with expired 
CPC and without a 
pending application 
for 
extension 
of 
validity timely filed 
before the Board. 
 
 
1. Revocation of ALL CPCs (entire fleet) of 
the operator; 
 
2. Disqualification of the operator, and, in 
case of a corporation, all its stockholders 
and directors, to operate any kind of public 
land transportation; 
 
3. Blacklisting of ALL authorized units (entire 
fleet) of the operator from being used as 
public utility vehicle; and 
 
4. Revocation of the registration of ALL 
authorized units (entire fleet) of the 
operator. 
 
In determining the frequency of offenses, the 
LTFRB and its RFRBs will count offenses 
against operators and not against a particular 
motor vehicle or CPC. Hence, the second 
apprehension of a vehicle belonging to the 
sa

In [37]:
ranked_results

[NodeWithScore(node=TextNode(id_='242f37c6-372d-4dd8-9d39-dbb4810183f5', embedding=None, metadata={'source': 'data/downloaded_pdfs/Dataset/Services/Driver’s License/FDM-vol.-2-2nd-Edition.pdf', 'folder': 'Driver’s License', 'title': 'FDM-vol.-2-2nd-Edition.pdf', 'page': 88}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='afc82b49-c914-4b8d-ba7d-9849d58bb9d6', node_type='4', metadata={'source': 'data/downloaded_pdfs/Dataset/Services/Driver’s License/FDM-vol.-2-2nd-Edition.pdf', 'folder': 'Driver’s License', 'title': 'FDM-vol.-2-2nd-Edition.pdf', 'page': 88}, hash='2bf87684384342ab87d5717f621b0505e927d7c4c9aa6a0468dbea6690d40b3d')}, metadata_template='{key}: {value}', metadata_separator='\n', text='84 \n \n \nTYPE OF VIOLATION \nPENALTIES FOR \n2ND OFFENSE \n \n \nd. A \nPUV \nwith \nsuspended \nor \ncancelled CPC and \nDecision/Order \nof \nthe suspension or \ncancellation \nis \nexecutory; and \n \

In [38]:
summaries_3 = summarize_each_chunk(ranked_results[:5], client, model='llama3.2:1b', query=query)

In [39]:
# generate_response_with_notice(summaries_3, query, client)
generate_response_with_notice([docs.node.text for docs in ranked_results[:5]], query, client)

'A Public Utility Vehicle (PUV) that is operating with a Suspended or Cancelled Certificate of Public Convenience (CPC) is referred to as "Colorum". \n\n Colorum refers to a PUV that operates without the proper authority and license from the Land Transportation Franchising and Regulatory Board (LTFRB), such as an expired, suspended, or cancelled CPC.'

In [70]:
import logging
logging.basicConfig(level=logging.INFO)

In [316]:
def gen_query(query, top_k, client, mode='dense', summary=False, model="llama3.2"):
    """Answer ``query`` with retrieval-augmented generation and list the sources used.

    Depends on notebook-level names that must exist before the first call:
    ``retriever`` (FAISS), ``bm25_retriever``, ``QueryFusionRetriever``,
    ``get_document_by_chunk_metadata``, ``remove_duplicate_documents``,
    ``summarize_each_chunk`` and ``generate_response_with_notice``.

    :param query: Question text; also used verbatim as the retrieval query.
    :param top_k: Number of chunks requested from FAISS and kept from the fused
        ranking. The BM25 retriever is queried as configured at construction.
    :param client: Ollama client used for the query embedding and generation.
    :param mode: ``'dense'`` uses the FAISS hits, ``'sparse'`` the BM25 hits,
        any other value the reciprocal-rank fusion of both.
    :param summary: When True, each distinct source page is summarised first
        and the summaries form the context; otherwise the page texts do.
    :param model: Generation model used for the final answer.
    :return: ``(answer, references)`` where ``references`` is a newline-joined
        listing of the chunks that were actually used to build the context.
    """
    # UI sliders may deliver a float; FAISS search and list slicing need an int.
    top_k = int(top_k)

    response = client.embeddings(prompt=query, model="mxbai-embed-large")
    query_embedding = np.array([response["embedding"]])
    query_embedding /= np.linalg.norm(query_embedding, axis=1, keepdims=True)

    top_k_docs = retriever._retrieve(query_embedding, top_k=top_k)
    retrieved_nodes = bm25_retriever.retrieve(query)

    if mode == 'dense':
        print('using FAISS')
        ans_nodes = top_k_docs
    elif mode == 'sparse':
        print('using BM25')
        ans_nodes = retrieved_nodes
    else:
        print('using Hybrid')
        # Fusion is only run when its result is used; the class object stands
        # in for ``self`` of the private helper, as elsewhere in this notebook.
        results = {'faiss': top_k_docs, 'bm25': retrieved_nodes}
        ranked_results = QueryFusionRetriever._reciprocal_rerank_fusion(
            QueryFusionRetriever, results
        )
        ans_nodes = ranked_results[:top_k]

    # Map every chunk back to its full source page; the lookup yields None on a miss.
    source_docs = [get_document_by_chunk_metadata(node) for node in ans_nodes]

    if summary:
        print('using summaries')
        context_nodes = remove_duplicate_documents(
            [doc for doc in source_docs if doc is not None]
        )
        context = summarize_each_chunk(
            context_nodes, client, model='llama3.2:latest', query=query, if_docs=True
        )
    else:
        # Fall back to the chunk's own text when its page is not in the docstore.
        page_texts = [
            doc.text if doc is not None else node.text
            for node, doc in zip(ans_nodes, source_docs)
        ]
        # De-duplicate while keeping retrieval order, so the prompt is
        # deterministic and the best-ranked page comes first.
        context = list(dict.fromkeys(page_texts))

    answer = generate_response_with_notice(context, query, client, model=model)

    # List the chunks that actually fed the answer, whichever mode was chosen.
    references = []
    for i, node in enumerate(ans_nodes, start=1):
        metadata = node.metadata
        source_info = f"Source {i}: {metadata['title']} (Page {metadata['page']}, Folder: {metadata['folder']})"
        references.append(source_info)

    return answer, "\n".join(references)

In [175]:
import gradio as gr

In [201]:
# Gradio interface with slider for top_k
iface = gr.Interface(
    # The slider value is cast to int before it reaches retrieval and slicing.
    fn=lambda query, top_k: gen_query(query=query, top_k=int(top_k), client=client),  # Pass top_k dynamically
    inputs=[
        gr.Textbox(label="Enter your query"),
        gr.Slider(1, 20, value=5, step=1, label="Top K Results")  # Slider for top_k (1 to 20)
    ],
    outputs=[
        gr.Textbox(label="Answer", lines=6),
        gr.Textbox(label="References", lines=10),
    ],
    title="RAG System with LLaMA 3.2",
    description="Ask questions and get answers with references from PDF documents. Adjust Top-K to control the number of retrieved chunks."
)

iface.launch()


def on_close():
    """Stop the Gradio server started by ``iface.launch()``.

    Call this manually when the demo is no longer needed.
    """
    iface.close()

* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.


using FAISS
using FAISS
using FAISS


In [43]:
ranked_results[0].metadata

{'source': 'data/downloaded_pdfs/Dataset/Services/Driver’s License/FDM-vol.-2-2nd-Edition.pdf',
 'folder': 'Driver’s License',
 'title': 'FDM-vol.-2-2nd-Edition.pdf',
 'page': 88}

In [44]:
import pandas as pd

In [45]:
# Load the exam question bank (a CSV file, read with pandas)
file_path = 'data/LTO_EXAM.csv'
df = pd.read_csv(file_path)

# Display the first few rows
print(df.head())



# Few-shot examples
FEW_SHOTS = """
Example 1:
Question: What should you do in case your vehicle breaks down on an expressway? Check all that apply.
A. Open your trunk and hood
B. Stand on the expressway and flag down passing drivers for help
C. Call for help using a mobile phone or an expressway phone
D. Park as far to the right as possible
E. Put your hazard warning light on
Please answer only in letters
Correct Answer: [A, B, C, D, E]

Example 2:
Question: What will happen when your front tire blows out?
A. The back end will sway towards the side of the blowout
B. The back end will sway away from the blowout
C. The front end will pull towards the side of the blowout
D. The front end will pull to the opposite side of the blowout
Please answer only in letters
Correct Answer: [C]

Example 3:
Question: What should you do when an ambulance comes up behind you flashing red lights and/or sounding its siren?
A. Stop as soon as you can
B. Maintain your speed, let the ambulance driver will find a way around you
C. Speed up so that you don't hold the ambulance
D. Pull over to the right and slow down or even stop if necessary
Please answer only in letters
Correct Answer: [D]
"""

# Generate prompts dynamically
def generate_prompt(row):
    """Build the multiple-choice prompt text for one question row.

    :param row: Mapping with a ``'Question'`` entry and choice entries ``'A'``
        through ``'E'``; choices that are NaN/None or the empty string are
        omitted.
    :return: Prompt string made of the question line, one ``"<letter>. <text>"``
        line per available choice, and the answer-format instruction.
        The module-level ``FEW_SHOTS`` text is not included.
    """
    option_lines = [
        f"{letter}. {row[letter]}"
        for letter in ('A', 'B', 'C', 'D', 'E')
        if pd.notna(row[letter]) and row[letter] != ''
    ]

    question_block = f"\nActual Question: {row['Question']}\n" + "\n".join(option_lines)
    answer_format = "\nPlease answer only in letters and put them inside a bracket '[]'. If the question contains the statement 'Check all that apply' then addd comma separator if there are multiple answers ONLY IF ALLOWED."

    return question_block + answer_format

# Apply function to each row
df['Prompt'] = df.apply(generate_prompt, axis=1)

# for i, prompt in enumerate(df['Prompt']):
#     if i >= 3:  # Skip the first three examples
#         print(prompt)
#         print('-' * 50)

                                            Question  \
0  What should you do in case your vehicle breaks...   
1   What will happen when your front tire blows out?   
2  What should you do when an ambulance comes up ...   
3  While driving the hood of your car lifts up bl...   
4  In case of an accident, the first duty of the ...   

                                                   A  \
0                           Open your trunk and hood   
1  The back end will sway towards the side of the...   
2                            Stop as soon as you can   
3  Look through the gap underneath the hood or ou...   
4  pick-up the injured person and take him to the...   

                                                   B  \
0  Stand on the expressway and flag down passing ...   
1       The back end will sway away from the blowout   
2  Maintain your speed, let the ambulance driver ...   
3         Brake suddenly so you don't leave the road   
4        report the accident to the nearest ho

In [249]:
from tqdm import tqdm

In [250]:
qr_range = (23,43)

In [251]:
# Start the answer column empty but with object dtype: the model answers written
# into it later are strings, which a float64 NaN column is not meant to hold.
df["AI"] = pd.Series(np.nan, index=df.index, dtype="object")

In [325]:
# Run the dense-retrieval pipeline on every question row in qr_range; each
# element is the (answer, references) tuple returned by gen_query.
ai_answer = [
    gen_query(df.loc[i,"Prompt"], top_k=20, client=client, mode='dense', model="llama3.1:8b", summary=False)
    for i in tqdm(range(*qr_range))
]

  0%|                                                                                                                                                                                          | 0/20 [00:00<?, ?it/s]

using FAISS


  5%|████████▉                                                                                                                                                                         | 1/20 [00:00<00:15,  1.21it/s]

using FAISS


 10%|█████████████████▊                                                                                                                                                                | 2/20 [00:01<00:14,  1.24it/s]

using FAISS


 15%|██████████████████████████▋                                                                                                                                                       | 3/20 [00:02<00:13,  1.27it/s]

using FAISS


 20%|███████████████████████████████████▌                                                                                                                                              | 4/20 [00:03<00:12,  1.32it/s]

using FAISS


 25%|████████████████████████████████████████████▌                                                                                                                                     | 5/20 [00:03<00:11,  1.32it/s]

using FAISS


 30%|█████████████████████████████████████████████████████▍                                                                                                                            | 6/20 [00:04<00:10,  1.30it/s]

using FAISS


 35%|██████████████████████████████████████████████████████████████▎                                                                                                                   | 7/20 [00:05<00:09,  1.32it/s]

using FAISS


 40%|███████████████████████████████████████████████████████████████████████▏                                                                                                          | 8/20 [00:06<00:09,  1.29it/s]

using FAISS


 45%|████████████████████████████████████████████████████████████████████████████████                                                                                                  | 9/20 [00:06<00:08,  1.36it/s]

using FAISS


 50%|████████████████████████████████████████████████████████████████████████████████████████▌                                                                                        | 10/20 [00:07<00:07,  1.35it/s]

using FAISS


 55%|█████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                               | 11/20 [00:08<00:06,  1.46it/s]

using FAISS


 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                      | 12/20 [00:08<00:05,  1.37it/s]

using FAISS


 65%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                              | 13/20 [00:09<00:05,  1.34it/s]

using FAISS


 70%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                     | 14/20 [00:10<00:04,  1.34it/s]

using FAISS


 75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                            | 15/20 [00:11<00:03,  1.33it/s]

using FAISS


 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                   | 16/20 [00:12<00:03,  1.31it/s]

using FAISS


 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                          | 17/20 [00:12<00:02,  1.32it/s]

using FAISS


 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                 | 18/20 [00:13<00:01,  1.28it/s]

using FAISS


 95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏        | 19/20 [00:14<00:00,  1.27it/s]

using FAISS


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:15<00:00,  1.32it/s]


In [326]:
# .loc label slices include the end label, hence the -1 on the upper bound.
first_row, last_row = qr_range[0], qr_range[1] - 1

# Keep only the answer text from each (answer, references) pair.
df.loc[first_row:last_row, "AI"] = [answer for answer, _references in ai_answer]
df.loc[first_row:last_row, ["Question", "Answer", "AI"]]

Unnamed: 0,Question,Answer,AI
23,"To aobtain one's driver's license, one must be...",B,[C]
24,A pre-trip inspection should be completed:,C,[C]
25,It is not conidered safe driving on an express...,C,[C]
26,When you do not see the wheels of the vehicles...,A,"[A, B]"
27,Your speed while driving at night should keep on:,A,[A]
28,"When driving at night, you should",C,[A]
29,How close should another car be before you dim...,A,[A]
30,"If you are backing up in a straight line, turn...",A,[B]
31,The blind spot is the area to your right or le...,C,[C]
32,"At an intrsection with traffic signals, if you...",A,[A]


In [127]:
print(df.loc[16,"Prompt"])


Actual Question: Which of the following is the maximum speed limit on expressway for cars?
A. 60 kph
B. 80 kph
C. 100 kph
Please answer only in letters and put them inside a bracket '[]'. If the question contains the statement 'Check all that apply' then addd comma separator if there are multiple answers ONLY IF ALLOWED.


In [152]:
# Keep the first 5 chunks of the dense retrieval results
top_nodes = top_k_docs[:5]

# Accumulators for de-duplicating chunks by source document; only the
# commented-out loop below would fill them.
seen_documents = set()
unique_nodes = []

# for node in top_nodes:
#     doc_id = node.metadata['source']  # or 'title' if more appropriate
    
#     if doc_id not in seen_documents:
#         seen_documents.add(doc_id)
#         unique_nodes.append(node)

In [153]:
top_nodes[0].metadata

{'source': 'data/downloaded_pdfs/Dataset/Issuances/Administrative Order/JAO-2014-01.pdf',
 'folder': 'Administrative Order',
 'title': 'JAO-2014-01.pdf',
 'page': 18}

In [156]:
print(top_nodes[0].text)

I' 
cancelled 
CPC 
and 
the 
· Decision/Order. of 
suspension 
.or 
cancellation 
·is 
executory; and 
e.. A 
PUV 
with 
expired CPC and 
without a pending 
application 
for 
extension 
, of ' 
validity . timely ' 
filed before ·:the 
Board. 
, I 
! . 
i i 
! : 
! 
Revocation of ALL CPCs (ehtire fleet) of the 
operator; 
·. Disqualification ofibe -Operator, and, in case of a 
:corporation, alL · its stockholders and directors, to 
operate any kind of public land transportation; 
Blackl!sting of ALL authorized units (entire fleet) of 
the operator from being used as public utility vehicle; 
and 
Revocation of the registration of ALL authorized 
units (e~tire fle~t) of the operator. 
I 
. 
. 
, 
. 
. , 
In 'determining the >frequency of offenses, the L TFRB and its 
R~~Bs will'courit offenses against operators and not cigainst a 
P<3f1icLilar 
motor vehiCle 
or 
CPC. 
Hence, the 
second 
apprehen_sioh CDf a vehicle. belonging to the same operator, 
regafdless ~ of Whether the ' first 
a

In [161]:
# Index every page-level Document by its full metadata. The metadata items are
# turned into a tuple so they can serve as a hashable dict key; if two documents
# share identical metadata, the later one wins.
docstore = {tuple(page_doc.metadata.items()): page_doc for page_doc in documents}

In [170]:
len(docstore)

2096

In [171]:
def get_document_by_chunk_metadata(chunk_node):
    """Look up the stored Document whose metadata matches ``chunk_node``'s.

    :param chunk_node: Object exposing a ``metadata`` dict; its items, in
        order, form the lookup key into the module-level ``docstore``.
    :return: The matching document, or None when the key is not present.
    """
    return docstore.get(tuple(chunk_node.metadata.items()))

In [279]:
doc_top = get_document_by_chunk_metadata(top_nodes[1])

In [280]:
doc_top

Document(id_='86a17bbe-f5fd-4d81-a09d-c1724bc212cd', embedding=None, metadata={'source': 'data/downloaded_pdfs/Dataset/Services/Driver’s License/FDM-vol.-2-2nd-Edition.pdf', 'folder': 'Driver’s License', 'title': 'FDM-vol.-2-2nd-Edition.pdf', 'page': 87}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text=' \n83 \n \nVIOLATIONS IN CONNECTION WITH FRANCHISE \n \nTYPE OF VIOLATION \nTYPE OF \nVEHICLE \nPENALTIES FOR FIRST \n(1ST) OFFENSE \nFINE \nMinimum \nImpounding \nPeriod \n(until) \nSTATUS of CPC \n1. Colorum Violation - A \nmotor \nvehicle \nis \nconsidered operating \nas "colorum" under \nany of the following \ncircumstances: \n \na. A \nprivate \nmotor \nvehicle operating as \na PUV but without \nproper authority from \nthe LTFRB; \n \n \nb. A \nPUV \noperating \noutside \nof \nits \napproved route or \narea without a prior \nper

In [312]:
def remove_duplicate_documents(doc_list):
    """Return the documents from ``doc_list`` with repeated ``doc_id`` values removed.

    The first occurrence of each ``doc_id`` is kept and input order is
    preserved. ``None`` entries are skipped, since a docstore lookup made with
    ``dict.get`` yields None on a miss.

    :param doc_list: Iterable of objects exposing ``doc_id`` (or None).
    :return: New list of unique, non-None documents.
    """
    seen_ids = set()
    unique_docs = []

    for doc in doc_list:
        if doc is None:
            continue
        if doc.doc_id not in seen_ids:
            seen_ids.add(doc.doc_id)
            unique_docs.append(doc)

    return unique_docs

In [308]:
doc_top = get_document_by_chunk_metadata(top_nodes[1])

In [310]:
doc_top.doc_id

'86a17bbe-f5fd-4d81-a09d-c1724bc212cd'

In [314]:
remove_duplicate_documents([get_document_by_chunk_metadata(top_nodes[1]) for i in range(2)])

[Document(id_='86a17bbe-f5fd-4d81-a09d-c1724bc212cd', embedding=None, metadata={'source': 'data/downloaded_pdfs/Dataset/Services/Driver’s License/FDM-vol.-2-2nd-Edition.pdf', 'folder': 'Driver’s License', 'title': 'FDM-vol.-2-2nd-Edition.pdf', 'page': 87}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text=' \n83 \n \nVIOLATIONS IN CONNECTION WITH FRANCHISE \n \nTYPE OF VIOLATION \nTYPE OF \nVEHICLE \nPENALTIES FOR FIRST \n(1ST) OFFENSE \nFINE \nMinimum \nImpounding \nPeriod \n(until) \nSTATUS of CPC \n1. Colorum Violation - A \nmotor \nvehicle \nis \nconsidered operating \nas "colorum" under \nany of the following \ncircumstances: \n \na. A \nprivate \nmotor \nvehicle operating as \na PUV but without \nproper authority from \nthe LTFRB; \n \n \nb. A \nPUV \noperating \noutside \nof \nits \napproved route or \narea without a prior \npe