In [1]:
import os
import fitz
import re

from ollama import Client
import faiss
import pandas as pd
import numpy as np
import Stemmer
from tqdm import tqdm
import gradio as gr

from llama_index.core import Document
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.retrievers import BaseRetriever, QueryFusionRetriever
from llama_index.core.schema import TextNode, NodeWithScore
from llama_index.retrievers.bm25 import BM25Retriever

# Connect to Ollama Server

In [2]:
client = Client(
  host='http://localhost:11434',
)

# Ingestion

In [3]:
# Path to the dataset folder
DATASET_PATH = '/mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2'

def extract_text_from_pdfs(folder_path):
    texts = []
    metadata = []
    
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".pdf"):
                pdf_path = os.path.join(root, file)
                folder_name = os.path.basename(root)
                print(f"Extracting text from {pdf_path}...")
                
                doc = fitz.open(pdf_path)
                for page_num, page in enumerate(doc, start=1):
                    text = page.get_text()
                    if text.strip():
                        texts.append(text.strip())
                        metadata.append({
                            "source": pdf_path,
                            "folder": folder_name,
                            "title": file,
                            "page": page_num
                        })
                    else:
                        print(f"WARNING: {file} page {page_num} not processed...")
    return texts, metadata

In [4]:
docs, metadatas = extract_text_from_pdfs(DATASET_PATH)

Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/1-CC2024-INITIAL-REG.pdf...
Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/1-CC2024-MAIRDOE-NEW.pdf...
Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/1-CC2024-SETTLEMENT.pdf...
Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/1-CC2024-SP.pdf...
Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/10-CC2024-DL-CC-P-NP.pdf...
Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/10-CC2024-MV-CONDUCTION-STICKER.pdf...
Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/10-CC2024-RELEASING.pdf...
Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/11-CC2024-DL-ENHANCE.pdf...
Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/11-CC2024-MV-CONDUCTION-VERIFICATION.pdf...
Extracting t

In [5]:
documents = [Document(text=docs[t], metadata=metadatas[t]) for t in range(len(docs))]
splitter = TokenTextSplitter(
    chunk_size=512,
    chunk_overlap=20,
    separator=" ",
)
nodes = splitter.get_nodes_from_documents(documents)

# Embedding and Retrieval

## Dense via FAISS

In [6]:
def generate_embeddings(nodes, client, model):
    # Generate embeddings for documents using Ollama
    for doc in tqdm(nodes):
        response = client.embeddings(prompt=doc.text, model=model)
        doc.embedding = response["embedding"]
    return nodes

In [7]:
class FaissIndexer:
    """
    Faiss-based indexer for efficient similarity search using inner-product (cosine) similarity.

    This class handles the creation and management of a FAISS index from node embeddings.
    
    :ivar faiss_index: The FAISS index for storing and querying embeddings.
    :vartype faiss_index: faiss.IndexFlatIP
    :ivar embedding_dim: Dimensionality of the embeddings.
    :vartype embedding_dim: int
    """

    def __init__(self):
        """
        Initialize the FaissIndexer class.

        :ivar faiss_index: The FAISS index, initialized as None.
        :ivar embedding_dim: The dimension of embeddings, initialized as None.
        """
        self.faiss_index = None
        self.embedding_dim = None

    def normalize_embeddings(self, embeddings):
        """
        Normalize embeddings to have unit L2 norm.

        :param embeddings: Array of embeddings to normalize.
        :type embeddings: np.ndarray
        :return: Normalized embeddings.
        :rtype: np.ndarray
        """
        return embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

    def build_index(self, nodes):
        """
        Build the FAISS index from a list of nodes containing embeddings.

        :param nodes: List of nodes, where each node contains an `embedding` attribute.
        :type nodes: list
        :raises ValueError: If the nodes list is empty or embeddings are inconsistent.
        """
        if not nodes:
            raise ValueError("Nodes list cannot be empty.")
        
        embeddings = np.array([np.array(node.embedding) for node in nodes])
        normalized_embeddings = self.normalize_embeddings(embeddings)

        self.embedding_dim = normalized_embeddings[0].shape[0]
        self.faiss_index = faiss.IndexFlatIP(self.embedding_dim)  # Inner-product similarity
        self.faiss_index.add(normalized_embeddings)

    def get_index(self):
        """
        Get the FAISS index instance.

        :return: The FAISS index used for similarity search.
        :rtype: faiss.IndexFlatIP
        :raises ValueError: If the index has not been built.
        """
        if self.faiss_index is None:
            raise ValueError("Index has not been built yet. Call 'build_index' first.")
        return self.faiss_index

In [8]:
class FAISSVectorStoreRetriever(BaseRetriever):
    def __init__(self, faiss_index, documents):
        """
        Initialize the FAISS retriever.
        :param faiss_index: The FAISS index containing precomputed embeddings.
        :param documents: List of document chunks.
        :param embeddings: Precomputed embeddings corresponding to the document chunks.
        """
        self.faiss_index = faiss_index
        self.documents = documents

    def _retrieve(self, query_embedding, top_k=5):
        """
        Retrieve the top-k nearest neighbors using the FAISS index.
        :param query_embedding: The embedding of the query.
        :param top_k: Number of top results to retrieve.
        """

        norm_query_embedding = np.array([query_embedding])
        norm_query_embedding /= np.linalg.norm(norm_query_embedding, axis=1, keepdims=True)

        distances, indices = self.faiss_index.search(norm_query_embedding, top_k)
        retrieved_docs = [
            NodeWithScore(node=self.documents[idx], score=1 - dist)
            for idx, dist in zip(indices[0], distances[0])
            if idx != -1
        ]
        return retrieved_docs

In [9]:
# embedding
nodes_embed = generate_embeddings(nodes, client, "mxbai-embed-large")

  0%|          | 0/2592 [00:00<?, ?it/s]

100%|██████████| 2592/2592 [02:22<00:00, 18.24it/s]


In [10]:
#indexing
index = FaissIndexer()
index.build_index(nodes_embed)
faiss_index = index.get_index()

faiss_retriever = FAISSVectorStoreRetriever(faiss_index=faiss_index,documents=nodes_embed)

## Sparse Embedding via BM25

In [11]:
# bm25_retriever = BM25Retriever.from_defaults(
#     nodes=nodes,
#     similarity_top_k=5,
#     stemmer=Stemmer.Stemmer("english"),
#     language="english",
# )

## Hybrid Retrieval via Reciprocal Rank

In [12]:
def hybrid_embedding(results: dict, top_k: int):
    x = QueryFusionRetriever
    ranked_results = QueryFusionRetriever._reciprocal_rerank_fusion(x, results)
    return ranked_results[:top_k]

# Post Retrieval

## Summarization

In [13]:
def summarize_each_chunk(nodes, client, query, model="llama3.2", parent=False):
    if parent:
        chunks = [doc.text for doc in nodes]
    else:
        chunks = [doc.node.text for doc in nodes]
    summaries = []
    
    for i, chunk in enumerate(chunks):
        prompt = f"""
        Summarize the following text in one concise paragraph, focusing on key points relevant to the query: "{query}".
        
        - Emphasize information directly related to the query.
        - Exclude unrelated, redundant, or speculative details.
        - Do NOT introduce new information or answer the query itself. 
        
        Text:
        {chunk}
        
        Summary:
        """
        
        response = client.generate(model=model, prompt=prompt)
        summary = response['response'].strip()
        summaries.append(summary)

    return summaries

# Generation

In [14]:


def generate_response_with_notice(query, choices, query_context, subquery_contexts, client, model="llama3.2"):
    # Combine summaries into context block
    context = "\n".join(query_context)
    
    # Create prompt to answer based on summarized text
    prompt = f"""
    Use the following summarized information to answer the query accurately and concisely. 
    DO NOT USE BACKGROUND KNOWLEDGE OUTSIDE THE CONTEXT PROVIDED.
    If the information is not sufficient to fully address the query, respond ONLY with:
    "The available information is insufficient to provide a complete answer to this query."

    Query Context:
    {context}

    Subquery Context:
    {subquery_contexts}
    
    Query:
    {query}
    {choices}
    
    \nPlease answer only in letters and put them inside a bracket '[]'. If the question contains the statement 'Check all that apply' then add comma separator if there are multiple answers ONLY IF ALLOWED.
    Response:
    """
    
    # Send the prompt to Ollama
    response = client.generate(
        model=model,
        prompt=prompt
    )
    
    return response['response'].strip()

# Querying

## Query Transforms

In [15]:
import re
import json

prompt = f"""
    You are an expert assistant. Below are examples of how to decompose a query
    into three sub-queries.Finally, place these into an array.

    Example #1:
    USER: "Why is the sky blue?"
    SUB-QUERY 1: "Explain Rayleigh scattering"
    SUB-QUERY 2: "How light interacts with molecules?"
    SUB-QUERY 3: "How human eyes perceive color?"

    FINAL ANSWER (array):
    ["Explain Rayleigh scattering", "How light interacts with molecules?", "How human eyes perceive color?"]

    Now your turn:
    USER: "{"What are the requirements to get a license?"}"

    FINAL ANSWER (array):
    """

response_text = client.generate(model="llama3.2", prompt=prompt)
print(response_text.response)

# Extract sub-queries using regex
matches = re.findall(r'SUB-QUERY \d+: "(.*?)"', response_text.response)

# Optional: If FINAL ANSWER is already formatted as an array, extract directly
if not matches:
    array_match = re.search(r'\[(.*?)\]', response_text)
    if array_match:
        matches = [item.strip().strip('"') for item in array_match.group(1).split(',')]
    
print(matches)

SUB-QUERY 1: "What are the basic educational requirements for getting a driver's license?"
SUB-QUERY 2: "How old must you be to obtain a driver's license?"
SUB-QUERY 3: "What is the process of passing a driving test?"

FINAL ANSWER (array):
["What are the basic educational requirements for getting a driver's license?", "How old must you be to obtain a driver's license?", "What is the process of passing a driving test?"]
["What are the basic educational requirements for getting a driver's license?", "How old must you be to obtain a driver's license?", 'What is the process of passing a driving test?']


## Query Generation

In [16]:
docstore = {}

# Store documents using full metadata as the key
for doc in documents:
    key = tuple(doc.metadata.items())  # Convert metadata to tuple for hashable key
    docstore[key] = doc

In [17]:
def get_document_by_chunk_metadata(chunk_node):
    # Convert chunk metadata to tuple for matching
    metadata_key = tuple(chunk_node.metadata.items())

    # Retrieve document from docstore
    document = docstore.get(metadata_key)
    return document

In [18]:
def remove_duplicate_documents(doc_list):
    seen_ids = set()
    unique_docs = []

    for doc in doc_list:
        if doc.doc_id not in seen_ids:
            seen_ids.add(doc.doc_id)
            unique_docs.append(doc)

    return unique_docs

In [19]:
import json

def context_retrieval(text, top_k, mode, summary, chunks_only, references):
    embed_result = client.embeddings(prompt=text, model="mxbai-embed-large")
    # Return both the parsed sub-queries/answers and the embedding vector
    query_embedding = embed_result["embedding"]

    top_k_docs = faiss_retriever._retrieve(query_embedding, top_k=top_k)

    bm25_retriever = BM25Retriever.from_defaults(
    nodes=nodes,
    similarity_top_k=top_k,
    stemmer=Stemmer.Stemmer("english"),
    language="english",
    )
    retrieved_nodes = bm25_retriever.retrieve(text)

    results = {'faiss': top_k_docs, 'bm25':retrieved_nodes}
    ranked_results = hybrid_embedding(results, top_k=top_k)

    if mode == 'dense':
        print('using FAISS')
        ans_nodes =top_k_docs
    elif mode == 'sparse':
        print('using BM25')
        ans_nodes = retrieved_nodes
    else:
        print('using Hybrid')
        ans_nodes = ranked_results

    parent_flag = True
    if chunks_only:
        parent_flag = False
        print('using chunks only')
        context = [docs.node.text for docs in ans_nodes]
        return context
        
    if summary:
        print('using summaries')
        context_nodes = remove_duplicate_documents([get_document_by_chunk_metadata(docs) for docs in ans_nodes])

        if chunks_only:
            context_nodes=ans_nodes
        summaries = summarize_each_chunk(context_nodes, client, model='llama3.2:latest', query=text,parent=parent_flag)
        context = summaries
        return context
    
    if references:
        references = []
        for i, doc in enumerate(ranked_results[:top_k], start=1):
            metadata = doc.metadata
            source_info = f"Source {i}: {metadata['title']} (Page {metadata['page']}, Folder: {metadata['folder']})"
            references.append(source_info)
        return "\n".join(references)
    
    context = set([get_document_by_chunk_metadata(docs).text for docs in ans_nodes])
    return context


def gen_query(query, choices, top_k, client, mode='dense', summary=False, model="llama3.2", chunks_only=False):
    prompt = f"""
    You are an expert assistant. Below are examples of how to decompose a query
    into three sub-queries.Finally, place these into an array.

    Example #1:
    USER: "Why is the sky blue?"
    SUB-QUERY 1: "Explain Rayleigh scattering"
    SUB-QUERY 2: "How light interacts with molecules?"
    SUB-QUERY 3: "How human eyes perceive color?"

    FINAL ANSWER (array):
    ["Explain Rayleigh scattering", "How light interacts with molecules?", "How human eyes perceive color?"]

    Now your turn:
    USER: "{query}"

    FINAL ANSWER (array):
    """
    response_text = client.generate(model="llama3.2", prompt=prompt)
    print(response_text.response)

    # Extract sub-queries using regex
    matches = re.findall(r'SUB-QUERY \d+: "(.*?)"', response_text.response)

    # Optional: If FINAL ANSWER is already formatted as an array, extract directly
    if not matches:
        array_match = re.search(r'\[(.*?)\]', response_text)
        if array_match:
            matches = [item.strip().strip('"') for item in array_match.group(1).split(',')]
    subqueries = matches
    print(matches)    

    query_context = context_retrieval(query, top_k, mode, summary, chunks_only, references = False)

    subquery_answers=[]
    for subquery in subqueries:
        context = "\n".join(context_retrieval(subquery,top_k, mode, summary, chunks_only, references = False))
        prompt = f"""
        You are an expert assistant. Your task is to provide concise answer on the query based on the provided CONTEXT. The CONTEXT serves as reference material or necessary background information to ensure precise and informative answers.
        Query: {subquery}
        Context: {context}
        Final Answer:
        """
        response = client.generate(model="llama3.2", prompt=prompt)
        subquery_answers.append(response.response)
    
    print(subquery_answers)
    responses_json = json.dumps([
        {
            "sub_query": subqueries[0],
            "answer": subquery_answers[0]
        },
        {
            "sub_query": subqueries[1],
            "answer": subquery_answers[1]
        },
        {
            "sub_query": subqueries[2],
            "answer": subquery_answers[2]
        }
    ], indent=4) 
    print(responses_json)
    subquery_contexts = json.dumps(json.loads(responses_json))
    answer = generate_response_with_notice(query, choices, query_context, subquery_contexts, client, model=model)
    references = context_retrieval(query,top_k, mode, summary, chunks_only, references = True)


    return answer, references

# Evaluation

In [20]:
# Generate prompts dynamically
def generate_choices(row):
    options = []
    for choice in ['A', 'B', 'C', 'D', 'E']:
        # Check for NaN or blank values
        if pd.notna(row[choice]) and row[choice] != '':
            options.append(f"{choice}. {row[choice]}")
    
    # Construct the prompt with few-shot examples
    choices = "\n".join(options)
    
    return choices

In [21]:
from sklearn.model_selection import train_test_split
# Load the Excel file
file_path = '/mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/LTO_EXAM.csv'
df = pd.read_csv(file_path)
df['Choices'] = df.apply(generate_choices, axis=1)
# Split the data into test (80%) and holdout validation (20%)
test_df, holdout_df = train_test_split(df, test_size=0.8, random_state=42)

# Display the first few rows of each set
print("Testing Data:")
display(test_df.head())

print("\nHoldout Validation Data:")
display(holdout_df.head())

test_df.to_csv('/mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/test_data.csv', index=False)
holdout_df.to_csv('/mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/holdout_data.csv', index=False)

Testing Data:


Unnamed: 0,Question,A,B,C,D,E,Answer,Choices
59,"When driving on the highway at night, you shou...",another driver dims his lights,blinded by the headlights of an approaching ve...,all of the above,,,C,A. another driver dims his lights\nB. blinded ...
63,The safest thing to do even if you have the ri...,don't force your rights,horn,force your rights,,,A,A. don't force your rights\nB. horn\nC. force ...
78,It shall mean that the LEO has reasonable grou...,improbable cause,probable cause,likely cause,,,B,A. improbable cause\nB. probable cause\nC. lik...
37,"You are preparing to exit an expressway, when ...",Immediately before entering the declaration lane,immediately upon entering the declaration lane,immediately upon spotting the declaration lane,,,B,A. Immediately before entering the declaration...
29,How close should another car be before you dim...,150 meter,100 meter,200 meters,,,A,A. 150 meter\nB. 100 meter\nC. 200 meters



Holdout Validation Data:


Unnamed: 0,Question,A,B,C,D,E,Answer,Choices
40,What light shall be used when vehicles are par...,Headlight,Parking lights or lower-beam headlights,Signal lights,,,B,A. Headlight\nB. Parking lights or lower-beam ...
22,To have one's driver's license suspended means...,have it revalidated by the LTO,have it taken away premanently by the LTO,have it taken temporarily by the LTO,,,C,A. have it revalidated by the LTO\nB. have it ...
55,"On a two-lane road, overtaking is only allowed...",left lane,both right and left lane,right lane,,,A,A. left lane\nB. both right and left lane\nC. ...
88,The driver is using a motor vehicle in committ...,revokes and will pay a fine,confiscated and will pay fine,suspended and will pay fine,,,A,A. revokes and will pay a fine\nB. confiscated...
0,What should you do in case your vehicle breaks...,Open your trunk and hood,Stand on the expressway and flag down passing ...,Call for help using a mobile phone or an expre...,Park as far to the right as possible,Put your hazard warning light on,"A, C, D, E",A. Open your trunk and hood\nB. Stand on the e...


In [22]:
df = test_df.copy()
df["AI"] = np.nan
ai_answer = []

for i in tqdm(range(len(df))):
    answ = gen_query(df["Question"].iloc[i], df["Choices"].iloc[i], top_k=15, client=client, mode='hybrid', model="llama3.1:8b")
    ai_answer.append(answ[0])  # Get the first answer

df["AI"] = ai_answer
print("Updated Testing Data with AI Answers:")
display(df.head())

  0%|          | 0/18 [00:00<?, ?it/s]

SUB-QUERY 1: "What is the purpose of using low beam headlights?"

SUB-QUERY 2: "How do dim lights reduce glare from oncoming traffic?"

SUB-QUERY 3: "Are there any specific speed limits or safe distances for following other vehicles when using low beams at night?"

FINAL ANSWER (array):
["What is the purpose of using low beam headlights?", "How do dim lights reduce glare from oncoming traffic?", "Are there any specific speed limits or safe distances for following other vehicles when using low beams at night?"]
['What is the purpose of using low beam headlights?', 'How do dim lights reduce glare from oncoming traffic?', 'Are there any specific speed limits or safe distances for following other vehicles when using low beams at night?']
using Hybrid
using Hybrid
using Hybrid
using Hybrid
['This is not a typical Q&A format question, but rather a collection of driving rules and regulations from Republic Act No. 11698 - Vintage Vehicle Regulation Act, Republic Act No. 4136, and other sources

  6%|▌         | 1/18 [00:16<04:37, 16.33s/it]

using Hybrid
USER: "The safest thing to do even if you have the rights of using the road is:"

SUB-QUERY 1: "Understanding traffic laws and regulations"
SUB-QUERY 2: "Maintaining vehicle safety features"
SUB-QUERY 3: "Practicing defensive driving techniques"

FINAL ANSWER (array):
["Understanding traffic laws and regulations", "Maintaining vehicle safety features", "Practicing defensive driving techniques"]
['Understanding traffic laws and regulations', 'Maintaining vehicle safety features', 'Practicing defensive driving techniques']
using Hybrid
using Hybrid
using Hybrid
using Hybrid
["It seems you provided a lengthy text about driving manuals and traffic laws in the Philippines, but didn't ask a specific question that requires a numerical answer. However, I'll attempt to extract some relevant information from the text.\n\nIf you're looking for answers related to driving skills or traffic rules, here are a few points:\n\n1. The manual covers Licensing Information, Getting Ready to Dri

 11%|█         | 2/18 [00:33<04:28, 16.77s/it]

using Hybrid
Based on the given examples, I will decompose the query into three sub-queries as follows:

USER: "It shall mean that the LEO has reasonable ground to believe that the person driving the motor vehicle is under the influence of alcohol, dangerous drugs and/or other similar substances upon personally witnessing a traffic offense committed."

SUB-QUERY 1: "What are similar substances to alcohol and drugs?"
 
SUB-QUERY 2: "How do LEOs determine if a driver is under the influence?"

SUB-QUERY 3: "What traffic offenses qualify as reasonable grounds for suspicion of DUI?"

FINAL ANSWER (array):
["What are similar substances to alcohol and drugs?", "How do LEOs determine if a driver is under the influence?", "What traffic offenses qualify as reasonable grounds for suspicion of DUI?"]
['What are similar substances to alcohol and drugs?', 'How do LEOs determine if a driver is under the influence?', 'What traffic offenses qualify as reasonable grounds for suspicion of DUI?']
using Hy

 17%|█▋        | 3/18 [00:54<04:38, 18.58s/it]

using Hybrid
SUB-QUERY 1: "What is the general rule for slowing down before exiting an expressway?"
SUB-QUERY 2: "At what distance should the driver slow down to prepare to exit the expressway?"
SUB-QUERY 3: "According to traffic laws, at what speed should the driver be traveling when approaching the exit?"

FINAL ANSWER (array):
["What is the general rule for slowing down before exiting an expressway?", "At what distance should the driver slow down to prepare to exit the expressway?", "According to traffic laws, at what speed should the driver be traveling when approaching the exit?"]
['What is the general rule for slowing down before exiting an expressway?', 'At what distance should the driver slow down to prepare to exit the expressway?', 'According to traffic laws, at what speed should the driver be traveling when approaching the exit?']
using Hybrid
using Hybrid
using Hybrid
using Hybrid
[
    {
        "sub_query": "What is the general rule for slowing down before exiting an expr

 22%|██▏       | 4/18 [01:11<04:16, 18.29s/it]

using Hybrid
SUB-QUERY 1: "What is the standard distance for tailgating?"
SUB-QUERY 2: "How does the visibility of other vehicles affect driving conditions?"
SUB-QUERY 3: "What are the specific guidelines for dimming headlights at night?"

FINAL ANSWER (array):
["What is the standard distance for tailgating?", "How does the visibility of other vehicles affect driving conditions?", "What are the specific guidelines for dimming headlights at night?"]
['What is the standard distance for tailgating?', 'How does the visibility of other vehicles affect driving conditions?', 'What are the specific guidelines for dimming headlights at night?']
using Hybrid
using Hybrid
using Hybrid
using Hybrid
["It seems like there are multiple questions hidden in the text. I'll provide answers to each of them:\n\n1. What color is the headlight of a motor vehicle?\nAnswer: White or yellowish white (Sec 34c)\n\n2. What is the allowable height of top clearance lights?\nAnswer: 10 cms (Sec 34c)\n\n3. What shall 

 28%|██▊       | 5/18 [01:32<04:06, 18.97s/it]

using Hybrid
SUB-QUERY 1: "What is the physics behind a tire blowout?"
SUB-QUERY 2: "How does air pressure affect vehicle safety?"
SUB-QUERY 3: "What emergency procedures should I follow in case of a tire blowout?"

FINAL ANSWER (array):
["What is the physics behind a tire blowout?", "How does air pressure affect vehicle safety?", "What emergency procedures should I follow in case of a tire blowout?"]
['What is the physics behind a tire blowout?', 'How does air pressure affect vehicle safety?', 'What emergency procedures should I follow in case of a tire blowout?']
using Hybrid
using Hybrid
using Hybrid
using Hybrid
['There is no problem or question provided for me to solve. The text appears to be a sample of a driving school course outline, covering various topics such as vehicle maintenance, emergency handling, and written examination.\n\nIf you could provide a specific problem or question, I would be happy to assist you in solving it.', 'The text appears to be a collection of tips a

 33%|███▎      | 6/18 [01:47<03:34, 17.90s/it]

using Hybrid
SUB-QUERY 1: "What are the specific speed limits for parking violations?"
SUB-QUERY 2: "How do traffic laws define a 'parking violation'?"
SUB-QUERY 3: "Are there any exceptions to parking speed limit rules?"

FINAL ANSWER (array):
["What are the specific speed limits for parking violations?", "How do traffic laws define a 'parking violation'?", "Are there any exceptions to parking speed limit rules?"]
['What are the specific speed limits for parking violations?', "How do traffic laws define a 'parking violation'?", 'Are there any exceptions to parking speed limit rules?']
using Hybrid
using Hybrid
using Hybrid
using Hybrid
['The final answer is not explicitly stated in the provided text, but based on the content, it appears that the main goal of Republic Act No. 10916, also known as the "Road Speed Limiter Act of 2016", is to ensure the safety and security of road users by mandating the use of standard speed limiters in all motor vehicles.\n\nThe Implementing Rules and Re

 39%|███▉      | 7/18 [02:05<03:17, 17.94s/it]

using Hybrid
To decompose the given query into three sub-queries and place them in an array, we can follow these steps:

1. Identify the main question: "To avoid suspension or revocation, how many days must a driver with an apprehended license settle his case with LTO?"
2. Break down the main question into smaller, more manageable parts:
   - What is the legal definition of suspension and revocation?
   - How long do drivers with an apprehended license need to settle their cases with LTO?
   - What are the specific requirements for settling a case with LTO?

3. Decompose each part further:
   - SUB-QUERY 1: "What is the legal definition of suspension and revocation?"
      * Definition of terms like 'suspension', 'revocation'
      * Relevant laws or regulations
   - SUB-QUERY 2: "How long do drivers with an apprehended license need to settle their cases with LTO?"
      * Timeframes specified in relevant laws or regulations
      * Cases involving apprehended licenses and LTO
   - SUB

 44%|████▍     | 8/18 [02:27<03:11, 19.11s/it]

using Hybrid
SUB-QUERY 1: "What is the purpose of the red lights on an ambulance?"
SUB-QUERY 2: "What does the sound of a siren indicate to drivers?"
SUB-QUERY 3: "Should I move to the side of the road or pull over when an ambulance approaches?"

FINAL ANSWER (array):
["What is the purpose of the red lights on an ambulance?", "What does the sound of a siren indicate to drivers?", "Should I move to the side of the road or pull over when an ambulance approaches?"]
['What is the purpose of the red lights on an ambulance?', 'What does the sound of a siren indicate to drivers?', 'Should I move to the side of the road or pull over when an ambulance approaches?']
using Hybrid
using Hybrid
using Hybrid
using Hybrid
["It appears that you provided a large amount of text, but no specific question or prompt for me to answer. However, I can try to extract some information from the text and provide answers based on the content.\n\nHere are a few possible questions and answers:\n\n**Q: What is the al

 50%|█████     | 9/18 [02:43<02:43, 18.21s/it]

using Hybrid
To decompose the query into sub-queries, we can break it down as follows:

USER: "To obtain one's driver's license, one must be at least X years old."

SUB-QUERY 1: "What is the typical age requirement for obtaining a driver's license in most countries?"
SUB-QUERY 2: "How does the law define a 'minor' or 'young person' in relation to licensing requirements?"
SUB-QUERY 3: "What are the consequences of holding back from granting a driver's license until one reaches a certain age?"

FINAL ANSWER (array):
["Typical age requirement for obtaining a driver's license", "Definition of a minor/young person in licensing", "Consequences of holding back on licensing"]

Note that these sub-queries are not exhaustive and can be further decomposed or expanded upon as needed.
["What is the typical age requirement for obtaining a driver's license in most countries?", "How does the law define a 'minor' or 'young person' in relation to licensing requirements?", "What are the consequences of h

 56%|█████▌    | 10/18 [03:02<02:25, 18.21s/it]

using Hybrid
Here's an example of how to decompose the query into three sub-queries:

USER: "Driving with a fake license is prohibited and is punishable by:"

SUB-QUERY 1: "What are the laws regarding fake licenses?"
SUB-QUERY 2: "What are the penalties for driving with a fake license?"
SUB-QUERY 3: "How can one obtain a valid driver's license?"

FINAL ANSWER (array):
["What are the laws regarding fake licenses?", "What are the penalties for driving with a fake license?", "How can one obtain a valid driver's license?"]
['What are the laws regarding fake licenses?', 'What are the penalties for driving with a fake license?', "How can one obtain a valid driver's license?"]
using Hybrid
using Hybrid
using Hybrid
using Hybrid
["Based on the provided text, it appears that the question is asking for general information about driving rules and regulations in the Philippines, rather than a specific answer to a problem or question.\n\nHowever, I can provide some answers based on the text:\n\n1. 

 61%|██████    | 11/18 [03:21<02:09, 18.44s/it]

using Hybrid
To answer the user's question, we can decompose it into three sub-queries:

SUB-QUERY 1: "What is a full stop?"
SUB- Quarry 2: "Where is used in writing to indicate the end of a sentence?"
SUB-QUERY 3: "Why is punctuation important in language?"

The final answer would be an array containing these sub-queries:

["What is a full stop?", "Where is used in writing to indicate the end of a sentence?", "Why is punctuation important in language?"]
['What is a full stop?', 'Why is punctuation important in language?']
using Hybrid
using Hybrid
using Hybrid


 61%|██████    | 11/18 [03:32<02:15, 19.35s/it]

["It appears to be a driving test score sheet and a driving skills evaluation form. I will provide an analysis of the content.\n\n**Driving Test Score Sheet:**\n\nThe driving test score sheet provides information about the applicant's vehicle, license category, and any demerit points accumulated during the test. The sheet also includes sections for observing traffic rules, exercising care for pedestrians, and controlling vehicles/trailers.\n\n**Driving Skills Evaluation Form:**\n\nThe driving skills evaluation form assesses the applicant's driving skills in various areas, including:\n\n1. Pre-driving check-up (e.g., checking tires and batteries)\n2. Observance to traffic rules (e.g., adjusting mirrors, using seatbelt, and observing speed limits)\n3. Driving skills (e.g., position of hands, anticipating before signaling, smoothness, and use of gears)\n\n**Recommendations:**\n\nThe form includes sections for comments/recommendations, recommended vehicle category, and the licensing center




IndexError: list index out of range

In [None]:
import re


def process_answers(answers):
    formatted_answers = []
    
    for a in answers:
        
        matches = re.findall(r'\[?\s*([A-E](?:\s*,\s*[A-E])*)\s*\]?', str(a)) # Extract answers like [A, C, D] or [A] or [B, D]
        answers = []
        for match in matches:
            answers.extend(re.split(r'\s*,\s*', match))  # Split by comma and remove spaces
        unique_sorted_answers = sorted(set(answers), key=lambda x: ['A', 'B', 'C', 'D', 'E'].index(x))
        if not unique_sorted_answers:
            formatted_answers.append(None)
        else:
            formatted_answers.append(unique_sorted_answers)
    return formatted_answers

df_results = df[["Question", "Answer", "AI"]].copy()
df_results['Answer'] = df_results['Answer'].apply(lambda x: x.split(', '))
df_results['AI'] = process_answers(df_results["AI"])
df_results['Answer'] = process_answers(df_results["Answer"])



def calculate_scores(df):
    scores = []
    for index, row in df.iterrows():
        correct_answers = set(row['Answer'] if row['Answer'] is not None else [])
        ai_answers = set(row['AI'] if row['AI'] is not None else [])
        if ai_answers == correct_answers:
            score = 1.0
        else:
            score = 0.0
        scores.append(score)
    
    df['Score'] = scores
    accuracy = scores.count(1.0) / len(scores)
    print(f'Final Score: {scores.count(1.0):.2f}/{len(scores):.2f}')
    print(f'Accuracy: {accuracy:.2f}%')
    return df

# Apply the scoring function
scored_df = calculate_scores(df_results)

# Display the dataframe to verify the results
display(scored_df[['Question', 'Answer', 'AI', 'Score']])

Final Score: 8.00/18.00
Accuracy: 0.44%


Unnamed: 0,Question,Answer,AI,Score
59,"When driving on the highway at night, you shou...",[C],"[A, B]",0.0
63,The safest thing to do even if you have the ri...,[A],[A],1.0
78,It shall mean that the LEO has reasonable grou...,[B],[B],1.0
37,"You are preparing to exit an expressway, when ...",[B],[A],0.0
29,How close should another car be before you dim...,[A],[A],1.0
1,What will happen when your front tire blows out?,[C],"[A, B, C, D]",0.0
52,Parking is considered as a violation when a mo...,[A],[B],0.0
21,"To avoid suspension or revocation, how many da...",[A],,0.0
2,What should you do when an ambulance comes up ...,[D],[D],1.0
23,"To aobtain one's driver's license, one must be...",[B],"[B, C]",0.0


# Visualization

In [None]:
# Gradio interface with dynamic model, mode selection, and top_k slider
iface = gr.Interface(
    fn=lambda query, top_k, model, mode: gen_query(
        query=query,
        top_k=top_k,
        client=client,
        mode=mode,
        model=model
    ),
    inputs=[
        gr.Textbox(label="Enter your query"),
        gr.Slider(1, 20, value=5, step=1, label="Top K Results"),  # Slider for top_k (1 to 20)
        gr.Dropdown(
            choices=["llama3.3", "llama3.2:latest", "llama3.1:8b"],
            value="llama3.3",
            label="Select Model"
        ),
        gr.Dropdown(
            choices=["hybrid", "dense", "sparse"],
            value="hybrid",
            label="Select Retrieval Mode"
        )
    ],
    outputs=[
        gr.Textbox(label="Answer", lines=6),
        gr.Textbox(label="References", lines=10),
    ],
    title="RAG System with LLaMA Models",
    description=(
        "Ask questions and get answers with references from PDF documents. "
        "Adjust Top-K to control the number of retrieved chunks. "
        "Choose different models and retrieval modes for customization."
    )
)

# Launch the Gradio interface
iface.launch()

* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




In [None]:
# Handle closing behavior
def on_close():
    iface.close()
on_close()

Closing server running on port: 7860
