In [10]:
import os
import fitz
import re
import json

from ollama import Client
import faiss
import pandas as pd
import numpy as np
import Stemmer
from tqdm import tqdm
import gradio as gr

from llama_index.core import Document
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.retrievers import BaseRetriever, QueryFusionRetriever
from llama_index.core.schema import TextNode, NodeWithScore
from llama_index.retrievers.bm25 import BM25Retriever

# Connect to Ollama Server

In [11]:
client = Client(
  host='http://localhost:11434',
)

# Ingestion

In [12]:
# Path to the dataset folder
DATASET_PATH = 'extracted_text.json'

def get_text_and_metadata(input_path):
    """Load text and metadata from a file and perform chunking."""
    with open(input_path, "r", encoding="utf-8") as f:
        extracted_data = json.load(f)

    texts = []
    metadata = []

    for entry in extracted_data:
        text = entry["text"]
        source_metadata = {
            "source": entry["source"],
            "folder": entry["folder"],
            "file_name": entry["file_name"],
            "page": entry["page"],
            "title": entry["title"],
            "url": entry["url"]
        }
        texts.append(text)
        metadata.append(source_metadata)

    return texts, metadata

In [13]:
docs, metadatas = get_text_and_metadata(DATASET_PATH)

In [14]:
documents = [Document(text=docs[t], metadata=metadatas[t]) for t in range(len(docs))]
splitter = TokenTextSplitter(
    chunk_size=512,
    chunk_overlap=20,
    separator=" ",
)
nodes = splitter.get_nodes_from_documents(documents)

# Embedding and Retrieval

## Dense via FAISS

In [15]:
def generate_embeddings(nodes, client, model):
    # Generate embeddings for documents using Ollama
    for doc in tqdm(nodes):
        response = client.embeddings(prompt=doc.text, model=model)
        doc.embedding = response["embedding"]
    return nodes

In [16]:
class FaissIndexer:
    """
    Faiss-based indexer for efficient similarity search using inner-product (cosine) similarity.

    This class handles the creation and management of a FAISS index from node embeddings.
    
    :ivar faiss_index: The FAISS index for storing and querying embeddings.
    :vartype faiss_index: faiss.IndexFlatIP
    :ivar embedding_dim: Dimensionality of the embeddings.
    :vartype embedding_dim: int
    """

    def __init__(self):
        """
        Initialize the FaissIndexer class.

        :ivar faiss_index: The FAISS index, initialized as None.
        :ivar embedding_dim: The dimension of embeddings, initialized as None.
        """
        self.faiss_index = None
        self.embedding_dim = None

    def normalize_embeddings(self, embeddings):
        """
        Normalize embeddings to have unit L2 norm.

        :param embeddings: Array of embeddings to normalize.
        :type embeddings: np.ndarray
        :return: Normalized embeddings.
        :rtype: np.ndarray
        """
        return embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

    def build_index(self, nodes):
        """
        Build the FAISS index from a list of nodes containing embeddings.

        :param nodes: List of nodes, where each node contains an `embedding` attribute.
        :type nodes: list
        :raises ValueError: If the nodes list is empty or embeddings are inconsistent.
        """
        if not nodes:
            raise ValueError("Nodes list cannot be empty.")
        
        embeddings = np.array([np.array(node.embedding) for node in nodes])
        normalized_embeddings = self.normalize_embeddings(embeddings)

        self.embedding_dim = normalized_embeddings[0].shape[0]
        self.faiss_index = faiss.IndexFlatIP(self.embedding_dim)  # Inner-product similarity
        self.faiss_index.add(normalized_embeddings)

    def get_index(self):
        """
        Get the FAISS index instance.

        :return: The FAISS index used for similarity search.
        :rtype: faiss.IndexFlatIP
        :raises ValueError: If the index has not been built.
        """
        if self.faiss_index is None:
            raise ValueError("Index has not been built yet. Call 'build_index' first.")
        return self.faiss_index

In [17]:
class FAISSVectorStoreRetriever(BaseRetriever):
    def __init__(self, faiss_index, documents):
        """
        Initialize the FAISS retriever.
        :param faiss_index: The FAISS index containing precomputed embeddings.
        :param documents: List of document chunks.
        :param embeddings: Precomputed embeddings corresponding to the document chunks.
        """
        self.faiss_index = faiss_index
        self.documents = documents

    def _retrieve(self, query_embedding, top_k=5):
        """
        Retrieve the top-k nearest neighbors using the FAISS index.
        :param query_embedding: The embedding of the query.
        :param top_k: Number of top results to retrieve.
        """

        norm_query_embedding = np.array([query_embedding])
        norm_query_embedding /= np.linalg.norm(norm_query_embedding, axis=1, keepdims=True)

        distances, indices = self.faiss_index.search(norm_query_embedding, top_k)
        retrieved_docs = [
            NodeWithScore(node=self.documents[idx], score=1 - dist)
            for idx, dist in zip(indices[0], distances[0])
            if idx != -1
        ]
        return retrieved_docs

In [18]:
# embedding
nodes_embed = generate_embeddings(nodes, client, "mxbai-embed-large")

100%|██████████| 12302/12302 [27:50<00:00,  7.36it/s]  


In [19]:
#indexing
index = FaissIndexer()
index.build_index(nodes_embed)
faiss_index = index.get_index()

faiss_retriever = FAISSVectorStoreRetriever(faiss_index=faiss_index,documents=nodes_embed)

## Sparse Embedding via BM25

In [20]:
# bm25_retriever = BM25Retriever.from_defaults(
#     nodes=nodes,
#     similarity_top_k=5,
#     stemmer=Stemmer.Stemmer("english"),
#     language="english",
# )

## Hybrid Retrieval via Reciprocal Rank

In [21]:
def hybrid_embedding(results: dict, top_k: int):
    x = QueryFusionRetriever
    ranked_results = QueryFusionRetriever._reciprocal_rerank_fusion(x, results)
    return ranked_results[:top_k]

# Post Retrieval

## Summarization

In [22]:
def summarize_each_chunk(nodes, client, query, model="llama3.3", parent=False):
    if parent:
        chunks = [doc.text for doc in nodes]
    else:
        chunks = [doc.node.text for doc in nodes]
    summaries = []
    
    for i, chunk in enumerate(chunks):
        prompt = f"""
        Summarize the following text in one concise paragraph, focusing on key points relevant to the query: "{query}".
        
        - Emphasize information directly related to the query.
        - Exclude unrelated, redundant, or speculative details.
        - Do NOT introduce new information or answer the query itself. 
        
        Text:
        {chunk}
        
        Summary:
        """
        
        response = client.generate(model=model, prompt=prompt)
        summary = response['response'].strip()
        summaries.append(summary)

    return summaries

# Generation

In [23]:


def generate_response_with_notice(summaries, query, choices, client, model="llama3.3"):
    # Combine summaries into context block
    context = "\n".join(summaries)
    
    # Create prompt to answer based on summarized text
    prompt = f"""
    Use the following summarized information to answer the query accurately and concisely. 
    DO NOT USE BACKGROUND KNOWLEDGE OUTSIDE THE CONTEXT PROVIDED.
    If the information is not sufficient to fully address the query, respond ONLY with:
    "The available information is insufficient to provide a complete answer to this query."

    Summarized Context:
    {context}
    
    Query:
    {query}
    {choices}
    
    \nPlease answer only in letters and put them inside a bracket '[]'. If the question contains the statement 'Check all that apply' then add comma separator if there are multiple answers ONLY IF ALLOWED.
    Response:
    """
    
    # Send the prompt to Ollama
    response = client.generate(
        model=model,
        prompt=prompt
    )
    
    return response['response'].strip()

# Querying

## Query Transforms

In [24]:
import re
import json

prompt = f"""
    You are an expert assistant. Below are examples of how to decompose a query
    into three sub-queries, and also provide short answers for each sub-query.
    Finally, place these into a JSON array.

    Example #1:
    USER: "Why is the sky blue?"
    SUB-QUERY 1: "Explain Rayleigh scattering"
    ANSWER 1: "Rayleigh scattering is the scattering of light by particles in the atmosphere..."
    SUB-QUERY 2: "How light interacts with molecules?"
    ANSWER 2: "Light interacts with molecules through absorption and scattering, altering wavelengths..."
    SUB-QUERY 3: "How human eyes perceive color?"
    ANSWER 3: "The human eye detects the shorter, scattered wavelengths as blue..."

    FINAL ANSWER (JSON array):
    [
    {{
        "sub_query": "Explain Rayleigh scattering",
        "answer": "Rayleigh scattering is the scattering of light by particles in the atmosphere..."
    }},
    {{
        "sub_query": "How light interacts with molecules?",
        "answer": "Light interacts with molecules through absorption and scattering, altering wavelengths..."
    }},
    {{
        "sub_query": "How human eyes perceive color?",
        "answer": "The human eye detects the shorter, scattered wavelengths as blue..."
    }}
    ]

    Now your turn:
    USER: "{"What are the requirements in getting a license?"}"

    FINAL ANSWER (JSON array):
    """

response = client.generate(model="llama3.3", prompt=prompt)
print(response.response)

# Extract the JSON part using regex
match = re.search(r'\[\s*{.*?}\s*\]', response.response, re.S)
if match:
    json_text = match.group(0)
    parsed_json = json.loads(json_text)
    print(json.dumps(parsed_json, indent=4))
else:
    print("No JSON found.")

To decompose the query "What are the requirements in getting a license?" into three sub-queries and provide short answers for each, we need to consider the general process of obtaining a license, which can vary depending on the type of license (e.g., driver's license, professional license, etc.). However, there are common steps that apply broadly. Here's how we can break it down:

1. **Meeting Eligibility Criteria**: The first step in getting any license is to meet the basic eligibility criteria set by the issuing authority. This often includes age requirements, residency, and sometimes background checks.

2. **Completing Required Education or Training**: Many licenses require applicants to complete a specific course of study or training program. For example, a driver's license typically requires completing a driver's education course, while professional licenses might require a degree from an accredited institution.

3. **Passing Examinations or Tests**: Nearly all licensing processes

## Query Generation

In [25]:
docstore = {}

# Store documents using full metadata as the key
for doc in documents:
    key = tuple(doc.metadata.items())  # Convert metadata to tuple for hashable key
    docstore[key] = doc

In [26]:
def get_document_by_chunk_metadata(chunk_node):
    # Convert chunk metadata to tuple for matching
    metadata_key = tuple(chunk_node.metadata.items())

    # Retrieve document from docstore
    document = docstore.get(metadata_key)
    return document

In [27]:
def remove_duplicate_documents(doc_list):
    seen_ids = set()
    unique_docs = []

    for doc in doc_list:
        if doc.doc_id not in seen_ids:
            seen_ids.add(doc.doc_id)
            unique_docs.append(doc)

    return unique_docs

In [28]:
import json

def gen_query(query, choices, top_k, client, mode='dense', summary=False, model="llama3.3", chunks_only=False):
    prompt = f"""
    You are an expert assistant. Below are examples of how to decompose a query
    into three sub-queries, and also provide short answers for each sub-query.
    Finally, place these into a JSON array.

    Example #1:
    USER: "Why is the sky blue?"
    SUB-QUERY 1: "Explain Rayleigh scattering"
    ANSWER 1: "Rayleigh scattering is the scattering of light by particles in the atmosphere..."
    SUB-QUERY 2: "How light interacts with molecules?"
    ANSWER 2: "Light interacts with molecules through absorption and scattering, altering wavelengths..."
    SUB-QUERY 3: "How human eyes perceive color?"
    ANSWER 3: "The human eye detects the shorter, scattered wavelengths as blue..."

    FINAL ANSWER (JSON array):
    [
    {{
        "sub_query": "Explain Rayleigh scattering",
        "answer": "Rayleigh scattering is the scattering of light by particles in the atmosphere..."
    }},
    {{
        "sub_query": "How light interacts with molecules?",
        "answer": "Light interacts with molecules through absorption and scattering, altering wavelengths..."
    }},
    {{
        "sub_query": "How human eyes perceive color?",
        "answer": "The human eye detects the shorter, scattered wavelengths as blue..."
    }}
    ]

    Now your turn:
    USER: "{query}"

    FINAL ANSWER (JSON array):
    """

    response_text = client.generate(model=model, prompt=prompt)

    # Extract the JSON part using regex
    match = re.search(r'\[\s*{.*?}\s*\]', response_text.response, re.S)
    if match:
        json_text = match.group(0)
        parsed_json = json.loads(json_text)
        print(json.dumps(parsed_json, indent=4))
        json_text = json.dumps(parsed_json)
        embed_result = client.embeddings(prompt=json_text, model="mxbai-embed-large")
    else:
        print("No JSON found.")
        embed_result = client.embeddings(prompt=query, model="mxbai-embed-large")


    # Return both the parsed sub-queries/answers and the embedding vector
    query_embedding = embed_result["embedding"]

    top_k_docs = faiss_retriever._retrieve(query_embedding, top_k=top_k)

    bm25_retriever = BM25Retriever.from_defaults(
    nodes=nodes,
    similarity_top_k=top_k,
    stemmer=Stemmer.Stemmer("english"),
    language="english",
    )
    retrieved_nodes = bm25_retriever.retrieve(query)

    results = {'faiss': top_k_docs, 'bm25':retrieved_nodes}
    ranked_results = hybrid_embedding(results, top_k=top_k)

    if mode == 'dense':
        print('using FAISS')
        ans_nodes =top_k_docs
    elif mode == 'sparse':
        print('using BM25')
        ans_nodes = retrieved_nodes
    else:
        print('using Hybrid')
        ans_nodes = ranked_results

    parent_flag = True
    context = set([get_document_by_chunk_metadata(docs).text for docs in ans_nodes])
    if chunks_only:
        parent_flag = False
        print('using chunks only')
        context = [docs.node.text for docs in ans_nodes]
        
    if summary:
        print('using summaries')
        context_nodes = remove_duplicate_documents([get_document_by_chunk_metadata(docs) for docs in ans_nodes])

        if chunks_only:
            context_nodes=ans_nodes
        summaries = summarize_each_chunk(context_nodes, client, model='llama3.3', query=query,parent=parent_flag)
        context = summaries

    answer = generate_response_with_notice(context, query, choices, client, model=model)

    # Format the references
    references = []
    for i, doc in enumerate(ranked_results[:top_k], start=1):
        metadata = doc.metadata
        source_info = f"Source {i}: {metadata['title']} (Page {metadata['page']}, Folder: {metadata['folder']})"
        references.append(source_info)

    return answer, "\n".join(references)

# Evaluation

In [29]:
# Generate prompts dynamically
def generate_choices(row):
    options = []
    for choice in ['A', 'B', 'C', 'D', 'E']:
        # Check for NaN or blank values
        if pd.notna(row[choice]) and row[choice] != '':
            options.append(f"{choice}. {row[choice]}")
    
    # Construct the prompt with few-shot examples
    choices = "\n".join(options)
    
    return choices

In [None]:
from sklearn.model_selection import train_test_split
# Load the Excel file
file_path = '/mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/LTO_EXAM.csv'
df = pd.read_csv(file_path)
df['Choices'] = df.apply(generate_choices, axis=1)
# Split the data into test (80%) and holdout validation (20%)
test_df, holdout_df = train_test_split(df, test_size=0.8, random_state=42)

# Display the first few rows of each set
print("Testing Data:")
display(test_df.head())

print("\nHoldout Validation Data:")
display(holdout_df.head())

test_df.to_csv('/mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/test_data.csv', index=False)
holdout_df.to_csv('/mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/holdout_data.csv', index=False)

Testing Data:


Unnamed: 0,Question,A,B,C,D,E,Answer,Choices
59,"When driving on the highway at night, you shou...",another driver dims his lights,blinded by the headlights of an approaching ve...,all of the above,,,C,A. another driver dims his lights\nB. blinded ...
63,The safest thing to do even if you have the ri...,don't force your rights,horn,force your rights,,,A,A. don't force your rights\nB. horn\nC. force ...
78,It shall mean that the LEO has reasonable grou...,improbable cause,probable cause,likely cause,,,B,A. improbable cause\nB. probable cause\nC. lik...
37,"You are preparing to exit an expressway, when ...",Immediately before entering the declaration lane,immediately upon entering the declaration lane,immediately upon spotting the declaration lane,,,B,A. Immediately before entering the declaration...
29,How close should another car be before you dim...,150 meter,100 meter,200 meters,,,A,A. 150 meter\nB. 100 meter\nC. 200 meters



Holdout Validation Data:


Unnamed: 0,Question,A,B,C,D,E,Answer,Choices
40,What light shall be used when vehicles are par...,Headlight,Parking lights or lower-beam headlights,Signal lights,,,B,A. Headlight\nB. Parking lights or lower-beam ...
22,To have one's driver's license suspended means...,have it revalidated by the LTO,have it taken away premanently by the LTO,have it taken temporarily by the LTO,,,C,A. have it revalidated by the LTO\nB. have it ...
55,"On a two-lane road, overtaking is only allowed...",left lane,both right and left lane,right lane,,,A,A. left lane\nB. both right and left lane\nC. ...
88,The driver is using a motor vehicle in committ...,revokes and will pay a fine,confiscated and will pay fine,suspended and will pay fine,,,A,A. revokes and will pay a fine\nB. confiscated...
0,What should you do in case your vehicle breaks...,Open your trunk and hood,Stand on the expressway and flag down passing ...,Call for help using a mobile phone or an expre...,Park as far to the right as possible,Put your hazard warning light on,"A, C, D, E",A. Open your trunk and hood\nB. Stand on the e...


In [31]:
df = test_df.copy()
df["AI"] = np.nan
ai_answer = []

for i in tqdm(range(len(df))):
    answ = gen_query(df["Question"].iloc[i], df["Choices"].iloc[i], top_k=20, client=client, mode='hybrid', model="llama3.3")
    ai_answer.append(answ[0])  # Get the first answer

df["AI"] = ai_answer
print("Updated Testing Data with AI Answers:")
display(df.head())

  0%|          | 0/18 [00:00<?, ?it/s]

[
    {
        "sub_query": "What are low beam headlights used for?",
        "answer": "Low beam headlights, also known as dim lights, are used to provide visibility without dazzling other drivers with high beams, reducing glare."
    },
    {
        "sub_query": "When should low beam headlights be used in relation to oncoming traffic?",
        "answer": "You should use low beam headlights when facing oncoming traffic to prevent blinding the drivers of approaching vehicles, ensuring safer passage for both parties."
    },
    {
        "sub_query": "Are there any other situations where low beam headlights are preferred over high beams?",
        "answer": "Yes, besides oncoming traffic, use low beam headlights when following another vehicle closely to avoid dazzling the driver in front of you through their rearview mirror."
    }
]
using Hybrid


  6%|▌         | 1/18 [01:12<20:40, 72.98s/it]

[
    {
        "sub_query": "What are the principles of defensive driving?",
        "answer": "Defensive driving involves anticipating potential hazards, maintaining a safe distance from other vehicles, and being prepared to react to unexpected situations."
    },
    {
        "sub_query": "How can staying alert and aware of one's surroundings contribute to road safety?",
        "answer": "Staying alert allows drivers to notice and respond to potential dangers more effectively, such as pedestrians stepping into the road or other vehicles changing lanes unexpectedly."
    },
    {
        "sub_query": "Why is following traffic laws crucial for safety on the road?",
        "answer": "Following traffic laws helps maintain order on the roads, reduces the risk of accidents by ensuring all road users are acting predictably, and minimizes conflicts between different types of road users."
    }
]
using Hybrid


 11%|█         | 2/18 [02:18<18:17, 68.56s/it]

[
    {
        "sub_query": "What constitutes reasonable ground for suspicion?",
        "answer": "Reasonable ground refers to a standard of proof that is more than mere suspicion but less than probable cause, where an officer has enough evidence or observation to believe a crime may have been committed."
    },
    {
        "sub_query": "How can a LEO identify signs of alcohol or drug influence in drivers?",
        "answer": "LEOs are trained to look for specific behaviors and physical indicators such as erratic driving, inability to maintain lane position, odor of alcohol, slurred speech, and failure to follow simple instructions."
    },
    {
        "sub_query": "What legal actions can a LEO take upon witnessing a traffic offense by a potentially impaired driver?",
        "answer": "Upon witnessing such an offense, a LEO can initiate a traffic stop, conduct field sobriety tests, and if warranted, arrest the individual for driving under the influence (DUI), which may involve f

 17%|█▋        | 3/18 [03:42<18:57, 75.82s/it]

[
    {
        "sub_query": "What triggers the need to reduce speed on an expressway?",
        "answer": "The need to reduce speed is typically triggered by the approach of an exit, indicated by signs signaling an upcoming exit or interchange."
    },
    {
        "sub_query": "How do traffic signs guide the reduction in speed for exiting?",
        "answer": "Traffic signs, such as 'Exit Ahead' or 'Reduced Speed Ahead' signs, provide advance notice to drivers that they should start preparing to slow down by easing off the accelerator and applying gentle pressure on the brakes."
    },
    {
        "sub_query": "What are the general guidelines for the distance at which one should begin slowing down?",
        "answer": "A general guideline is to start reducing speed about a quarter of a mile before the exit, adjusting this distance based on the posted exit speed limit and road conditions. However, the exact distance may vary depending on the vehicle's speed, road conditions, and th

 22%|██▏       | 4/18 [04:54<17:20, 74.32s/it]

[
    {
        "sub_query": "Explain high and low beam headlight settings",
        "answer": "High beams provide long-range visibility at night without oncoming traffic, while low beams reduce glare to oncoming drivers."
    },
    {
        "sub_query": "Determine the safe distance for switching to low beams based on visibility and approach time",
        "answer": "Switch to low beams when an oncoming vehicle is about 500 feet away or when their headlights are visible, to avoid causing distraction."
    },
    {
        "sub_query": "Identify specific traffic regulations regarding headlight use",
        "answer": "Check local traffic laws for specific distances or guidelines on when to switch from high to low beams, as these can vary by jurisdiction."
    }
]
using Hybrid


 28%|██▊       | 5/18 [05:37<13:38, 63.00s/it]

[
    {
        "sub_query": "Explain immediate effects of tire blowout",
        "answer": "A front tire blowout results in immediate loss of tire integrity, leading to a decrease in vehicle stability and potentially causing the vehicle to swerve or become difficult to control."
    },
    {
        "sub_query": "Describe impact on vehicle stability and control",
        "answer": "The vehicle may pull sharply towards the side of the blowout, making steering and braking more challenging. This requires careful handling to avoid losing control of the vehicle."
    },
    {
        "sub_query": "Outline safety measures in case of a front tire blowout",
        "answer": "To safely manage a front tire blowout, it's crucial to remain calm, take your foot off the accelerator, and gradually steer towards a safe location to change the tire. Regular maintenance can also help prevent such incidents by identifying worn tires before they fail."
    }
]
using Hybrid


 33%|███▎      | 6/18 [06:43<12:47, 63.99s/it]

[
    {
        "sub_query": "What are the common parking restrictions?",
        "answer": "Common parking restrictions include time limits, permit requirements, no-parking zones, and areas reserved for specific users like the disabled or residents."
    },
    {
        "sub_query": "How does a vehicle's position or manner of parking lead to a violation?",
        "answer": "A vehicle can be in violation if it is parked outside designated parking spaces, blocks traffic, obstructs pedestrian paths, or is positioned in a way that disregards visible signs or markings indicating no parking or specific parking rules."
    },
    {
        "sub_query": "What are the consequences of parking violations?",
        "answer": "Consequences can include fines, towing of the vehicle, and in some cases, points added to the driver's license, depending on the jurisdiction and severity of the violation."
    }
]
using Hybrid


 39%|███▉      | 7/18 [07:51<11:56, 65.16s/it]

[
    {
        "sub_query": "What does it mean for a driver's license to be apprehended?",
        "answer": "A license apprehension occurs when a driver is stopped by authorities and their license is temporarily confiscated due to a violation. The driver must then settle the case with the LTO."
    },
    {
        "sub_query": "What are the steps and timeline for settling a case with the LTO?",
        "answer": "The driver typically has a limited timeframe, often a matter of days (the exact number can depend on specific regulations or the nature of the violation), to appear before the LTO, pay any fines, and comply with other requirements to have their license returned."
    },
    {
        "sub_query": "What are the consequences if a driver fails to settle their case within the given timeframe?",
        "answer": "Failure to comply can result in the suspension or revocation of the driving license, depending on the severity of the offense and the LTO's policies. It is essential t

 44%|████▍     | 8/18 [09:23<12:17, 73.73s/it]

[
    {
        "sub_query": "What is the first action to take when you hear or see an ambulance approaching?",
        "answer": "The first action is to remain calm and start looking for a safe place to pull over to the right side of the road, if possible."
    },
    {
        "sub_query": "How should you position your vehicle when pulling over for an ambulance?",
        "answer": "When pulling over, do so in a way that does not block intersections or roads and turn off your radio and any other distractions to focus on the surroundings."
    },
    {
        "sub_query": "What actions should be avoided when an ambulance is approaching?",
        "answer": "Avoid sudden movements, do not try to outrun the ambulance, and never try to follow or lead the emergency vehicle; simply yield and provide a clear path for it to pass through safely."
    }
]
using Hybrid


 50%|█████     | 9/18 [10:31<10:49, 72.12s/it]

[
    {
        "sub_query": "What are the basic requirements for obtaining a driver's license?",
        "answer": "Basic requirements include being of legal age, passing a vision test, and completing both written and driving exams."
    },
    {
        "sub_query": "How does one's age affect their eligibility for a driver's license?",
        "answer": "Age is a critical factor; applicants must meet the minimum age requirement set by their state or country to be eligible."
    },
    {
        "sub_query": "What is the minimum age to apply for a driver's license in most jurisdictions?",
        "answer": "In most U.S. states, the minimum age for a learner's permit is around 15 or 16 years old, and for a full license, it's typically 16 or 17 years old, but this can vary by state."
    }
]
using Hybrid


 56%|█████▌    | 10/18 [11:38<09:24, 70.51s/it]

[
    {
        "sub_query": "What are the legal implications of driving with a fake license?",
        "answer": "Driving with a fake license violates traffic laws and can lead to severe penalties, as it is considered fraud and puts road safety at risk."
    },
    {
        "sub_query": "What are the specific punishments for driving with a fake license?",
        "answer": "Punishments can include fines, community service, suspension of actual or future driving privileges, and in some cases, imprisonment, depending on the jurisdiction and circumstances."
    },
    {
        "sub_query": "What other consequences might someone face beyond legal penalties for driving with a fake license?",
        "answer": "Beyond legal penalties, individuals may face increased insurance rates, difficulties in obtaining future employment, especially in roles requiring driving, and damage to their personal reputation and trustworthiness."
    }
]
using Hybrid


 61%|██████    | 11/18 [12:57<08:30, 72.97s/it]

[
    {
        "sub_query": "What is a full stop in punctuation?",
        "answer": "A full stop (.) is a punctuation mark used to end a sentence."
    },
    {
        "sub_query": "When do you use a full stop in writing?",
        "answer": "You use a full stop at the end of a declarative sentence or an imperative sentence, and sometimes after abbreviations."
    },
    {
        "sub_query": "How does the use of a full stop affect sentence structure?",
        "answer": "The use of a full stop indicates the completion of a thought or sentence, helping to clarify the structure and meaning of written content."
    }
]
using Hybrid


 67%|██████▋   | 12/18 [13:44<06:30, 65.10s/it]

[
    {
        "sub_query": "What are the general rules for driving on a super highway?",
        "answer": "General rules include following speed limits, staying in lanes, and avoiding hazardous maneuvers."
    },
    {
        "sub_query": "What laws prohibit car racing on public highways?",
        "answer": "Laws against reckless driving, speeding, and endangering the safety of others typically prohibit car racing on public roads."
    },
    {
        "sub_query": "What specific traffic violation is committed by engaging in car racing?",
        "answer": "The primary violation would be reckless driving, with possible additional citations for excessive speeding or other unsafe driving practices."
    }
]
using Hybrid


 72%|███████▏  | 13/18 [14:56<05:36, 67.33s/it]

[
    {
        "sub_query": "What is the main purpose of the Philippine Clean Air Act of 1999?",
        "answer": "The main purpose of the Philippine Clean Air Act of 1999 is to improve air quality and protect public health by regulating and reducing air pollution."
    },
    {
        "sub_query": "What are the key provisions and penalties under the act?",
        "answer": "Key provisions include setting standards for clean air, controlling emissions from vehicles and industrial sources, and implementing measures to prevent and control air pollution. Penalties for non-compliance can range from fines to imprisonment."
    },
    {
        "sub_query": "How has the Philippine Clean Air Act of 1999 been implemented, and what is its impact?",
        "answer": "The act has been implemented through a combination of regulatory measures, public education campaigns, and enforcement actions by government agencies. Its impact includes improved air quality in some areas, increased awareness 

 78%|███████▊  | 14/18 [16:15<04:43, 70.79s/it]

[
    {
        "sub_query": "Explain the importance of safe following distances",
        "answer": "Maintaining a safe distance allows drivers enough time to react and stop if the vehicle in front of them suddenly stops or slows down."
    },
    {
        "sub_query": "What factors influence stopping distances?",
        "answer": "The stopping distance is influenced by the speed of the vehicle, the condition of the road surface, and the efficiency of the vehicle's braking system."
    },
    {
        "sub_query": "How should following distances be adjusted in different driving conditions?",
        "answer": "Drivers need to adjust their following distances based on weather conditions (like rain or fog), traffic density, and the type of road they are driving on."
    }
]
using Hybrid


 83%|████████▎ | 15/18 [17:25<03:31, 70.38s/it]

[
    {
        "sub_query": "What do the colors on a traffic light mean?",
        "answer": "Traffic lights use red, yellow, and green colors to signal when drivers should stop, prepare to stop or go, and go, respectively."
    },
    {
        "sub_query": "What does a steady yellow traffic light mean?",
        "answer": "A steady yellow light indicates that the red light is about to appear, signaling drivers to prepare to stop."
    },
    {
        "sub_query": "What does a blinking yellow traffic light mean?",
        "answer": "A blinking yellow traffic light indicates caution and advises drivers to proceed with caution, yielding to other traffic or pedestrians as necessary."
    }
]
using Hybrid


 89%|████████▉ | 16/18 [18:32<02:18, 69.39s/it]

[
    {
        "sub_query": "What is a fixed stare, and how does it affect productivity?",
        "answer": "A fixed stare refers to maintaining eye focus on a single point for an extended period without blinking or moving the eyes, which can lead to eye strain and decrease in productivity due to mental fatigue."
    },
    {
        "sub_query": "How do regular vision breaks contribute to healthy vision?",
        "answer": "Regular vision breaks, such as following the 20-20-20 rule (looking at something 20 feet away every 20 minutes for 20 seconds), help reduce eye strain and promote healthy vision by giving the eyes a chance to rest."
    },
    {
        "sub_query": "What mindfulness techniques can help improve focus?",
        "answer": "Mindfulness techniques, such as meditation and deep breathing exercises, improve focus by training the mind to concentrate on the present moment, reducing mind wandering and increasing self-awareness of distraction patterns."
    }
]
using Hybr

 94%|█████████▍| 17/18 [19:27<01:05, 65.22s/it]

[
    {
        "sub_query": "What are common reasons for no-parking zones?",
        "answer": "No-parking zones are designated due to safety concerns, traffic flow optimization, or to maintain accessibility for emergency vehicles and pedestrians."
    },
    {
        "sub_query": "How can one identify areas where parking is prohibited?",
        "answer": "Areas where parking is prohibited are usually marked with clear signage indicating 'No Parking' or similar restrictions. Additionally, certain zones like bus lanes, bike lanes, or in front of fire hydrants are universally recognized as no-parking areas."
    },
    {
        "sub_query": "What happens if you park in a no-parking zone?",
        "answer": "Parking in a no-parking zone can result in fines, towing of the vehicle, or in some cases, both. The specific consequences vary by jurisdiction but are enforced to maintain public safety and compliance with traffic regulations."
    }
]
using Hybrid


100%|██████████| 18/18 [20:47<00:00, 69.29s/it]

Updated Testing Data with AI Answers:





Unnamed: 0,Question,A,B,C,D,E,Answer,Choices,AI
59,"When driving on the highway at night, you shou...",another driver dims his lights,blinded by the headlights of an approaching ve...,all of the above,,,C,A. another driver dims his lights\nB. blinded ...,[C]
63,The safest thing to do even if you have the ri...,don't force your rights,horn,force your rights,,,A,A. don't force your rights\nB. horn\nC. force ...,[A]
78,It shall mean that the LEO has reasonable grou...,improbable cause,probable cause,likely cause,,,B,A. improbable cause\nB. probable cause\nC. lik...,[B]
37,"You are preparing to exit an expressway, when ...",Immediately before entering the declaration lane,immediately upon entering the declaration lane,immediately upon spotting the declaration lane,,,B,A. Immediately before entering the declaration...,[B]
29,How close should another car be before you dim...,150 meter,100 meter,200 meters,,,A,A. 150 meter\nB. 100 meter\nC. 200 meters,[B]


In [32]:
import re


def process_answers(answers):
    formatted_answers = []
    
    for a in answers:
        
        matches = re.findall(r'\[?\s*([A-E](?:\s*,\s*[A-E])*)\s*\]?', str(a)) # Extract answers like [A, C, D] or [A] or [B, D]
        answers = []
        for match in matches:
            answers.extend(re.split(r'\s*,\s*', match))  # Split by comma and remove spaces
        unique_sorted_answers = sorted(set(answers), key=lambda x: ['A', 'B', 'C', 'D', 'E'].index(x))
        if not unique_sorted_answers:
            formatted_answers.append(None)
        else:
            formatted_answers.append(unique_sorted_answers)
    return formatted_answers

df_results = df[["Question", "Answer", "AI"]].copy()
df_results['Answer'] = df_results['Answer'].apply(lambda x: x.split(', '))
df_results['AI'] = process_answers(df_results["AI"])
df_results['Answer'] = process_answers(df_results["Answer"])



def calculate_scores(df):
    scores = []
    for index, row in df.iterrows():
        correct_answers = set(row['Answer'] if row['Answer'] is not None else [])
        ai_answers = set(row['AI'] if row['AI'] is not None else [])
        if ai_answers == correct_answers:
            score = 1.0
        else:
            score = 0.0
        scores.append(score)
    
    df['Score'] = scores
    accuracy = scores.count(1.0) / len(scores)
    print(f'Final Score: {scores.count(1.0):.2f}/{len(scores):.2f}')
    print(f'Accuracy: {accuracy:.2f}%')
    return df

# Apply the scoring function
scored_df = calculate_scores(df_results)

# Display the dataframe to verify the results
display(scored_df[['Question', 'Answer', 'AI', 'Score']])

Final Score: 15.00/18.00
Accuracy: 0.83%


Unnamed: 0,Question,Answer,AI,Score
59,"When driving on the highway at night, you shou...",[C],[C],1.0
63,The safest thing to do even if you have the ri...,[A],[A],1.0
78,It shall mean that the LEO has reasonable grou...,[B],[B],1.0
37,"You are preparing to exit an expressway, when ...",[B],[B],1.0
29,How close should another car be before you dim...,[A],[B],0.0
1,What will happen when your front tire blows out?,[C],[C],1.0
52,Parking is considered as a violation when a mo...,[A],[A],1.0
21,"To avoid suspension or revocation, how many da...",[A],[A],1.0
2,What should you do when an ambulance comes up ...,[D],[D],1.0
23,"To aobtain one's driver's license, one must be...",[B],[C],0.0


# Visualization

In [33]:
# Gradio interface with dynamic model, mode selection, and top_k slider
iface = gr.Interface(
    fn=lambda query, top_k, model, mode: gen_query(
        query=query,
        top_k=top_k,
        client=client,
        mode=mode,
        model=model
    ),
    inputs=[
        gr.Textbox(label="Enter your query"),
        gr.Slider(1, 20, value=5, step=1, label="Top K Results"),  # Slider for top_k (1 to 20)
        gr.Dropdown(
            choices=["llama3.3", "llama3.2:latest", "llama3.1:8b"],
            value="llama3.3",
            label="Select Model"
        ),
        gr.Dropdown(
            choices=["hybrid", "dense", "sparse"],
            value="hybrid",
            label="Select Retrieval Mode"
        )
    ],
    outputs=[
        gr.Textbox(label="Answer", lines=6),
        gr.Textbox(label="References", lines=10),
    ],
    title="RAG System with LLaMA Models",
    description=(
        "Ask questions and get answers with references from PDF documents. "
        "Adjust Top-K to control the number of retrieved chunks. "
        "Choose different models and retrieval modes for customization."
    )
)

# Launch the Gradio interface
iface.launch()

* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




In [34]:
# Handle closing behavior
def on_close():
    iface.close()
on_close()

Closing server running on port: 7860
