In [26]:
import os
import fitz
import re

from ollama import Client
import faiss
import pandas as pd
import numpy as np
import Stemmer
from tqdm import tqdm
import gradio as gr

from llama_index.core import Document
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.retrievers import BaseRetriever, QueryFusionRetriever
from llama_index.core.schema import TextNode, NodeWithScore
from llama_index.retrievers.bm25 import BM25Retriever

# Connect to Ollama Server

In [27]:
client = Client(
  host='http://localhost:11434',
)

# Ingestion

In [28]:
# Path to the dataset folder
DATASET_PATH = '/mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2'

def extract_text_from_pdfs(folder_path):
    texts = []
    metadata = []
    
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".pdf"):
                pdf_path = os.path.join(root, file)
                folder_name = os.path.basename(root)
                print(f"Extracting text from {pdf_path}...")
                
                doc = fitz.open(pdf_path)
                for page_num, page in enumerate(doc, start=1):
                    text = page.get_text()
                    if text.strip():
                        texts.append(text.strip())
                        metadata.append({
                            "source": pdf_path,
                            "folder": folder_name,
                            "title": file,
                            "page": page_num
                        })
                    else:
                        print(f"WARNING: {file} page {page_num} not processed...")
    return texts, metadata

In [29]:
docs, metadatas = extract_text_from_pdfs(DATASET_PATH)

Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/1-CC2024-INITIAL-REG.pdf...
Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/1-CC2024-MAIRDOE-NEW.pdf...
Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/1-CC2024-SETTLEMENT.pdf...
Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/1-CC2024-SP.pdf...
Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/10-CC2024-DL-CC-P-NP.pdf...
Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/10-CC2024-MV-CONDUCTION-STICKER.pdf...
Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/10-CC2024-RELEASING.pdf...
Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/11-CC2024-DL-ENHANCE.pdf...
Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/11-CC2024-MV-CONDUCTION-VERIFICATION.pdf...
Extracting t

In [30]:
documents = [Document(text=docs[t], metadata=metadatas[t]) for t in range(len(docs))]
splitter = TokenTextSplitter(
    chunk_size=512,
    chunk_overlap=20,
    separator=" ",
)
nodes = splitter.get_nodes_from_documents(documents)

# Embedding and Retrieval

## Dense via FAISS

In [31]:
def generate_embeddings(nodes, client, model):
    # Generate embeddings for documents using Ollama
    for doc in tqdm(nodes):
        response = client.embeddings(prompt=doc.text, model=model)
        doc.embedding = response["embedding"]
    return nodes

In [32]:
class FaissIndexer:
    """
    Faiss-based indexer for efficient similarity search using inner-product (cosine) similarity.

    This class handles the creation and management of a FAISS index from node embeddings.
    
    :ivar faiss_index: The FAISS index for storing and querying embeddings.
    :vartype faiss_index: faiss.IndexFlatIP
    :ivar embedding_dim: Dimensionality of the embeddings.
    :vartype embedding_dim: int
    """

    def __init__(self):
        """
        Initialize the FaissIndexer class.

        :ivar faiss_index: The FAISS index, initialized as None.
        :ivar embedding_dim: The dimension of embeddings, initialized as None.
        """
        self.faiss_index = None
        self.embedding_dim = None

    def normalize_embeddings(self, embeddings):
        """
        Normalize embeddings to have unit L2 norm.

        :param embeddings: Array of embeddings to normalize.
        :type embeddings: np.ndarray
        :return: Normalized embeddings.
        :rtype: np.ndarray
        """
        return embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

    def build_index(self, nodes):
        """
        Build the FAISS index from a list of nodes containing embeddings.

        :param nodes: List of nodes, where each node contains an `embedding` attribute.
        :type nodes: list
        :raises ValueError: If the nodes list is empty or embeddings are inconsistent.
        """
        if not nodes:
            raise ValueError("Nodes list cannot be empty.")
        
        embeddings = np.array([np.array(node.embedding) for node in nodes])
        normalized_embeddings = self.normalize_embeddings(embeddings)

        self.embedding_dim = normalized_embeddings[0].shape[0]
        self.faiss_index = faiss.IndexFlatIP(self.embedding_dim)  # Inner-product similarity
        self.faiss_index.add(normalized_embeddings)

    def get_index(self):
        """
        Get the FAISS index instance.

        :return: The FAISS index used for similarity search.
        :rtype: faiss.IndexFlatIP
        :raises ValueError: If the index has not been built.
        """
        if self.faiss_index is None:
            raise ValueError("Index has not been built yet. Call 'build_index' first.")
        return self.faiss_index

In [33]:
class FAISSVectorStoreRetriever(BaseRetriever):
    def __init__(self, faiss_index, documents):
        """
        Initialize the FAISS retriever.
        :param faiss_index: The FAISS index containing precomputed embeddings.
        :param documents: List of document chunks.
        :param embeddings: Precomputed embeddings corresponding to the document chunks.
        """
        self.faiss_index = faiss_index
        self.documents = documents

    def _retrieve(self, query_embedding, top_k=5):
        """
        Retrieve the top-k nearest neighbors using the FAISS index.
        :param query_embedding: The embedding of the query.
        :param top_k: Number of top results to retrieve.
        """

        norm_query_embedding = np.array([query_embedding])
        norm_query_embedding /= np.linalg.norm(norm_query_embedding, axis=1, keepdims=True)

        distances, indices = self.faiss_index.search(norm_query_embedding, top_k)
        retrieved_docs = [
            NodeWithScore(node=self.documents[idx], score=1 - dist)
            for idx, dist in zip(indices[0], distances[0])
            if idx != -1
        ]
        return retrieved_docs

In [34]:
# embedding
nodes_embed = generate_embeddings(nodes, client, "mxbai-embed-large")

  0%|          | 0/2592 [00:00<?, ?it/s]

100%|██████████| 2592/2592 [02:05<00:00, 20.71it/s]


In [35]:
#indexing
index = FaissIndexer()
index.build_index(nodes_embed)
faiss_index = index.get_index()

faiss_retriever = FAISSVectorStoreRetriever(faiss_index=faiss_index,documents=nodes_embed)

## Sparse Embedding via BM25

In [36]:
# bm25_retriever = BM25Retriever.from_defaults(
#     nodes=nodes,
#     similarity_top_k=5,
#     stemmer=Stemmer.Stemmer("english"),
#     language="english",
# )

## Hybrid Retrieval via Reciprocal Rank

In [37]:
def hybrid_embedding(results: dict, top_k: int):
    x = QueryFusionRetriever
    ranked_results = QueryFusionRetriever._reciprocal_rerank_fusion(x, results)
    return ranked_results[:top_k]

# Post Retrieval

## Summarization

In [38]:
def summarize_each_chunk(nodes, client, query, model="llama3.3", parent=False):
    if parent:
        chunks = [doc.text for doc in nodes]
    else:
        chunks = [doc.node.text for doc in nodes]
    summaries = []
    
    for i, chunk in enumerate(chunks):
        prompt = f"""
        Summarize the following text in one concise paragraph, focusing on key points relevant to the query: "{query}".
        
        - Emphasize information directly related to the query.
        - Exclude unrelated, redundant, or speculative details.
        - Do NOT introduce new information or answer the query itself. 
        
        Text:
        {chunk}
        
        Summary:
        """
        
        response = client.generate(model=model, prompt=prompt)
        summary = response['response'].strip()
        summaries.append(summary)

    return summaries

# Generation

In [39]:


def generate_response_with_notice(summaries, query, choices, client, model="llama3.3"):
    # Combine summaries into context block
    context = "\n".join(summaries)
    
    # Create prompt to answer based on summarized text
    prompt = f"""
    Use the following summarized information to answer the query accurately and concisely. 
    DO NOT USE BACKGROUND KNOWLEDGE OUTSIDE THE CONTEXT PROVIDED.
    If the information is not sufficient to fully address the query, respond ONLY with:
    "The available information is insufficient to provide a complete answer to this query."

    Summarized Context:
    {context}
    
    Query:
    {query}
    {choices}
    
    \nPlease answer only in letters and put them inside a bracket '[]'. If the question contains the statement 'Check all that apply' then add comma separator if there are multiple answers ONLY IF ALLOWED.
    Response:
    """
    
    # Send the prompt to Ollama
    response = client.generate(
        model=model,
        prompt=prompt
    )
    
    return response['response'].strip()

# Querying

## Query Transforms

In [40]:
import re
import json

prompt = f"""
    You are an expert assistant. Below are examples of how to decompose a query
    into three sub-queries, and also provide short answers for each sub-query.
    Finally, place these into a JSON array.

    Example #1:
    USER: "Why is the sky blue?"
    SUB-QUERY 1: "Explain Rayleigh scattering"
    ANSWER 1: "Rayleigh scattering is the scattering of light by particles in the atmosphere..."
    SUB-QUERY 2: "How light interacts with molecules?"
    ANSWER 2: "Light interacts with molecules through absorption and scattering, altering wavelengths..."
    SUB-QUERY 3: "How human eyes perceive color?"
    ANSWER 3: "The human eye detects the shorter, scattered wavelengths as blue..."

    FINAL ANSWER (JSON array):
    [
    {{
        "sub_query": "Explain Rayleigh scattering",
        "answer": "Rayleigh scattering is the scattering of light by particles in the atmosphere..."
    }},
    {{
        "sub_query": "How light interacts with molecules?",
        "answer": "Light interacts with molecules through absorption and scattering, altering wavelengths..."
    }},
    {{
        "sub_query": "How human eyes perceive color?",
        "answer": "The human eye detects the shorter, scattered wavelengths as blue..."
    }}
    ]

    Now your turn:
    USER: "{"What are the requirements in getting a license?"}"

    FINAL ANSWER (JSON array):
    """

response = client.generate(model="llama3.3", prompt=prompt)
print(response.response)

# Extract the JSON part using regex
match = re.search(r'\[\s*{.*?}\s*\]', response.response, re.S)
if match:
    json_text = match.group(0)
    parsed_json = json.loads(json_text)
    print(json.dumps(parsed_json, indent=4))
else:
    print("No JSON found.")

To decompose the query "What are the requirements in getting a license?" into three sub-queries and provide short answers, we can break it down as follows:

1. **Understanding License Types**: The first step is to identify what kind of license is being referred to, as different types (e.g., driver's, professional, business) have different requirements.

2. **Meeting Eligibility Criteria**: Each type of license has specific eligibility criteria that must be met before an application can be submitted.

3. **Application and Documentation Process**: Once eligible, applicants must navigate the application process, which includes submitting required documents and possibly passing exams or interviews.

Given these sub-queries, the short answers would be:

1. **SUB-QUERY 1: "What are the different types of licenses?"**
   - **ANSWER 1:** "Licenses can be for driving, professional practice (like law or medicine), business operation, etc., each with its own set of requirements."

2. **SUB-QUERY 

## Query Generation

In [41]:
docstore = {}

# Store documents using full metadata as the key
for doc in documents:
    key = tuple(doc.metadata.items())  # Convert metadata to tuple for hashable key
    docstore[key] = doc

In [42]:
def get_document_by_chunk_metadata(chunk_node):
    # Convert chunk metadata to tuple for matching
    metadata_key = tuple(chunk_node.metadata.items())

    # Retrieve document from docstore
    document = docstore.get(metadata_key)
    return document

In [43]:
def remove_duplicate_documents(doc_list):
    seen_ids = set()
    unique_docs = []

    for doc in doc_list:
        if doc.doc_id not in seen_ids:
            seen_ids.add(doc.doc_id)
            unique_docs.append(doc)

    return unique_docs

In [44]:
import json

def gen_query(query, choices, top_k, client, mode='dense', summary=False, model="llama3.3", chunks_only=False):
    prompt = f"""
    You are an expert assistant. Below are examples of how to decompose a query
    into three sub-queries, and also provide short answers for each sub-query.
    Finally, place these into a JSON array.

    Example #1:
    USER: "Why is the sky blue?"
    SUB-QUERY 1: "Explain Rayleigh scattering"
    ANSWER 1: "Rayleigh scattering is the scattering of light by particles in the atmosphere..."
    SUB-QUERY 2: "How light interacts with molecules?"
    ANSWER 2: "Light interacts with molecules through absorption and scattering, altering wavelengths..."
    SUB-QUERY 3: "How human eyes perceive color?"
    ANSWER 3: "The human eye detects the shorter, scattered wavelengths as blue..."

    FINAL ANSWER (JSON array):
    [
    {{
        "sub_query": "Explain Rayleigh scattering",
        "answer": "Rayleigh scattering is the scattering of light by particles in the atmosphere..."
    }},
    {{
        "sub_query": "How light interacts with molecules?",
        "answer": "Light interacts with molecules through absorption and scattering, altering wavelengths..."
    }},
    {{
        "sub_query": "How human eyes perceive color?",
        "answer": "The human eye detects the shorter, scattered wavelengths as blue..."
    }}
    ]

    Now your turn:
    USER: "{query}"

    FINAL ANSWER (JSON array):
    """

    response_text = client.generate(model=model, prompt=prompt)

    # Extract the JSON part using regex
    match = re.search(r'\[\s*{.*?}\s*\]', response_text.response, re.S)
    if match:
        json_text = match.group(0)
        parsed_json = json.loads(json_text)
        print(json.dumps(parsed_json, indent=4))
        json_text = json.dumps(parsed_json)
        embed_result = client.embeddings(prompt=json_text, model="mxbai-embed-large")
    else:
        print("No JSON found.")
        embed_result = client.embeddings(prompt=query, model="mxbai-embed-large")


    # Return both the parsed sub-queries/answers and the embedding vector
    query_embedding = embed_result["embedding"]

    top_k_docs = faiss_retriever._retrieve(query_embedding, top_k=top_k)

    bm25_retriever = BM25Retriever.from_defaults(
    nodes=nodes,
    similarity_top_k=top_k,
    stemmer=Stemmer.Stemmer("english"),
    language="english",
    )
    retrieved_nodes = bm25_retriever.retrieve(query)

    results = {'faiss': top_k_docs, 'bm25':retrieved_nodes}
    ranked_results = hybrid_embedding(results, top_k=top_k)

    if mode == 'dense':
        print('using FAISS')
        ans_nodes =top_k_docs
    elif mode == 'sparse':
        print('using BM25')
        ans_nodes = retrieved_nodes
    else:
        print('using Hybrid')
        ans_nodes = ranked_results

    parent_flag = True
    context = set([get_document_by_chunk_metadata(docs).text for docs in ans_nodes])
    if chunks_only:
        parent_flag = False
        print('using chunks only')
        context = [docs.node.text for docs in ans_nodes]
        
    if summary:
        print('using summaries')
        context_nodes = remove_duplicate_documents([get_document_by_chunk_metadata(docs) for docs in ans_nodes])

        if chunks_only:
            context_nodes=ans_nodes
        summaries = summarize_each_chunk(context_nodes, client, model='llama3.3', query=query,parent=parent_flag)
        context = summaries

    answer = generate_response_with_notice(context, query, choices, client, model=model)

    # Format the references
    references = []
    for i, doc in enumerate(ranked_results[:top_k], start=1):
        metadata = doc.metadata
        source_info = f"Source {i}: {metadata['title']} (Page {metadata['page']}, Folder: {metadata['folder']})"
        references.append(source_info)

    return answer, "\n".join(references)

# Evaluation

In [45]:
# Generate prompts dynamically
def generate_choices(row):
    options = []
    for choice in ['A', 'B', 'C', 'D', 'E']:
        # Check for NaN or blank values
        if pd.notna(row[choice]) and row[choice] != '':
            options.append(f"{choice}. {row[choice]}")
    
    # Construct the prompt with few-shot examples
    choices = "\n".join(options)
    
    return choices

In [46]:
from sklearn.model_selection import train_test_split
# Load the Excel file
file_path = '/mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/LTO_EXAM.csv'
df = pd.read_csv(file_path)
df['Choices'] = df.apply(generate_choices, axis=1)
# Split the data into test (80%) and holdout validation (20%)
test_df, holdout_df = train_test_split(df, test_size=0.8, random_state=42)

# Display the first few rows of each set
print("Testing Data:")
display(test_df.head())

print("\nHoldout Validation Data:")
display(holdout_df.head())

test_df.to_csv('/mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/test_data.csv', index=False)
holdout_df.to_csv('/mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/holdout_data.csv', index=False)

Testing Data:


Unnamed: 0,Question,A,B,C,D,E,Answer,Choices
59,"When driving on the highway at night, you shou...",another driver dims his lights,blinded by the headlights of an approaching ve...,all of the above,,,C,A. another driver dims his lights\nB. blinded ...
63,The safest thing to do even if you have the ri...,don't force your rights,horn,force your rights,,,A,A. don't force your rights\nB. horn\nC. force ...
78,It shall mean that the LEO has reasonable grou...,improbable cause,probable cause,likely cause,,,B,A. improbable cause\nB. probable cause\nC. lik...
37,"You are preparing to exit an expressway, when ...",Immediately before entering the declaration lane,immediately upon entering the declaration lane,immediately upon spotting the declaration lane,,,B,A. Immediately before entering the declaration...
29,How close should another car be before you dim...,150 meter,100 meter,200 meters,,,A,A. 150 meter\nB. 100 meter\nC. 200 meters



Holdout Validation Data:


Unnamed: 0,Question,A,B,C,D,E,Answer,Choices
40,What light shall be used when vehicles are par...,Headlight,Parking lights or lower-beam headlights,Signal lights,,,B,A. Headlight\nB. Parking lights or lower-beam ...
22,To have one's driver's license suspended means...,have it revalidated by the LTO,have it taken away premanently by the LTO,have it taken temporarily by the LTO,,,C,A. have it revalidated by the LTO\nB. have it ...
55,"On a two-lane road, overtaking is only allowed...",left lane,both right and left lane,right lane,,,A,A. left lane\nB. both right and left lane\nC. ...
88,The driver is using a motor vehicle in committ...,revokes and will pay a fine,confiscated and will pay fine,suspended and will pay fine,,,A,A. revokes and will pay a fine\nB. confiscated...
0,What should you do in case your vehicle breaks...,Open your trunk and hood,Stand on the expressway and flag down passing ...,Call for help using a mobile phone or an expre...,Park as far to the right as possible,Put your hazard warning light on,"A, C, D, E",A. Open your trunk and hood\nB. Stand on the e...


In [47]:
df = test_df.copy()
df["AI"] = np.nan
ai_answer = []

for i in tqdm(range(len(df))):
    answ = gen_query(df["Question"].iloc[i], df["Choices"].iloc[i], top_k=15, client=client, mode='hybrid', model="llama3.3")
    ai_answer.append(answ[0])  # Get the first answer

df["AI"] = ai_answer
print("Updated Testing Data with AI Answers:")
display(df.head())

  0%|          | 0/18 [00:00<?, ?it/s]

[
    {
        "sub_query": "What is the purpose of low beam headlights?",
        "answer": "Low beam headlights, also known as dim lights, are designed to illuminate the road ahead without shining too far into the distance, reducing glare for oncoming traffic."
    },
    {
        "sub_query": "When should drivers switch to low beams at night?",
        "answer": "Drivers should switch to their low beam headlights when they see an oncoming vehicle within a certain distance (usually about 500 feet or less) to prevent dazzling the other driver and to ensure mutual safe passage."
    },
    {
        "sub_query": "Are there specific scenarios where low beams are recommended?",
        "answer": "Yes, use low beam headlights when following another vehicle closely at night to avoid over-illuminating the vehicle in front of you, as well as in foggy or heavy rain conditions where high beams can create glare from water droplets in the air."
    }
]
using Hybrid


  6%|▌         | 1/18 [00:44<12:29, 44.11s/it]

[
    {
        "sub_query": "Explain the importance of knowing traffic laws.",
        "answer": "Knowing traffic laws is crucial for ensuring safety on the roads, as it helps users understand their rights and responsibilities."
    },
    {
        "sub_query": "Describe defensive driving techniques.",
        "answer": "Defensive driving involves anticipating potential hazards, maintaining safe distances, and being prepared to react to unexpected situations."
    },
    {
        "sub_query": "Discuss ways to minimize risks while using the road.",
        "answer": "Minimizing risks on the road involves wearing appropriate safety gear (like helmets for cyclists), avoiding distractions (such as using a phone while driving), and being mindful of weather conditions."
    }
]
using Hybrid


 11%|█         | 2/18 [01:27<11:37, 43.59s/it]

[
    {
        "sub_query": "What constitutes reasonable grounds for a LEO to suspect DUI?",
        "answer": "Reasonable grounds are based on specific, articulable facts that suggest the driver might be impaired, such as erratic driving or failure to obey traffic laws."
    },
    {
        "sub_query": "Which traffic offenses are indicative of possible impairment?",
        "answer": "Offenses like weaving between lanes, speeding, running red lights, or failing to yield can indicate potential impairment due to alcohol, drugs, or other substances."
    },
    {
        "sub_query": "How does personally witnessing a traffic offense lead to further action by the LEO?",
        "answer": "Witnessing such offenses provides the LEO with direct evidence of potentially impaired driving, thereby establishing reasonable grounds for a stop and further investigation to assess the driver's sobriety."
    }
]
using Hybrid


 17%|█▋        | 3/18 [02:16<11:30, 46.02s/it]

[
    {
        "sub_query": "What are the general guidelines for slowing down on an expressway?",
        "answer": "Generally, you should start reducing speed as soon as you see the exit sign or about a mile before your exit, adjusting based on traffic and road conditions."
    },
    {
        "sub_query": "How do traffic signs indicate when to slow down for an exit?",
        "answer": "Traffic signs, including exit signs and speed reduction signs, indicate when you should start slowing down; they often provide ample notice before the exit ramp."
    },
    {
        "sub_query": "What role does road geometry play in determining when to reduce speed for an exit?",
        "answer": "The design of the exit, such as its curvature and length, can influence when you should start reducing speed; tighter curves or shorter ramps may require earlier speed reduction."
    }
]
using Hybrid


 22%|██▏       | 4/18 [02:58<10:22, 44.47s/it]

[
    {
        "sub_query": "What are the general rules for using high vs. low beam headlights?",
        "answer": "High beams should be used when there's no oncoming traffic or vehicles within a certain distance ahead, typically beyond 500 feet for most jurisdictions, to maximize visibility without causing glare to others."
    },
    {
        "sub_query": "How does the distance of another car influence the decision to dim headlights?",
        "answer": "When an oncoming vehicle is within approximately 500 feet or when following another vehicle and your headlights illuminate it, you should dim your high beams to low beams to avoid causing glare that could temporarily blind other drivers."
    },
    {
        "sub_query": "What are the safety implications of proper headlight beam usage in relation to distance?",
        "answer": "Properly adjusting your headlight beam based on the proximity of other vehicles enhances road safety by preventing accidents caused by glare-induced tem

 28%|██▊       | 5/18 [03:50<10:13, 47.20s/it]

[
    {
        "sub_query": "What are the immediate effects of a front tire blowout?",
        "answer": "A front tire blowout results in sudden loss of air pressure, leading to reduced traction and potentially causing the vehicle to pull sharply in one direction."
    },
    {
        "sub_query": "How does a blown-out front tire affect vehicle control?",
        "answer": "The loss of a front tire can significantly impair steering capability, making it difficult to maintain the vehicle's direction due to uneven tire performance."
    },
    {
        "sub_query": "What safety procedures should be followed after a front tire blowout?",
        "answer": "In the event of a front tire blowout, drivers should gently take their foot off the accelerator, avoid slamming on the brakes, and slowly steer to a safe location to change the tire or seek assistance."
    }
]
using Hybrid


 33%|███▎      | 6/18 [04:43<09:49, 49.12s/it]

[
    {
        "sub_query": "What are the common parking restrictions?",
        "answer": "Common parking restrictions include no-parking zones, time-limited parking, and areas reserved for specific users such as the disabled or residents."
    },
    {
        "sub_query": "How do traffic signs indicate parking violations?",
        "answer": "Traffic signs indicating parking violations can include explicit 'No Parking' signs, symbols indicating time restrictions, or signs reserving spaces for particular groups like emergency vehicles."
    },
    {
        "sub_query": "What are the consequences of illegal parking?",
        "answer": "Consequences of illegal parking can range from fines and penalties to towing of the vehicle in severe cases, depending on local laws and regulations."
    }
]
using Hybrid


 39%|███▉      | 7/18 [05:20<08:18, 45.28s/it]

[
    {
        "sub_query": "What are the common reasons for license suspension or revocation by LTO?",
        "answer": "License suspension or revocation can result from accumulating demerit points, serious traffic violations, or failure to settle pending cases within the prescribed timeframe."
    },
    {
        "sub_query": "How does one typically settle a case with LTO after being apprehended?",
        "answer": "Settling a case involves submitting the required documents, paying any necessary fines, and possibly attending a seminar or hearing, depending on the nature of the violation."
    },
    {
        "sub_query": "What is the timeframe for settling a case to avoid license suspension or revocation?",
        "answer": "The specific number of days can vary based on LTO regulations and the nature of the offense. Generally, drivers are given a certain period (often around 30 to 60 days) to settle their cases, but this should be verified with the LTO for accuracy."
    }
]
us

 44%|████▍     | 8/18 [06:20<08:19, 49.93s/it]

[
    {
        "sub_query": "What do flashing red lights and sirens on an ambulance mean?",
        "answer": "Flashing red lights and sirens indicate that the ambulance is responding to an emergency and needs all other vehicles to yield immediately."
    },
    {
        "sub_query": "How should you yield to an approaching ambulance?",
        "answer": "To yield, pull over to the right side of the road or slow down if moving to the right is not possible. Come to a stop until the ambulance has passed, then check for safety before merging back into traffic."
    },
    {
        "sub_query": "What precautions should be taken after yielding to an ambulance?",
        "answer": "After yielding, avoid sudden stops and do not block intersections. Be cautious of other vehicles and pedestrians who may also be reacting to the emergency vehicle's approach."
    }
]
using Hybrid


 50%|█████     | 9/18 [07:15<07:42, 51.40s/it]

[
    {
        "sub_query": "What is the minimum age to apply for a learner's permit?",
        "answer": "The minimum age to apply for a learner's permit varies by state but is typically 15 or 16 years old."
    },
    {
        "sub_query": "What are the requirements to obtain a probationary or restricted license?",
        "answer": "After holding a learner's permit, individuals can usually apply for a probationary or restricted license at 16 years old, provided they meet specific requirements such as completing a driver's education course and logging a certain number of supervised driving hours."
    },
    {
        "sub_query": "What is the minimum age to obtain a full, unrestricted driver's license?",
        "answer": "In most states, individuals can apply for a full, unrestricted driver's license at 17 or 18 years old, after completing a probationary period and fulfilling all other licensing requirements."
    }
]
using Hybrid


 56%|█████▌    | 10/18 [07:59<06:33, 49.22s/it]

[
    {
        "sub_query": "What laws prohibit driving with a fake license?",
        "answer": "Driving with a fake license is prohibited by state and federal laws that require all drivers to have valid, government-issued licenses to operate a vehicle legally."
    },
    {
        "sub_query": "What are the punishments for driving with a fake license?",
        "answer": "Punishments can include fines, community service, suspension or revocation of driving privileges, and in severe cases, imprisonment, depending on the jurisdiction and specifics of the offense."
    },
    {
        "sub_query": "How are instances of driving with a fake license reported and enforced?",
        "answer": "Instances are typically reported by law enforcement officers during traffic stops or at the scene of accidents. Enforcement involves investigation by these officers, followed by legal proceedings in court, which may result in the aforementioned penalties."
    }
]
using Hybrid


 61%|██████    | 11/18 [08:56<06:01, 51.63s/it]

[
    {
        "sub_query": "What is the definition of a full stop in punctuation?",
        "answer": "A full stop, or period, is a punctuation mark used to end a sentence."
    },
    {
        "sub_query": "In what grammatical contexts is a full stop typically used?",
        "answer": "Full stops are used after declarative sentences, abbreviations, and sometimes after initials in names."
    },
    {
        "sub_query": "Are there any exceptions or special cases for using a full stop?",
        "answer": "Exceptions include not using a full stop after headings, titles, or in certain abbreviations within sentences, as well as its use being dependent on regional language conventions (e.g., UK vs. US English)."
    }
]
using Hybrid


 67%|██████▋   | 12/18 [09:35<04:47, 47.88s/it]

[
    {
        "sub_query": "What are the typical regulations for driving on super highways?",
        "answer": "Typical regulations include adhering to specific speed limits, using designated lanes for passing, and avoiding reckless or competitive driving behaviors."
    },
    {
        "sub_query": "How is car racing defined in the context of traffic laws?",
        "answer": "Car racing on public roads involves driving in a manner that competes with another vehicle, often involving excessive speeds and potentially endangering others."
    },
    {
        "sub_query": "What traffic violations are associated with competitive driving on highways?",
        "answer": "Common violations include reckless driving, speeding, and participating in unauthorized speed contests, all of which can pose significant risks to the driver and other road users."
    }
]
using Hybrid


 72%|███████▏  | 13/18 [10:21<03:56, 47.27s/it]

[
    {
        "sub_query": "What is the main purpose of the Philippine Clean Air Act of 1999?",
        "answer": "The main purpose is to improve air quality and protect public health by reducing pollution."
    },
    {
        "sub_query": "What are the key provisions of the act regarding emission standards and pollution control?",
        "answer": "Key provisions include setting emission standards for vehicles and industries, promoting clean energy, and mandating pollution control technologies."
    },
    {
        "sub_query": "How has the implementation of the Philippine Clean Air Act impacted environmental quality and public health in the Philippines?",
        "answer": "The act's implementation has led to improvements in air quality through reduced emissions from industrial and vehicular sources, contributing to better public health outcomes by decreasing respiratory diseases."
    }
]
using Hybrid


 78%|███████▊  | 14/18 [11:05<03:04, 46.10s/it]

[
    {
        "sub_query": "What is a safe following distance?",
        "answer": "A safe following distance is typically considered to be at least a car length for every 10 mph of speed, allowing enough time to react and stop if the vehicle ahead suddenly stops or slows down."
    },
    {
        "sub_query": "How do road conditions affect stopping times?",
        "answer": "Road conditions such as wet, icy, or slippery roads can significantly increase stopping distances due to reduced tire traction, necessitating even greater following distances for safety."
    },
    {
        "sub_query": "Why is maintaining a safe distance crucial in preventing accidents?",
        "answer": "Maintaining a safe distance reduces the risk of rear-end collisions by providing adequate time and space to react to unexpected maneuvers or hazards on the road, thereby playing a critical role in accident prevention."
    }
]
using Hybrid


 83%|████████▎ | 15/18 [11:43<02:11, 43.68s/it]

[
    {
        "sub_query": "What do traffic lights generally signify?",
        "answer": "Traffic lights are standardized signals that indicate when drivers should stop (red), go (green), or prepare to stop (yellow)."
    },
    {
        "sub_query": "How does the meaning of a yellow light differ from other colors?",
        "answer": "A yellow light, whether steady or blinking, serves as a cautionary signal. A steady yellow light warns drivers that the signal is about to turn red, while a blinking yellow light advises caution but typically does not precede a red light."
    },
    {
        "sub_query": "What specifically does a blinking yellow traffic light mean?",
        "answer": "A blinking yellow traffic light indicates that drivers should proceed with caution. It is often used at intersections or areas where a more cautious approach is necessary due to specific conditions, and it does not signal an immediate transition to a red light."
    }
]
using Hybrid


 89%|████████▉ | 16/18 [12:46<01:39, 49.72s/it]

[
    {
        "sub_query": "Explain the physiological effects of prolonged focus",
        "answer": "Prolonged focus on a single point can lead to visual fatigue and decreased productivity due to the brain's tendency to seek novelty. Regular breaks can help mitigate this effect by allowing the brain to rest and replenish its ability to focus."
    },
    {
        "sub_query": "What strategies can improve attention span?",
        "answer": "Strategies such as the Pomodoro Technique, which involves working in focused, 25-minute increments, followed by a five-minute break, can significantly improve attention span and reduce distraction. Additionally, minimizing multitasking and creating an environment conducive to focus are beneficial."
    },
    {
        "sub_query": "How does mindfulness practice help in resisting distractions?",
        "answer": "Mindfulness practices, such as meditation and deep breathing exercises, enhance the brain's ability to filter out distractions by imp

 94%|█████████▍| 17/18 [13:51<00:54, 54.24s/it]

[
    {
        "sub_query": "Why are there parking restrictions?",
        "answer": "Parking restrictions are in place for safety, traffic flow, and accessibility reasons, ensuring that emergency vehicles can pass through and pedestrians have safe access."
    },
    {
        "sub_query": "Where are no-parking zones usually located?",
        "answer": "No-parking zones are typically found near critical infrastructure like fire hydrants, hospitals, and in areas with heavy pedestrian traffic to maintain public safety and convenience."
    },
    {
        "sub_query": "What are the consequences of parking in a no-parking zone?",
        "answer": "Parking in a no-parking zone can result in fines, towing of the vehicle, or other penalties as enforced by local law enforcement or parking authorities."
    }
]
using Hybrid


100%|██████████| 18/18 [14:50<00:00, 49.46s/it]

Updated Testing Data with AI Answers:





Unnamed: 0,Question,A,B,C,D,E,Answer,Choices,AI
59,"When driving on the highway at night, you shou...",another driver dims his lights,blinded by the headlights of an approaching ve...,all of the above,,,C,A. another driver dims his lights\nB. blinded ...,"[B, C]"
63,The safest thing to do even if you have the ri...,don't force your rights,horn,force your rights,,,A,A. don't force your rights\nB. horn\nC. force ...,[A]
78,It shall mean that the LEO has reasonable grou...,improbable cause,probable cause,likely cause,,,B,A. improbable cause\nB. probable cause\nC. lik...,[B]
37,"You are preparing to exit an expressway, when ...",Immediately before entering the declaration lane,immediately upon entering the declaration lane,immediately upon spotting the declaration lane,,,B,A. Immediately before entering the declaration...,[B]
29,How close should another car be before you dim...,150 meter,100 meter,200 meters,,,A,A. 150 meter\nB. 100 meter\nC. 200 meters,[C]


In [48]:
import re


def process_answers(answers):
    formatted_answers = []
    
    for a in answers:
        
        matches = re.findall(r'\[?\s*([A-E](?:\s*,\s*[A-E])*)\s*\]?', str(a)) # Extract answers like [A, C, D] or [A] or [B, D]
        answers = []
        for match in matches:
            answers.extend(re.split(r'\s*,\s*', match))  # Split by comma and remove spaces
        unique_sorted_answers = sorted(set(answers), key=lambda x: ['A', 'B', 'C', 'D', 'E'].index(x))
        if not unique_sorted_answers:
            formatted_answers.append(None)
        else:
            formatted_answers.append(unique_sorted_answers)
    return formatted_answers

df_results = df[["Question", "Answer", "AI"]].copy()
df_results['Answer'] = df_results['Answer'].apply(lambda x: x.split(', '))
df_results['AI'] = process_answers(df_results["AI"])
df_results['Answer'] = process_answers(df_results["Answer"])



def calculate_scores(df):
    scores = []
    for index, row in df.iterrows():
        correct_answers = set(row['Answer'] if row['Answer'] is not None else [])
        ai_answers = set(row['AI'] if row['AI'] is not None else [])
        if ai_answers == correct_answers:
            score = 1.0
        else:
            score = 0.0
        scores.append(score)
    
    df['Score'] = scores
    accuracy = scores.count(1.0) / len(scores)
    print(f'Final Score: {scores.count(1.0):.2f}/{len(scores):.2f}')
    print(f'Accuracy: {accuracy:.2f}%')
    return df

# Apply the scoring function
scored_df = calculate_scores(df_results)

# Display the dataframe to verify the results
display(scored_df[['Question', 'Answer', 'AI', 'Score']])

Final Score: 14.00/18.00
Accuracy: 0.78%


Unnamed: 0,Question,Answer,AI,Score
59,"When driving on the highway at night, you shou...",[C],"[B, C]",0.0
63,The safest thing to do even if you have the ri...,[A],[A],1.0
78,It shall mean that the LEO has reasonable grou...,[B],[B],1.0
37,"You are preparing to exit an expressway, when ...",[B],[B],1.0
29,How close should another car be before you dim...,[A],[C],0.0
1,What will happen when your front tire blows out?,[C],[C],1.0
52,Parking is considered as a violation when a mo...,[A],[A],1.0
21,"To avoid suspension or revocation, how many da...",[A],[A],1.0
2,What should you do when an ambulance comes up ...,[D],[D],1.0
23,"To aobtain one's driver's license, one must be...",[B],[C],0.0


# Visualization

In [49]:
# Gradio interface with dynamic model, mode selection, and top_k slider
iface = gr.Interface(
    fn=lambda query, top_k, model, mode: gen_query(
        query=query,
        top_k=top_k,
        client=client,
        mode=mode,
        model=model
    ),
    inputs=[
        gr.Textbox(label="Enter your query"),
        gr.Slider(1, 20, value=5, step=1, label="Top K Results"),  # Slider for top_k (1 to 20)
        gr.Dropdown(
            choices=["llama3.3", "llama3.2:latest", "llama3.1:8b"],
            value="llama3.3",
            label="Select Model"
        ),
        gr.Dropdown(
            choices=["hybrid", "dense", "sparse"],
            value="hybrid",
            label="Select Retrieval Mode"
        )
    ],
    outputs=[
        gr.Textbox(label="Answer", lines=6),
        gr.Textbox(label="References", lines=10),
    ],
    title="RAG System with LLaMA Models",
    description=(
        "Ask questions and get answers with references from PDF documents. "
        "Adjust Top-K to control the number of retrieved chunks. "
        "Choose different models and retrieval modes for customization."
    )
)

# Launch the Gradio interface
iface.launch()

* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




In [50]:
# Handle closing behavior
def on_close():
    iface.close()
on_close()

Closing server running on port: 7860
