In [1]:
import os
import fitz
import re

from ollama import Client
import faiss
import pandas as pd
import numpy as np
import Stemmer
from tqdm import tqdm
import gradio as gr

from llama_index.core import Document
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.retrievers import BaseRetriever, QueryFusionRetriever
from llama_index.core.schema import TextNode, NodeWithScore
from llama_index.retrievers.bm25 import BM25Retriever

# Connect to Ollama Server

In [2]:
client = Client(
  host='http://localhost:11434',
)

# Ingestion

In [3]:
# Path to the dataset folder
DATASET_PATH = '/mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2'

def extract_text_from_pdfs(folder_path):
    texts = []
    metadata = []
    
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".pdf"):
                pdf_path = os.path.join(root, file)
                folder_name = os.path.basename(root)
                print(f"Extracting text from {pdf_path}...")
                
                doc = fitz.open(pdf_path)
                for page_num, page in enumerate(doc, start=1):
                    text = page.get_text()
                    if text.strip():
                        texts.append(text.strip())
                        metadata.append({
                            "source": pdf_path,
                            "folder": folder_name,
                            "title": file,
                            "page": page_num
                        })
                    else:
                        print(f"WARNING: {file} page {page_num} not processed...")
    return texts, metadata

In [4]:
docs, metadatas = extract_text_from_pdfs(DATASET_PATH)

Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/1-CC2024-INITIAL-REG.pdf...
Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/1-CC2024-MAIRDOE-NEW.pdf...
Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/1-CC2024-SETTLEMENT.pdf...
Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/1-CC2024-SP.pdf...
Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/10-CC2024-DL-CC-P-NP.pdf...
Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/10-CC2024-MV-CONDUCTION-STICKER.pdf...
Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/10-CC2024-RELEASING.pdf...
Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/11-CC2024-DL-ENHANCE.pdf...
Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/11-CC2024-MV-CONDUCTION-VERIFICATION.pdf...
Extracting t

In [5]:
documents = [Document(text=docs[t], metadata=metadatas[t]) for t in range(len(docs))]
splitter = TokenTextSplitter(
    chunk_size=512,
    chunk_overlap=20,
    separator=" ",
)
nodes = splitter.get_nodes_from_documents(documents)

# Embedding and Retrieval

## Dense via FAISS

In [6]:
def generate_embeddings(nodes, client, model):
    # Generate embeddings for documents using Ollama
    for doc in tqdm(nodes):
        response = client.embeddings(prompt=doc.text, model=model)
        doc.embedding = response["embedding"]
    return nodes

In [7]:
class FaissIndexer:
    """
    Faiss-based indexer for efficient similarity search using inner-product (cosine) similarity.

    This class handles the creation and management of a FAISS index from node embeddings.
    
    :ivar faiss_index: The FAISS index for storing and querying embeddings.
    :vartype faiss_index: faiss.IndexFlatIP
    :ivar embedding_dim: Dimensionality of the embeddings.
    :vartype embedding_dim: int
    """

    def __init__(self):
        """
        Initialize the FaissIndexer class.

        :ivar faiss_index: The FAISS index, initialized as None.
        :ivar embedding_dim: The dimension of embeddings, initialized as None.
        """
        self.faiss_index = None
        self.embedding_dim = None

    def normalize_embeddings(self, embeddings):
        """
        Normalize embeddings to have unit L2 norm.

        :param embeddings: Array of embeddings to normalize.
        :type embeddings: np.ndarray
        :return: Normalized embeddings.
        :rtype: np.ndarray
        """
        return embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

    def build_index(self, nodes):
        """
        Build the FAISS index from a list of nodes containing embeddings.

        :param nodes: List of nodes, where each node contains an `embedding` attribute.
        :type nodes: list
        :raises ValueError: If the nodes list is empty or embeddings are inconsistent.
        """
        if not nodes:
            raise ValueError("Nodes list cannot be empty.")
        
        embeddings = np.array([np.array(node.embedding) for node in nodes])
        normalized_embeddings = self.normalize_embeddings(embeddings)

        self.embedding_dim = normalized_embeddings[0].shape[0]
        self.faiss_index = faiss.IndexFlatIP(self.embedding_dim)  # Inner-product similarity
        self.faiss_index.add(normalized_embeddings)

    def get_index(self):
        """
        Get the FAISS index instance.

        :return: The FAISS index used for similarity search.
        :rtype: faiss.IndexFlatIP
        :raises ValueError: If the index has not been built.
        """
        if self.faiss_index is None:
            raise ValueError("Index has not been built yet. Call 'build_index' first.")
        return self.faiss_index

In [8]:
class FAISSVectorStoreRetriever(BaseRetriever):
    def __init__(self, faiss_index, documents):
        """
        Initialize the FAISS retriever.
        :param faiss_index: The FAISS index containing precomputed embeddings.
        :param documents: List of document chunks.
        :param embeddings: Precomputed embeddings corresponding to the document chunks.
        """
        self.faiss_index = faiss_index
        self.documents = documents

    def _retrieve(self, query_embedding, top_k=5):
        """
        Retrieve the top-k nearest neighbors using the FAISS index.
        :param query_embedding: The embedding of the query.
        :param top_k: Number of top results to retrieve.
        """

        norm_query_embedding = np.array([query_embedding])
        norm_query_embedding /= np.linalg.norm(norm_query_embedding, axis=1, keepdims=True)

        distances, indices = self.faiss_index.search(norm_query_embedding, top_k)
        retrieved_docs = [
            NodeWithScore(node=self.documents[idx], score=1 - dist)
            for idx, dist in zip(indices[0], distances[0])
            if idx != -1
        ]
        return retrieved_docs

In [9]:
# embedding
nodes_embed = generate_embeddings(nodes, client, "mxbai-embed-large")

100%|██████████| 2592/2592 [02:47<00:00, 15.47it/s]


In [10]:
#indexing
index = FaissIndexer()
index.build_index(nodes_embed)
faiss_index = index.get_index()

faiss_retriever = FAISSVectorStoreRetriever(faiss_index=faiss_index,documents=nodes_embed)

## Sparse Embedding via BM25

In [11]:
# bm25_retriever = BM25Retriever.from_defaults(
#     nodes=nodes,
#     similarity_top_k=5,
#     stemmer=Stemmer.Stemmer("english"),
#     language="english",
# )

## Hybrid Retrieval via Reciprocal Rank

In [12]:
def hybrid_embedding(results: dict, top_k: int):
    x = QueryFusionRetriever
    ranked_results = QueryFusionRetriever._reciprocal_rerank_fusion(x, results)
    return ranked_results[:top_k]

# Post Retrieval

## Summarization

In [13]:
def summarize_each_chunk(nodes, client, query, model="llama3.2", parent=False):
    if parent:
        chunks = [doc.text for doc in nodes]
    else:
        chunks = [doc.node.text for doc in nodes]
    summaries = []
    
    for i, chunk in enumerate(chunks):
        prompt = f"""
        Summarize the following text in one concise paragraph, focusing on key points relevant to the query: "{query}".
        
        - Emphasize information directly related to the query.
        - Exclude unrelated, redundant, or speculative details.
        - Do NOT introduce new information or answer the query itself. 
        
        Text:
        {chunk}
        
        Summary:
        """
        
        response = client.generate(model=model, prompt=prompt)
        summary = response['response'].strip()
        summaries.append(summary)

    return summaries

# Generation

# Querying

## Query Transforms

In [14]:
# Few-shot examples
FEW_SHOTS = """

Example 2:
Question: What will happen when your front tire blows out?
A. The back end will sway towards the side of the blowout
B. The back end will sway away from the blowout
C. The front end will pull towards the side of the blowout
D. The front end will pull to the opposite side of the blowout
Please answer only in letters
Correct Answer: [C]

Example 3:
Question: What should you do when an ambulance comes up behind you flashing red lights and/or sounding its siren?
A. Stop as soon as you can
B. Maintain your speed, let the ambulance driver will find a way around you
C. Speed up so that you don't hold the ambulance
D. Pull over to the right and slow down or even stop if necessary
Please answer only in letters
Correct Answer: [D]
"""

def paraphrase_query(query,client,model):

    few_shot_examples = '''
    Example 1:
    Transform this into a Retrieval Augmented Generation query: What is AI?
    Output: "Retrieve information on artificial intelligence and provide a concise definition."

    Example 2:
    Transform this into a Retrieval Augmented Generation query: How does reinforcement learning work?
    Output: "Retrieve explanations of reinforcement learning algorithms with examples."

    Example 3:
    Transform this into a Retrieval Augmented Generation query: Explain neural networks.
    Output: "Retrieve details on neural networks and explain their structure and function."

    Now, transform the following query and provide the response inside double quotes:
    '''

    full_prompt = few_shot_examples + f"\nQuery: {query}"

    response = client.generate(
        model=model,
        prompt=full_prompt
    )
    # Extract text within double quotes
    matches = re.findall(r'"([^"]*)"', response['response'].strip())

    print(f"Query: {query}")
    print(f"Transformed: {matches if matches else 'No match found'}")
    print('-' * 60)

    if matches:
        return matches[0]
    else:
        return query


def cot_qa_format(query, context, client, model):
    return f"""
    Let's break down the problem step by step:
    1. Identify the key components of the following question: {paraphrase_query(query, client, model)}
    2. Analyze relevant facts and eliminate unnecessary details.
    3. Provide the best possible answer based on the reasoning process.

    Here is relevant context about the query:
    {context}
    """

def generate_response_with_cot(summaries, query, choices, client, model="llama3.1:8b", cot_model="llama3.1:8b", para_model="llama3.1:8b"):
    """
    Generate a response using Chain of Thought (CoT) reasoning by passing intermediate
    results through successive steps to build a synthesized final answer.
    """

    # 1. Combine summaries into context block
    context = "\n".join(summaries)

    try:
        # 2. Step 1: Generate initial thought process (CoT step 1)
        cot_query = cot_qa_format(query, context, client, para_model)
        response1 = client.generate(model=cot_model, prompt=cot_query)
        reasoning1 = response1['response'].strip()

        # 3. Step 2: Refine based on first result
        cot_query2 = f"""
        We reasoned that: {reasoning1}
        Can you expand on this by addressing additional details or gaps?
        {query}
        """
        response2 = client.generate(model=cot_model, prompt=cot_query2)
        reasoning2 = response2['response'].strip()

        # 5. Aggregate all steps for the final decision
        final_query = f"""
        Based on the results:
        - Step 1: {reasoning1}
        - Step 2: {reasoning2}

        Now, synthesize these results and answer this question:
        {query + " Answer which of the choices match the synthesized results."}
        Choices: {choices}

        Please answer only in letters and put them inside brackets '[]'. If the question contains the statement 'Check all that apply', use a comma separator for multiple answers.
        Here are two examples on how to answer the question:
        {FEW_SHOTS}
        """
        
        # 6. Final model call to produce the overall response
        final_response = client.generate(model=model, prompt=final_query)
        
        return final_response['response'].strip()
    
    except Exception as e:
        print(f"Error during CoT generation: {str(e)}")
        return "Error in generating response."


    except Exception as e:
        print(f"Error processing prompt: {e}")
        return None



## Query Generation

In [15]:
docstore = {}

# Store documents using full metadata as the key
for doc in documents:
    key = tuple(doc.metadata.items())  # Convert metadata to tuple for hashable key
    docstore[key] = doc

In [16]:
def get_document_by_chunk_metadata(chunk_node):
    # Convert chunk metadata to tuple for matching
    metadata_key = tuple(chunk_node.metadata.items())

    # Retrieve document from docstore
    document = docstore.get(metadata_key)
    return document

In [17]:
def remove_duplicate_documents(doc_list):
    seen_ids = set()
    unique_docs = []

    for doc in doc_list:
        if doc.doc_id not in seen_ids:
            seen_ids.add(doc.doc_id)
            unique_docs.append(doc)

    return unique_docs

In [18]:
def gen_query(query, choices, top_k, client, mode='dense', summary=False, model="llama3.2", chunks_only=False):
    response = client.embeddings(prompt=query, model="mxbai-embed-large")
    query_embedding = response["embedding"]

    top_k_docs = faiss_retriever._retrieve(query_embedding, top_k=top_k)

    bm25_retriever = BM25Retriever.from_defaults(
    nodes=nodes,
    similarity_top_k=top_k,
    stemmer=Stemmer.Stemmer("english"),
    language="english",
    )
    retrieved_nodes = bm25_retriever.retrieve(query)

    results = {'faiss': top_k_docs, 'bm25':retrieved_nodes}
    ranked_results = hybrid_embedding(results, top_k=top_k)

    if mode == 'dense':
        print('using FAISS')
        ans_nodes =top_k_docs
    elif mode == 'sparse':
        print('using BM25')
        ans_nodes = retrieved_nodes
    else:
        print('using Hybrid')
        ans_nodes = ranked_results

    parent_flag = True
    context = set([get_document_by_chunk_metadata(docs).text for docs in ans_nodes])
    if chunks_only:
        parent_flag = False
        print('using chunks only')
        context = [docs.node.text for docs in ans_nodes]
        
    if summary:
        print('using summaries')
        context_nodes = remove_duplicate_documents([get_document_by_chunk_metadata(docs) for docs in ans_nodes])

        if chunks_only:
            context_nodes=ans_nodes
        summaries = summarize_each_chunk(context_nodes, client, model='llama3.2:latest', query=query,parent=parent_flag)
        context = summaries

    answer = generate_response_with_cot(context, query, choices, client, model=model)

    # Format the references
    references = []
    for i, doc in enumerate(ranked_results[:top_k], start=1):
        metadata = doc.metadata
        source_info = f"Source {i}: {metadata['title']} (Page {metadata['page']}, Folder: {metadata['folder']})"
        references.append(source_info)

    return answer, "\n".join(references)

# Evaluation

In [19]:
# Generate prompts dynamically
def generate_choices(row):
    options = []
    for choice in ['A', 'B', 'C', 'D', 'E']:
        # Check for NaN or blank values
        if pd.notna(row[choice]) and row[choice] != '':
            options.append(f"{choice}. {row[choice]}")
    
    # Construct the prompt with few-shot examples
    choices = "\n".join(options)
    
    return choices

In [20]:
from sklearn.model_selection import train_test_split
# Load the Excel file
file_path = '/mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/LTO_EXAM.csv'
df = pd.read_csv(file_path)
df['Choices'] = df.apply(generate_choices, axis=1)
# Split the data into test (80%) and holdout validation (20%)
test_df, holdout_df = train_test_split(df, test_size=0.8, random_state=42)

# Display the first few rows of each set
print("Testing Data:")
display(test_df.head())

print("\nHoldout Validation Data:")
display(holdout_df.head())

test_df.to_csv('/mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/test_data.csv', index=False)
holdout_df.to_csv('/mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/holdout_data.csv', index=False)

Testing Data:


Unnamed: 0,Question,A,B,C,D,E,Answer,Choices
59,"When driving on the highway at night, you shou...",another driver dims his lights,blinded by the headlights of an approaching ve...,all of the above,,,C,A. another driver dims his lights\nB. blinded ...
63,The safest thing to do even if you have the ri...,don't force your rights,horn,force your rights,,,A,A. don't force your rights\nB. horn\nC. force ...
78,It shall mean that the LEO has reasonable grou...,improbable cause,probable cause,likely cause,,,B,A. improbable cause\nB. probable cause\nC. lik...
37,"You are preparing to exit an expressway, when ...",Immediately before entering the declaration lane,immediately upon entering the declaration lane,immediately upon spotting the declaration lane,,,B,A. Immediately before entering the declaration...
29,How close should another car be before you dim...,150 meter,100 meter,200 meters,,,A,A. 150 meter\nB. 100 meter\nC. 200 meters



Holdout Validation Data:


Unnamed: 0,Question,A,B,C,D,E,Answer,Choices
40,What light shall be used when vehicles are par...,Headlight,Parking lights or lower-beam headlights,Signal lights,,,B,A. Headlight\nB. Parking lights or lower-beam ...
22,To have one's driver's license suspended means...,have it revalidated by the LTO,have it taken away premanently by the LTO,have it taken temporarily by the LTO,,,C,A. have it revalidated by the LTO\nB. have it ...
55,"On a two-lane road, overtaking is only allowed...",left lane,both right and left lane,right lane,,,A,A. left lane\nB. both right and left lane\nC. ...
88,The driver is using a motor vehicle in committ...,revokes and will pay a fine,confiscated and will pay fine,suspended and will pay fine,,,A,A. revokes and will pay a fine\nB. confiscated...
0,What should you do in case your vehicle breaks...,Open your trunk and hood,Stand on the expressway and flag down passing ...,Call for help using a mobile phone or an expre...,Park as far to the right as possible,Put your hazard warning light on,"A, C, D, E",A. Open your trunk and hood\nB. Stand on the e...


In [21]:
df = test_df.copy()
df["AI"] = np.nan
ai_answer = []

for i in tqdm(range(len(df))):
    answ = gen_query(df["Question"].iloc[i], df["Choices"].iloc[i], top_k=15, client=client, mode='hybrid', model="llama3.1:8b")
    ai_answer.append(answ[0])  # Get the first answer

df["AI"] = ai_answer
print("Updated Testing Data with AI Answers:")
display(df.head())

  0%|          | 0/18 [00:00<?, ?it/s]

using Hybrid
Query: When driving on the highway at night, you should use low beam headlights (dim lights) when:
Transformed: ['Retrieve safety guidelines for nighttime highway driving with specific information on using low beam headlights in certain situations.']
------------------------------------------------------------


  6%|▌         | 1/18 [03:33<1:00:34, 213.82s/it]

using Hybrid
Query: The safest thing to do even if you have the rights of using the road is:
Transformed: ['Retrieve guidelines for safe driving practices and provide advice on responsible road use.']
------------------------------------------------------------


 11%|█         | 2/18 [07:56<1:04:41, 242.59s/it]

using Hybrid
Query: It shall mean that the LEO has reasonable ground to velieve that the person driving the motor vehicle is under the influence of alcohol, dangerous drugs and/or other similar substanes upon personally witnessing a traffic offense committed.
Transformed: ['Retrieve laws and regulations regarding impaired driving in [country/region] and provide examples of scenarios where an officer has reasonable grounds to believe a driver is intoxicated.']
------------------------------------------------------------


 17%|█▋        | 3/18 [14:44<1:19:34, 318.27s/it]

using Hybrid
Query: You are preparing to exit an expressway, when should you start reducing speed?
Transformed: ['Retrieve information on safe driving practices and guidelines for exiting expressways, with a focus on when to initiate speed reduction.']
------------------------------------------------------------


 22%|██▏       | 4/18 [18:56<1:08:07, 291.95s/it]

using Hybrid
Query: How close should another car be before you dim your headlights?
Transformed: ['Retrieve information on safe driving distances between vehicles and formulate guidelines for adjusting headlights based on proximity.']
------------------------------------------------------------


 28%|██▊       | 5/18 [23:16<1:00:44, 280.38s/it]

using Hybrid
Query: What will happen when your front tire blows out?
Transformed: ['Retrieve information on vehicle safety, tire blowout consequences, and driving procedures in emergency situations.']
------------------------------------------------------------


 33%|███▎      | 6/18 [27:59<56:14, 281.17s/it]  

using Hybrid
Query: Parking is considered as a violation when a motor vehicle:
Transformed: ['Retrieve information on parking regulations and penalties for violations, specifically focusing on when a motor vehicle is considered in violation of parking rules.']
------------------------------------------------------------


 39%|███▉      | 7/18 [31:17<46:35, 254.16s/it]

using Hybrid
Query: To avoid suspension or revocation, how many days must a driver with an apprehended license settle his case with LTO?
Transformed: ['Retrieve information on license suspension or revocation procedures by the Land Transportation Office (LTO) and provide details on the time frame for settling a case related to an apprehended license.']
------------------------------------------------------------


 44%|████▍     | 8/18 [34:34<39:20, 236.05s/it]

using Hybrid
Query: What should you do when an ambulance comes up behind you flashing red lights and/or sounding its siren?
Transformed: ['Retrieve information on safe driving practices around emergency vehicles and provide specific guidance on how to respond when encountering an ambulance with flashing red lights and/or a siren.']
------------------------------------------------------------


 50%|█████     | 9/18 [38:28<35:18, 235.44s/it]

using Hybrid
Query: To aobtain one's driver's license, one must be at least:
Transformed: ["Retrieve age requirements for obtaining a driver's license and specify the minimum age."]
------------------------------------------------------------


 56%|█████▌    | 10/18 [45:59<40:14, 301.82s/it]

using Hybrid
Query: Driving with a fake license is prohibited and is punishable by:
Transformed: ['Retrieve laws regarding driving with a fake ID and describe penalties for violators.']
------------------------------------------------------------


 61%|██████    | 11/18 [49:34<32:06, 275.15s/it]

using Hybrid
Query: When do you have a complete/full stop
Transformed: ['Retrieve information on punctuation rules regarding complete stops and provide examples of when to use them.']
------------------------------------------------------------


 67%|██████▋   | 12/18 [54:16<27:44, 277.49s/it]

using Hybrid
Query: You were apprehended because you were engaged in car racing while driving in a super highway, what traffic violation did you commit
Transformed: ['Retrieve information on traffic laws related to high-speed driving and reckless endangerment, then explain which specific violation is applicable to this scenario.']
------------------------------------------------------------


 72%|███████▏  | 13/18 [56:40<19:44, 236.93s/it]

using Hybrid
Query: According to the Philippine Clean Air Act of 1999 (R.A. No 8749)
Transformed: ['Retrieve information on the Philippine Clean Air Act of 1999 (R.A. No 8749) and provide key highlights or relevant sections related to air quality regulations.']
------------------------------------------------------------


 78%|███████▊  | 14/18 [1:02:12<17:42, 265.74s/it]

using Hybrid
Query: Keeping one's distance lessens the risk of accident. One good rule is to leave a car length or:
Transformed: ['Retrieve information on safe driving distances and guidelines for maintaining a safe gap between vehicles, then summarize the importance of leaving a specific amount of space.']
------------------------------------------------------------


 83%|████████▎ | 15/18 [1:10:10<16:28, 329.65s/it]

using Hybrid
Query: What is the meaning of a blinking yellow traffic light?
Transformed: ['Retrieve information on traffic light signals with a focus on the specific interpretation of a blinking yellow light.']
------------------------------------------------------------


 89%|████████▉ | 16/18 [1:13:45<09:49, 294.98s/it]

using Hybrid
Query: What habit will help you prevent getting a fixed-stare and resist distraction?
Transformed: ['Retrieve strategies for improving focus and concentration with examples of habits that can help prevent distractions and fixed stares.']
------------------------------------------------------------


 94%|█████████▍| 17/18 [1:20:47<05:33, 333.27s/it]

using Hybrid
Query: You may never park:
Transformed: ['Retrieve information about parking restrictions and obstacles in urban environments.']
------------------------------------------------------------


100%|██████████| 18/18 [1:26:16<00:00, 287.60s/it]

Updated Testing Data with AI Answers:





Unnamed: 0,Question,A,B,C,D,E,Answer,Choices,AI
59,"When driving on the highway at night, you shou...",another driver dims his lights,blinded by the headlights of an approaching ve...,all of the above,,,C,A. another driver dims his lights\nB. blinded ...,"[A, B]"
63,The safest thing to do even if you have the ri...,don't force your rights,horn,force your rights,,,A,A. don't force your rights\nB. horn\nC. force ...,[ A ]
78,It shall mean that the LEO has reasonable grou...,improbable cause,probable cause,likely cause,,,B,A. improbable cause\nB. probable cause\nC. lik...,[B]
37,"You are preparing to exit an expressway, when ...",Immediately before entering the declaration lane,immediately upon entering the declaration lane,immediately upon spotting the declaration lane,,,B,A. Immediately before entering the declaration...,[BC]
29,How close should another car be before you dim...,150 meter,100 meter,200 meters,,,A,A. 150 meter\nB. 100 meter\nC. 200 meters,[B]


In [22]:
import re


def process_answers(answers):
    formatted_answers = []
    
    for a in answers:
        
        matches = re.findall(r'\[?\s*([A-E](?:\s*,\s*[A-E])*)\s*\]?', str(a)) # Extract answers like [A, C, D] or [A] or [B, D]
        answers = []
        for match in matches:
            answers.extend(re.split(r'\s*,\s*', match))  # Split by comma and remove spaces
        unique_sorted_answers = sorted(set(answers), key=lambda x: ['A', 'B', 'C', 'D', 'E'].index(x))
        if not unique_sorted_answers:
            formatted_answers.append(None)
        else:
            formatted_answers.append(unique_sorted_answers)
    return formatted_answers

df_results = df[["Question", "Answer", "AI"]].copy()
df_results['Answer'] = df_results['Answer'].apply(lambda x: x.split(', '))
df_results['AI'] = process_answers(df_results["AI"])
df_results['Answer'] = process_answers(df_results["Answer"])



def calculate_scores(df):
    scores = []
    for index, row in df.iterrows():
        correct_answers = set(row['Answer'] if row['Answer'] is not None else [])
        ai_answers = set(row['AI'] if row['AI'] is not None else [])
        if ai_answers == correct_answers:
            score = 1.0
        else:
            score = 0.0
        scores.append(score)
    
    df['Score'] = scores
    accuracy = scores.count(1.0) / len(scores)
    print(f'Final Score: {scores.count(1.0):.2f}/{len(scores):.2f}')
    print(f'Accuracy: {accuracy:.2f}%')
    return df

# Apply the scoring function
scored_df = calculate_scores(df_results)

# Display the dataframe to verify the results
display(scored_df[['Question', 'Answer', 'AI', 'Score']])

Final Score: 7.00/18.00
Accuracy: 0.39%


Unnamed: 0,Question,Answer,AI,Score
59,"When driving on the highway at night, you shou...",[C],"[A, B]",0.0
63,The safest thing to do even if you have the ri...,[A],[A],1.0
78,It shall mean that the LEO has reasonable grou...,[B],[B],1.0
37,"You are preparing to exit an expressway, when ...",[B],"[B, C]",0.0
29,How close should another car be before you dim...,[A],[B],0.0
1,What will happen when your front tire blows out?,[C],[C],1.0
52,Parking is considered as a violation when a mo...,[A],"[A, B, C]",0.0
21,"To avoid suspension or revocation, how many da...",[A],,0.0
2,What should you do when an ambulance comes up ...,[D],[D],1.0
23,"To aobtain one's driver's license, one must be...",[B],"[A, C]",0.0


# Visualization

In [23]:
# Gradio interface with dynamic model, mode selection, and top_k slider
iface = gr.Interface(
    fn=lambda query, top_k, model, mode: gen_query(
        query=query,
        top_k=top_k,
        client=client,
        mode=mode,
        model=model
    ),
    inputs=[
        gr.Textbox(label="Enter your query"),
        gr.Slider(1, 20, value=5, step=1, label="Top K Results"),  # Slider for top_k (1 to 20)
        gr.Dropdown(
            choices=["llama3.3", "llama3.2:latest", "llama3.1:8b"],
            value="llama3.3",
            label="Select Model"
        ),
        gr.Dropdown(
            choices=["hybrid", "dense", "sparse"],
            value="hybrid",
            label="Select Retrieval Mode"
        )
    ],
    outputs=[
        gr.Textbox(label="Answer", lines=6),
        gr.Textbox(label="References", lines=10),
    ],
    title="RAG System with LLaMA Models",
    description=(
        "Ask questions and get answers with references from PDF documents. "
        "Adjust Top-K to control the number of retrieved chunks. "
        "Choose different models and retrieval modes for customization."
    )
)

# Launch the Gradio interface
iface.launch()

* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




In [24]:
# Handle closing behavior
def on_close():
    iface.close()
on_close()

Closing server running on port: 7860


# TESTING

In [25]:
# Few-shot examples
FEW_SHOTS = """

Example 2:
Question: What will happen when your front tire blows out?
A. The back end will sway towards the side of the blowout
B. The back end will sway away from the blowout
C. The front end will pull towards the side of the blowout
D. The front end will pull to the opposite side of the blowout
Please answer only in letters
Correct Answer: [C]

Example 3:
Question: What should you do when an ambulance comes up behind you flashing red lights and/or sounding its siren?
A. Stop as soon as you can
B. Maintain your speed, let the ambulance driver will find a way around you
C. Speed up so that you don't hold the ambulance
D. Pull over to the right and slow down or even stop if necessary
Please answer only in letters
Correct Answer: [D]
"""

def paraphrase_query(query,client,model):

    few_shot_examples = '''
    Example 1:
    Transform this into a Retrieval Augmented Generation query: What is AI?
    Output: "Retrieve information on artificial intelligence and provide a concise definition."

    Example 2:
    Transform this into a Retrieval Augmented Generation query: How does reinforcement learning work?
    Output: "Retrieve explanations of reinforcement learning algorithms with examples."

    Example 3:
    Transform this into a Retrieval Augmented Generation query: Explain neural networks.
    Output: "Retrieve details on neural networks and explain their structure and function."

    Now, transform the following query and provide the response inside double quotes:
    '''

    full_prompt = few_shot_examples + f"\nQuery: {query}"

    response = client.generate(
        model=model,
        prompt=full_prompt
    )
    # Extract text within double quotes
    matches = re.findall(r'"([^"]*)"', response['response'].strip())

    print(f"Query: {query}")
    print(f"Transformed: {matches if matches else 'No match found'}")
    print('-' * 60)

    if matches:
        return matches[0]
    else:
        return query


def cot_qa_format(query, context, client, model):
    return f"""
    Let's break down the problem step by step:
    1. Identify the key components of the following question: {paraphrase_query(query, client, model)}
    2. Analyze relevant facts and eliminate unnecessary details.
    3. Provide the best possible answer based on the reasoning process.

    Here is relevant context about the query:
    {context}
    """

def generate_response_with_cot(summaries, query, choices, client, model="llama3.1:8b", cot_model="llama3.1:8b", para_model="llama3.1:8b"):
    """
    Generate a response using Chain of Thought (CoT) reasoning by passing intermediate
    results through successive steps to build a synthesized final answer.
    """

    # 1. Combine summaries into context block
    context = "\n".join(summaries)

    try:
        # 2. Step 1: Generate initial thought process (CoT step 1)
        cot_query = cot_qa_format(query, context, client, para_model)
        print(cot_query)
        response1 = client.generate(model=cot_model, prompt=cot_query)
        reasoning1 = response1['response'].strip()
        print(reasoning1)

        # 3. Step 2: Refine based on first result
        cot_query2 = f"""
        We reasoned that: {reasoning1}
        Can you expand on this by addressing additional details or gaps?
        {query}
        """
        response2 = client.generate(model=cot_model, prompt=cot_query2)
        reasoning2 = response2['response'].strip()
        print(reasoning2)

        # 5. Aggregate all steps for the final decision
        final_query = f"""
        Based on the results:
        - Step 1: {reasoning1}
        - Step 2: {reasoning2}

        Now, synthesize these results and answer this question:
        {query + " Answer which of the choices match the synthesized results."}
        Choices: {choices}

        Please answer only in letters and put them inside brackets '[]'. If the question contains the statement 'Check all that apply', use a comma separator for multiple answers.
        """
        print(final_query)
        
        # 6. Final model call to produce the overall response
        final_response = client.generate(model=model, prompt=final_query)
        print(final_response['response'])
        
        return final_response['response'].strip()
    
    except Exception as e:
        print(f"Error during CoT generation: {str(e)}")
        return "Error in generating response."


    except Exception as e:
        print(f"Error processing prompt: {e}")
        return None



In [26]:
def gen_query(query, choices, top_k, client, mode='dense', summary=False, model="llama3.2", chunks_only=False):
    response = client.embeddings(prompt=query, model="mxbai-embed-large")
    query_embedding = response["embedding"]

    top_k_docs = faiss_retriever._retrieve(query_embedding, top_k=top_k)

    bm25_retriever = BM25Retriever.from_defaults(
    nodes=nodes,
    similarity_top_k=top_k,
    stemmer=Stemmer.Stemmer("english"),
    language="english",
    )
    retrieved_nodes = bm25_retriever.retrieve(query)

    results = {'faiss': top_k_docs, 'bm25':retrieved_nodes}
    ranked_results = hybrid_embedding(results, top_k=top_k)

    if mode == 'dense':
        print('using FAISS')
        ans_nodes =top_k_docs
    elif mode == 'sparse':
        print('using BM25')
        ans_nodes = retrieved_nodes
    else:
        print('using Hybrid')
        ans_nodes = ranked_results

    parent_flag = True
    context = set([get_document_by_chunk_metadata(docs).text for docs in ans_nodes])
    if chunks_only:
        parent_flag = False
        print('using chunks only')
        context = [docs.node.text for docs in ans_nodes]
        
    if summary:
        print('using summaries')
        context_nodes = remove_duplicate_documents([get_document_by_chunk_metadata(docs) for docs in ans_nodes])

        if chunks_only:
            context_nodes=ans_nodes
        summaries = summarize_each_chunk(context_nodes, client, model='llama3.2:latest', query=query,parent=parent_flag)
        context = summaries

    answer = generate_response_with_cot(context, query, choices, client, model=model)

    # Format the references
    references = []
    for i, doc in enumerate(ranked_results[:top_k], start=1):
        metadata = doc.metadata
        source_info = f"Source {i}: {metadata['title']} (Page {metadata['page']}, Folder: {metadata['folder']})"
        references.append(source_info)

    return answer, "\n".join(references)

In [27]:
df = test_df.copy()
df["AI"] = np.nan
ai_answer = []

for i in tqdm(range(len(df))):
    answ = gen_query(df["Question"].iloc[i], df["Choices"].iloc[i], top_k=15, client=client, mode='hybrid', model="llama3.1:8b")
    ai_answer.append(answ[0])  # Get the first answer
    if i==1:
        break

df["AI"] = ai_answer
print("Updated Testing Data with AI Answers:")
display(df.head())

  0%|          | 0/18 [00:00<?, ?it/s]

using Hybrid
Query: When driving on the highway at night, you should use low beam headlights (dim lights) when:
Transformed: ['Retrieve information on safe driving practices for nighttime driving on highways and provide guidelines on when to use low beam headlights.']
------------------------------------------------------------

    Let's break down the problem step by step:
    1. Identify the key components of the following question: Retrieve information on safe driving practices for nighttime driving on highways and provide guidelines on when to use low beam headlights.
    2. Analyze relevant facts and eliminate unnecessary details.
    3. Provide the best possible answer based on the reasoning process.

    Here is relevant context about the query:
    ROAD AND TRAFFIC RULES
Restrictions on Overtaking and Passing. –
(b) The driver of a vehicle shall not overtake or pass another vehicle proceeding in the same direction, 
when approaching the crest of a grade, not upon a curve in th

  6%|▌         | 1/18 [05:14<1:29:13, 314.92s/it]

[A, B]
using Hybrid
Query: The safest thing to do even if you have the rights of using the road is:
Transformed: ['Retrieve information on safe driving practices and provide a concise list of precautions for drivers.']
------------------------------------------------------------

    Let's break down the problem step by step:
    1. Identify the key components of the following question: Retrieve information on safe driving practices and provide a concise list of precautions for drivers.
    2. Analyze relevant facts and eliminate unnecessary details.
    3. Provide the best possible answer based on the reasoning process.

    Here is relevant context about the query:
    143 
 
Dealing with Emergency Situations 
 
SKIDDING 
 
Most skids happen when a driver tries to turn quickly or stop 
suddenly on a downhill, curb or slippery pavement (wet, gravel, 
sand on the highway). 
 
1. Do not use your brake, if 
possible. 
2. Take off your foot on the gas 
pedal. 
3. Identify your escape rout

  6%|▌         | 1/18 [10:43<3:02:18, 643.42s/it]

[A, C]





ValueError: Length of values (2) does not match length of index (18)