In [66]:
import os
import fitz
import re
import json

from ollama import Client
import faiss
import pandas as pd
import numpy as np
import Stemmer
from tqdm import tqdm
import gradio as gr

from llama_index.core import Document
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.retrievers import BaseRetriever, QueryFusionRetriever
from llama_index.core.schema import TextNode, NodeWithScore
from llama_index.retrievers.bm25 import BM25Retriever

# Connect to Ollama Server

In [67]:
client = Client(
  host='http://localhost:11434',
)

# Ingestion

In [68]:
# Path to the dataset folder
DATASET_PATH = 'extracted_text.json'

def get_text_and_metadata(input_path):
    """Load text and metadata from a file and perform chunking."""
    with open(input_path, "r", encoding="utf-8") as f:
        extracted_data = json.load(f)

    texts = []
    metadata = []

    for entry in extracted_data:
        text = entry["text"]
        source_metadata = {
            "source": entry["source"],
            "folder": entry["folder"],
            "file_name": entry["file_name"],
            "page": entry["page"],
            "title": entry["title"],
            "url": entry["url"]
        }
        texts.append(text)
        metadata.append(source_metadata)

    return texts, metadata

In [69]:
docs, metadatas = get_text_and_metadata(DATASET_PATH)

In [70]:
documents = [Document(text=docs[t], metadata=metadatas[t]) for t in range(len(docs))]
splitter = TokenTextSplitter(
    chunk_size=512,
    chunk_overlap=20,
    separator=" ",
)
nodes = splitter.get_nodes_from_documents(documents)

# Embedding and Retrieval

## Dense via FAISS

In [71]:
def generate_embeddings(nodes, client, model):
    # Generate embeddings for documents using Ollama
    for doc in tqdm(nodes):
        response = client.embeddings(prompt=doc.text, model=model)
        doc.embedding = response["embedding"]
    return nodes

In [72]:
class FaissIndexer:
    """
    Faiss-based indexer for efficient similarity search using inner-product (cosine) similarity.

    This class handles the creation and management of a FAISS index from node embeddings.
    
    :ivar faiss_index: The FAISS index for storing and querying embeddings.
    :vartype faiss_index: faiss.IndexFlatIP
    :ivar embedding_dim: Dimensionality of the embeddings.
    :vartype embedding_dim: int
    """

    def __init__(self):
        """
        Initialize the FaissIndexer class.

        :ivar faiss_index: The FAISS index, initialized as None.
        :ivar embedding_dim: The dimension of embeddings, initialized as None.
        """
        self.faiss_index = None
        self.embedding_dim = None

    def normalize_embeddings(self, embeddings):
        """
        Normalize embeddings to have unit L2 norm.

        :param embeddings: Array of embeddings to normalize.
        :type embeddings: np.ndarray
        :return: Normalized embeddings.
        :rtype: np.ndarray
        """
        return embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

    def build_index(self, nodes):
        """
        Build the FAISS index from a list of nodes containing embeddings.

        :param nodes: List of nodes, where each node contains an `embedding` attribute.
        :type nodes: list
        :raises ValueError: If the nodes list is empty or embeddings are inconsistent.
        """
        if not nodes:
            raise ValueError("Nodes list cannot be empty.")
        
        embeddings = np.array([np.array(node.embedding) for node in nodes])
        normalized_embeddings = self.normalize_embeddings(embeddings)

        self.embedding_dim = normalized_embeddings[0].shape[0]
        self.faiss_index = faiss.IndexFlatIP(self.embedding_dim)  # Inner-product similarity
        self.faiss_index.add(normalized_embeddings)

    def get_index(self):
        """
        Get the FAISS index instance.

        :return: The FAISS index used for similarity search.
        :rtype: faiss.IndexFlatIP
        :raises ValueError: If the index has not been built.
        """
        if self.faiss_index is None:
            raise ValueError("Index has not been built yet. Call 'build_index' first.")
        return self.faiss_index

In [73]:
class FAISSVectorStoreRetriever(BaseRetriever):
    def __init__(self, faiss_index, documents):
        """
        Initialize the FAISS retriever.
        :param faiss_index: The FAISS index containing precomputed embeddings.
        :param documents: List of document chunks.
        :param embeddings: Precomputed embeddings corresponding to the document chunks.
        """
        self.faiss_index = faiss_index
        self.documents = documents

    def _retrieve(self, query_embedding, top_k=5):
        """
        Retrieve the top-k nearest neighbors using the FAISS index.
        :param query_embedding: The embedding of the query.
        :param top_k: Number of top results to retrieve.
        """

        norm_query_embedding = np.array([query_embedding])
        norm_query_embedding /= np.linalg.norm(norm_query_embedding, axis=1, keepdims=True)

        distances, indices = self.faiss_index.search(norm_query_embedding, top_k)
        retrieved_docs = [
            NodeWithScore(node=self.documents[idx], score=1 - dist)
            for idx, dist in zip(indices[0], distances[0])
            if idx != -1
        ]
        return retrieved_docs

In [74]:
# embedding
nodes_embed = generate_embeddings(nodes, client, "mxbai-embed-large")

100%|██████████| 12302/12302 [16:52<00:00, 12.15it/s]  


In [75]:
#indexing
index = FaissIndexer()
index.build_index(nodes_embed)
faiss_index = index.get_index()

faiss_retriever = FAISSVectorStoreRetriever(faiss_index=faiss_index,documents=nodes_embed)

## Sparse Embedding via BM25

In [76]:
# bm25_retriever = BM25Retriever.from_defaults(
#     nodes=nodes,
#     similarity_top_k=5,
#     stemmer=Stemmer.Stemmer("english"),
#     language="english",
# )

## Hybrid Retrieval via Reciprocal Rank

In [77]:
def hybrid_embedding(results: dict, top_k: int):
    x = QueryFusionRetriever
    ranked_results = QueryFusionRetriever._reciprocal_rerank_fusion(x, results)
    return ranked_results[:top_k]

# Post Retrieval

## Summarization

In [78]:
def summarize_each_chunk(nodes, client, query, model="llama3.2", parent=False):
    if parent:
        chunks = [doc.text for doc in nodes]
    else:
        chunks = [doc.node.text for doc in nodes]
    summaries = []
    
    for i, chunk in enumerate(chunks):
        prompt = f"""
        Summarize the following text in one concise paragraph, focusing on key points relevant to the query: "{query}".
        
        - Emphasize information directly related to the query.
        - Exclude unrelated, redundant, or speculative details.
        - Do NOT introduce new information or answer the query itself. 
        
        Text:
        {chunk}
        
        Summary:
        """
        
        response = client.generate(model=model, prompt=prompt)
        summary = response['response'].strip()
        summaries.append(summary)

    return summaries

# Generation

In [79]:
def generate_response_with_notice(summaries, query, client, model="llama3.2"):
    # Combine summaries into context block
    context = "\n".join(summaries)
    
    # Create prompt to answer based on summarized text
    prompt = f"""
    Use the following summarized information to answer the query accurately and concisely. 
    DO NOT USE BACKGROUND KNOWLEDGE OUTSIDE THE CONTEXT PROVIDED.
    If the information is not sufficient to fully address the query, respond ONLY with:
    "The available information is insufficient to provide a complete answer to this query."

    Summarized Context:
    {context}
    
    Query:
    {query}
    
    Response:
    """
    
    # Send the prompt to Ollama
    response = client.generate(
        model=model,
        prompt=prompt
    )
    
    return response['response'].strip()

# Querying

## Query Transforms

## Query Generation

In [80]:
docstore = {}

# Store documents using full metadata as the key
for doc in documents:
    key = tuple(doc.metadata.items())  # Convert metadata to tuple for hashable key
    docstore[key] = doc

In [81]:
def get_document_by_chunk_metadata(chunk_node):
    # Convert chunk metadata to tuple for matching
    metadata_key = tuple(chunk_node.metadata.items())

    # Retrieve document from docstore
    document = docstore.get(metadata_key)
    return document

In [82]:
def remove_duplicate_documents(doc_list):
    seen_ids = set()
    unique_docs = []

    for doc in doc_list:
        if doc.doc_id not in seen_ids:
            seen_ids.add(doc.doc_id)
            unique_docs.append(doc)

    return unique_docs

In [83]:
def gen_query(query, top_k, client, mode='dense', summary=False, model="llama3.3", chunks_only=False):
    response = client.embeddings(prompt=query, model="mxbai-embed-large")
    query_embedding = response["embedding"]

    top_k_docs = faiss_retriever._retrieve(query_embedding, top_k=top_k)

    bm25_retriever = BM25Retriever.from_defaults(
    nodes=nodes,
    similarity_top_k=top_k,
    stemmer=Stemmer.Stemmer("english"),
    language="english",
    )
    retrieved_nodes = bm25_retriever.retrieve(query)

    results = {'faiss': top_k_docs, 'bm25':retrieved_nodes}
    ranked_results = hybrid_embedding(results, top_k=top_k)

    if mode == 'dense':
        print('using FAISS')
        ans_nodes =top_k_docs
    elif mode == 'sparse':
        print('using BM25')
        ans_nodes = retrieved_nodes
    else:
        print('using Hybrid')
        ans_nodes = ranked_results

    parent_flag = True
    context = set([get_document_by_chunk_metadata(docs).text for docs in ans_nodes])
    if chunks_only:
        parent_flag = False
        print('using chunks only')
        context = [docs.node.text for docs in ans_nodes]
        
    if summary:
        print('using summaries')
        context_nodes = remove_duplicate_documents([get_document_by_chunk_metadata(docs) for docs in ans_nodes])

        if chunks_only:
            context_nodes=ans_nodes
        summaries = summarize_each_chunk(context_nodes, client, model='llama3.2:latest', query=query,parent=parent_flag)
        context = summaries

    answer = generate_response_with_notice(context, query, client, model=model)

    # Format the references
    references = []
    for i, doc in enumerate(ranked_results[:top_k], start=1):
        metadata = doc.metadata
        source_info = f"Source {i}: {metadata['title']} (Page {metadata['page']}, Folder: {metadata['folder']})"
        references.append(source_info)

    return answer, "\n".join(references)

# Evaluation

In [84]:
# Generate prompts dynamically
def generate_prompt(row):
    options = []
    for choice in ['A', 'B', 'C', 'D', 'E']:
        # Check for NaN or blank values
        if pd.notna(row[choice]) and row[choice] != '':
            options.append(f"{choice}. {row[choice]}")
    
    # Construct the prompt with few-shot examples
    prompt = f"\nActual Question: {row['Question']}\n" + "\n".join(options)
    prompt += "\nPlease answer only in letters and put them inside a bracket '[]'. If the question contains the statement 'Check all that apply' then add comma separator if there are multiple answers ONLY IF ALLOWED."
    
    return prompt

In [85]:
from sklearn.model_selection import train_test_split
# Load the Excel file
file_path = '/mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/LTO_EXAM.csv'
df = pd.read_csv(file_path)
df['Prompt'] = df.apply(generate_prompt, axis=1)
# Split the data into test (80%) and holdout validation (20%)
test_df, holdout_df = train_test_split(df, test_size=0.33, random_state=42)

# Display the first few rows of each set
print("Testing Data:")
display(test_df.head())

print("\nHoldout Validation Data:")
display(holdout_df.head())

test_df.to_csv('/mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/test_data.csv', index=False)
holdout_df.to_csv('/mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/holdout_data.csv', index=False)

Testing Data:


Unnamed: 0,Question,A,B,C,D,E,Answer,Prompt
69,"If the driver is turning left, he must: (U-tur...",have the right of way,do so slowly with caution,yield to approaching cars,,,C,\nActual Question: If the driver is turning le...
5,"When a vehicle starts to skid, what should the...",Immediately step on the brakes,Hold firmly on to the wheel while slowing down...,Turn the wheels tp the opposite the direction ...,,,B,\nActual Question: When a vehicle starts to sk...
42,"If you are parking uphill without a curb, turn...",edge of the street,other side of the street,middle of the street,,,A,\nActual Question: If you are parking uphill w...
47,"When parking downhill, you should turn your fr...",toward the curb of the sidewalk,away from the curb,any direction will do,,,A,"\nActual Question: When parking downhill, you ..."
16,Which of the following is the maximum speed li...,60 kph,80 kph,100 kph,,,C,\nActual Question: Which of the following is t...



Holdout Validation Data:


Unnamed: 0,Question,A,B,C,D,E,Answer,Prompt
40,What light shall be used when vehicles are par...,Headlight,Parking lights or lower-beam headlights,Signal lights,,,B,\nActual Question: What light shall be used wh...
22,To have one's driver's license suspended means...,have it revalidated by the LTO,have it taken away premanently by the LTO,have it taken temporarily by the LTO,,,C,\nActual Question: To have one's driver's lice...
55,"On a two-lane road, overtaking is only allowed...",left lane,both right and left lane,right lane,,,A,"\nActual Question: On a two-lane road, overtak..."
88,The driver is using a motor vehicle in committ...,revokes and will pay a fine,confiscated and will pay fine,suspended and will pay fine,,,A,\nActual Question: The driver is using a motor...
0,What should you do in case your vehicle breaks...,Open your trunk and hood,Stand on the expressway and flag down passing ...,Call for help using a mobile phone or an expre...,Park as far to the right as possible,Put your hazard warning light on,"A, C, D, E",\nActual Question: What should you do in case ...


In [86]:
df = test_df.copy()
df["AI"] = np.nan
ai_answer = []

for i in tqdm(range(len(df))):
    answ = gen_query(df["Question"].iloc[i], top_k=20, client=client, mode='hybrid', model="llama3.3")
    ai_answer.append(answ[0])  # Get the first answer

df["AI"] = ai_answer
print("Updated Testing Data with AI Answers:")
display(df.head())

  0%|          | 0/60 [00:00<?, ?it/s]

using Hybrid


  2%|▏         | 1/60 [00:33<32:50, 33.40s/it]

using Hybrid


  3%|▎         | 2/60 [00:49<22:36, 23.38s/it]

using Hybrid


  5%|▌         | 3/60 [00:59<16:23, 17.26s/it]

using Hybrid


  7%|▋         | 4/60 [01:16<15:44, 16.87s/it]

using Hybrid


  8%|▊         | 5/60 [01:25<13:10, 14.37s/it]

using Hybrid


 10%|█         | 6/60 [01:35<11:20, 12.61s/it]

using Hybrid


 12%|█▏        | 7/60 [02:11<18:02, 20.42s/it]

using Hybrid


 13%|█▎        | 8/60 [02:46<21:45, 25.10s/it]

using Hybrid


 15%|█▌        | 9/60 [03:07<20:15, 23.83s/it]

using Hybrid


 17%|█▋        | 10/60 [03:20<16:56, 20.33s/it]

using Hybrid


 18%|█▊        | 11/60 [03:31<14:17, 17.50s/it]

using Hybrid


 20%|██        | 12/60 [03:54<15:15, 19.08s/it]

using Hybrid


 22%|██▏       | 13/60 [04:17<15:53, 20.28s/it]

using Hybrid


 23%|██▎       | 14/60 [04:40<16:13, 21.16s/it]

using Hybrid


 25%|██▌       | 15/60 [05:06<17:06, 22.80s/it]

using Hybrid


 27%|██▋       | 16/60 [05:22<15:13, 20.77s/it]

using Hybrid


 28%|██▊       | 17/60 [05:56<17:44, 24.75s/it]

using Hybrid


 30%|███       | 18/60 [06:42<21:44, 31.06s/it]

using Hybrid


 32%|███▏      | 19/60 [07:14<21:16, 31.14s/it]

using Hybrid


 33%|███▎      | 20/60 [07:27<17:14, 25.86s/it]

using Hybrid


 35%|███▌      | 21/60 [07:53<16:50, 25.91s/it]

using Hybrid


 37%|███▋      | 22/60 [08:21<16:41, 26.36s/it]

using Hybrid


 38%|███▊      | 23/60 [08:29<13:01, 21.12s/it]

using Hybrid


 40%|████      | 24/60 [08:47<11:58, 19.96s/it]

using Hybrid


 42%|████▏     | 25/60 [09:07<11:42, 20.07s/it]

using Hybrid


 43%|████▎     | 26/60 [09:26<11:16, 19.88s/it]

using Hybrid


 45%|████▌     | 27/60 [10:01<13:21, 24.29s/it]

using Hybrid


 47%|████▋     | 28/60 [10:13<11:01, 20.68s/it]

using Hybrid


 48%|████▊     | 29/60 [10:22<08:51, 17.16s/it]

using Hybrid


 50%|█████     | 30/60 [10:46<09:32, 19.07s/it]

using Hybrid


 52%|█████▏    | 31/60 [11:04<09:06, 18.85s/it]

using Hybrid


 53%|█████▎    | 32/60 [11:13<07:26, 15.96s/it]

using Hybrid


 55%|█████▌    | 33/60 [11:32<07:35, 16.87s/it]

using Hybrid


 57%|█████▋    | 34/60 [11:45<06:47, 15.67s/it]

using Hybrid


 58%|█████▊    | 35/60 [12:05<06:59, 16.78s/it]

using Hybrid


 60%|██████    | 36/60 [12:13<05:44, 14.34s/it]

using Hybrid


 62%|██████▏   | 37/60 [12:29<05:37, 14.69s/it]

using Hybrid


 63%|██████▎   | 38/60 [12:45<05:32, 15.09s/it]

using Hybrid


 65%|██████▌   | 39/60 [12:59<05:11, 14.85s/it]

using Hybrid


 67%|██████▋   | 40/60 [13:24<05:57, 17.86s/it]

using Hybrid


 68%|██████▊   | 41/60 [13:40<05:29, 17.37s/it]

using Hybrid


 70%|███████   | 42/60 [14:02<05:37, 18.74s/it]

using Hybrid


 72%|███████▏  | 43/60 [14:16<04:55, 17.38s/it]

using Hybrid


 73%|███████▎  | 44/60 [14:29<04:16, 16.00s/it]

using Hybrid


 75%|███████▌  | 45/60 [15:12<06:01, 24.13s/it]

using Hybrid


 77%|███████▋  | 46/60 [15:40<05:51, 25.13s/it]

using Hybrid


 78%|███████▊  | 47/60 [16:10<05:48, 26.81s/it]

using Hybrid


 80%|████████  | 48/60 [16:48<06:01, 30.09s/it]

using Hybrid


 82%|████████▏ | 49/60 [17:16<05:24, 29.54s/it]

using Hybrid


 83%|████████▎ | 50/60 [17:32<04:15, 25.53s/it]

using Hybrid


 85%|████████▌ | 51/60 [18:02<03:59, 26.66s/it]

using Hybrid


 87%|████████▋ | 52/60 [18:46<04:16, 32.03s/it]

using Hybrid


 88%|████████▊ | 53/60 [19:16<03:39, 31.32s/it]

using Hybrid


 90%|█████████ | 54/60 [19:46<03:05, 30.97s/it]

using Hybrid


 92%|█████████▏| 55/60 [20:10<02:23, 28.74s/it]

using Hybrid


 93%|█████████▎| 56/60 [20:41<01:57, 29.46s/it]

using Hybrid


 95%|█████████▌| 57/60 [20:52<01:11, 23.84s/it]

using Hybrid


 97%|█████████▋| 58/60 [21:08<00:43, 21.50s/it]

using Hybrid


 98%|█████████▊| 59/60 [21:22<00:19, 19.40s/it]

using Hybrid


100%|██████████| 60/60 [21:34<00:00, 21.57s/it]

Updated Testing Data with AI Answers:





Unnamed: 0,Question,A,B,C,D,E,Answer,Prompt,AI
69,"If the driver is turning left, he must: (U-tur...",have the right of way,do so slowly with caution,yield to approaching cars,,,C,\nActual Question: If the driver is turning le...,"When turning left, the driver must:\n\n1. Sign..."
5,"When a vehicle starts to skid, what should the...",Immediately step on the brakes,Hold firmly on to the wheel while slowing down...,Turn the wheels tp the opposite the direction ...,,,B,\nActual Question: When a vehicle starts to sk...,"When a vehicle starts to skid, the driver shou..."
42,"If you are parking uphill without a curb, turn...",edge of the street,other side of the street,middle of the street,,,A,\nActual Question: If you are parking uphill w...,The correct response is: \n\nthe right (or awa...
47,"When parking downhill, you should turn your fr...",toward the curb of the sidewalk,away from the curb,any direction will do,,,A,"\nActual Question: When parking downhill, you ...","When parking downhill, you should turn your fr..."
16,Which of the following is the maximum speed li...,60 kph,80 kph,100 kph,,,C,\nActual Question: Which of the following is t...,The maximum speed limit on an expressway for c...


In [87]:
import re


def process_answers(answers):
    formatted_answers = []
    
    for a in answers:
        
        matches = re.findall(r'\[?\s*([A-E](?:\s*,\s*[A-E])*)\s*\]?', str(a)) # Extract answers like [A, C, D] or [A] or [B, D]
        answers = []
        for match in matches:
            answers.extend(re.split(r'\s*,\s*', match))  # Split by comma and remove spaces
        unique_sorted_answers = sorted(set(answers), key=lambda x: ['A', 'B', 'C', 'D', 'E'].index(x))
        if not unique_sorted_answers:
            formatted_answers.append(None)
        else:
            formatted_answers.append(unique_sorted_answers)
    return formatted_answers

df_results = df[["Question", "Answer", "AI"]].copy()
df_results['Answer'] = df_results['Answer'].apply(lambda x: x.split(', '))
df_results['AI'] = process_answers(df_results["AI"])
df_results['Answer'] = process_answers(df_results["Answer"])



def calculate_scores(df):
    scores = []
    for index, row in df.iterrows():
        correct_answers = set(row['Answer'] if row['Answer'] is not None else [])
        ai_answers = set(row['AI'] if row['AI'] is not None else [])
        if ai_answers == correct_answers:
            score = 1.0
        else:
            score = 0.0
        scores.append(score)
    
    df['Score'] = scores
    accuracy = scores.count(1.0) / len(scores)
    print(f'Final Score: {scores.count(1.0):.2f}/{len(scores):.2f}')
    print(f'Accuracy: {accuracy:.2f}%')
    return df

# Apply the scoring function
scored_df = calculate_scores(df_results)

# Display the dataframe to verify the results
display(scored_df[['Question', 'Answer', 'AI', 'Score']])

Final Score: 6.00/60.00
Accuracy: 0.10%


Unnamed: 0,Question,Answer,AI,Score
69,"If the driver is turning left, he must: (U-tur...",[C],,0.0
5,"When a vehicle starts to skid, what should the...",[B],,0.0
42,"If you are parking uphill without a curb, turn...",[A],,0.0
47,"When parking downhill, you should turn your fr...",[A],[A],1.0
16,Which of the following is the maximum speed li...,[C],,0.0
45,Never park or stop at the side of the road wit...,[C],,0.0
34,While driving with maximum speed and you have ...,[C],"[A, B, C, D]",0.0
7,What will happen when your rear tire blows out?,[B],"[A, E]",0.0
79,Operating a public utility vehicle equipped wi...,[A],"[A, B, C]",0.0
27,Your speed while driving at night should keep on:,[A],,0.0


# Visualization

In [88]:
# Gradio interface with dynamic model, mode selection, and top_k slider
iface = gr.Interface(
    fn=lambda query, top_k, model, mode: gen_query(
        query=query,
        top_k=top_k,
        client=client,
        mode=mode,
        model=model
    ),
    inputs=[
        gr.Textbox(label="Enter your query"),
        gr.Slider(1, 20, value=5, step=1, label="Top K Results"),  # Slider for top_k (1 to 20)
        gr.Dropdown(
            choices=["llama3.3", "llama3.2:latest", "llama3.1:8b"],
            value="llama3.3",
            label="Select Model"
        ),
        gr.Dropdown(
            choices=["hybrid", "dense", "sparse"],
            value="hybrid",
            label="Select Retrieval Mode"
        )
    ],
    outputs=[
        gr.Textbox(label="Answer", lines=6),
        gr.Textbox(label="References", lines=10),
    ],
    title="RAG System with LLaMA Models",
    description=(
        "Ask questions and get answers with references from PDF documents. "
        "Adjust Top-K to control the number of retrieved chunks. "
        "Choose different models and retrieval modes for customization."
    )
)

# Launch the Gradio interface
iface.launch()

* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




In [89]:
# Handle closing behavior
def on_close():
    iface.close()
on_close()

Closing server running on port: 7860
